diff options
Diffstat (limited to 'includes/parser')
-rw-r--r-- | includes/parser/CoreParserFunctions.php | 231 | ||||
-rw-r--r-- | includes/parser/DateFormatter.php | 122 | ||||
-rw-r--r-- | includes/parser/Parser.php | 360 | ||||
-rw-r--r-- | includes/parser/ParserCache.php | 26 | ||||
-rw-r--r-- | includes/parser/ParserOptions.php | 17 | ||||
-rw-r--r-- | includes/parser/ParserOutput.php | 10 | ||||
-rw-r--r-- | includes/parser/Preprocessor_DOM.php | 82 | ||||
-rw-r--r-- | includes/parser/Preprocessor_Hash.php | 41 | ||||
-rw-r--r-- | includes/parser/Tidy.php | 170 |
9 files changed, 824 insertions, 235 deletions
diff --git a/includes/parser/CoreParserFunctions.php b/includes/parser/CoreParserFunctions.php index a3b5189a..774e96a7 100644 --- a/includes/parser/CoreParserFunctions.php +++ b/includes/parser/CoreParserFunctions.php @@ -27,9 +27,11 @@ class CoreParserFunctions { $parser->setFunctionHook( 'fullurle', array( __CLASS__, 'fullurle' ), SFH_NO_HASH ); $parser->setFunctionHook( 'formatnum', array( __CLASS__, 'formatnum' ), SFH_NO_HASH ); $parser->setFunctionHook( 'grammar', array( __CLASS__, 'grammar' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'gender', array( __CLASS__, 'gender' ), SFH_NO_HASH ); $parser->setFunctionHook( 'plural', array( __CLASS__, 'plural' ), SFH_NO_HASH ); $parser->setFunctionHook( 'numberofpages', array( __CLASS__, 'numberofpages' ), SFH_NO_HASH ); $parser->setFunctionHook( 'numberofusers', array( __CLASS__, 'numberofusers' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'numberofactiveusers', array( __CLASS__, 'numberofactiveusers' ), SFH_NO_HASH ); $parser->setFunctionHook( 'numberofarticles', array( __CLASS__, 'numberofarticles' ), SFH_NO_HASH ); $parser->setFunctionHook( 'numberoffiles', array( __CLASS__, 'numberoffiles' ), SFH_NO_HASH ); $parser->setFunctionHook( 'numberofadmins', array( __CLASS__, 'numberofadmins' ), SFH_NO_HASH ); @@ -45,7 +47,27 @@ class CoreParserFunctions { $parser->setFunctionHook( 'filepath', array( __CLASS__, 'filepath' ), SFH_NO_HASH ); $parser->setFunctionHook( 'pagesincategory', array( __CLASS__, 'pagesincategory' ), SFH_NO_HASH ); $parser->setFunctionHook( 'pagesize', array( __CLASS__, 'pagesize' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'protectionlevel', array( __CLASS__, 'protectionlevel' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'namespace', array( __CLASS__, 'mwnamespace' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'namespacee', array( __CLASS__, 'namespacee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'talkspace', array( __CLASS__, 'talkspace' ), SFH_NO_HASH ); + $parser->setFunctionHook( 
'talkspacee', array( __CLASS__, 'talkspacee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subjectspace', array( __CLASS__, 'subjectspace' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subjectspacee', array( __CLASS__, 'subjectspacee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'pagename', array( __CLASS__, 'pagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'pagenamee', array( __CLASS__, 'pagenamee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'fullpagename', array( __CLASS__, 'fullpagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'fullpagenamee', array( __CLASS__, 'fullpagenamee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'basepagename', array( __CLASS__, 'basepagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'basepagenamee', array( __CLASS__, 'basepagenamee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subpagename', array( __CLASS__, 'subpagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subpagenamee', array( __CLASS__, 'subpagenamee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'talkpagename', array( __CLASS__, 'talkpagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'talkpagenamee', array( __CLASS__, 'talkpagenamee' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subjectpagename', array( __CLASS__, 'subjectpagename' ), SFH_NO_HASH ); + $parser->setFunctionHook( 'subjectpagenamee', array( __CLASS__, 'subjectpagenamee' ), SFH_NO_HASH ); $parser->setFunctionHook( 'tag', array( __CLASS__, 'tagObj' ), SFH_OBJECT_ARGS ); + $parser->setFunctionHook( 'formatdate', array( __CLASS__, 'formatDate' ) ); if ( $wgAllowDisplayTitle ) { $parser->setFunctionHook( 'displaytitle', array( __CLASS__, 'displaytitle' ), SFH_NO_HASH ); @@ -66,6 +88,22 @@ class CoreParserFunctions { return array( 'found' => false ); } } + + static function formatDate( $parser, $date, $defaultPref = null ) { + $df = DateFormatter::getInstance(); + + $date = trim($date); + + $pref = $parser->mOptions->getDateFormat(); + + // Specify a different default date format 
other than the the normal default + // iff the user has 'default' for their setting + if ($pref == 'default' && $defaultPref) + $pref = $defaultPref; + + $date = $df->reformat( $pref, $date, array('match-whole') ); + return $date; + } static function ns( $parser, $part1 = '' ) { global $wgContLang; @@ -154,6 +192,28 @@ class CoreParserFunctions { return $parser->getFunctionLang()->convertGrammar( $word, $case ); } + static function gender( $parser, $user ) { + $forms = array_slice( func_get_args(), 2); + + // default + $gender = User::getDefaultOption( 'gender' ); + + // allow prefix. + $title = Title::newFromText( $user ); + + if (is_object( $title ) && $title->getNamespace() == NS_USER) + $user = $title->getText(); + + // check parameter, or use $wgUser if in interface message + $user = User::newFromName( $user ); + if ( $user ) { + $gender = $user->getOption( 'gender' ); + } elseif ( $parser->mOptions->getInterfaceMessage() ) { + global $wgUser; + $gender = $wgUser->getOption( 'gender' ); + } + return $parser->getFunctionLang()->gender( $gender, $forms ); + } static function plural( $parser, $text = '') { $forms = array_slice( func_get_args(), 2); $text = $parser->getFunctionLang()->parseFormattedNumber( $text ); @@ -208,6 +268,9 @@ class CoreParserFunctions { static function numberofusers( $parser, $raw = null ) { return self::formatRaw( SiteStats::users(), $raw ); } + static function numberofactiveusers( $parser, $raw = null ) { + return self::formatRaw( SiteStats::activeUsers(), $raw ); + } static function numberofarticles( $parser, $raw = null ) { return self::formatRaw( SiteStats::articles(), $raw ); } @@ -230,6 +293,126 @@ class CoreParserFunctions { return self::formatRaw( SiteStats::numberingroup( strtolower( $name ) ), $raw ); } + + /** + * Given a title, return the namespace name that would be given by the + * corresponding magic word + * Note: function name changed to "mwnamespace" rather than "namespace" + * to not break PHP 5.3 + */ + static 
function mwnamespace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return str_replace( '_', ' ', $t->getNsText() ); + } + static function namespacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return wfUrlencode( $t->getNsText() ); + } + static function talkspace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return str_replace( '_', ' ', $t->getTalkNsText() ); + } + static function talkspacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return wfUrlencode( $t->getTalkNsText() ); + } + static function subjectspace( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return str_replace( '_', ' ', $t->getSubjectNsText() ); + } + static function subjectspacee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return wfUrlencode( $t->getSubjectNsText() ); + } + /* + * Functions to get and normalize pagenames, corresponding to the magic words + * of the same names + */ + static function pagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return wfEscapeWikiText( $t->getText() ); + } + static function pagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return $t->getPartialURL(); + } + static function fullpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return wfEscapeWikiText( $t->getPrefixedText() ); + } + static function fullpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return $t->getPrefixedURL(); + } + static function subpagename( $parser, 
$title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return $t->getSubpageText(); + } + static function subpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return $t->getSubpageUrlForm(); + } + static function basepagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return $t->getBaseText(); + } + static function basepagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return wfUrlEncode( str_replace( ' ', '_', $t->getBaseText() ) ); + } + static function talkpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return wfEscapeWikiText( $t->getTalkPage()->getPrefixedText() ); + } + static function talkpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) || !$t->canTalk() ) + return ''; + return $t->getTalkPage()->getPrefixedUrl(); + } + static function subjectpagename( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return wfEscapeWikiText( $t->getSubjectPage()->getPrefixedText() ); + } + static function subjectpagenamee( $parser, $title = null ) { + $t = Title::newFromText( $title ); + if ( is_null($t) ) + return ''; + return $t->getSubjectPage()->getPrefixedUrl(); + } + /** * Return the number of pages in the given category, or 0 if it's nonexis- * tent. 
This is an expensive parser function and can't be called too many @@ -292,6 +475,16 @@ class CoreParserFunctions { } return self::formatRaw( $length, $raw ); } + + /** + * Returns the requested protection level for the current page + */ + static function protectionlevel( $parser, $type = '' ) { + $restrictions = $parser->mTitle->getRestrictions( strtolower( $type ) ); + # Title::getRestrictions returns an array, its possible it may have + # multiple values in the future + return implode( $restrictions, ',' ); + } static function language( $parser, $arg = '' ) { global $wgContLang; @@ -299,20 +492,38 @@ class CoreParserFunctions { return $lang != '' ? $lang : $arg; } - static function pad( $string = '', $length = 0, $char = 0, $direction = STR_PAD_RIGHT ) { - $length = min( max( $length, 0 ), 500 ); - $char = substr( $char, 0, 1 ); - return ( $string !== '' && (int)$length > 0 && strlen( trim( (string)$char ) ) > 0 ) - ? str_pad( $string, $length, (string)$char, $direction ) - : $string; + /** + * Unicode-safe str_pad with the restriction that $length is forced to be <= 500 + */ + static function pad( $string, $length, $padding = '0', $direction = STR_PAD_RIGHT ) { + $lengthOfPadding = mb_strlen( $padding ); + if ( $lengthOfPadding == 0 ) return $string; + + # The remaining length to add counts down to 0 as padding is added + $length = min( $length, 500 ) - mb_strlen( $string ); + # $finalPadding is just $padding repeated enough times so that + # mb_strlen( $string ) + mb_strlen( $finalPadding ) == $length + $finalPadding = ''; + while ( $length > 0 ) { + # If $length < $lengthofPadding, truncate $padding so we get the + # exact length desired. + $finalPadding .= mb_substr( $padding, 0, $length ); + $length -= $lengthOfPadding; + } + + if ( $direction == STR_PAD_LEFT ) { + return $finalPadding . $string; + } else { + return $string . 
$finalPadding; + } } - static function padleft( $parser, $string = '', $length = 0, $char = 0 ) { - return self::pad( $string, $length, $char, STR_PAD_LEFT ); + static function padleft( $parser, $string = '', $length = 0, $padding = '0' ) { + return self::pad( $string, $length, $padding, STR_PAD_LEFT ); } - static function padright( $parser, $string = '', $length = 0, $char = 0 ) { - return self::pad( $string, $length, $char ); + static function padright( $parser, $string = '', $length = 0, $padding = '0' ) { + return self::pad( $string, $length, $padding ); } static function anchorencode( $parser, $text ) { diff --git a/includes/parser/DateFormatter.php b/includes/parser/DateFormatter.php index 9ef11d5e..aa6415e4 100644 --- a/includes/parser/DateFormatter.php +++ b/includes/parser/DateFormatter.php @@ -41,11 +41,11 @@ class DateFormatter $this->regexTrail = '(?![a-z])/iu'; # Partial regular expressions - $this->prxDM = '\[\[(\d{1,2})[ _](' . $this->monthNames . ')]]'; - $this->prxMD = '\[\[(' . $this->monthNames . ')[ _](\d{1,2})]]'; - $this->prxY = '\[\[(\d{1,4}([ _]BC|))]]'; - $this->prxISO1 = '\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})]]'; - $this->prxISO2 = '\[\[(-?\d{4})-(\d{2})-(\d{2})]]'; + $this->prxDM = '\[\[(\d{1,2})[ _](' . $this->monthNames . ')\]\]'; + $this->prxMD = '\[\[(' . $this->monthNames . ')[ _](\d{1,2})\]\]'; + $this->prxY = '\[\[(\d{1,4}([ _]BC|))\]\]'; + $this->prxISO1 = '\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})\]\]'; + $this->prxISO2 = '\[\[(-?\d{4})-(\d{2})-(\d{2})\]\]'; # Real regular expressions $this->regexes[self::DMY] = "/{$this->prxDM} *,? 
*{$this->prxY}{$this->regexTrail}"; @@ -96,9 +96,11 @@ class DateFormatter } /** - * @static + * Get a DateFormatter object + * + * @return DateFormatter object */ - function &getInstance() { + public static function &getInstance() { global $wgMemc; static $dateFormatter = false; if ( !$dateFormatter ) { @@ -112,10 +114,14 @@ class DateFormatter } /** - * @param string $preference User preference - * @param string $text Text to reformat + * @param $preference String: User preference + * @param $text String: Text to reformat */ - function reformat( $preference, $text ) { + function reformat( $preference, $text, $options = array('linked') ) { + + $linked = in_array( 'linked', $options ); + $match_whole = in_array( 'match-whole', $options ); + if ( isset( $this->preferences[$preference] ) ) { $preference = $this->preferences[$preference]; } else { @@ -136,7 +142,24 @@ class DateFormatter # Default $this->mTarget = $i; } - $text = preg_replace_callback( $this->regexes[$i], array( &$this, 'replace' ), $text ); + $regex = $this->regexes[$i]; + + // Horrible hack + if (!$linked) { + $regex = str_replace( array( '\[\[', '\]\]' ), '', $regex ); + } + + if ($match_whole) { + // Let's hope this works + $regex = preg_replace( '!^/!', '/^', $regex ); + $regex = str_replace( $this->regexTrail, + '$'.$this->regexTrail, $regex ); + } + + // Another horrible hack + $this->mLinked = $linked; + $text = preg_replace_callback( $regex, array( &$this, 'replace' ), $text ); + unset($this->mLinked); } return $text; } @@ -146,6 +169,10 @@ class DateFormatter */ function replace( $matches ) { # Extract information from $matches + $linked = true; + if ( isset( $this->mLinked ) ) + $linked = $this->mLinked; + $bits = array(); $key = $this->keys[$this->mSource]; for ( $p=0; $p < strlen($key); $p++ ) { @@ -153,41 +180,54 @@ class DateFormatter $bits[$key{$p}] = $matches[$p+1]; } } - + + return $this->formatDate( $bits, $linked ); + } + + function formatDate( $bits, $link = true ) { $format = 
$this->targets[$this->mTarget]; + + if (!$link) { + // strip piped links + $format = preg_replace( '/\[\[[^|]+\|([^\]]+)\]\]/', '$1', $format ); + // strip remaining links + $format = str_replace( array( '[[', ']]' ), '', $format ); + } # Construct new date $text = ''; $fail = false; + + // Pre-generate y/Y stuff because we need the year for the <span> title. + if ( !isset( $bits['y'] ) && isset( $bits['Y'] ) ) + $bits['y'] = $this->makeIsoYear( $bits['Y'] ); + if ( !isset( $bits['Y'] ) && isset( $bits['y'] ) ) + $bits['Y'] = $this->makeNormalYear( $bits['y'] ); + + if ( !isset( $bits['m'] ) ) { + $m = $this->makeIsoMonth( $bits['F'] ); + if ( !$m || $m == '00' ) { + $fail = true; + } else { + $bits['m'] = $m; + } + } + + if ( !isset($bits['d']) ) { + $bits['d'] = sprintf( '%02d', $bits['j'] ); + } for ( $p=0; $p < strlen( $format ); $p++ ) { $char = $format{$p}; switch ( $char ) { case 'd': # ISO day of month - if ( !isset($bits['d']) ) { - $text .= sprintf( '%02d', $bits['j'] ); - } else { - $text .= $bits['d']; - } + $text .= $bits['d']; break; case 'm': # ISO month - if ( !isset($bits['m']) ) { - $m = $this->makeIsoMonth( $bits['F'] ); - if ( !$m || $m == '00' ) { - $fail = true; - } else { - $text .= $m; - } - } else { - $text .= $bits['m']; - } + $text .= $bits['m']; break; case 'y': # ISO year - if ( !isset( $bits['y'] ) ) { - $text .= $this->makeIsoYear( $bits['Y'] ); - } else { - $text .= $bits['y']; - } + $text .= $bits['y']; break; case 'j': # ordinary day of month if ( !isset($bits['j']) ) { @@ -210,11 +250,7 @@ class DateFormatter } break; case 'Y': # ordinary (optional BC) year - if ( !isset( $bits['Y'] ) ) { - $text .= $this->makeNormalYear( $bits['y'] ); - } else { - $text .= $bits['Y']; - } + $text .= $bits['Y']; break; default: $text .= $char; @@ -223,6 +259,18 @@ class DateFormatter if ( $fail ) { $text = $matches[0]; } + + $isoBits = array(); + if ( isset($bits['y']) ) + $isoBits[] = $bits['y']; + $isoBits[] = $bits['m']; + $isoBits[] = 
$bits['d']; + $isoDate = implode( '-', $isoBits );; + + // Output is not strictly HTML (it's wikitext), but <span> is whitelisted. + $text = Xml::tags( 'span', + array( 'class' => 'mw-formatted-date', 'title' => $isoDate ), $text ); + return $text; } diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 7fcfb90a..e6a68782 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -374,8 +374,8 @@ class Parser $text = Sanitizer::normalizeCharReferences( $text ); - if (($wgUseTidy and $this->mOptions->mTidy) or $wgAlwaysUseTidy) { - $text = self::tidy($text); + if ( ( $wgUseTidy && $this->mOptions->mTidy ) || $wgAlwaysUseTidy ) { + $text = MWTidy::tidy( $text ); } else { # attempt to sanitize at least some nesting problems # (bug #2702 and quite a few others) @@ -648,126 +648,14 @@ class Parser $this->mStripState->general->setPair( $rnd, $text ); return $rnd; } - - /** - * Interface with html tidy, used if $wgUseTidy = true. - * If tidy isn't able to correct the markup, the original will be - * returned in all its glory with a warning comment appended. - * - * Either the external tidy program or the in-process tidy extension - * will be used depending on availability. Override the default - * $wgTidyInternal setting to disable the internal if it's not working. - * - * @param string $text Hideous HTML input - * @return string Corrected HTML output - * @public - * @static - */ - function tidy( $text ) { - global $wgTidyInternal; - - $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'. -' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'. 
-'<head><title>test</title></head><body>'.$text.'</body></html>'; - - # Tidy is known to clobber tabs; convert 'em to entities - $wrappedtext = str_replace("\t", '	', $wrappedtext); - - if( $wgTidyInternal ) { - $correctedtext = self::internalTidy( $wrappedtext ); - } else { - $correctedtext = self::externalTidy( $wrappedtext ); - } - if( is_null( $correctedtext ) ) { - wfDebug( "Tidy error detected!\n" ); - return $text . "\n<!-- Tidy found serious XHTML errors -->\n"; - } - - # Convert the tabs back from entities - $correctedtext = str_replace('	', "\t", $correctedtext); - - return $correctedtext; - } - - /** - * Spawn an external HTML tidy process and get corrected markup back from it. - * - * @private - * @static - */ - function externalTidy( $text ) { - global $wgTidyConf, $wgTidyBin, $wgTidyOpts; - wfProfileIn( __METHOD__ ); - - $cleansource = ''; - $opts = ' -utf8'; - - $descriptorspec = array( - 0 => array('pipe', 'r'), - 1 => array('pipe', 'w'), - 2 => array('file', wfGetNull(), 'a') - ); - $pipes = array(); - if( function_exists('proc_open') ) { - $process = proc_open("$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes); - if (is_resource($process)) { - // Theoretically, this style of communication could cause a deadlock - // here. If the stdout buffer fills up, then writes to stdin could - // block. This doesn't appear to happen with tidy, because tidy only - // writes to stdout after it's finished reading from stdin. Search - // for tidyParseStdin and tidySaveStdout in console/tidy.c - fwrite($pipes[0], $text); - fclose($pipes[0]); - while (!feof($pipes[1])) { - $cleansource .= fgets($pipes[1], 1024); - } - fclose($pipes[1]); - proc_close($process); - } - } - - wfProfileOut( __METHOD__ ); - - if( $cleansource == '' && $text != '') { - // Some kind of error happened, so we couldn't get the corrected text. - // Just give up; we'll use the source text and append a warning. 
- return null; - } else { - return $cleansource; - } - } - + /** - * Use the HTML tidy PECL extension to use the tidy library in-process, - * saving the overhead of spawning a new process. - * - * 'pear install tidy' should be able to compile the extension module. - * - * @private - * @static + * Interface with html tidy + * @deprecated Use MWTidy::tidy() */ - function internalTidy( $text ) { - global $wgTidyConf, $IP, $wgDebugTidy; - wfProfileIn( __METHOD__ ); - - $tidy = new tidy; - $tidy->parseString( $text, $wgTidyConf, 'utf8' ); - $tidy->cleanRepair(); - if( $tidy->getStatus() == 2 ) { - // 2 is magic number for fatal error - // http://www.php.net/manual/en/function.tidy-get-status.php - $cleansource = null; - } else { - $cleansource = tidy_get_output( $tidy ); - } - if ( $wgDebugTidy && $tidy->getStatus() > 0 ) { - $cleansource .= "<!--\nTidy reports:\n" . - str_replace( '-->', '-->', $tidy->errorBuffer ) . - "\n-->"; - } - - wfProfileOut( __METHOD__ ); - return $cleansource; + public static function tidy( $text ) { + wfDeprecated( __METHOD__ ); + return MWTidy::tidy( $text ); } /** @@ -998,7 +886,7 @@ class Parser $text = $this->doDoubleUnderscore( $text ); $text = $this->doHeadings( $text ); - if($this->mOptions->getUseDynamicDates()) { + if( $this->mOptions->getUseDynamicDates() ) { $df = DateFormatter::getInstance(); $text = $df->reformat( $this->mOptions->getDateFormat(), $text ); } @@ -1008,7 +896,7 @@ class Parser # replaceInternalLinks may sometimes leave behind # absolute URLs, which have to be masked to hide them from replaceExternalLinks - $text = str_replace($this->mUniqPrefix."NOPARSE", "", $text); + $text = str_replace($this->mUniqPrefix.'NOPARSE', '', $text); $text = $this->doMagicLinks( $text ); $text = $this->formatHeadings( $text, $isMain ); @@ -1045,16 +933,16 @@ class Parser } function magicLinkCallback( $m ) { - if ( isset( $m[1] ) && strval( $m[1] ) !== '' ) { + if ( isset( $m[1] ) && $m[1] !== '' ) { # Skip anchor return $m[0]; - } 
elseif ( isset( $m[2] ) && strval( $m[2] ) !== '' ) { + } elseif ( isset( $m[2] ) && $m[2] !== '' ) { # Skip HTML element return $m[0]; - } elseif ( isset( $m[3] ) && strval( $m[3] ) !== '' ) { + } elseif ( isset( $m[3] ) && $m[3] !== '' ) { # Free external link return $this->makeFreeExternalLink( $m[0] ); - } elseif ( isset( $m[4] ) && strval( $m[4] ) !== '' ) { + } elseif ( isset( $m[4] ) && $m[4] !== '' ) { # RFC or PMID if ( substr( $m[0], 0, 3 ) === 'RFC' ) { $keyword = 'RFC'; @@ -1072,7 +960,7 @@ class Parser $sk = $this->mOptions->getSkin(); $la = $sk->getExternalLinkAttributes( $url, $keyword.$id ); return "<a href=\"{$url}\"{$la}>{$keyword} {$id}</a>"; - } elseif ( isset( $m[5] ) && strval( $m[5] ) !== '' ) { + } elseif ( isset( $m[5] ) && $m[5] !== '' ) { # ISBN $isbn = $m[5]; $num = strtr( $isbn, array( @@ -1130,7 +1018,7 @@ class Parser if ( $text === false ) { # Not an image, make a link $text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free', - $this->getExternalLinkAttribs() ); + $this->getExternalLinkAttribs( $url ) ); # Register it in the output object... # Replace unnecessary URL escape codes with their equivalent characters $pasteurized = self::replaceUnusualEscapes( $url ); @@ -1406,18 +1294,12 @@ class Parser $url = Sanitizer::cleanUrl( $url ); - if ( $this->mOptions->mExternalLinkTarget ) { - $attribs = array( 'target' => $this->mOptions->mExternalLinkTarget ); - } else { - $attribs = array(); - } - # Use the encoded URL # This means that users can paste URLs directly into the text # Funny characters like ö aren't valid in URLs anyway # This was changed in August 2004 - $s .= $sk->makeExternalLink( $url, $text, false, $linktype, $this->getExternalLinkAttribs() ) - . $dtrail . $trail; + $s .= $sk->makeExternalLink( $url, $text, false, $linktype, + $this->getExternalLinkAttribs( $url ) ) . $dtrail . $trail; # Register link in the output object. 
# Replace unnecessary URL escape codes with the referenced character @@ -1430,12 +1312,36 @@ class Parser return $s; } - function getExternalLinkAttribs() { + /** + * Get an associative array of additional HTML attributes appropriate for a + * particular external link. This currently may include rel => nofollow + * (depending on configuration, namespace, and the URL's domain) and/or a + * target attribute (depending on configuration). + * + * @param string $url Optional URL, to extract the domain from for rel => + * nofollow if appropriate + * @return array Associative array of HTML attributes + */ + function getExternalLinkAttribs( $url = false ) { $attribs = array(); global $wgNoFollowLinks, $wgNoFollowNsExceptions; $ns = $this->mTitle->getNamespace(); if( $wgNoFollowLinks && !in_array($ns, $wgNoFollowNsExceptions) ) { $attribs['rel'] = 'nofollow'; + + global $wgNoFollowDomainExceptions; + if ( $wgNoFollowDomainExceptions ) { + $bits = wfParseUrl( $url ); + if ( is_array( $bits ) && isset( $bits['host'] ) ) { + foreach ( $wgNoFollowDomainExceptions as $domain ) { + if( substr( $bits['host'], -strlen( $domain ) ) + == $domain ) { + unset( $attribs['rel'] ); + break; + } + } + } + } } if ( $this->mOptions->getExternalLinkTarget() ) { $attribs['target'] = $this->mOptions->getExternalLinkTarget(); @@ -1697,7 +1603,7 @@ class Parser wfProfileOut( __METHOD__."-misc" ); wfProfileIn( __METHOD__."-title" ); $nt = Title::newFromText( $this->mStripState->unstripNoWiki($link) ); - if( !$nt ) { + if( $nt === NULL ) { $s .= $prefix . '[[' . 
$line; wfProfileOut( __METHOD__."-title" ); continue; @@ -1823,6 +1729,7 @@ class Parser # NS_MEDIA is a pseudo-namespace for linking directly to a file # FIXME: Should do batch file existence checks, see comment below if( $ns == NS_MEDIA ) { + wfProfileIn( __METHOD__."-media" ); # Give extensions a chance to select the file revision for us $skip = $time = false; wfRunHooks( 'BeforeParserMakeImageLinkObj', array( &$this, &$nt, &$skip, &$time ) ); @@ -1834,9 +1741,11 @@ class Parser # Cloak with NOPARSE to avoid replacement in replaceExternalLinks $s .= $prefix . $this->armorLinks( $link ) . $trail; $this->mOutput->addImage( $nt->getDBkey() ); + wfProfileOut( __METHOD__."-media" ); continue; } + wfProfileIn( __METHOD__."-always_known" ); # Some titles, such as valid special pages or files in foreign repos, should # be shown as bluelinks even though they're not included in the page table # @@ -1849,6 +1758,7 @@ class Parser # Links will be added to the output link list after checking $s .= $holders->makeHolder( $nt, $text, '', $trail, $prefix ); } + wfProfileOut( __METHOD__."-always_known" ); } wfProfileOut( __METHOD__ ); return $holders; @@ -2178,7 +2088,7 @@ class Parser $inBlockElem = true; } } else if ( !$inBlockElem && !$this->mInPre ) { - if ( ' ' == $t{0} and ( $this->mLastSection === 'pre' or trim($t) != '' ) ) { + if ( ' ' == substr( $t, 0, 1 ) and ( $this->mLastSection === 'pre' or trim($t) != '' ) ) { // pre if ($this->mLastSection !== 'pre') { $paragraphStack = false; @@ -2540,6 +2450,12 @@ class Parser $this->mOutput->setFlag( 'vary-revision' ); wfDebug( __METHOD__ . ": {{REVISIONTIMESTAMP}} used, setting vary-revision...\n" ); return $this->getRevisionTimestamp(); + case 'revisionuser': + // Let the edit saving system know we should parse the page + // *after* a revision ID has been assigned. This is for null edits. + $this->mOutput->setFlag( 'vary-revision' ); + wfDebug( __METHOD__ . 
": {{REVISIONUSER}} used, setting vary-revision...\n" ); + return $this->getRevisionUser(); case 'namespace': return str_replace('_',' ',$wgContLang->getNsText( $this->mTitle->getNamespace() ) ); case 'namespacee': @@ -2586,6 +2502,8 @@ class Parser return $this->mVarCache[$index] = $wgContLang->formatNum( SiteStats::images() ); case 'numberofusers': return $this->mVarCache[$index] = $wgContLang->formatNum( SiteStats::users() ); + case 'numberofactiveusers': + return $this->mVarCache[$index] = $wgContLang->formatNum( SiteStats::activeUsers() ); case 'numberofpages': return $this->mVarCache[$index] = $wgContLang->formatNum( SiteStats::pages() ); case 'numberofadmins': @@ -2696,11 +2614,10 @@ class Parser * @private */ function replaceVariables( $text, $frame = false, $argsOnly = false ) { - # Prevent too big inclusions - if( strlen( $text ) > $this->mOptions->getMaxIncludeSize() ) { + # Is there any text? Also, Prevent too big inclusions! + if ( strlen( $text ) < 1 || strlen( $text ) > $this->mOptions->getMaxIncludeSize() ) { return $text; } - wfProfileIn( __METHOD__ ); if ( $frame === false ) { @@ -2776,7 +2693,7 @@ class Parser * @private */ function braceSubstitution( $piece, $frame ) { - global $wgContLang, $wgAllowDisplayTitle, $wgNonincludableNamespaces; + global $wgContLang, $wgNonincludableNamespaces; wfProfileIn( __METHOD__ ); wfProfileIn( __METHOD__.'-setup' ); @@ -2936,12 +2853,6 @@ class Parser if($wgContLang->hasVariants() && $title->getArticleID() == 0){ $wgContLang->findVariantLink( $part1, $title, true ); } - # Do infinite loop check - if ( !$frame->loopCheck( $title ) ) { - $found = true; - $text = '<span class="error">' . wfMsgForContent( 'parser-template-loop-warning', $titleText ) . 
'</span>'; - wfDebug( __METHOD__.": template loop broken at '$titleText'\n" ); - } # Do recursion depth check $limit = $this->mOptions->getMaxTemplateDepth(); if ( $frame->depth >= $limit ) { @@ -2991,6 +2902,14 @@ class Parser } $found = true; } + + # Do infinite loop check + # This has to be done after redirect resolution to avoid infinite loops via redirects + if ( !$frame->loopCheck( $title ) ) { + $found = true; + $text = '<span class="error">' . wfMsgForContent( 'parser-template-loop-warning', $titleText ) . '</span>'; + wfDebug( __METHOD__.": template loop broken at '$titleText'\n" ); + } wfProfileOut( __METHOD__ . '-loadtpl' ); } @@ -3304,6 +3223,7 @@ class Parser throw new MWException( '<html> extension tag encountered unexpectedly' ); } case 'nowiki': + $content = strtr($content, array('-{' => '-{', '}-' => '}-')); $output = Xml::escapeTagsOnly( $content ); break; case 'math': @@ -3387,6 +3307,7 @@ class Parser * Fills $this->mDoubleUnderscores, returns the modified text */ function doDoubleUnderscore( $text ) { + wfProfileIn( __METHOD__ ); // The position of __TOC__ needs to be recorded $mw = MagicWord::get( 'toc' ); if( $mw->match( $text ) ) { @@ -3429,7 +3350,7 @@ class Parser } elseif( isset( $this->mDoubleUnderscores['index'] ) ) { $this->mOutput->setIndexPolicy( 'index' ); } - + wfProfileOut( __METHOD__ ); return $text; } @@ -3459,7 +3380,7 @@ class Parser } # Inhibit editsection links if requested in the page - if ( isset( $this->mDoubleUnderscores['noeditsection'] ) ) { + if ( isset( $this->mDoubleUnderscores['noeditsection'] ) || $this->mOptions->getIsPrintable() ) { $showEditLink = 0; } @@ -3479,6 +3400,12 @@ class Parser $this->mOutput->setNewSection( true ); } + # Allow user to remove the "new section" + # link via __NONEWSECTIONLINK__ + if ( isset( $this->mDoubleUnderscores['nonewsectionlink'] ) ) { + $this->mOutput->hideNewSection( true ); + } + # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML, # override above conditions 
and always show TOC above first header if ( isset( $this->mDoubleUnderscores['forcetoc'] ) ) { @@ -3762,13 +3689,13 @@ class Parser * * @param string $text the text to transform * @param Title &$title the Title object for the current article - * @param User &$user the User object describing the current user + * @param User $user the User object describing the current user * @param ParserOptions $options parsing options * @param bool $clearState whether to clear the parser state first * @return string the altered wiki markup * @public */ - function preSaveTransform( $text, &$title, $user, $options, $clearState = true ) { + function preSaveTransform( $text, Title $title, $user, $options, $clearState = true ) { $this->mOptions = $options; $this->setTitle( $title ); $this->setOutputType( self::OT_WIKI ); @@ -3808,6 +3735,15 @@ class Parser putenv( 'TZ='.$wgLocaltimezone ); $ts = date( 'YmdHis', $unixts ); $tz = date( 'T', $unixts ); # might vary on DST changeover! + + /* Allow translation of timezones through wiki. date() can return + * whatever crap the system uses, localised or not, so we cannot + * ship premade translations. + */ + $key = 'timezone-' . strtolower( trim( $tz ) ); + $value = wfMsgForContent( $key ); + if ( !wfEmptyMsg( $key, $value ) ) $tz = $value; + putenv( 'TZ='.$oldtz ); } @@ -4627,7 +4563,11 @@ class Parser // Output the replacement text // Add two newlines on -- trailing whitespace in $newText is conventionally // stripped by the editor, so we need both newlines to restore the paragraph gap - $outText .= $newText . "\n\n"; + // Only add trailing whitespace if there is newText + if($newText != "") { + $outText .= $newText . 
"\n\n"; + } + while ( $node ) { $outText .= $frame->expand( $node, PPFrame::RECOVER_ORIG ); $node = $node->getNextSibling(); @@ -4694,6 +4634,22 @@ class Parser } /** + * Get the name of the user that edited the last revision + */ + function getRevisionUser() { + // if this template is subst: the revision id will be blank, + // so just use the current user's name + if( $this->mRevisionId ) { + $revision = Revision::newFromId( $this->mRevisionId ); + $revuser = $revision->getUserText(); + } else { + global $wgUser; + $revuser = $wgUser->getName(); + } + return $revuser; + } + + /** * Mutator for $mDefaultSort * * @param $sort New value @@ -4844,6 +4800,102 @@ class Parser } return $out; } + + function serialiseHalfParsedText( $text ) { + $data = array(); + $data['text'] = $text; + + // First, find all strip markers, and store their + // data in an array. + $stripState = new StripState; + $pos = 0; + while( ( $start_pos = strpos( $text, $this->mUniqPrefix, $pos ) ) && ( $end_pos = strpos( $text, self::MARKER_SUFFIX, $pos ) ) ) { + $end_pos += strlen( self::MARKER_SUFFIX ); + $marker = substr( $text, $start_pos, $end_pos-$start_pos ); + + if ( !empty( $this->mStripState->general->data[$marker] ) ) { + $replaceArray = $stripState->general; + $stripText = $this->mStripState->general->data[$marker]; + } elseif ( !empty( $this->mStripState->nowiki->data[$marker] ) ) { + $replaceArray = $stripState->nowiki; + $stripText = $this->mStripState->nowiki->data[$marker]; + } else { + throw new MWException( "Hanging strip marker: '$marker'." ); + } + + $replaceArray->setPair( $marker, $stripText ); + $pos = $end_pos; + } + $data['stripstate'] = $stripState; + + // Now, find all of our links, and store THEIR + // data in an array! 
:) + $links = array( 'internal' => array(), 'interwiki' => array() ); + $pos = 0; + + // Internal links + while( ( $start_pos = strpos( $text, '<!--LINK ', $pos ) ) ) { + list( $ns, $trail ) = explode( ':', substr( $text, $start_pos + strlen( '<!--LINK ' ) ), 2 ); + + $ns = trim($ns); + if (empty( $links['internal'][$ns] )) { + $links['internal'][$ns] = array(); + } + + $key = trim( substr( $trail, 0, strpos( $trail, '-->' ) ) ); + $links['internal'][$ns][] = $this->mLinkHolders->internals[$ns][$key]; + $pos = $start_pos + strlen( "<!--LINK $ns:$key-->" ); + } + + $pos = 0; + + // Interwiki links + while( ( $start_pos = strpos( $text, '<!--IWLINK ', $pos ) ) ) { + $data = substr( $text, $start_pos ); + $key = trim( substr( $data, 0, strpos( $data, '-->' ) ) ); + $links['interwiki'][] = $this->mLinkHolders->interwiki[$key]; + $pos = $start_pos + strlen( "<!--IWLINK $key-->" ); + } + + $data['linkholder'] = $links; + + return $data; + } + + function unserialiseHalfParsedText( $data, $intPrefix = null /* Unique identifying prefix */ ) { + if (!$intPrefix) + $intPrefix = $this->getRandomString(); + + // First, extract the strip state. + $stripState = $data['stripstate']; + $this->mStripState->general->merge( $stripState->general ); + $this->mStripState->nowiki->merge( $stripState->nowiki ); + + // Now, extract the text, and renumber links + $text = $data['text']; + $links = $data['linkholder']; + + // Internal... + foreach( $links['internal'] as $ns => $nsLinks ) { + foreach( $nsLinks as $key => $entry ) { + $newKey = $intPrefix . '-' . $key; + $this->mLinkHolders->internals[$ns][$newKey] = $entry; + + $text = str_replace( "<!--LINK $ns:$key-->", "<!--LINK $ns:$newKey-->", $text ); + } + } + + // Interwiki... + foreach( $links['interwiki'] as $key => $entry ) { + $newKey = "$intPrefix-$key"; + $this->mLinkHolders->interwikis[$newKey] = $entry; + + $text = str_replace( "<!--IWLINK $key-->", "<!--IWLINK $newKey-->", $text ); + } + + // Should be good to go. 
+ return $text; + } } /** diff --git a/includes/parser/ParserCache.php b/includes/parser/ParserCache.php index 7e61157a..d17214c3 100644 --- a/includes/parser/ParserCache.php +++ b/includes/parser/ParserCache.php @@ -26,8 +26,14 @@ class ParserCache { $this->mMemc =& $memCached; } - function getKey( &$article, &$user ) { - global $action; + function getKey( &$article, $popts ) { + global $wgRequest; + + if( $popts instanceof User ) // It used to be getKey( &$article, &$user ) + $popts = ParserOptions::newFromUser( $popts ); + + $user = $popts->mUser; + $printable = ( $popts->getIsPrintable() ) ? '!printable=1' : ''; $hash = $user->getPageRenderingHash(); if( !$article->mTitle->quickUserCan( 'edit' ) ) { // section edit links are suppressed even if the user has them on @@ -36,21 +42,21 @@ class ParserCache { $edit = ''; } $pageid = $article->getID(); - $renderkey = (int)($action == 'render'); - $key = wfMemcKey( 'pcache', 'idhash', "{$pageid}-{$renderkey}!{$hash}{$edit}" ); + $renderkey = (int)($wgRequest->getVal('action') == 'render'); + $key = wfMemcKey( 'pcache', 'idhash', "{$pageid}-{$renderkey}!{$hash}{$edit}{$printable}" ); return $key; } - function getETag( &$article, &$user ) { - return 'W/"' . $this->getKey($article, $user) . "--" . $article->mTouched. '"'; + function getETag( &$article, $popts ) { + return 'W/"' . $this->getKey($article, $popts) . "--" . $article->mTouched. 
'"'; } - function get( &$article, &$user ) { + function get( &$article, $popts ) { global $wgCacheEpoch; $fname = 'ParserCache::get'; wfProfileIn( $fname ); - $key = $this->getKey( $article, $user ); + $key = $this->getKey( $article, $popts ); wfDebug( "Trying parser cache $key\n" ); $value = $this->mMemc->get( $key ); @@ -86,9 +92,9 @@ class ParserCache { return $value; } - function save( $parserOutput, &$article, &$user ){ + function save( $parserOutput, &$article, $popts ){ global $wgParserCacheExpireTime; - $key = $this->getKey( $article, $user ); + $key = $this->getKey( $article, $popts ); if( $parserOutput->getCacheTime() != -1 ) { diff --git a/includes/parser/ParserOptions.php b/includes/parser/ParserOptions.php index 5b8cd3ee..e6a9f3a7 100644 --- a/includes/parser/ParserOptions.php +++ b/includes/parser/ParserOptions.php @@ -33,7 +33,10 @@ class ParserOptions var $mExternalLinkTarget; # Target attribute for external links var $mUser; # Stored user object, just used to initialise the skin - + var $mIsPreview; # Parsing the page for a "preview" operation + var $mIsSectionPreview; # Parsing the page for a "preview" operation on a single section + var $mIsPrintable; # Parsing the printable version of the page + function getUseTeX() { return $this->mUseTeX; } function getUseDynamicDates() { return $this->mUseDynamicDates; } function getInterwikiMagic() { return $this->mInterwikiMagic; } @@ -54,7 +57,10 @@ class ParserOptions function getEnableLimitReport() { return $this->mEnableLimitReport; } function getCleanSignatures() { return $this->mCleanSignatures; } function getExternalLinkTarget() { return $this->mExternalLinkTarget; } - + function getIsPreview() { return $this->mIsPreview; } + function getIsSectionPreview() { return $this->mIsSectionPreview; } + function getIsPrintable() { return $this->mIsPrintable; } + function getSkin() { if ( !isset( $this->mSkin ) ) { $this->mSkin = $this->mUser->getSkin(); @@ -99,7 +105,10 @@ class ParserOptions function 
setTimestamp( $x ) { return wfSetVar( $this->mTimestamp, $x ); } function setCleanSignatures( $x ) { return wfSetVar( $this->mCleanSignatures, $x ); } function setExternalLinkTarget( $x ) { return wfSetVar( $this->mExternalLinkTarget, $x ); } - + function setIsPreview( $x ) { return wfSetVar( $this->mIsPreview, $x ); } + function setIsSectionPreview( $x ) { return wfSetVar( $this->mIsSectionPreview, $x ); } + function setIsPrintable( $x ) { return wfSetVar( $this->mIsPrintable, $x ); } + function __construct( $user = null ) { $this->initialiseFromUser( $user ); } @@ -156,6 +165,8 @@ class ParserOptions $this->mEnableLimitReport = false; $this->mCleanSignatures = $wgCleanSignatures; $this->mExternalLinkTarget = $wgExternalLinkTarget; + $this->mIsPreview = false; + $this->mIsSectionPreview = false; wfProfileOut( $fname ); } } diff --git a/includes/parser/ParserOutput.php b/includes/parser/ParserOutput.php index 35cb5c92..22c1dfba 100644 --- a/includes/parser/ParserOutput.php +++ b/includes/parser/ParserOutput.php @@ -18,6 +18,7 @@ class ParserOutput $mImages = array(), # DB keys of the images used, in the array key only $mExternalLinks = array(), # External link URLs, in the key only $mNewSection = false, # Show a new section link? + $mHideNewSection = false, # Hide the new section link? $mNoGallery = false, # No gallery on category page? (__NOGALLERY__) $mHeadItems = array(), # Items to put in the <head> section $mOutputHooks = array(), # Hook tags as per $wgParserOutputHooks @@ -80,6 +81,12 @@ class ParserOutput function setNewSection( $value ) { $this->mNewSection = (bool)$value; } + function hideNewSection ( $value ) { + $this->mHideNewSection = (bool)$value; + } + function getHideNewSection () { + return (bool)$this->mHideNewSection; + } function getNewSection() { return (bool)$this->mNewSection; } @@ -94,6 +101,9 @@ class ParserOutput // We don't record Special: links currently // It might actually be wise to, but we'd need to do some normalization. 
return; + } elseif( $dbk === '' ) { + // Don't record self links - [[#Foo]] + return; } if ( !isset( $this->mLinks[$ns] ) ) { $this->mLinks[$ns] = array(); diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index af591b67..2e114545 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -6,6 +6,8 @@ class Preprocessor_DOM implements Preprocessor { var $parser, $memoryLimit; + const CACHE_VERSION = 1; + function __construct( $parser ) { $this->parser = $parser; $mem = ini_get( 'memory_limit' ); @@ -63,8 +65,61 @@ class Preprocessor_DOM implements Preprocessor { */ function preprocessToObj( $text, $flags = 0 ) { wfProfileIn( __METHOD__ ); - wfProfileIn( __METHOD__.'-makexml' ); + global $wgMemc, $wgPreprocessorCacheThreshold; + + $xml = false; + $cacheable = strlen( $text ) > $wgPreprocessorCacheThreshold; + if ( $cacheable ) { + wfProfileIn( __METHOD__.'-cacheable' ); + + $cacheKey = wfMemcKey( 'preprocess-xml', md5($text), $flags ); + $cacheValue = $wgMemc->get( $cacheKey ); + if ( $cacheValue ) { + $version = substr( $cacheValue, 0, 8 ); + if ( intval( $version ) == self::CACHE_VERSION ) { + $xml = substr( $cacheValue, 8 ); + // From the cache + wfDebugLog( "Preprocessor", "Loaded preprocessor XML from memcached (key $cacheKey)" ); + } + } + } + if ( $xml === false ) { + if ( $cacheable ) { + wfProfileIn( __METHOD__.'-cache-miss' ); + $xml = $this->preprocessToXml( $text, $flags ); + $cacheValue = sprintf( "%08d", self::CACHE_VERSION ) . 
$xml; + $wgMemc->set( $cacheKey, $cacheValue, 86400 ); + wfProfileOut( __METHOD__.'-cache-miss' ); + wfDebugLog( "Preprocessor", "Saved preprocessor XML to memcached (key $cacheKey)" ); + } else { + $xml = $this->preprocessToXml( $text, $flags ); + } + } + wfProfileIn( __METHOD__.'-loadXML' ); + $dom = new DOMDocument; + wfSuppressWarnings(); + $result = $dom->loadXML( $xml ); + wfRestoreWarnings(); + if ( !$result ) { + // Try running the XML through UtfNormal to get rid of invalid characters + $xml = UtfNormal::cleanUp( $xml ); + $result = $dom->loadXML( $xml ); + if ( !$result ) { + throw new MWException( __METHOD__.' generated invalid XML' ); + } + } + $obj = new PPNode_DOM( $dom->documentElement ); + wfProfileOut( __METHOD__.'-loadXML' ); + if ( $cacheable ) { + wfProfileOut( __METHOD__.'-cacheable' ); + } + wfProfileOut( __METHOD__ ); + return $obj; + } + + function preprocessToXml( $text, $flags = 0 ) { + wfProfileIn( __METHOD__ ); $rules = array( '{' => array( 'end' => '}', @@ -304,7 +359,9 @@ class Preprocessor_DOM implements Preprocessor { } else { $attrEnd = $tagEndPos; // Find closing tag - if ( preg_match( "/<\/$name\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) { + if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) + { $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); $i = $matches[0][1] + strlen( $matches[0][0] ); $close = '<close>' . htmlspecialchars( $matches[0][0] ) . 
'</close>'; @@ -569,24 +626,9 @@ class Preprocessor_DOM implements Preprocessor { $stack->rootAccum .= '</root>'; $xml = $stack->rootAccum; - wfProfileOut( __METHOD__.'-makexml' ); - wfProfileIn( __METHOD__.'-loadXML' ); - $dom = new DOMDocument; - wfSuppressWarnings(); - $result = $dom->loadXML( $xml ); - wfRestoreWarnings(); - if ( !$result ) { - // Try running the XML through UtfNormal to get rid of invalid characters - $xml = UtfNormal::cleanUp( $xml ); - $result = $dom->loadXML( $xml ); - if ( !$result ) { - throw new MWException( __METHOD__.' generated invalid XML' ); - } - } - $obj = new PPNode_DOM( $dom->documentElement ); - wfProfileOut( __METHOD__.'-loadXML' ); wfProfileOut( __METHOD__ ); - return $obj; + + return $xml; } } @@ -831,7 +873,6 @@ class PPFrame_DOM implements PPFrame { if ( is_string( $root ) ) { return $root; } - wfProfileIn( __METHOD__ ); if ( ++$this->parser->mPPNodeCount > $this->parser->mOptions->mMaxPPNodeCount ) { @@ -841,6 +882,7 @@ class PPFrame_DOM implements PPFrame { if ( $expansionDepth > $this->parser->mOptions->mMaxPPExpandDepth ) { return '<span class="error">Expansion depth limit exceeded</span>'; } + wfProfileIn( __METHOD__ ); ++$expansionDepth; if ( $root instanceof PPNode_DOM ) { diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index 62028291..f46ee40c 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -8,6 +8,8 @@ */ class Preprocessor_Hash implements Preprocessor { var $parser; + + const CACHE_VERSION = 1; function __construct( $parser ) { $this->parser = $parser; @@ -45,6 +47,31 @@ class Preprocessor_Hash implements Preprocessor { */ function preprocessToObj( $text, $flags = 0 ) { wfProfileIn( __METHOD__ ); + + + // Check cache. 
+ global $wgMemc, $wgPreprocessorCacheThreshold; + + $cacheable = strlen( $text ) > $wgPreprocessorCacheThreshold; + if ( $cacheable ) { + wfProfileIn( __METHOD__.'-cacheable' ); + + $cacheKey = wfMemcKey( 'preprocess-hash', md5($text), $flags ); + $cacheValue = $wgMemc->get( $cacheKey ); + if ( $cacheValue ) { + $version = substr( $cacheValue, 0, 8 ); + if ( intval( $version ) == self::CACHE_VERSION ) { + $hash = unserialize( substr( $cacheValue, 8 ) ); + // From the cache + wfDebugLog( "Preprocessor", + "Loaded preprocessor hash from memcached (key $cacheKey)" ); + wfProfileOut( __METHOD__.'-cacheable' ); + wfProfileOut( __METHOD__ ); + return $hash; + } + } + wfProfileIn( __METHOD__.'-cache-miss' ); + } $rules = array( '{' => array( @@ -288,7 +315,9 @@ class Preprocessor_Hash implements Preprocessor { } else { $attrEnd = $tagEndPos; // Find closing tag - if ( preg_match( "/<\/$name\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) { + if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) + { $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); $i = $matches[0][1] + strlen( $matches[0][0] ); $close = $matches[0][0]; @@ -615,6 +644,16 @@ class Preprocessor_Hash implements Preprocessor { $rootNode = new PPNode_Hash_Tree( 'root' ); $rootNode->firstChild = $stack->rootAccum->firstNode; $rootNode->lastChild = $stack->rootAccum->lastNode; + + // Cache + if ($cacheable) { + $cacheValue = sprintf( "%08d", self::CACHE_VERSION ) . 
serialize( $rootNode );; + $wgMemc->set( $cacheKey, $cacheValue, 86400 ); + wfProfileOut( __METHOD__.'-cache-miss' ); + wfProfileOut( __METHOD__.'-cacheable' ); + wfDebugLog( "Preprocessor", "Saved preprocessor Hash to memcached (key $cacheKey)" ); + } + wfProfileOut( __METHOD__ ); return $rootNode; } diff --git a/includes/parser/Tidy.php b/includes/parser/Tidy.php new file mode 100644 index 00000000..95f83621 --- /dev/null +++ b/includes/parser/Tidy.php @@ -0,0 +1,170 @@ +<?php + +/** + * Class to interact with HTML tidy + * + * Either the external tidy program or the in-process tidy extension + * will be used depending on availability. Override the default + * $wgTidyInternal setting to disable the internal if it's not working. + * + * @ingroup Parser + */ +class MWTidy { + + /** + * Interface with html tidy, used if $wgUseTidy = true. + * If tidy isn't able to correct the markup, the original will be + * returned in all its glory with a warning comment appended. + * + * @param string $text Hideous HTML input + * @return string Corrected HTML output + */ + public static function tidy( $text ) { + global $wgTidyInternal; + + $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'. +' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'. +'<head><title>test</title></head><body>'.$text.'</body></html>'; + + # Tidy is known to clobber tabs; convert them to entities + $wrappedtext = str_replace( "\t", '&#9;', $wrappedtext ); + + if( $wgTidyInternal ) { + $correctedtext = self::execInternalTidy( $wrappedtext ); + } else { + $correctedtext = self::execExternalTidy( $wrappedtext ); + } + if( is_null( $correctedtext ) ) { + wfDebug( "Tidy error detected!\n" ); + return $text . "\n<!-- Tidy found serious XHTML errors -->\n"; + } + + # Convert the tabs back from entities + $correctedtext = str_replace( '&#9;', "\t", $correctedtext ); + + return $correctedtext; + } + + /** + * Check HTML for errors, used if $wgValidateAllHtml = true. 
+ * + * @param $text String + * @param &$errorStr String: return the error string + * @return Boolean: whether the HTML is valid + */ + public static function checkErrors( $text, &$errorStr = null ) { + global $wgTidyInternal; + + $retval = 0; + if( $wgTidyInternal ) { + $errorStr = self::execInternalTidy( $text, true, $retval ); + } else { + $errorStr = self::execExternalTidy( $text, true, $retval ); + } + return ( $retval < 0 && $errorStr == '' ) || $retval == 0; + } + + /** + * Spawn an external HTML tidy process and get corrected markup back from it. + * Also called in OutputHandler.php for full page validation + * + * @param $text String: HTML to check + * @param $stderr Boolean: Whether to read from STDERR rather than STDOUT + * @param &$retval Exit code (-1 on internal error) + * @return mixed String or null + */ + private static function execExternalTidy( $text, $stderr = false, &$retval = null ) { + global $wgTidyConf, $wgTidyBin, $wgTidyOpts; + wfProfileIn( __METHOD__ ); + + $cleansource = ''; + $opts = ' -utf8'; + + if( $stderr ) { + $descriptorspec = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'file', wfGetNull(), 'a' ), + 2 => array( 'pipe', 'w' ) + ); + } else { + $descriptorspec = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', wfGetNull(), 'a' ) + ); + } + + $readpipe = $stderr ? 2 : 1; + $pipes = array(); + + if( function_exists( 'proc_open' ) ) { + $process = proc_open( "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes ); + if ( is_resource( $process ) ) { + // Theoretically, this style of communication could cause a deadlock + // here. If the stdout buffer fills up, then writes to stdin could + // block. This doesn't appear to happen with tidy, because tidy only + // writes to stdout after it's finished reading from stdin. 
Search + // for tidyParseStdin and tidySaveStdout in console/tidy.c + fwrite( $pipes[0], $text ); + fclose( $pipes[0] ); + while ( !feof( $pipes[$readpipe] ) ) { + $cleansource .= fgets( $pipes[$readpipe], 1024 ); + } + fclose( $pipes[$readpipe] ); + $retval = proc_close( $process ); + } else { + $retval = -1; + } + } else { + $retval = -1; + } + + wfProfileOut( __METHOD__ ); + + if( !$stderr && $cleansource == '' && $text != '' ) { + // Some kind of error happened, so we couldn't get the corrected text. + // Just give up; we'll use the source text and append a warning. + return null; + } else { + return $cleansource; + } + } + + /** + * Use the HTML tidy PECL extension to use the tidy library in-process, + * saving the overhead of spawning a new process. + * + * 'pear install tidy' should be able to compile the extension module. + */ + private static function execInternalTidy( $text, $stderr = false, &$retval = null ) { + global $wgTidyConf, $IP, $wgDebugTidy; + wfProfileIn( __METHOD__ ); + + $tidy = new tidy; + $tidy->parseString( $text, $wgTidyConf, 'utf8' ); + + if( $stderr ) { + $retval = $tidy->getStatus(); + return $tidy->errorBuffer; + } else { + $tidy->cleanRepair(); + $retval = $tidy->getStatus(); + if( $retval == 2 ) { + // 2 is magic number for fatal error + // http://www.php.net/manual/en/function.tidy-get-status.php + $cleansource = null; + } else { + $cleansource = tidy_get_output( $tidy ); + } + if ( $wgDebugTidy && $retval > 0 ) { + $cleansource .= "<!--\nTidy reports:\n" . + str_replace( '-->', '--&gt;', $tidy->errorBuffer ) . + "\n-->"; + } + + wfProfileOut( __METHOD__ ); + return $cleansource; + } + } + +} |