From c1f9b1f7b1b77776192048005dcc66dcf3df2bfb Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Sat, 27 Dec 2014 15:41:37 +0100 Subject: Update to MediaWiki 1.24.1 --- includes/parser/MWTidy.php | 291 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 includes/parser/MWTidy.php (limited to 'includes/parser/MWTidy.php') diff --git a/includes/parser/MWTidy.php b/includes/parser/MWTidy.php new file mode 100644 index 00000000..b310862f --- /dev/null +++ b/includes/parser/MWTidy.php @@ -0,0 +1,291 @@ +mTokens = null; + $this->mUniqPrefix = null; + } + + /** + * @param string $text + * @return string + */ + public function getWrapped( $text ) { + $this->mTokens = new ReplacementArray; + $this->mUniqPrefix = "\x7fUNIQ" . + dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) ); + $this->mMarkerIndex = 0; + + // Replace elements with placeholders + $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX, + array( &$this, 'replaceCallback' ), $text ); + // ...and markers + $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/', + array( &$this, 'replaceCallback' ), $wrappedtext ); + // ... and tags + $wrappedtext = preg_replace_callback( '/\/s', + array( &$this, 'replaceCallback' ), $wrappedtext ); + // Modify inline Microdata and elements so they say and so + // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config + $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '' . + 'test' . $wrappedtext . ''; + + return $wrappedtext; + } + + /** + * @param array $m + * + * @return string + */ + public function replaceCallback( $m ) { + $marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX; + $this->mMarkerIndex++; + $this->mTokens->setPair( $marker, $m[0] ); + return $marker; + } + + /** + * @param string $text + * @return string + */ + public function postprocess( $text ) { + // Revert back to <{link,meta}> + $text = preg_replace( '!]*?)(/{0,1}>)!', '<$1$2$3', $text ); + + // Restore the contents of placeholder tokens + $text = $this->mTokens->replace( $text ); + + return $text; + } + +} + +/** + * Class to interact with HTML tidy + * + * Either the external tidy program or the in-process tidy extension + * will be used depending on availability. Override the default + * $wgTidyInternal setting to disable the internal if it's not working. + * + * @ingroup Parser + */ +class MWTidy { + /** + * Interface with html tidy, used if $wgUseTidy = true. + * If tidy isn't able to correct the markup, the original will be + * returned in all its glory with a warning comment appended. + * + * @param string $text Hideous HTML input + * @return string Corrected HTML output + */ + public static function tidy( $text ) { + global $wgTidyInternal; + + $wrapper = new MWTidyWrapper; + $wrappedtext = $wrapper->getWrapped( $text ); + + $retVal = null; + if ( $wgTidyInternal ) { + $correctedtext = self::execInternalTidy( $wrappedtext, false, $retVal ); + } else { + $correctedtext = self::execExternalTidy( $wrappedtext, false, $retVal ); + } + + if ( $retVal < 0 ) { + wfDebug( "Possible tidy configuration error!\n" ); + return $text . "\n\n"; + } elseif ( is_null( $correctedtext ) ) { + wfDebug( "Tidy error detected!\n" ); + return $text . "\n\n"; + } + + $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens + + return $correctedtext; + } + + /** + * Check HTML for errors, used if $wgValidateAllHtml = true. + * + * @param string $text + * @param string &$errorStr Return the error string + * @return bool Whether the HTML is valid + */ + public static function checkErrors( $text, &$errorStr = null ) { + global $wgTidyInternal; + + $retval = 0; + if ( $wgTidyInternal ) { + $errorStr = self::execInternalTidy( $text, true, $retval ); + } else { + $errorStr = self::execExternalTidy( $text, true, $retval ); + } + + return ( $retval < 0 && $errorStr == '' ) || $retval == 0; + } + + /** + * Spawn an external HTML tidy process and get corrected markup back from it. + * Also called in OutputHandler.php for full page validation + * + * @param string $text HTML to check + * @param bool $stderr Whether to read result from STDERR rather than STDOUT + * @param int &$retval Exit code (-1 on internal error) + * @return string|null + */ + private static function execExternalTidy( $text, $stderr = false, &$retval = null ) { + global $wgTidyConf, $wgTidyBin, $wgTidyOpts; + wfProfileIn( __METHOD__ ); + + $cleansource = ''; + $opts = ' -utf8'; + + if ( $stderr ) { + $descriptorspec = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'file', wfGetNull(), 'a' ), + 2 => array( 'pipe', 'w' ) + ); + } else { + $descriptorspec = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', wfGetNull(), 'a' ) + ); + } + + $readpipe = $stderr ? 2 : 1; + $pipes = array(); + + $process = proc_open( + "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes ); + + //NOTE: At least on linux, the process will be created even if tidy is not installed. + // This means that missing tidy will be treated as a validation failure. + + if ( is_resource( $process ) ) { + // Theoretically, this style of communication could cause a deadlock + // here. If the stdout buffer fills up, then writes to stdin could + // block. This doesn't appear to happen with tidy, because tidy only + // writes to stdout after it's finished reading from stdin. Search + // for tidyParseStdin and tidySaveStdout in console/tidy.c + fwrite( $pipes[0], $text ); + fclose( $pipes[0] ); + while ( !feof( $pipes[$readpipe] ) ) { + $cleansource .= fgets( $pipes[$readpipe], 1024 ); + } + fclose( $pipes[$readpipe] ); + $retval = proc_close( $process ); + } else { + wfWarn( "Unable to start external tidy process" ); + $retval = -1; + } + + if ( !$stderr && $cleansource == '' && $text != '' ) { + // Some kind of error happened, so we couldn't get the corrected text. + // Just give up; we'll use the source text and append a warning. + $cleansource = null; + } + + wfProfileOut( __METHOD__ ); + return $cleansource; + } + + /** + * Use the HTML tidy extension to use the tidy library in-process, + * saving the overhead of spawning a new process. + * + * @param string $text HTML to check + * @param bool $stderr Whether to read result from error status instead of output + * @param int &$retval Exit code (-1 on internal error) + * @return string|null + */ + private static function execInternalTidy( $text, $stderr = false, &$retval = null ) { + global $wgTidyConf, $wgDebugTidy; + wfProfileIn( __METHOD__ ); + + if ( !class_exists( 'tidy' ) ) { + wfWarn( "Unable to load internal tidy class." ); + $retval = -1; + + wfProfileOut( __METHOD__ ); + return null; + } + + $tidy = new tidy; + $tidy->parseString( $text, $wgTidyConf, 'utf8' ); + + if ( $stderr ) { + $retval = $tidy->getStatus(); + + wfProfileOut( __METHOD__ ); + return $tidy->errorBuffer; + } + + $tidy->cleanRepair(); + $retval = $tidy->getStatus(); + if ( $retval == 2 ) { + // 2 is magic number for fatal error + // http://www.php.net/manual/en/function.tidy-get-status.php + $cleansource = null; + } else { + $cleansource = tidy_get_output( $tidy ); + if ( $wgDebugTidy && $retval > 0 ) { + $cleansource .= "', '-->', $tidy->errorBuffer ) . + "\n-->"; + } + } + + wfProfileOut( __METHOD__ ); + return $cleansource; + } +} -- cgit v1.2.3-54-g00ecf