diff options
Diffstat (limited to 'includes/tidy')
-rw-r--r-- | includes/tidy/Html5Depurate.php | 45 | ||||
-rw-r--r-- | includes/tidy/RaggettBase.php | 47 | ||||
-rw-r--r-- | includes/tidy/RaggettExternal.php | 73 | ||||
-rw-r--r-- | includes/tidy/RaggettInternalHHVM.php | 29 | ||||
-rw-r--r-- | includes/tidy/RaggettInternalPHP.php | 52 | ||||
-rw-r--r-- | includes/tidy/RaggettWrapper.php | 89 | ||||
-rw-r--r-- | includes/tidy/TidyDriverBase.php | 40 | ||||
-rw-r--r-- | includes/tidy/tidy.conf | 22 |
8 files changed, 397 insertions, 0 deletions
diff --git a/includes/tidy/Html5Depurate.php b/includes/tidy/Html5Depurate.php
new file mode 100644
index 00000000..23e445fa
--- /dev/null
+++ b/includes/tidy/Html5Depurate.php
@@ -0,0 +1,62 @@
+<?php
+
+namespace MediaWiki\Tidy;
+use MWHttpRequest;
+use Exception;
+
+/**
+ * Tidy driver that sends HTML to an Html5Depurate web service over HTTP.
+ */
+class Html5Depurate extends TidyDriverBase {
+	/**
+	 * @param array $config Driver settings; missing keys fall back to the
+	 *   defaults below ('url', 'timeout', 'connectTimeout').
+	 */
+	public function __construct( array $config ) {
+		parent::__construct( $config + array(
+			'url' => 'http://localhost:4339/document',
+			'timeout' => 10,
+			'connectTimeout' => 0.5,
+		) );
+	}
+
+	/**
+	 * Wrap the fragment in a minimal HTML document, POST it to the service
+	 * and return the contents of the <body> element of the response.
+	 *
+	 * @param string $text HTML fragment to clean up
+	 * @return string Cleaned fragment, or the input plus a warning comment
+	 *   if no <body>...</body> pair could be found in the response
+	 * @throws Exception If the request fails or the response status is not 200
+	 */
+	public function tidy( $text ) {
+		$wrappedtext = '<!DOCTYPE html><html>' .
+			'<body>' . $text . '</body></html>';
+
+		$req = MWHttpRequest::factory( $this->config['url'],
+			array(
+				'method' => 'POST',
+				'timeout' => $this->config['timeout'],
+				'connectTimeout' => $this->config['connectTimeout'],
+				'postData' => array(
+					'text' => $wrappedtext
+				)
+			) );
+		$status = $req->execute();
+		if ( !$status->isOK() ) {
+			throw new Exception( "Error contacting depurate service: " . $status->getWikiText() );
+		} elseif ( $req->getStatus() !== 200 ) {
+			throw new Exception( "Depurate returned error: " . $status->getWikiText() );
+		}
+		$result = $req->getContent();
+		$startBody = strpos( $result, "<body>" );
+		// strrpos: use the last "</body>" in the response as the end of the body
+		$endBody = strrpos( $result, "</body>" );
+		if ( $startBody !== false && $endBody !== false && $endBody > $startBody ) {
+			$startBody += strlen( "<body>" );
+			return substr( $result, $startBody, $endBody - $startBody );
+		} else {
+			return $text . "\n<!-- Html5Depurate returned an invalid result -->";
+		}
+	}
+}
diff --git a/includes/tidy/RaggettBase.php b/includes/tidy/RaggettBase.php
new file mode 100644
index 00000000..a3717b2b
--- /dev/null
+++ b/includes/tidy/RaggettBase.php
@@ -0,0 +1,58 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Shared tidy()/validate() logic for drivers backed by Dave Raggett's HTML tidy.
+ */
+abstract class RaggettBase extends TidyDriverBase {
+	/**
+	 * Generic interface for wrapping and unwrapping HTML for Dave Raggett's tidy.
+	 *
+	 * @param string $text Hideous HTML input
+	 * @return string Corrected HTML output, or the input followed by an HTML
+	 *   comment when tidy could not run or produced no output
+	 */
+	public function tidy( $text ) {
+		$wrapper = new RaggettWrapper;
+		$wrappedtext = $wrapper->getWrapped( $text );
+
+		$retVal = null;
+		$correctedtext = $this->cleanWrapped( $wrappedtext, false, $retVal );
+
+		if ( $retVal < 0 ) {
+			wfDebug( "Possible tidy configuration error!\n" );
+			return $text . "\n<!-- Tidy was unable to run -->\n";
+		} elseif ( $correctedtext === null ) {
+			wfDebug( "Tidy error detected!\n" );
+			return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
+		}
+
+		// Restore any tokens the wrapper hid from tidy
+		return $wrapper->postprocess( $correctedtext );
+	}
+
+	/**
+	 * Check HTML for errors by running tidy in error-reporting mode.
+	 *
+	 * @param string $text HTML to check
+	 * @param string &$errorStr Receives the error output of cleanWrapped()
+	 * @return bool True if the exit code is 0, or if it is negative and the
+	 *   error output is empty
+	 */
+	public function validate( $text, &$errorStr ) {
+		$retval = 0;
+		$errorStr = $this->cleanWrapped( $text, true, $retval );
+		return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
+	}
+
+	/**
+	 * Perform a clean/repair operation
+	 * @param string $text HTML to check
+	 * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+	 * @param int &$retval Exit code (-1 on internal error)
+	 * @return null|string
+	 * @throws \Exception
+	 */
+	abstract protected function cleanWrapped( $text, $stderr = false, &$retval = null );
+}
diff --git a/includes/tidy/RaggettExternal.php b/includes/tidy/RaggettExternal.php
new file mode 100644
index 00000000..11933188
--- /dev/null
+++ b/includes/tidy/RaggettExternal.php
@@ -0,0 +1,79 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Tidy driver that runs the tidy binary as an external process.
+ */
+class RaggettExternal extends RaggettBase {
+	/**
+	 * Spawn an external HTML tidy process and get corrected markup back from it.
+	 * Also called in OutputHandler.php for full page validation
+	 *
+	 * @param string $text HTML to check
+	 * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+	 * @param int &$retval Exit code (-1 on internal error)
+	 * @return string|null
+	 */
+	protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+		$cleansource = '';
+		$opts = ' -utf8';
+
+		if ( $stderr ) {
+			$descriptorspec = array(
+				0 => array( 'pipe', 'r' ),
+				1 => array( 'file', wfGetNull(), 'a' ),
+				2 => array( 'pipe', 'w' )
+			);
+		} else {
+			$descriptorspec = array(
+				0 => array( 'pipe', 'r' ),
+				1 => array( 'pipe', 'w' ),
+				2 => array( 'file', wfGetNull(), 'a' )
+			);
+		}
+
+		$readpipe = $stderr ? 2 : 1;
+		$pipes = array();
+
+		$process = proc_open(
+			"{$this->config['tidyBin']} -config {$this->config['tidyConfigFile']} " .
+			$this->config['tidyCommandLine'] . $opts, $descriptorspec, $pipes );
+
+		//NOTE: At least on linux, the process will be created even if tidy is not installed.
+		//      This means that missing tidy will be treated as a validation failure.
+
+		if ( is_resource( $process ) ) {
+			// Theoretically, this style of communication could cause a deadlock
+			// here. If the stdout buffer fills up, then writes to stdin could
+			// block. This doesn't appear to happen with tidy, because tidy only
+			// writes to stdout after it's finished reading from stdin. Search
+			// for tidyParseStdin and tidySaveStdout in console/tidy.c
+			fwrite( $pipes[0], $text );
+			fclose( $pipes[0] );
+			while ( !feof( $pipes[$readpipe] ) ) {
+				$cleansource .= fgets( $pipes[$readpipe], 1024 );
+			}
+			fclose( $pipes[$readpipe] );
+			$retval = proc_close( $process );
+		} else {
+			wfWarn( "Unable to start external tidy process" );
+			$retval = -1;
+		}
+
+		if ( !$stderr && $cleansource == '' && $text != '' ) {
+			// Some kind of error happened, so we couldn't get the corrected text.
+			// Just give up; we'll use the source text and append a warning.
+			$cleansource = null;
+		}
+
+		return $cleansource;
+	}
+
+	/**
+	 * @return bool Always true: with $stderr set, cleanWrapped() returns the process's STDERR output
+	 */
+	public function supportsValidate() {
+		return true;
+	}
+}
diff --git a/includes/tidy/RaggettInternalHHVM.php b/includes/tidy/RaggettInternalHHVM.php
new file mode 100644
index 00000000..2a3986df
--- /dev/null
+++ b/includes/tidy/RaggettInternalHHVM.php
@@ -0,0 +1,34 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Tidy driver that calls tidy_repair_string() in-process.
+ */
+class RaggettInternalHHVM extends RaggettBase {
+	/**
+	 * Use the HTML tidy extension to use the tidy library in-process,
+	 * saving the overhead of spawning a new process.
+	 *
+	 * @param string $text HTML to check
+	 * @param bool $stderr Must be false; reading the error status is not supported here
+	 * @param int &$retval Exit code (-1 on internal error)
+	 * @return string|null
+	 * @throws \Exception If $stderr is true
+	 */
+	protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+		if ( $stderr ) {
+			// Fully qualified: a bare "Exception" would resolve to MediaWiki\Tidy\Exception
+			throw new \Exception( "\$stderr cannot be used with RaggettInternalHHVM" );
+		}
+		$cleansource = tidy_repair_string( $text, $this->config['tidyConfigFile'], 'utf8' );
+		if ( $cleansource === false ) {
+			$cleansource = null;
+			$retval = -1;
+		} else {
+			$retval = 0;
+		}
+
+		return $cleansource;
+	}
+}
diff --git a/includes/tidy/RaggettInternalPHP.php b/includes/tidy/RaggettInternalPHP.php
new file mode 100644
index 00000000..1ce14b60
--- /dev/null
+++ b/includes/tidy/RaggettInternalPHP.php
@@ -0,0 +1,60 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Tidy driver that uses the PHP tidy extension class in-process.
+ */
+class RaggettInternalPHP extends RaggettBase {
+	/**
+	 * Use the HTML tidy extension to use the tidy library in-process,
+	 * saving the overhead of spawning a new process.
+	 *
+	 * @param string $text HTML to check
+	 * @param bool $stderr Whether to read result from error status instead of output
+	 * @param int &$retval Exit code (-1 on internal error)
+	 * @return string|null
+	 */
+	protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+		if ( !class_exists( 'tidy' ) ) {
+			wfWarn( "Unable to load internal tidy class." );
+			$retval = -1;
+
+			return null;
+		}
+
+		$tidy = new \tidy;
+		$tidy->parseString( $text, $this->config['tidyConfigFile'], 'utf8' );
+
+		if ( $stderr ) {
+			$retval = $tidy->getStatus();
+			return $tidy->errorBuffer;
+		}
+
+		$tidy->cleanRepair();
+		$retval = $tidy->getStatus();
+		if ( $retval == 2 ) {
+			// 2 is magic number for fatal error
+			// http://www.php.net/manual/en/function.tidy-get-status.php
+			$cleansource = null;
+		} else {
+			$cleansource = tidy_get_output( $tidy );
+			if ( !empty( $this->config['debugComment'] ) && $retval > 0 ) {
+				// Escape "-->" in tidy's report so it cannot terminate the
+				// surrounding HTML comment early
+				$cleansource .= "<!--\nTidy reports:\n" .
+					str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
+					"\n-->";
+			}
+		}
+
+		return $cleansource;
+	}
+
+	/**
+	 * @return bool Always true: with $stderr set, cleanWrapped() returns tidy's error buffer
+	 */
+	public function supportsValidate() {
+		return true;
+	}
+}
diff --git a/includes/tidy/RaggettWrapper.php b/includes/tidy/RaggettWrapper.php
new file mode 100644
index 00000000..083f4020
--- /dev/null
+++ b/includes/tidy/RaggettWrapper.php
@@ -0,0 +1,96 @@
+<?php
+namespace MediaWiki\Tidy;
+
+use ReplacementArray;
+use ParserOutput;
+use Parser;
+
+/**
+ * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
+ * or break on them. This is a bit of a hack for now, but hopefully in the future
+ * we may create a real postprocessor or something that will replace this.
+ * It's called wrapper because for now it basically takes over MWTidy::tidy's task
+ * of wrapping the text in a xhtml block
+ *
+ * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
+ * duplicated. Perhaps we should create an abstract marker hiding class.
+ *
+ * @ingroup Parser
+ */
+class RaggettWrapper {
+
+	/**
+	 * @var ReplacementArray|null Marker => original markup pairs; null until getWrapped() runs
+	 */
+	protected $mTokens;
+
+	/**
+	 * @var int Index used to build the next marker
+	 */
+	protected $mMarkerIndex;
+
+	public function __construct() {
+		$this->mTokens = null;
+	}
+
+	/**
+	 * Hide markup that tidy must not touch and wrap the result in an XHTML document.
+	 *
+	 * @param string $text HTML fragment
+	 * @return string Complete document to feed to tidy
+	 */
+	public function getWrapped( $text ) {
+		$this->mTokens = new ReplacementArray;
+		$this->mMarkerIndex = 0;
+
+		// Replace <mw:editsection> elements with placeholders
+		$wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
+			array( $this, 'replaceCallback' ), $text );
+		// ...and <mw:toc> markers
+		$wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
+			array( $this, 'replaceCallback' ), $wrappedtext );
+		// ... and <math> tags
+		$wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
+			array( $this, 'replaceCallback' ), $wrappedtext );
+		// Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
+		// we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
+		$wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
+
+		// Wrap the whole thing in a doctype and body for Tidy.
+		$wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
+			' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
+			'<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';
+
+		return $wrappedtext;
+	}
+
+	/**
+	 * preg_replace_callback() handler: store the match and return a unique marker for it.
+	 *
+	 * @param array $m Regex match; $m[0] is the full matched text
+	 * @return string Marker that stands in for the match
+	 */
+	public function replaceCallback( $m ) {
+		$marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
+		$this->mMarkerIndex++;
+		$this->mTokens->setPair( $marker, $m[0] );
+		return $marker;
+	}
+
+	/**
+	 * Undo the substitutions made by getWrapped() on tidy's output.
+	 *
+	 * @param string $text Tidy output
+	 * @return string Text with <link>/<meta> names and hidden tokens restored
+	 */
+	public function postprocess( $text ) {
+		// Revert <html-{link,meta}> back to <{link,meta}>
+		$text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
+
+		// Restore the contents of placeholder tokens
+		$text = $this->mTokens->replace( $text );
+
+		return $text;
+	}
+
+}
diff --git a/includes/tidy/TidyDriverBase.php b/includes/tidy/TidyDriverBase.php
new file mode 100644
index 00000000..1d994aa1
--- /dev/null
+++ b/includes/tidy/TidyDriverBase.php
@@ -0,0 +1,48 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Base class for HTML cleanup utilities
+ */
+abstract class TidyDriverBase {
+	/** @var array Driver configuration passed to the constructor */
+	protected $config;
+
+	/**
+	 * @param array $config Driver configuration, stored as-is
+	 */
+	public function __construct( $config ) {
+		$this->config = $config;
+	}
+
+	/**
+	 * Return true if validate() can be used
+	 *
+	 * @return bool
+	 */
+	public function supportsValidate() {
+		return false;
+	}
+
+	/**
+	 * Check HTML for errors, used if $wgValidateAllHtml = true.
+	 *
+	 * @param string $text
+	 * @param string &$errorStr Return the error string
+	 * @return bool Whether the HTML is valid
+	 * @throws \MWException Always, unless overridden by a subclass
+	 */
+	public function validate( $text, &$errorStr ) {
+		// Fully qualified: a bare "MWException" would resolve to MediaWiki\Tidy\MWException
+		throw new \MWException( get_class( $this ) . " does not support validate()" );
+	}
+
+	/**
+	 * Clean up HTML
+	 *
+	 * @param string $text HTML document fragment to clean up
+	 * @return string The corrected HTML output
+	 */
+	abstract public function tidy( $text );
+}
diff --git a/includes/tidy/tidy.conf b/includes/tidy/tidy.conf
new file mode 100644
index 00000000..4c4daed5
--- /dev/null
+++ b/includes/tidy/tidy.conf
@@ -0,0 +1,22 @@
+# html tidy (http://tidy.sf.net) configuration
+# tidy - validate, correct, and pretty-print HTML files
+# see: man 1 tidy, http://tidy.sourceforge.net/docs/quickref.html
+
+show-body-only: yes
+force-output: yes
+tidy-mark: no
+wrap: 0
+wrap-attributes: no
+literal-attributes: yes
+output-xhtml: yes
+numeric-entities: yes
+enclose-text: yes
+enclose-block-text: yes
+quiet: yes
+quote-nbsp: yes
+fix-backslash: no
+fix-uri: no
+# Don't strip html5 elements we support
+# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
+new-empty-tags: html-meta, html-link, wbr
+new-inline-tags: video, audio, source, track, bdi, data, time, mark