From 4ac9fa081a7c045f6a9f1cfc529d82423f485b2e Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Sun, 8 Dec 2013 09:55:49 +0100 Subject: Update to MediaWiki 1.22.0 --- includes/HtmlFormatter.php | 356 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 includes/HtmlFormatter.php (limited to 'includes/HtmlFormatter.php') diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php new file mode 100644 index 00000000..248a76fe --- /dev/null +++ b/includes/HtmlFormatter.php @@ -0,0 +1,356 @@ +html = $html; + } + + /** + * Turns a chunk of HTML into a proper document + * @param string $html + * @return string + */ + public static function wrapHTML( $html ) { + return '' . $html . ''; + } + + /** + * Override this in descendant class to modify HTML after it has been converted from DOM tree + * @param string $html: HTML to process + * @return string: Processed HTML + */ + protected function onHtmlReady( $html ) { + return $html; + } + + /** + * @return DOMDocument: DOM to manipulate + */ + public function getDoc() { + if ( !$this->doc ) { + $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); + + // Workaround for bug that caused spaces before references + // to disappear during processing: + // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086 + // + // Please replace with a better fix if one can be found. + $html = str_replace( ' <', ' <', $html ); + + libxml_use_internal_errors( true ); + $loader = libxml_disable_entity_loader(); + $this->doc = new DOMDocument(); + $this->doc->strictErrorChecking = false; + $this->doc->loadHTML( $html ); + libxml_disable_entity_loader( $loader ); + libxml_use_internal_errors( false ); + $this->doc->encoding = 'UTF-8'; + } + return $this->doc; + } + + /** + * Sets whether images/videos/sounds should be removed from output + * @param bool $flag + */ + public function setRemoveMedia( $flag = true ) { + $this->removeMedia = $flag; + } + + /** + * Adds one or more selector of content to remove. A subset of CSS selector + * syntax is supported: + * + * + * .class + * . + * # + * + * @param Array|string $selectors: Selector(s) of stuff to remove + */ + public function remove( $selectors ) { + $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); + } + + /** + * Adds one or more element name to the list to flatten (remove tag, but not its content) + * Can accept undelimited regexes + * + * Note this interface may fail in surprising unexpected ways due to usage of regexes, + * so should not be relied on for HTML markup security measures. + * + * @param Array|string $elements: Name(s) of tag(s) to flatten + */ + public function flatten( $elements ) { + $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); + } + + /** + * Instructs the formatter to flatten all tags + */ + public function flattenAllTags() { + $this->flatten( '[?!]?[a-z0-9]+' ); + } + + /** + * Removes content we've chosen to remove + */ + public function filterContent() { + wfProfileIn( __METHOD__ ); + $removals = $this->parseItemsToRemove(); + + if ( !$removals ) { + wfProfileOut( __METHOD__ ); + return; + } + + $doc = $this->getDoc(); + + // Remove tags + + // You can't remove DOMNodes from a DOMNodeList as you're iterating + // over them in a foreach loop. It will seemingly leave the internal + // iterator on the foreach out of wack and results will be quite + // strange. Though, making a queue of items to remove seems to work. + $domElemsToRemove = array(); + foreach ( $removals['TAG'] as $tagToRemove ) { + $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove ); + foreach ( $tagToRemoveNodes as $tagToRemoveNode ) { + if ( $tagToRemoveNode ) { + $domElemsToRemove[] = $tagToRemoveNode; + } + } + } + + $this->removeElements( $domElemsToRemove ); + + // Elements with named IDs + $domElemsToRemove = array(); + foreach ( $removals['ID'] as $itemToRemove ) { + $itemToRemoveNode = $doc->getElementById( $itemToRemove ); + if ( $itemToRemoveNode ) { + $domElemsToRemove[] = $itemToRemoveNode; + } + } + $this->removeElements( $domElemsToRemove ); + + // CSS Classes + $domElemsToRemove = array(); + $xpath = new DOMXpath( $doc ); + foreach ( $removals['CLASS'] as $classToRemove ) { + $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); + + /** @var $element DOMElement */ + foreach ( $elements as $element ) { + $classes = $element->getAttribute( 'class' ); + if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) { + $domElemsToRemove[] = $element; + } + } + } + $this->removeElements( $domElemsToRemove ); + + // Tags with CSS Classes + foreach ( $removals['TAG_CLASS'] as $classToRemove ) { + $parts = explode( '.', $classToRemove ); + + $elements = $xpath->query( + '//' . $parts[0] . '[@class="' . $parts[1] . '"]' + ); + + $this->removeElements( $elements ); + } + + wfProfileOut( __METHOD__ ); + } + + /** + * Removes a list of elelments from DOMDocument + * @param array|DOMNodeList $elements + */ + private function removeElements( $elements ) { + $list = $elements; + if ( $elements instanceof DOMNodeList ) { + $list = array(); + foreach ( $elements as $element ) { + $list[] = $element; + } + } + /** @var $element DOMElement */ + foreach ( $list as $element ) { + if ( $element->parentNode ) { + $element->parentNode->removeChild( $element ); + } + } + } + + /** + * libxml in its usual pointlessness converts many chars to entities - this function + * perfoms a reverse conversion + * @param string $html + * @return string + */ + private function fixLibXML( $html ) { + wfProfileIn( __METHOD__ ); + static $replacements; + if ( ! $replacements ) { + // We don't include rules like '"' => '&quot;' because entities had already been + // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! + $replacements = new ReplacementArray( array( + '"' => '&quot;', + '&' => '&amp;', + '<' => '&lt;', + '>' => '&gt;', + ) ); + } + $html = $replacements->replace( $html ); + $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); + wfProfileOut( __METHOD__ ); + return $html; + } + + /** + * Performs final transformations and returns resulting HTML + * + * @param DOMElement|string|null $element: ID of element to get HTML from or false to get it from the whole tree + * @return string: Processed HTML + */ + public function getText( $element = null ) { + wfProfileIn( __METHOD__ ); + + if ( $this->doc ) { + if ( $element !== null && !( $element instanceof DOMElement ) ) { + $element = $this->doc->getElementById( $element ); + } + if ( $element ) { + $body = $this->doc->getElementsByTagName( 'body' )->item( 0 ); + $nodesArray = array(); + foreach ( $body->childNodes as $node ) { + $nodesArray[] = $node; + } + foreach ( $nodesArray as $nodeArray ) { + $body->removeChild( $nodeArray ); + } + $body->appendChild( $element ); + } + $html = $this->doc->saveHTML(); + $html = $this->fixLibXml( $html ); + } else { + $html = $this->html; + } + if ( wfIsWindows() ) { + // Appears to be cleanup for CRLF misprocessing of unknown origin + // when running server on Windows platform. + // + // If this error continues in the future, please track it down in the + // XML code paths if possible and fix there. + $html = str_replace( ' ', '', $html ); + } + $html = preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); + $html = $this->onHtmlReady( $html ); + + if ( $this->elementsToFlatten ) { + $elements = implode( '|', $this->elementsToFlatten ); + $html = preg_replace( "#]*>#is", '', $html ); + } + + wfProfileOut( __METHOD__ ); + return $html; + } + + /** + * @param $selector: CSS selector to parse + * @param $type + * @param $rawName + * @return bool: Whether the selector was successfully recognised + */ + protected function parseSelector( $selector, &$type, &$rawName ) { + if ( strpos( $selector, '.' ) === 0 ) { + $type = 'CLASS'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '#' ) === 0 ) { + $type = 'ID'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '.' ) !== 0 && + strpos( $selector, '.' ) !== false ) + { + $type = 'TAG_CLASS'; + $rawName = $selector; + } elseif ( strpos( $selector, '[' ) === false + && strpos( $selector, ']' ) === false ) + { + $type = 'TAG'; + $rawName = $selector; + } else { + throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" ); + } + + return true; + } + + /** + * Transforms CSS selectors into an internal representation suitable for processing + * @return array + */ + protected function parseItemsToRemove() { + wfProfileIn( __METHOD__ ); + $removals = array( + 'ID' => array(), + 'TAG' => array(), + 'CLASS' => array(), + 'TAG_CLASS' => array(), + ); + + foreach ( $this->itemsToRemove as $itemToRemove ) { + $type = ''; + $rawName = ''; + if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) { + $removals[$type][] = $rawName; + } + } + + if ( $this->removeMedia ) { + $removals['TAG'][] = 'img'; + $removals['TAG'][] = 'audio'; + $removals['TAG'][] = 'video'; + } + + wfProfileOut( __METHOD__ ); + return $removals; + } +} -- cgit v1.2.3-54-g00ecf