html = $html; }

	/**
	 * Turns a chunk of HTML into a proper document
	 *
	 * The fragment is placed inside the <body> of a minimal HTML document so
	 * that it can be loaded as a whole document.
	 *
	 * @param string $html HTML fragment
	 * @return string Complete HTML document containing the fragment
	 */
	public static function wrapHTML( $html ) {
		return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	protected function onHtmlReady( $html ) {
		return $html;
	}

	/**
	 * Lazily builds the DOM for the HTML held by this object and returns it.
	 * The document is created on first call and reused afterwards.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	public function getDoc() {
		if ( !$this->doc ) {
			// DOMDocument::loadHTML apparently isn't very good with encodings, so
			// convert input to ASCII by encoding everything above 128 as entities.
			// NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of PHP 8.2.
			if ( function_exists( 'mb_convert_encoding' ) ) {
				$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
			} else {
				$html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
					return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
				}, $this->html );
			}

			// Workaround for bug that caused spaces before references
			// to disappear during processing:
			// https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
			//
			// A space directly in front of a tag is written as a numeric entity
			// so that the parser cannot drop it as insignificant whitespace.
			//
			// Please replace with a better fix if one can be found.
			$html = str_replace( ' <', '&#32;<', $html );

			// Collect parse warnings internally instead of emitting PHP warnings
			// for the malformed markup we expect to see.
			libxml_use_internal_errors( true );
			// NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0.
			$loader = libxml_disable_entity_loader();
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			// Put the entity loader back the way we found it
			libxml_disable_entity_loader( $loader );
			libxml_use_internal_errors( false );
			$this->doc->encoding = 'UTF-8';
		}

		return $this->doc;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag
	 */
	public function setRemoveMedia( $flag = true ) {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.<class>
	 *   .<class>
*   #<id>
	 *
	 * @param array|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ) {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept undelimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param array|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ) {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags
	 */
	public function flattenAllTags() {
		$this->flatten( '[?!]?[a-z0-9]+' );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 *
	 * Class selectors are matched against the whitespace-separated tokens of the
	 * "class" attribute, so ".foo" matches class="foo bar" but not class="foo-bar".
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent() {
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do: every selector bucket is empty
		if ( !array_filter( $removals ) ) {
			return array();
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = array();
		foreach ( $removals['TAG'] as $tagToRemove ) {
			foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = array();
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = array();
		if ( $removals['CLASS'] ) {
			// The class name never becomes part of the XPath expression or of a
			// regex, so quotes and metacharacters in a selector cannot break the
			// query; matching happens in elementHasClass().
			$xpath = new DOMXPath( $doc );
			$classedElements = iterator_to_array( $xpath->query( '//*[@class]' ), false );
			foreach ( $removals['CLASS'] as $classToRemove ) {
				/** @var $element DOMElement */
				foreach ( $classedElements as $element ) {
					if ( $element->parentNode && $this->elementHasClass( $element, $classToRemove ) ) {
						$domElemsToRemove[] = $element;
					}
				}
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			// Selector has the form "tag.class"
			$parts = explode( '.', $classToRemove );
			$domElemsToRemove = array();
			foreach ( $doc->getElementsByTagName( $parts[0] ) as $element ) {
				if ( $this->elementHasClass( $element, $parts[1] ) ) {
					$domElemsToRemove[] = $element;
				}
			}
			$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
		}

		return $removed;
	}

	/**
	 * Checks whether an element carries the given CSS class, comparing against
	 * the whitespace-separated tokens of its "class" attribute.
	 *
	 * @param DOMElement $element Element to inspect
	 * @param string $class Class name to look for
	 * @return bool
	 */
	private function elementHasClass( DOMElement $element, $class ) {
		$tokens = preg_split( '/\s+/', $element->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY );

		return in_array( $class, $tokens, true );
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param array|DOMNodeList $elements
	 * @return array Array of removed elements
	 */
	private function removeElements( $elements ) {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into a plain array first: the node list must not be
			// modified while it is being iterated.
			$list = iterator_to_array( $elements, false );
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			// Skip nodes that are already detached
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}

		return $list;
	}

	/**
	 * libxml in its usual pointlessness converts many chars to entities - this function
	 * performs a reverse conversion
	 * @param string $html
	 * @return string
	 */
	private function fixLibXML( $html ) {
		static $replacements;
		if ( !$replacements ) {
			// The entity decoding below would turn the markup-significant
			// entities into raw characters, so escape them once more first;
			// the decode then brings them back to their original form.
			//
			// We don't include rules for numeric forms such as '&#34;' because entities had already been
			// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
			$replacements = new ReplacementArray( array(
				'&quot;' => '&amp;quot;',
				'&amp;' => '&amp;amp;',
				'&lt;' => '&amp;lt;',
				'&gt;' => '&amp;gt;',
			) );
		}
		$html = $replacements->replace( $html );

		if ( function_exists( 'mb_convert_encoding' ) ) {
			// Just in case the conversion in getDoc() above used named
			// entities that aren't known to html_entity_decode().
			$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
		} else {
			$html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
		}

		return $html;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
*
	 * @param DOMElement|string|null $element Element, or ID of element, to get HTML from;
	 *   null (the default) to get it from the whole tree
	 * @return string Processed HTML
	 */
	public function getText( $element = null ) {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( $element ) {
				// Make the requested element the only child of <body>.
				$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
				// Snapshot the children first; removing while iterating the
				// child list would skip nodes.
				$nodesArray = array();
				foreach ( $body->childNodes as $node ) {
					$nodesArray[] = $node;
				}
				foreach ( $nodesArray as $nodeArray ) {
					$body->removeChild( $nodeArray );
				}
				$body->appendChild( $element );
			}
			$html = $this->doc->saveHTML();
			$html = $this->fixLibXML( $html );
			if ( wfIsWindows() ) {
				// Cleanup for CRLF misprocessing of unknown origin on Windows.
				//
				// If this error continues in the future, please track it down in the
				// XML code paths if possible and fix there.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			// No DOM was ever built, so the input is still untouched
			$html = $this->html;
		}

		// Remove stuff added by wrapHTML(): everything up to and including the
		// opening <body> tag and everything from </body> on. HTML comments are
		// dropped as well.
		$html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
		$html = $this->onHtmlReady( $html );

		if ( $this->elementsToFlatten ) {
			// Strip opening and closing tags of the listed elements, keeping
			// their content. Entries are undelimited regexes, see flatten().
			$elements = implode( '|', $this->elementsToFlatten );
			$html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 *
	 * For 'CLASS' and 'ID' the leading '.' or '#' is stripped from $rawName; for
	 * 'TAG_CLASS' and 'TAG' $rawName is the selector as given.
	 *
	 * @param string $selector CSS selector to parse
	 * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string $rawName The raw name of the selector
	 * @return bool Always true; unrecognised selectors raise an exception instead
	 * @throws MWException If the selector is not of a supported form
	 */
	protected function parseSelector( $selector, &$type, &$rawName ) {
		$dotPos = strpos( $selector, '.' );

		if ( $dotPos === 0 ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '#' ) === 0 ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( $dotPos !== false ) {
			// A dot somewhere after the first character: "tag.class"
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 *
	 * @return array Lists of raw selector names keyed by 'ID', 'TAG', 'CLASS' and 'TAG_CLASS'
	 */
	protected function parseItemsToRemove() {
		$removals = array(
			'ID' => array(),
			'TAG' => array(),
			'CLASS' => array(),
			'TAG_CLASS' => array(),
		);

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		// Media removal is expressed as three extra tag selectors
		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}