diff options
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r-- | includes/Sanitizer.php | 111 |
1 file changed, 69 insertions, 42 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 96193a74..de63af79 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -346,12 +346,9 @@ class Sanitizer { ($space*=$space* (?: # The attribute value: quoted or alone - \"([^<\"]*)\" - | '([^<']*)' + \"([^<\"]*)(?:\"|\$) + | '([^<']*)(?:'|\$) | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) - | (\#[0-9a-fA-F]+) # Technically wrong, but lots of - # colors are specified like this. - # We'll be normalizing it. ) )?(?=$space|\$)/sx"; } @@ -359,20 +356,13 @@ class Sanitizer { } /** - * Cleans up HTML, removes dangerous tags and attributes, and - * removes HTML comments - * @param string $text - * @param callable $processCallback Callback to do any variable or parameter - * replacements in HTML attribute values - * @param array|bool $args Arguments for the processing callback + * Return the various lists of recognized tags * @param array $extratags For any extra tags to include * @param array $removetags For any tags (default or extra) to exclude - * @return string + * @return array */ - public static function removeHTMLtags( $text, $processCallback = null, - $args = array(), $extratags = array(), $removetags = array() - ) { - global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag; + public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) { + global $wgAllowMicrodataAttributes, $wgAllowImageTag; static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; @@ -381,7 +371,6 @@ class Sanitizer { // are changed (like in the screwed up test system) we will re-initialise the settings. 
$globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); if ( !$staticInitialised || $staticInitialised != $globalContext ) { - $htmlpairsStatic = array( # Tags that must be closed 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', @@ -431,17 +420,47 @@ class Sanitizer { } $staticInitialised = $globalContext; } + # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays $extratags = array_flip( $extratags ); $removetags = array_flip( $removetags ); $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags ); + return array( + 'htmlpairs' => $htmlpairs, + 'htmlsingle' => $htmlsingle, + 'htmlsingleonly' => $htmlsingleonly, + 'htmlnest' => $htmlnest, + 'tabletags' => $tabletags, + 'htmllist' => $htmllist, + 'listtags' => $listtags, + 'htmlsingleallowed' => $htmlsingleallowed, + 'htmlelements' => $htmlelements, + ); + } + + /** + * Cleans up HTML, removes dangerous tags and attributes, and + * removes HTML comments + * @param string $text + * @param callable $processCallback Callback to do any variable or parameter + * replacements in HTML attribute values + * @param array|bool $args Arguments for the processing callback + * @param array $extratags For any extra tags to include + * @param array $removetags For any tags (default or extra) to exclude + * @return string + */ + public static function removeHTMLtags( $text, $processCallback = null, + $args = array(), $extratags = array(), $removetags = array() + ) { + extract( self::getRecognizedTagData( $extratags, $removetags ) ); + # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); - if ( !$wgUseTidy ) { + if ( !MWTidy::isEnabled() ) { $tagstack = $tablestack = array(); foreach ( 
$bits as $x ) { $regs = array(); @@ -463,9 +482,9 @@ class Sanitizer { $badtag = true; } elseif ( $slash ) { # Closing a tag... is it the one we just opened? - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); $ot = array_pop( $tagstack ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); if ( $ot != $t ) { if ( isset( $htmlsingleallowed[$ot] ) ) { @@ -473,32 +492,32 @@ class Sanitizer { # and see if we find a match below them $optstack = array(); array_push( $optstack, $ot ); - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); $ot = array_pop( $tagstack ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { array_push( $optstack, $ot ); - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); $ot = array_pop( $tagstack ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); } if ( $t != $ot ) { # No match. Push the optional elements back again $badtag = true; - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); $ot = array_pop( $optstack ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); while ( $ot ) { array_push( $tagstack, $ot ); - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); $ot = array_pop( $optstack ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); } } } else { - wfSuppressWarnings(); + MediaWiki\suppressWarnings(); array_push( $tagstack, $ot ); - wfRestoreWarnings(); + MediaWiki\restoreWarnings(); # <li> can be nested in <ul> or <ol>, skip those cases: if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) { @@ -729,7 +748,7 @@ class Sanitizer { } # Allow any attribute beginning with "data-" - if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) { + if ( !preg_match( '/^data-(?!ooui)/i', $attribute ) && !isset( $whitelist[$attribute] ) ) { continue; } @@ -942,7 +961,8 @@ class Sanitizer { $value = self::normalizeCss( $value ); // Reject problematic keywords and control characters - if ( preg_match( 
'/[\000-\010\013\016-\037\177]/', $value ) ) { + if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) || + strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) { return '/* invalid control char */'; } elseif ( preg_match( '! expression @@ -1239,10 +1259,7 @@ class Sanitizer { * @return string */ private static function getTagAttributeCallback( $set ) { - if ( isset( $set[6] ) ) { - # Illegal #XXXXXX color with no quotes. - return $set[6]; - } elseif ( isset( $set[5] ) ) { + if ( isset( $set[5] ) ) { # No quotes. return $set[5]; } elseif ( isset( $set[4] ) ) { @@ -1252,9 +1269,10 @@ class Sanitizer { # Double-quoted return $set[3]; } elseif ( !isset( $set[2] ) ) { - # In XHTML, attributes must have a value. - # For 'reduced' form, return explicitly the attribute name here. - return $set[1]; + # In XHTML, attributes must have a value so return an empty string. + # See "Empty attribute syntax", + # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name + return ""; } else { throw new MWException( "Tag conditions not met. This should never happen and is a bug." ); } @@ -1374,15 +1392,19 @@ class Sanitizer { } /** - * Returns true if a given Unicode codepoint is a valid character in XML. + * Returns true if a given Unicode codepoint is a valid character in + * both HTML5 and XML. * @param int $codepoint * @return bool */ private static function validateCodepoint( $codepoint ) { + # U+000C is valid in HTML5 but not allowed in XML. + # U+000D is valid in XML but not allowed in HTML5. + # U+007F - U+009F are disallowed in HTML5 (control characters). 
return $codepoint == 0x09 || $codepoint == 0x0a - || $codepoint == 0x0d - || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) + || ( $codepoint >= 0x20 && $codepoint <= 0x7e ) + || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff ) || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ); } @@ -1784,6 +1806,11 @@ class Sanitizer { $host = preg_replace( $strip, '', $host ); + // IPv6 host names are bracketed with []. Url-decode these. + if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches ) ) { + $host = '//[' . $matches[1] . ']' . $matches[2]; + } + // @todo FIXME: Validate hostnames here return $protocol . $host . $rest; |