From cecb985bee3bdd252e1b8dc0bd500b37cd52be01 Mon Sep 17 00:00:00 2001 From: Pierre Schmitz Date: Wed, 16 May 2007 20:58:53 +0000 Subject: Aktualisierung auf MediaWiki 1.10.0 Plugins angepasst und verbessert kleine Korrekturen am Design --- includes/Sanitizer.php | 184 ++++++++++++++++++++++++++----------------------- 1 file changed, 99 insertions(+), 85 deletions(-) (limited to 'includes/Sanitizer.php') diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 0c0f7244..fa5416dc 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -20,8 +20,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * - * @package MediaWiki - * @subpackage Parser + * @addtogroup Parser */ /** @@ -29,7 +28,7 @@ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ define( 'MW_CHAR_REFS_REGEX', - '/&([A-Za-z0-9]+); + '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#x([0-9A-Za-z]+); |&\#X([0-9A-Za-z]+); @@ -316,7 +315,20 @@ $wgHtmlEntities = array( 'zwj' => 8205, 'zwnj' => 8204 ); -/** @package MediaWiki */ +/** + * Character entity aliases accepted by MediaWiki + */ +global $wgHtmlEntityAliases; +$wgHtmlEntityAliases = array( + 'רלמ' => 'rlm', + 'رلم' => 'rlm', +); + + +/** + * XHTML sanitizer for MediaWiki + * @addtogroup Parser + */ class Sanitizer { /** * Cleans up HTML, removes dangerous tags and attributes, and @@ -328,48 +340,41 @@ class Sanitizer { * @return string */ static function removeHTMLtags( $text, $processCallback = null, $args = array() ) { - global $wgUseTidy, $wgUserHtml; + global $wgUseTidy; - static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, + static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised; - + wfProfileIn( __METHOD__ ); - + if ( !$staticInitialised ) { - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed - 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', - 'strike', 'strong', 'tt', 'var', 'div', 'center', - 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' - ); - $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' - ); - $htmlsingleonly = array( # Elements that cannot have close tags - 'br', 'hr' - ); - $htmlnest = array( # Tags that can be nested--?? - 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' - ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr', - ); - $htmllist = array( # Tags used by list - 'ul','ol', - ); - $listtags = array( # Tags that can appear in a list - 'li', - ); - - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } + + $htmlpairs = array( # Tags that must be closed + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + ); + $htmlsingle = array( + 'br', 'hr', 'li', 'dt', 'dd' + ); + $htmlsingleonly = array( # Elements that cannot have close tags + 'br', 'hr' + ); + $htmlnest = array( # Tags that can be nested--?? + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' + ); + $tabletags = array( # Can only appear inside table, we will close them + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', + ); $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); @@ -386,7 +391,7 @@ class Sanitizer { # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); - $text = array_shift( $bits ); + $text = str_replace( '>', '>', array_shift( $bits ) ); if(!$wgUseTidy) { $tagstack = $tablestack = array(); foreach ( $bits as $x ) { @@ -396,7 +401,7 @@ class Sanitizer { } else { $slash = $t = $params = $brace = $rest = null; } - + $badtag = 0 ; if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack @@ -453,6 +458,10 @@ class Sanitizer { } else if( isset( $htmlsingle[$t] ) ) { # Hack to not close $htmlsingle tags $brace = NULL; + } else if( isset( $tabletags[$t] ) + && in_array($t ,$tagstack) ) { + // New table tag but forgot to close the previous one + $text .= ""; } else { if ( $t == 'table' ) { array_push( $tablestack, $tagstack ); @@ -472,7 +481,7 @@ class Sanitizer { } if ( ! $badtag ) { $rest = str_replace( '>', '>', $rest ); - $close = ( $brace == '/>' ) ? ' /' : ''; + $close = ( $brace == '/>' && !$slash ) ? ' /' : ''; $text .= "<$slash$t$newparams$close>$rest"; continue; } @@ -613,7 +622,7 @@ class Sanitizer { $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', 'codepointToUtf8(hexdec("$1"))', $stripped ); $stripped = str_replace( '\\', '', $stripped ); - if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', + if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is', $stripped ) ) { # haxx0r return false; @@ -645,15 +654,15 @@ class Sanitizer { if( trim( $text ) == '' ) { return ''; } - + $stripped = Sanitizer::validateTagAttributes( Sanitizer::decodeTagAttributes( $text ), $element ); - + $attribs = array(); foreach( $stripped as $attribute => $value ) { $encAttribute = htmlspecialchars( $attribute ); $encValue = Sanitizer::safeEncodeAttribute( $value ); - + $attribs[] = "$encAttribute=\"$encValue\""; } return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; @@ -666,7 +675,7 @@ class Sanitizer { */ static function encodeAttribute( $text ) { $encValue = htmlspecialchars( $text ); - + // Whitespace is normalized during attribute decoding, // so if we've been passed non-spaces we must encode them // ahead of time or they won't be preserved. @@ -675,10 +684,10 @@ class Sanitizer { "\r" => ' ', "\t" => ' ', ) ); - + return $encValue; } - + /** * Encode an attribute value for HTML tags, with extra armoring * against further wiki processing. @@ -687,7 +696,7 @@ class Sanitizer { */ static function safeEncodeAttribute( $text ) { $encValue = Sanitizer::encodeAttribute( $text ); - + # Templates and links may be expanded in later parsing, # creating invalid or dangerous output. Suppress this. $encValue = strtr( $encValue, array( @@ -716,12 +725,10 @@ class Sanitizer { * Given a value escape it so that it can be used in an id attribute and * return it, this does not validate the value however (see first link) * - * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters + * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and * name attributes - * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute - * - * @bug 4461 + * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute * * @static * @@ -743,9 +750,9 @@ class Sanitizer { * Given a value, escape it so that it can be used as a CSS class and * return it. * - * TODO: For extra validity, input should be validated UTF-8. + * @todo For extra validity, input should be validated UTF-8. * - * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format + * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format * * @param string $class * @return string @@ -795,11 +802,11 @@ class Sanitizer { foreach( $pairs as $set ) { $attribute = strtolower( $set[1] ); $value = Sanitizer::getTagAttributeCallback( $set ); - + // Normalize whitespace $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); $value = trim( $value ); - + // Decode character references $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); } @@ -850,11 +857,16 @@ class Sanitizer { */ private static function normalizeAttributeValue( $text ) { return str_replace( '"', '"', - preg_replace( - '/\r\n|[\x20\x0d\x0a\x09]/', - ' ', + self::normalizeWhitespace( Sanitizer::normalizeCharReferences( $text ) ) ); } + + private static function normalizeWhitespace( $text ) { + return preg_replace( + '/\r\n|[\x20\x0d\x0a\x09]/', + ' ', + $text ); + } /** * Ensure that any entities and character references are legal @@ -900,16 +912,19 @@ class Sanitizer { /** * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, - * return the named entity reference as is. Otherwise, returns - * HTML-escaped text of pseudo-entity source (eg &foo;) + * return the named entity reference as is. If the entity is a + * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, + * returns HTML-escaped text of pseudo-entity source (eg &foo;) * * @param string $name * @return string * @static */ static function normalizeEntity( $name ) { - global $wgHtmlEntities; - if( isset( $wgHtmlEntities[$name] ) ) { + global $wgHtmlEntities, $wgHtmlEntityAliases; + if ( isset( $wgHtmlEntityAliases[$name] ) ) { + return "&{$wgHtmlEntityAliases[$name]};"; + } elseif( isset( $wgHtmlEntities[$name] ) ) { return "&$name;"; } else { return "&$name;"; @@ -1006,7 +1021,10 @@ class Sanitizer { * @return string */ static function decodeEntity( $name ) { - global $wgHtmlEntities; + global $wgHtmlEntities, $wgHtmlEntityAliases; + if ( isset( $wgHtmlEntityAliases[$name] ) ) { + $name = $wgHtmlEntityAliases[$name]; + } if( isset( $wgHtmlEntities[$name] ) ) { return codepointToUtf8( $wgHtmlEntities[$name] ); } else { @@ -1173,8 +1191,10 @@ class Sanitizer { /** * Take a fragment of (potentially invalid) HTML and return - * a version with any tags removed, encoded suitably for literal - * inclusion in an attribute value. + * a version with any tags removed, encoded as plain text. + * + * Warning: this return value must be further escaped for literal + * inclusion in HTML output as of 1.10! * * @param string $text HTML fragment * @return string @@ -1184,14 +1204,8 @@ class Sanitizer { $text = StringUtils::delimiterReplace( '<', '>', '', $text ); # Normalize &entities and whitespace - $text = Sanitizer::normalizeAttributeValue( $text ); - - # Will be placed into "double-quoted" attributes, - # make sure remaining bits are safe. - $text = str_replace( - array('<', '>', '"'), - array('<', '>', '"'), - $text ); + $text = self::decodeCharReferences( $text ); + $text = self::normalizeWhitespace( $text ); return $text; } @@ -1215,7 +1229,7 @@ class Sanitizer { $out .= "]>\n"; return $out; } - + static function cleanUrl( $url, $hostname=true ) { # Normalize any HTML entities in input. They will be # re-escaped by makeExternalLink(). @@ -1223,12 +1237,12 @@ class Sanitizer { # Escape any control characters introduced by the above step $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); - + # Validate hostname portion $matches = array(); if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { list( /* $whole */, $protocol, $host, $rest ) = $matches; - + // Characters that will be ignored in IDNs. // http://tools.ietf.org/html/3454#section-3.1 // Strip them before further processing so blacklists and such work. @@ -1247,11 +1261,11 @@ class Sanitizer { \xe2\x80\x8d| # 200d ZERO WIDTH JOINER [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 /xuD"; - + $host = preg_replace( $strip, '', $host ); - + // @fixme: validate hostnames here - + return $protocol . $host . $rest; } else { return $url; -- cgit v1.2.3-54-g00ecf