diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
commit | 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch) | |
tree | 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/Sanitizer.php | |
parent | 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff) |
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r-- | includes/Sanitizer.php | 96 |
1 files changed, 74 insertions, 22 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 26837b3c..a6c64264 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -2,7 +2,7 @@ /** * XHTML sanitizer for MediaWiki * - * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al + * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al * http://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify @@ -40,10 +40,11 @@ define( 'MW_CHAR_REFS_REGEX', * Allows some... latitude. * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes */ -$attrib = '[A-Za-z0-9]'; +$attribFirst = '[:A-Z_a-z0-9]'; +$attrib = '[:A-Z_a-z-.0-9]'; $space = '[\x09\x0a\x0d\x20]'; define( 'MW_ATTRIBS_REGEX', - "/(?:^|$space)((?:xml:|xmlns:)?$attrib+) + "/(?:^|$space)({$attribFirst}{$attrib}*) ($space*=$space* (?: # The attribute value: quoted or alone @@ -367,7 +368,8 @@ class Sanitizer { 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr' + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn', + 'kbd', 'samp' ); $htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' @@ -389,6 +391,12 @@ class Sanitizer { 'li', ); + global $wgAllowImageTag; + if ( $wgAllowImageTag ) { + $htmlsingle[] = 'img'; + $htmlsingleonly[] = 'img'; + } + $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); @@ -620,7 +628,7 @@ class Sanitizer { * @todo Check for unique id attribute :P */ static function validateAttributes( $attribs, $whitelist ) { - global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes; + global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5; $whitelist = array_flip( $whitelist ); $hrefExp = '/^(' . wfUrlProtocols() . 
')[^\s]+$/'; @@ -636,7 +644,8 @@ class Sanitizer { continue; } - if( !isset( $whitelist[$attribute] ) ) { + # Allow any attribute beginning with "data-", if in HTML5 mode + if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) { continue; } @@ -914,7 +923,9 @@ class Sanitizer { * * To ensure we don't have to bother escaping anything, we also strip ', ", * & even if $wgExperimentalIds is true. TODO: Is this the best tactic? - * We also strip # because it upsets IE6. + * We also strip # because it upsets IE, and % because it could be + * ambiguous if it's part of something that looks like a percent escape + * (which don't work reliably in fragments cross-browser). * * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and @@ -940,7 +951,7 @@ class Sanitizer { if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { $id = Sanitizer::decodeCharReferences( $id ); - $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id ); + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); $id = trim( $id, '_' ); if ( $id === '' ) { # Must have been all whitespace to start with. @@ -988,17 +999,16 @@ class Sanitizer { /** * Given HTML input, escape with htmlspecialchars but un-escape entites. - * This allows (generally harmless) entities like &nbsp; to survive. + * This allows (generally harmless) entities like &#160; to survive. * * @param $html String to escape * @return String: escaped input */ static function escapeHtmlAllowEntities( $html ) { + $html = Sanitizer::decodeCharReferences( $html ); # It seems wise to escape ' as well as ", as a matter of course. Can't # hurt.
$html = htmlspecialchars( $html, ENT_QUOTES ); - $html = str_replace( '&amp;', '&', $html ); - $html = Sanitizer::normalizeCharReferences( $html ); return $html; } @@ -1102,11 +1112,24 @@ class Sanitizer { } /** + * Normalizes whitespace in a section name, such as might be returned + * by Parser::stripSectionName(), for use in the id's that are used for + * section links. + * + * @param $section String + * @return String + */ + static function normalizeSectionNameWhitespace( $section ) { + return trim( preg_replace( '/[ _]+/', ' ', $section ) ); + } + + /** * Ensure that any entities and character references are legal * for XML and XHTML specifically. Any stray bits will be * &amp;-escaped to result in a valid text fragment. * - * a. any named char refs must be known in XHTML + * a. named char refs can only be &lt; &gt; &amp; &quot;, others are + * numericized (this way we're well-formed even without a DTD) * b. any numeric char refs must be legal chars, not invalid or forbidden * c. use &#x, not &#X * d. fix or reject non-valid attributes @@ -1145,9 +1168,10 @@ class Sanitizer { /** * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, - * return the named entity reference as is. If the entity is a - * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, - * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;) + * return the equivalent numeric entity reference (except for the core &lt; + * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns + * the HTML equivalent.
Otherwise, returns HTML-escaped text of + * pseudo-entity source (eg &amp;foo;) * * @param $name String * @return String @@ -1156,8 +1180,11 @@ class Sanitizer { global $wgHtmlEntities, $wgHtmlEntityAliases; if ( isset( $wgHtmlEntityAliases[$name] ) ) { return "&{$wgHtmlEntityAliases[$name]};"; - } elseif( isset( $wgHtmlEntities[$name] ) ) { + } elseif ( in_array( $name, + array( 'lt', 'gt', 'amp', 'quot' ) ) ) { return "&$name;"; + } elseif ( isset( $wgHtmlEntities[$name] ) ) { + return "&#{$wgHtmlEntities[$name]};"; } else { return "&amp;$name;"; } @@ -1210,6 +1237,30 @@ class Sanitizer { } /** + * Decode any character references, numeric or named entities, + * in the next and normalize the resulting string. (bug 14952) + * + * This is useful for page titles, not for text to be displayed, + * MediaWiki allows HTML entities to escape normalization as a feature. + * + * @param $text String (already normalized, containing entities) + * @return String (still normalized, without entities) + */ + public static function decodeCharReferencesAndNormalize( $text ) { + global $wgContLang; + $text = preg_replace_callback( + MW_CHAR_REFS_REGEX, + array( 'Sanitizer', 'decodeCharReferencesCallback' ), + $text, /* limit */ -1, $count ); + + if ( $count ) { + return $wgContLang->normalize( $text ); + } else { + return $text; + } + } + + /** * @param $matches String * @return String */ @@ -1342,10 +1393,10 @@ class Sanitizer { 'em' => $common, 'strong' => $common, 'cite' => $common, - # dfn + 'dfn' => $common, 'code' => $common, - # samp - # kbd + 'samp' => $common, + 'kbd' => $common, 'var' => $common, 'abbr' => $common, # acronym @@ -1412,8 +1463,9 @@ class Sanitizer { # 13.2 # Not usually allowed, but may be used for extension-style hooks - # such as <math> when it is rasterized - 'img' => array_merge( $common, array( 'alt' ) ), + # such as <math> when it is rasterized, or if $wgAllowImageTag is + # true + 'img' => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ), #
15.2.1 'tt' => $common, @@ -1495,7 +1547,7 @@ class Sanitizer { $url = Sanitizer::decodeCharReferences( $url ); # Escape any control characters introduced by the above step - $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url ); # Validate hostname portion $matches = array(); |