diff options
Diffstat (limited to 'includes/Sanitizer.php')
-rw-r--r-- | includes/Sanitizer.php | 224 |
1 files changed, 140 insertions, 84 deletions
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index f5a24dfa..185679f6 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -327,77 +327,90 @@ class Sanitizer { * @param array $args for the processing callback * @return string */ - function removeHTMLtags( $text, $processCallback = null, $args = array() ) { + static function removeHTMLtags( $text, $processCallback = null, $args = array() ) { global $wgUseTidy, $wgUserHtml; - $fname = 'Parser::removeHTMLtags'; - wfProfileIn( $fname ); - - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed - 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', - 'strike', 'strong', 'tt', 'var', 'div', 'center', - 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' - ); - $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' - ); - $htmlsingleonly = array( # Elements that cannot have close tags - 'br', 'hr' - ); - $htmlnest = array( # Tags that can be nested--?? - 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' - ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr', - ); - $htmllist = array( # Tags used by list - 'ul','ol', - ); - $listtags = array( # Tags that can appear in a list - 'li', - ); - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } + static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, + $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised; + + wfProfileIn( __METHOD__ ); + + if ( !$staticInitialised ) { + if( $wgUserHtml ) { + $htmlpairs = array( # Tags that must be closed + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + ); + $htmlsingle = array( + 'br', 'hr', 'li', 'dt', 'dd' + ); + $htmlsingleonly = array( # Elements that cannot have close tags + 'br', 'hr' + ); + $htmlnest = array( # Tags that can be nested--?? + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' + ); + $tabletags = array( # Can only appear inside table + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', + ); + + } else { + $htmlpairs = array(); + $htmlsingle = array(); + $htmlnest = array(); + $tabletags = array(); + } - $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); - $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); + $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); + $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); + + # Convert them all to hashtables for faster lookup + $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', + 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' ); + foreach ( $vars as $var ) { + $$var = array_flip( $$var ); + } + $staticInitialised = true; + } # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = array_shift( $bits ); if(!$wgUseTidy) { - $tagstack = array(); $tablestack = array(); + $tagstack = $tablestack = array(); foreach ( $bits as $x ) { $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); - preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); + preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ); list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; error_reporting( $prev ); $badtag = 0 ; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack if ( $slash ) { # Closing a tag... - if( in_array( $t, $htmlsingleonly ) ) { + if( isset( $htmlsingleonly[$t] ) ) { $badtag = 1; } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { - if ( in_array($ot, $htmlsingleallowed) ) { + if ( isset( $htmlsingleallowed[$ot] ) ) { # Pop all elements with an optional close tag # and see if we find a match below them $optstack = array(); array_push ($optstack, $ot); while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && - in_array($ot, $htmlsingleallowed) ) { + isset( $htmlsingleallowed[$ot] ) ) + { array_push ($optstack, $ot); } if ( $t != $ot ) { @@ -410,7 +423,7 @@ class Sanitizer { } else { @array_push( $tagstack, $ot ); # <li> can be nested in <ul> or <ol>, skip those cases: - if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) { + if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) { $badtag = 1; } } @@ -422,20 +435,20 @@ class Sanitizer { $newparams = ''; } else { # Keep track for later - if ( in_array( $t, $tabletags ) && + if ( isset( $tabletags[$t] ) && ! in_array( 'table', $tagstack ) ) { $badtag = 1; } else if ( in_array( $t, $tagstack ) && - ! in_array ( $t , $htmlnest ) ) { + ! isset( $htmlnest [$t ] ) ) { $badtag = 1 ; # Is it a self closed htmlpair ? (bug 5487) } else if( $brace == '/>' && - in_array($t, $htmlpairs) ) { + isset( $htmlpairs[$t] ) ) { $badtag = 1; - } elseif( in_array( $t, $htmlsingleonly ) ) { + } elseif( isset( $htmlsingleonly[$t] ) ) { # Hack to force empty tag for uncloseable elements $brace = '/>'; - } else if( in_array( $t, $htmlsingle ) ) { + } else if( isset( $htmlsingle[$t] ) ) { # Hack to not close $htmlsingle tags $brace = NULL; } else { @@ -475,7 +488,7 @@ class Sanitizer { preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ); @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { if( is_callable( $processCallback ) ) { call_user_func_array( $processCallback, array( &$params, $args ) ); } @@ -487,7 +500,7 @@ class Sanitizer { } } } - wfProfileOut( $fname ); + wfProfileOut( __METHOD__ ); return $text; } @@ -501,9 +514,8 @@ class Sanitizer { * @param string $text * @return string */ - function removeHTMLcomments( $text ) { - $fname='Parser::removeHTMLcomments'; - wfProfileIn( $fname ); + static function removeHTMLcomments( $text ) { + wfProfileIn( __METHOD__ ); while (($start = strpos($text, '<!--')) !== false) { $end = strpos($text, '-->', $start + 4); if ($end === false) { @@ -533,7 +545,7 @@ class Sanitizer { $text = substr_replace($text, '', $start, $end - $start); } } - wfProfileOut( $fname ); + wfProfileOut( __METHOD__ ); return $text; } @@ -551,7 +563,7 @@ class Sanitizer { * @todo Check for legal values where the DTD limits things. * @todo Check for unique id attribute :P */ - function validateTagAttributes( $attribs, $element ) { + static function validateTagAttributes( $attribs, $element ) { $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) ); $out = array(); foreach( $attribs as $attribute => $value ) { @@ -626,7 +638,7 @@ class Sanitizer { * @param string $element * @return string */ - function fixTagAttributes( $text, $element ) { + static function fixTagAttributes( $text, $element ) { if( trim( $text ) == '' ) { return ''; } @@ -649,7 +661,7 @@ class Sanitizer { * @param $text * @return HTML-encoded text fragment */ - function encodeAttribute( $text ) { + static function encodeAttribute( $text ) { $encValue = htmlspecialchars( $text ); // Whitespace is normalized during attribute decoding, @@ -670,7 +682,7 @@ class Sanitizer { * @param $text * @return HTML-encoded text fragment */ - function safeEncodeAttribute( $text ) { + static function safeEncodeAttribute( $text ) { $encValue = Sanitizer::encodeAttribute( $text ); # Templates and links may be expanded in later parsing, @@ -713,7 +725,7 @@ class Sanitizer { * @param string $id * @return string */ - function escapeId( $id ) { + static function escapeId( $id ) { static $replace = array( '%3A' => ':', '%' => '.' @@ -730,7 +742,7 @@ class Sanitizer { * @return string * @private */ - function armorLinksCallback( $matches ) { + private static function armorLinksCallback( $matches ) { return str_replace( ':', ':', $matches[1] ); } @@ -742,7 +754,7 @@ class Sanitizer { * @param string * @return array */ - function decodeTagAttributes( $text ) { + static function decodeTagAttributes( $text ) { $attribs = array(); if( trim( $text ) == '' ) { @@ -780,7 +792,7 @@ class Sanitizer { * @return string * @private */ - function getTagAttributeCallback( $set ) { + private static function getTagAttributeCallback( $set ) { if( isset( $set[6] ) ) { # Illegal #XXXXXX color with no quotes. return $set[6]; @@ -814,7 +826,7 @@ class Sanitizer { * @return string * @private */ - function normalizeAttributeValue( $text ) { + private static function normalizeAttributeValue( $text ) { return str_replace( '"', '"', preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', @@ -836,7 +848,7 @@ class Sanitizer { * @return string * @private */ - function normalizeCharReferences( $text ) { + static function normalizeCharReferences( $text ) { return preg_replace_callback( MW_CHAR_REFS_REGEX, array( 'Sanitizer', 'normalizeCharReferencesCallback' ), @@ -846,7 +858,7 @@ class Sanitizer { * @param string $matches * @return string */ - function normalizeCharReferencesCallback( $matches ) { + static function normalizeCharReferencesCallback( $matches ) { $ret = null; if( $matches[1] != '' ) { $ret = Sanitizer::normalizeEntity( $matches[1] ); @@ -871,8 +883,9 @@ class Sanitizer { * * @param string $name * @return string + * @static */ - function normalizeEntity( $name ) { + static function normalizeEntity( $name ) { global $wgHtmlEntities; if( isset( $wgHtmlEntities[$name] ) ) { return "&$name;"; @@ -881,7 +894,7 @@ class Sanitizer { } } - function decCharReference( $codepoint ) { + static function decCharReference( $codepoint ) { $point = intval( $codepoint ); if( Sanitizer::validateCodepoint( $point ) ) { return sprintf( '&#%d;', $point ); @@ -890,7 +903,7 @@ class Sanitizer { } } - function hexCharReference( $codepoint ) { + static function hexCharReference( $codepoint ) { $point = hexdec( $codepoint ); if( Sanitizer::validateCodepoint( $point ) ) { return sprintf( '&#x%x;', $point ); @@ -904,7 +917,7 @@ class Sanitizer { * @param int $codepoint * @return bool */ - function validateCodepoint( $codepoint ) { + private static function validateCodepoint( $codepoint ) { return ($codepoint == 0x09) || ($codepoint == 0x0a) || ($codepoint == 0x0d) @@ -920,8 +933,9 @@ class Sanitizer { * @param string $text * @return string * @public + * @static */ - function decodeCharReferences( $text ) { + public static function decodeCharReferences( $text ) { return preg_replace_callback( MW_CHAR_REFS_REGEX, array( 'Sanitizer', 'decodeCharReferencesCallback' ), @@ -932,7 +946,7 @@ class Sanitizer { * @param string $matches * @return string */ - function decodeCharReferencesCallback( $matches ) { + static function decodeCharReferencesCallback( $matches ) { if( $matches[1] != '' ) { return Sanitizer::decodeEntity( $matches[1] ); } elseif( $matches[2] != '' ) { @@ -953,7 +967,7 @@ class Sanitizer { * @return string * @private */ - function decodeChar( $codepoint ) { + static function decodeChar( $codepoint ) { if( Sanitizer::validateCodepoint( $codepoint ) ) { return codepointToUtf8( $codepoint ); } else { @@ -969,7 +983,7 @@ class Sanitizer { * @param string $name * @return string */ - function decodeEntity( $name ) { + static function decodeEntity( $name ) { global $wgHtmlEntities; if( isset( $wgHtmlEntities[$name] ) ) { return codepointToUtf8( $wgHtmlEntities[$name] ); @@ -985,7 +999,7 @@ class Sanitizer { * @param string $element * @return array */ - function attributeWhitelist( $element ) { + static function attributeWhitelist( $element ) { static $list; if( !isset( $list ) ) { $list = Sanitizer::setupAttributeWhitelist(); @@ -996,9 +1010,10 @@ class Sanitizer { } /** + * @todo Document it a bit * @return array */ - function setupAttributeWhitelist() { + static function setupAttributeWhitelist() { $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); $block = array_merge( $common, array( 'align' ) ); $tablealign = array( 'align', 'char', 'charoff', 'valign' ); @@ -1082,9 +1097,9 @@ class Sanitizer { # 11.2.1 'table' => array_merge( $common, array( 'summary', 'width', 'border', 'frame', - 'rules', 'cellspacing', 'cellpadding', - 'align', 'bgcolor', 'frame', 'rules', - 'border' ) ), + 'rules', 'cellspacing', 'cellpadding', + 'align', 'bgcolor', + ) ), # 11.2.2 'caption' => array_merge( $common, array( 'align' ) ), @@ -1142,7 +1157,7 @@ class Sanitizer { * @param string $text HTML fragment * @return string */ - function stripAllTags( $text ) { + static function stripAllTags( $text ) { # Actual <tags> $text = preg_replace( '/ < .*? > /x', '', $text ); @@ -1169,7 +1184,7 @@ class Sanitizer { * @return string * @static */ - function hackDocType() { + static function hackDocType() { global $wgHtmlEntities; $out = "<!DOCTYPE html [\n"; foreach( $wgHtmlEntities as $entity => $codepoint ) { @@ -1178,6 +1193,47 @@ class Sanitizer { $out .= "]>\n"; return $out; } + + static function cleanUrl( $url, $hostname=true ) { + # Normalize any HTML entities in input. They will be + # re-escaped by makeExternalLink(). + $url = Sanitizer::decodeCharReferences( $url ); + + # Escape any control characters introduced by the above step + $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + + # Validate hostname portion + if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { + list( $whole, $protocol, $host, $rest ) = $matches; + + // Characters that will be ignored in IDNs. + // http://tools.ietf.org/html/3454#section-3.1 + // Strip them before further processing so blacklists and such work. + $strip = "/ + \\s| # general whitespace + \xc2\xad| # 00ad SOFT HYPHEN + \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN + \xe2\x80\x8b| # 200b ZERO WIDTH SPACE + \xe2\x81\xa0| # 2060 WORD JOINER + \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE + \xcd\x8f| # 034f COMBINING GRAPHEME JOINER + \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE + \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO + \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE + \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER + \xe2\x80\x8d| # 200d ZERO WIDTH JOINER + [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16 + /xuD"; + + $host = preg_replace( $strip, '', $host ); + + // @fixme: validate hostnames here + + return $protocol . $host . $rest; + } else { + return $url; + } + } } |