diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
commit | 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch) | |
tree | 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/libs | |
parent | 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff) |
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/libs')
-rw-r--r-- | includes/libs/CSSJanus.php | 323 | ||||
-rw-r--r-- | includes/libs/CSSMin.php | 214 | ||||
-rw-r--r-- | includes/libs/IEContentAnalyzer.php | 824 | ||||
-rw-r--r-- | includes/libs/IEUrlExtension.php | 247 | ||||
-rw-r--r-- | includes/libs/JavaScriptMinifier.php | 579 | ||||
-rw-r--r-- | includes/libs/README | 4 | ||||
-rw-r--r-- | includes/libs/spyc.php | 248 |
7 files changed, 2439 insertions, 0 deletions
diff --git a/includes/libs/CSSJanus.php b/includes/libs/CSSJanus.php new file mode 100644 index 00000000..aa04bc49 --- /dev/null +++ b/includes/libs/CSSJanus.php @@ -0,0 +1,323 @@
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 */

/**
 * This is a PHP port of CSSJanus, a utility that transforms CSS style sheets
 * written for LTR to RTL.
 *
 * The original Python version of CSSJanus is Copyright 2008 by Google Inc. and
 * is distributed under the Apache license.
 *
 * Original code: http://code.google.com/p/cssjanus/source/browse/trunk/cssjanus.py
 * License of original code: http://code.google.com/p/cssjanus/source/browse/trunk/LICENSE
 * @author Roan Kattouw
 *
 */
class CSSJanus {
	// Patterns defined as null are built dynamically by buildPatterns()
	private static $patterns = array(
		'tmpToken' => '`TMP`',
		'nonAscii' => '[\200-\377]',
		// Four backslashes in the PHP literal -> two in the string -> the regex
		// matches ONE literal backslash followed by 1-6 hex digits (a CSS
		// unicode escape). With only two backslashes the regex saw "\[", i.e.
		// a literal bracket, and never matched a real escape.
		'unicode' => '(?:(?:\\\\[0-9a-f]{1,6})(?:\r\n|\s)?)',
		'num' => '(?:[0-9]*\.[0-9]+|[0-9]+)',
		'unit' => '(?:em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)',
		'body_selector' => 'body\s*{\s*',
		'direction' => 'direction\s*:\s*',
		'escape' => null,
		'nmstart' => null,
		'nmchar' => null,
		'ident' => null,
		'quantity' => null,
		'possibly_negative_quantity' => null,
		'color' => null,
		'url_special_chars' => '[!#$%&*-~]',
		'valid_after_uri_chars' => '[\'\"]?\s*',
		'url_chars' => null,
		'lookahead_not_open_brace' => null,
		'lookahead_not_closing_paren' => null,
		'lookahead_for_closing_paren' => null,
		'lookbehind_not_letter' => '(?<![a-zA-Z])',
		'chars_within_selector' => '[^\}]*?',
		'noflip_annotation' => '\/\*\s*@noflip\s*\*\/',
		'noflip_single' => null,
		'noflip_class' => null,
		'comment' => '/\/\*[^*]*\*+([^\/*][^*]*\*+)*\//',
		'direction_ltr' => null,
		'direction_rtl' => null,
		'left' => null,
		'right' => null,
		'left_in_url' => null,
		'right_in_url' => null,
		'ltr_in_url' => null,
		'rtl_in_url' => null,
		'cursor_east' => null,
		'cursor_west' => null,
		'four_notation_quantity' => null,
		'four_notation_color' => null,
		'bg_horizontal_percentage' => null,
		'bg_horizontal_percentage_x' => null,
	);

	/**
	 * Build patterns we can't define above because they depend on other patterns.
	 *
	 * Idempotent: 'escape' being non-null is used as the "already built" marker.
	 */
	private static function buildPatterns() {
		if ( !is_null( self::$patterns['escape'] ) ) {
			// Patterns have already been built
			return;
		}

		$patterns =& self::$patterns;
		// A CSS escape: either a unicode escape or a backslash followed by any
		// character that is not a newline or hex digit. The backslashes are
		// doubled so that the regex engine (not PHP) interprets \\, \r, \n, \f.
		$patterns['escape'] = "(?:{$patterns['unicode']}|\\\\[^\\r\\n\\f0-9a-f])";
		$patterns['nmstart'] = "(?:[_a-z]|{$patterns['nonAscii']}|{$patterns['escape']})";
		$patterns['nmchar'] = "(?:[_a-z0-9-]|{$patterns['nonAscii']}|{$patterns['escape']})";
		$patterns['ident'] = "-?{$patterns['nmstart']}{$patterns['nmchar']}*";
		$patterns['quantity'] = "{$patterns['num']}(?:\s*{$patterns['unit']}|{$patterns['ident']})?";
		$patterns['possibly_negative_quantity'] = "((?:-?{$patterns['quantity']})|(?:inherit|auto))";
		$patterns['color'] = "(#?{$patterns['nmchar']}+)";
		$patterns['url_chars'] = "(?:{$patterns['url_special_chars']}|{$patterns['nonAscii']}|{$patterns['escape']})*";
		$patterns['lookahead_not_open_brace'] = "(?!({$patterns['nmchar']}|\r?\n|\s|#|\:|\.|\,|\+|>)*?{)";
		$patterns['lookahead_not_closing_paren'] = "(?!{$patterns['url_chars']}?{$patterns['valid_after_uri_chars']}\))";
		$patterns['lookahead_for_closing_paren'] = "(?={$patterns['url_chars']}?{$patterns['valid_after_uri_chars']}\))";
		$patterns['noflip_single'] = "/({$patterns['noflip_annotation']}{$patterns['lookahead_not_open_brace']}[^;}]+;?)/i";
		$patterns['noflip_class'] = "/({$patterns['noflip_annotation']}{$patterns['chars_within_selector']}})/i";
		$patterns['direction_ltr'] = "/({$patterns['direction']})ltr/i";
		$patterns['direction_rtl'] = "/({$patterns['direction']})rtl/i";
		$patterns['left'] = "/{$patterns['lookbehind_not_letter']}(left){$patterns['lookahead_not_closing_paren']}{$patterns['lookahead_not_open_brace']}/i";
		$patterns['right'] = "/{$patterns['lookbehind_not_letter']}(right){$patterns['lookahead_not_closing_paren']}{$patterns['lookahead_not_open_brace']}/i";
		$patterns['left_in_url'] = "/{$patterns['lookbehind_not_letter']}(left){$patterns['lookahead_for_closing_paren']}/i";
		$patterns['right_in_url'] = "/{$patterns['lookbehind_not_letter']}(right){$patterns['lookahead_for_closing_paren']}/i";
		$patterns['ltr_in_url'] = "/{$patterns['lookbehind_not_letter']}(ltr){$patterns['lookahead_for_closing_paren']}/i";
		$patterns['rtl_in_url'] = "/{$patterns['lookbehind_not_letter']}(rtl){$patterns['lookahead_for_closing_paren']}/i";
		$patterns['cursor_east'] = "/{$patterns['lookbehind_not_letter']}([ns]?)e-resize/";
		$patterns['cursor_west'] = "/{$patterns['lookbehind_not_letter']}([ns]?)w-resize/";
		$patterns['four_notation_quantity'] = "/{$patterns['possibly_negative_quantity']}(\s+){$patterns['possibly_negative_quantity']}(\s+){$patterns['possibly_negative_quantity']}(\s+){$patterns['possibly_negative_quantity']}/i";
		$patterns['four_notation_color'] = "/(-color\s*:\s*){$patterns['color']}(\s+){$patterns['color']}(\s+){$patterns['color']}(\s+){$patterns['color']}/i";
		// The two regexes below are parenthesized differently than in the original implementation to make the
		// callback's job more straightforward
		$patterns['bg_horizontal_percentage'] = "/(background(?:-position)?\s*:\s*[^%]*?)({$patterns['num']})(%\s*(?:{$patterns['quantity']}|{$patterns['ident']}))/";
		$patterns['bg_horizontal_percentage_x'] = "/(background-position-x\s*:\s*)({$patterns['num']})(%)/";
	}

	/**
	 * Transform an LTR stylesheet to RTL
	 * @param $css String: stylesheet to transform
	 * @param $swapLtrRtlInURL Boolean: If true, swap 'ltr' and 'rtl' in URLs
	 * @param $swapLeftRightInURL Boolean: If true, swap 'left' and 'right' in URLs
	 * @return String: transformed stylesheet
	 */
	public static function transform( $css, $swapLtrRtlInURL = false, $swapLeftRightInURL = false ) {
		// We wrap tokens in ` , not ~ like the original implementation does.
		// This was done because ` is not a legal character in CSS and can only
		// occur in URLs, where we escape it to %60 before inserting our tokens.
		$css = str_replace( '`', '%60', $css );

		self::buildPatterns();

		// Tokenize single line rules with /* @noflip */
		$noFlipSingle = new CSSJanus_Tokenizer( self::$patterns['noflip_single'], '`NOFLIP_SINGLE`' );
		$css = $noFlipSingle->tokenize( $css );

		// Tokenize class rules with /* @noflip */
		$noFlipClass = new CSSJanus_Tokenizer( self::$patterns['noflip_class'], '`NOFLIP_CLASS`' );
		$css = $noFlipClass->tokenize( $css );

		// Tokenize comments
		$comments = new CSSJanus_Tokenizer( self::$patterns['comment'], '`C`' );
		$css = $comments->tokenize( $css );

		// LTR->RTL fixes start here
		$css = self::fixDirection( $css );
		if ( $swapLtrRtlInURL ) {
			$css = self::fixLtrRtlInURL( $css );
		}

		if ( $swapLeftRightInURL ) {
			$css = self::fixLeftRightInURL( $css );
		}
		$css = self::fixLeftAndRight( $css );
		$css = self::fixCursorProperties( $css );
		$css = self::fixFourPartNotation( $css );
		$css = self::fixBackgroundPosition( $css );

		// Detokenize stuff we tokenized before, in reverse order of tokenization
		// because later tokenizers may have swallowed earlier tokens
		$css = $comments->detokenize( $css );
		$css = $noFlipClass->detokenize( $css );
		$css = $noFlipSingle->detokenize( $css );

		return $css;
	}

	/**
	 * Replace direction: ltr; with direction: rtl; and vice versa.
	 *
	 * The original implementation only does this inside body selectors
	 * and misses "body\n{\ndirection:ltr;\n}". This function does not have
	 * these problems.
	 *
	 * See http://code.google.com/p/cssjanus/issues/detail?id=15 and
	 * TODO: URL
	 *
	 * @param $css String: stylesheet text
	 * @return String: stylesheet with ltr/rtl direction values swapped
	 */
	private static function fixDirection( $css ) {
		// Swap via a temporary token so the second replacement does not undo the first
		$css = preg_replace( self::$patterns['direction_ltr'],
			'$1' . self::$patterns['tmpToken'], $css );
		$css = preg_replace( self::$patterns['direction_rtl'], '$1ltr', $css );
		$css = str_replace( self::$patterns['tmpToken'], 'rtl', $css );

		return $css;
	}

	/**
	 * Replace 'ltr' with 'rtl' and vice versa in background URLs
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixLtrRtlInURL( $css ) {
		$css = preg_replace( self::$patterns['ltr_in_url'], self::$patterns['tmpToken'], $css );
		$css = preg_replace( self::$patterns['rtl_in_url'], 'ltr', $css );
		$css = str_replace( self::$patterns['tmpToken'], 'rtl', $css );

		return $css;
	}

	/**
	 * Replace 'left' with 'right' and vice versa in background URLs
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixLeftRightInURL( $css ) {
		$css = preg_replace( self::$patterns['left_in_url'], self::$patterns['tmpToken'], $css );
		$css = preg_replace( self::$patterns['right_in_url'], 'left', $css );
		$css = str_replace( self::$patterns['tmpToken'], 'right', $css );

		return $css;
	}

	/**
	 * Flip rules like left: , padding-right: , etc.
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixLeftAndRight( $css ) {
		$css = preg_replace( self::$patterns['left'], self::$patterns['tmpToken'], $css );
		$css = preg_replace( self::$patterns['right'], 'left', $css );
		$css = str_replace( self::$patterns['tmpToken'], 'right', $css );

		return $css;
	}

	/**
	 * Flip East and West in rules like cursor: nw-resize;
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixCursorProperties( $css ) {
		$css = preg_replace( self::$patterns['cursor_east'],
			'$1' . self::$patterns['tmpToken'], $css );
		$css = preg_replace( self::$patterns['cursor_west'], '$1e-resize', $css );
		$css = str_replace( self::$patterns['tmpToken'], 'w-resize', $css );

		return $css;
	}

	/**
	 * Swap the second and fourth parts in four-part notation rules like
	 * padding: 1px 2px 3px 4px;
	 *
	 * Unlike the original implementation, this function doesn't suffer from
	 * the bug where whitespace is not preserved when flipping four-part rules
	 * and four-part color rules with multiple whitespace characters between
	 * colors are not recognized.
	 * See http://code.google.com/p/cssjanus/issues/detail?id=16
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixFourPartNotation( $css ) {
		// Groups alternate value/whitespace; only the 2nd and 4th values trade places
		$css = preg_replace( self::$patterns['four_notation_quantity'], '$1$2$7$4$5$6$3', $css );
		$css = preg_replace( self::$patterns['four_notation_color'], '$1$2$3$8$5$6$7$4', $css );

		return $css;
	}

	/**
	 * Flip horizontal background percentages.
	 *
	 * @param $css String: stylesheet text
	 * @return String
	 */
	private static function fixBackgroundPosition( $css ) {
		// __CLASS__ rather than 'self': "self" callables are deprecated as of PHP 8.2
		$css = preg_replace_callback( self::$patterns['bg_horizontal_percentage'],
			array( __CLASS__, 'calculateNewBackgroundPosition' ), $css );
		$css = preg_replace_callback( self::$patterns['bg_horizontal_percentage_x'],
			array( __CLASS__, 'calculateNewBackgroundPosition' ), $css );

		return $css;
	}

	/**
	 * Callback for fixBackgroundPosition()
	 *
	 * @param $matches Array: [1] text before the number, [2] the horizontal
	 *  percentage, [3] text after it (starting with '%')
	 * @return String: the match with the percentage mirrored (100 - x)
	 */
	private static function calculateNewBackgroundPosition( $matches ) {
		return $matches[1] . ( 100 - $matches[2] ) . $matches[3];
	}
}

/**
 * Utility class used by CSSJanus that tokenizes and untokenizes things we want
 * to protect from being janused.
 * @author Roan Kattouw
 */
class CSSJanus_Tokenizer {
	private $regex, $token;
	private $originals;

	/**
	 * Constructor
	 * @param $regex string Regular expression whose matches to replace by a token.
	 * @param $token string Token
	 */
	public function __construct( $regex, $token ) {
		$this->regex = $regex;
		$this->token = $token;
		$this->originals = array();
	}

	/**
	 * Replace all occurrences of $regex in $str with a token and remember
	 * the original strings.
	 * @param $str String to tokenize
	 * @return string Tokenized string
	 */
	public function tokenize( $str ) {
		return preg_replace_callback( $this->regex, array( $this, 'tokenizeCallback' ), $str );
	}

	/**
	 * Callback for tokenize(): remember the matched text, emit the token.
	 */
	private function tokenizeCallback( $matches ) {
		$this->originals[] = $matches[0];
		return $this->token;
	}

	/**
	 * Replace tokens with their originals. If multiple strings were tokenized, it's important they be
	 * detokenized in exactly the SAME ORDER.
	 * @param $str String: previously run through tokenize()
	 * @return string Original string
	 */
	public function detokenize( $str ) {
		// Rewind first: the callback walks the array's internal pointer, so
		// without this a second detokenize() call would start past the end and
		// replace every token with an empty string.
		reset( $this->originals );

		// PHP has no function to replace only the first occurrence or to
		// replace occurrences of the same string with different values,
		// so we use preg_replace_callback() even though we don't really need a regex
		return preg_replace_callback( '/' . preg_quote( $this->token, '/' ) . '/',
			array( $this, 'detokenizeCallback' ), $str );
	}

	/**
	 * Callback for detokenize(): hand back the next remembered original.
	 */
	private function detokenizeCallback( $matches ) {
		$retval = current( $this->originals );
		next( $this->originals );

		// More tokens than originals: replace with an empty string (what the
		// implicit false-to-string conversion did before), but explicitly
		return $retval === false ? '' : $retval;
	}
}
diff --git a/includes/libs/CSSMin.php b/includes/libs/CSSMin.php new file mode 100644 index 00000000..c0e78112 --- /dev/null +++ b/includes/libs/CSSMin.php @@ -0,0 +1,214 @@
<?php
/*
 * Copyright 2010 Wikimedia Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + * OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +/** + * Transforms CSS data + * + * This class provides minification, URL remapping, URL extracting, and data-URL embedding. + * + * @file + * @version 0.1.1 -- 2010-09-11 + * @author Trevor Parscal <tparscal@wikimedia.org> + * @copyright Copyright 2010 Wikimedia Foundation + * @license http://www.apache.org/licenses/LICENSE-2.0 + */ +class CSSMin { + + /* Constants */ + + /** + * Maximum file size to still qualify for in-line embedding as a data-URI + * + * 24,576 is used because Internet Explorer has a 32,768 byte limit for data URIs, + * which when base64 encoded will result in a 1/3 increase in size. + */ + const EMBED_SIZE_LIMIT = 24576; + const URL_REGEX = 'url\(\s*[\'"]?(?P<file>[^\?\)\'"]*)(?P<query>\??[^\)\'"]*)[\'"]?\s*\)'; + + /* Protected Static Members */ + + /** @var array List of common image files extensions and mime-types */ + protected static $mimeTypes = array( + 'gif' => 'image/gif', + 'jpe' => 'image/jpeg', + 'jpeg' => 'image/jpeg', + 'jpg' => 'image/jpeg', + 'png' => 'image/png', + 'tif' => 'image/tiff', + 'tiff' => 'image/tiff', + 'xbm' => 'image/x-xbitmap', + ); + + /* Static Methods */ + + /** + * Gets a list of local file paths which are referenced in a CSS style sheet + * + * @param $source string CSS data to remap + * @param $path string File path where the source was read from (optional) + * @return array List of local file references + */ + public static function getLocalFileReferences( $source, $path = null ) { + $files = array(); + $rFlags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER; + if ( preg_match_all( '/' . 
self::URL_REGEX . '/', $source, $matches, $rFlags ) ) { + foreach ( $matches as $match ) { + $file = ( isset( $path ) + ? rtrim( $path, '/' ) . '/' + : '' ) . "{$match['file'][0]}"; + + // Only proceed if we can access the file + if ( !is_null( $path ) && file_exists( $file ) ) { + $files[] = $file; + } + } + } + return $files; + } + + protected static function getMimeType( $file ) { + $realpath = realpath( $file ); + // Try a couple of different ways to get the mime-type of a file, in order of + // preference + if ( + $realpath + && function_exists( 'finfo_file' ) + && function_exists( 'finfo_open' ) + && defined( 'FILEINFO_MIME_TYPE' ) + ) { + // As of PHP 5.3, this is how you get the mime-type of a file; it uses the Fileinfo + // PECL extension + return finfo_file( finfo_open( FILEINFO_MIME_TYPE ), $realpath ); + } else if ( function_exists( 'mime_content_type' ) ) { + // Before this was deprecated in PHP 5.3, this was how you got the mime-type of a file + return mime_content_type( $file ); + } else { + // Worst-case scenario has happened, use the file extension to infer the mime-type + $ext = strtolower( pathinfo( $file, PATHINFO_EXTENSION ) ); + if ( isset( self::$mimeTypes[$ext] ) ) { + return self::$mimeTypes[$ext]; + } + } + return false; + } + + /** + * Remaps CSS URL paths and automatically embeds data URIs for URL rules + * preceded by an /* @embed * / comment + * + * @param $source string CSS data to remap + * @param $local string File path where the source was read from + * @param $remote string URL path to the file + * @param $embed ??? + * @return string Remapped CSS data + */ + public static function remap( $source, $local, $remote, $embed = true ) { + $pattern = '/((?P<embed>\s*\/\*\s*\@embed\s*\*\/)(?P<pre>[^\;\}]*))?' . + self::URL_REGEX . 
'(?P<post>[^;]*)[\;]?/'; + $offset = 0; + while ( preg_match( $pattern, $source, $match, PREG_OFFSET_CAPTURE, $offset ) ) { + // Skip fully-qualified URLs and data URIs + $urlScheme = parse_url( $match['file'][0], PHP_URL_SCHEME ); + if ( $urlScheme ) { + // Move the offset to the end of the match, leaving it alone + $offset = $match[0][1] + strlen( $match[0][0] ); + continue; + } + // URLs with absolute paths like /w/index.php need to be expanded + // to absolute URLs but otherwise left alone + if ( $match['file'][0] !== '' && $match['file'][0][0] === '/' ) { + // Replace the file path with an expanded URL + $source = substr_replace( $source, wfExpandUrl( $match['file'][0] ), + $match['file'][1], strlen( $match['file'][0] ) + ); + // Move the offset to the end of the match, leaving it alone + $offset = $match[0][1] + strlen( $match[0][0] ); + continue; + } + // Shortcuts + $embed = $match['embed'][0]; + $pre = $match['pre'][0]; + $post = $match['post'][0]; + $query = $match['query'][0]; + $url = "{$remote}/{$match['file'][0]}"; + $file = "{$local}/{$match['file'][0]}"; + // bug 27052 - Guard against double slashes, because foo//../bar + // apparently resolves to foo/bar on (some?) clients + $url = preg_replace( '#([^:])//+#', '\1/', $url ); + $replacement = false; + if ( $local !== false && file_exists( $file ) ) { + // Add version parameter as a time-stamp in ISO 8601 format, + // using Z for the timezone, meaning GMT + $url .= '?' . 
gmdate( 'Y-m-d\TH:i:s\Z', round( filemtime( $file ), -2 ) ); + // Embedding requires a bit of extra processing, so let's skip that if we can + if ( $embed ) { + $type = self::getMimeType( $file ); + // Detect when URLs were preceeded with embed tags, and also verify file size is + // below the limit + if ( + $type + && $match['embed'][1] > 0 + && filesize( $file ) < self::EMBED_SIZE_LIMIT + ) { + // Strip off any trailing = symbols (makes browsers freak out) + $data = base64_encode( file_get_contents( $file ) ); + // Build 2 CSS properties; one which uses a base64 encoded data URI in place + // of the @embed comment to try and retain line-number integrity, and the + // other with a remapped an versioned URL and an Internet Explorer hack + // making it ignored in all browsers that support data URIs + $replacement = "{$pre}url(data:{$type};base64,{$data}){$post};"; + $replacement .= "{$pre}url({$url}){$post}!ie;"; + } + } + if ( $replacement === false ) { + // Assume that all paths are relative to $remote, and make them absolute + $replacement = "{$embed}{$pre}url({$url}){$post};"; + } + } else if ( $local === false ) { + // Assume that all paths are relative to $remote, and make them absolute + $replacement = "{$embed}{$pre}url({$url}{$query}){$post};"; + } + if ( $replacement !== false ) { + // Perform replacement on the source + $source = substr_replace( + $source, $replacement, $match[0][1], strlen( $match[0][0] ) + ); + // Move the offset to the end of the replacement in the source + $offset = $match[0][1] + strlen( $replacement ); + continue; + } + // Move the offset to the end of the match, leaving it alone + $offset = $match[0][1] + strlen( $match[0][0] ); + } + return $source; + } + + /** + * Removes whitespace from CSS data + * + * @param $css string CSS data to minify + * @return string Minified CSS data + */ + public static function minify( $css ) { + return trim( + str_replace( + array( '; ', ': ', ' {', '{ ', ', ', '} ', ';}' ), + array( ';', ':', '{', 
'{', ',', '}', '}' ), + preg_replace( array( '/\s+/', '/\/\*.*?\*\//s' ), array( ' ', '' ), $css ) + ) + ); + } +} diff --git a/includes/libs/IEContentAnalyzer.php b/includes/libs/IEContentAnalyzer.php new file mode 100644 index 00000000..a2ef1a09 --- /dev/null +++ b/includes/libs/IEContentAnalyzer.php @@ -0,0 +1,824 @@ +<?php + +/** + * This class simulates Microsoft Internet Explorer's terribly broken and + * insecure MIME type detection algorithm. It can be used to check web uploads + * with an apparently safe type, to see if IE will reinterpret them to produce + * something dangerous. + * + * It is full of bugs and strange design choices should not under any + * circumstances be used to determine a MIME type to present to a user or + * client. (Apple Safari developers, this means you too.) + * + * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have + * attempted to ensure that this code works in exactly the same way as Internet + * Explorer, it does not share any source code, or creative choices such as + * variable names, thus I (Tim Starling) claim copyright on it. + * + * It may be redistributed without restriction. To aid reuse, this class does + * not depend on any MediaWiki module. 
+ */ +class IEContentAnalyzer { + /** + * Relevant data taken from the type table in IE 5 + */ + protected $baseTypeTable = array( + 'ambiguous' /*1*/ => array( + 'text/plain', + 'application/octet-stream', + 'application/x-netcdf', // [sic] + ), + 'text' /*3*/ => array( + 'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64', + 'application/macbinhex40', 'application/x-cdf', 'text/scriptlet' + ), + 'binary' /*4*/ => array( + 'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif', + 'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp', + 'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi', + 'video/x-msvideo', 'video/mpeg', 'application/x-compressed', + 'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java', + 'application/x-msdownload' + ), + 'html' /*5*/ => array( 'text/html' ), + ); + + /** + * Changes to the type table in later versions of IE + */ + protected $addedTypes = array( + 'ie07' => array( + 'text' => array( 'text/xml', 'application/xml' ) + ), + ); + + /** + * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a + * typical Windows installation. + * + * Used for extension to MIME type mapping if detection fails. 
+ */ + protected $registry = array( + '.323' => 'text/h323', + '.3g2' => 'video/3gpp2', + '.3gp' => 'video/3gpp', + '.3gp2' => 'video/3gpp2', + '.3gpp' => 'video/3gpp', + '.aac' => 'audio/aac', + '.ac3' => 'audio/ac3', + '.accda' => 'application/msaccess', + '.accdb' => 'application/msaccess', + '.accdc' => 'application/msaccess', + '.accde' => 'application/msaccess', + '.accdr' => 'application/msaccess', + '.accdt' => 'application/msaccess', + '.ade' => 'application/msaccess', + '.adp' => 'application/msaccess', + '.adts' => 'audio/aac', + '.ai' => 'application/postscript', + '.aif' => 'audio/aiff', + '.aifc' => 'audio/aiff', + '.aiff' => 'audio/aiff', + '.amc' => 'application/x-mpeg', + '.application' => 'application/x-ms-application', + '.asf' => 'video/x-ms-asf', + '.asx' => 'video/x-ms-asf', + '.au' => 'audio/basic', + '.avi' => 'video/avi', + '.bmp' => 'image/bmp', + '.caf' => 'audio/x-caf', + '.cat' => 'application/vnd.ms-pki.seccat', + '.cbo' => 'application/sha', + '.cdda' => 'audio/aiff', + '.cer' => 'application/x-x509-ca-cert', + '.conf' => 'text/plain', + '.crl' => 'application/pkix-crl', + '.crt' => 'application/x-x509-ca-cert', + '.css' => 'text/css', + '.csv' => 'application/vnd.ms-excel', + '.der' => 'application/x-x509-ca-cert', + '.dib' => 'image/bmp', + '.dif' => 'video/x-dv', + '.dll' => 'application/x-msdownload', + '.doc' => 'application/msword', + '.docm' => 'application/vnd.ms-word.document.macroEnabled.12', + '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.dot' => 'application/msword', + '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12', + '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', + '.dv' => 'video/x-dv', + '.dwfx' => 'model/vnd.dwfx+xps', + '.edn' => 'application/vnd.adobe.edn', + '.eml' => 'message/rfc822', + '.eps' => 'application/postscript', + '.etd' => 'application/x-ebx', + '.exe' => 'application/x-msdownload', + '.fdf' => 
'application/vnd.fdf', + '.fif' => 'application/fractals', + '.gif' => 'image/gif', + '.gsm' => 'audio/x-gsm', + '.hqx' => 'application/mac-binhex40', + '.hta' => 'application/hta', + '.htc' => 'text/x-component', + '.htm' => 'text/html', + '.html' => 'text/html', + '.htt' => 'text/webviewhtml', + '.hxa' => 'application/xml', + '.hxc' => 'application/xml', + '.hxd' => 'application/octet-stream', + '.hxe' => 'application/xml', + '.hxf' => 'application/xml', + '.hxh' => 'application/octet-stream', + '.hxi' => 'application/octet-stream', + '.hxk' => 'application/xml', + '.hxq' => 'application/octet-stream', + '.hxr' => 'application/octet-stream', + '.hxs' => 'application/octet-stream', + '.hxt' => 'application/xml', + '.hxv' => 'application/xml', + '.hxw' => 'application/octet-stream', + '.ico' => 'image/x-icon', + '.iii' => 'application/x-iphone', + '.ins' => 'application/x-internet-signup', + '.iqy' => 'text/x-ms-iqy', + '.isp' => 'application/x-internet-signup', + '.jfif' => 'image/jpeg', + '.jnlp' => 'application/x-java-jnlp-file', + '.jpe' => 'image/jpeg', + '.jpeg' => 'image/jpeg', + '.jpg' => 'image/jpeg', + '.jtx' => 'application/x-jtx+xps', + '.latex' => 'application/x-latex', + '.log' => 'text/plain', + '.m1v' => 'video/mpeg', + '.m2v' => 'video/mpeg', + '.m3u' => 'audio/x-mpegurl', + '.mac' => 'image/x-macpaint', + '.man' => 'application/x-troff-man', + '.mda' => 'application/msaccess', + '.mdb' => 'application/msaccess', + '.mde' => 'application/msaccess', + '.mfp' => 'application/x-shockwave-flash', + '.mht' => 'message/rfc822', + '.mhtml' => 'message/rfc822', + '.mid' => 'audio/mid', + '.midi' => 'audio/mid', + '.mod' => 'video/mpeg', + '.mov' => 'video/quicktime', + '.mp2' => 'video/mpeg', + '.mp2v' => 'video/mpeg', + '.mp3' => 'audio/mpeg', + '.mp4' => 'video/mp4', + '.mpa' => 'video/mpeg', + '.mpe' => 'video/mpeg', + '.mpeg' => 'video/mpeg', + '.mpf' => 'application/vnd.ms-mediapackage', + '.mpg' => 'video/mpeg', + '.mpv2' => 'video/mpeg', + '.mqv' => 
'video/quicktime', + '.NMW' => 'application/nmwb', + '.nws' => 'message/rfc822', + '.odc' => 'text/x-ms-odc', + '.ols' => 'application/vnd.ms-publisher', + '.p10' => 'application/pkcs10', + '.p12' => 'application/x-pkcs12', + '.p7b' => 'application/x-pkcs7-certificates', + '.p7c' => 'application/pkcs7-mime', + '.p7m' => 'application/pkcs7-mime', + '.p7r' => 'application/x-pkcs7-certreqresp', + '.p7s' => 'application/pkcs7-signature', + '.pct' => 'image/pict', + '.pdf' => 'application/pdf', + '.pdx' => 'application/vnd.adobe.pdx', + '.pfx' => 'application/x-pkcs12', + '.pic' => 'image/pict', + '.pict' => 'image/pict', + '.pinstall' => 'application/x-picasa-detect', + '.pko' => 'application/vnd.ms-pki.pko', + '.png' => 'image/png', + '.pnt' => 'image/x-macpaint', + '.pntg' => 'image/x-macpaint', + '.pot' => 'application/vnd.ms-powerpoint', + '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12', + '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template', + '.ppa' => 'application/vnd.ms-powerpoint', + '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12', + '.pps' => 'application/vnd.ms-powerpoint', + '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12', + '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', + '.ppt' => 'application/vnd.ms-powerpoint', + '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12', + '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + '.prf' => 'application/pics-rules', + '.ps' => 'application/postscript', + '.pub' => 'application/vnd.ms-publisher', + '.pwz' => 'application/vnd.ms-powerpoint', + '.py' => 'text/plain', + '.pyw' => 'text/plain', + '.qht' => 'text/x-html-insertion', + '.qhtm' => 'text/x-html-insertion', + '.qt' => 'video/quicktime', + '.qti' => 'image/x-quicktime', + '.qtif' => 'image/x-quicktime', + '.qtl' => 'application/x-quicktimeplayer', + '.rat' => 
'application/rat-file', + '.rmf' => 'application/vnd.adobe.rmf', + '.rmi' => 'audio/mid', + '.rqy' => 'text/x-ms-rqy', + '.rtf' => 'application/msword', + '.sct' => 'text/scriptlet', + '.sd2' => 'audio/x-sd2', + '.sdp' => 'application/sdp', + '.shtml' => 'text/html', + '.sit' => 'application/x-stuffit', + '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12', + '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide', + '.slk' => 'application/vnd.ms-excel', + '.snd' => 'audio/basic', + '.so' => 'application/x-apachemodule', + '.sol' => 'text/plain', + '.sor' => 'text/plain', + '.spc' => 'application/x-pkcs7-certificates', + '.spl' => 'application/futuresplash', + '.sst' => 'application/vnd.ms-pki.certstore', + '.stl' => 'application/vnd.ms-pki.stl', + '.swf' => 'application/x-shockwave-flash', + '.thmx' => 'application/vnd.ms-officetheme', + '.tif' => 'image/tiff', + '.tiff' => 'image/tiff', + '.txt' => 'text/plain', + '.uls' => 'text/iuls', + '.vcf' => 'text/x-vcard', + '.vdx' => 'application/vnd.ms-visio.viewer', + '.vsd' => 'application/vnd.ms-visio.viewer', + '.vss' => 'application/vnd.ms-visio.viewer', + '.vst' => 'application/vnd.ms-visio.viewer', + '.vsx' => 'application/vnd.ms-visio.viewer', + '.vtx' => 'application/vnd.ms-visio.viewer', + '.wav' => 'audio/wav', + '.wax' => 'audio/x-ms-wax', + '.wbk' => 'application/msword', + '.wdp' => 'image/vnd.ms-photo', + '.wiz' => 'application/msword', + '.wm' => 'video/x-ms-wm', + '.wma' => 'audio/x-ms-wma', + '.wmd' => 'application/x-ms-wmd', + '.wmv' => 'video/x-ms-wmv', + '.wmx' => 'video/x-ms-wmx', + '.wmz' => 'application/x-ms-wmz', + '.wpl' => 'application/vnd.ms-wpl', + '.wsc' => 'text/scriptlet', + '.wvx' => 'video/x-ms-wvx', + '.xaml' => 'application/xaml+xml', + '.xbap' => 'application/x-ms-xbap', + '.xdp' => 'application/vnd.adobe.xdp+xml', + '.xfdf' => 'application/vnd.adobe.xfdf', + '.xht' => 'application/xhtml+xml', + '.xhtml' => 'application/xhtml+xml', + '.xla' => 
'application/vnd.ms-excel', + '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12', + '.xlk' => 'application/vnd.ms-excel', + '.xll' => 'application/vnd.ms-excel', + '.xlm' => 'application/vnd.ms-excel', + '.xls' => 'application/vnd.ms-excel', + '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12', + '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12', + '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.xlt' => 'application/vnd.ms-excel', + '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12', + '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', + '.xlw' => 'application/vnd.ms-excel', + '.xml' => 'text/xml', + '.xps' => 'application/vnd.ms-xpsdocument', + '.xsl' => 'text/xml', + ); + + /** + * IE versions which have been analysed to bring you this class, and for + * which some substantive difference exists. These will appear as keys + * in the return value of getRealMimesFromData(). The names are chosen to sort correctly. + */ + protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' ); + + /** + * Type table with versions expanded + */ + protected $typeTable = array(); + + /** constructor */ + function __construct() { + // Construct versioned type arrays from the base type array plus additions + $types = $this->baseTypeTable; + foreach ( $this->versions as $version ) { + if ( isset( $this->addedTypes[$version] ) ) { + foreach ( $this->addedTypes[$version] as $format => $addedTypes ) { + $types[$format] = array_merge( $types[$format], $addedTypes ); + } + } + $this->typeTable[$version] = $types; + } + } + + /** + * Get the MIME types from getMimesFromData(), but convert the result from IE's + * idiosyncratic private types into something other apps will understand. 
/**
 * Get the MIME types from getMimesFromData(), but convert the result from IE's
 * idiosyncratic private types into something other apps will understand.
 *
 * @param $fileName String: the file name; only its extension is consulted,
 *   as a last resort, by getMimeTypeForVersion()
 * @param $chunk String: the start of the file (only the first 255 bytes are examined)
 * @param $proposed String: the MIME type proposed by the server
 *
 * @return Array: map of IE version to detected mime type
 */
public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
	$types = $this->getMimesFromData( $fileName, $chunk, $proposed );
	$types = array_map( array( $this, 'translateMimeType' ), $types );
	return $types;
}

/**
 * Translate a MIME type from IE's idiosyncratic private types into
 * more commonly understood type strings
 *
 * @param $type String: MIME type as reported by the detection code
 * @return String: the translated type, or $type unchanged if there is
 *   no translation for it
 */
public function translateMimeType( $type ) {
	static $table = array(
		'image/pjpeg' => 'image/jpeg',
		'image/x-png' => 'image/png',
		'image/x-wmf' => 'application/x-msmetafile',
		'image/bmp' => 'image/x-bmp',
		'application/x-zip-compressed' => 'application/zip',
		'application/x-compressed' => 'application/x-compress',
		'application/x-gzip-compressed' => 'application/x-gzip',
		'audio/mid' => 'audio/midi',
	);
	if ( isset( $table[$type] ) ) {
		$type = $table[$type];
	}
	return $type;
}

/**
 * Get the untranslated MIME types for all known versions
 *
 * @param $fileName String: the file name; only its extension is consulted,
 *   as a last resort, by getMimeTypeForVersion()
 * @param $chunk String: the start of the file (only the first 255 bytes are examined)
 * @param $proposed String: the MIME type proposed by the server
 *
 * @return Array: map of IE version to detected mime type
 */
public function getMimesFromData( $fileName, $chunk, $proposed ) {
	$types = array();
	foreach ( $this->versions as $version ) {
		$types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );
	}
	return $types;
}

/**
 * Get the MIME type for a given named version
 *
 * @param $version String: one of the version names listed in $this->versions
 * @param $fileName String: the file name, used for the extension lookup
 *   in $this->registry when content detection is inconclusive
 * @param $chunk String: the start of the file; truncated to 255 bytes here
 * @param $proposed String: the MIME type proposed by the server; anything
 *   from the first semicolon onwards is ignored
 *
 * @return String: the detected MIME type (untranslated, i.e. possibly one of
 *   IE's private names such as image/pjpeg)
 */
protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
	// Strip text after a semicolon
	$semiPos = strpos( $proposed, ';' );
	if ( $semiPos !== false ) {
		$proposed = substr( $proposed, 0, $semiPos );
	}

	// A proposed type which is not in the type table is trusted as-is,
	// except for the two multipart types which are always sniffed
	$proposedFormat = $this->getDataFormat( $version, $proposed );
	if ( $proposedFormat == 'unknown'
		&& $proposed != 'multipart/mixed'
		&& $proposed != 'multipart/x-mixed-replace' )
	{
		return $proposed;
	}
	if ( strval( $chunk ) === '' ) {
		return $proposed;
	}

	// Truncate chunk at 255 bytes
	$chunk = substr( $chunk, 0, 255 );

	// IE does the Check*Headers() calls last, and instead does the following image
	// type checks by directly looking for the magic numbers. What I do here should
	// have the same effect since the magic number checks are identical in both cases.
	$result = $this->sampleData( $version, $chunk );
	$sampleFound = $result['found'];
	$counters = $result['counters'];
	$binaryType = $this->checkBinaryHeaders( $version, $chunk );
	$textType = $this->checkTextHeaders( $version, $chunk );

	if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
		return 'text/html';
	}
	if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
		return 'image/gif';
	}
	if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
		&& $binaryType == 'image/pjpeg' )
	{
		return $proposed;
	}
	// PNG check added in IE 7
	// (the version names are chosen so that string comparison sorts them correctly)
	if ( $version >= 'ie07'
		&& ( $proposed == 'image/x-png' || $proposed == 'image/png' )
		&& $binaryType == 'image/x-png' )
	{
		return $proposed;
	}

	// CDF was removed in IE 7 so it won't be in $sampleFound for later versions
	if ( isset( $sampleFound['cdf'] ) ) {
		return 'application/x-cdf';
	}

	// RSS and Atom were added in IE 7 so they won't be in $sampleFound for
	// previous versions
	if ( isset( $sampleFound['rss'] ) ) {
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['rdf-tag'] )
		&& isset( $sampleFound['rdf-url'] )
		&& isset( $sampleFound['rdf-purl'] ) )
	{
		return 'application/rss+xml';
	}
	if ( isset( $sampleFound['atom'] ) ) {
		return 'application/atom+xml';
	}

	if ( isset( $sampleFound['xml'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
				return 'text/xml';
			}
		} else {
			return 'text/xml';
		}
	}
	if ( isset( $sampleFound['html'] ) ) {
		// TODO: I'm not sure under what circumstances this flag is enabled
		if ( strpos( $version, 'nohtml' ) !== false ) {
			if ( $proposed == 'text/plain' ) {
				return 'text/html';
			}
		} else {
			return 'text/html';
		}
	}
	if ( isset( $sampleFound['xbm'] ) ) {
		return 'image/x-bitmap';
	}
	if ( isset( $sampleFound['binhex'] ) ) {
		return 'application/macbinhex40';
	}
	if ( isset( $sampleFound['scriptlet'] ) ) {
		if ( strpos( $version, 'strict' ) !== false ) {
			if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
				return 'text/scriptlet';
			}
		} else {
			return 'text/scriptlet';
		}
	}

	// Freaky heuristics to determine if the data is text or binary
	// The heuristic is of course broken for non-ASCII text
	if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
		< ( $counters['ctrl'] + $counters['high'] ) * 16 )
	{
		$kindOfBinary = true;
		$type = $binaryType ? $binaryType : $textType;
		if ( $type === false ) {
			$type = 'application/octet-stream';
		}
	} else {
		$kindOfBinary = false;
		$type = $textType ? $textType : $binaryType;
		if ( $type === false ) {
			$type = 'text/plain';
		}
	}

	// Check if the output format is ambiguous
	// This generally means that detection failed, real types aren't ambiguous
	$detectedFormat = $this->getDataFormat( $version, $type );
	if ( $detectedFormat != 'ambiguous' ) {
		return $type;
	}

	if ( $proposedFormat != 'ambiguous' ) {
		// FormatAgreesWithData()
		if ( $proposedFormat == 'text' && !$kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'binary' && $kindOfBinary ) {
			return $proposed;
		}
		if ( $proposedFormat == 'html' ) {
			return $proposed;
		}
	}

	// Find a MIME type by searching the registry for the file extension.
	$dotPos = strrpos( $fileName, '.' );
	if ( $dotPos === false ) {
		return $type;
	}
	$ext = substr( $fileName, $dotPos );
	if ( isset( $this->registry[$ext] ) ) {
		return $this->registry[$ext];
	}
	// Windows registry key names are not case sensitive, so "FOO.PNG" has to
	// find the ".png" entry as well. The exact match above is tried first so
	// that existing results are unchanged.
	foreach ( $this->registry as $registeredExt => $registeredType ) {
		if ( strcasecmp( $registeredExt, $ext ) === 0 ) {
			return $registeredType;
		}
	}

	// TODO: If the extension has an application registered to it, IE will return
	// application/octet-stream. We'll skip that, so we could erroneously
	// return text/plain or application/x-netcdf where application/octet-stream
	// would be correct.

	return $type;
}

/**
 * Check for text headers at the start of the chunk
 * Confirmed same in 5 and 7.
 *
 * @param $version String: IE version name (not used by these checks)
 * @param $chunk String: the data sample
 * @return String|false: MIME type, or false if no text header was recognised
 */
private function checkTextHeaders( $version, $chunk ) {
	$chunk2 = substr( $chunk, 0, 2 );
	$chunk4 = substr( $chunk, 0, 4 );
	$chunk5 = substr( $chunk, 0, 5 );
	if ( $chunk4 == '%PDF' ) {
		return 'application/pdf';
	}
	if ( $chunk2 == '%!' ) {
		return 'application/postscript';
	}
	if ( $chunk5 == '{\\rtf' ) {
		return 'text/richtext';
	}
	if ( $chunk5 == 'begin' ) {
		return 'application/base64';
	}
	return false;
}
/**
 * Identify a binary format from the magic number at the start of the chunk.
 * Confirmed same in 5 and 7.
 *
 * The checks are tried in a fixed order and the first one that matches wins.
 *
 * @param $version String: IE version name (not used by these checks)
 * @param $chunk String: the data sample
 * @return String|false: IE's name for the detected type, or false if no
 *   binary header was recognised
 */
private function checkBinaryHeaders( $version, $chunk ) {
	$head2 = substr( $chunk, 0, 2 );
	$head3 = substr( $chunk, 0, 3 );
	$head4 = substr( $chunk, 0, 4 );
	$head5 = substr( $chunk, 0, 5 );
	$head8 = substr( $chunk, 0, 8 );
	// Four bytes starting at offset 8: the sub-type tag of RIFF and FORM containers
	$tagAt8 = substr( $chunk, 8, 4 );

	// The GIF signature is matched without regard to case
	if ( in_array( strtoupper( $head5 ), array( 'GIF87', 'GIF89' ), true ) ) {
		return 'image/gif';
	}
	if ( $head2 === "\xff\xd8" ) {
		// actually plain JPEG but this is what IE returns
		return 'image/pjpeg';
	}

	if ( $head2 === 'BM' ) {
		$bytes6to7 = substr( $chunk, 6, 2 );
		$bytes8to9 = substr( $chunk, 8, 2 );
		if ( $bytes6to7 === "\000\000" && $bytes8to9 === "\000\000" ) {
			// another non-standard MIME
			return 'image/bmp';
		}
	}
	if ( $head4 === 'RIFF' && $tagAt8 === 'WAVE' ) {
		return 'audio/wav';
	}
	// These were integer literals in IE
	// Perhaps the author was not sure what the target endianness was
	$auMagics = array( ".sd\000", ".snd", "\000ds.", "dns." );
	if ( in_array( $head4, $auMagics, true ) ) {
		return 'audio/basic';
	}
	if ( $head3 === "MM\000" ) {
		return 'image/tiff';
	}
	if ( $head2 === 'MZ' ) {
		return 'application/x-msdownload';
	}
	if ( $head8 === "\x89PNG\x0d\x0a\x1a\x0a" ) {
		// [sic]
		return 'image/x-png';
	}
	// "JG" followed by a byte in the range 3..31 at offset 2 and a zero byte at offset 4
	if ( $head2 === 'JG' && strlen( $chunk ) >= 5 ) {
		$thirdByte = ord( $chunk[2] );
		$fifthByte = ord( $chunk[4] );
		if ( $thirdByte >= 3 && $thirdByte <= 31 && $fifthByte === 0 ) {
			return 'image/x-jg';
		}
	}
	// More endian confusion?
	if ( $head4 === 'MROF' ) {
		return 'audio/x-aiff';
	}
	if ( $head4 === 'FORM' && in_array( $tagAt8, array( 'AIFF', 'AIFC' ), true ) ) {
		return 'audio/x-aiff';
	}
	if ( $head4 === 'RIFF' && $tagAt8 === 'AVI ' ) {
		return 'video/avi';
	}
	if ( in_array( $head4, array( "\x00\x00\x01\xb3", "\x00\x00\x01\xba" ), true ) ) {
		return 'video/mpeg';
	}
	if ( $head4 === "\001\000\000\000" && substr( $chunk, 40, 4 ) === ' EMF' ) {
		return 'image/x-emf';
	}
	if ( $head4 === "\xd7\xcd\xc6\x9a" ) {
		return 'image/x-wmf';
	}
	if ( $head4 === "\xca\xfe\xba\xbe" ) {
		return 'application/java';
	}
	if ( $head2 === 'PK' ) {
		return 'application/x-zip-compressed';
	}
	if ( $head2 === "\x1f\x9d" ) {
		return 'application/x-compressed';
	}
	if ( $head2 === "\x1f\x8b" ) {
		return 'application/x-gzip-compressed';
	}
	// Skip redundant check for ZIP
	if ( $head5 === "MThd\000" ) {
		return 'audio/mid';
	}
	if ( $head4 === '%PDF' ) {
		return 'application/pdf';
	}
	return false;
}
/**
 * Do heuristic checks on the bulk of the data sample.
 * Search for HTML tags.
 *
 * Every byte of the chunk is classified into one of the counters, and the
 * text following each printable byte is compared against a set of known
 * markers (XML declaration, HTML tags, feed tags, RDF namespace URLs, XBM
 * and BinHex signatures).
 *
 * @param $version String: IE version name; versions sorting before 'ie07'
 *   look for CHANNEL (CDF), later ones for RSS, rdf:RDF and FEED instead
 * @param $chunk String: the data sample
 * @return Array with two elements:
 *   - 'found': map of marker name => true for every marker that was seen
 *   - 'counters': map of byte class (ctrl, high, low, lf, cr, ff) => count
 */
protected function sampleData( $version, $chunk ) {
	$found = array();
	$counters = array(
		'ctrl' => 0,
		'high' => 0,
		'low' => 0,
		'lf' => 0,
		'cr' => 0,
		'ff' => 0
	);
	$htmlTags = array(
		'html',
		'head',
		'title',
		'body',
		'script',
		'a href',
		'pre',
		'img',
		'plaintext',
		'table'
	);
	$rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
	$rdfPurl = 'http://purl.org/rss/1.0/';
	$xbmMagic1 = '#define';
	$xbmMagic2 = '_width';
	$xbmMagic3 = '_bits';
	$binhexMagic = 'converted with BinHex';

	// The chunk is not modified in the loop, so measure it once
	$chunkLength = strlen( $chunk );
	for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
		$curChar = $chunk[$offset];
		if ( $curChar == "\x0a" ) {
			$counters['lf']++;
			continue;
		} elseif ( $curChar == "\x0d" ) {
			$counters['cr']++;
			continue;
		} elseif ( $curChar == "\x0c" ) {
			$counters['ff']++;
			continue;
		} elseif ( $curChar == "\t" ) {
			$counters['low']++;
			continue;
		} elseif ( ord( $curChar ) < 32 ) {
			$counters['ctrl']++;
			continue;
		} elseif ( ord( $curChar ) >= 128 ) {
			$counters['high']++;
			continue;
		}

		$counters['low']++;
		if ( $curChar == '<' ) {
			// XML
			$remainder = substr( $chunk, $offset + 1 );
			if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
				$nextChar = substr( $chunk, $offset + 5, 1 );
				if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
					$found['xml'] = true;
				}
			}
			// Scriptlet (JSP)
			if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
				$found['scriptlet'] = true;
				break;
			}
			// HTML
			foreach ( $htmlTags as $tag ) {
				if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
					$found['html'] = true;
				}
			}
			// Skip broken check for additional tags (HR etc.)

			// CHANNEL replaced by RSS, RDF and FEED in IE 7
			if ( $version < 'ie07' ) {
				if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
					$found['cdf'] = true;
				}
			} else {
				// RSS
				if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
					$found['rss'] = true;
					break; // return from SampleData
				}
				if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
					$found['rdf-tag'] = true;
					// no break
				}
				if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
					$found['atom'] = true;
					break;
				}
			}
			continue;
		}
		// Skip broken check for -->

		// RSS URL checks
		// For some reason both URLs must appear before it is recognised
		$remainder = substr( $chunk, $offset );
		if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
			$found['rdf-url'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-purl'] ) ) // [sic]
			{
				break;
			}
			continue;
		}

		if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
			// Record the match. Without this the 'rdf-purl' flag is never
			// set anywhere, so a caller requiring rdf-tag, rdf-url and
			// rdf-purl together could never see all three.
			// NOTE(review): assumed to be an omission rather than deliberate
			// emulation of IE -- confirm against IE 7 behaviour.
			$found['rdf-purl'] = true;
			if ( isset( $found['rdf-tag'] )
				&& isset( $found['rdf-url'] ) ) // [sic]
			{
				break;
			}
			continue;
		}

		// XBM checks
		// The three magic strings have to appear in order: #define, _width, _bits
		if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
			$found['xbm1'] = true;
			continue;
		}
		if ( $curChar == '_' ) {
			if ( isset( $found['xbm2'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
					$found['xbm'] = true;
					break;
				}
			} elseif ( isset( $found['xbm1'] ) ) {
				if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
					$found['xbm2'] = true;
				}
			}
		}

		// BinHex (this one is compared case-sensitively)
		if ( !strncmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
			$found['binhex'] = true;
		}
	}
	return array( 'found' => $found, 'counters' => $counters );
}

/**
 * Look up the data format of a MIME type in the type table of the given
 * version.
 *
 * @param $version String: IE version name, a key of $this->typeTable
 * @param $type String: MIME type
 * @return String: 'ambiguous' if $type is empty or '(null)', the name of the
 *   type table entry whose list contains $type, or 'unknown' if no list
 *   contains it
 */
protected function getDataFormat( $version, $type ) {
	$types = $this->typeTable[$version];
	if ( $type == '(null)' || strval( $type ) === '' ) {
		return 'ambiguous';
	}
	foreach ( $types as $format => $list ) {
		if ( in_array( $type, $list ) ) {
			return $format;
		}
	}
	return 'unknown';
}
} // end of class IEContentAnalyzer (the class header precedes this chunk)
/**
 * Internet Explorer derives a cache filename from a URL, and then in certain
 * circumstances, uses the extension of the resulting file to determine the
 * content type of the data, ignoring the Content-Type header.
 *
 * This can be a problem, especially when non-HTML content is sent by MediaWiki,
 * and Internet Explorer interprets it as HTML, exposing an XSS vulnerability.
 *
 * Usually the script filename (e.g. api.php) is present in the URL, and this
 * makes Internet Explorer think the extension is a harmless script extension.
 * But Internet Explorer 6 and earlier allows the script extension to be
 * obscured by encoding the dot as "%2E".
 *
 * This class contains functions which help in detecting and dealing with this
 * situation.
 *
 * Checking the URL for a bad extension is somewhat complicated due to the fact
 * that CGI doesn't provide a standard method to determine the URL. Instead it
 * is necessary to pass a subset of $_SERVER variables, which we then attempt
 * to use to guess parts of the URL.
 */
class IEUrlExtension {
	/**
	 * Check a subset of $_SERVER (or the whole of $_SERVER if you like)
	 * to see if it indicates that the request was sent with a bad file
	 * extension. Returns true if the request should be denied or modified,
	 * false otherwise. The relevant $_SERVER elements are:
	 *
	 *   - SERVER_SOFTWARE
	 *   - REQUEST_URI
	 *   - QUERY_STRING
	 *   - PATH_INFO
	 *
	 * If a variable is unset in $_SERVER, it should be unset in $vars.
	 *
	 * @param $vars A subset of $_SERVER.
	 * @param $extWhitelist Extensions which are allowed, assumed harmless.
	 * @return bool
	 */
	public static function areServerVarsBad( $vars, $extWhitelist = array() ) {
		// Check QUERY_STRING or REQUEST_URI
		// REQUEST_URI is only used on servers known to leave "%2E" undecoded in it
		if ( isset( $vars['SERVER_SOFTWARE'] )
			&& isset( $vars['REQUEST_URI'] )
			&& self::haveUndecodedRequestUri( $vars['SERVER_SOFTWARE'] ) )
		{
			$urlPart = $vars['REQUEST_URI'];
		} elseif ( isset( $vars['QUERY_STRING'] ) ) {
			$urlPart = $vars['QUERY_STRING'];
		} else {
			$urlPart = '';
		}

		if ( self::isUrlExtensionBad( $urlPart, $extWhitelist ) ) {
			return true;
		}

		// Some servers have PATH_INFO but not REQUEST_URI, so we check both
		// to be on the safe side.
		if ( isset( $vars['PATH_INFO'] )
			&& self::isUrlExtensionBad( $vars['PATH_INFO'], $extWhitelist ) )
		{
			return true;
		}

		// All checks passed
		return false;
	}

	/**
	 * Given a right-hand portion of a URL, determine whether IE would detect
	 * a potentially harmful file extension.
	 *
	 * @param $urlPart The right-hand portion of a URL
	 * @param $extWhitelist An array of file extensions which may occur in this
	 *   URL, and which should be allowed.
	 * @return bool
	 */
	public static function isUrlExtensionBad( $urlPart, $extWhitelist = array() ) {
		if ( strval( $urlPart ) === '' ) {
			return false;
		}

		$extension = self::findIE6Extension( $urlPart );
		if ( strval( $extension ) === '' ) {
			// No extension or empty extension
			return false;
		}

		// Strict comparison: a loose one would treat numeric-looking
		// extensions such as "1e1" and "10" as equal
		if ( in_array( $extension, array( 'php', 'php5' ), true ) ) {
			// Script extension, OK
			return false;
		}
		if ( in_array( $extension, $extWhitelist, true ) ) {
			// Whitelisted extension
			return false;
		}

		if ( !preg_match( '/^[a-zA-Z0-9_-]+$/', $extension ) ) {
			// Non-alphanumeric extension, unlikely to be registered.
			//
			// The regex above is known to match all registered file extensions
			// in a default Windows XP installation. It's important to allow
			// extensions with ampersands and percent signs, since that reduces
			// the number of false positives substantially.
			return false;
		}

		// Possibly bad extension
		return true;
	}

	/**
	 * Returns a variant of $url which will pass isUrlExtensionBad() but has the
	 * same GET parameters, or false if it can't figure one out.
	 *
	 * @param $url string URL
	 * @param $extWhitelist An array of file extensions which should be allowed
	 * @return mixed The modified URL (string), or false
	 */
	public static function fixUrlForIE6( $url, $extWhitelist = array() ) {
		$questionPos = strpos( $url, '?' );
		if ( $questionPos === false ) {
			$beforeQuery = $url . '?';
			$query = '';
		} elseif ( $questionPos === strlen( $url ) - 1 ) {
			$beforeQuery = $url;
			$query = '';
		} else {
			$beforeQuery = substr( $url, 0, $questionPos + 1 );
			$query = substr( $url, $questionPos + 1 );
		}

		// Multiple question marks cause problems. Encode the second and
		// subsequent question mark. ("%3F" is the percent-encoding of "?";
		// "%3E" would decode to ">" and change the parameter values.)
		$query = str_replace( '?', '%3F', $query );
		// Append an invalid path character so that IE6 won't see the end of the
		// query string as an extension
		$query .= '&*';
		// Put the URL back together
		$url = $beforeQuery . $query;
		if ( self::isUrlExtensionBad( $url, $extWhitelist ) ) {
			// Avoid a redirect loop
			return false;
		}
		return $url;
	}

	/**
	 * Determine what extension IE6 will infer from a certain query string.
	 * If the URL has an extension before the question mark, IE6 will use
	 * that and ignore the query string, but per the comment at
	 * areServerVarsBad() we don't have a reliable way to determine the URL,
	 * so areServerVarsBad() may pass in just the query string for $url.
	 * All entry points have safe extensions (php, php5) anyway, so
	 * checking the query string is possibly overly paranoid but never
	 * insecure.
	 *
	 * The criteria for finding an extension are as follows:
	 * - anything from the first # onwards is ignored
	 * - a possible extension is a dot followed by one or more characters not
	 *   in <>\"/:|?*.
	 * - if we find a possible extension followed by the end of the string or
	 *   a #, that's our extension
	 * - if we find a possible extension followed by a ?, that's our extension
	 *    - UNLESS it's exe, dll or cgi, in which case we ignore it and continue
	 *      searching for another possible extension
	 * - if we find a possible extension followed by a dot or another illegal
	 *   character, we ignore it and continue searching
	 *
	 * @param $url string URL
	 * @return mixed Detected extension (string), or false if none found
	 */
	public static function findIE6Extension( $url ) {
		$pos = 0;
		$hashPos = strpos( $url, '#' );
		if ( $hashPos !== false ) {
			$urlLength = $hashPos;
		} else {
			$urlLength = strlen( $url );
		}
		$remainingLength = $urlLength;
		while ( $remainingLength > 0 ) {
			// Skip ahead to the next dot
			$pos += strcspn( $url, '.', $pos, $remainingLength );
			if ( $pos >= $urlLength ) {
				// End of string, we're done
				return false;
			}

			// We found a dot. Skip past it
			$pos++;
			$remainingLength = $urlLength - $pos;

			// Check for illegal characters in our prospective extension,
			// or for another dot
			$nextPos = $pos + strcspn( $url, "<>\\\"/:|?*.", $pos, $remainingLength );
			if ( $nextPos >= $urlLength ) {
				// No illegal character or next dot
				// We have our extension
				return substr( $url, $pos, $urlLength - $pos );
			}
			if ( $url[$nextPos] === '?' ) {
				// We've found a legal extension followed by a question mark
				// If the extension is NOT exe, dll or cgi, return it
				$extension = substr( $url, $pos, $nextPos - $pos );
				if ( strcasecmp( $extension, 'exe' ) && strcasecmp( $extension, 'dll' ) &&
					strcasecmp( $extension, 'cgi' ) )
				{
					return $extension;
				}
				// Else continue looking
			}
			// We found an illegal character or another dot
			// Skip to that character and continue the loop
			$pos = $nextPos + 1;
			$remainingLength = $urlLength - $pos;
		}
		return false;
	}

	/**
	 * When passed the value of $_SERVER['SERVER_SOFTWARE'], this function
	 * returns true if that server is known to have a REQUEST_URI variable
	 * with %2E not decoded to ".". On such a server, it is possible to detect
	 * whether the script filename has been obscured.
	 *
	 * The function returns false if the server is not known to have this
	 * behaviour. Microsoft IIS in particular is known to decode escaped script
	 * filenames.
	 *
	 * SERVER_SOFTWARE typically contains either a plain string such as "Zeus",
	 * or a specification in the style of a User-Agent header, such as
	 * "Apache/1.3.34 (Unix) mod_ssl/2.8.25 OpenSSL/0.9.8a PHP/4.4.2"
	 *
	 * @param $serverSoftware
	 * @return bool
	 *
	 */
	public static function haveUndecodedRequestUri( $serverSoftware ) {
		static $whitelist = array(
			'Apache',
			'Zeus',
			'LiteSpeed' );
		// The product name is everything up to the first slash or space
		if ( preg_match( '/^(.*?)($|\/| )/', $serverSoftware, $m ) ) {
			return in_array( $m[1], $whitelist );
		} else {
			return false;
		}
	}

}
OTOH, libraries like jsminplus, that do parse the code correctly are rather + * slow, because they construct a complete parse tree before outputting the code minified. + * So this class is meant to allow arbitrary (but syntactically correct) input, while being + * fast enough to be used for on-the-fly minifying. + * + * Author: Paul Copperman <paul.copperman@gmail.com> + * License: choose any of Apache, MIT, GPL, LGPL + */ + +class JavaScriptMinifier { + + /* Class constants */ + /* Parsing states. + * The state machine is only necessary to decide whether to parse a slash as division + * operator or as regexp literal. + * States are named after the next expected item. We only distinguish states when the + * distinction is relevant for our purpose. + */ + const STATEMENT = 0; + const CONDITION = 1; + const PROPERTY_ASSIGNMENT = 2; + const EXPRESSION = 3; + const EXPRESSION_NO_NL = 4; // only relevant for semicolon insertion + const EXPRESSION_OP = 5; + const EXPRESSION_FUNC = 6; + const EXPRESSION_TERNARY = 7; // used to determine the role of a colon + const EXPRESSION_TERNARY_OP = 8; + const EXPRESSION_TERNARY_FUNC = 9; + const PAREN_EXPRESSION = 10; // expression which is not on the top level + const PAREN_EXPRESSION_OP = 11; + const PAREN_EXPRESSION_FUNC = 12; + const PROPERTY_EXPRESSION = 13; // expression which is within an object literal + const PROPERTY_EXPRESSION_OP = 14; + const PROPERTY_EXPRESSION_FUNC = 15; + + /* Token types */ + const TYPE_UN_OP = 1; // unary operators + const TYPE_INCR_OP = 2; // ++ and -- + const TYPE_BIN_OP = 3; // binary operators + const TYPE_ADD_OP = 4; // + and - which can be either unary or binary ops + const TYPE_HOOK = 5; // ? 
+ const TYPE_COLON = 6; // : + const TYPE_COMMA = 7; // , + const TYPE_SEMICOLON = 8; // ; + const TYPE_BRACE_OPEN = 9; // { + const TYPE_BRACE_CLOSE = 10; // } + const TYPE_PAREN_OPEN = 11; // ( and [ + const TYPE_PAREN_CLOSE = 12; // ) and ] + const TYPE_RETURN = 13; // keywords: break, continue, return, throw + const TYPE_IF = 14; // keywords: catch, for, with, switch, while, if + const TYPE_DO = 15; // keywords: case, var, finally, else, do, try + const TYPE_FUNC = 16; // keywords: function + const TYPE_LITERAL = 17; // all literals, identifiers and unrecognised tokens + + // Sanity limit to avoid excessive memory usage + const STACK_LIMIT = 1000; + + /* Static functions */ + + /** + * Returns minified JavaScript code. + * + * NOTE: $maxLineLength isn't a strict maximum. Longer lines will be produced when + * literals (e.g. quoted strings) longer than $maxLineLength are encountered + * or when required to guard against semicolon insertion. + * + * @param $s String JavaScript code to minify + * @param $statementsOnOwnLine Bool Whether to put each statement on its own line + * @param $maxLineLength Int Maximum length of a single line, or -1 for no maximum. + * @return String Minified code + */ + public static function minify( $s, $statementsOnOwnLine = false, $maxLineLength = 1000 ) { + // First we declare a few tables that contain our parsing rules + + // $opChars : characters, which can be combined without whitespace in between them + $opChars = array( + '!' => true, + '"' => true, + '%' => true, + '&' => true, + "'" => true, + '(' => true, + ')' => true, + '*' => true, + '+' => true, + ',' => true, + '-' => true, + '.' => true, + '/' => true, + ':' => true, + ';' => true, + '<' => true, + '=' => true, + '>' => true, + '?' => true, + '[' => true, + ']' => true, + '^' => true, + '{' => true, + '|' => true, + '}' => true, + '~' => true + ); + + // $tokenTypes : maps keywords and operators to their corresponding token type + $tokenTypes = array( + '!' 
=> self::TYPE_UN_OP, + '~' => self::TYPE_UN_OP, + 'delete' => self::TYPE_UN_OP, + 'new' => self::TYPE_UN_OP, + 'typeof' => self::TYPE_UN_OP, + 'void' => self::TYPE_UN_OP, + '++' => self::TYPE_INCR_OP, + '--' => self::TYPE_INCR_OP, + '!=' => self::TYPE_BIN_OP, + '!==' => self::TYPE_BIN_OP, + '%' => self::TYPE_BIN_OP, + '%=' => self::TYPE_BIN_OP, + '&' => self::TYPE_BIN_OP, + '&&' => self::TYPE_BIN_OP, + '&=' => self::TYPE_BIN_OP, + '*' => self::TYPE_BIN_OP, + '*=' => self::TYPE_BIN_OP, + '+=' => self::TYPE_BIN_OP, + '-=' => self::TYPE_BIN_OP, + '.' => self::TYPE_BIN_OP, + '/' => self::TYPE_BIN_OP, + '/=' => self::TYPE_BIN_OP, + '<' => self::TYPE_BIN_OP, + '<<' => self::TYPE_BIN_OP, + '<<=' => self::TYPE_BIN_OP, + '<=' => self::TYPE_BIN_OP, + '=' => self::TYPE_BIN_OP, + '==' => self::TYPE_BIN_OP, + '===' => self::TYPE_BIN_OP, + '>' => self::TYPE_BIN_OP, + '>=' => self::TYPE_BIN_OP, + '>>' => self::TYPE_BIN_OP, + '>>=' => self::TYPE_BIN_OP, + '>>>' => self::TYPE_BIN_OP, + '>>>=' => self::TYPE_BIN_OP, + '^' => self::TYPE_BIN_OP, + '^=' => self::TYPE_BIN_OP, + '|' => self::TYPE_BIN_OP, + '|=' => self::TYPE_BIN_OP, + '||' => self::TYPE_BIN_OP, + 'in' => self::TYPE_BIN_OP, + 'instanceof' => self::TYPE_BIN_OP, + '+' => self::TYPE_ADD_OP, + '-' => self::TYPE_ADD_OP, + '?' 
=> self::TYPE_HOOK, + ':' => self::TYPE_COLON, + ',' => self::TYPE_COMMA, + ';' => self::TYPE_SEMICOLON, + '{' => self::TYPE_BRACE_OPEN, + '}' => self::TYPE_BRACE_CLOSE, + '(' => self::TYPE_PAREN_OPEN, + '[' => self::TYPE_PAREN_OPEN, + ')' => self::TYPE_PAREN_CLOSE, + ']' => self::TYPE_PAREN_CLOSE, + 'break' => self::TYPE_RETURN, + 'continue' => self::TYPE_RETURN, + 'return' => self::TYPE_RETURN, + 'throw' => self::TYPE_RETURN, + 'catch' => self::TYPE_IF, + 'for' => self::TYPE_IF, + 'if' => self::TYPE_IF, + 'switch' => self::TYPE_IF, + 'while' => self::TYPE_IF, + 'with' => self::TYPE_IF, + 'case' => self::TYPE_DO, + 'do' => self::TYPE_DO, + 'else' => self::TYPE_DO, + 'finally' => self::TYPE_DO, + 'try' => self::TYPE_DO, + 'var' => self::TYPE_DO, + 'function' => self::TYPE_FUNC + ); + + // $goto : This is the main table for our state machine. For every state/token pair + // the following state is defined. When no rule exists for a given pair, + // the state is left unchanged. + $goto = array( + self::STATEMENT => array( + self::TYPE_UN_OP => self::EXPRESSION, + self::TYPE_INCR_OP => self::EXPRESSION, + self::TYPE_ADD_OP => self::EXPRESSION, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_RETURN => self::EXPRESSION_NO_NL, + self::TYPE_IF => self::CONDITION, + self::TYPE_FUNC => self::CONDITION, + self::TYPE_LITERAL => self::EXPRESSION_OP + ), + self::CONDITION => array( + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION + ), + self::PROPERTY_ASSIGNMENT => array( + self::TYPE_COLON => self::PROPERTY_EXPRESSION, + self::TYPE_BRACE_OPEN => self::STATEMENT + ), + self::EXPRESSION => array( + self::TYPE_SEMICOLON => self::STATEMENT, + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_FUNC => self::EXPRESSION_FUNC, + self::TYPE_LITERAL => self::EXPRESSION_OP + ), + self::EXPRESSION_NO_NL => array( + self::TYPE_SEMICOLON => self::STATEMENT, + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT, 
+ self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_FUNC => self::EXPRESSION_FUNC, + self::TYPE_LITERAL => self::EXPRESSION_OP + ), + self::EXPRESSION_OP => array( + self::TYPE_BIN_OP => self::EXPRESSION, + self::TYPE_ADD_OP => self::EXPRESSION, + self::TYPE_HOOK => self::EXPRESSION_TERNARY, + self::TYPE_COLON => self::STATEMENT, + self::TYPE_COMMA => self::EXPRESSION, + self::TYPE_SEMICOLON => self::STATEMENT, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION + ), + self::EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::STATEMENT + ), + self::EXPRESSION_TERNARY => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_FUNC => self::EXPRESSION_TERNARY_FUNC, + self::TYPE_LITERAL => self::EXPRESSION_TERNARY_OP + ), + self::EXPRESSION_TERNARY_OP => array( + self::TYPE_BIN_OP => self::EXPRESSION_TERNARY, + self::TYPE_ADD_OP => self::EXPRESSION_TERNARY, + self::TYPE_HOOK => self::EXPRESSION_TERNARY, + self::TYPE_COMMA => self::EXPRESSION_TERNARY, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION + ), + self::EXPRESSION_TERNARY_FUNC => array( + self::TYPE_BRACE_OPEN => self::STATEMENT + ), + self::PAREN_EXPRESSION => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_FUNC => self::PAREN_EXPRESSION_FUNC, + self::TYPE_LITERAL => self::PAREN_EXPRESSION_OP + ), + self::PAREN_EXPRESSION_OP => array( + self::TYPE_BIN_OP => self::PAREN_EXPRESSION, + self::TYPE_ADD_OP => self::PAREN_EXPRESSION, + self::TYPE_HOOK => self::PAREN_EXPRESSION, + self::TYPE_COLON => self::PAREN_EXPRESSION, + self::TYPE_COMMA => self::PAREN_EXPRESSION, + self::TYPE_SEMICOLON => self::PAREN_EXPRESSION, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION + ), + self::PAREN_EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::STATEMENT + ), + self::PROPERTY_EXPRESSION => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT, + 
self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION, + self::TYPE_FUNC => self::PROPERTY_EXPRESSION_FUNC, + self::TYPE_LITERAL => self::PROPERTY_EXPRESSION_OP + ), + self::PROPERTY_EXPRESSION_OP => array( + self::TYPE_BIN_OP => self::PROPERTY_EXPRESSION, + self::TYPE_ADD_OP => self::PROPERTY_EXPRESSION, + self::TYPE_HOOK => self::PROPERTY_EXPRESSION, + self::TYPE_COMMA => self::PROPERTY_ASSIGNMENT, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION + ), + self::PROPERTY_EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::STATEMENT + ) + ); + + // $push : This table contains the rules for when to push a state onto the stack. + // The pushed state is the state to return to when the corresponding + // closing token is found + $push = array( + self::STATEMENT => array( + self::TYPE_BRACE_OPEN => self::STATEMENT, + self::TYPE_PAREN_OPEN => self::EXPRESSION_OP + ), + self::CONDITION => array( + self::TYPE_PAREN_OPEN => self::STATEMENT + ), + self::PROPERTY_ASSIGNMENT => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT + ), + self::EXPRESSION => array( + self::TYPE_BRACE_OPEN => self::EXPRESSION_OP, + self::TYPE_PAREN_OPEN => self::EXPRESSION_OP + ), + self::EXPRESSION_NO_NL => array( + self::TYPE_BRACE_OPEN => self::EXPRESSION_OP, + self::TYPE_PAREN_OPEN => self::EXPRESSION_OP + ), + self::EXPRESSION_OP => array( + self::TYPE_HOOK => self::EXPRESSION, + self::TYPE_PAREN_OPEN => self::EXPRESSION_OP + ), + self::EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::EXPRESSION_OP + ), + self::EXPRESSION_TERNARY => array( + self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP, + self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP + ), + self::EXPRESSION_TERNARY_OP => array( + self::TYPE_HOOK => self::EXPRESSION_TERNARY, + self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP + ), + self::EXPRESSION_TERNARY_FUNC => array( + self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP + ), + self::PAREN_EXPRESSION => array( + self::TYPE_BRACE_OPEN => 
self::PAREN_EXPRESSION_OP, + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP + ), + self::PAREN_EXPRESSION_OP => array( + self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP + ), + self::PAREN_EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP + ), + self::PROPERTY_EXPRESSION => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP, + self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP + ), + self::PROPERTY_EXPRESSION_OP => array( + self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP + ), + self::PROPERTY_EXPRESSION_FUNC => array( + self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP + ) + ); + + // $pop : Rules for when to pop a state from the stack + $pop = array( + self::STATEMENT => array( self::TYPE_BRACE_CLOSE => true ), + self::PROPERTY_ASSIGNMENT => array( self::TYPE_BRACE_CLOSE => true ), + self::EXPRESSION => array( self::TYPE_BRACE_CLOSE => true ), + self::EXPRESSION_NO_NL => array( self::TYPE_BRACE_CLOSE => true ), + self::EXPRESSION_OP => array( self::TYPE_BRACE_CLOSE => true ), + self::EXPRESSION_TERNARY_OP => array( self::TYPE_COLON => true ), + self::PAREN_EXPRESSION => array( self::TYPE_PAREN_CLOSE => true ), + self::PAREN_EXPRESSION_OP => array( self::TYPE_PAREN_CLOSE => true ), + self::PROPERTY_EXPRESSION => array( self::TYPE_BRACE_CLOSE => true ), + self::PROPERTY_EXPRESSION_OP => array( self::TYPE_BRACE_CLOSE => true ) + ); + + // $semicolon : Rules for when a semicolon insertion is appropriate + $semicolon = array( + self::EXPRESSION_NO_NL => array( + self::TYPE_UN_OP => true, + self::TYPE_INCR_OP => true, + self::TYPE_ADD_OP => true, + self::TYPE_BRACE_OPEN => true, + self::TYPE_PAREN_OPEN => true, + self::TYPE_RETURN => true, + self::TYPE_IF => true, + self::TYPE_DO => true, + self::TYPE_FUNC => true, + self::TYPE_LITERAL => true + ), + self::EXPRESSION_OP => array( + self::TYPE_UN_OP => true, + self::TYPE_INCR_OP => true, + self::TYPE_BRACE_OPEN => true, + self::TYPE_RETURN => true, + 
self::TYPE_IF => true, + self::TYPE_DO => true, + self::TYPE_FUNC => true, + self::TYPE_LITERAL => true + ) + ); + + // Rules for when newlines should be inserted if + // $statementsOnOwnLine is enabled. + // $newlineBefore is checked before switching state, + // $newlineAfter is checked after + $newlineBefore = array( + self::STATEMENT => array( + self::TYPE_BRACE_CLOSE => true, + ), + ); + $newlineAfter = array( + self::STATEMENT => array( + self::TYPE_BRACE_OPEN => true, + self::TYPE_PAREN_CLOSE => true, + self::TYPE_SEMICOLON => true, + ), + ); + + // $divStates : Contains all states that can be followed by a division operator + $divStates = array( + self::EXPRESSION_OP => true, + self::EXPRESSION_TERNARY_OP => true, + self::PAREN_EXPRESSION_OP => true, + self::PROPERTY_EXPRESSION_OP => true + ); + + // Here's where the minifying takes place: Loop through the input, looking for tokens + // and output them to $out, taking actions to the above defined rules when appropriate. + $out = ''; + $pos = 0; + $length = strlen( $s ); + $lineLength = 0; + $newlineFound = true; + $state = self::STATEMENT; + $stack = array(); + $last = ';'; // Pretend that we have seen a semicolon yet + while( $pos < $length ) { + // First, skip over any whitespace and multiline comments, recording whether we + // found any newline character + $skip = strspn( $s, " \t\n\r\xb\xc", $pos ); + if( !$skip ) { + $ch = $s[$pos]; + if( $ch === '/' && substr( $s, $pos, 2 ) === '/*' ) { + // Multiline comment. Search for the end token or EOT. + $end = strpos( $s, '*/', $pos + 2 ); + $skip = $end === false ? $length - $pos : $end - $pos + 2; + } + } + if( $skip ) { + // The semicolon insertion mechanism needs to know whether there was a newline + // between two tokens, so record it now. 
+ if( !$newlineFound && strcspn( $s, "\r\n", $pos, $skip ) !== $skip ) { + $newlineFound = true; + } + $pos += $skip; + continue; + } + // Handle C++-style comments and html comments, which are treated as single line + // comments by the browser, regardless of whether the end tag is on the same line. + // Handle --> the same way, but only if it's at the beginning of the line + if( ( $ch === '/' && substr( $s, $pos, 2 ) === '//' ) + || ( $ch === '<' && substr( $s, $pos, 4 ) === '<!--' ) + || ( $ch === '-' && $newlineFound && substr( $s, $pos, 3 ) === '-->' ) + ) { + $pos += strcspn( $s, "\r\n", $pos ); + continue; + } + + // Find out which kind of token we're handling. $end will point past the end of it. + $end = $pos + 1; + // Handle string literals + if( $ch === "'" || $ch === '"' ) { + // Search to the end of the string literal, skipping over backslash escapes + $search = $ch . '\\'; + do{ + $end += strcspn( $s, $search, $end ) + 2; + } while( $end - 2 < $length && $s[$end - 2] === '\\' ); + $end--; + // We have to distinguish between regexp literals and division operators + // A division operator is only possible in certain states + } elseif( $ch === '/' && !isset( $divStates[$state] ) ) { + // Regexp literal, search to the end, skipping over backslash escapes and + // character classes + for( ; ; ) { + do{ + $end += strcspn( $s, '/[\\', $end ) + 2; + } while( $end - 2 < $length && $s[$end - 2] === '\\' ); + $end--; + if( $end - 1 >= $length || $s[$end - 1] === '/' ) { + break; + } + do{ + $end += strcspn( $s, ']\\', $end ) + 2; + } while( $end - 2 < $length && $s[$end - 2] === '\\' ); + $end--; + }; + // Search past the regexp modifiers (gi) + while( $end < $length && ctype_alpha( $s[$end] ) ) { + $end++; + } + } elseif( + ctype_digit( $ch ) + || ( $ch === '.' && $pos + 1 < $length && ctype_digit( $s[$pos + 1] ) ) + ) { + // Numeric literal. 
Search for the end of it, but don't care about [+-]exponent + // at the end, as the results of "numeric [+-] numeric" and "numeric" are + // identical to our state machine. + $end += strspn( $s, '0123456789ABCDEFabcdefXx.', $end ); + while( $s[$end - 1] === '.' ) { + // Special case: When a numeric ends with a dot, we have to check the + // literal for proper syntax + $decimal = strspn( $s, '0123456789', $pos, $end - $pos - 1 ); + if( $decimal === $end - $pos - 1 ) { + break; + } else { + $end--; + } + } + } elseif( isset( $opChars[$ch] ) ) { + // Punctuation character. Search for the longest matching operator. + while( + $end < $length + && isset( $tokenTypes[substr( $s, $pos, $end - $pos + 1 )] ) + ) { + $end++; + } + } else { + // Identifier or reserved word. Search for the end by excluding whitespace and + // punctuation. + $end += strcspn( $s, " \t\n.;,=<>+-{}()[]?:*/%'\"!&|^~\xb\xc\r", $end ); + } + + // Now get the token type from our type array + $token = substr( $s, $pos, $end - $pos ); // so $end - $pos == strlen( $token ) + $type = isset( $tokenTypes[$token] ) ? $tokenTypes[$token] : self::TYPE_LITERAL; + + if( $newlineFound && isset( $semicolon[$state][$type] ) ) { + // This token triggers the semicolon insertion mechanism of javascript. While we + // could add the ; token here ourselves, keeping the newline has a few advantages. + $out .= "\n"; + $state = self::STATEMENT; + $lineLength = 0; + } elseif( $maxLineLength > 0 && $lineLength + $end - $pos > $maxLineLength && + !isset( $semicolon[$state][$type] ) && $type !== self::TYPE_INCR_OP ) + { + // This line would get too long if we added $token, so add a newline first. + // Only do this if it won't trigger semicolon insertion and if it won't + // put a postfix increment operator on its own line, which is illegal in js. 
+ $out .= "\n"; + $lineLength = 0; + // Check, whether we have to separate the token from the last one with whitespace + } elseif( !isset( $opChars[$last] ) && !isset( $opChars[$ch] ) ) { + $out .= ' '; + $lineLength++; + // Don't accidentally create ++, -- or // tokens + } elseif( $last === $ch && ( $ch === '+' || $ch === '-' || $ch === '/' ) ) { + $out .= ' '; + $lineLength++; + } + + $out .= $token; + $lineLength += $end - $pos; // += strlen( $token ) + $last = $s[$end - 1]; + $pos = $end; + $newlineFound = false; + + // Output a newline after the token if required + // This is checked before AND after switching state + $newlineAdded = false; + if ( $statementsOnOwnLine && !$newlineAdded && isset( $newlineBefore[$state][$type] ) ) { + $out .= "\n"; + $lineLength = 0; + $newlineAdded = true; + } + + // Now that we have output our token, transition into the new state. + if( isset( $push[$state][$type] ) && count( $stack ) < self::STACK_LIMIT ) { + $stack[] = $push[$state][$type]; + } + if( $stack && isset( $pop[$state][$type] ) ) { + $state = array_pop( $stack ); + } elseif( isset( $goto[$state][$type] ) ) { + $state = $goto[$state][$type]; + } + + // Check for newline insertion again + if ( $statementsOnOwnLine && !$newlineAdded && isset( $newlineAfter[$state][$type] ) ) { + $out .= "\n"; + $lineLength = 0; + } + } + return $out; + } +} diff --git a/includes/libs/README b/includes/libs/README new file mode 100644 index 00000000..85e3db3c --- /dev/null +++ b/includes/libs/README @@ -0,0 +1,4 @@ +The classes in this directory ./includes/libs are considered standalone +from the remainder of the MediaWiki codebase. They do not call on any other +portions of MediaWiki code, and can be used in other projects without +dependency issues. 
/**
 * Spyc -- A Simple PHP YAML Class
 *
 * @file
 * @version 0.2.3 -- 2006-02-04
 * @author Chris Wanstrath <chris@ozmm.org>
 * @see http://spyc.sourceforge.net/
 * @copyright Copyright 2005-2006 Chris Wanstrath
 * @license http://www.opensource.org/licenses/mit-license.php MIT License
 */

/**
 * The Simple PHP YAML Class.
 *
 * This class can be used to convert a PHP array into a YAML document.
 * It currently supports a very limited subsection of the YAML spec.
 *
 * @ingroup API
 */
class Spyc {

	/**
	 * Dump YAML from PHP array statically
	 *
	 * Convenience wrapper that creates a throw-away Spyc instance and
	 * forwards all arguments to dump().
	 *
	 * Indent's default is 2 spaces, wordwrap's default is 40 characters. And
	 * you can turn off wordwrap by passing in 0.
	 *
	 * @param $array Array: PHP array
	 * @param $indent Integer: Pass in false to use the default, which is 2
	 * @param $wordwrap Integer: Pass in 0 for no wordwrap, false for default (40)
	 * @return String
	 */
	public static function YAMLDump( $array, $indent = false, $wordwrap = false ) {
		$spyc = new Spyc;
		return $spyc->dump( $array, $indent, $wordwrap );
	}

	/**
	 * Dump PHP array to YAML
	 *
	 * The dump method, when supplied with an array, will do its best
	 * to convert the array into friendly YAML. The document always starts
	 * with a "---" line, followed by one node per top-level array item.
	 *
	 * Indent's default is 2 spaces, wordwrap's default is 40 characters. And
	 * you can turn off wordwrap by passing in 0. Non-numeric values for
	 * either option fall back to the default.
	 *
	 * @param $array Array: PHP array
	 * @param $indent Integer: Pass in false to use the default, which is 2
	 * @param $wordwrap Integer: Pass in 0 for no wordwrap, false for default (40)
	 * @return String
	 */
	public function dump( $array, $indent = false, $wordwrap = false ) {
		// Normalise both options to integers. Without the cast a numeric
		// string such as '0' would slip past the strict "=== 0" check in
		// _doFolding() and fold the text at every single space.
		if ( $indent === false || !is_numeric( $indent ) ) {
			$this->_dumpIndent = 2;
		} else {
			$this->_dumpIndent = (int)$indent;
		}

		if ( $wordwrap === false || !is_numeric( $wordwrap ) ) {
			$this->_dumpWordWrap = 40;
		} else {
			$this->_dumpWordWrap = (int)$wordwrap;
		}

		// New YAML document
		$string = "---\n";

		// Start at the base of the array and move through it.
		foreach ( $array as $key => $value ) {
			$string .= $this->_yamlize( $key, $value, 0 );
		}
		return $string;
	}

	/**** Private Properties ****/

	/**
	 * Unused variables, but just commented rather than deleting
	 * to save altering the library
	private $_haveRefs;
	private $_allNodes;
	private $_lastIndent;
	private $_lastNode;
	private $_inBlock;
	private $_isInline;
	 **/

	/** Number of spaces added per nesting level; set by dump() */
	private $_dumpIndent;

	/** Column at which long scalars are folded, 0 disables folding; set by dump() */
	private $_dumpWordWrap;

	/**** Private Methods ****/

	/**
	 * Attempts to convert a key / value array item to YAML
	 *
	 * @param $key Mixed: the name of the key
	 * @param $value Mixed: the value of the item
	 * @param $indent Integer: the indent of the current node
	 * @return String
	 */
	private function _yamlize( $key, $value, $indent ) {
		if ( is_array( $value ) ) {
			// It has children: emit the bare key first, then every child
			// one indentation level deeper.
			$string = $this->_dumpNode( $key, null, $indent );
			$string .= $this->_yamlizeArray( $value, $indent + $this->_dumpIndent );
		} else {
			// It doesn't have children.
			$string = $this->_dumpNode( $key, $value, $indent );
		}
		return $string;
	}

	/**
	 * Attempts to convert an array to YAML
	 *
	 * @param $array Array: the array you want to convert
	 * @param $indent Integer: the indent of the current level
	 * @return String|Boolean: the YAML text, or false if $array is not an array
	 */
	private function _yamlizeArray( $array, $indent ) {
		if ( !is_array( $array ) ) {
			return false;
		}

		$string = '';
		foreach ( $array as $key => $value ) {
			$string .= $this->_yamlize( $key, $value, $indent );
		}
		return $string;
	}

	/**
	 * Find out whether a string needs to be output as a literal rather than in plain style.
	 * Added by Roan Kattouw 13-03-2008
	 *
	 * A literal block is required when the string is numeric (so that it is
	 * not read back as a number), contains a newline, contains # or :, or
	 * begins with any of: - ? , [ ] { } ! * & | > ' " % @ `
	 *
	 * Kept public because the method historically had no visibility keyword.
	 *
	 * @param $value String: the string to check
	 * @return Boolean
	 */
	public function _needLiteral( $value ) {
		if ( !is_string( $value ) ) {
			return false;
		}

		return is_numeric( $value )
			// strpos() returns 0 for a leading newline, so compare against
			// false explicitly instead of relying on truthiness.
			|| strpos( $value, "\n" ) !== false
			|| (bool)preg_match( "/[#:]/", $value )
			|| (bool)preg_match( "/^[-?,[\]{}!*&|>'\"%@`]/", $value );
	}

	/**
	 * Returns YAML from a key and a value
	 *
	 * Integer keys produce sequence items ("- value"), all other keys
	 * produce mapping items ("key: value"). Empty-string and null values
	 * produce the bare key / dash only.
	 *
	 * @param $key Mixed: the name of the key
	 * @param $value Mixed: the value of the item
	 * @param $indent Integer: the indent of the current node
	 * @return String
	 */
	private function _dumpNode( $key, $value, $indent ) {
		// do some folding here, for blocks
		if ( $this->_needLiteral( $value ) ) {
			$value = $this->_doLiteralBlock( $value, $indent );
		} else {
			$value = $this->_doFolding( $value, $indent );
		}

		$spaces = str_repeat( ' ', $indent );
		$hasValue = ( $value !== '' && !is_null( $value ) );

		if ( is_int( $key ) ) {
			// It's a sequence
			if ( $hasValue ) {
				$string = $spaces . '- ' . $value . "\n";
			} else {
				$string = $spaces . "-\n";
			}
		} else {
			// bug 21922 - Quote asterisk used as key. Strict comparison so
			// that only the literal string '*' is rewritten.
			if ( $key === '*' ) {
				$key = "'*'";
			}

			// It's mapped
			if ( $hasValue ) {
				$string = $spaces . $key . ': ' . $value . "\n";
			} else {
				$string = $spaces . $key . ":\n";
			}
		}
		return $string;
	}

	/**
	 * Creates a literal block for dumping
	 *
	 * Every line of the value is trimmed and re-indented one level deeper
	 * than the node it belongs to.
	 *
	 * @param $value String
	 * @param $indent Integer: the value of the indent
	 * @return String
	 */
	private function _doLiteralBlock( $value, $indent ) {
		$spaces = str_repeat( ' ', $indent + $this->_dumpIndent );
		$newValue = '|-';
		foreach ( explode( "\n", $value ) as $line ) {
			$newValue .= "\n" . $spaces . trim( $line );
		}
		return $newValue;
	}

	/**
	 * Folds a string of text, if necessary
	 *
	 * @param $value String: the string you wish to fold
	 * @param $indent Integer: the indent of the current node
	 * @return String
	 */
	private function _doFolding( $value, $indent ) {
		// Don't do anything if wordwrap is set to 0. Null (used for parent
		// nodes that only carry children) is passed through untouched so
		// that strlen() is never called with null.
		if ( $this->_dumpWordWrap === 0 || $value === null ) {
			return $value;
		}

		if ( strlen( $value ) > $this->_dumpWordWrap ) {
			$spaces = str_repeat( ' ', $indent + $this->_dumpIndent );
			$wrapped = wordwrap( $value, $this->_dumpWordWrap, "\n$spaces" );
			$value = ">-\n" . $spaces . $wrapped;
		}
		return $value;
	}
}