diff options
Diffstat (limited to 'includes/utils')
-rw-r--r-- | includes/utils/ArrayUtils.php | 187 | ||||
-rw-r--r-- | includes/utils/Cdb.php | 163 | ||||
-rw-r--r-- | includes/utils/CdbDBA.php | 75 | ||||
-rw-r--r-- | includes/utils/CdbPHP.php | 494 | ||||
-rw-r--r-- | includes/utils/IP.php | 738 | ||||
-rw-r--r-- | includes/utils/MWCryptHKDF.php | 332 | ||||
-rw-r--r-- | includes/utils/MWCryptRand.php | 516 | ||||
-rw-r--r-- | includes/utils/MWFunction.php | 63 | ||||
-rw-r--r-- | includes/utils/README | 9 | ||||
-rw-r--r-- | includes/utils/StringUtils.php | 612 | ||||
-rw-r--r-- | includes/utils/UIDGenerator.php | 507 | ||||
-rw-r--r-- | includes/utils/ZipDirectoryReader.php | 732 |
12 files changed, 4428 insertions, 0 deletions
/**
 * A collection of static methods to play with arrays.
 *
 * @since 1.21
 */
class ArrayUtils {
	/**
	 * Sort the given array in a pseudo-random order which depends only on the
	 * given key and each element value. This is typically used for load
	 * balancing between servers each with a local cache.
	 *
	 * Keys are preserved. The input array is modified in place.
	 *
	 * Each element value is used as an array key of the internal hash table,
	 * so elements must be strings or integers.
	 *
	 * Note: Benchmarking on PHP 5.3 and 5.4 indicates that for small
	 * strings, md5() is only 10% slower than hash('joaat',...) etc.,
	 * since the function call overhead dominates. So there's not much
	 * justification for breaking compatibility with installations
	 * compiled with ./configure --disable-hash.
	 *
	 * @param array $array Array to sort
	 * @param string $key
	 * @param string $separator A separator used to delimit the array elements and the
	 *     key. This can be chosen to provide backwards compatibility with
	 *     various consistent hash implementations that existed before this
	 *     function was introduced.
	 */
	public static function consistentHashSort( &$array, $key, $separator = "\000" ) {
		// Hash every element once up front so the comparator is a plain lookup
		$hashes = array();
		foreach ( $array as $elt ) {
			$hashes[$elt] = md5( $elt . $separator . $key );
		}
		uasort( $array, function ( $a, $b ) use ( $hashes ) {
			return strcmp( $hashes[$a], $hashes[$b] );
		} );
	}

	/**
	 * Given an array of non-normalised probabilities, this function will select
	 * an element and return the appropriate key
	 *
	 * @param array $weights Map of key => non-negative weight
	 * @return bool|int|string The selected key, or false if $weights is not a
	 *     non-empty array or all weights sum to zero
	 */
	public static function pickRandom( $weights ) {
		if ( !is_array( $weights ) || count( $weights ) == 0 ) {
			return false;
		}

		$sum = array_sum( $weights );
		if ( $sum == 0 ) {
			# No loads on any of them
			# In previous versions, this triggered an unweighted random selection,
			# but this feature has been removed as of April 2006 to allow for strict
			# separation of query groups.
			return false;
		}
		$max = mt_getrandmax();
		// Random point in the closed interval [0, $sum]
		$rand = mt_rand( 0, $max ) / $max * $sum;

		// Walk the cumulative weights until the random point is covered
		$sum = 0;
		foreach ( $weights as $i => $w ) {
			$sum += $w;
			# Do not return keys if they have 0 weight.
			# Note that the "all 0 weight" case is handed above
			if ( $w > 0 && $sum >= $rand ) {
				break;
			}
		}

		return $i;
	}

	/**
	 * Do a binary search, and return the index of the largest item that sorts
	 * less than or equal to the target value.
	 *
	 * @since 1.23
	 *
	 * @param callable $valueCallback A function to call to get the value with
	 *     a given array index.
	 * @param int $valueCount The number of items accessible via $valueCallback,
	 *     indexed from 0 to $valueCount - 1
	 * @param callable $comparisonCallback A callback to compare two values, returning
	 *     -1, 0 or 1 in the style of strcmp(). It is called as ( $target, $item ).
	 * @param mixed $target The target value to find.
	 *
	 * @return int|bool The item index of the lower bound, or false if the target value
	 *     sorts before all items.
	 */
	public static function findLowerBound( $valueCallback, $valueCount,
		$comparisonCallback, $target
	) {
		if ( $valueCount === 0 ) {
			return false;
		}

		$min = 0;
		$max = $valueCount;
		do {
			$mid = $min + ( ( $max - $min ) >> 1 );
			$item = call_user_func( $valueCallback, $mid );
			$comparison = call_user_func( $comparisonCallback, $target, $item );
			if ( $comparison > 0 ) {
				$min = $mid;
			} elseif ( $comparison == 0 ) {
				$min = $mid;
				break;
			} else {
				$max = $mid;
			}
		} while ( $min < $max - 1 );

		// The loop never compares against index 0 when it narrows down to it,
		// so check explicitly whether the target sorts before the first item.
		if ( $min == 0 ) {
			$item = call_user_func( $valueCallback, $min );
			$comparison = call_user_func( $comparisonCallback, $target, $item );
			if ( $comparison < 0 ) {
				// Before the first item
				return false;
			}
		}
		return $min;
	}

	/**
	 * Do array_diff_assoc() on multi-dimensional arrays.
	 *
	 * Scalar values are compared with ===. A key whose value is null in both
	 * arrays counts as equal. Sub-arrays are only compared against
	 * sub-arrays; a scalar found under the same key in a comparison array
	 * never cancels a sub-array.
	 *
	 * Note: empty arrays are removed.
	 *
	 * @since 1.23
	 *
	 * @param array $array1 The array to compare from
	 * @param array $array2,... More arrays to compare against
	 * @return array An array containing all the values from array1
	 *     that are not present in any of the other arrays.
	 */
	public static function arrayDiffAssocRecursive( $array1 ) {
		$arrays = func_get_args();
		array_shift( $arrays );
		$ret = array();

		foreach ( $array1 as $key => $value ) {
			if ( is_array( $value ) ) {
				// Recurse, comparing only against sub-arrays found under the same key
				$args = array( $value );
				foreach ( $arrays as $array ) {
					if ( isset( $array[$key] ) && is_array( $array[$key] ) ) {
						$args[] = $array[$key];
					}
				}
				$valueret = call_user_func_array( __METHOD__, $args );
				if ( count( $valueret ) ) {
					$ret[$key] = $valueret;
				}
			} else {
				foreach ( $arrays as $array ) {
					// array_key_exists() rather than isset(), so that a null
					// present on both sides is recognised as equal
					if ( array_key_exists( $key, $array ) && $array[$key] === $value ) {
						continue 2;
					}
				}
				$ret[$key] = $value;
			}
		}

		return $ret;
	}
}
/**
 * Read from a CDB file.
 * Native and pure PHP implementations are provided.
 * http://cr.yp.to/cdb.html
 */
abstract class CdbReader {
	/**
	 * The file handle
	 */
	protected $handle;

	/**
	 * Open a file and return a subclass instance
	 *
	 * @param string $fileName
	 *
	 * @return CdbReader A CdbReaderDBA if the native extension is usable,
	 *     otherwise a CdbReaderPHP
	 */
	public static function open( $fileName ) {
		return self::haveExtension() ?
			new CdbReaderDBA( $fileName ) :
			new CdbReaderPHP( $fileName );
	}

	/**
	 * Returns true if the native extension is available
	 *
	 * Both the 'cdb' (read) and 'cdb_make' (write) handlers must be reported
	 * by dba_handlers() for this to return true.
	 *
	 * @return bool
	 */
	public static function haveExtension() {
		if ( !function_exists( 'dba_handlers' ) ) {
			return false;
		}
		$handlers = dba_handlers();
		if ( !in_array( 'cdb', $handlers ) || !in_array( 'cdb_make', $handlers ) ) {
			return false;
		}

		return true;
	}

	/**
	 * Create the object and open the file
	 *
	 * @param string $fileName
	 */
	abstract public function __construct( $fileName );

	/**
	 * Close the file. Optional, you can just let the variable go out of scope.
	 */
	abstract public function close();

	/**
	 * Get a value with a given key. Only string values are supported.
	 *
	 * @param string $key
	 */
	abstract public function get( $key );
}

/**
 * Write to a CDB file.
 * Native and pure PHP implementations are provided.
 */
abstract class CdbWriter {
	/**
	 * The file handle
	 */
	protected $handle;

	/**
	 * File we'll be writing to when we're done
	 * @var string
	 */
	protected $realFileName;

	/**
	 * File we write to temporarily until we're done
	 * @var string
	 */
	protected $tmpFileName;

	/**
	 * Open a writer and return a subclass instance.
	 * The user must have write access to the directory, for temporary file creation.
	 *
	 * @param string $fileName
	 *
	 * @return CdbWriterDBA|CdbWriterPHP
	 */
	public static function open( $fileName ) {
		return CdbReader::haveExtension() ?
			new CdbWriterDBA( $fileName ) :
			new CdbWriterPHP( $fileName );
	}

	/**
	 * Create the object and open the file
	 *
	 * @param string $fileName
	 */
	abstract public function __construct( $fileName );

	/**
	 * Set a key to a given value. The value will be converted to string.
	 * @param string $key
	 * @param string $value
	 */
	abstract public function set( $key, $value );

	/**
	 * Close the writer object. You should call this function before the object
	 * goes out of scope, to write out the final hashtables.
	 */
	abstract public function close();

	/**
	 * If the object goes out of scope, close it for sanity
	 *
	 * The handle is tested with empty() rather than isset(): a failed open
	 * leaves the handle set to false, which isset() reports as present, and
	 * closing such a writer would operate on a file that was never opened.
	 */
	public function __destruct() {
		if ( !empty( $this->handle ) ) {
			$this->close();
		}
	}

	/**
	 * Are we running on Windows?
	 * @return bool
	 */
	protected function isWindows() {
		return substr( php_uname(), 0, 7 ) == 'Windows';
	}
}

/**
 * Exception for Cdb errors.
 * This explicitly doesn't subclass MWException to encourage reuse.
 */
class CdbException extends Exception {
}
/**
 * Reader class which uses the DBA extension
 */
class CdbReaderDBA extends CdbReader {
	/**
	 * Open the given CDB file read-only through the DBA 'cdb' handler.
	 *
	 * @param string $fileName
	 * @throws CdbException If dba_open() does not return a handle
	 */
	public function __construct( $fileName ) {
		$this->handle = dba_open( $fileName, 'r-', 'cdb' );
		if ( !$this->handle ) {
			throw new CdbException( 'Unable to open CDB file "' . $fileName . '"' );
		}
	}

	/**
	 * Close the DBA handle if one is set, then drop the property.
	 */
	public function close() {
		if ( isset( $this->handle ) ) {
			dba_close( $this->handle );
		}
		unset( $this->handle );
	}

	/**
	 * @param string $key
	 * @return mixed Whatever dba_fetch() returns for the key
	 */
	public function get( $key ) {
		return dba_fetch( $key, $this->handle );
	}
}

/**
 * Writer class which uses the DBA extension
 */
class CdbWriterDBA extends CdbWriter {
	/**
	 * Start writing to a randomly named temporary file next to the target;
	 * close() moves it over the target.
	 *
	 * @param string $fileName Final file name
	 * @throws CdbException If dba_open() does not return a handle
	 */
	public function __construct( $fileName ) {
		$this->realFileName = $fileName;
		$this->tmpFileName = $fileName . '.tmp.' . mt_rand( 0, 0x7fffffff );
		$this->handle = dba_open( $this->tmpFileName, 'n', 'cdb_make' );
		if ( !$this->handle ) {
			throw new CdbException( 'Unable to open CDB file for write "' . $fileName . '"' );
		}
	}

	/**
	 * @param string $key
	 * @param string $value
	 * @return mixed Whatever dba_insert() returns
	 */
	public function set( $key, $value ) {
		return dba_insert( $key, $value, $this->handle );
	}

	/**
	 * Close the DBA handle and move the temporary file into place.
	 *
	 * @throws CdbException If the temporary file cannot be renamed
	 */
	public function close() {
		if ( isset( $this->handle ) ) {
			dba_close( $this->handle );
		}
		// The existing target is removed first on Windows. Only do so when it
		// actually exists, matching CdbWriterPHP::close(), so creating a file
		// for the first time does not raise an unlink() warning.
		if ( $this->isWindows() && file_exists( $this->realFileName ) ) {
			unlink( $this->realFileName );
		}
		if ( !rename( $this->tmpFileName, $this->realFileName ) ) {
			throw new CdbException( 'Unable to move the new CDB file into place.' );
		}
		unset( $this->handle );
	}
}
/**
 * Common functions for readers and writers
 */
class CdbFunctions {
	/**
	 * Take a modulo of a signed integer as if it were an unsigned integer.
	 * $b must be less than 0x40000000 and greater than 0
	 *
	 * @param int $a
	 * @param int $b
	 *
	 * @return int
	 */
	public static function unsignedMod( $a, $b ) {
		if ( $a & 0x80000000 ) {
			// Bit 31 is set: reduce the low 31 bits and add back 2^31 mod $b,
			// expressed as 2 * ( 2^30 mod $b )
			$m = ( $a & 0x7fffffff ) % $b + 2 * ( 0x40000000 % $b );

			return $m % $b;
		} else {
			return $a % $b;
		}
	}

	/**
	 * Shift a signed integer right as if it were unsigned
	 * @param int $a
	 * @param int $b
	 * @return int
	 */
	public static function unsignedShiftRight( $a, $b ) {
		if ( $b == 0 ) {
			return $a;
		}
		if ( $a & 0x80000000 ) {
			// Shift the low 31 bits, then put bit 31 back at its shifted position
			return ( ( $a & 0x7fffffff ) >> $b ) | ( 0x40000000 >> ( $b - 1 ) );
		} else {
			return $a >> $b;
		}
	}

	/**
	 * The CDB hash function.
	 *
	 * @param string $s
	 *
	 * @return int
	 */
	public static function hash( $s ) {
		$h = 5381;
		$len = strlen( $s );
		for ( $i = 0; $i < $len; $i++ ) {
			$h5 = ( $h << 5 ) & 0xffffffff;
			// Do a 32-bit sum
			// Inlined here for speed
			$sum = ( $h & 0x3fffffff ) + ( $h5 & 0x3fffffff );
			$h =
				(
					( $sum & 0x40000000 ? 1 : 0 )
					+ ( $h & 0x80000000 ? 2 : 0 )
					+ ( $h & 0x40000000 ? 1 : 0 )
					+ ( $h5 & 0x80000000 ? 2 : 0 )
					+ ( $h5 & 0x40000000 ? 1 : 0 )
				) << 30
				| ( $sum & 0x3fffffff );
			$h ^= ord( $s[$i] );
			$h &= 0xffffffff;
		}

		return $h;
	}
}

/**
 * CDB reader class
 */
class CdbReaderPHP extends CdbReader {
	/** The filename */
	protected $fileName;

	/* number of hash slots searched under this key */
	protected $loop;

	/* initialized if loop is nonzero */
	protected $khash;

	/* initialized if loop is nonzero */
	protected $kpos;

	/* initialized if loop is nonzero */
	protected $hpos;

	/* initialized if loop is nonzero */
	protected $hslots;

	/* initialized if findNext() returns true */
	protected $dpos;

	/* initialized if findNext() returns true */
	protected $dlen;

	/**
	 * @param string $fileName
	 * @throws CdbException
	 */
	public function __construct( $fileName ) {
		$this->fileName = $fileName;
		$this->handle = fopen( $fileName, 'rb' );
		if ( !$this->handle ) {
			throw new CdbException( 'Unable to open CDB file "' . $this->fileName . '".' );
		}
		$this->findStart();
	}

	/**
	 * Close the file handle if one is set, then drop the property.
	 */
	public function close() {
		if ( isset( $this->handle ) ) {
			fclose( $this->handle );
		}
		unset( $this->handle );
	}

	/**
	 * @param mixed $key
	 * @return bool|string The stored value, or false if the key is not found
	 */
	public function get( $key ) {
		// strval is required
		if ( $this->find( strval( $key ) ) ) {
			return $this->read( $this->dlen, $this->dpos );
		} else {
			return false;
		}
	}

	/**
	 * Check whether the bytes stored at $pos are exactly $key.
	 *
	 * @param string $key
	 * @param int $pos
	 * @return bool
	 */
	protected function match( $key, $pos ) {
		$buf = $this->read( strlen( $key ), $pos );

		return $buf === $key;
	}

	/**
	 * Reset the search state so the next findNext() starts a fresh lookup.
	 */
	protected function findStart() {
		$this->loop = 0;
	}

	/**
	 * Read exactly $length bytes starting at file offset $pos.
	 *
	 * @throws CdbException If the seek fails or fewer bytes are available
	 * @param int $length
	 * @param int $pos
	 * @return string
	 */
	protected function read( $length, $pos ) {
		if ( fseek( $this->handle, $pos ) == -1 ) {
			// This can easily happen if the internal pointers are incorrect
			throw new CdbException(
				'Seek failed, file "' . $this->fileName . '" may be corrupted.' );
		}

		if ( $length == 0 ) {
			return '';
		}

		$buf = fread( $this->handle, $length );
		if ( $buf === false || strlen( $buf ) !== $length ) {
			throw new CdbException(
				'Read from CDB file failed, file "' . $this->fileName . '" may be corrupted.' );
		}

		return $buf;
	}

	/**
	 * Unpack an unsigned integer and throw an exception if it needs more than 31 bits
	 * @param string $s
	 * @throws CdbException
	 * @return mixed
	 */
	protected function unpack31( $s ) {
		$data = unpack( 'V', $s );
		if ( $data[1] > 0x7fffffff ) {
			throw new CdbException(
				'Error in CDB file "' . $this->fileName . '", integer too big.' );
		}

		return $data[1];
	}

	/**
	 * Unpack a 32-bit signed integer
	 * @param string $s
	 * @return int
	 */
	protected function unpackSigned( $s ) {
		// Read as two little-endian 16-bit halves and recombine
		$data = unpack( 'va/vb', $s );

		return $data['a'] | ( $data['b'] << 16 );
	}

	/**
	 * Continue the current lookup, or start one if findStart() was just called.
	 * On success $this->dpos and $this->dlen describe the value.
	 *
	 * @param string $key
	 * @return bool
	 */
	protected function findNext( $key ) {
		if ( !$this->loop ) {
			// First call for this key: locate its hash table via the
			// 256-entry pointer array at the start of the file
			$u = CdbFunctions::hash( $key );
			$buf = $this->read( 8, ( $u << 3 ) & 2047 );
			$this->hslots = $this->unpack31( substr( $buf, 4 ) );
			if ( !$this->hslots ) {
				return false;
			}
			$this->hpos = $this->unpack31( substr( $buf, 0, 4 ) );
			$this->khash = $u;
			$u = CdbFunctions::unsignedShiftRight( $u, 8 );
			$u = CdbFunctions::unsignedMod( $u, $this->hslots );
			$u <<= 3;
			$this->kpos = $this->hpos + $u;
		}

		while ( $this->loop < $this->hslots ) {
			$buf = $this->read( 8, $this->kpos );
			$pos = $this->unpack31( substr( $buf, 4 ) );
			if ( !$pos ) {
				// Empty slot: the key is not in the table
				return false;
			}
			$this->loop += 1;
			$this->kpos += 8;
			// Wrap around at the end of the table
			if ( $this->kpos == $this->hpos + ( $this->hslots << 3 ) ) {
				$this->kpos = $this->hpos;
			}
			$u = $this->unpackSigned( substr( $buf, 0, 4 ) );
			if ( $u === $this->khash ) {
				$buf = $this->read( 8, $pos );
				$keyLen = $this->unpack31( substr( $buf, 0, 4 ) );
				if ( $keyLen == strlen( $key ) && $this->match( $key, $pos + 8 ) ) {
					// Found
					$this->dlen = $this->unpack31( substr( $buf, 4 ) );
					$this->dpos = $pos + 8 + $keyLen;

					return true;
				}
			}
		}

		return false;
	}

	/**
	 * @param mixed $key
	 * @return bool
	 */
	protected function find( $key ) {
		$this->findStart();

		return $this->findNext( $key );
	}
}

/**
 * CDB writer class
 */
class CdbWriterPHP extends CdbWriter {
	/** List of array( 'h' => hash, 'p' => record offset ), one per record written */
	protected $hplist;

	/** Number of records written so far */
	protected $numentries;

	/** Current write offset in the temporary file */
	protected $pos;

	/**
	 * @param string $fileName
	 * @throws CdbException
	 */
	public function __construct( $fileName ) {
		$this->realFileName = $fileName;
		$this->tmpFileName = $fileName . '.tmp.' . mt_rand( 0, 0x7fffffff );
		$this->handle = fopen( $this->tmpFileName, 'wb' );
		if ( !$this->handle ) {
			$this->throwException(
				'Unable to open CDB file "' . $this->tmpFileName . '" for write.' );
		}
		$this->hplist = array();
		$this->numentries = 0;
		$this->pos = 2048; // leaving space for the pointer array, 256 * 8
		if ( fseek( $this->handle, $this->pos ) == -1 ) {
			$this->throwException( 'fseek failed in file "' . $this->tmpFileName . '".' );
		}
	}

	/**
	 * Append a record. Keys that convert to the empty string are silently skipped.
	 *
	 * @param string $key
	 * @param string $value
	 * @throws CdbException
	 */
	public function set( $key, $value ) {
		if ( strval( $key ) === '' ) {
			// DBA cross-check hack
			return;
		}
		$this->addbegin( strlen( $key ), strlen( $value ) );
		$this->write( $key );
		$this->write( $value );
		$this->addend( strlen( $key ), strlen( $value ), CdbFunctions::hash( $key ) );
	}

	/**
	 * Write the hash tables, close the temporary file and move it into place.
	 *
	 * @throws CdbException
	 */
	public function close() {
		$this->finish();
		if ( isset( $this->handle ) ) {
			fclose( $this->handle );
		}
		if ( $this->isWindows() && file_exists( $this->realFileName ) ) {
			unlink( $this->realFileName );
		}
		if ( !rename( $this->tmpFileName, $this->realFileName ) ) {
			$this->throwException( 'Unable to move the new CDB file into place.' );
		}
		unset( $this->handle );
	}

	/**
	 * @throws CdbException
	 * @param string $buf
	 */
	protected function write( $buf ) {
		$len = fwrite( $this->handle, $buf );
		if ( $len !== strlen( $buf ) ) {
			$this->throwException( 'Error writing to CDB file "' . $this->tmpFileName . '".' );
		}
	}

	/**
	 * Advance the tracked write offset, refusing offsets that need more than 31 bits.
	 *
	 * @throws CdbException
	 * @param int $len
	 */
	protected function posplus( $len ) {
		$newpos = $this->pos + $len;
		if ( $newpos > 0x7fffffff ) {
			$this->throwException(
				'A value in the CDB file "' . $this->tmpFileName . '" is too large.' );
		}
		$this->pos = $newpos;
	}

	/**
	 * Register a record that has just been written and advance the offset past it.
	 *
	 * @param int $keylen
	 * @param int $datalen
	 * @param int $h Hash of the key
	 * @throws CdbException
	 */
	protected function addend( $keylen, $datalen, $h ) {
		$this->hplist[] = array(
			'h' => $h,
			'p' => $this->pos
		);

		$this->numentries++;
		$this->posplus( 8 );
		$this->posplus( $keylen );
		$this->posplus( $datalen );
	}

	/**
	 * Write the 8-byte record header (key length, data length).
	 *
	 * @throws CdbException
	 * @param int $keylen
	 * @param int $datalen
	 */
	protected function addbegin( $keylen, $datalen ) {
		if ( $keylen > 0x7fffffff ) {
			$this->throwException( 'Key length too long in file "' . $this->tmpFileName . '".' );
		}
		if ( $datalen > 0x7fffffff ) {
			$this->throwException( 'Data length too long in file "' . $this->tmpFileName . '".' );
		}
		$buf = pack( 'VV', $keylen, $datalen );
		$this->write( $buf );
	}

	/**
	 * Write the 256 hash tables after the records, then the pointer array
	 * at the start of the file.
	 *
	 * @throws CdbException
	 */
	protected function finish() {
		// Hack for DBA cross-check
		$this->hplist = array_reverse( $this->hplist );

		// Calculate the number of items that will be in each hashtable
		$counts = array_fill( 0, 256, 0 );
		foreach ( $this->hplist as $item ) {
			++$counts[255 & $item['h']];
		}

		// Fill in $starts with the *end* indexes
		$starts = array();
		$pos = 0;
		for ( $i = 0; $i < 256; ++$i ) {
			$pos += $counts[$i];
			$starts[$i] = $pos;
		}

		// Excessively clever and indulgent code to simultaneously fill $packedTables
		// with the packed hashtables, and adjust the elements of $starts
		// to actually point to the starts instead of the ends.
		$packedTables = array_fill( 0, $this->numentries, false );
		foreach ( $this->hplist as $item ) {
			$packedTables[--$starts[255 & $item['h']]] = $item;
		}

		$final = '';
		for ( $i = 0; $i < 256; ++$i ) {
			$count = $counts[$i];

			// The size of the hashtable will be double the item count.
			// The rest of the slots will be empty.
			$len = $count + $count;
			$final .= pack( 'VV', $this->pos, $len );

			$hashtable = array();
			for ( $u = 0; $u < $len; ++$u ) {
				$hashtable[$u] = array( 'h' => 0, 'p' => 0 );
			}

			// Fill the hashtable, using the next empty slot if the hashed slot
			// is taken.
			for ( $u = 0; $u < $count; ++$u ) {
				$hp = $packedTables[$starts[$i] + $u];
				$where = CdbFunctions::unsignedMod(
					CdbFunctions::unsignedShiftRight( $hp['h'], 8 ), $len );
				while ( $hashtable[$where]['p'] ) {
					if ( ++$where == $len ) {
						$where = 0;
					}
				}
				$hashtable[$where] = $hp;
			}

			// Write the hashtable
			for ( $u = 0; $u < $len; ++$u ) {
				$buf = pack( 'vvV',
					$hashtable[$u]['h'] & 0xffff,
					CdbFunctions::unsignedShiftRight( $hashtable[$u]['h'], 16 ),
					$hashtable[$u]['p'] );
				$this->write( $buf );
				$this->posplus( 8 );
			}
		}

		// Write the pointer array at the start of the file
		rewind( $this->handle );
		if ( ftell( $this->handle ) != 0 ) {
			$this->throwException( 'Error rewinding to start of file "' . $this->tmpFileName . '".' );
		}
		$this->write( $final );
	}

	/**
	 * Clean up the temp file and throw an exception
	 *
	 * The handle is cleared before throwing so that the destructor does not
	 * call close() again on a writer that has already failed.
	 *
	 * @param string $msg
	 * @throws CdbException
	 */
	protected function throwException( $msg ) {
		if ( !empty( $this->handle ) ) {
			// close() may already have closed the stream before rename()
			// failed; is_resource() is false for a closed stream, which
			// avoids a second fclose() on it.
			if ( is_resource( $this->handle ) ) {
				fclose( $this->handle );
			}
			if ( file_exists( $this->tmpFileName ) ) {
				unlink( $this->tmpFileName );
			}
		}
		$this->handle = null;
		throw new CdbException( $msg );
	}
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @author Antoine Musso "<hashar at free dot fr>", Aaron Schulz + */ + +// Some regex definition to "play" with IP address and IP address blocks + +// An IPv4 address is made of 4 bytes from x00 to xFF which is d0 to d255 +define( 'RE_IP_BYTE', '(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|0?[0-9]?[0-9])' ); +define( 'RE_IP_ADD', RE_IP_BYTE . '\.' . RE_IP_BYTE . '\.' . RE_IP_BYTE . '\.' . RE_IP_BYTE ); +// An IPv4 block is an IP address and a prefix (d1 to d32) +define( 'RE_IP_PREFIX', '(3[0-2]|[12]?\d)' ); +define( 'RE_IP_BLOCK', RE_IP_ADD . '\/' . RE_IP_PREFIX ); + +// An IPv6 address is made up of 8 words (each x0000 to xFFFF). +// However, the "::" abbreviation can be used on consecutive x0000 words. +define( 'RE_IPV6_WORD', '([0-9A-Fa-f]{1,4})' ); +define( 'RE_IPV6_PREFIX', '(12[0-8]|1[01][0-9]|[1-9]?\d)' ); +define( 'RE_IPV6_ADD', + '(?:' . // starts with "::" (including "::") + ':(?::|(?::' . RE_IPV6_WORD . '){1,7})' . + '|' . // ends with "::" (except "::") + RE_IPV6_WORD . '(?::' . RE_IPV6_WORD . '){0,6}::' . + '|' . // contains one "::" in the middle (the ^ makes the test fail if none found) + RE_IPV6_WORD . '(?::((?(-1)|:))?' . RE_IPV6_WORD . '){1,6}(?(-2)|^)' . + '|' . // contains no "::" + RE_IPV6_WORD . '(?::' . RE_IPV6_WORD . '){7}' . + ')' +); +// An IPv6 block is an IP address and a prefix (d1 to d128) +define( 'RE_IPV6_BLOCK', RE_IPV6_ADD . '\/' . 
RE_IPV6_PREFIX ); +// For IPv6 canonicalization (NOT for strict validation; these are quite lax!) +define( 'RE_IPV6_GAP', ':(?:0+:)*(?::(?:0+:)*)?' ); +define( 'RE_IPV6_V4_PREFIX', '0*' . RE_IPV6_GAP . '(?:ffff:)?' ); + +// This might be useful for regexps used elsewhere, matches any IPv6 or IPv6 address or network +define( 'IP_ADDRESS_STRING', + '(?:' . + RE_IP_ADD . '(?:\/' . RE_IP_PREFIX . ')?' . // IPv4 + '|' . + RE_IPV6_ADD . '(?:\/' . RE_IPV6_PREFIX . ')?' . // IPv6 + ')' +); + +/** + * A collection of public static functions to play with IP address + * and IP blocks. + */ +class IP { + /** @var IPSet */ + private static $proxyIpSet = null; + + /** + * Determine if a string is as valid IP address or network (CIDR prefix). + * SIIT IPv4-translated addresses are rejected. + * Note: canonicalize() tries to convert translated addresses to IPv4. + * + * @param string $ip Possible IP address + * @return bool + */ + public static function isIPAddress( $ip ) { + return (bool)preg_match( '/^' . IP_ADDRESS_STRING . '$/', $ip ); + } + + /** + * Given a string, determine if it as valid IP in IPv6 only. + * Note: Unlike isValid(), this looks for networks too. + * + * @param string $ip Possible IP address + * @return bool + */ + public static function isIPv6( $ip ) { + return (bool)preg_match( '/^' . RE_IPV6_ADD . '(?:\/' . RE_IPV6_PREFIX . ')?$/', $ip ); + } + + /** + * Given a string, determine if it as valid IP in IPv4 only. + * Note: Unlike isValid(), this looks for networks too. + * + * @param string $ip Possible IP address + * @return bool + */ + public static function isIPv4( $ip ) { + return (bool)preg_match( '/^' . RE_IP_ADD . '(?:\/' . RE_IP_PREFIX . ')?$/', $ip ); + } + + /** + * Validate an IP address. Ranges are NOT considered valid. + * SIIT IPv4-translated addresses are rejected. + * Note: canonicalize() tries to convert translated addresses to IPv4. 
+ * + * @param string $ip + * @return bool True if it is valid + */ + public static function isValid( $ip ) { + return ( preg_match( '/^' . RE_IP_ADD . '$/', $ip ) + || preg_match( '/^' . RE_IPV6_ADD . '$/', $ip ) ); + } + + /** + * Validate an IP Block (valid address WITH a valid prefix). + * SIIT IPv4-translated addresses are rejected. + * Note: canonicalize() tries to convert translated addresses to IPv4. + * + * @param string $ipblock + * @return bool True if it is valid + */ + public static function isValidBlock( $ipblock ) { + return ( preg_match( '/^' . RE_IPV6_BLOCK . '$/', $ipblock ) + || preg_match( '/^' . RE_IP_BLOCK . '$/', $ipblock ) ); + } + + /** + * Convert an IP into a verbose, uppercase, normalized form. + * IPv6 addresses in octet notation are expanded to 8 words. + * IPv4 addresses are just trimmed. + * + * @param string $ip IP address in quad or octet form (CIDR or not). + * @return string + */ + public static function sanitizeIP( $ip ) { + $ip = trim( $ip ); + if ( $ip === '' ) { + return null; + } + if ( self::isIPv4( $ip ) || !self::isIPv6( $ip ) ) { + return $ip; // nothing else to do for IPv4 addresses or invalid ones + } + // Remove any whitespaces, convert to upper case + $ip = strtoupper( $ip ); + // Expand zero abbreviations + $abbrevPos = strpos( $ip, '::' ); + if ( $abbrevPos !== false ) { + // We know this is valid IPv6. Find the last index of the + // address before any CIDR number (e.g. "a:b:c::/24"). + $CIDRStart = strpos( $ip, "/" ); + $addressEnd = ( $CIDRStart !== false ) + ? $CIDRStart - 1 + : strlen( $ip ) - 1; + // If the '::' is at the beginning... + if ( $abbrevPos == 0 ) { + $repeat = '0:'; + $extra = ( $ip == '::' ) ? '0' : ''; // for the address '::' + $pad = 9; // 7+2 (due to '::') + // If the '::' is at the end... + } elseif ( $abbrevPos == ( $addressEnd - 1 ) ) { + $repeat = ':0'; + $extra = ''; + $pad = 9; // 7+2 (due to '::') + // If the '::' is in the middle... 
+ } else { + $repeat = ':0'; + $extra = ':'; + $pad = 8; // 6+2 (due to '::') + } + $ip = str_replace( '::', + str_repeat( $repeat, $pad - substr_count( $ip, ':' ) ) . $extra, + $ip + ); + } + // Remove leading zeros from each bloc as needed + $ip = preg_replace( '/(^|:)0+(' . RE_IPV6_WORD . ')/', '$1$2', $ip ); + + return $ip; + } + + /** + * Prettify an IP for display to end users. + * This will make it more compact and lower-case. + * + * @param string $ip + * @return string + */ + public static function prettifyIP( $ip ) { + $ip = self::sanitizeIP( $ip ); // normalize (removes '::') + if ( self::isIPv6( $ip ) ) { + // Split IP into an address and a CIDR + if ( strpos( $ip, '/' ) !== false ) { + list( $ip, $cidr ) = explode( '/', $ip, 2 ); + } else { + list( $ip, $cidr ) = array( $ip, '' ); + } + // Get the largest slice of words with multiple zeros + $offset = 0; + $longest = $longestPos = false; + while ( preg_match( + '!(?:^|:)0(?::0)+(?:$|:)!', $ip, $m, PREG_OFFSET_CAPTURE, $offset + ) ) { + list( $match, $pos ) = $m[0]; // full match + if ( strlen( $match ) > strlen( $longest ) ) { + $longest = $match; + $longestPos = $pos; + } + $offset = ( $pos + strlen( $match ) ); // advance + } + if ( $longest !== false ) { + // Replace this portion of the string with the '::' abbreviation + $ip = substr_replace( $ip, '::', $longestPos, strlen( $longest ) ); + } + // Add any CIDR back on + if ( $cidr !== '' ) { + $ip = "{$ip}/{$cidr}"; + } + // Convert to lower case to make it more readable + $ip = strtolower( $ip ); + } + + return $ip; + } + + /** + * Given a host/port string, like one might find in the host part of a URL + * per RFC 2732, split the hostname part and the port part and return an + * array with an element for each. If there is no port part, the array will + * have false in place of the port. If the string was invalid in some way, + * false is returned. 
+ * + * This was easy with IPv4 and was generally done in an ad-hoc way, but + * with IPv6 it's somewhat more complicated due to the need to parse the + * square brackets and colons. + * + * A bare IPv6 address is accepted despite the lack of square brackets. + * + * @param string $both The string with the host and port + * @return array + */ + public static function splitHostAndPort( $both ) { + if ( substr( $both, 0, 1 ) === '[' ) { + if ( preg_match( '/^\[(' . RE_IPV6_ADD . ')\](?::(?P<port>\d+))?$/', $both, $m ) ) { + if ( isset( $m['port'] ) ) { + return array( $m[1], intval( $m['port'] ) ); + } else { + return array( $m[1], false ); + } + } else { + // Square bracket found but no IPv6 + return false; + } + } + $numColons = substr_count( $both, ':' ); + if ( $numColons >= 2 ) { + // Is it a bare IPv6 address? + if ( preg_match( '/^' . RE_IPV6_ADD . '$/', $both ) ) { + return array( $both, false ); + } else { + // Not valid IPv6, but too many colons for anything else + return false; + } + } + if ( $numColons >= 1 ) { + // Host:port? + $bits = explode( ':', $both ); + if ( preg_match( '/^\d+/', $bits[1] ) ) { + return array( $bits[0], intval( $bits[1] ) ); + } else { + // Not a valid port + return false; + } + } + + // Plain hostname + return array( $both, false ); + } + + /** + * Given a host name and a port, combine them into host/port string like + * you might find in a URL. If the host contains a colon, wrap it in square + * brackets like in RFC 2732. 
If the port matches the default port, omit + * the port specification + * + * @param string $host + * @param int $port + * @param bool|int $defaultPort + * @return string + */ + public static function combineHostAndPort( $host, $port, $defaultPort = false ) { + if ( strpos( $host, ':' ) !== false ) { + $host = "[$host]"; + } + if ( $defaultPort !== false && $port == $defaultPort ) { + return $host; + } else { + return "$host:$port"; + } + } + + /** + * Convert an IPv4 or IPv6 hexadecimal representation back to readable format + * + * @param string $hex Number, with "v6-" prefix if it is IPv6 + * @return string Quad-dotted (IPv4) or octet notation (IPv6) + */ + public static function formatHex( $hex ) { + if ( substr( $hex, 0, 3 ) == 'v6-' ) { // IPv6 + return self::hexToOctet( substr( $hex, 3 ) ); + } else { // IPv4 + return self::hexToQuad( $hex ); + } + } + + /** + * Converts a hexadecimal number to an IPv6 address in octet notation + * + * @param string $ip_hex Pure hex (no v6- prefix) + * @return string (of format a:b:c:d:e:f:g:h) + */ + public static function hexToOctet( $ip_hex ) { + // Pad hex to 32 chars (128 bits) + $ip_hex = str_pad( strtoupper( $ip_hex ), 32, '0', STR_PAD_LEFT ); + // Separate into 8 words + $ip_oct = substr( $ip_hex, 0, 4 ); + for ( $n = 1; $n < 8; $n++ ) { + $ip_oct .= ':' . substr( $ip_hex, 4 * $n, 4 ); + } + // NO leading zeroes + $ip_oct = preg_replace( '/(^|:)0+(' . RE_IPV6_WORD . 
')/', '$1$2', $ip_oct ); + + return $ip_oct; + } + + /** + * Converts a hexadecimal number to an IPv4 address in quad-dotted notation + * + * @param string $ip_hex Pure hex + * @return string (of format a.b.c.d) + */ + public static function hexToQuad( $ip_hex ) { + // Pad hex to 8 chars (32 bits) + $ip_hex = str_pad( strtoupper( $ip_hex ), 8, '0', STR_PAD_LEFT ); + // Separate into four quads + $s = ''; + for ( $i = 0; $i < 4; $i++ ) { + if ( $s !== '' ) { + $s .= '.'; + } + $s .= base_convert( substr( $ip_hex, $i * 2, 2 ), 16, 10 ); + } + + return $s; + } + + /** + * Determine if an IP address really is an IP address, and if it is public, + * i.e. not RFC 1918 or similar + * + * @param string $ip + * @return bool + */ + public static function isPublic( $ip ) { + static $privateSet = null; + if ( !$privateSet ) { + $privateSet = new IPSet( array( + '10.0.0.0/8', # RFC 1918 (private) + '172.16.0.0/12', # RFC 1918 (private) + '192.168.0.0/16', # RFC 1918 (private) + '0.0.0.0/8', # this network + '127.0.0.0/8', # loopback + 'fc00::/7', # RFC 4193 (local) + '0:0:0:0:0:0:0:1', # loopback + ) ); + } + return !$privateSet->match( $ip ); + } + + /** + * Return a zero-padded upper case hexadecimal representation of an IP address. + * + * Hexadecimal addresses are used because they can easily be extended to + * IPv6 support. To separate the ranges, the return value from this + * function for an IPv6 address will be prefixed with "v6-", a non- + * hexadecimal string which sorts after the IPv4 addresses. + * + * @param string $ip Quad dotted/octet IP address. + * @return string|bool False on failure + */ + public static function toHex( $ip ) { + if ( self::isIPv6( $ip ) ) { + $n = 'v6-' . self::IPv6ToRawHex( $ip ); + } elseif ( self::isIPv4( $ip ) ) { + // Bug 60035: an IP with leading 0's fails in ip2long sometimes (e.g. 
*.08) + $ip = preg_replace( '/(?<=\.)0+(?=[1-9])/', '', $ip ); + $n = ip2long( $ip ); + if ( $n < 0 ) { + $n += pow( 2, 32 ); + # On 32-bit platforms (and on Windows), 2^32 does not fit into an int, + # so $n becomes a float. We convert it to string instead. + if ( is_float( $n ) ) { + $n = (string)$n; + } + } + if ( $n !== false ) { + # Floating points can handle the conversion; faster than wfBaseConvert() + $n = strtoupper( str_pad( base_convert( $n, 10, 16 ), 8, '0', STR_PAD_LEFT ) ); + } + } else { + $n = false; + } + + return $n; + } + + /** + * Given an IPv6 address in octet notation, returns a pure hex string. + * + * @param string $ip Octet ipv6 IP address. + * @return string|bool Pure hex (uppercase); false on failure + */ + private static function IPv6ToRawHex( $ip ) { + $ip = self::sanitizeIP( $ip ); + if ( !$ip ) { + return false; + } + $r_ip = ''; + foreach ( explode( ':', $ip ) as $v ) { + $r_ip .= str_pad( $v, 4, 0, STR_PAD_LEFT ); + } + + return $r_ip; + } + + /** + * Convert a network specification in CIDR notation + * to an integer network and a number of bits + * + * @param string $range IP with CIDR prefix + * @return array(int or string, int) + */ + public static function parseCIDR( $range ) { + if ( self::isIPv6( $range ) ) { + return self::parseCIDR6( $range ); + } + $parts = explode( '/', $range, 2 ); + if ( count( $parts ) != 2 ) { + return array( false, false ); + } + list( $network, $bits ) = $parts; + $network = ip2long( $network ); + if ( $network !== false && is_numeric( $bits ) && $bits >= 0 && $bits <= 32 ) { + if ( $bits == 0 ) { + $network = 0; + } else { + $network &= ~( ( 1 << ( 32 - $bits ) ) - 1 ); + } + # Convert to unsigned + if ( $network < 0 ) { + $network += pow( 2, 32 ); + } + } else { + $network = false; + $bits = false; + } + + return array( $network, $bits ); + } + + /** + * Given a string range in a number of formats, + * return the start and end of the range in hexadecimal. 
+ * + * Formats are: + * 1.2.3.4/24 CIDR + * 1.2.3.4 - 1.2.3.5 Explicit range + * 1.2.3.4 Single IP + * + * 2001:0db8:85a3::7344/96 CIDR + * 2001:0db8:85a3::7344 - 2001:0db8:85a3::7344 Explicit range + * 2001:0db8:85a3::7344 Single IP + * @param string $range IP range + * @return array(string, string) + */ + public static function parseRange( $range ) { + // CIDR notation + if ( strpos( $range, '/' ) !== false ) { + if ( self::isIPv6( $range ) ) { + return self::parseRange6( $range ); + } + list( $network, $bits ) = self::parseCIDR( $range ); + if ( $network === false ) { + $start = $end = false; + } else { + $start = sprintf( '%08X', $network ); + $end = sprintf( '%08X', $network + pow( 2, ( 32 - $bits ) ) - 1 ); + } + // Explicit range + } elseif ( strpos( $range, '-' ) !== false ) { + list( $start, $end ) = array_map( 'trim', explode( '-', $range, 2 ) ); + if ( self::isIPv6( $start ) && self::isIPv6( $end ) ) { + return self::parseRange6( $range ); + } + if ( self::isIPv4( $start ) && self::isIPv4( $end ) ) { + $start = self::toHex( $start ); + $end = self::toHex( $end ); + if ( $start > $end ) { + $start = $end = false; + } + } else { + $start = $end = false; + } + } else { + # Single IP + $start = $end = self::toHex( $range ); + } + if ( $start === false || $end === false ) { + return array( false, false ); + } else { + return array( $start, $end ); + } + } + + /** + * Convert a network specification in IPv6 CIDR notation to an + * integer network and a number of bits + * + * @param string $range + * + * @return array(string, int) + */ + private static function parseCIDR6( $range ) { + # Explode into <expanded IP,range> + $parts = explode( '/', IP::sanitizeIP( $range ), 2 ); + if ( count( $parts ) != 2 ) { + return array( false, false ); + } + list( $network, $bits ) = $parts; + $network = self::IPv6ToRawHex( $network ); + if ( $network !== false && is_numeric( $bits ) && $bits >= 0 && $bits <= 128 ) { + if ( $bits == 0 ) { + $network = "0"; + } else { + # 
Native 32 bit functions WONT work here!!! + # Convert to a padded binary number + $network = wfBaseConvert( $network, 16, 2, 128 ); + # Truncate the last (128-$bits) bits and replace them with zeros + $network = str_pad( substr( $network, 0, $bits ), 128, 0, STR_PAD_RIGHT ); + # Convert back to an integer + $network = wfBaseConvert( $network, 2, 10 ); + } + } else { + $network = false; + $bits = false; + } + + return array( $network, (int)$bits ); + } + + /** + * Given a string range in a number of formats, return the + * start and end of the range in hexadecimal. For IPv6. + * + * Formats are: + * 2001:0db8:85a3::7344/96 CIDR + * 2001:0db8:85a3::7344 - 2001:0db8:85a3::7344 Explicit range + * 2001:0db8:85a3::7344/96 Single IP + * + * @param string $range + * + * @return array(string, string) + */ + private static function parseRange6( $range ) { + # Expand any IPv6 IP + $range = IP::sanitizeIP( $range ); + // CIDR notation... + if ( strpos( $range, '/' ) !== false ) { + list( $network, $bits ) = self::parseCIDR6( $range ); + if ( $network === false ) { + $start = $end = false; + } else { + $start = wfBaseConvert( $network, 10, 16, 32, false ); + # Turn network to binary (again) + $end = wfBaseConvert( $network, 10, 2, 128 ); + # Truncate the last (128-$bits) bits and replace them with ones + $end = str_pad( substr( $end, 0, $bits ), 128, 1, STR_PAD_RIGHT ); + # Convert to hex + $end = wfBaseConvert( $end, 2, 16, 32, false ); + # see toHex() comment + $start = "v6-$start"; + $end = "v6-$end"; + } + // Explicit range notation... 
+ } elseif ( strpos( $range, '-' ) !== false ) { + list( $start, $end ) = array_map( 'trim', explode( '-', $range, 2 ) ); + $start = self::toHex( $start ); + $end = self::toHex( $end ); + if ( $start > $end ) { + $start = $end = false; + } + } else { + # Single IP + $start = $end = self::toHex( $range ); + } + if ( $start === false || $end === false ) { + return array( false, false ); + } else { + return array( $start, $end ); + } + } + + /** + * Determine if a given IPv4/IPv6 address is in a given CIDR network + * + * @param string $addr The address to check against the given range. + * @param string $range The range to check the given address against. + * @return bool Whether or not the given address is in the given range. + */ + public static function isInRange( $addr, $range ) { + $hexIP = self::toHex( $addr ); + list( $start, $end ) = self::parseRange( $range ); + + return ( strcmp( $hexIP, $start ) >= 0 && + strcmp( $hexIP, $end ) <= 0 ); + } + + /** + * Convert some unusual representations of IPv4 addresses to their + * canonical dotted quad representation. + * + * This currently only checks a few IPV4-to-IPv6 related cases. More + * unusual representations may be added later. + * + * @param string $addr Something that might be an IP address + * @return string Valid dotted quad IPv4 address or null + */ + public static function canonicalize( $addr ) { + // remove zone info (bug 35738) + $addr = preg_replace( '/\%.*/', '', $addr ); + + if ( self::isValid( $addr ) ) { + return $addr; + } + // Turn mapped addresses from ::ce:ffff:1.2.3.4 to 1.2.3.4 + if ( strpos( $addr, ':' ) !== false && strpos( $addr, '.' ) !== false ) { + $addr = substr( $addr, strrpos( $addr, ':' ) + 1 ); + if ( self::isIPv4( $addr ) ) { + return $addr; + } + } + // IPv6 loopback address + $m = array(); + if ( preg_match( '/^0*' . RE_IPV6_GAP . '1$/', $addr, $m ) ) { + return '127.0.0.1'; + } + // IPv4-mapped and IPv4-compatible IPv6 addresses + if ( preg_match( '/^' . RE_IPV6_V4_PREFIX . 
'(' . RE_IP_ADD . ')$/i', $addr, $m ) ) { + return $m[1]; + } + if ( preg_match( '/^' . RE_IPV6_V4_PREFIX . RE_IPV6_WORD . + ':' . RE_IPV6_WORD . '$/i', $addr, $m ) + ) { + return long2ip( ( hexdec( $m[1] ) << 16 ) + hexdec( $m[2] ) ); + } + + return null; // give up + } + + /** + * Gets rid of unneeded numbers in quad-dotted/octet IP strings + * For example, 127.111.113.151/24 -> 127.111.113.0/24 + * @param string $range IP address to normalize + * @return string + */ + public static function sanitizeRange( $range ) { + list( /*...*/, $bits ) = self::parseCIDR( $range ); + list( $start, /*...*/ ) = self::parseRange( $range ); + $start = self::formatHex( $start ); + if ( $bits === false ) { + return $start; // wasn't actually a range + } + + return "$start/$bits"; + } + + /** + * Checks if an IP is a trusted proxy provider. + * Useful to tell if X-Forwarded-For data is possibly bogus. + * Squid cache servers for the site are whitelisted. + * @since 1.24 + * + * @param string $ip + * @return bool + */ + public static function isTrustedProxy( $ip ) { + $trusted = self::isConfiguredProxy( $ip ); + wfRunHooks( 'IsTrustedProxy', array( &$ip, &$trusted ) ); + return $trusted; + } + + /** + * Checks if an IP matches a proxy we've configured + * @since 1.24 + * + * @param string $ip + * @return bool + */ + public static function isConfiguredProxy( $ip ) { + global $wgSquidServers, $wgSquidServersNoPurge; + + wfProfileIn( __METHOD__ ); + // Quick check of known singular proxy servers + $trusted = in_array( $ip, $wgSquidServers ); + + // Check against addresses and CIDR nets in the NoPurge list + if ( !$trusted ) { + if ( !self::$proxyIpSet ) { + self::$proxyIpSet = new IPSet( $wgSquidServersNoPurge ); + } + $trusted = self::$proxyIpSet->match( $ip ); + } + wfProfileOut( __METHOD__ ); + + return $trusted; + } + + /** + * Clears precomputed data used for proxy support. + * Use this only for unit tests. 
	/**
	 * Clears precomputed data used for proxy support.
	 * Use this only for unit tests.
	 *
	 * Resets self::$proxyIpSet to null, so the next isConfiguredProxy()
	 * call that needs the set rebuilds it from $wgSquidServersNoPurge.
	 */
	public static function clearCaches() {
		self::$proxyIpSet = null;
	}
/**
 * Extract-and-Expand Key Derivation Function (HKDF), based on RFC 5869.
 *
 * The static helpers HKDF(), HKDFExtract() and HKDFExpand() implement the
 * two RFC steps. generate() and generateHex() use a lazily created
 * singleton whose salt is taken from a cache entry (or from MWCryptRand
 * when the cache has none) and whose last output block is written back to
 * that cache entry on destruction.
 */
class MWCryptHKDF {

	/**
	 * Singleton instance for public use
	 */
	protected static $singleton = null;

	/**
	 * The persistent cache
	 */
	protected $cache = null;

	/**
	 * Cache key we'll use for our salt
	 */
	protected $cacheKey = null;

	/**
	 * The hash algorithm being used
	 */
	protected $algorithm = null;

	/**
	 * binary string, the salt for the HKDF
	 */
	protected $salt;

	/**
	 * The pseudorandom key
	 */
	private $prk;

	/**
	 * The secret key material. This must be kept secret to preserve
	 * the security properties of this RNG.
	 */
	private $skm;

	/**
	 * The last block (K(i)) of the most recent expanded key
	 */
	protected $lastK;

	/**
	 * a "context information" string CTXinfo (which may be null)
	 * See http://eprint.iacr.org/2010/264.pdf Section 4.1
	 */
	protected $context = array();

	/**
	 * Round count is computed based on the hash'es output length,
	 * which neither php nor openssl seem to provide easily.
	 * Maps algorithm name => digest length in bytes.
	 */
	public static $hashLength = array(
		'md5' => 16,
		'sha1' => 20,
		'sha224' => 28,
		'sha256' => 32,
		'sha384' => 48,
		'sha512' => 64,
		'ripemd128' => 16,
		'ripemd160' => 20,
		'ripemd256' => 32,
		'ripemd320' => 40,
		'whirlpool' => 64,
	);

	/**
	 * @param string $secretKeyMaterial At least 16 bytes long
	 * @param string $algorithm Name of hashing algorithm
	 * @param BagOStuff $cache
	 * @param string|array $context Context to mix into HKDF context
	 * @throws MWException If the secret is shorter than 16 bytes
	 */
	public function __construct( $secretKeyMaterial, $algorithm, $cache, $context ) {
		if ( strlen( $secretKeyMaterial ) < 16 ) {
			throw new MWException( "MWCryptHKDF secret was too short." );
		}
		$this->skm = $secretKeyMaterial;
		$this->algorithm = $algorithm;
		$this->cache = $cache;
		$this->salt = ''; // Initialize a blank salt, see getSaltUsingCache()
		$this->prk = '';
		$this->context = is_array( $context ) ? $context : array( $context );

		// To prevent every call from hitting the same memcache server, pick
		// from a set of keys to use. mt_rand is only used to pick a random
		// server, and does not affect the security of the process.
		$this->cacheKey = wfMemcKey( 'HKDF', mt_rand( 0, 16 ) );
	}

	/**
	 * Save the last block generated, so the next user will compute a different PRK
	 * from the same SKM. This should keep things unpredictable even if an attacker
	 * is able to influence CTXinfo.
	 */
	public function __destruct() {
		if ( $this->lastK ) {
			$this->cache->set( $this->cacheKey, $this->lastK );
		}
	}

	/**
	 * MW specific salt, cached from last run
	 * @return string Binary string
	 */
	protected function getSaltUsingCache() {
		if ( $this->salt == '' ) {
			$lastSalt = $this->cache->get( $this->cacheKey );
			if ( $lastSalt === false ) {
				// If we don't have a previous value to use as our salt, we use
				// 16 bytes from MWCryptRand, which will use a small amount of
				// entropy from our pool. Note, "XTR may be deterministic or keyed
				// via an optional “salt value” (i.e., a non-secret random
				// value)..." - http://eprint.iacr.org/2010/264.pdf. However, we
				// use a strongly random value since we can.
				$lastSalt = MWCryptRand::generate( 16 );
			}
			// Get a binary string that is hashLen long
			$this->salt = hash( $this->algorithm, $lastSalt, true );
		}
		return $this->salt;
	}

	/**
	 * Return a singleton instance, based on the global configs.
	 *
	 * The instance is built on the first call only; later calls return it
	 * straight away without touching the cache backend or the clock.
	 *
	 * @throws MWException If neither $wgHKDFSecret nor $wgSecretKey is set
	 * @return MWCryptHKDF
	 */
	protected static function singleton() {
		global $wgHKDFAlgorithm, $wgHKDFSecret, $wgSecretKey;

		if ( !is_null( self::$singleton ) ) {
			return self::$singleton;
		}

		$secret = $wgHKDFSecret ?: $wgSecretKey;
		if ( !$secret ) {
			throw new MWException( "Cannot use MWCryptHKDF without a secret." );
		}

		// In HKDF, the context can be known to the attacker, but this will
		// keep simultaneous runs from producing the same output.
		$context = array();
		$context[] = microtime();
		$context[] = getmypid();
		$context[] = gethostname();

		// Setup salt cache. Use APC, or fallback to the main cache if it isn't setup
		try {
			$cache = ObjectCache::newAccelerator( array() );
		} catch ( Exception $e ) {
			$cache = wfGetMainCache();
		}

		self::$singleton = new self( $secret, $wgHKDFAlgorithm, $cache, $context );

		return self::$singleton;
	}

	/**
	 * Produce $bytes of secure random data. As a side-effect,
	 * $this->lastK is set to the last hashLen block of key material.
	 * @param int $bytes Number of bytes of data
	 * @param string $context Context to mix into CTXinfo
	 * @return string Binary string of length $bytes
	 */
	protected function realGenerate( $bytes, $context = '' ) {
		// The PRK is extracted once per instance, on first use
		if ( $this->prk === '' ) {
			$salt = $this->getSaltUsingCache();
			$this->prk = self::HKDFExtract(
				$this->algorithm,
				$salt,
				$this->skm
			);
		}

		$CTXinfo = implode( ':', array_merge( $this->context, array( $context ) ) );

		return self::HKDFExpand(
			$this->algorithm,
			$this->prk,
			$CTXinfo,
			$bytes,
			$this->lastK
		);
	}

	/**
	 * RFC5869 defines HKDF in 2 steps, extraction and expansion.
	 * From http://eprint.iacr.org/2010/264.pdf:
	 *
	 * The scheme HKDF is specified as:
	 * 	HKDF(XTS, SKM, CTXinfo, L) = K(1) || K(2) || ... || K(t)
	 * where the values K(i) are defined as follows:
	 * 	PRK = HMAC(XTS, SKM)
	 * 	K(1) = HMAC(PRK, CTXinfo || 0);
	 * 	K(i+1) = HMAC(PRK, K(i) || CTXinfo || i), 1 <= i < t;
	 * where t = [L/k] and the value K(t) is truncated to its first d = L mod k bits;
	 * the counter i is non-wrapping and of a given fixed size, e.g., a single byte.
	 * Note that the length of the HMAC output is the same as its key length and therefore
	 * the scheme is well defined.
	 *
	 * XTS is the "extractor salt"
	 * SKM is the "secret keying material"
	 *
	 * N.B. http://eprint.iacr.org/2010/264.pdf seems to differ from RFC 5869 in that the test
	 * vectors from RFC 5869 only work if K(0) = '' and K(1) = HMAC(PRK, K(0) || CTXinfo || 1)
	 *
	 * @param string $hash The hashing function to use (e.g., sha256)
	 * @param string $ikm The input keying material
	 * @param string $salt The salt to add to the ikm, to get the prk
	 * @param string $info Optional context (change the output without affecting
	 *	the randomness properties of the output)
	 * @param int $L Number of bytes to return
	 * @return string Cryptographically secure pseudorandom binary string
	 */
	public static function HKDF( $hash, $ikm, $salt, $info, $L ) {
		$prk = self::HKDFExtract( $hash, $salt, $ikm );
		$okm = self::HKDFExpand( $hash, $prk, $info, $L );
		return $okm;
	}

	/**
	 * Extract the PRK, PRK = HMAC(XTS, SKM)
	 * Note that the hmac is keyed with XTS (the salt),
	 * and the SKM (source key material) is the "data".
	 *
	 * @param string $hash The hashing function to use (e.g., sha256)
	 * @param string $salt The salt to add to the ikm, to get the prk
	 * @param string $ikm The input keying material
	 * @return string Binary string (pseudorandom key) used as input to HKDFExpand
	 */
	private static function HKDFExtract( $hash, $salt, $ikm ) {
		return hash_hmac( $hash, $ikm, $salt, true );
	}

	/**
	 * Expand the key with the given context
	 *
	 * @param string $hash Hashing Algorithm; must be a key of self::$hashLength
	 * @param string $prk A pseudorandom key of at least HashLen octets
	 *	(usually, the output from the extract step)
	 * @param string $info Optional context and application specific information
	 *	(can be a zero-length string)
	 * @param int $bytes Length of output keying material in bytes
	 *	(<= 255*HashLen)
	 * @param string &$lastK Set by this function to the last block of the expansion.
	 *	Its incoming value is prepended to the first round's HMAC input.
	 *	In MediaWiki, this is used to seed future Extractions.
	 * @throws MWException If the algorithm is not listed in self::$hashLength
	 *	or more than 255*HashLen bytes are requested
	 * @return string Cryptographically secure random string $bytes long
	 */
	private static function HKDFExpand( $hash, $prk, $info, $bytes, &$lastK = '' ) {
		// Without a known digest length the round count cannot be computed;
		// fail with a message that names the real problem.
		if ( !isset( self::$hashLength[$hash] ) ) {
			throw new MWException( "Unsupported hash algorithm passed to HKDFExpand: $hash" );
		}
		$hashLen = self::$hashLength[$hash];

		if ( $bytes > 255 * $hashLen ) {
			throw new MWException( "Too many bytes requested from HDKFExpand" );
		}

		$rounds = ceil( $bytes / $hashLen );
		$output = '';

		// K(1) = HMAC(PRK, CTXinfo || 1);
		// K(i) = HMAC(PRK, K(i-1) || CTXinfo || i); 1 < i <= t;
		for ( $counter = 1; $counter <= $rounds; ++$counter ) {
			$lastK = hash_hmac(
				$hash,
				$lastK . $info . chr( $counter ),
				$prk,
				true
			);
			$output .= $lastK;
		}

		return substr( $output, 0, $bytes );
	}

	/**
	 * Generate cryptographically random data and return it in raw binary form.
	 *
	 * @param int $bytes The number of bytes of random data to generate
	 * @param string $context String to mix into HMAC context
	 * @return string Binary string of length $bytes
	 */
	public static function generate( $bytes, $context = '' ) {
		return self::singleton()->realGenerate( $bytes, $context );
	}

	/**
	 * Generate cryptographically random data and return it in hexadecimal string format.
	 * See MWCryptRand::realGenerateHex for details of the char-to-byte conversion logic.
	 *
	 * @param int $chars The number of hex chars of random data to generate
	 * @param string $context String to mix into HMAC context
	 * @return string Random hex characters, $chars long
	 */
	public static function generateHex( $chars, $context = '' ) {
		// Two hex characters per byte; round up and trim the surplus
		$bytes = ceil( $chars / 2 );
		$hex = bin2hex( self::singleton()->realGenerate( $bytes, $context ) );
		return substr( $hex, 0, $chars );
	}

}
+ * This is used in combination with the hash length to determine the duration + * we should spend doing drift calculations. + */ + const MSEC_PER_BYTE = 0.5; + + /** + * Singleton instance for public use + */ + protected static $singleton = null; + + /** + * The hash algorithm being used + */ + protected $algo = null; + + /** + * The number of bytes outputted by the hash algorithm + */ + protected $hashLength = null; + + /** + * A boolean indicating whether the previous random generation was done using + * cryptographically strong random number generator or not. + */ + protected $strong = null; + + /** + * Initialize an initial random state based off of whatever we can find + * @return string + */ + protected function initialRandomState() { + // $_SERVER contains a variety of unstable user and system specific information + // It'll vary a little with each page, and vary even more with separate users + // It'll also vary slightly across different machines + $state = serialize( $_SERVER ); + + // To try vary the system information of the state a bit more + // by including the system's hostname into the state + $state .= wfHostname(); + + // Try to gather a little entropy from the different php rand sources + $state .= rand() . uniqid( mt_rand(), true ); + + // Include some information about the filesystem's current state in the random state + $files = array(); + + // We know this file is here so grab some info about ourselves + $files[] = __FILE__; + + // We must also have a parent folder, and with the usual file structure, a grandparent + $files[] = __DIR__; + $files[] = dirname( __DIR__ ); + + // The config file is likely the most often edited file we know should + // be around so include its stat info into the state. + // The constant with its location will almost always be defined, as + // WebStart.php defines MW_CONFIG_FILE to $IP/LocalSettings.php unless + // being configured with MW_CONFIG_CALLBACK (e.g. the installer). 
+ if ( defined( 'MW_CONFIG_FILE' ) ) { + $files[] = MW_CONFIG_FILE; + } + + foreach ( $files as $file ) { + wfSuppressWarnings(); + $stat = stat( $file ); + wfRestoreWarnings(); + if ( $stat ) { + // stat() duplicates data into numeric and string keys so kill off all the numeric ones + foreach ( $stat as $k => $v ) { + if ( is_numeric( $k ) ) { + unset( $k ); + } + } + // The absolute filename itself will differ from install to install so don't leave it out + if ( ( $path = realpath( $file ) ) !== false ) { + $state .= $path; + } else { + $state .= $file; + } + $state .= implode( '', $stat ); + } else { + // The fact that the file isn't there is worth at least a + // minuscule amount of entropy. + $state .= '0'; + } + } + + // Try and make this a little more unstable by including the varying process + // id of the php process we are running inside of if we are able to access it + if ( function_exists( 'getmypid' ) ) { + $state .= getmypid(); + } + + // If available try to increase the instability of the data by throwing in + // the precise amount of memory that we happen to be using at the moment. + if ( function_exists( 'memory_get_usage' ) ) { + $state .= memory_get_usage( true ); + } + + // It's mostly worthless but throw the wiki's id into the data for a little more variance + $state .= wfWikiID(); + + // If we have a secret key set then throw it into the state as well + global $wgSecretKey; + if ( $wgSecretKey ) { + $state .= $wgSecretKey; + } + + return $state; + } + + /** + * Randomly hash data while mixing in clock drift data for randomness + * + * @param string $data The data to randomly hash. 
	/**
	 * Randomly hash data while mixing in clock drift data for randomness
	 *
	 * Runs a busy loop that writes into a large string buffer and appends
	 * the microsecond difference between successive microtime() readings
	 * to $data. The accumulated data is folded with sha1() on every 100th
	 * iteration (including the first) and finally passed through
	 * $this->hash(). A summary of the loop timing is sent to wfDebug().
	 *
	 * @param string $data The data to randomly hash.
	 * @return string The hashed bytes
	 * @author Tim Starling
	 */
	protected function driftHash( $data ) {
		// Minimum number of iterations (to avoid slow operations causing the
		// loop to gather little entropy)
		$minIterations = self::MIN_ITERATIONS;
		// Duration of time to spend doing calculations (in seconds):
		// MSEC_PER_BYTE milliseconds for each byte of hash output
		$duration = ( self::MSEC_PER_BYTE / 1000 ) * $this->hashLength();
		// Create a buffer to use to trigger memory operations
		$bufLength = 10000000;
		$buffer = str_repeat( ' ', $bufLength );
		$bufPos = 0;

		// Keep iterating until BOTH $minIterations iterations have run and
		// $duration seconds have elapsed
		$iterations = 0;
		$startTime = microtime( true );
		$currentTime = $startTime;
		while ( $iterations < $minIterations || $currentTime - $startTime < $duration ) {
			// Trigger some memory writing to trigger some bus activity
			// This may create variance in the time between iterations
			$bufPos = ( $bufPos + 13 ) % $bufLength;
			$buffer[$bufPos] = ' ';
			// Add the drift between this iteration and the last in as entropy
			// (whole microseconds, appended as decimal text)
			$nextTime = microtime( true );
			$delta = (int)( ( $nextTime - $currentTime ) * 1000000 );
			$data .= $delta;
			// Every 100 iterations hash the data and entropy; this also
			// keeps $data from growing without bound
			if ( $iterations % 100 === 0 ) {
				$data = sha1( $data );
			}
			$currentTime = $nextTime;
			$iterations++;
		}
		$timeTaken = $currentTime - $startTime;
		// Final pass through the configured hash, giving raw binary output
		$data = $this->hash( $data );

		wfDebug( __METHOD__ . ": Clock drift calculation " .
			"(time-taken=" . ( $timeTaken * 1000 ) . "ms, " .
			"iterations=$iterations, " .
			"time-per-iteration=" . ( $timeTaken / $iterations * 1e6 ) . "us)\n" );

		return $data;
	}
+ * + * @return int Number of bytes the hash outputs + */ + protected function hashLength() { + if ( is_null( $this->hashLength ) ) { + $this->hashLength = strlen( $this->hash( '' ) ); + } + + return $this->hashLength; + } + + /** + * Generate an acceptably unstable one-way-hash of some text + * making use of the best hash algorithm that we have available. + * + * @param string $data + * @return string A raw hash of the data + */ + protected function hash( $data ) { + return hash( $this->hashAlgo(), $data, true ); + } + + /** + * Generate an acceptably unstable one-way-hmac of some text + * making use of the best hash algorithm that we have available. + * + * @param string $data + * @param string $key + * @return string A raw hash of the data + */ + protected function hmac( $data, $key ) { + return hash_hmac( $this->hashAlgo(), $data, $key, true ); + } + + /** + * @see self::wasStrong() + */ + public function realWasStrong() { + if ( is_null( $this->strong ) ) { + throw new MWException( __METHOD__ . ' called before generation of random data' ); + } + + return $this->strong; + } + + /** + * @see self::generate() + */ + public function realGenerate( $bytes, $forceStrong = false ) { + wfProfileIn( __METHOD__ ); + + wfDebug( __METHOD__ . ": Generating cryptographic random bytes for " . + wfGetAllCallers( 5 ) . 
"\n" ); + + $bytes = floor( $bytes ); + static $buffer = ''; + if ( is_null( $this->strong ) ) { + // Set strength to false initially until we know what source data is coming from + $this->strong = true; + } + + if ( strlen( $buffer ) < $bytes ) { + // If available make use of mcrypt_create_iv URANDOM source to generate randomness + // On unix-like systems this reads from /dev/urandom but does it without any buffering + // and bypasses openbasedir restrictions, so it's preferable to reading directly + // On Windows starting in PHP 5.3.0 Windows' native CryptGenRandom is used to generate + // entropy so this is also preferable to just trying to read urandom because it may work + // on Windows systems as well. + if ( function_exists( 'mcrypt_create_iv' ) ) { + wfProfileIn( __METHOD__ . '-mcrypt' ); + $rem = $bytes - strlen( $buffer ); + $iv = mcrypt_create_iv( $rem, MCRYPT_DEV_URANDOM ); + if ( $iv === false ) { + wfDebug( __METHOD__ . ": mcrypt_create_iv returned false.\n" ); + } else { + $buffer .= $iv; + wfDebug( __METHOD__ . ": mcrypt_create_iv generated " . strlen( $iv ) . + " bytes of randomness.\n" ); + } + wfProfileOut( __METHOD__ . '-mcrypt' ); + } + } + + if ( strlen( $buffer ) < $bytes ) { + // If available make use of openssl's random_pseudo_bytes method to + // attempt to generate randomness. However don't do this on Windows + // with PHP < 5.3.4 due to a bug: + // http://stackoverflow.com/questions/1940168/openssl-random-pseudo-bytes-is-slow-php + // http://git.php.net/?p=php-src.git;a=commitdiff;h=cd62a70863c261b07f6dadedad9464f7e213cad5 + if ( function_exists( 'openssl_random_pseudo_bytes' ) + && ( !wfIsWindows() || version_compare( PHP_VERSION, '5.3.4', '>=' ) ) + ) { + wfProfileIn( __METHOD__ . '-openssl' ); + $rem = $bytes - strlen( $buffer ); + $openssl_bytes = openssl_random_pseudo_bytes( $rem, $openssl_strong ); + if ( $openssl_bytes === false ) { + wfDebug( __METHOD__ . 
": openssl_random_pseudo_bytes returned false.\n" ); + } else { + $buffer .= $openssl_bytes; + wfDebug( __METHOD__ . ": openssl_random_pseudo_bytes generated " . + strlen( $openssl_bytes ) . " bytes of " . + ( $openssl_strong ? "strong" : "weak" ) . " randomness.\n" ); + } + if ( strlen( $buffer ) >= $bytes ) { + // openssl tells us if the random source was strong, if some of our data was generated + // using it use it's say on whether the randomness is strong + $this->strong = !!$openssl_strong; + } + wfProfileOut( __METHOD__ . '-openssl' ); + } + } + + // Only read from urandom if we can control the buffer size or were passed forceStrong + if ( strlen( $buffer ) < $bytes && + ( function_exists( 'stream_set_read_buffer' ) || $forceStrong ) + ) { + wfProfileIn( __METHOD__ . '-fopen-urandom' ); + $rem = $bytes - strlen( $buffer ); + if ( !function_exists( 'stream_set_read_buffer' ) && $forceStrong ) { + wfDebug( __METHOD__ . ": Was forced to read from /dev/urandom " . + "without control over the buffer size.\n" ); + } + // /dev/urandom is generally considered the best possible commonly + // available random source, and is available on most *nix systems. + wfSuppressWarnings(); + $urandom = fopen( "/dev/urandom", "rb" ); + wfRestoreWarnings(); + + // Attempt to read all our random data from urandom + // php's fread always does buffered reads based on the stream's chunk_size + // so in reality it will usually read more than the amount of data we're + // asked for and not storing that risks depleting the system's random pool. + // If stream_set_read_buffer is available set the chunk_size to the amount + // of data we need. Otherwise read 8k, php's default chunk_size. 
+ if ( $urandom ) { + // php's default chunk_size is 8k + $chunk_size = 1024 * 8; + if ( function_exists( 'stream_set_read_buffer' ) ) { + // If possible set the chunk_size to the amount of data we need + stream_set_read_buffer( $urandom, $rem ); + $chunk_size = $rem; + } + $random_bytes = fread( $urandom, max( $chunk_size, $rem ) ); + $buffer .= $random_bytes; + fclose( $urandom ); + wfDebug( __METHOD__ . ": /dev/urandom generated " . strlen( $random_bytes ) . + " bytes of randomness.\n" ); + + if ( strlen( $buffer ) >= $bytes ) { + // urandom is always strong, set to true if all our data was generated using it + $this->strong = true; + } + } else { + wfDebug( __METHOD__ . ": /dev/urandom could not be opened.\n" ); + } + wfProfileOut( __METHOD__ . '-fopen-urandom' ); + } + + // If we cannot use or generate enough data from a secure source + // use this loop to generate a good set of pseudo random data. + // This works by initializing a random state using a pile of unstable data + // and continually shoving it through a hash along with a variable salt. + // We hash the random state with more salt to avoid the state from leaking + // out and being used to predict the /randomness/ that follows. + if ( strlen( $buffer ) < $bytes ) { + wfDebug( __METHOD__ . + ": Falling back to using a pseudo random state to generate randomness.\n" ); + } + while ( strlen( $buffer ) < $bytes ) { + wfProfileIn( __METHOD__ . '-fallback' ); + $buffer .= $this->hmac( $this->randomState(), mt_rand() ); + // This code is never really cryptographically strong, if we use it + // at all, then set strong to false. + $this->strong = false; + wfProfileOut( __METHOD__ . 
'-fallback' ); + } + + // Once the buffer has been filled up with enough random data to fulfill + // the request shift off enough data to handle the request and leave the + // unused portion left inside the buffer for the next request for random data + $generated = substr( $buffer, 0, $bytes ); + $buffer = substr( $buffer, $bytes ); + + wfDebug( __METHOD__ . ": " . strlen( $buffer ) . + " bytes of randomness leftover in the buffer.\n" ); + + wfProfileOut( __METHOD__ ); + + return $generated; + } + + /** + * @see self::generateHex() + */ + public function realGenerateHex( $chars, $forceStrong = false ) { + // hex strings are 2x the length of raw binary so we divide the length in half + // odd numbers will result in a .5 that leads the generate() being 1 character + // short, so we use ceil() to ensure that we always have enough bytes + $bytes = ceil( $chars / 2 ); + // Generate the data and then convert it to a hex string + $hex = bin2hex( $this->generate( $bytes, $forceStrong ) ); + + // A bit of paranoia here, the caller asked for a specific length of string + // here, and it's possible (eg when given an odd number) that we may actually + // have at least 1 char more than they asked for. Just in case they made this + // call intending to insert it into a database that does truncation we don't + // want to give them too much and end up with their database and their live + // code having two different values because part of what we gave them is truncated + // hence, we strip out any run of characters longer than what we were asked for. 
+ return substr( $hex, 0, $chars ); + } + + /** Publicly exposed static methods **/ + + /** + * Return a singleton instance of MWCryptRand + * @return MWCryptRand + */ + protected static function singleton() { + if ( is_null( self::$singleton ) ) { + self::$singleton = new self; + } + + return self::$singleton; + } + + /** + * Return a boolean indicating whether or not the source used for cryptographic + * random bytes generation in the previously run generate* call + * was cryptographically strong. + * + * @return bool Returns true if the source was strong, false if not. + */ + public static function wasStrong() { + return self::singleton()->realWasStrong(); + } + + /** + * Generate a run of (ideally) cryptographically random data and return + * it in raw binary form. + * You can use MWCryptRand::wasStrong() if you wish to know if the source used + * was cryptographically strong. + * + * @param int $bytes The number of bytes of random data to generate + * @param bool $forceStrong Pass true if you want generate to prefer cryptographically + * strong sources of entropy even if reading from them may steal + * more entropy from the system than optimal. + * @return string Raw binary random data + */ + public static function generate( $bytes, $forceStrong = false ) { + return self::singleton()->realGenerate( $bytes, $forceStrong ); + } + + /** + * Generate a run of (ideally) cryptographically random data and return + * it in hexadecimal string format. + * You can use MWCryptRand::wasStrong() if you wish to know if the source used + * was cryptographically strong. + * + * @param int $chars The number of hex chars of random data to generate + * @param bool $forceStrong Pass true if you want generate to prefer cryptographically + * strong sources of entropy even if reading from them may steal + * more entropy from the system than optimal. 
+ * @return string Hexadecimal random data + */ + public static function generateHex( $chars, $forceStrong = false ) { + return self::singleton()->realGenerateHex( $chars, $forceStrong ); + } +} diff --git a/includes/utils/MWFunction.php b/includes/utils/MWFunction.php new file mode 100644 index 00000000..3a0492dc --- /dev/null +++ b/includes/utils/MWFunction.php @@ -0,0 +1,63 @@ +<?php +/** + * Helper methods to call functions and instance objects. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +class MWFunction { + + /** + * @deprecated since 1.22; use call_user_func() + * @param callable $callback + * @return mixed + */ + public static function call( $callback ) { + wfDeprecated( __METHOD__, '1.22' ); + $args = func_get_args(); + + return call_user_func_array( 'call_user_func', $args ); + } + + /** + * @deprecated since 1.22; use call_user_func_array() + * @param callable $callback + * @param array $argsarams + * @return mixed + */ + public static function callArray( $callback, $argsarams ) { + wfDeprecated( __METHOD__, '1.22' ); + + return call_user_func_array( $callback, $argsarams ); + } + + /** + * @param string $class + * @param array $args + * @return object + */ + public static function newObj( $class, $args = array() ) { + if ( !count( $args ) ) { + return new $class; + } + + $ref = new ReflectionClass( $class ); + + return $ref->newInstanceArgs( $args ); + } +} diff --git a/includes/utils/README b/includes/utils/README new file mode 100644 index 00000000..b5b8ec88 --- /dev/null +++ b/includes/utils/README @@ -0,0 +1,9 @@ +The classes in this directory are general utilities for use by any part of +MediaWiki. They do not favour any particular user interface and are not +constrained to serve any particular feature. This is similar to includes/libs, +except that some dependency on the MediaWiki framework (such as the use of +MWException, Status or wfDebug()) disqualifies them from use outside of +MediaWiki without modification. + +Utilities should not use global configuration variables, rather they should rely +on the caller to configure their behaviour. diff --git a/includes/utils/StringUtils.php b/includes/utils/StringUtils.php new file mode 100644 index 00000000..86f45122 --- /dev/null +++ b/includes/utils/StringUtils.php @@ -0,0 +1,612 @@ +<?php +/** + * Methods to play with strings. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * A collection of static methods to play with strings. + */ +class StringUtils { + /** + * Test whether a string is valid UTF-8. + * + * The function check for invalid byte sequences, overlong encoding but + * not for different normalisations. + * + * This relies internally on the mbstring function mb_check_encoding() + * hardcoded to check against UTF-8. Whenever the function is not available + * we fallback to a pure PHP implementation. Setting $disableMbstring to + * true will skip the use of mb_check_encoding, this is mostly intended for + * unit testing our internal implementation. + * + * @since 1.21 + * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation. + * In particular, the pure PHP code path did not in fact check for overlong forms. + * Beware of this when backporting code to that version of MediaWiki. + * + * @param string $value String to check + * @param bool $disableMbstring Whether to use the pure PHP + * implementation instead of trying mb_check_encoding. Intended for unit + * testing. 
Default: false + * + * @return bool Whether the given $value is a valid UTF-8 encoded string + */ + static function isUtf8( $value, $disableMbstring = false ) { + $value = (string)$value; + + // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above + // U+10FFFF are incorrectly allowed, so we have to check for them separately. + if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { + static $newPHP; + if ( $newPHP === null ) { + $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); + } + + return mb_check_encoding( $value, 'UTF-8' ) && + ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); + } + + if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { + // String contains only ASCII characters, has to be valid + return true; + } + + // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault) + // for large input, we check for invalid sequences (<= 5 bytes) rather than valid + // sequences, which can be as long as the input string is. Multiple short regexes are + // used rather than a single long regex for performance. 
+ static $regexes; + if ( $regexes === null ) { + $cont = "[\x80-\xbf]"; + $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here + $regexes = array( + // Continuation byte at the start + "/^$cont/", + + // ASCII byte followed by a continuation byte + "/[\\x00-\x7f]$cont/S", + + // Illegal byte + "/[\xc0\xc1\xf5-\xff]/S", + + // Invalid 2-byte sequence, or valid one then an extra continuation byte + "/[\xc2-\xdf](?!$cont$after)/S", + + // Invalid 3-byte sequence, or valid one then an extra continuation byte + "/\xe0(?![\xa0-\xbf]$cont$after)/", + "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S", + "/\xed(?![\x80-\x9f]$cont$after)/", + + // Invalid 4-byte sequence, or valid one then an extra continuation byte + "/\xf0(?![\x90-\xbf]$cont{2}$after)/", + "/[\xf1-\xf3](?!$cont{3}$after)/S", + "/\xf4(?![\x80-\x8f]$cont{2}$after)/", + ); + } + + foreach ( $regexes as $regex ) { + if ( preg_match( $regex, $value ) !== 0 ) { + return false; + } + } + + return true; + } + + /** + * Perform an operation equivalent to + * + * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject ); + * + * except that it's worst-case O(N) instead of O(N^2) + * + * Compared to delimiterReplace(), this implementation is fast but memory- + * hungry and inflexible. The memory requirements are such that I don't + * recommend using it on anything but guaranteed small chunks of text. + * + * @param string $startDelim + * @param string $endDelim + * @param string $replace + * @param string $subject + * + * @return string + */ + static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) { + $segments = explode( $startDelim, $subject ); + $output = array_shift( $segments ); + foreach ( $segments as $s ) { + $endDelimPos = strpos( $s, $endDelim ); + if ( $endDelimPos === false ) { + $output .= $startDelim . $s; + } else { + $output .= $replace . 
substr( $s, $endDelimPos + strlen( $endDelim ) ); + } + } + + return $output; + } + + /** + * Perform an operation equivalent to + * + * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject ) + * + * This implementation is slower than hungryDelimiterReplace but uses far less + * memory. The delimiters are literal strings, not regular expressions. + * + * If the start delimiter ends with an initial substring of the end delimiter, + * e.g. in the case of C-style comments, the behavior differs from the model + * regex. In this implementation, the end must share no characters with the + * start, so e.g. /*\/ is not considered to be both the start and end of a + * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/. + * + * @param string $startDelim Start delimiter + * @param string $endDelim End delimiter + * @param callable $callback Function to call on each match + * @param string $subject + * @param string $flags Regular expression flags + * @throws MWException + * @return string + */ + static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, + $subject, $flags = '' + ) { + $inputPos = 0; + $outputPos = 0; + $output = ''; + $foundStart = false; + $encStart = preg_quote( $startDelim, '!' ); + $encEnd = preg_quote( $endDelim, '!' ); + $strcmp = strpos( $flags, 'i' ) === false ? 
'strcmp' : 'strcasecmp'; + $endLength = strlen( $endDelim ); + $m = array(); + + while ( $inputPos < strlen( $subject ) && + preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) + ) { + $tokenOffset = $m[0][1]; + if ( $m[1][0] != '' ) { + if ( $foundStart && + $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 + ) { + # An end match is present at the same location + $tokenType = 'end'; + $tokenLength = $endLength; + } else { + $tokenType = 'start'; + $tokenLength = strlen( $m[0][0] ); + } + } elseif ( $m[2][0] != '' ) { + $tokenType = 'end'; + $tokenLength = strlen( $m[0][0] ); + } else { + throw new MWException( 'Invalid delimiter given to ' . __METHOD__ ); + } + + if ( $tokenType == 'start' ) { + # Only move the start position if we haven't already found a start + # This means that START START END matches outer pair + if ( !$foundStart ) { + # Found start + $inputPos = $tokenOffset + $tokenLength; + # Write out the non-matching section + $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos ); + $outputPos = $tokenOffset; + $contentPos = $inputPos; + $foundStart = true; + } else { + # Move the input position past the *first character* of START, + # to protect against missing END when it overlaps with START + $inputPos = $tokenOffset + 1; + } + } elseif ( $tokenType == 'end' ) { + if ( $foundStart ) { + # Found match + $output .= call_user_func( $callback, array( + substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ), + substr( $subject, $contentPos, $tokenOffset - $contentPos ) + ) ); + $foundStart = false; + } else { + # Non-matching end, write it out + $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos ); + } + $inputPos = $outputPos = $tokenOffset + $tokenLength; + } else { + throw new MWException( 'Invalid delimiter given to ' . 
__METHOD__ ); + } + } + if ( $outputPos < strlen( $subject ) ) { + $output .= substr( $subject, $outputPos ); + } + + return $output; + } + + /** + * Perform an operation equivalent to + * + * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject ) + * + * @param string $startDelim Start delimiter regular expression + * @param string $endDelim End delimiter regular expression + * @param string $replace Replacement string. May contain $1, which will be + * replaced by the text between the delimiters + * @param string $subject String to search + * @param string $flags Regular expression flags + * @return string The string with the matches replaced + */ + static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) { + $replacer = new RegexlikeReplacer( $replace ); + + return self::delimiterReplaceCallback( $startDelim, $endDelim, + $replacer->cb(), $subject, $flags ); + } + + /** + * More or less "markup-safe" explode() + * Ignores any instances of the separator inside <...> + * @param string $separator + * @param string $text + * @return array + */ + static function explodeMarkup( $separator, $text ) { + $placeholder = "\x00"; + + // Remove placeholder instances + $text = str_replace( $placeholder, '', $text ); + + // Replace instances of the separator inside HTML-like tags with the placeholder + $replacer = new DoubleReplacer( $separator, $placeholder ); + $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text ); + + // Explode, then put the replaced separators back in + $items = explode( $separator, $cleaned ); + foreach ( $items as $i => $str ) { + $items[$i] = str_replace( $placeholder, $separator, $str ); + } + + return $items; + } + + /** + * Escape a string to make it suitable for inclusion in a preg_replace() + * replacement parameter. 
+ * + * @param string $string + * @return string + */
	static function escapeRegexReplacement( $string ) {
		// Backslashes must be doubled first, otherwise the backslash that is
		// added in front of each "$" below would itself get doubled.
		$string = str_replace( '\\', '\\\\', $string );
		$string = str_replace( '$', '\\$', $string );

		return $string;
	}

	/**
	 * Workalike for explode() with limited memory usage.
	 *
	 * Subjects containing more than 1000 separators are walked lazily with an
	 * ExplodeIterator; anything smaller is simply exploded and wrapped in an
	 * ArrayIterator. Either way the caller gets something it can foreach over.
	 *
	 * @param string $separator
	 * @param string $subject
	 * @return ArrayIterator|ExplodeIterator
	 */
	static function explode( $separator, $subject ) {
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}
}

/**
 * Base class for "replacers", objects used in preg_replace_callback() and
 * StringUtils::delimiterReplaceCallback()
 *
 * Subclasses provide a replace( $matches ) method; cb() wraps it in a callable.
 */
class Replacer {
	/**
	 * Get a callable that invokes replace() on this object.
	 *
	 * @return array
	 */
	function cb() {
		// Objects are passed by handle, so no reference to $this is needed
		return array( $this, 'replace' );
	}
}

/**
 * Class to replace regex matches with a string similar to that used in preg_replace()
 */
class RegexlikeReplacer extends Replacer {
	private $r;

	/**
	 * @param string $r Replacement template; "$0", "$1", ... are substituted
	 *   with the corresponding match groups
	 */
	function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * @param array $matches Match array as passed by preg_replace_callback()
	 * @return string
	 */
	function replace( $matches ) {
		$pairs = array();
		foreach ( $matches as $i => $match ) {
			$pairs["\$$i"] = $match;
		}

		return strtr( $this->r, $pairs );
	}
}

/**
 * Class to perform secondary replacement within each replacement string
 */
class DoubleReplacer extends Replacer {
	// These used to be created dynamically by the constructor; they are
	// declared public to keep the visibility such dynamic properties had.

	/** @var mixed Search value(s) handed to str_replace() */
	public $from;

	/** @var mixed Replacement value(s) handed to str_replace() */
	public $to;

	/** @var int Index of the match group to operate on */
	public $index;

	/**
	 * @param mixed $from
	 * @param mixed $to
	 * @param int $index
	 */
	function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * @param array $matches
	 * @return mixed
	 */
	function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}

/**
 * Class to perform replacement based on a simple hashtable lookup
 */
class HashtableReplacer extends Replacer {
	private $table, $index;

	/**
	 * @param array $table Map of matched text to its replacement
	 * @param int $index Index of the match group used as the lookup key
	 */
	function __construct( $table, $index = 0 ) {
		$this->table = $table;
		$this->index = $index;
	}

	/**
	 * @param array $matches
	 * @return mixed
	 */
	function replace( $matches ) {
		return $this->table[$matches[$this->index]];
	}
}

/**
 * Replacement array for FSS with fallback to strtr()
 * Supports lazy initialisation of FSS resource
 */
class ReplacementArray {
	private $data = false;
	private $fss = false;

	/**
	 * Create an object with the specified replacement array
	 * The array should have the same form as the replacement array for strtr()
	 * @param array $data
	 */
	function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Only the replacement pairs are serialized; the FSS resource is rebuilt
	 * on demand.
	 *
	 * @return array
	 */
	function __sleep() {
		return array( 'data' );
	}

	function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Set the whole replacement array at once
	 * @param array $data
	 */
	function setArray( $data ) {
		$this->data = $data;
		$this->fss = false;
	}

	/**
	 * @return array|bool
	 */
	function getArray() {
		return $this->data;
	}

	/**
	 * Set an element of the replacement array
	 * @param string $from
	 * @param string $to
	 */
	function setPair( $from, $to ) {
		$this->data[$from] = $to;
		$this->fss = false;
	}

	/**
	 * Add the given pairs, overriding existing pairs with the same key.
	 *
	 * @param array $data
	 */
	function mergeArray( $data ) {
		// Array union rather than array_merge(): array_merge() renumbers
		// integer-like keys, which would turn a pair such as '1' => 'x' into
		// 0 => 'x'. The new data goes first so that it wins on key collisions.
		$this->data = $data + $this->data;
		$this->fss = false;
	}

	/**
	 * Add the pairs of another ReplacementArray, overriding existing pairs
	 * with the same key.
	 *
	 * @param ReplacementArray $other
	 */
	function merge( $other ) {
		// See mergeArray() for why this is not array_merge()
		$this->data = $other->data + $this->data;
		$this->fss = false;
	}

	/**
	 * @param string $from
	 */
	function removePair( $from ) {
		unset( $this->data[$from] );
		$this->fss = false;
	}

	/**
	 * Remove every pair whose key appears as a key of $data.
	 *
	 * @param array $data
	 */
	function removeArray( $data ) {
		foreach ( array_keys( $data ) as $from ) {
			unset( $this->data[$from] );
		}
		$this->fss = false;
	}

	/**
	 * Apply the replacement pairs to $subject, through the FSS extension
	 * when it is loaded and through strtr() otherwise.
	 *
	 * @param string $subject
	 * @return string
	 */
	function replace( $subject ) {
		if ( function_exists( 'fss_prep_replace' ) ) {
			wfProfileIn( __METHOD__ . '-fss' );
			if ( $this->fss === false ) {
				// Lazily build the FSS resource; every mutator resets it to false
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
			wfProfileOut( __METHOD__ . '-fss' );
		} else {
			wfProfileIn( __METHOD__ . '-strtr' );
			$result = strtr( $subject, $this->data );
			wfProfileOut( __METHOD__ . '-strtr' );
		}

		return $result;
	}
}

/**
 * An iterator which works exactly like:
 *
 * foreach ( explode( $delim, $s ) as $element ) {
 *    ...
 * }
 *
 * Except it doesn't use 193 bytes per element
 */
class ExplodeIterator implements Iterator {
	// The subject string
	private $subject, $subjectLength;

	// The delimiter
	private $delim, $delimLength;

	// The position of the start of the line
	private $curPos;

	// The position after the end of the next delimiter
	private $endPos;

	// The current token
	private $current;

	/**
	 * Construct an ExplodeIterator
	 * @param string $delim
	 * @param string $subject
	 */
	function __construct( $delim, $subject ) {
		$this->subject = $subject;
		$this->delim = $delim;

		// Micro-optimisation (theoretical)
		$this->subjectLength = strlen( $subject );
		$this->delimLength = strlen( $delim );

		$this->rewind();
	}

	function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute the current token from the current and end positions.
	 */
	function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration has finished
			$this->current = false;
		} elseif ( $this->curPos >= $this->subjectLength ) {
			// Trailing delimiter (or empty subject): one final empty token
			$this->current = '';
		} elseif ( $this->endPos === false ) {
			// No further delimiter: the rest of the subject is the last token
			$this->current = substr( $this->subject, $this->curPos );
		} else {
			$this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
		}
	}

	function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Current position or boolean false if invalid
	 */
	function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next token.
	 *
	 * @return string|bool The new current token, or false once exhausted
	 */
	function next() {
		if ( $this->endPos === false ) {
			$this->curPos = false;
		} else {
			$this->curPos = $this->endPos + $this->delimLength;
			if ( $this->curPos >= $this->subjectLength ) {
				$this->endPos = false;
			} else {
				$this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
			}
		}
		$this->refreshCurrent();

		return $this->current;
	}

	/**
	 * @return bool
	 */
	function valid() {
		return $this->curPos !== false;
	}
}
diff --git a/includes/utils/UIDGenerator.php b/includes/utils/UIDGenerator.php new file mode 100644 index 00000000..5346afa6 --- /dev/null +++ b/includes/utils/UIDGenerator.php @@ -0,0 +1,507 @@ +<?php +/** + * This file deals with UID generation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html + * + * @file + * @author Aaron Schulz + */

/**
 * Class for getting statistically unique IDs
 *
 * @since 1.21
 */
class UIDGenerator {
	/** @var UIDGenerator */
	protected static $instance = null;

	protected $nodeIdFile; // string; local file path
	protected $nodeId32; // string; node ID in binary (32 bits)
	protected $nodeId48; // string; node ID in binary (48 bits)

	protected $lockFile88; // string; local file path
	protected $lockFile128; // string; local file path

	/** @var array */
	protected $fileHandles = array(); // cache file handles

	const QUICK_RAND = 1; // get randomness from fast and insecure sources
	const QUICK_VOLATILE = 2; // use an APC like in-memory counter if available

	protected function __construct() {
		$this->nodeIdFile = wfTempDir() . '/mw-' . __CLASS__ . '-UID-nodeid';
		$nodeId = '';
		if ( is_file( $this->nodeIdFile ) ) {
			$nodeId = file_get_contents( $this->nodeIdFile );
		}
		// Try to get some ID that uniquely identifies this machine (RFC 4122)...
		if ( !preg_match( '/^[0-9a-f]{12}$/i', $nodeId ) ) {
			wfSuppressWarnings();
			if ( wfIsWindows() ) {
				// http://technet.microsoft.com/en-us/library/bb490913.aspx
				$csv = trim( wfShellExec( 'getmac /NH /FO CSV' ) );
				$line = substr( $csv, 0, strcspn( $csv, "\n" ) );
				$info = str_getcsv( $line );
				$nodeId = isset( $info[0] ) ? str_replace( '-', '', $info[0] ) : '';
			} elseif ( is_executable( '/sbin/ifconfig' ) ) { // Linux/BSD/Solaris/OS X
				// See http://linux.die.net/man/8/ifconfig
				$m = array();
				preg_match( '/\s([0-9a-f]{2}(:[0-9a-f]{2}){5})\s/',
					wfShellExec( '/sbin/ifconfig -a' ), $m );
				$nodeId = isset( $m[1] ) ? str_replace( ':', '', $m[1] ) : '';
			}
			wfRestoreWarnings();
			if ( !preg_match( '/^[0-9a-f]{12}$/i', $nodeId ) ) {
				// No usable MAC address: fall back to a random node ID
				$nodeId = MWCryptRand::generateHex( 12, true );
				$nodeId[1] = dechex( hexdec( $nodeId[1] ) | 0x1 ); // set multicast bit
			}
			file_put_contents( $this->nodeIdFile, $nodeId ); // cache
		}
		$this->nodeId32 = wfBaseConvert( substr( sha1( $nodeId ), 0, 8 ), 16, 2, 32 );
		$this->nodeId48 = wfBaseConvert( $nodeId, 16, 2, 48 );
		// If different processes run as different users, they may have different temp dirs.
		// This is dealt with by initializing the clock sequence number and counters randomly.
		$this->lockFile88 = wfTempDir() . '/mw-' . __CLASS__ . '-UID-88';
		$this->lockFile128 = wfTempDir() . '/mw-' . __CLASS__ . '-UID-128';
	}

	/**
	 * @return UIDGenerator
	 */
	protected static function singleton() {
		if ( self::$instance === null ) {
			self::$instance = new self();
		}

		return self::$instance;
	}

	/**
	 * Get a statistically unique 88-bit unsigned integer ID string.
	 * The bits of the UID are prefixed with the time (down to the millisecond).
	 *
	 * These IDs are suitable as values for the shard key of distributed data.
	 * If a column uses these as values, it should be declared UNIQUE to handle collisions.
	 * New rows almost always have higher UIDs, which makes B-TREE updates on INSERT fast.
	 * They can also be stored "DECIMAL(27) UNSIGNED" or BINARY(11) in MySQL.
	 *
	 * UID generation is serialized on each server (as the node ID is for the whole machine).
	 *
	 * @param int $base Specifies a base other than 10
	 * @return string Number
	 * @throws MWException
	 */
	public static function newTimestampedUID88( $base = 10 ) {
		if ( !is_integer( $base ) || $base > 36 || $base < 2 ) {
			throw new MWException( "Base must be an integer between 2 and 36" );
		}
		$gen = self::singleton();
		$time = $gen->getTimestampAndDelay( 'lockFile88', 1, 1024 );

		return wfBaseConvert( $gen->getTimestampedID88( $time ), 2, $base );
	}

	/**
	 * @param array $info (UIDGenerator::millitime(), counter, clock sequence)
	 * @return string 88 bits
	 * @throws MWException
	 */
	protected function getTimestampedID88( array $info ) {
		list( $time, $counter ) = $info;
		// Take the 46 MSBs of "milliseconds since epoch"
		$id_bin = $this->millisecondsSinceEpochBinary( $time );
		// Add a 10 bit counter resulting in 56 bits total
		$id_bin .= str_pad( decbin( $counter ), 10, '0', STR_PAD_LEFT );
		// Add the 32 bit node ID resulting in 88 bits total
		$id_bin .= $this->nodeId32;
		// Sanity check: any component wider than its slot makes the string too long
		if ( strlen( $id_bin ) !== 88 ) {
			throw new MWException( "Detected overflow for millisecond timestamp." );
		}

		return $id_bin;
	}

	/**
	 * Get a statistically unique 128-bit unsigned integer ID string.
	 * The bits of the UID are prefixed with the time (down to the millisecond).
	 *
	 * These IDs are suitable as globally unique IDs, without any enforced uniqueness.
	 * New rows almost always have higher UIDs, which makes B-TREE updates on INSERT fast.
	 * They can also be stored as "DECIMAL(39) UNSIGNED" or BINARY(16) in MySQL.
	 *
	 * UID generation is serialized on each server (as the node ID is for the whole machine).
	 *
	 * @param int $base Specifies a base other than 10
	 * @return string Number
	 * @throws MWException
	 */
	public static function newTimestampedUID128( $base = 10 ) {
		if ( !is_integer( $base ) || $base > 36 || $base < 2 ) {
			throw new MWException( "Base must be an integer between 2 and 36" );
		}
		$gen = self::singleton();
		$time = $gen->getTimestampAndDelay( 'lockFile128', 16384, 1048576 );

		return wfBaseConvert( $gen->getTimestampedID128( $time ), 2, $base );
	}

	/**
	 * @param array $info (UIDGenerator::millitime(), counter, clock sequence)
	 * @return string 128 bits
	 * @throws MWException
	 */
	protected function getTimestampedID128( array $info ) {
		list( $time, $counter, $clkSeq ) = $info;
		// Take the 46 MSBs of "milliseconds since epoch"
		$id_bin = $this->millisecondsSinceEpochBinary( $time );
		// Add a 20 bit counter resulting in 66 bits total
		$id_bin .= str_pad( decbin( $counter ), 20, '0', STR_PAD_LEFT );
		// Add a 14 bit clock sequence number resulting in 80 bits total
		$id_bin .= str_pad( decbin( $clkSeq ), 14, '0', STR_PAD_LEFT );
		// Add the 48 bit node ID resulting in 128 bits total
		$id_bin .= $this->nodeId48;
		// Sanity check: any component wider than its slot makes the string too long
		if ( strlen( $id_bin ) !== 128 ) {
			throw new MWException( "Detected overflow for millisecond timestamp." );
		}

		return $id_bin;
	}

	/**
	 * Return an RFC4122 compliant v4 UUID
	 *
	 * @param int $flags Bitfield (supports UIDGenerator::QUICK_RAND)
	 * @return string
	 * @throws MWException
	 */
	public static function newUUIDv4( $flags = 0 ) {
		// 31 random hex digits; the 32nd digit of the UUID is the fixed version nibble
		$hex = ( $flags & self::QUICK_RAND )
			? wfRandomString( 31 )
			: MWCryptRand::generateHex( 31 );

		return sprintf( '%s-%s-%s-%s-%s',
			// "time_low" (32 bits)
			substr( $hex, 0, 8 ),
			// "time_mid" (16 bits)
			substr( $hex, 8, 4 ),
			// "time_hi_and_version" (16 bits)
			'4' . substr( $hex, 12, 3 ),
			// "clk_seq_hi_res" (8 bits, variant is binary 10x) and "clk_seq_low" (8 bits)
			dechex( 0x8 | ( hexdec( $hex[15] ) & 0x3 ) ) . $hex[16] . substr( $hex, 17, 2 ),
			// "node" (48 bits)
			substr( $hex, 19, 12 )
		);
	}

	/**
	 * Return an RFC4122 compliant v4 UUID
	 *
	 * @param int $flags Bitfield (supports UIDGenerator::QUICK_RAND)
	 * @return string 32 hex characters with no hyphens
	 * @throws MWException
	 */
	public static function newRawUUIDv4( $flags = 0 ) {
		return str_replace( '-', '', self::newUUIDv4( $flags ) );
	}

	/**
	 * Return an ID that is sequential *only* for this node and bucket
	 *
	 * These IDs are suitable for per-host sequence numbers, e.g. for some packet protocols.
	 * If UIDGenerator::QUICK_VOLATILE is used the counter might reset on server restart.
	 *
	 * @param string $bucket Arbitrary bucket name (should be ASCII)
	 * @param int $bits Bit size (16 to 48) of resulting numbers before wrap-around
	 * @param int $flags (supports UIDGenerator::QUICK_VOLATILE)
	 * @return float Integer value as float
	 * @since 1.23
	 */
	public static function newSequentialPerNodeID( $bucket, $bits = 48, $flags = 0 ) {
		return current( self::newSequentialPerNodeIDs( $bucket, $bits, 1, $flags ) );
	}

	/**
	 * Return IDs that are sequential *only* for this node and bucket
	 *
	 * @see UIDGenerator::newSequentialPerNodeID()
	 * @param string $bucket Arbitrary bucket name (should be ASCII)
	 * @param int $bits Bit size (16 to 48) of resulting numbers before wrap-around
	 * @param int $count Number of IDs to return (1 to 10000)
	 * @param int $flags (supports UIDGenerator::QUICK_VOLATILE)
	 * @return array Ordered list of float integer values
	 * @since 1.23
	 */
	public static function newSequentialPerNodeIDs( $bucket, $bits, $count, $flags = 0 ) {
		$gen = self::singleton();
		return $gen->getSequentialPerNodeIDs( $bucket, $bits, $count, $flags );
	}

	/**
	 * Return IDs that are sequential *only* for this node and bucket
	 *
	 * @see UIDGenerator::newSequentialPerNodeID()
	 * @param string $bucket Arbitrary bucket name (should be ASCII)
	 * @param int $bits Bit size (16 to 48) of resulting numbers before wrap-around
	 * @param int $count Number of IDs to return (1 to 10000)
	 * @param int $flags (supports UIDGenerator::QUICK_VOLATILE)
	 * @return array Ordered list of float integer values
	 * @throws MWException
	 */
	protected function getSequentialPerNodeIDs( $bucket, $bits, $count, $flags ) {
		if ( $count <= 0 ) {
			return array(); // nothing to do
		} elseif ( $count > 10000 ) {
			throw new MWException( "Number of requested IDs ($count) is too high." );
		} elseif ( $bits < 16 || $bits > 48 ) {
			throw new MWException( "Requested bit size ($bits) is out of range." );
		}

		$counter = null; // post-increment persistent counter value

		// Use APC/eAccelerator/xcache if requested, available, and not in CLI mode;
		// Counter values would not survive across script instances in CLI mode.
		$cache = null;
		if ( ( $flags & self::QUICK_VOLATILE ) && PHP_SAPI !== 'cli' ) {
			try {
				$cache = ObjectCache::newAccelerator( array() );
			} catch ( MWException $e ) {
				// not supported
			}
		}
		if ( $cache ) {
			$counter = $cache->incr( $bucket, $count );
			if ( $counter === false ) {
				if ( !$cache->add( $bucket, (int)$count ) ) {
					throw new MWException( 'Unable to set value to ' . get_class( $cache ) );
				}
				$counter = $count;
			}
		}

		// Note: use of fmod() avoids "division by zero" on 32 bit machines
		if ( $counter === null ) {
			$path = wfTempDir() . '/mw-' . __CLASS__ . '-' . rawurlencode( $bucket ) . '-48';
			// Get and acquire the UID lock file
			$handle = $this->getLockedFileHandle( $path );
			// Fetch the counter value and increment it...
			rewind( $handle );
			// fgets() gives false on a still-empty file; cast explicitly so that the
			// first use starts from 0 rather than passing a non-number to floor()
			$counter = floor( (float)trim( (string)fgets( $handle ) ) ) + $count; // fetch as float
			// Write back the new counter value (wrap-around as needed).
			// Format it explicitly: an implicit float-to-string cast switches to
			// exponent notation, and drops digits, once the value gets large.
			ftruncate( $handle, 0 );
			rewind( $handle );
			fwrite( $handle, sprintf( '%.0F', fmod( $counter, pow( 2, 48 ) ) ) );
			fflush( $handle );
			// Release the UID lock file
			flock( $handle, LOCK_UN );
		}

		$ids = array();
		$divisor = pow( 2, $bits );
		$currentId = floor( $counter - $count ); // pre-increment counter value
		for ( $i = 0; $i < $count; ++$i ) {
			$ids[] = fmod( ++$currentId, $divisor );
		}

		return $ids;
	}

	/**
	 * Open the given lock file (reusing a cached handle when there is one)
	 * and take an exclusive lock on it. The caller releases the lock with
	 * flock( $handle, LOCK_UN ); the handle itself stays cached.
	 *
	 * @param string $path Local lock file path
	 * @return resource Exclusively locked file handle
	 * @throws MWException
	 */
	protected function getLockedFileHandle( $path ) {
		if ( isset( $this->fileHandles[$path] ) ) {
			$handle = $this->fileHandles[$path];
		} else {
			$handle = fopen( $path, 'cb+' );
			$this->fileHandles[$path] = $handle ?: null; // cache
		}
		if ( $handle === false ) {
			throw new MWException( "Could not open '{$path}'." );
		} elseif ( !flock( $handle, LOCK_EX ) ) {
			fclose( $handle );
			// Drop the cache entry so that the closed handle is neither reused
			// by a later call nor closed a second time in __destruct()
			unset( $this->fileHandles[$path] );
			throw new MWException( "Could not acquire '{$path}'." );
		}

		return $handle;
	}

	/**
	 * Get a (time,counter,clock sequence) where (time,counter) is higher
	 * than any previous (time,counter) value for the given clock sequence.
	 * This is useful for making UIDs sequential on a per-node basis.
	 *
	 * @param string $lockFile Name of a local lock file
	 * @param int $clockSeqSize The number of possible clock sequence values
	 * @param int $counterSize The number of possible counter values
	 * @return array (result of UIDGenerator::millitime(), counter, clock sequence)
	 * @throws MWException
	 */
	protected function getTimestampAndDelay( $lockFile, $clockSeqSize, $counterSize ) {
		// Get and acquire the UID lock file ($lockFile names the property holding its path)
		$handle = $this->getLockedFileHandle( $this->$lockFile );
		// Get the current timestamp, clock sequence number, last time, and counter
		rewind( $handle );
		$data = explode( ' ', fgets( $handle ) ); // "<clk seq> <sec> <msec> <counter> <offset>"
		$clockChanged = false; // clock set back significantly?
		if ( count( $data ) == 5 ) { // last UID info already initialized
			$clkSeq = (int)$data[0] % $clockSeqSize;
			$prevTime = array( (int)$data[1], (int)$data[2] );
			$offset = (int)$data[4] % $counterSize; // random counter offset
			$counter = 0; // counter for UIDs with the same timestamp
			// Delay until the clock reaches the time of the last ID.
			// This detects any microtime() drift among processes.
			$time = $this->timeWaitUntil( $prevTime );
			if ( !$time ) { // too long to delay?
				$clockChanged = true; // bump clock sequence number
				$time = self::millitime();
			} elseif ( $time == $prevTime ) {
				// Bump the counter if there are timestamp collisions
				$counter = (int)$data[3] % $counterSize;
				if ( ++$counter >= $counterSize ) { // sanity (starts at 0)
					flock( $handle, LOCK_UN ); // abort
					throw new MWException( "Counter overflow for timestamp value." );
				}
			}
		} else { // last UID info not initialized
			$clkSeq = mt_rand( 0, $clockSeqSize - 1 );
			$counter = 0;
			$offset = mt_rand( 0, $counterSize - 1 );
			$time = self::millitime();
		}
		// microtime() and gettimeofday() can drift from time() at least on Windows.
		// The drift is immediate for processes running while the system clock changes.
		// time() does not have this problem. See https://bugs.php.net/bug.php?id=42659.
		if ( abs( time() - $time[0] ) >= 2 ) {
			// We don't want processes using too high or low timestamps to avoid duplicate
			// UIDs and clock sequence number churn. This process should just be restarted.
			flock( $handle, LOCK_UN ); // abort
			throw new MWException( "Process clock is outdated or drifted." );
		}
		// If microtime() is synced and a clock change was detected, then the clock went back
		if ( $clockChanged ) {
			// Bump the clock sequence number and also randomize the counter offset,
			// which is useful for UIDs that do not include the clock sequence number.
			$clkSeq = ( $clkSeq + 1 ) % $clockSeqSize;
			$offset = mt_rand( 0, $counterSize - 1 );
			trigger_error( "Clock was set back; sequence number incremented." );
		}
		// Update the (clock sequence number, timestamp, counter)
		ftruncate( $handle, 0 );
		rewind( $handle );
		fwrite( $handle, "{$clkSeq} {$time[0]} {$time[1]} {$counter} {$offset}" );
		fflush( $handle );
		// Release the UID lock file
		flock( $handle, LOCK_UN );

		return array( $time, ( $counter + $offset ) % $counterSize, $clkSeq );
	}

	/**
	 * Wait till the current timestamp reaches $time and return the current
	 * timestamp. This returns false if it would have to wait more than 10ms.
	 *
	 * @param array $time Result of UIDGenerator::millitime()
	 * @return array|bool UIDGenerator::millitime() result or false
	 */
	protected function timeWaitUntil( array $time ) {
		do {
			$ct = self::millitime();
			// Arrays of equal size are compared element by element, i.e. (sec, msec) order
			if ( $ct >= $time ) { // http://php.net/manual/en/language.operators.comparison.php
				return $ct; // current timestamp is higher than $time
			}
		} while ( ( ( $time[0] - $ct[0] ) * 1000 + ( $time[1] - $ct[1] ) ) <= 10 );

		return false;
	}

	/**
	 * @param array $time Result of UIDGenerator::millitime()
	 * @return string 46 MSBs of "milliseconds since epoch" in binary (rolls over in 4201)
	 * @throws MWException
	 */
	protected function millisecondsSinceEpochBinary( array $time ) {
		list( $sec, $msec ) = $time;
		$ts = 1000 * $sec + $msec;
		if ( $ts > pow( 2, 52 ) ) {
			throw new MWException( __METHOD__ .
				': sorry, this function doesn\'t work after the year 144680' );
		}

		return substr( wfBaseConvert( $ts, 10, 2, 46 ), -46 );
	}

	/**
	 * @return array (current time in seconds, milliseconds since then)
	 */
	protected static function millitime() {
		// microtime() returns "<fractional seconds> <whole seconds>"
		list( $msec, $sec ) = explode( ' ', microtime() );

		return array( (int)$sec, (int)( $msec * 1000 ) );
	}

	/**
	 * Delete all cache files that have been created.
	 *
	 * This is a cleanup method primarily meant to be used from unit tests to
	 * avoid polluting the local filesystem. If used outside of a unit test
	 * environment it should be used with caution as it may destroy state saved
	 * in the files.
	 *
	 * @see unitTestTearDown
	 * @since 1.23
	 */
	protected function deleteCacheFiles() {
		// Bug: 44850
		foreach ( $this->fileHandles as $path => $handle ) {
			if ( $handle !== null ) {
				fclose( $handle );
			}
			if ( is_file( $path ) ) {
				unlink( $path );
			}
			unset( $this->fileHandles[$path] );
		}
		if ( is_file( $this->nodeIdFile ) ) {
			unlink( $this->nodeIdFile );
		}
	}

	/**
	 * Cleanup resources when tearing down after a unit test.
	 *
	 * This is a cleanup method primarily meant to be used from unit tests to
	 * avoid polluting the local filesystem. If used outside of a unit test
	 * environment it should be used with caution as it may destroy state saved
	 * in the files.
	 *
	 * @see deleteCacheFiles
	 * @since 1.23
	 */
	public static function unitTestTearDown() {
		// Bug: 44850
		$gen = self::singleton();
		$gen->deleteCacheFiles();
	}

	function __destruct() {
		// array_filter() skips the null entries cached for files that failed to open
		array_map( 'fclose', array_filter( $this->fileHandles ) );
	}
}
diff --git a/includes/utils/ZipDirectoryReader.php b/includes/utils/ZipDirectoryReader.php new file mode 100644 index 00000000..bc849766 --- /dev/null +++ b/includes/utils/ZipDirectoryReader.php @@ -0,0 +1,732 @@ +<?php +/** + * ZIP file directories reader, for the purposes of upload verification. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * A class for reading ZIP file directories, for the purposes of upload + * verification. + * + * Only a functional interface is provided: ZipDirectoryReader::read(). No access is + * given to object instances. + * + */ +class ZipDirectoryReader { + /** + * Read a ZIP file and call a function for each file discovered in it. + * + * Because this class is aimed at verification, an error is raised on + * suspicious or ambiguous input, instead of emulating some standard + * behavior. + * + * @param string $fileName The archive file name + * @param array $callback The callback function. It will be called for each file + * with a single associative array each time, with members: + * + * - name: The file name. Directories conventionally have a trailing + * slash. + * + * - mtime: The file modification time, in MediaWiki 14-char format + * + * - size: The uncompressed file size + * + * @param array $options An associative array of read options, with the option + * name in the key. This may currently contain: + * + * - zip64: If this is set to true, then we will emulate a + * library with ZIP64 support, like OpenJDK 7. If it is set to + * false, then we will emulate a library with no knowledge of + * ZIP64. + * + * NOTE: The ZIP64 code is untested and probably doesn't work. It + * turned out to be easier to just reject ZIP64 archive uploads, + * since they are likely to be very rare. Confirming safety of a + * ZIP64 file is fairly complex. What do you do with a file that is + * ambiguous and broken when read with a non-ZIP64 reader, but valid + * when read with a ZIP64 reader? 
This situation is normal for a + * valid ZIP64 file, and working out what non-ZIP64 readers will make + * of such a file is not trivial. + * + * @return Status A Status object. The following fatal errors are defined: + * + * - zip-file-open-error: The file could not be opened. + * + * - zip-wrong-format: The file does not appear to be a ZIP file. + * + * - zip-bad: There was something wrong or ambiguous about the file + * data. + * + * - zip-unsupported: The ZIP file uses features which + * ZipDirectoryReader does not support. + * + * The default messages for those fatal errors are written in a way that + * makes sense for upload verification. + * + * If a fatal error is returned, more information about the error will be + * available in the debug log. + * + * Note that the callback function may be called any number of times before + * a fatal error is returned. If this occurs, the data sent to the callback + * function should be discarded. + */ + public static function read( $fileName, $callback, $options = array() ) { + $zdr = new self( $fileName, $callback, $options ); + + return $zdr->execute(); + } + + /** The file name */ + protected $fileName; + + /** The opened file resource */ + protected $file; + + /** The cached length of the file, or null if it has not been loaded yet. 
*/ + protected $fileLength; + + /** A segmented cache of the file contents */ + protected $buffer; + + /** The file data callback */ + protected $callback; + + /** The ZIP64 mode */ + protected $zip64 = false; + + /** Stored headers */ + protected $eocdr, $eocdr64, $eocdr64Locator; + + protected $data; + + /** The "extra field" ID for ZIP64 central directory entries */ + const ZIP64_EXTRA_HEADER = 0x0001; + + /** The segment size for the file contents cache */ + const SEGSIZE = 16384; + + /** The index of the "general field" bit for UTF-8 file names */ + const GENERAL_UTF8 = 11; + + /** The index of the "general field" bit for central directory encryption */ + const GENERAL_CD_ENCRYPTED = 13; + + /** + * Private constructor + * @param string $fileName + * @param callable $callback + * @param array $options + */ + protected function __construct( $fileName, $callback, $options ) { + $this->fileName = $fileName; + $this->callback = $callback; + + if ( isset( $options['zip64'] ) ) { + $this->zip64 = $options['zip64']; + } + } + + /** + * Read the directory according to settings in $this. + * + * @return Status + */ + function execute() { + $this->file = fopen( $this->fileName, 'r' ); + $this->data = array(); + if ( !$this->file ) { + return Status::newFatal( 'zip-file-open-error' ); + } + + $status = Status::newGood(); + try { + $this->readEndOfCentralDirectoryRecord(); + if ( $this->zip64 ) { + list( $offset, $size ) = $this->findZip64CentralDirectory(); + $this->readCentralDirectory( $offset, $size ); + } else { + if ( $this->eocdr['CD size'] == 0xffffffff + || $this->eocdr['CD offset'] == 0xffffffff + || $this->eocdr['CD entries total'] == 0xffff + ) { + $this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' . + 'but we are in legacy mode. Rejecting this upload is necessary to avoid ' . + 'opening vulnerabilities on clients using OpenJDK 7 or later.' 
); + } + + list( $offset, $size ) = $this->findOldCentralDirectory(); + $this->readCentralDirectory( $offset, $size ); + } + } catch ( ZipDirectoryReaderError $e ) { + $status->fatal( $e->getErrorCode() ); + } + + fclose( $this->file ); + + return $status; + } + + /** + * Throw an error, and log a debug message + * @param mixed $code + * @param string $debugMessage + */ + function error( $code, $debugMessage ) { + wfDebug( __CLASS__ . ": Fatal error: $debugMessage\n" ); + throw new ZipDirectoryReaderError( $code ); + } + + /** + * Read the header which is at the end of the central directory, + * unimaginatively called the "end of central directory record" by the ZIP + * spec. + */ + function readEndOfCentralDirectoryRecord() { + $info = array( + 'signature' => 4, + 'disk' => 2, + 'CD start disk' => 2, + 'CD entries this disk' => 2, + 'CD entries total' => 2, + 'CD size' => 4, + 'CD offset' => 4, + 'file comment length' => 2, + ); + $structSize = $this->getStructSize( $info ); + $startPos = $this->getFileLength() - 65536 - $structSize; + if ( $startPos < 0 ) { + $startPos = 0; + } + + $block = $this->getBlock( $startPos ); + $sigPos = strrpos( $block, "PK\x05\x06" ); + if ( $sigPos === false ) { + $this->error( 'zip-wrong-format', + "zip file lacks EOCDR signature. It probably isn't a zip file." 
); + } + + $this->eocdr = $this->unpack( substr( $block, $sigPos ), $info ); + $this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length']; + + if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) { + $this->error( 'zip-bad', 'trailing bytes after the end of the file comment' ); + } + if ( $this->eocdr['disk'] !== 0 + || $this->eocdr['CD start disk'] !== 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' ); + } + $this->eocdr += $this->unpack( + $block, + array( 'file comment' => array( 'string', $this->eocdr['file comment length'] ) ), + $sigPos + $structSize ); + $this->eocdr['position'] = $startPos + $sigPos; + } + + /** + * Read the header called the "ZIP64 end of central directory locator". An + * error will be raised if it does not exist. + */ + function readZip64EndOfCentralDirectoryLocator() { + $info = array( + 'signature' => array( 'string', 4 ), + 'eocdr64 start disk' => 4, + 'eocdr64 offset' => 8, + 'number of disks' => 4, + ); + $structSize = $this->getStructSize( $info ); + + $start = $this->getFileLength() - $this->eocdr['EOCDR size'] - $structSize; + $block = $this->getBlock( $start, $structSize ); + $this->eocdr64Locator = $data = $this->unpack( $block, $info ); + + if ( $data['signature'] !== "PK\x06\x07" ) { + // Note: Java will allow this and continue to read the + // EOCDR64, so we have to reject the upload, we can't + // just use the EOCDR header instead. + $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' ); + } + } + + /** + * Read the header called the "ZIP64 end of central directory record". It + * may replace the regular "end of central directory record" in ZIP64 files. 
+ */ + function readZip64EndOfCentralDirectoryRecord() { + if ( $this->eocdr64Locator['eocdr64 start disk'] != 0 + || $this->eocdr64Locator['number of disks'] != 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' ); + } + + $info = array( + 'signature' => array( 'string', 4 ), + 'EOCDR64 size' => 8, + 'version made by' => 2, + 'version needed' => 2, + 'disk' => 4, + 'CD start disk' => 4, + 'CD entries this disk' => 8, + 'CD entries total' => 8, + 'CD size' => 8, + 'CD offset' => 8 + ); + $structSize = $this->getStructSize( $info ); + $block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize ); + $this->eocdr64 = $data = $this->unpack( $block, $info ); + if ( $data['signature'] !== "PK\x06\x06" ) { + $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' ); + } + if ( $data['disk'] !== 0 + || $data['CD start disk'] !== 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' ); + } + } + + /** + * Find the location of the central directory, as would be seen by a + * non-ZIP64 reader. + * + * @return array List containing offset, size and end position. + */ + function findOldCentralDirectory() { + $size = $this->eocdr['CD size']; + $offset = $this->eocdr['CD offset']; + $endPos = $this->eocdr['position']; + + // Some readers use the EOCDR position instead of the offset field + // to find the directory, so to be safe, we check if they both agree. + if ( $offset + $size != $endPos ) { + $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' . + 'of central directory record' ); + } + + return array( $offset, $size ); + } + + /** + * Find the location of the central directory, as would be seen by a + * ZIP64-compliant reader. + * + * @return array List containing offset, size and end position. 
/**
 * Find the location of the central directory, as would be seen by a
 * ZIP64-compliant reader.
 *
 * The regular EOCDR values are used unless one of them is saturated
 * (0xffffffff for size or offset, 0xffff for the total entry count). In that
 * case the ZIP64 locator and record are read and, when present, their size
 * and offset replace the regular ones.
 *
 * @return array List containing the offset and the size of the directory.
 */
function findZip64CentralDirectory() {
    // The spec is ambiguous about the exact rules of precedence between the
    // ZIP64 headers and the original headers. Here we follow zip_util.c
    // from OpenJDK 7.
    $cdSize = $this->eocdr['CD size'];
    $cdOffset = $this->eocdr['CD offset'];
    $entryCount = $this->eocdr['CD entries total'];
    $cdEnd = $this->eocdr['position'];

    $saturated = $cdSize == 0xffffffff
        || $cdOffset == 0xffffffff
        || $entryCount == 0xffff;

    if ( $saturated ) {
        $this->readZip64EndOfCentralDirectoryLocator();

        $haveLocator = isset( $this->eocdr64Locator['eocdr64 offset'] );
        if ( $haveLocator ) {
            $this->readZip64EndOfCentralDirectoryRecord();
        }
        if ( $haveLocator && isset( $this->eocdr64['CD offset'] ) ) {
            $cdSize = $this->eocdr64['CD size'];
            $cdOffset = $this->eocdr64['CD offset'];
            $cdEnd = $this->eocdr64Locator['eocdr64 offset'];
        }
    }

    // Some readers locate the directory from the end record position rather
    // than from the offset field, so both views are required to agree.
    if ( $cdOffset + $cdSize != $cdEnd ) {
        $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
            'of central directory record' );
    }

    return array( $cdOffset, $cdSize );
}

/**
 * Read the central directory at the given location and invoke
 * $this->callback once per directory entry.
 *
 * The callback receives an array with the keys 'name' (converted from CP437
 * to UTF-8 unless the entry's UTF-8 flag is set), 'mtime' (14-digit
 * YYYYMMDDHHMMSS string) and 'size' (uncompressed size).
 *
 * @param int $offset
 * @param int $size
 */
function readCentralDirectory( $offset, $size ) {
    $block = $this->getBlock( $offset, $size );

    // Fixed-length part of every central directory entry
    $headerLayout = array(
        'signature' => array( 'string', 4 ),
        'version made by' => 2,
        'version needed' => 2,
        'general bits' => 2,
        'compression method' => 2,
        'mod time' => 2,
        'mod date' => 2,
        'crc-32' => 4,
        'compressed size' => 4,
        'uncompressed size' => 4,
        'name length' => 2,
        'extra field length' => 2,
        'comment length' => 2,
        'disk number start' => 2,
        'internal attrs' => 2,
        'external attrs' => 4,
        'local header offset' => 4,
    );
    $headerSize = $this->getStructSize( $headerLayout );

    for ( $pos = 0; $pos < $size; ) {
        $entry = $this->unpack( $block, $headerLayout, $pos );
        $pos += $headerSize;

        if ( $entry['signature'] !== "PK\x01\x02" ) {
            $this->error( 'zip-bad', 'Invalid signature found in directory entry' );
        }

        // Variable-length part, sized by the fields just read
        $tailLayout = array(
            'name' => array( 'string', $entry['name length'] ),
            'extra field' => array( 'string', $entry['extra field length'] ),
            'comment' => array( 'string', $entry['comment length'] ),
        );
        $entry += $this->unpack( $block, $tailLayout, $pos );
        $pos += $this->getStructSize( $tailLayout );

        $hasSaturatedField = $entry['compressed size'] == 0xffffffff
            || $entry['uncompressed size'] == 0xffffffff
            || $entry['local header offset'] == 0xffffffff;
        if ( $this->zip64 && $hasSaturatedField ) {
            $zip64Data = $this->unpackZip64Extra( $entry['extra field'] );
            if ( $zip64Data ) {
                // ZIP64 values take precedence over the 32-bit ones
                $entry = $zip64Data + $entry;
            }
        }

        if ( $this->testBit( $entry['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
            $this->error( 'zip-unsupported', 'central directory encryption is not supported' );
        }

        // Convert the timestamp into MediaWiki format
        // For the format, please see the MS-DOS 2.0 Programmer's Reference,
        // pages 3-5 and 3-6.
        $dosTime = $entry['mod time'];
        $dosDate = $entry['mod date'];

        // Date: bits 9 and up hold years since 1980, bits 5-8 the month,
        // bits 0-4 the day. Time: bits 11-15 hours, bits 5-10 minutes,
        // bits 0-4 seconds divided by two.
        $timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
            1980 + ( $dosDate >> 9 ),
            ( $dosDate >> 5 ) & 15,
            $dosDate & 31,
            ( $dosTime >> 11 ) & 31,
            ( $dosTime >> 5 ) & 63,
            ( $dosTime & 31 ) * 2 );

        // Convert the character set in the file name
        $name = $this->testBit( $entry['general bits'], self::GENERAL_UTF8 )
            ? $entry['name']
            : iconv( 'CP437', 'UTF-8', $entry['name'] );

        // Compile a data array for the user, with a sensible format
        call_user_func( $this->callback, array(
            'name' => $name,
            'mtime' => $timestamp,
            'size' => $entry['uncompressed size'],
        ) );
    }
}

/**
 * Interpret ZIP64 "extra field" data and return an associative array.
 *
 * The extra field is walked as a sequence of (id, size, data) records. The
 * first record whose id equals self::ZIP64_EXTRA_HEADER is unpacked and
 * returned; false is returned if no such record exists.
 *
 * NOTE(review): all four fields (28 bytes) are unpacked unconditionally, so
 * a shorter ZIP64 record makes unpack() raise 'zip-bad'. The ZIP
 * specification lets writers include only the fields whose 32-bit
 * counterparts are saturated; confirm that rejecting such entries is
 * intended.
 *
 * @param string $extraField
 * @return array|bool
 */
function unpackZip64Extra( $extraField ) {
    $recordHeader = array(
        'id' => 2,
        'size' => 2,
    );
    $recordHeaderSize = $this->getStructSize( $recordHeader );

    $zip64Layout = array(
        'uncompressed size' => 8,
        'compressed size' => 8,
        'local header offset' => 8,
        'disk number start' => 4,
    );

    $total = strlen( $extraField );
    for ( $cursor = 0; $cursor < $total; ) {
        $record = $this->unpack( $extraField, $recordHeader, $cursor );
        $cursor += $recordHeaderSize;

        $payloadLayout = array( 'data' => array( 'string', $record['size'] ) );
        $record += $this->unpack( $extraField, $payloadLayout, $cursor );
        $cursor += $record['size'];

        if ( $record['id'] == self::ZIP64_EXTRA_HEADER ) {
            return $this->unpack( $record['data'], $zip64Layout );
        }
    }

    return false;
}

/**
 * Get the length of the file. The value is taken from fstat() on the first
 * call and cached in $this->fileLength afterwards.
 *
 * @return int
 */
function getFileLength() {
    if ( $this->fileLength !== null ) {
        return $this->fileLength;
    }

    $stat = fstat( $this->file );
    $this->fileLength = $stat['size'];

    return $this->fileLength;
}
/**
 * Get the file contents from a given offset. If there are not enough bytes
 * in the file to satisfy the request, a 'zip-bad' error is raised.
 *
 * The data is assembled from the cached segments returned by getSegment()
 * and then trimmed to the requested range.
 *
 * @param int $start The byte offset of the start of the block.
 * @param int $length The number of bytes to return. If omitted, the remainder
 *   of the file will be returned.
 *
 * @return string
 */
function getBlock( $start, $length = null ) {
    $fileLength = $this->getFileLength();
    if ( $start >= $fileLength ) {
        $this->error( 'zip-bad', "getBlock() requested position $start, " .
            "file length is $fileLength" );
    }
    if ( $length === null ) {
        $length = $fileLength - $start;
    }
    $end = $start + $length;
    if ( $end > $fileLength ) {
        $this->error( 'zip-bad', "getBlock() requested end position $end, " .
            "file length is $fileLength" );
    }

    // Segments $startSeg .. $endSeg - 1 cover the byte range
    // [$startSeg * SEGSIZE, $endSeg * SEGSIZE), which already contains
    // [$start, $end) because $endSeg is rounded up. The upper bound is
    // therefore exclusive; including $endSeg itself would read and cache
    // one segment that is never used.
    $startSeg = floor( $start / self::SEGSIZE );
    $endSeg = ceil( $end / self::SEGSIZE );

    $block = '';
    for ( $segIndex = $startSeg; $segIndex < $endSeg; $segIndex++ ) {
        $block .= $this->getSegment( $segIndex );
    }

    // The cast keeps the result a string for a zero-length request, where
    // no segment is read and older PHP versions make substr() return false.
    $block = (string)substr( $block,
        $start - $startSeg * self::SEGSIZE,
        $length );

    if ( strlen( $block ) < $length ) {
        $this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
    }

    return $block;
}
/**
 * Get a section of the file starting at position $segIndex * self::SEGSIZE,
 * of length self::SEGSIZE. Results are cached in $this->buffer, keyed by
 * segment index. This is a helper function for getBlock().
 *
 * A segment that starts at or beyond the end of the file is returned (and
 * cached) as an empty string. A failed seek or read raises 'zip-bad'.
 *
 * @param int $segIndex
 *
 * @return string
 */
function getSegment( $segIndex ) {
    if ( isset( $this->buffer[$segIndex] ) ) {
        return $this->buffer[$segIndex];
    }

    $bytePos = $segIndex * self::SEGSIZE;
    if ( $bytePos >= $this->getFileLength() ) {
        $this->buffer[$segIndex] = '';

        return '';
    }

    // fseek() reports failure with a non-zero return value
    if ( fseek( $this->file, $bytePos ) != 0 ) {
        $this->error( 'zip-bad', "seek to $bytePos failed" );
    }

    $contents = fread( $this->file, self::SEGSIZE );
    if ( $contents === false ) {
        $this->error( 'zip-bad', "read from $bytePos failed" );
    }
    $this->buffer[$segIndex] = $contents;

    return $this->buffer[$segIndex];
}

/**
 * Get the size of a structure in bytes. See unpack() for the format of $struct.
 *
 * Integer members contribute their own value; array members contribute
 * their second element.
 *
 * @param array $struct
 * @return int
 */
function getStructSize( $struct ) {
    $total = 0;
    foreach ( $struct as $member ) {
        $total += is_array( $member ) ? $member[1] : $member;
    }

    return $total;
}
/**
 * Unpack a binary structure into an associative array. This is like the
 * built-in unpack() function, but driven by a field-name => type map.
 *
 * @param string $string The binary data input
 *
 * @param array $struct An associative array whose keys are the field names
 *   and whose values are the field types. A type is either an integer, in
 *   which case the field is a little-endian unsigned integer encoded in
 *   that number of bytes, or an array whose first element is a type name
 *   followed by type-dependent parameters. Only one such type is defined:
 *    - "string": The second array element gives the length of the string.
 *      Not null terminated.
 *
 * @param int $offset The offset into the string at which to start unpacking.
 *
 * @throws MWException If an array type names anything other than "string"
 * @return array Unpacked associative array. Note that large integers in the
 *   input may be represented as floating point numbers in the return value,
 *   so the use of weak comparison is advised.
 */
function unpack( $string, $struct, $offset = 0 ) {
    $needed = $this->getStructSize( $struct );
    if ( $offset + $needed > strlen( $string ) ) {
        $this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
    }

    $result = array();
    $cursor = $offset;
    foreach ( $struct as $field => $type ) {
        if ( is_array( $type ) ) {
            list( $typeName, $fieldSize ) = $type;
            // Loose comparison on purpose, matching switch/case semantics
            if ( $typeName == 'string' ) {
                $result[$field] = substr( $string, $cursor, $fieldSize );
                $cursor += $fieldSize;
                continue;
            }
            throw new MWException( __METHOD__ . ": invalid type \"$typeName\"" );
        }

        // Unsigned little-endian integer
        $byteCount = intval( $type );

        // Accumulate from the most significant byte down. PHP upgrades the
        // value to floating point by itself if it outgrows an integer.
        $number = 0;
        for ( $i = $byteCount - 1; $i >= 0; $i-- ) {
            $number = $number * 256 + ord( $string[$cursor + $i] );
        }

        // Raise an error if there was loss of precision
        if ( $number > pow( 2, 52 ) ) {
            $this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
                'This could happen if we tried to unpack a 64-bit structure ' .
                'at an invalid location.' );
        }
        $result[$field] = $number;
        $cursor += $byteCount;
    }

    return $result;
}

/**
 * Returns a bit from a given position in an integer value, converted to
 * boolean.
 *
 * @param int $value
 * @param int $bitIndex The index of the bit, where 0 is the LSB.
 * @return bool
 */
function testBit( $value, $bitIndex ) {
    $bit = ( $value >> $bitIndex ) & 1;

    return $bit === 1;
}
/**
 * Internal exception class carrying a ZipDirectoryReader error code
 * alongside the usual exception message. Will be caught by private code.
 */
class ZipDirectoryReaderError extends Exception {
    /** Error code handed to the constructor */
    protected $errorCode;

    /**
     * @param mixed $code Error code; it is also embedded in the message
     */
    function __construct( $code ) {
        parent::__construct( 'ZipDirectoryReader error: ' . $code );
        $this->errorCode = $code;
    }

    /**
     * Get the error code this exception was created with.
     *
     * @return mixed
     */
    function getErrorCode() {
        return $this->errorCode;
    }
}