diff options
Diffstat (limited to 'includes/utils/ZipDirectoryReader.php')
-rw-r--r-- | includes/utils/ZipDirectoryReader.php | 732 |
1 files changed, 732 insertions, 0 deletions
diff --git a/includes/utils/ZipDirectoryReader.php b/includes/utils/ZipDirectoryReader.php new file mode 100644 index 00000000..bc849766 --- /dev/null +++ b/includes/utils/ZipDirectoryReader.php @@ -0,0 +1,732 @@ +<?php +/** + * ZIP file directories reader, for the purposes of upload verification. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * A class for reading ZIP file directories, for the purposes of upload + * verification. + * + * Only a functional interface is provided: ZipFileReader::read(). No access is + * given to object instances. + * + */ +class ZipDirectoryReader { + /** + * Read a ZIP file and call a function for each file discovered in it. + * + * Because this class is aimed at verification, an error is raised on + * suspicious or ambiguous input, instead of emulating some standard + * behavior. + * + * @param string $fileName The archive file name + * @param array $callback The callback function. It will be called for each file + * with a single associative array each time, with members: + * + * - name: The file name. Directories conventionally have a trailing + * slash. + * + * - mtime: The file modification time, in MediaWiki 14-char format + * + * - size: The uncompressed file size + * + * @param array $options An associative array of read options, with the option + * name in the key. This may currently contain: + * + * - zip64: If this is set to true, then we will emulate a + * library with ZIP64 support, like OpenJDK 7. If it is set to + * false, then we will emulate a library with no knowledge of + * ZIP64. + * + * NOTE: The ZIP64 code is untested and probably doesn't work. It + * turned out to be easier to just reject ZIP64 archive uploads, + * since they are likely to be very rare. Confirming safety of a + * ZIP64 file is fairly complex. What do you do with a file that is + * ambiguous and broken when read with a non-ZIP64 reader, but valid + * when read with a ZIP64 reader? This situation is normal for a + * valid ZIP64 file, and working out what non-ZIP64 readers will make + * of such a file is not trivial. + * + * @return Status A Status object. The following fatal errors are defined: + * + * - zip-file-open-error: The file could not be opened. + * + * - zip-wrong-format: The file does not appear to be a ZIP file. + * + * - zip-bad: There was something wrong or ambiguous about the file + * data. + * + * - zip-unsupported: The ZIP file uses features which + * ZipDirectoryReader does not support. + * + * The default messages for those fatal errors are written in a way that + * makes sense for upload verification. + * + * If a fatal error is returned, more information about the error will be + * available in the debug log. + * + * Note that the callback function may be called any number of times before + * a fatal error is returned. If this occurs, the data sent to the callback + * function should be discarded. + */ + public static function read( $fileName, $callback, $options = array() ) { + $zdr = new self( $fileName, $callback, $options ); + + return $zdr->execute(); + } + + /** The file name */ + protected $fileName; + + /** The opened file resource */ + protected $file; + + /** The cached length of the file, or null if it has not been loaded yet. */ + protected $fileLength; + + /** A segmented cache of the file contents */ + protected $buffer; + + /** The file data callback */ + protected $callback; + + /** The ZIP64 mode */ + protected $zip64 = false; + + /** Stored headers */ + protected $eocdr, $eocdr64, $eocdr64Locator; + + protected $data; + + /** The "extra field" ID for ZIP64 central directory entries */ + const ZIP64_EXTRA_HEADER = 0x0001; + + /** The segment size for the file contents cache */ + const SEGSIZE = 16384; + + /** The index of the "general field" bit for UTF-8 file names */ + const GENERAL_UTF8 = 11; + + /** The index of the "general field" bit for central directory encryption */ + const GENERAL_CD_ENCRYPTED = 13; + + /** + * Private constructor + * @param string $fileName + * @param callable $callback + * @param array $options + */ + protected function __construct( $fileName, $callback, $options ) { + $this->fileName = $fileName; + $this->callback = $callback; + + if ( isset( $options['zip64'] ) ) { + $this->zip64 = $options['zip64']; + } + } + + /** + * Read the directory according to settings in $this. + * + * @return Status + */ + function execute() { + $this->file = fopen( $this->fileName, 'r' ); + $this->data = array(); + if ( !$this->file ) { + return Status::newFatal( 'zip-file-open-error' ); + } + + $status = Status::newGood(); + try { + $this->readEndOfCentralDirectoryRecord(); + if ( $this->zip64 ) { + list( $offset, $size ) = $this->findZip64CentralDirectory(); + $this->readCentralDirectory( $offset, $size ); + } else { + if ( $this->eocdr['CD size'] == 0xffffffff + || $this->eocdr['CD offset'] == 0xffffffff + || $this->eocdr['CD entries total'] == 0xffff + ) { + $this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' . + 'but we are in legacy mode. Rejecting this upload is necessary to avoid ' . + 'opening vulnerabilities on clients using OpenJDK 7 or later.' ); + } + + list( $offset, $size ) = $this->findOldCentralDirectory(); + $this->readCentralDirectory( $offset, $size ); + } + } catch ( ZipDirectoryReaderError $e ) { + $status->fatal( $e->getErrorCode() ); + } + + fclose( $this->file ); + + return $status; + } + + /** + * Throw an error, and log a debug message + * @param mixed $code + * @param string $debugMessage + */ + function error( $code, $debugMessage ) { + wfDebug( __CLASS__ . ": Fatal error: $debugMessage\n" ); + throw new ZipDirectoryReaderError( $code ); + } + + /** + * Read the header which is at the end of the central directory, + * unimaginatively called the "end of central directory record" by the ZIP + * spec. + */ + function readEndOfCentralDirectoryRecord() { + $info = array( + 'signature' => 4, + 'disk' => 2, + 'CD start disk' => 2, + 'CD entries this disk' => 2, + 'CD entries total' => 2, + 'CD size' => 4, + 'CD offset' => 4, + 'file comment length' => 2, + ); + $structSize = $this->getStructSize( $info ); + $startPos = $this->getFileLength() - 65536 - $structSize; + if ( $startPos < 0 ) { + $startPos = 0; + } + + $block = $this->getBlock( $startPos ); + $sigPos = strrpos( $block, "PK\x05\x06" ); + if ( $sigPos === false ) { + $this->error( 'zip-wrong-format', + "zip file lacks EOCDR signature. It probably isn't a zip file." ); + } + + $this->eocdr = $this->unpack( substr( $block, $sigPos ), $info ); + $this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length']; + + if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) { + $this->error( 'zip-bad', 'trailing bytes after the end of the file comment' ); + } + if ( $this->eocdr['disk'] !== 0 + || $this->eocdr['CD start disk'] !== 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' ); + } + $this->eocdr += $this->unpack( + $block, + array( 'file comment' => array( 'string', $this->eocdr['file comment length'] ) ), + $sigPos + $structSize ); + $this->eocdr['position'] = $startPos + $sigPos; + } + + /** + * Read the header called the "ZIP64 end of central directory locator". An + * error will be raised if it does not exist. + */ + function readZip64EndOfCentralDirectoryLocator() { + $info = array( + 'signature' => array( 'string', 4 ), + 'eocdr64 start disk' => 4, + 'eocdr64 offset' => 8, + 'number of disks' => 4, + ); + $structSize = $this->getStructSize( $info ); + + $start = $this->getFileLength() - $this->eocdr['EOCDR size'] - $structSize; + $block = $this->getBlock( $start, $structSize ); + $this->eocdr64Locator = $data = $this->unpack( $block, $info ); + + if ( $data['signature'] !== "PK\x06\x07" ) { + // Note: Java will allow this and continue to read the + // EOCDR64, so we have to reject the upload, we can't + // just use the EOCDR header instead. + $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' ); + } + } + + /** + * Read the header called the "ZIP64 end of central directory record". It + * may replace the regular "end of central directory record" in ZIP64 files. + */ + function readZip64EndOfCentralDirectoryRecord() { + if ( $this->eocdr64Locator['eocdr64 start disk'] != 0 + || $this->eocdr64Locator['number of disks'] != 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' ); + } + + $info = array( + 'signature' => array( 'string', 4 ), + 'EOCDR64 size' => 8, + 'version made by' => 2, + 'version needed' => 2, + 'disk' => 4, + 'CD start disk' => 4, + 'CD entries this disk' => 8, + 'CD entries total' => 8, + 'CD size' => 8, + 'CD offset' => 8 + ); + $structSize = $this->getStructSize( $info ); + $block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize ); + $this->eocdr64 = $data = $this->unpack( $block, $info ); + if ( $data['signature'] !== "PK\x06\x06" ) { + $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' ); + } + if ( $data['disk'] !== 0 + || $data['CD start disk'] !== 0 + ) { + $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' ); + } + } + + /** + * Find the location of the central directory, as would be seen by a + * non-ZIP64 reader. + * + * @return array List containing offset, size and end position. + */ + function findOldCentralDirectory() { + $size = $this->eocdr['CD size']; + $offset = $this->eocdr['CD offset']; + $endPos = $this->eocdr['position']; + + // Some readers use the EOCDR position instead of the offset field + // to find the directory, so to be safe, we check if they both agree. + if ( $offset + $size != $endPos ) { + $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' . + 'of central directory record' ); + } + + return array( $offset, $size ); + } + + /** + * Find the location of the central directory, as would be seen by a + * ZIP64-compliant reader. + * + * @return array List containing offset, size and end position. + */ + function findZip64CentralDirectory() { + // The spec is ambiguous about the exact rules of precedence between the + // ZIP64 headers and the original headers. Here we follow zip_util.c + // from OpenJDK 7. + $size = $this->eocdr['CD size']; + $offset = $this->eocdr['CD offset']; + $numEntries = $this->eocdr['CD entries total']; + $endPos = $this->eocdr['position']; + if ( $size == 0xffffffff + || $offset == 0xffffffff + || $numEntries == 0xffff + ) { + $this->readZip64EndOfCentralDirectoryLocator(); + + if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) { + $this->readZip64EndOfCentralDirectoryRecord(); + if ( isset( $this->eocdr64['CD offset'] ) ) { + $size = $this->eocdr64['CD size']; + $offset = $this->eocdr64['CD offset']; + $endPos = $this->eocdr64Locator['eocdr64 offset']; + } + } + } + // Some readers use the EOCDR position instead of the offset field + // to find the directory, so to be safe, we check if they both agree. + if ( $offset + $size != $endPos ) { + $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' . + 'of central directory record' ); + } + + return array( $offset, $size ); + } + + /** + * Read the central directory at the given location + * @param int $offset + * @param int $size + */ + function readCentralDirectory( $offset, $size ) { + $block = $this->getBlock( $offset, $size ); + + $fixedInfo = array( + 'signature' => array( 'string', 4 ), + 'version made by' => 2, + 'version needed' => 2, + 'general bits' => 2, + 'compression method' => 2, + 'mod time' => 2, + 'mod date' => 2, + 'crc-32' => 4, + 'compressed size' => 4, + 'uncompressed size' => 4, + 'name length' => 2, + 'extra field length' => 2, + 'comment length' => 2, + 'disk number start' => 2, + 'internal attrs' => 2, + 'external attrs' => 4, + 'local header offset' => 4, + ); + $fixedSize = $this->getStructSize( $fixedInfo ); + + $pos = 0; + while ( $pos < $size ) { + $data = $this->unpack( $block, $fixedInfo, $pos ); + $pos += $fixedSize; + + if ( $data['signature'] !== "PK\x01\x02" ) { + $this->error( 'zip-bad', 'Invalid signature found in directory entry' ); + } + + $variableInfo = array( + 'name' => array( 'string', $data['name length'] ), + 'extra field' => array( 'string', $data['extra field length'] ), + 'comment' => array( 'string', $data['comment length'] ), + ); + $data += $this->unpack( $block, $variableInfo, $pos ); + $pos += $this->getStructSize( $variableInfo ); + + if ( $this->zip64 && ( + $data['compressed size'] == 0xffffffff + || $data['uncompressed size'] == 0xffffffff + || $data['local header offset'] == 0xffffffff ) + ) { + $zip64Data = $this->unpackZip64Extra( $data['extra field'] ); + if ( $zip64Data ) { + $data = $zip64Data + $data; + } + } + + if ( $this->testBit( $data['general bits'], self::GENERAL_CD_ENCRYPTED ) ) { + $this->error( 'zip-unsupported', 'central directory encryption is not supported' ); + } + + // Convert the timestamp into MediaWiki format + // For the format, please see the MS-DOS 2.0 Programmer's Reference, + // pages 3-5 and 3-6. + $time = $data['mod time']; + $date = $data['mod date']; + + $year = 1980 + ( $date >> 9 ); + $month = ( $date >> 5 ) & 15; + $day = $date & 31; + $hour = ( $time >> 11 ) & 31; + $minute = ( $time >> 5 ) & 63; + $second = ( $time & 31 ) * 2; + $timestamp = sprintf( "%04d%02d%02d%02d%02d%02d", + $year, $month, $day, $hour, $minute, $second ); + + // Convert the character set in the file name + if ( $this->testBit( $data['general bits'], self::GENERAL_UTF8 ) ) { + $name = $data['name']; + } else { + $name = iconv( 'CP437', 'UTF-8', $data['name'] ); + } + + // Compile a data array for the user, with a sensible format + $userData = array( + 'name' => $name, + 'mtime' => $timestamp, + 'size' => $data['uncompressed size'], + ); + call_user_func( $this->callback, $userData ); + } + } + + /** + * Interpret ZIP64 "extra field" data and return an associative array. + * @param string $extraField + * @return array|bool + */ + function unpackZip64Extra( $extraField ) { + $extraHeaderInfo = array( + 'id' => 2, + 'size' => 2, + ); + $extraHeaderSize = $this->getStructSize( $extraHeaderInfo ); + + $zip64ExtraInfo = array( + 'uncompressed size' => 8, + 'compressed size' => 8, + 'local header offset' => 8, + 'disk number start' => 4, + ); + + $extraPos = 0; + while ( $extraPos < strlen( $extraField ) ) { + $extra = $this->unpack( $extraField, $extraHeaderInfo, $extraPos ); + $extraPos += $extraHeaderSize; + $extra += $this->unpack( $extraField, + array( 'data' => array( 'string', $extra['size'] ) ), + $extraPos ); + $extraPos += $extra['size']; + + if ( $extra['id'] == self::ZIP64_EXTRA_HEADER ) { + return $this->unpack( $extra['data'], $zip64ExtraInfo ); + } + } + + return false; + } + + /** + * Get the length of the file. + * @return int + */ + function getFileLength() { + if ( $this->fileLength === null ) { + $stat = fstat( $this->file ); + $this->fileLength = $stat['size']; + } + + return $this->fileLength; + } + + /** + * Get the file contents from a given offset. If there are not enough bytes + * in the file to satisfy the request, an exception will be thrown. + * + * @param int $start The byte offset of the start of the block. + * @param int $length The number of bytes to return. If omitted, the remainder + * of the file will be returned. + * + * @return string + */ + function getBlock( $start, $length = null ) { + $fileLength = $this->getFileLength(); + if ( $start >= $fileLength ) { + $this->error( 'zip-bad', "getBlock() requested position $start, " . + "file length is $fileLength" ); + } + if ( $length === null ) { + $length = $fileLength - $start; + } + $end = $start + $length; + if ( $end > $fileLength ) { + $this->error( 'zip-bad', "getBlock() requested end position $end, " . + "file length is $fileLength" ); + } + $startSeg = floor( $start / self::SEGSIZE ); + $endSeg = ceil( $end / self::SEGSIZE ); + + $block = ''; + for ( $segIndex = $startSeg; $segIndex <= $endSeg; $segIndex++ ) { + $block .= $this->getSegment( $segIndex ); + } + + $block = substr( $block, + $start - $startSeg * self::SEGSIZE, + $length ); + + if ( strlen( $block ) < $length ) { + $this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' ); + } + + return $block; + } + + /** + * Get a section of the file starting at position $segIndex * self::SEGSIZE, + * of length self::SEGSIZE. The result is cached. This is a helper function + * for getBlock(). + * + * If there are not enough bytes in the file to satisfy the request, the + * return value will be truncated. If a request is made for a segment beyond + * the end of the file, an empty string will be returned. + * + * @param int $segIndex + * + * @return string + */ + function getSegment( $segIndex ) { + if ( !isset( $this->buffer[$segIndex] ) ) { + $bytePos = $segIndex * self::SEGSIZE; + if ( $bytePos >= $this->getFileLength() ) { + $this->buffer[$segIndex] = ''; + + return ''; + } + if ( fseek( $this->file, $bytePos ) ) { + $this->error( 'zip-bad', "seek to $bytePos failed" ); + } + $seg = fread( $this->file, self::SEGSIZE ); + if ( $seg === false ) { + $this->error( 'zip-bad', "read from $bytePos failed" ); + } + $this->buffer[$segIndex] = $seg; + } + + return $this->buffer[$segIndex]; + } + + /** + * Get the size of a structure in bytes. See unpack() for the format of $struct. + * @param array $struct + * @return int + */ + function getStructSize( $struct ) { + $size = 0; + foreach ( $struct as $type ) { + if ( is_array( $type ) ) { + list( , $fieldSize ) = $type; + $size += $fieldSize; + } else { + $size += $type; + } + } + + return $size; + } + + /** + * Unpack a binary structure. This is like the built-in unpack() function + * except nicer. + * + * @param string $string The binary data input + * + * @param array $struct An associative array giving structure members and their + * types. In the key is the field name. The value may be either an + * integer, in which case the field is a little-endian unsigned integer + * encoded in the given number of bytes, or an array, in which case the + * first element of the array is the type name, and the subsequent + * elements are type-dependent parameters. Only one such type is defined: + * - "string": The second array element gives the length of string. + * Not null terminated. + * + * @param int $offset The offset into the string at which to start unpacking. + * + * @throws MWException + * @return array Unpacked associative array. Note that large integers in the input + * may be represented as floating point numbers in the return value, so + * the use of weak comparison is advised. + */ + function unpack( $string, $struct, $offset = 0 ) { + $size = $this->getStructSize( $struct ); + if ( $offset + $size > strlen( $string ) ) { + $this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' ); + } + + $data = array(); + $pos = $offset; + foreach ( $struct as $key => $type ) { + if ( is_array( $type ) ) { + list( $typeName, $fieldSize ) = $type; + switch ( $typeName ) { + case 'string': + $data[$key] = substr( $string, $pos, $fieldSize ); + $pos += $fieldSize; + break; + default: + throw new MWException( __METHOD__ . ": invalid type \"$typeName\"" ); + } + } else { + // Unsigned little-endian integer + $length = intval( $type ); + + // Calculate the value. Use an algorithm which automatically + // upgrades the value to floating point if necessary. + $value = 0; + for ( $i = $length - 1; $i >= 0; $i-- ) { + $value *= 256; + $value += ord( $string[$pos + $i] ); + } + + // Throw an exception if there was loss of precision + if ( $value > pow( 2, 52 ) ) { + $this->error( 'zip-unsupported', 'number too large to be stored in a double. ' . + 'This could happen if we tried to unpack a 64-bit structure ' . + 'at an invalid location.' ); + } + $data[$key] = $value; + $pos += $length; + } + } + + return $data; + } + + /** + * Returns a bit from a given position in an integer value, converted to + * boolean. + * + * @param int $value + * @param int $bitIndex The index of the bit, where 0 is the LSB. + * @return bool + */ + function testBit( $value, $bitIndex ) { + return (bool)( ( $value >> $bitIndex ) & 1 ); + } + + /** + * Debugging helper function which dumps a string in hexdump -C format. + * @param string $s + */ + function hexDump( $s ) { + $n = strlen( $s ); + for ( $i = 0; $i < $n; $i += 16 ) { + printf( "%08X ", $i ); + for ( $j = 0; $j < 16; $j++ ) { + print " "; + if ( $j == 8 ) { + print " "; + } + if ( $i + $j >= $n ) { + print " "; + } else { + printf( "%02X", ord( $s[$i + $j] ) ); + } + } + + print " |"; + for ( $j = 0; $j < 16; $j++ ) { + if ( $i + $j >= $n ) { + print " "; + } elseif ( ctype_print( $s[$i + $j] ) ) { + print $s[$i + $j]; + } else { + print '.'; + } + } + print "|\n"; + } + } +} + +/** + * Internal exception class. Will be caught by private code. + */ +class ZipDirectoryReaderError extends Exception { + protected $errorCode; + + function __construct( $code ) { + $this->errorCode = $code; + parent::__construct( "ZipDirectoryReader error: $code" ); + } + + /** + * @return mixed + */ + function getErrorCode() { + return $this->errorCode; + } +} |