summaryrefslogtreecommitdiff
path: root/includes/ZipDirectoryReader.php
diff options
context:
space:
mode:
Diffstat (limited to 'includes/ZipDirectoryReader.php')
-rw-r--r--includes/ZipDirectoryReader.php684
1 files changed, 684 insertions, 0 deletions
diff --git a/includes/ZipDirectoryReader.php b/includes/ZipDirectoryReader.php
new file mode 100644
index 00000000..d21cf3b0
--- /dev/null
+++ b/includes/ZipDirectoryReader.php
@@ -0,0 +1,684 @@
+<?php
+
+/**
+ * A class for reading ZIP file directories, for the purposes of upload
+ * verification.
+ *
+ * Only a functional interface is provided: ZipFileReader::read(). No access is
+ * given to object instances.
+ *
+ */
+class ZipDirectoryReader {
+ /**
+ * Read a ZIP file and call a function for each file discovered in it.
+ *
+ * Because this class is aimed at verification, an error is raised on
+ * suspicious or ambiguous input, instead of emulating some standard
+ * behaviour.
+ *
+ * @param $fileName string The archive file name
+ * @param $callback Array The callback function. It will be called for each file
+ * with a single associative array each time, with members:
+ *
+ * - name: The file name. Directories conventionally have a trailing
+ * slash.
+ *
+ * - mtime: The file modification time, in MediaWiki 14-char format
+ *
+ * - size: The uncompressed file size
+ *
+ * @param $options Array An associative array of read options, with the option
+ * name in the key. This may currently contain:
+ *
+ * - zip64: If this is set to true, then we will emulate a
+ * library with ZIP64 support, like OpenJDK 7. If it is set to
+ * false, then we will emulate a library with no knowledge of
+ * ZIP64.
+ *
+ * NOTE: The ZIP64 code is untested and probably doesn't work. It
+ * turned out to be easier to just reject ZIP64 archive uploads,
+ * since they are likely to be very rare. Confirming safety of a
+ * ZIP64 file is fairly complex. What do you do with a file that is
+ * ambiguous and broken when read with a non-ZIP64 reader, but valid
+ * when read with a ZIP64 reader? This situation is normal for a
+ * valid ZIP64 file, and working out what non-ZIP64 readers will make
+ * of such a file is not trivial.
+ *
+ * @return Status object. The following fatal errors are defined:
+ *
+ * - zip-file-open-error: The file could not be opened.
+ *
+ * - zip-wrong-format: The file does not appear to be a ZIP file.
+ *
+ * - zip-bad: There was something wrong or ambiguous about the file
+ * data.
+ *
+ * - zip-unsupported: The ZIP file uses features which
+ * ZipDirectoryReader does not support.
+ *
+ * The default messages for those fatal errors are written in a way that
+ * makes sense for upload verification.
+ *
+ * If a fatal error is returned, more information about the error will be
+ * available in the debug log.
+ *
+ * Note that the callback function may be called any number of times before
+ * a fatal error is returned. If this occurs, the data sent to the callback
+ * function should be discarded.
+ */
+ public static function read( $fileName, $callback, $options = array() ) {
+ $zdr = new self( $fileName, $callback, $options );
+ return $zdr->execute();
+ }
+
+ /** The file name */
+ var $fileName;
+
+ /** The opened file resource */
+ var $file;
+
+ /** The cached length of the file, or null if it has not been loaded yet. */
+ var $fileLength;
+
+ /** A segmented cache of the file contents */
+ var $buffer;
+
+ /** The file data callback */
+ var $callback;
+
+ /** The ZIP64 mode */
+ var $zip64 = false;
+
+ /** Stored headers */
+ var $eocdr, $eocdr64, $eocdr64Locator;
+
+ /** The "extra field" ID for ZIP64 central directory entries */
+ const ZIP64_EXTRA_HEADER = 0x0001;
+
+ /** The segment size for the file contents cache */
+ const SEGSIZE = 16384;
+
+ /** The index of the "general field" bit for UTF-8 file names */
+ const GENERAL_UTF8 = 11;
+
+ /** The index of the "general field" bit for central directory encryption */
+ const GENERAL_CD_ENCRYPTED = 13;
+
+
+ /**
+ * Private constructor
+ */
+ protected function __construct( $fileName, $callback, $options ) {
+ $this->fileName = $fileName;
+ $this->callback = $callback;
+
+ if ( isset( $options['zip64'] ) ) {
+ $this->zip64 = $options['zip64'];
+ }
+ }
+
+ /**
+ * Read the directory according to settings in $this.
+ *
+ * @return Status
+ */
+ function execute() {
+ $this->file = fopen( $this->fileName, 'r' );
+ $this->data = array();
+ if ( !$this->file ) {
+ return Status::newFatal( 'zip-file-open-error' );
+ }
+
+ $status = Status::newGood();
+ try {
+ $this->readEndOfCentralDirectoryRecord();
+ if ( $this->zip64 ) {
+ list( $offset, $size ) = $this->findZip64CentralDirectory();
+ $this->readCentralDirectory( $offset, $size );
+ } else {
+ if ( $this->eocdr['CD size'] == 0xffffffff
+ || $this->eocdr['CD offset'] == 0xffffffff
+ || $this->eocdr['CD entries total'] == 0xffff )
+ {
+ $this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' .
+ 'but we are in legacy mode. Rejecting this upload is necessary to avoid '.
+ 'opening vulnerabilities on clients using OpenJDK 7 or later.' );
+ }
+
+ list( $offset, $size ) = $this->findOldCentralDirectory();
+ $this->readCentralDirectory( $offset, $size );
+ }
+ } catch ( ZipDirectoryReaderError $e ) {
+ $status->fatal( $e->getErrorCode() );
+ }
+
+ fclose( $this->file );
+ return $status;
+ }
+
+ /**
+ * Throw an error, and log a debug message
+ */
+ function error( $code, $debugMessage ) {
+ wfDebug( __CLASS__.": Fatal error: $debugMessage\n" );
+ throw new ZipDirectoryReaderError( $code );
+ }
+
+ /**
+ * Read the header which is at the end of the central directory,
+ * unimaginatively called the "end of central directory record" by the ZIP
+ * spec.
+ */
+ function readEndOfCentralDirectoryRecord() {
+ $info = array(
+ 'signature' => 4,
+ 'disk' => 2,
+ 'CD start disk' => 2,
+ 'CD entries this disk' => 2,
+ 'CD entries total' => 2,
+ 'CD size' => 4,
+ 'CD offset' => 4,
+ 'file comment length' => 2,
+ );
+ $structSize = $this->getStructSize( $info );
+ $startPos = $this->getFileLength() - 65536 - $structSize;
+ if ( $startPos < 0 ) {
+ $startPos = 0;
+ }
+
+ $block = $this->getBlock( $startPos );
+ $sigPos = strrpos( $block, "PK\x05\x06" );
+ if ( $sigPos === false ) {
+ $this->error( 'zip-wrong-format',
+ "zip file lacks EOCDR signature. It probably isn't a zip file." );
+ }
+
+ $this->eocdr = $this->unpack( substr( $block, $sigPos ), $info );
+ $this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length'];
+
+ if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) {
+ $this->error( 'zip-bad', 'trailing bytes after the end of the file comment' );
+ }
+ if ( $this->eocdr['disk'] !== 0
+ || $this->eocdr['CD start disk'] !== 0 )
+ {
+ $this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' );
+ }
+ $this->eocdr += $this->unpack(
+ $block,
+ array( 'file comment' => array( 'string', $this->eocdr['file comment length'] ) ),
+ $sigPos + $structSize );
+ $this->eocdr['position'] = $startPos + $sigPos;
+ }
+
+ /**
+ * Read the header called the "ZIP64 end of central directory locator". An
+ * error will be raised if it does not exist.
+ */
+ function readZip64EndOfCentralDirectoryLocator() {
+ $info = array(
+ 'signature' => array( 'string', 4 ),
+ 'eocdr64 start disk' => 4,
+ 'eocdr64 offset' => 8,
+ 'number of disks' => 4,
+ );
+ $structSize = $this->getStructSize( $info );
+
+ $block = $this->getBlock( $this->getFileLength() - $this->eocdr['EOCDR size']
+ - $structSize, $structSize );
+ $this->eocdr64Locator = $data = $this->unpack( $block, $info );
+
+ if ( $data['signature'] !== "PK\x06\x07" ) {
+ // Note: Java will allow this and continue to read the
+ // EOCDR64, so we have to reject the upload, we can't
+ // just use the EOCDR header instead.
+ $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' );
+ }
+ }
+
+ /**
+ * Read the header called the "ZIP64 end of central directory record". It
+ * may replace the regular "end of central directory record" in ZIP64 files.
+ */
+ function readZip64EndOfCentralDirectoryRecord() {
+ if ( $this->eocdr64Locator['eocdr64 start disk'] != 0
+ || $this->eocdr64Locator['number of disks'] != 0 )
+ {
+ $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' );
+ }
+
+ $info = array(
+ 'signature' => array( 'string', 4 ),
+ 'EOCDR64 size' => 8,
+ 'version made by' => 2,
+ 'version needed' => 2,
+ 'disk' => 4,
+ 'CD start disk' => 4,
+ 'CD entries this disk' => 8,
+ 'CD entries total' => 8,
+ 'CD size' => 8,
+ 'CD offset' => 8
+ );
+ $structSize = $this->getStructSize( $info );
+ $block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize );
+ $this->eocdr64 = $data = $this->unpack( $block, $info );
+ if ( $data['signature'] !== "PK\x06\x06" ) {
+ $this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' );
+ }
+ if ( $data['disk'] !== 0
+ || $data['CD start disk'] !== 0 )
+ {
+ $this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' );
+ }
+ }
+
+ /**
+ * Find the location of the central directory, as would be seen by a
+ * non-ZIP64 reader.
+ *
+ * @return List containing offset, size and end position.
+ */
+ function findOldCentralDirectory() {
+ $size = $this->eocdr['CD size'];
+ $offset = $this->eocdr['CD offset'];
+ $endPos = $this->eocdr['position'];
+
+ // Some readers use the EOCDR position instead of the offset field
+ // to find the directory, so to be safe, we check if they both agree.
+ if ( $offset + $size != $endPos ) {
+ $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
+ 'of central directory record' );
+ }
+ return array( $offset, $size );
+ }
+
+ /**
+ * Find the location of the central directory, as would be seen by a
+ * ZIP64-compliant reader.
+ *
+ * @return List containing offset, size and end position.
+ */
+ function findZip64CentralDirectory() {
+ // The spec is ambiguous about the exact rules of precedence between the
+ // ZIP64 headers and the original headers. Here we follow zip_util.c
+ // from OpenJDK 7.
+ $size = $this->eocdr['CD size'];
+ $offset = $this->eocdr['CD offset'];
+ $numEntries = $this->eocdr['CD entries total'];
+ $endPos = $this->eocdr['position'];
+ if ( $size == 0xffffffff
+ || $offset == 0xffffffff
+ || $numEntries == 0xffff )
+ {
+ $this->readZip64EndOfCentralDirectoryLocator();
+
+ if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) {
+ $this->readZip64EndOfCentralDirectoryRecord();
+ if ( isset( $this->eocdr64['CD offset'] ) ) {
+ $size = $this->eocdr64['CD size'];
+ $offset = $this->eocdr64['CD offset'];
+ $endPos = $this->eocdr64Locator['eocdr64 offset'];
+ }
+ }
+ }
+ // Some readers use the EOCDR position instead of the offset field
+ // to find the directory, so to be safe, we check if they both agree.
+ if ( $offset + $size != $endPos ) {
+ $this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
+ 'of central directory record' );
+ }
+ return array( $offset, $size );
+ }
+
+ /**
+ * Read the central directory at the given location
+ */
+ function readCentralDirectory( $offset, $size ) {
+ $block = $this->getBlock( $offset, $size );
+
+ $fixedInfo = array(
+ 'signature' => array( 'string', 4 ),
+ 'version made by' => 2,
+ 'version needed' => 2,
+ 'general bits' => 2,
+ 'compression method' => 2,
+ 'mod time' => 2,
+ 'mod date' => 2,
+ 'crc-32' => 4,
+ 'compressed size' => 4,
+ 'uncompressed size' => 4,
+ 'name length' => 2,
+ 'extra field length' => 2,
+ 'comment length' => 2,
+ 'disk number start' => 2,
+ 'internal attrs' => 2,
+ 'external attrs' => 4,
+ 'local header offset' => 4,
+ );
+ $fixedSize = $this->getStructSize( $fixedInfo );
+
+ $pos = 0;
+ while ( $pos < $size ) {
+ $data = $this->unpack( $block, $fixedInfo, $pos );
+ $pos += $fixedSize;
+
+ if ( $data['signature'] !== "PK\x01\x02" ) {
+ $this->error( 'zip-bad', 'Invalid signature found in directory entry' );
+ }
+
+ $variableInfo = array(
+ 'name' => array( 'string', $data['name length'] ),
+ 'extra field' => array( 'string', $data['extra field length'] ),
+ 'comment' => array( 'string', $data['comment length'] ),
+ );
+ $data += $this->unpack( $block, $variableInfo, $pos );
+ $pos += $this->getStructSize( $variableInfo );
+
+ if ( $this->zip64 && (
+ $data['compressed size'] == 0xffffffff
+ || $data['uncompressed size'] == 0xffffffff
+ || $data['local header offset'] == 0xffffffff ) )
+ {
+ $zip64Data = $this->unpackZip64Extra( $data['extra field'] );
+ if ( $zip64Data ) {
+ $data = $zip64Data + $data;
+ }
+ }
+
+ if ( $this->testBit( $data['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
+ $this->error( 'zip-unsupported', 'central directory encryption is not supported' );
+ }
+
+ // Convert the timestamp into MediaWiki format
+ // For the format, please see the MS-DOS 2.0 Programmer's Reference,
+ // pages 3-5 and 3-6.
+ $time = $data['mod time'];
+ $date = $data['mod date'];
+
+ $year = 1980 + ( $date >> 9 );
+ $month = ( $date >> 5 ) & 15;
+ $day = $date & 31;
+ $hour = ( $time >> 11 ) & 31;
+ $minute = ( $time >> 5 ) & 63;
+ $second = ( $time & 31 ) * 2;
+ $timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
+ $year, $month, $day, $hour, $minute, $second );
+
+ // Convert the character set in the file name
+ if ( !function_exists( 'iconv' )
+ || $this->testBit( $data['general bits'], self::GENERAL_UTF8 ) )
+ {
+ $name = $data['name'];
+ } else {
+ $name = iconv( 'CP437', 'UTF-8', $data['name'] );
+ }
+
+ // Compile a data array for the user, with a sensible format
+ $userData = array(
+ 'name' => $name,
+ 'mtime' => $timestamp,
+ 'size' => $data['uncompressed size'],
+ );
+ call_user_func( $this->callback, $userData );
+ }
+ }
+
+ /**
+ * Interpret ZIP64 "extra field" data and return an associative array.
+ */
+ function unpackZip64Extra( $extraField ) {
+ $extraHeaderInfo = array(
+ 'id' => 2,
+ 'size' => 2,
+ );
+ $extraHeaderSize = $this->getStructSize( $extraHeaderInfo );
+
+ $zip64ExtraInfo = array(
+ 'uncompressed size' => 8,
+ 'compressed size' => 8,
+ 'local header offset' => 8,
+ 'disk number start' => 4,
+ );
+
+ $extraPos = 0;
+ while ( $extraPos < strlen( $extraField ) ) {
+ $extra = $this->unpack( $extraField, $extraHeaderInfo, $extraPos );
+ $extraPos += $extraHeaderSize;
+ $extra += $this->unpack( $extraField,
+ array( 'data' => array( 'string', $extra['size'] ) ),
+ $extraPos );
+ $extraPos += $extra['size'];
+
+ if ( $extra['id'] == self::ZIP64_EXTRA_HEADER ) {
+ return $this->unpack( $extra['data'], $zip64ExtraInfo );
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Get the length of the file.
+ */
+ function getFileLength() {
+ if ( $this->fileLength === null ) {
+ $stat = fstat( $this->file );
+ $this->fileLength = $stat['size'];
+ }
+ return $this->fileLength;
+ }
+
+ /**
+ * Get the file contents from a given offset. If there are not enough bytes
+ * in the file to satisfy the request, an exception will be thrown.
+ *
+ * @param $start The byte offset of the start of the block.
+ * @param $length The number of bytes to return. If omitted, the remainder
+ * of the file will be returned.
+ *
+ * @return string
+ */
+ function getBlock( $start, $length = null ) {
+ $fileLength = $this->getFileLength();
+ if ( $start >= $fileLength ) {
+ $this->error( 'zip-bad', "getBlock() requested position $start, " .
+ "file length is $fileLength" );
+ }
+ if ( $length === null ) {
+ $length = $fileLength - $start;
+ }
+ $end = $start + $length;
+ if ( $end > $fileLength ) {
+ $this->error( 'zip-bad', "getBlock() requested end position $end, " .
+ "file length is $fileLength" );
+ }
+ $startSeg = floor( $start / self::SEGSIZE );
+ $endSeg = ceil( $end / self::SEGSIZE );
+
+ $block = '';
+ for ( $segIndex = $startSeg; $segIndex <= $endSeg; $segIndex++ ) {
+ $block .= $this->getSegment( $segIndex );
+ }
+
+ $block = substr( $block,
+ $start - $startSeg * self::SEGSIZE,
+ $length );
+
+ if ( strlen( $block ) < $length ) {
+ $this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
+ }
+
+ return $block;
+ }
+
+ /**
+ * Get a section of the file starting at position $segIndex * self::SEGSIZE,
+ * of length self::SEGSIZE. The result is cached. This is a helper function
+ * for getBlock().
+ *
+ * If there are not enough bytes in the file to satsify the request, the
+ * return value will be truncated. If a request is made for a segment beyond
+ * the end of the file, an empty string will be returned.
+ */
+ function getSegment( $segIndex ) {
+ if ( !isset( $this->buffer[$segIndex] ) ) {
+ $bytePos = $segIndex * self::SEGSIZE;
+ if ( $bytePos >= $this->getFileLength() ) {
+ $this->buffer[$segIndex] = '';
+ return '';
+ }
+ if ( fseek( $this->file, $bytePos ) ) {
+ $this->error( 'zip-bad', "seek to $bytePos failed" );
+ }
+ $seg = fread( $this->file, self::SEGSIZE );
+ if ( $seg === false ) {
+ $this->error( 'zip-bad', "read from $bytePos failed" );
+ }
+ $this->buffer[$segIndex] = $seg;
+ }
+ return $this->buffer[$segIndex];
+ }
+
+ /**
+ * Get the size of a structure in bytes. See unpack() for the format of $struct.
+ */
+ function getStructSize( $struct ) {
+ $size = 0;
+ foreach ( $struct as $type ) {
+ if ( is_array( $type ) ) {
+ list( $typeName, $fieldSize ) = $type;
+ $size += $fieldSize;
+ } else {
+ $size += $type;
+ }
+ }
+ return $size;
+ }
+
+ /**
+ * Unpack a binary structure. This is like the built-in unpack() function
+ * except nicer.
+ *
+ * @param $string The binary data input
+ *
+ * @param $struct An associative array giving structure members and their
+ * types. In the key is the field name. The value may be either an
+ * integer, in which case the field is a little-endian unsigned integer
+ * encoded in the given number of bytes, or an array, in which case the
+ * first element of the array is the type name, and the subsequent
+ * elements are type-dependent parameters. Only one such type is defined:
+ * - "string": The second array element gives the length of string.
+ * Not null terminated.
+ *
+ * @param $offset The offset into the string at which to start unpacking.
+ *
+ * @return Unpacked associative array. Note that large integers in the input
+ * may be represented as floating point numbers in the return value, so
+ * the use of weak comparison is advised.
+ */
+ function unpack( $string, $struct, $offset = 0 ) {
+ $size = $this->getStructSize( $struct );
+ if ( $offset + $size > strlen( $string ) ) {
+ $this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
+ }
+
+ $data = array();
+ $pos = $offset;
+ foreach ( $struct as $key => $type ) {
+ if ( is_array( $type ) ) {
+ list( $typeName, $fieldSize ) = $type;
+ switch ( $typeName ) {
+ case 'string':
+ $data[$key] = substr( $string, $pos, $fieldSize );
+ $pos += $fieldSize;
+ break;
+ default:
+ throw new MWException( __METHOD__.": invalid type \"$typeName\"" );
+ }
+ } else {
+ // Unsigned little-endian integer
+ $length = intval( $type );
+ $bytes = substr( $string, $pos, $length );
+
+ // Calculate the value. Use an algorithm which automatically
+ // upgrades the value to floating point if necessary.
+ $value = 0;
+ for ( $i = $length - 1; $i >= 0; $i-- ) {
+ $value *= 256;
+ $value += ord( $string[$pos + $i] );
+ }
+
+ // Throw an exception if there was loss of precision
+ if ( $value > pow( 2, 52 ) ) {
+ $this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
+ 'This could happen if we tried to unpack a 64-bit structure ' .
+ 'at an invalid location.' );
+ }
+ $data[$key] = $value;
+ $pos += $length;
+ }
+ }
+
+ return $data;
+ }
+
+ /**
+ * Returns a bit from a given position in an integer value, converted to
+ * boolean.
+ *
+ * @param $value integer
+ * @param $bitIndex The index of the bit, where 0 is the LSB.
+ */
+ function testBit( $value, $bitIndex ) {
+ return (bool)( ( $value >> $bitIndex ) & 1 );
+ }
+
+ /**
+ * Debugging helper function which dumps a string in hexdump -C format.
+ */
+ function hexDump( $s ) {
+ $n = strlen( $s );
+ for ( $i = 0; $i < $n; $i += 16 ) {
+ printf( "%08X ", $i );
+ for ( $j = 0; $j < 16; $j++ ) {
+ print " ";
+ if ( $j == 8 ) {
+ print " ";
+ }
+ if ( $i + $j >= $n ) {
+ print " ";
+ } else {
+ printf( "%02X", ord( $s[$i + $j] ) );
+ }
+ }
+
+ print " |";
+ for ( $j = 0; $j < 16; $j++ ) {
+ if ( $i + $j >= $n ) {
+ print " ";
+ } elseif ( ctype_print( $s[$i + $j] ) ) {
+ print $s[$i + $j];
+ } else {
+ print '.';
+ }
+ }
+ print "|\n";
+ }
+ }
+}
+
+/**
+ * Internal exception class. Will be caught by private code.
+ */
+class ZipDirectoryReaderError extends Exception {
+ var $code;
+
+ function __construct( $code ) {
+ $this->code = $code;
+ parent::__construct( "ZipDirectoryReader error: $code" );
+ }
+
+ function getErrorCode() {
+ return $this->code;
+ }
+}