diff options
Diffstat (limited to 'maintenance/backupPrefetch.inc')
-rw-r--r-- | maintenance/backupPrefetch.inc | 106 |
1 files changed, 41 insertions, 65 deletions
diff --git a/maintenance/backupPrefetch.inc b/maintenance/backupPrefetch.inc index 512af1c7..9d743137 100644 --- a/maintenance/backupPrefetch.inc +++ b/maintenance/backupPrefetch.inc @@ -1,42 +1,10 @@ <?php - -// Some smart guy removed XMLReader's global constants from PHP 5.1 -// and replaced them with class constants. Breaking source compatibility -// is SUPER awesome, and I love languages which do this constantly! -$xmlReaderConstants = array( - "NONE", - "ELEMENT", - "ATTRIBUTE", - "TEXT", - "CDATA", - "ENTITY_REF", - "ENTITY", - "PI", - "COMMENT", - "DOC", - "DOC_TYPE", - "DOC_FRAGMENT", - "NOTATION", - "WHITESPACE", - "SIGNIFICANT_WHITESPACE", - "END_ELEMENT", - "END_ENTITY", - "XML_DECLARATION", - "LOADDTD", - "DEFAULTATTRS", - "VALIDATE", - "SUBST_ENTITIES" ); -foreach( $xmlReaderConstants as $name ) { - $fullName = "XMLREADER_$name"; - $newName = "XMLReader::$name"; - if( !defined( $fullName ) ) { - if( defined( $newName ) ) { - define( $fullName, constant( $newName ) ); - } else { - // broken or missing the extension... - } - } -} +/** + * Helper class for the --prefetch option of dumpTextPass.php + * + * @file + * @ingroup Maintenance + */ /** * Readahead helper for making large MediaWiki data dumps; @@ -51,7 +19,6 @@ foreach( $xmlReaderConstants as $name ) { * - text contents are immutable and should not change once * recorded, so the previous dump is a reliable source * - * Requires PHP 5 and the XMLReader PECL extension. * @ingroup Maintenance */ class BaseDump { @@ -60,9 +27,12 @@ class BaseDump { var $atPageEnd = false; var $lastPage = 0; var $lastRev = 0; + var $infiles = null; function BaseDump( $infile ) { + $this->infiles = explode(';',$infile); $this->reader = new XMLReader(); + $infile = array_shift($this->infiles); $this->reader->open( $infile ); } @@ -71,26 +41,26 @@ class BaseDump { * from the dump stream. May return null if the page is * unavailable. * - * @param int $page ID number of page to read - * @param int $rev ID number of revision to read + * @param $page Integer: ID number of page to read + * @param $rev Integer: ID number of revision to read * @return string or null */ function prefetch( $page, $rev ) { $page = intval( $page ); $rev = intval( $rev ); - while( $this->lastPage < $page && !$this->atEnd ) { + while ( $this->lastPage < $page && !$this->atEnd ) { $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" ); $this->nextPage(); } - if( $this->lastPage > $page || $this->atEnd ) { + if ( $this->lastPage > $page || $this->atEnd ) { $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" ); return null; } - while( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) { + while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) { $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" ); $this->nextRev(); } - if( $this->lastRev == $rev && !$this->atEnd ) { + if ( $this->lastRev == $rev && !$this->atEnd ) { $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" ); return $this->nextText(); } else { @@ -101,22 +71,27 @@ class BaseDump { function debug( $str ) { wfDebug( $str . "\n" ); - //global $dumper; - //$dumper->progress( $str ); + // global $dumper; + // $dumper->progress( $str ); } /** * @access private */ function nextPage() { - if( $this->skipTo( 'page', 'mediawiki' ) ) { - if( $this->skipTo( 'id' ) ) { + if ( $this->skipTo( 'page', 'mediawiki' ) ) { + if ( $this->skipTo( 'id' ) ) { $this->lastPage = intval( $this->nodeContents() ); $this->lastRev = 0; $this->atPageEnd = false; } } else { - $this->atEnd = true; + $this->close(); + if (count($this->infiles)) { + $infile = array_shift($this->infiles); + $this->reader->open( $infile ); + $this->atEnd = false; + } } } @@ -124,8 +99,8 @@ class BaseDump { * @access private */ function nextRev() { - if( $this->skipTo( 'revision' ) ) { - if( $this->skipTo( 'id' ) ) { + if ( $this->skipTo( 'revision' ) ) { + if ( $this->skipTo( 'id' ) ) { $this->lastRev = intval( $this->nodeContents() ); } } else { @@ -144,16 +119,16 @@ class BaseDump { /** * @access private */ - function skipTo( $name, $parent='page' ) { - if( $this->atEnd ) { + function skipTo( $name, $parent = 'page' ) { + if ( $this->atEnd ) { return false; } - while( $this->reader->read() ) { - if( $this->reader->nodeType == XMLREADER_ELEMENT && + while ( $this->reader->read() ) { + if ( $this->reader->nodeType == XMLReader::ELEMENT && $this->reader->name == $name ) { return true; } - if( $this->reader->nodeType == XMLREADER_END_ELEMENT && + if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $this->reader->name == $parent ) { $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" ); return false; @@ -166,25 +141,26 @@ class BaseDump { * Shouldn't something like this be built-in to XMLReader? * Fetches text contents of the current element, assuming * no sub-elements or such scary things. - * @return string + * + * @return String * @access private */ function nodeContents() { - if( $this->atEnd ) { + if ( $this->atEnd ) { return null; } - if( $this->reader->isEmptyElement ) { + if ( $this->reader->isEmptyElement ) { return ""; } $buffer = ""; - while( $this->reader->read() ) { + while ( $this->reader->read() ) { switch( $this->reader->nodeType ) { - case XMLREADER_TEXT: -// case XMLREADER_WHITESPACE: - case XMLREADER_SIGNIFICANT_WHITESPACE: + case XMLReader::TEXT: +// case XMLReader::WHITESPACE: + case XMLReader::SIGNIFICANT_WHITESPACE: $buffer .= $this->reader->value; break; - case XMLREADER_END_ELEMENT: + case XMLReader::END_ELEMENT: return $buffer; } } |