diff options
Diffstat (limited to 'maintenance/importUseModWiki.php')
-rw-r--r-- | maintenance/importUseModWiki.php | 533 |
1 files changed, 272 insertions, 261 deletions
diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php index bff4cd02..a28d57a5 100644 --- a/maintenance/importUseModWiki.php +++ b/maintenance/importUseModWiki.php @@ -1,5 +1,4 @@ <?php - /** * Import data from a UseModWiki into a MediaWiki wiki * 2003-02-09 Brion VIBBER <brion@pobox.com> @@ -21,45 +20,69 @@ * schema changes. * 2005-03-14 * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * * @todo document * @file * @ingroup Maintenance */ -if ( php_sapi_name() != 'cli' ) { - echo "Please customize the settings and run me from the command line."; - die( -1 ); -} - -/** Set these correctly! */ -$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */ -$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki"; +require_once( "Maintenance.php" ); -/* On a large wiki, you might run out of memory */ -@ini_set( 'memory_limit', '40M' ); +class ImportUseModWiki extends Maintenance { -/* globals */ -$wgFieldSeparator = "\xb3"; # Some wikis may use different char - $FS = $wgFieldSeparator ; - $FS1 = $FS . "1" ; - $FS2 = $FS . "2" ; - $FS3 = $FS . "3" ; + private $encoding, $rootDirectory = ''; -# Unicode sanitization tools -require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' ); - -$usercache = array(); + /** + * Field separators + * @var String + */ + private $FS1, $FS2, $FS3 = ''; -importPages(); + /** + * @var Array + */ + private $usercache, $nowiki = array(); -# ------------------------------------------------------------------------------ + public function __construct() { + parent::__construct(); + $this->mDescription = "Import pages from UseMod wikis"; + $this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true ); + /** + * If UseModWiki's New File System is used: + * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS + * Use "\xb3"; for the Old File System + * Changed with UTF-8 UseModWiki + * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8 + * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated + * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A + */ + $this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true ); + $this->addArg( 'path', 'Path to your UseMod wiki' ); + } -function importPages() -{ - global $wgRootDirectory; + public function execute() { + $this->rootDirectory = $this->getArg(); + $this->encoding = $this->getOption( 'encoding', 'CP1252' ); + $sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" ); + $this->FS1 = "{$sep}1"; + $this->FS2 = "{$sep}2"; + $this->FS3 = "{$sep}3"; - $gt = '>'; - echo <<<XML + echo <<<XML <?xml version="1.0" encoding="UTF-8" ?> <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -75,290 +98,278 @@ XML; 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' ); foreach ( $letters as $letter ) { - $dir = "$wgRootDirectory/page/$letter"; + $dir = "{$this->rootDirectory}/page/$letter"; if ( is_dir( $dir ) ) - importPageDirectory( $dir ); + $this->importPageDirectory( $dir ); } echo <<<XML </mediawiki> XML; -} + } -function importPageDirectory( $dir, $prefix = "" ) -{ - echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n"; - $mydir = opendir( $dir ); - while ( $entry = readdir( $mydir ) ) { - $m = array(); - if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) { - echo importPage( $prefix . $m[1] ); - } else { - if ( is_dir( "$dir/$entry" ) ) { - if ( $entry != '.' && $entry != '..' ) { - importPageDirectory( "$dir/$entry", "$entry/" ); - } + private function importPageDirectory( $dir, $prefix = "" ) { + echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n"; + $mydir = opendir( $dir ); + while ( $entry = readdir( $mydir ) ) { + $m = array(); + if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) { + echo $this->importPage( $prefix . $m[1] ); } else { - echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n"; + if ( is_dir( "$dir/$entry" ) ) { + if ( $entry != '.' && $entry != '..' ) { + $this->importPageDirectory( "$dir/$entry", "$entry/" ); + } + } else { + echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n"; + } } } } -} - -# ------------------------------------------------------------------------------ - -/* fetch_ functions - Grab a given item from the database - */ - -function useModFilename( $title ) { - $c = substr( $title, 0, 1 ); - if ( preg_match( '/[A-Z]/i', $c ) ) { - return strtoupper( $c ) . "/$title"; - } - return "other/$title"; -} - -function fetchPage( $title ) -{ - global $FS1, $FS2, $FS3, $wgRootDirectory; - - $fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db"; - if ( !file_exists( $fname ) ) { - echo "Couldn't open file '$fname' for page '$title'.\n"; - die( -1 ); + private function useModFilename( $title ) { + $c = substr( $title, 0, 1 ); + if ( preg_match( '/[A-Z]/i', $c ) ) { + return strtoupper( $c ) . "/$title"; + } + return "other/$title"; } - $page = splitHash( $FS1, file_get_contents( $fname ) ); - $section = splitHash( $FS2, $page["text_default"] ); - $text = splitHash( $FS3, $section["data"] ); - - return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] , - "minor" => $text["minor"] , "ts" => $section["ts"] , - "username" => $section["username"] , "host" => $section["host"] ) ); -} - -function fetchKeptPages( $title ) -{ - global $FS1, $FS2, $FS3, $wgRootDirectory; - - $fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp"; - if ( !file_exists( $fname ) ) return array(); + private function fetchPage( $title ) { + $fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db"; + if ( !file_exists( $fname ) ) { + echo "Couldn't open file '$fname' for page '$title'.\n"; + die( -1 ); + } - $keptlist = explode( $FS1, file_get_contents( $fname ) ); - array_shift( $keptlist ); # Drop the junk at beginning of file + $page = $this->splitHash( $this->FS1, file_get_contents( $fname ) ); + $section = $this->splitHash( $this->FS2, $page["text_default"] ); + $text = $this->splitHash( $this->FS3, $section["data"] ); - $revisions = array(); - foreach ( $keptlist as $rev ) { - $section = splitHash( $FS2, $rev ); - $text = splitHash( $FS3, $section["data"] ); - if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) { - array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] , - "minor" => $text["minor"] , "ts" => $section["ts"] , - "username" => $section["username"] , "host" => $section["host"] ) ) ); - } else { - echo "<!-- skipped a bad old revision -->\n"; - } + return $this->array2object( array( "text" => $text["text"] , "summary" => $text["summary"] , + "minor" => $text["minor"] , "ts" => $section["ts"] , + "username" => $section["username"] , "host" => $section["host"] ) ); } - return $revisions; -} -function splitHash ( $sep , $str ) { - $temp = explode ( $sep , $str ) ; - $ret = array () ; - for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) { - $ret[$temp[$i]] = $temp[++$i] ; + private function fetchKeptPages( $title ) { + $fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp"; + if ( !file_exists( $fname ) ) return array(); + + $keptlist = explode( $this->FS1, file_get_contents( $fname ) ); + array_shift( $keptlist ); # Drop the junk at beginning of file + + $revisions = array(); + foreach ( $keptlist as $rev ) { + $section = $this->splitHash( $this->FS2, $rev ); + $text = $this->splitHash( $this->FS3, $section["data"] ); + if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) { + array_push( $revisions, $this->array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] , + "minor" => $text["minor"] , "ts" => $section["ts"] , + "username" => $section["username"] , "host" => $section["host"] ) ) ); + } else { + echo "<!-- skipped a bad old revision -->\n"; + } } - return $ret ; + return $revisions; } + private function splitHash( $sep , $str ) { + $temp = explode ( $sep , $str ) ; + $ret = array () ; + for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) { + $ret[$temp[$i]] = $temp[++$i] ; + } + return $ret ; + } -/* import_ functions - Take a fetched item and produce SQL - */ - -function checkUserCache( $name, $host ) -{ - global $usercache; - - if ( $name ) { - if ( in_array( $name, $usercache ) ) { - $userid = $usercache[$name]; + private function checkUserCache( $name, $host ) { + if ( $name ) { + if ( in_array( $name, $this->usercache ) ) { + $userid = $this->usercache[$name]; + } else { + # If we haven't imported user accounts + $userid = 0; + } + $username = str_replace( '_', ' ', $name ); } else { - # If we haven't imported user accounts $userid = 0; + $username = $host; } - $username = str_replace( '_', ' ', $name ); - } else { - $userid = 0; - $username = $host; + return array( $userid, $username ); } - return array( $userid, $username ); -} - -function importPage( $title ) -{ - echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n"; - $page = fetchPage( $title ); - - $newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) ); - $munged = mungeFormat( $page->text ); - if ( $munged != $page->text ) { - /** - * Save a *new* revision with the conversion, and put the - * previous last version into the history. - */ - $next = array2object( array( - 'text' => $munged, - 'minor' => 1, - 'username' => 'Conversion script', - 'host' => '127.0.0.1', - 'ts' => time(), - 'summary' => 'link fix', - ) ); - $revisions = array( $page, $next ); - } else { - /** - * Current revision: - */ - $revisions = array( $page ); - } - $xml = <<<XML - <page> - <title>$newtitle</title> + private function importPage( $title ) { + echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n"; + $page = $this->fetchPage( $title ); + + $newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) ); + + $munged = $this->mungeFormat( $page->text ); + if ( $munged != $page->text ) { + /** + * Save a *new* revision with the conversion, and put the + * previous last version into the history. + */ + $next = $this->array2object( array( + 'text' => $munged, + 'minor' => 1, + 'username' => 'Conversion script', + 'host' => '127.0.0.1', + 'ts' => time(), + 'summary' => 'link fix', + ) ); + $revisions = array( $page, $next ); + } else { + /** + * Current revision: + */ + $revisions = array( $page ); + } + $xml = <<<XML + <page> + <title>$newtitle</title> XML; - # History - $revisions = array_merge( $revisions, fetchKeptPages( $title ) ); - if ( count( $revisions ) == 0 ) { - return NULL; // Was "$sql", which does not appear to be defined. - } + # History + $revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) ); + if ( count( $revisions ) == 0 ) { + return NULL; // Was "$sql", which does not appear to be defined. + } - foreach ( $revisions as $rev ) { - $text = xmlsafe( recodeText( $rev->text ) ); - $minor = ( $rev->minor ? '<minor/>' : '' ); - list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host ); - $username = xmlsafe( recodeText( $username ) ); - $timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) ); - $comment = xmlsafe( recodeText( $rev->summary ) ); - - $xml .= <<<XML - <revision> - <timestamp>$timestamp</timestamp> - <contributor><username>$username</username></contributor> - $minor - <comment>$comment</comment> - <text>$text</text> - </revision> + foreach ( $revisions as $rev ) { + $text = $this->xmlsafe( $this->recodeText( $rev->text ) ); + $minor = ( $rev->minor ? '<minor/>' : '' ); + list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host ); + $username = $this->xmlsafe( $this->recodeText( $username ) ); + $timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) ); + $comment = $this->xmlsafe( $this->recodeText( $rev->summary ) ); + + $xml .= <<<XML + <revision> + <timestamp>$timestamp</timestamp> + <contributor><username>$username</username></contributor> + $minor + <comment>$comment</comment> + <text>$text</text> + </revision> XML; + } + $xml .= "</page>\n\n"; + return $xml; } - $xml .= "</page>\n\n"; - return $xml; -} -# Whee! -function recodeText( $string ) { - global $wgImportEncoding; - # For currently latin-1 wikis - $string = str_replace( "\r\n", "\n", $string ); - $string = @iconv( $wgImportEncoding, "UTF-8", $string ); - $string = wfMungeToUtf8( $string ); # Any old Ӓ stuff - return $string; -} - -function wfUtf8Sequence( $codepoint ) { - if ( $codepoint < 0x80 ) return chr( $codepoint ); - if ( $codepoint < 0x800 ) return chr( $codepoint >> 6 & 0x3f | 0xc0 ) . - chr( $codepoint & 0x3f | 0x80 ); - if ( $codepoint < 0x10000 ) return chr( $codepoint >> 12 & 0x0f | 0xe0 ) . - chr( $codepoint >> 6 & 0x3f | 0x80 ) . - chr( $codepoint & 0x3f | 0x80 ); - if ( $codepoint < 0x100000 ) return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this - chr( $codepoint >> 12 & 0x3f | 0x80 ) . - chr( $codepoint >> 6 & 0x3f | 0x80 ) . - chr( $codepoint & 0x3f | 0x80 ); - # Doesn't yet handle outside the BMP - return "&#$codepoint;"; -} + private function recodeText( $string ) { + # For currently latin-1 wikis + $string = str_replace( "\r\n", "\n", $string ); + $string = @iconv( $this->encoding, "UTF-8", $string ); + $string = $this->mungeToUtf8( $string ); # Any old Ӓ stuff + return $string; + } -function wfMungeToUtf8( $string ) { - $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string ); - $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string ); - # Should also do named entities here - return $string; -} + /** + * @todo FIXME: Don't use /e + */ + private function mungeToUtf8( $string ) { + $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string ); + $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string ); + # Should also do named entities here + return $string; + } -function timestamp2ISO8601( $ts ) { - # 2003-08-05T18:30:02Z - return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z'; -} + private function timestamp2ISO8601( $ts ) { + # 2003-08-05T18:30:02Z + return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z'; + } -function xmlsafe( $string ) { /** * The page may contain old data which has not been properly normalized. * Invalid UTF-8 sequences or forbidden control characters will make our * XML output invalid, so be sure to strip them out. + * @param String $string Text to clean up + * @return String */ - $string = UtfNormal::cleanUp( $string ); - - $string = htmlspecialchars( $string ); - return $string; -} - -function xmlCommentSafe( $text ) { - return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) ); -} + private function xmlsafe( $string ) { + $string = UtfNormal::cleanUp( $string ); + $string = htmlspecialchars( $string ); + return $string; + } + private function xmlCommentSafe( $text ) { + return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) ); + } -function array2object( $arr ) { - $o = (object)0; - foreach ( $arr as $x => $y ) { - $o->$x = $y; + private function array2object( $arr ) { + $o = (object)0; + foreach ( $arr as $x => $y ) { + $o->$x = $y; + } + return $o; } - return $o; -} + /** + * Make CamelCase and /Talk links work + */ + private function mungeFormat( $text ) { + $this->nowiki = array(); + $staged = preg_replace_callback( + '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s', + array( $this, 'nowikiPlaceholder' ), $text ); + + # This is probably not 100% correct, I'm just + # glancing at the UseModWiki code. + $upper = "[A-Z]"; + $lower = "[a-z_0-9]"; + $any = "[A-Za-z_0-9]"; + $camel = "(?:$upper+$lower+$upper+$any*)"; + $subpage = "(?:\\/$any+)"; + $substart = "(?:\\/$upper$any*)"; + + $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/", + '[[$1]]', $staged ); + + $final = preg_replace( '/' . preg_quote( $this->placeholder() ) . '/s', + array( $this, 'nowikiShift' ), $munged ); + return $final; + } -/** - * Make CamelCase and /Talk links work - */ -function mungeFormat( $text ) { - global $nowiki; - $nowiki = array(); - $staged = preg_replace_callback( - '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s', - 'nowikiPlaceholder', $text ); - - # This is probably not 100% correct, I'm just - # glancing at the UseModWiki code. - $upper = "[A-Z]"; - $lower = "[a-z_0-9]"; - $any = "[A-Za-z_0-9]"; - $camel = "(?:$upper+$lower+$upper+$any*)"; - $subpage = "(?:\\/$any+)"; - $substart = "(?:\\/$upper$any*)"; - - $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/", - '[[$1]]', $staged ); - - $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es', - 'array_shift( $nowiki )', $munged ); - return $final; -} + private function placeholder( $x = null ) { + return '\xffplaceholder\xff'; + } + public function nowikiPlaceholder( $matches ) { + $this->nowiki[] = $matches[1]; + return $this->placeholder(); + } -function placeholder( $x = null ) { - return '\xffplaceholder\xff'; + public function nowikiShift() { + return array_shift( $this->nowiki ); + } } -function nowikiPlaceholder( $matches ) { - global $nowiki; - $nowiki[] = $matches[1]; - return placeholder(); +function wfUtf8Sequence( $codepoint ) { + if ( $codepoint < 0x80 ) { + return chr( $codepoint ); + } + if ( $codepoint < 0x800 ) { + return chr( $codepoint >> 6 & 0x3f | 0xc0 ) . + chr( $codepoint & 0x3f | 0x80 ); + } + if ( $codepoint < 0x10000 ) { + return chr( $codepoint >> 12 & 0x0f | 0xe0 ) . + chr( $codepoint >> 6 & 0x3f | 0x80 ) . + chr( $codepoint & 0x3f | 0x80 ); + } + if ( $codepoint < 0x100000 ) { + return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this + chr( $codepoint >> 12 & 0x3f | 0x80 ) . + chr( $codepoint >> 6 & 0x3f | 0x80 ) . + chr( $codepoint & 0x3f | 0x80 ); + } + # Doesn't yet handle outside the BMP + return "&#$codepoint;"; } - +$maintClass = 'ImportUseModWiki'; +require_once( RUN_MAINTENANCE_IF_MAIN ); |