diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2006-10-11 18:12:39 +0000 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2006-10-11 18:12:39 +0000 |
commit | 183851b06bd6c52f3cae5375f433da720d410447 (patch) | |
tree | a477257decbf3360127f6739c2f9d0ec57a03d39 /maintenance/importUseModWiki.php |
MediaWiki 1.7.1 wiederhergestellt
Diffstat (limited to 'maintenance/importUseModWiki.php')
-rw-r--r-- | maintenance/importUseModWiki.php | 365 |
1 files changed, 365 insertions, 0 deletions
<?php

/**
 * Import data from a UseModWiki into a MediaWiki wiki
 * 2003-02-09 Brion VIBBER <brion@pobox.com>
 * Based loosely on Magnus's code from 2001-2002
 *
 * Updated limited version to get something working temporarily
 * 2003-10-09
 * Be sure to run the link & index rebuilding scripts!
 *
 * Some more munging for charsets etc
 * 2003-11-28
 *
 * Partial fix for pages starting with lowercase letters (??)
 * and CamelCase and /Subpage link conversion
 * 2004-11-17
 *
 * Rewrite output to create Special:Export format for import
 * instead of raw SQL. Should be 'future-proof' against future
 * schema changes.
 * 2005-03-14
 *
 * The script writes a Special:Export style XML document to standard
 * output; progress notes are emitted as XML comments inside that stream.
 *
 * @package MediaWiki
 * @subpackage Maintenance
 */

if( php_sapi_name() != 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	die( -1 );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator;
$FS1 = $FS . "1"; # separates the top-level fields of a page/keep file
$FS2 = $FS . "2"; # separates the fields of one section
$FS3 = $FS . "3"; # separates the fields of a section's "data" entry

# Unicode sanitization tools.
# Anchored to this script's directory so the import also works when it is
# started from a different working directory.
require_once( dirname( __FILE__ ) . '/../includes/normal/UtfNormal.php' );

# Map of user name => user id, read by checkUserCache().
# Nothing in this script fills it, so every contributor gets id 0.
$usercache = array();

importPages();

# ------------------------------------------------------------------------------

/**
 * Entry point: print the XML envelope and import every page directory
 * ("A" .. "Z" and "other") found below $wgRootDirectory/page.
 */
function importPages()
{
	global $wgRootDirectory;

	# "?>" is spelled via a variable so that the closing PHP tag never
	# appears literally inside the heredoc.
	$gt = '>';
	echo <<<END
<?xml version="1.0" encoding="UTF-8" ?$gt
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

END;
	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
		'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
		'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if( is_dir( $dir ) )
			importPageDirectory( $dir );
	}
	echo <<<END
</mediawiki>

END;
}

/**
 * Import every "*.db" file in a directory and recurse into its
 * subdirectories (which hold subpages).
 *
 * @param string $dir    directory to scan
 * @param string $prefix title prefix ("Parent/") for pages found in $dir
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";

	# A PHP warning would land in the middle of the XML stream, so the
	# failure is reported as an XML comment instead.
	$mydir = @opendir( $dir );
	if( $mydir === false ) {
		echo "<!-- Could not open page directory. Skipping. -->\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				# Keep the parent prefix so nested directories yield full titles.
				importPageDirectory( "$dir/$entry", $prefix . "$entry/" );
			}
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}
	closedir( $mydir );
}


# ------------------------------------------------------------------------------

/* fetch_ functions
   Grab a given item from the database
 */

/**
 * Map a page title to its path (without extension) below the "page" or
 * "keep" directory: "<first letter, upper-cased>/<title>", or
 * "other/<title>" when the title does not start with an ASCII letter.
 *
 * @param string $title
 * @return string
 */
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/[A-Z]/i', $c ) ) {
		return strtoupper( $c ) . "/$title";
	}
	return "other/$title";
}

/**
 * Read a field from a hash built by splitHash().
 *
 * Missing fields yield $default instead of raising a PHP warning, which on
 * the command line would be printed into the XML output.
 *
 * @param array  $hash
 * @param string $key
 * @param string $default
 * @return string
 */
function useModField( $hash, $key, $default = '' ) {
	return isset( $hash[$key] ) ? $hash[$key] : $default;
}

/**
 * Load the current revision of a page from its ".db" file.
 * Terminates the script when the file cannot be read.
 *
 * @param string $title
 * @return object with text, summary, minor, ts, username and host fields
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	$raw = file_exists( $fname ) ? @file_get_contents( $fname ) : false;
	if( $raw === false ) {
		# Standard output carries the XML document; report errors on stderr.
		fwrite( STDERR, "Couldn't open file '$fname' for page '$title'.\n" );
		die( -1 );
	}

	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, useModField( $page, "text_default" ) );
	$text = splitHash( $FS3, useModField( $section, "data" ) );

	return array2object( array(
		"text" => useModField( $text, "text" ),
		"summary" => useModField( $text, "summary" ),
		"minor" => useModField( $text, "minor" ),
		"ts" => useModField( $section, "ts" ),
		"username" => useModField( $section, "username" ),
		"host" => useModField( $section, "host" ) ) );
}

/**
 * Load the old revisions of a page from its ".kp" file.
 *
 * Revisions with empty text, an empty "minor" flag or a non-positive
 * timestamp are skipped and noted with an XML comment.
 *
 * @param string $title
 * @return array list of revision objects (same fields as fetchPage());
 *               empty when there is no keep file
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$raw = @file_get_contents( $fname );
	if( $raw === false ) return array();

	$keptlist = explode( $FS1, $raw );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, useModField( $section, "data" ) );

		$body = useModField( $text, "text" );
		$minor = useModField( $text, "minor" );
		$ts = useModField( $section, "ts" );

		# floatval() rather than "* 1": arithmetic on a non-numeric string
		# warns (and throws on PHP 8).
		if( $body && $minor != "" && floatval( $ts ) > 0 ) {
			$revisions[] = array2object( array(
				"text" => $body,
				"summary" => useModField( $text, "summary" ),
				"minor" => $minor,
				"ts" => $ts,
				"username" => useModField( $section, "username" ),
				"host" => useModField( $section, "host" ) ) );
		} else {
			echo "<!-- skipped a bad old revision -->\n";
		}
	}
	return $revisions;
}

/**
 * Split "key SEP value SEP key SEP value ..." into an associative array.
 * A trailing key without a value is ignored.
 *
 * @param string $sep non-empty separator
 * @param string $str
 * @return array
 */
function splitHash ( $sep , $str ) {
	$parts = explode( $sep, $str );
	$count = count( $parts );
	$ret = array();
	for( $i = 0; $i + 1 < $count; $i += 2 ) {
		$ret[$parts[$i]] = $parts[$i + 1];
	}
	return $ret;
}


/* import_ functions
   Take a fetched item and produce XML
 */

/**
 * Resolve the contributor of a revision.
 *
 * @param string $name user name; when empty the host is used as the name
 * @param string $host
 * @return array ( user id, display name ); the id is 0 for unknown users
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# The cache is keyed by user name, so look the key up
		# (in_array() would search the stored ids instead).
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = str_replace( '_', ' ', $name );
	} else {
		$userid = 0;
		$username = $host;
	}
	return array( $userid, $username );
}

/**
 * Build the <page> element for one page: the current revision, an extra
 * "link fix" revision when mungeFormat() changes the current text, and
 * all kept (old) revisions.
 *
 * @param string $title page title as stored on disk
 * @return string XML fragment
 */
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	$munged = mungeFormat( $page->text );
	if( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$next = array2object( array(
			'text' => $munged,
			'minor' => 1,
			'username' => 'Conversion script',
			'host' => '127.0.0.1',
			'ts' => time(),
			'summary' => 'link fix',
			) );
		$revisions = array( $page, $next );
	} else {
		/**
		 * Current revision:
		 */
		$revisions = array( $page );
	}
	$xml = <<<END
  <page>
    <title>$newtitle</title>

END;

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if( count( $revisions ) == 0 ) {
		# Cannot happen while the current revision is always included;
		# kept as a guard that emits nothing for the page.
		return '';
	}

	foreach( $revisions as $rev ) {
		$text = xmlsafe( recodeText( $rev->text ) );
		$minor = ( $rev->minor ? '<minor/>' : '' );
		$user = checkUserCache( $rev->username, $rev->host );
		$username = xmlsafe( recodeText( $user[1] ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<END
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>

END;
	}
	$xml .= "</page>\n\n";
	return $xml;
}

/**
 * Convert text from $wgImportEncoding to UTF-8, normalize CRLF line ends
 * to LF and expand numeric character references.
 *
 * @param string $string
 * @return string
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );

	# iconv() returns false when it hits a byte it cannot convert. Keep the
	# unconverted text in that case instead of discarding the whole
	# revision; xmlsafe() runs UtfNormal::cleanUp() on it afterwards.
	# The warning is suppressed because it would be printed into the XML.
	$converted = @iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted !== false ) {
		$string = $converted;
	}

	return wfMungeToUtf8( $string ); # Any old &#1234; stuff
}

/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $codepoint
 * @return string UTF-8 bytes, or the numeric entity "&#N;" when the value
 *                lies beyond U+10FFFF
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x1f | 0xc0) .
	                              chr($codepoint      & 0x3f | 0x80);
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
	                                chr($codepoint >>  6 & 0x3f | 0x80) .
	                                chr($codepoint       & 0x3f | 0x80);
	# Four-byte form covers U+10000 .. U+10FFFF, the end of the Unicode range.
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
	                                 chr($codepoint >> 12 & 0x3f | 0x80) .
	                                 chr($codepoint >>  6 & 0x3f | 0x80) .
	                                 chr($codepoint       & 0x3f | 0x80);
	# Not a valid code point; leave it as an entity
	return "&#$codepoint;";
}

/**
 * preg_replace_callback() handler for decimal references ("&#1234;").
 *
 * @param array $matches [0] whole reference, [1] the digits
 * @return string
 */
function useModDecimalEntityToUtf8( $matches ) {
	# Base 10 is forced: digits with a leading zero must not be read as octal.
	$codepoint = intval( $matches[1], 10 );
	if( $codepoint >= 0x110000 ) {
		return $matches[0]; # out of range: leave the reference untouched
	}
	return wfUtf8Sequence( $codepoint );
}

/**
 * preg_replace_callback() handler for hexadecimal references ("&#x4d2;").
 *
 * @param array $matches [0] whole reference, [1] the hex digits
 * @return string
 */
function useModHexEntityToUtf8( $matches ) {
	$codepoint = hexdec( $matches[1] ); # may be a float for huge values
	if( $codepoint >= 0x110000 ) {
		return $matches[0]; # out of range: leave the reference untouched
	}
	return wfUtf8Sequence( (int)$codepoint );
}

/**
 * Replace decimal and hexadecimal numeric character references with the
 * UTF-8 encoding of the character they name.
 *
 * @param string $string
 * @return string
 */
function wfMungeToUtf8($string) {
	# Callbacks replace the former /e pattern modifier.
	$string = preg_replace_callback( '/&#([0-9]+);/', 'useModDecimalEntityToUtf8', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'useModHexEntityToUtf8', $string );
	# Should also do named entities here
	return $string;
}

/**
 * Format a Unix timestamp as UTC ISO 8601, e.g. 2003-08-05T18:30:02Z.
 *
 * @param int|string $ts seconds since the epoch
 * @return string
 */
function timestamp2ISO8601( $ts ) {
	return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
}

/**
 * Make a UTF-8 string safe for use as XML text content.
 *
 * @param string $string
 * @return string
 */
function xmlsafe( $string ) {
	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 */
	$string = UtfNormal::cleanUp( $string );

	# Flags and charset are spelled out so the output does not depend on
	# the defaults of the running PHP version.
	$string = htmlspecialchars( $string, ENT_COMPAT, 'UTF-8' );
	return $string;
}

/**
 * Make a string in the import encoding safe for use inside an XML comment,
 * where the sequence "--" is not allowed.
 *
 * @param string $text
 * @return string
 */
function xmlCommentSafe( $text ) {
	$safe = xmlsafe( recodeText( $text ) );
	# One pass is not enough: "---" becomes "\-\--", which still ends in
	# "--". Repeat until no double hyphen is left.
	while( strpos( $safe, '--' ) !== false ) {
		$safe = str_replace( '--', '\\-\\-', $safe );
	}
	return $safe;
}


/**
 * Turn an associative array into an object with one property per key.
 *
 * @param array $arr
 * @return object
 */
function array2object( $arr ) {
	# new stdClass rather than (object)0, which adds a stray "scalar" property
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}


/**
 * Make CamelCase and /Talk links work
 *
 * <nowiki> sections, URLs and existing [[links]] are swapped for a
 * placeholder first so that they are not touched, then restored in order.
 *
 * @param string $text wiki text
 * @return string wiki text with CamelCase and /Subpage words bracketed
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# A callback replaces the former /e pattern modifier.
	$final = preg_replace_callback( '/' . preg_quote( placeholder() ) . '/s',
		'nowikiRestore', $munged );
	return $final;
}


/**
 * Marker text that stands in for protected spans during mungeFormat().
 *
 * The string is single-quoted, so it is the literal characters
 * backslash-x-f-f, not 0xFF bytes.
 *
 * @param mixed $x unused
 * @return string
 */
function placeholder( $x = null ) {
	return '\xffplaceholder\xff';
}

/**
 * preg_replace_callback() handler: remember a protected span in $nowiki
 * and put the placeholder in its place.
 *
 * @param array $matches
 * @return string
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;
	$nowiki[] = $matches[1];
	return placeholder();
}

/**
 * preg_replace_callback() handler: give back the protected spans in the
 * order nowikiPlaceholder() stored them.
 *
 * @param array $matches unused
 * @return string
 */
function nowikiRestore( $matches ) {
	global $nowiki;
	# Cast so an exhausted list yields an empty string rather than null.
	return (string)array_shift( $nowiki );
}