diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2012-05-03 13:01:35 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2012-05-03 13:01:35 +0200 |
commit | d9022f63880ce039446fba8364f68e656b7bf4cb (patch) | |
tree | 16b40fbf17bf7c9ee6f4ead25b16dd192378050a /maintenance/importUseModWiki.php | |
parent | 27cf83d177256813e2e802241085fce5dd0f3fb9 (diff) |
Update to MediaWiki 1.19.0
Diffstat (limited to 'maintenance/importUseModWiki.php')
-rw-r--r-- | maintenance/importUseModWiki.php | 375 |
1 files changed, 0 insertions, 375 deletions
<?php
/**
 * Import data from a UseModWiki into a MediaWiki wiki
 * 2003-02-09 Brion VIBBER <brion@pobox.com>
 * Based loosely on Magnus's code from 2001-2002
 *
 * Updated limited version to get something working temporarily
 * 2003-10-09
 * Be sure to run the link & index rebuilding scripts!
 *
 * Some more munging for charsets etc
 * 2003-11-28
 *
 * Partial fix for pages starting with lowercase letters (??)
 * and CamelCase and /Subpage link conversion
 * 2004-11-17
 *
 * Rewrite output to create Special:Export format for import
 * instead of raw SQL. Should be 'future-proof' against future
 * schema changes.
 * 2005-03-14
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

require_once( "Maintenance.php" );

/**
 * Maintenance script that walks the page/ and keep/ directories of a
 * UseModWiki installation and writes every page, with its kept history,
 * to standard output as a Special:Export style XML document.
 *
 * @ingroup Maintenance
 */
class ImportUseModWiki extends Maintenance {

	/**
	 * Character set of the UseMod text (the --encoding option)
	 * @var String
	 */
	private $encoding = '';

	/**
	 * Path of the UseMod wiki (first command line argument)
	 * @var String
	 */
	private $rootDirectory = '';

	/**
	 * Field separators
	 * @var String
	 */
	private $FS1 = '';
	private $FS2 = '';
	private $FS3 = '';

	/**
	 * Map of user name => user id. Nothing in this script fills it, so
	 * every lookup currently falls back to user id 0.
	 * @var Array
	 */
	private $usercache = array();

	/**
	 * Text fragments lifted out by mungeFormat(), in order of appearance.
	 * @var Array
	 */
	private $nowiki = array();

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Import pages from UseMod wikis";
		$this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
		/**
		 * If UseModWiki's New File System is used:
		 *  $NewFS = 1;  # 1 = new multibyte $FS, 0 = old $FS
		 * Use "\xb3"; for the Old File System
		 * Changed with UTF-8 UseModWiki
		 * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
		 * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
		 * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
		 */
		$this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
		$this->addArg( 'path', 'Path to your UseMod wiki' );
	}

	/**
	 * Entry point: read the options, then emit the XML header, one <page>
	 * element per .db file found under page/<letter>/, and the XML footer.
	 */
	public function execute() {
		$this->rootDirectory = $this->getArg();
		$this->encoding = $this->getOption( 'encoding', 'CP1252' );
		$sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
		$this->FS1 = "{$sep}1";
		$this->FS2 = "{$sep}2";
		$this->FS3 = "{$sep}3";

		echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
		$letters = array(
			'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
			'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
			'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
		foreach ( $letters as $letter ) {
			$dir = "{$this->rootDirectory}/page/$letter";
			if ( is_dir( $dir ) ) {
				$this->importPageDirectory( $dir );
			}
		}
		echo <<<XML
</mediawiki>

XML;
	}

	/**
	 * Import every *.db file in a directory and recurse into its
	 * sub-directories, which hold the subpages.
	 *
	 * @param $dir String Directory to scan
	 * @param $prefix String Title prefix accumulated from parent directories
	 */
	private function importPageDirectory( $dir, $prefix = "" ) {
		echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n";
		$mydir = opendir( $dir );
		if ( $mydir === false ) {
			echo "<!-- Could not open the directory. Skipping. -->\n";
			return;
		}
		# Compare against false explicitly: an entry called "0" is falsy and
		# would otherwise end the loop early.
		while ( ( $entry = readdir( $mydir ) ) !== false ) {
			$m = array();
			if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
				echo $this->importPage( $prefix . $m[1] );
			} elseif ( is_dir( "$dir/$entry" ) ) {
				if ( $entry != '.' && $entry != '..' ) {
					# Keep the prefix built so far, so that titles found deeper
					# than one level still map back to their file in fetchPage().
					$this->importPageDirectory( "$dir/$entry", "$prefix$entry/" );
				}
			} else {
				echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
			}
		}
		closedir( $mydir );
	}

	/**
	 * Build the path of a page relative to the page/ or keep/ directory:
	 * "<uppercased first letter>/<title>", or "other/<title>" when the
	 * title does not start with an ASCII letter.
	 *
	 * @param $title String
	 * @return String
	 */
	private function useModFilename( $title ) {
		$c = substr( $title, 0, 1 );
		if ( preg_match( '/[A-Z]/i', $c ) ) {
			return strtoupper( $c ) . "/$title";
		}
		return "other/$title";
	}

	/**
	 * Read the current revision of a page from its .db file.
	 * Terminates the script when the file does not exist.
	 *
	 * @param $title String
	 * @return Object Revision object, see makeRevision()
	 */
	private function fetchPage( $title ) {
		$fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
		if ( !file_exists( $fname ) ) {
			# Report on stderr; stdout carries the XML document.
			$this->error( "Couldn't open file '$fname' for page '$title'." );
			die( -1 );
		}

		$page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
		$section = $this->splitHash( $this->FS2, $this->field( $page, 'text_default' ) );
		$text = $this->splitHash( $this->FS3, $this->field( $section, 'data' ) );

		return $this->makeRevision( $section, $text );
	}

	/**
	 * Read the old revisions of a page from its .kp file, if there is one.
	 * Records without text, without a minor flag or without a positive
	 * timestamp are skipped and noted in an XML comment.
	 *
	 * @param $title String
	 * @return Array of revision objects, see makeRevision()
	 */
	private function fetchKeptPages( $title ) {
		$fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
		if ( !file_exists( $fname ) ) {
			return array();
		}

		$keptlist = explode( $this->FS1, file_get_contents( $fname ) );
		array_shift( $keptlist ); # Drop the junk at beginning of file

		$revisions = array();
		foreach ( $keptlist as $rev ) {
			$section = $this->splitHash( $this->FS2, $rev );
			$text = $this->splitHash( $this->FS3, $this->field( $section, 'data' ) );
			$usable = $this->field( $text, 'text' ) !== ''
				&& $this->field( $text, 'minor' ) !== ''
				&& intval( $this->field( $section, 'ts' ) ) > 0;
			if ( $usable ) {
				$revisions[] = $this->makeRevision( $section, $text );
			} else {
				echo "<!-- skipped a bad old revision -->\n";
			}
		}
		return $revisions;
	}

	/**
	 * Return $arr[$key], or an empty string when the key is missing, so
	 * that truncated or malformed records do not raise notices.
	 *
	 * @param $arr Array
	 * @param $key String
	 * @return String
	 */
	private function field( $arr, $key ) {
		return isset( $arr[$key] ) ? $arr[$key] : '';
	}

	/**
	 * Combine a decoded section record and its decoded text record into
	 * one revision object with the properties text, summary, minor, ts,
	 * username and host.
	 *
	 * @param $section Array Result of splitHash() on a section
	 * @param $text Array Result of splitHash() on the section's "data" field
	 * @return Object
	 */
	private function makeRevision( $section, $text ) {
		return $this->array2object( array(
			'text' => $this->field( $text, 'text' ),
			'summary' => $this->field( $text, 'summary' ),
			'minor' => $this->field( $text, 'minor' ),
			'ts' => $this->field( $section, 'ts' ),
			'username' => $this->field( $section, 'username' ),
			'host' => $this->field( $section, 'host' ),
		) );
	}

	/**
	 * Split "key SEP value SEP key SEP value ..." into an associative
	 * array. A trailing key without a value is dropped.
	 *
	 * @param $sep String Separator
	 * @param $str String Serialized record
	 * @return Array
	 */
	private function splitHash( $sep, $str ) {
		$parts = explode( $sep, $str );
		$total = count( $parts );
		$ret = array();
		for ( $i = 0; $i + 1 < $total; $i += 2 ) {
			$ret[$parts[$i]] = $parts[$i + 1];
		}
		return $ret;
	}

	/**
	 * Work out the contributor of a revision.
	 *
	 * @param $name String UseMod user name, may be empty
	 * @param $host String Host recorded for the edit, used when there is no name
	 * @return Array ( user id, display name )
	 */
	private function checkUserCache( $name, $host ) {
		if ( $name ) {
			# The cache is keyed by user name, so test the key, not the values.
			if ( isset( $this->usercache[$name] ) ) {
				$userid = $this->usercache[$name];
			} else {
				# If we haven't imported user accounts
				$userid = 0;
			}
			$username = str_replace( '_', ' ', $name );
		} else {
			$userid = 0;
			$username = $host;
		}
		return array( $userid, $username );
	}

	/**
	 * Build the <page> element for one title: the current revision, an
	 * extra "link fix" revision when the link conversion changed the text,
	 * and all kept revisions.
	 *
	 * @param $title String UseMod page title
	 * @return String XML fragment
	 */
	private function importPage( $title ) {
		echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n";
		$page = $this->fetchPage( $title );

		$newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );

		$revisions = array( $page );
		$munged = $this->mungeFormat( $page->text );
		if ( $munged !== $page->text ) {
			/**
			 * Save a *new* revision with the conversion, and put the
			 * previous last version into the history.
			 */
			$revisions[] = $this->array2object( array(
				'text' => $munged,
				'minor' => 1,
				'username' => 'Conversion script',
				'host' => '127.0.0.1',
				'ts' => time(),
				'summary' => 'link fix',
			) );
		}
		$xml = <<<XML
  <page>
    <title>$newtitle</title>

XML;

		# History
		$revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );

		foreach ( $revisions as $rev ) {
			$text = $this->xmlsafe( $this->recodeText( $rev->text ) );
			$minor = ( $rev->minor ? '<minor/>' : '' );
			list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
			$username = $this->xmlsafe( $this->recodeText( $username ) );
			$timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
			$comment = $this->xmlsafe( $this->recodeText( $rev->summary ) );

			$xml .= <<<XML
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>

XML;
		}
		$xml .= "</page>\n\n";
		return $xml;
	}

	/**
	 * Normalize line endings, convert from the source encoding to UTF-8
	 * and expand numeric character references.
	 *
	 * @param $string String
	 * @return String
	 */
	private function recodeText( $string ) {
		# For currently latin-1 wikis
		$string = str_replace( "\r\n", "\n", $string );
		# iconv() raises a notice and returns false on bytes it cannot
		# convert. Keep the unconverted text in that case instead of losing
		# the whole revision; xmlsafe() cleans up invalid sequences later.
		$converted = @iconv( $this->encoding, "UTF-8", $string );
		if ( $converted !== false ) {
			$string = $converted;
		}
		$string = $this->mungeToUtf8( $string ); # Any old &#1234; stuff
		return $string;
	}

	/**
	 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
	 * references with the UTF-8 encoding of the code point.
	 *
	 * @param $string String
	 * @return String
	 */
	private function mungeToUtf8( $string ) {
		$string = preg_replace_callback( '/&#([0-9]+);/',
			array( $this, 'decodeDecimalEntity' ), $string );
		$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
			array( $this, 'decodeHexEntity' ), $string );
		# Should also do named entities here
		return $string;
	}

	/**
	 * preg_replace_callback() handler for decimal character references.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function decodeDecimalEntity( $matches ) {
		# Go through float so that leading zeros are not read as octal and
		# absurdly long digit runs do not wrap around.
		return $this->decodeEntity( $matches[0], (float)$matches[1] );
	}

	/**
	 * preg_replace_callback() handler for hexadecimal character references.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function decodeHexEntity( $matches ) {
		return $this->decodeEntity( $matches[0], hexdec( $matches[1] ) );
	}

	/**
	 * @param $entity String The reference exactly as it appeared in the text
	 * @param $codepoint Number Its numeric value
	 * @return String UTF-8 bytes, or $entity unchanged when the value is
	 *  beyond U+10FFFF
	 */
	private function decodeEntity( $entity, $codepoint ) {
		if ( $codepoint >= 0x110000 ) {
			return $entity;
		}
		return wfUtf8Sequence( (int)$codepoint );
	}

	/**
	 * Format a Unix timestamp as ISO 8601 in UTC.
	 *
	 * @param $ts Integer|String Unix timestamp
	 * @return String
	 */
	private function timestamp2ISO8601( $ts ) {
		# 2003-08-05T18:30:02Z
		return gmdate( 'Y-m-d\TH:i:s\Z', intval( $ts ) );
	}

	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 * @param String $string Text to clean up
	 * @return String
	 */
	private function xmlsafe( $string ) {
		$string = UtfNormal::cleanUp( $string );
		$string = htmlspecialchars( $string );
		return $string;
	}

	/**
	 * Make text safe for use inside an XML comment, where "--" is not
	 * allowed.
	 *
	 * @param $text String Text in the source encoding
	 * @return String
	 */
	private function xmlCommentSafe( $text ) {
		$safe = $this->xmlsafe( $this->recodeText( $text ) );
		# Put a backslash in front of every dash that directly follows
		# another dash. Replacing "--" pairwise would leave "--" behind in
		# runs of odd length.
		return preg_replace( '/(?<=-)-/', '\\\\-', $safe );
	}

	/**
	 * Turn an associative array into an object with one property per key.
	 *
	 * @param $arr Array
	 * @return Object
	 */
	private function array2object( $arr ) {
		$o = new stdClass;
		foreach ( $arr as $x => $y ) {
			$o->$x = $y;
		}
		return $o;
	}

	/**
	 * Make CamelCase and /Talk links work
	 *
	 * <nowiki> sections, URLs and existing [[links]] are swapped for a
	 * placeholder first, so that the CamelCase pattern cannot touch them,
	 * and are put back afterwards in their original order.
	 *
	 * @param $text String UseMod wikitext
	 * @return String
	 */
	private function mungeFormat( $text ) {
		$this->nowiki = array();
		$staged = preg_replace_callback(
			'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
			array( $this, 'nowikiPlaceholder' ), $text );

		# This is probably not 100% correct, I'm just
		# glancing at the UseModWiki code.
		$upper = "[A-Z]";
		$lower = "[a-z_0-9]";
		$any = "[A-Za-z_0-9]";
		$camel = "(?:$upper+$lower+$upper+$any*)";
		$subpage = "(?:\\/$any+)";
		$substart = "(?:\\/$upper$any*)";

		$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
			'[[$1]]', $staged );

		# The replacement is a callback, so this has to be
		# preg_replace_callback(); preg_replace() rejects an array
		# replacement for a string pattern.
		$final = preg_replace_callback( '/' . preg_quote( $this->placeholder(), '/' ) . '/s',
			array( $this, 'nowikiShift' ), $munged );
		return $final;
	}

	/**
	 * Marker that stands in for protected text while mungeFormat() runs.
	 *
	 * @param $x Unused
	 * @return String
	 */
	private function placeholder( $x = null ) {
		# Double quotes, so that \xff is the byte 0xFF and not four literal
		# characters.
		return "\xffplaceholder\xff";
	}

	/**
	 * preg_replace_callback() handler: remember the matched text and
	 * return the placeholder in its place.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function nowikiPlaceholder( $matches ) {
		$this->nowiki[] = $matches[1];
		return $this->placeholder();
	}

	/**
	 * preg_replace_callback() handler: hand back the oldest remembered
	 * text fragment.
	 *
	 * @return String
	 */
	public function nowikiShift() {
		return array_shift( $this->nowiki );
	}
}

/**
 * Encode a Unicode code point as UTF-8.
 *
 * @param $codepoint Integer
 * @return String UTF-8 bytes, or a decimal character reference when the
 *  value lies beyond U+10FFFF
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) .
			chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	# Four-byte sequences cover U+10000 up to and including U+10FFFF.
	if ( $codepoint < 0x110000 ) {
		return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 ) .
			chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 ) .
			chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
			chr( ( $codepoint & 0x3f ) | 0x80 );
	}
	# Not a Unicode code point; leave it as a character reference
	return "&#$codepoint;";
}

$maintClass = 'ImportUseModWiki';
require_once( RUN_MAINTENANCE_IF_MAIN );