<?php
/**
 * Generates the serialized normalization tables used for Arabic ("ar") and
 * Malayalam ("ml").
 *
 * Provenance (recovered from the git patch this file was extracted from):
 *   Commit:  222b01f5169f1c7e69762e0e8904c24f78f71882
 *   Author:  Pierre Schmitz
 *   Date:    Wed, 28 Jul 2010 11:52:48 +0200
 *   Subject: update to MediaWiki 1.16.0
 *   Path:    maintenance/language/generateNormalizerData.php
 *            (new file, index 00000000..d6b7aaa6; rendered by cgit v1.2.3-54-g00ecf)
 *
 * NOTE(review): everything between the opening PHP tag and the first
 * "addOption(" call was lost when the patch was extracted. The class name is
 * taken from $maintClass at the bottom of the file; the two require_once
 * lines, the base class, the $dataFile declaration and the
 * parent::__construct() call are reconstructions -- confirm them against the
 * upstream file before relying on them.
 */

// NOTE(review): presumably provides the Maintenance base class and the
// DO_MAINTENANCE constant -- path is an assumption, TODO confirm.
require_once( dirname( __FILE__ ) . '/../Maintenance.php' );

// NOTE(review): presumably provides hexSequenceToUtf8() -- path is an
// assumption, TODO confirm.
require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );

class GenerateNormalizerData extends Maintenance {
	/** Path of the UnicodeData.txt file to read; set by execute(). */
	public $dataFile;

	public function __construct() {
		// NOTE(review): reconstructed call, see the file header.
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Entry point: locate the Unicode data file, then build both tables.
	 *
	 * Uses --unicode-data-file when given, otherwise UnicodeData.txt in the
	 * current directory. Reports an error and exits with status 1 when the
	 * chosen file does not exist.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFound = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			// NOTE(review): the message ends right after "=", a placeholder
			// such as <FILE> may have been lost in extraction -- confirm.
			$notFound = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFound );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Build the Arabic table from the Unicode data file.
	 *
	 * Every code point in the Arabic Presentation Forms A/B ranges that has a
	 * tagged decomposition is mapped to the UTF-8 encoding of that
	 * decomposition. Lines that cannot be parsed are reported and skipped.
	 * Exits with status 1 when the data file cannot be opened.
	 */
	public function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// UnicodeData.txt has 15 fields per line; the numeric information
		// takes up three of them (6-8), so it gets three labels here to keep
		// the labels that follow aligned with their columns.
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_Decimal',
			'Numeric_Type_Value_Digit',
			'Numeric_Type_Value_Numeric',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields; report short lines instead of reading past the end
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			$code = hexdec( $data['Code'] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}

			// $m[1] receives the "<tag>", $m[2] the space-separated hex code points.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m ) )
			{
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			// hexSequenceToUtf8() is not defined in this file.
			$source = hexSequenceToUtf8( $data['Code'] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		// Release the handle before writing the result.
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Build the Malayalam table from a fixed list of sequences, each mapped
	 * to a single code point.
	 */
	public function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$pairs[ hexSequenceToUtf8( $hexSource ) ] = hexSequenceToUtf8( $hexDest );
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serialize a table to "$IP/serialized/normalize-<langCode>.ser" and
	 * print how many pairs it holds.
	 *
	 * Reports an error and exits with status 1 when the file cannot be
	 * written, so a failed write is never reported as success.
	 *
	 * @param $langCode String: language code used in the file name and report
	 * @param $pairs Array: source string => replacement string
	 */
	private function writePairs( $langCode, array $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}

		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}

$maintClass = 'GenerateNormalizerData';
require_once( DO_MAINTENANCE );