diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2013-08-12 09:28:15 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2013-08-12 09:28:15 +0200 |
commit | 08aa4418c30cfc18ccc69a0f0f9cb9e17be6c196 (patch) | |
tree | 577a29fb579188d16003a209ce2a2e9c5b0aa2bd /maintenance/language/zhtable/Makefile.py | |
parent | cacc939b34e315b85e2d72997811eb6677996cc1 (diff) |
Update to MediaWiki 1.21.1
Diffstat (limited to 'maintenance/language/zhtable/Makefile.py')
-rw-r--r-- | maintenance/language/zhtable/Makefile.py | 391 |
1 files changed, 391 insertions, 0 deletions
diff --git a/maintenance/language/zhtable/Makefile.py b/maintenance/language/zhtable/Makefile.py new file mode 100644 index 00000000..7e197945 --- /dev/null +++ b/maintenance/language/zhtable/Makefile.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @author Philip +import tarfile as tf +import zipfile as zf +import os, re, shutil, sys, platform + +pyversion = platform.python_version() +islinux = platform.system().lower() == 'linux' + +if pyversion[:3] in ['2.6', '2.7']: + import urllib as urllib_request + import codecs + open = codecs.open + _unichr = unichr + if sys.maxunicode < 0x10000: + def unichr(i): + if i < 0x10000: + return _unichr(i) + else: + return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) ) +elif pyversion[:2] == '3.': + import urllib.request as urllib_request + unichr = chr + +def unichr2( *args ): + return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args] + +def unichr3( *args ): + return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]] + +# DEFINE +UNIHAN_VER = '6.2.0' +SF_MIRROR = 'dfn' +SCIM_TABLES_VER = '0.5.11' +SCIM_PINYIN_VER = '0.5.92' +LIBTABE_VER = '0.2.3' +# END OF DEFINE + +def download( url, dest ): + if os.path.isfile( dest ): + print( 'File %s is up to date.' % dest ) + return + global islinux + if islinux: + # we use wget instead urlretrieve under Linux, + # because wget could display details like download progress + os.system( 'wget %s -O %s' % ( url, dest ) ) + else: + print( 'Downloading from [%s] ...' % url ) + urllib_request.urlretrieve( url, dest ) + print( 'Download complete.\n' ) + return + +def uncompress( fp, member, encoding = 'U8' ): + name = member.rsplit( '/', 1 )[-1] + print( 'Extracting %s ...' % name ) + fp.extract( member ) + shutil.move( member, name ) + if '/' in member: + shutil.rmtree( member.split( '/', 1 )[0] ) + return open( name, 'rb', encoding, 'ignore' ) + +unzip = lambda path, member, encoding = 'U8': \ + uncompress( zf.ZipFile( path ), member, encoding ) + +untargz = lambda path, member, encoding = 'U8': \ + uncompress( tf.open( path, 'r:gz' ), member, encoding ) + +def parserCore( fp, pos, beginmark = None, endmark = None ): + if beginmark and endmark: + start = False + else: start = True + mlist = set() + for line in fp: + if beginmark and line.startswith( beginmark ): + start = True + continue + elif endmark and line.startswith( endmark ): + break + if start and not line.startswith( '#' ): + elems = line.split() + if len( elems ) < 2: + continue + elif len( elems[0] ) > 1 and \ + len( elems[pos] ) > 1: # words only + mlist.add( elems[pos] ) + return mlist + +def tablesParser( path, name ): + """ Read file from scim-tables and parse it. """ + global SCIM_TABLES_VER + src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name ) + fp = untargz( path, src, 'U8' ) + return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) + +ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' ) +wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' ) +zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' ) + +def phraseParser( path ): + """ Read phrase_lib.txt and parse it. """ + global SCIM_PINYIN_VER + src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER + dst = 'phrase_lib.txt' + fp = untargz( path, src, 'U8' ) + return parserCore( fp, 0 ) + +def tsiParser( path ): + """ Read tsi.src and parse it. """ + src = 'libtabe/tsi-src/tsi.src' + dst = 'tsi.src' + fp = untargz( path, src, 'big5hkscs' ) + return parserCore( fp, 0 ) + +def unihanParser( path ): + """ Read Unihan_Variants.txt and parse it. """ + fp = unzip( path, 'Unihan_Variants.txt', 'U8' ) + t2s = dict() + s2t = dict() + for line in fp: + if line.startswith( '#' ): + continue + else: + elems = line.split() + if len( elems ) < 3: + continue + type = elems.pop( 1 ) + elems = unichr2( *elems ) + if type == 'kTraditionalVariant': + s2t[elems[0]] = elems[1:] + elif type == 'kSimplifiedVariant': + t2s[elems[0]] = elems[1:] + fp.close() + return ( t2s, s2t ) + +def applyExcludes( mlist, path ): + """ Apply exclude rules from path to mlist. """ + excludes = open( path, 'rb', 'U8' ).read().split() + excludes = [word.split( '#' )[0].strip() for word in excludes] + excludes = '|'.join( excludes ) + excptn = re.compile( '.*(?:%s).*' % excludes ) + diff = [mword for mword in mlist if excptn.search( mword )] + mlist.difference_update( diff ) + return mlist + +def charManualTable( path ): + fp = open( path, 'rb', 'U8' ) + ret = {} + for line in fp: + elems = line.split( '#' )[0].split( '|' ) + elems = unichr3( *elems ) + if len( elems ) > 1: + ret[elems[0]] = elems[1:] + return ret + +def toManyRules( src_table ): + tomany = set() + for ( f, t ) in src_table.iteritems(): + for i in range( 1, len( t ) ): + tomany.add( t[i] ) + return tomany + +def removeRules( path, table ): + fp = open( path, 'rb', 'U8' ) + texc = list() + for line in fp: + elems = line.split( '=>' ) + f = t = elems[0].strip() + if len( elems ) == 2: + t = elems[1].strip() + f = f.strip('"').strip("'") + t = t.strip('"').strip("'") + if f: + try: + table.pop( f ) + except: + pass + if t: + texc.append( t ) + texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) ) + for (tmp_f, tmp_t) in table.copy().iteritems(): + if texcptn.match( tmp_t ): + table.pop( tmp_f ) + return table + +def customRules( path ): + fp = open( path, 'rb', 'U8' ) + ret = dict() + for line in fp: + elems = line.split( '#' )[0].split() + if len( elems ) > 1: + ret[elems[0]] = elems[1] + return ret + +def dictToSortedList( src_table, pos ): + return sorted( src_table.items(), key = lambda m: m[pos] ) + +def translate( text, conv_table ): + i = 0 + while i < len( text ): + for j in range( len( text ) - i, 0, -1 ): + f = text[i:][:j] + t = conv_table.get( f ) + if t: + text = text[:i] + t + text[i:][j:] + i += len(t) - 1 + break + i += 1 + return text + +def manualWordsTable( path, conv_table, reconv_table ): + fp = open( path, 'rb', 'U8' ) + reconv_table = {} + wordlist = [line.split( '#' )[0].strip() for line in fp] + wordlist = list( set( wordlist ) ) + wordlist.sort( key = len, reverse = True ) + while wordlist: + word = wordlist.pop() + new_word = translate( word, conv_table ) + rcv_word = translate( word, reconv_table ) + if word != rcv_word: + reconv_table[word] = word + reconv_table[new_word] = word + return reconv_table + +def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): + wordlist = list( src_wordlist ) + wordlist.sort( key = len, reverse = True ) + word_conv_table = {} + word_reconv_table = {} + conv_table = char_conv_table.copy() + reconv_table = char_reconv_table.copy() + tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) ) + while wordlist: + conv_table.update( word_conv_table ) + reconv_table.update( word_reconv_table ) + word = wordlist.pop() + new_word_len = word_len = len( word ) + while new_word_len == word_len: + add = False + test_word = translate( word, reconv_table ) + new_word = translate( word, conv_table ) + if not reconv_table.get( new_word ) \ + and ( test_word != word \ + or ( tomanyptn.search( word ) \ + and word != translate( new_word, reconv_table ) ) ): + word_conv_table[word] = new_word + word_reconv_table[new_word] = word + try: + word = wordlist.pop() + except IndexError: + break + new_word_len = len(word) + return word_reconv_table + +def PHPArray( table ): + lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t] + return '\n'.join(lines) + +def main(): + #Get Unihan.zip: + url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER + han_dest = 'Unihan.zip' + download( url, han_dest ) + + # Get scim-tables-$(SCIM_TABLES_VER).tar.gz: + url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER ) + tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER + download( url, tbe_dest ) + + # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz: + url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER ) + pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER + download( url, pyn_dest ) + + # Get libtabe-$(LIBTABE_VER).tgz: + url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER ) + lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER + download( url, lbt_dest ) + + # Unihan.txt + ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest ) + + t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) ) + s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) ) + + t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] ) + s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] ) + + s_tomany = toManyRules( t2s_1tomany ) + t_tomany = toManyRules( s2t_1tomany ) + + # noconvert rules + t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 ) + s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 ) + + # the supper set for word to word conversion + t2s_1to1_supp = t2s_1to1.copy() + s2t_1to1_supp = s2t_1to1.copy() + t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) ) + s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) ) + + # word to word manual rules + t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp ) + t2s_word2word_manual.update( customRules( 'toSimp.manual' ) ) + s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp ) + s2t_word2word_manual.update( customRules( 'toTrad.manual' ) ) + + # word to word rules from input methods + t_wordlist = set() + s_wordlist = set() + t_wordlist.update( ezbigParser( tbe_dest ), + tsiParser( lbt_dest ) ) + s_wordlist.update( wubiParser( tbe_dest ), + zrmParser( tbe_dest ), + phraseParser( pyn_dest ) ) + + # exclude + s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' ) + t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' ) + + s2t_supp = s2t_1to1_supp.copy() + s2t_supp.update( s2t_word2word_manual ) + t2s_supp = t2s_1to1_supp.copy() + t2s_supp.update( t2s_word2word_manual ) + + # parse list to dict + t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp ) + t2s_word2word.update( t2s_word2word_manual ) + s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp ) + s2t_word2word.update( s2t_word2word_manual ) + + # Final tables + # sorted list toHans + t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] ) + toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 ) + # sorted list toHant + s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] ) + toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 ) + # sorted list toCN + toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 ) + # sorted list toHK + toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 ) + # sorted list toSG + toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 ) + # sorted list toTW + toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 ) + + # Get PHP Array + php = '''<?php +/** + * Simplified / Traditional Chinese conversion tables + * + * Automatically generated using code and data in includes/zhtable/ + * Do not modify directly! + * + * @file + */ + +$zh2Hant = array(\n''' + php += PHPArray( toHant ) \ + + '\n);\n\n$zh2Hans = array(\n' \ + + PHPArray( toHans ) \ + + '\n);\n\n$zh2TW = array(\n' \ + + PHPArray( toTW ) \ + + '\n);\n\n$zh2HK = array(\n' \ + + PHPArray( toHK ) \ + + '\n);\n\n$zh2CN = array(\n' \ + + PHPArray( toCN ) \ + + '\n);\n\n$zh2SG = array(\n' \ + + PHPArray( toSG ) \ + + '\n);\n' + + f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' ) + print ('Writing ZhConversion.php ... ') + f.write( php ) + f.close() + + # Remove temporary files + print ('Deleting temporary files ... ') + os.remove('EZ-Big.txt.in') + os.remove('phrase_lib.txt') + os.remove('tsi.src') + os.remove('Unihan_Variants.txt') + os.remove('Wubi.txt.in') + os.remove('Ziranma.txt.in') + + +if __name__ == '__main__': + main() |