From 08aa4418c30cfc18ccc69a0f0f9cb9e17be6c196 Mon Sep 17 00:00:00 2001
From: Pierre Schmitz
Date: Mon, 12 Aug 2013 09:28:15 +0200
Subject: Update to MediaWiki 1.21.1

---
 includes/zhtable/Makefile.py | 391 -------------------------------------------
 1 file changed, 391 deletions(-)
 delete mode 100644 includes/zhtable/Makefile.py

diff --git a/includes/zhtable/Makefile.py b/includes/zhtable/Makefile.py
deleted file mode 100644
index fd603ce4..00000000
--- a/includes/zhtable/Makefile.py
+++ /dev/null
@@ -1,391 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i >> 10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    # codecs.open is needed on 3.x as well: the parsers below call
    # open( path, 'rb', encoding, errors ), which does not match the
    # signature of the Python 3 builtin open
    import codecs
    open = codecs.open
    unichr = chr

def unichr2( *args ):
    return [unichr( int( i.split( '<' )[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]

# DEFINE
UNIHAN_VER = '5.2.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.10'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    if islinux:
        # use wget instead of urlretrieve under Linux,
        # because wget can display details like download progress
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    return open( name, 'rb', encoding, 'ignore' )

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )

def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1 and \
                    len( elems[pos] ) > 1: # words only
                mlist.add( elems[pos] )
    return mlist

def tablesParser( path, name ):
    """ Read a table file from the scim-tables tarball and parse it. """
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )
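# A minimal sketch of the Unicode helpers above. unichr2() decodes
# Unihan-style codepoint tokens, where a value may carry a '<source'
# suffix (the exact source tags here are only an example):
#
#   >>> unichr2( 'U+4E32<kLau,kMatthews' )   # hex part before '<' is decoded
#   [u'\u4e32']
#
# On a narrow (UCS-2) Python 2 build, the redefined unichr() spells a
# character beyond the BMP as a UTF-16 surrogate pair:
#
#   >>> unichr( 0x20000 ) == _unichr( 0xD840 ) + _unichr( 0xDC00 )
#   True    # 0xD7C0 + (0x20000 >> 10) == 0xD840, 0xDC00 + (0x20000 & 0x3FF) == 0xDC00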
""" - global SCIM_PINYIN_VER - src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER - dst = 'phrase_lib.txt' - fp = untargz( path, src, 'U8' ) - return parserCore( fp, 0 ) - -def tsiParser( path ): - """ Read tsi.src and parse it. """ - src = 'libtabe/tsi-src/tsi.src' - dst = 'tsi.src' - fp = untargz( path, src, 'big5hkscs' ) - return parserCore( fp, 0 ) - -def unihanParser( path ): - """ Read Unihan_Variants.txt and parse it. """ - fp = unzip( path, 'Unihan_Variants.txt', 'U8' ) - t2s = dict() - s2t = dict() - for line in fp: - if line.startswith( '#' ): - continue - else: - elems = line.split() - if len( elems ) < 3: - continue - type = elems.pop( 1 ) - elems = unichr2( *elems ) - if type == 'kTraditionalVariant': - s2t[elems[0]] = elems[1:] - elif type == 'kSimplifiedVariant': - t2s[elems[0]] = elems[1:] - fp.close() - return ( t2s, s2t ) - -def applyExcludes( mlist, path ): - """ Apply exclude rules from path to mlist. """ - excludes = open( path, 'rb', 'U8' ).read().split() - excludes = [word.split( '#' )[0].strip() for word in excludes] - excludes = '|'.join( excludes ) - excptn = re.compile( '.*(?:%s).*' % excludes ) - diff = [mword for mword in mlist if excptn.search( mword )] - mlist.difference_update( diff ) - return mlist - -def charManualTable( path ): - fp = open( path, 'rb', 'U8' ) - ret = {} - for line in fp: - elems = line.split( '#' )[0].split( '|' ) - elems = unichr3( *elems ) - if len( elems ) > 1: - ret[elems[0]] = elems[1:] - return ret - -def toManyRules( src_table ): - tomany = set() - for ( f, t ) in src_table.iteritems(): - for i in range( 1, len( t ) ): - tomany.add( t[i] ) - return tomany - -def removeRules( path, table ): - fp = open( path, 'rb', 'U8' ) - texc = list() - for line in fp: - elems = line.split( '=>' ) - f = t = elems[0].strip() - if len( elems ) == 2: - t = elems[1].strip() - f = f.strip('"').strip("'") - t = t.strip('"').strip("'") - if f: - try: - table.pop( f ) - except: - pass - if t: - texc.append( t ) - texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) ) - for (tmp_f, tmp_t) in table.copy().iteritems(): - if texcptn.match( tmp_t ): - table.pop( tmp_f ) - return table - -def customRules( path ): - fp = open( path, 'rb', 'U8' ) - ret = dict() - for line in fp: - elems = line.split( '#' )[0].split() - if len( elems ) > 1: - ret[elems[0]] = elems[1] - return ret - -def dictToSortedList( src_table, pos ): - return sorted( src_table.items(), key = lambda m: m[pos] ) - -def translate( text, conv_table ): - i = 0 - while i < len( text ): - for j in range( len( text ) - i, 0, -1 ): - f = text[i:][:j] - t = conv_table.get( f ) - if t: - text = text[:i] + t + text[i:][j:] - i += len(t) - 1 - break - i += 1 - return text - -def manualWordsTable( path, conv_table, reconv_table ): - fp = open( path, 'rb', 'U8' ) - reconv_table = {} - wordlist = [line.split( '#' )[0].strip() for line in fp] - wordlist = list( set( wordlist ) ) - wordlist.sort( key = len, reverse = True ) - while wordlist: - word = wordlist.pop() - new_word = translate( word, conv_table ) - rcv_word = translate( word, reconv_table ) - if word != rcv_word: - reconv_table[word] = word - reconv_table[new_word] = word - return reconv_table - -def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): - wordlist = list( src_wordlist ) - wordlist.sort( key = len, reverse = True ) - word_conv_table = {} - word_reconv_table = {} - conv_table = char_conv_table.copy() - reconv_table = char_reconv_table.copy() - tomanyptn = re.compile( '(?:%s)' % '|'.join( 
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'rb', 'U8' )
    out_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        # protect the word itself if the reverse table would rewrite it
        if word != rcv_word:
            out_table[word] = word
        out_table[new_word] = word
    return out_table

def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
                    and ( test_word != word \
                    or ( tomanyptn.search( word ) \
                    and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
    return word_reconv_table

def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % ( f, t ) for ( f, t ) in table if f and t]
    return '\n'.join( lines )

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

    # Unihan_Variants.txt
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

    t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
    s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the superset used for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # parse list to dict
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
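    # dict.update() overwrites existing keys, so the hand-maintained rules
    # merged here take precedence over the entries that defaultWordsTable()
    # derived from the input-method word lists.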
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toSG
    toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

    # Get PHP Array
    php = '''