diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
commit | 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch) | |
tree | 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/zhtable | |
parent | 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff) |
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/zhtable')
-rw-r--r-- | includes/zhtable/Makefile.py | 596 | ||||
-rw-r--r-- | includes/zhtable/simp2trad.manual | 7 | ||||
-rw-r--r-- | includes/zhtable/simpphrases.manual | 11 | ||||
-rw-r--r-- | includes/zhtable/simpphrases_exclude.manual | 3 | ||||
-rw-r--r-- | includes/zhtable/toCN.manual | 3 | ||||
-rw-r--r-- | includes/zhtable/toHK.manual | 1 | ||||
-rw-r--r-- | includes/zhtable/toSimp.manual | 20 | ||||
-rw-r--r-- | includes/zhtable/toTW.manual | 4 | ||||
-rw-r--r-- | includes/zhtable/toTrad.manual | 42 | ||||
-rw-r--r-- | includes/zhtable/trad2simp.manual | 3 | ||||
-rw-r--r-- | includes/zhtable/tradphrases.manual | 376 | ||||
-rw-r--r-- | includes/zhtable/tradphrases_exclude.manual | 9 |
12 files changed, 716 insertions, 359 deletions
diff --git a/includes/zhtable/Makefile.py b/includes/zhtable/Makefile.py index 26e229df..a7822b0b 100644 --- a/includes/zhtable/Makefile.py +++ b/includes/zhtable/Makefile.py @@ -1,25 +1,33 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # @author Philip -import tarfile, zipfile +import tarfile as tf +import zipfile as zf import os, re, shutil, sys, platform pyversion = platform.python_version() -islinux = platform.system().lower() == 'linux' or False +islinux = platform.system().lower() == 'linux' -if pyversion[:3] in ['2.5', '2.6', '2.7']: +if pyversion[:3] in ['2.6', '2.7']: import urllib as urllib_request import codecs - uniopen = codecs.open - def unichr2(i): - if sys.maxunicode >= 0x10000 or i < 0x10000: - return unichr(i) - else: - return unichr(0xD7C0+(i>>10)) + unichr(0xDC00+(i&0x3FF)) + open = codecs.open + _unichr = unichr + if sys.maxunicode < 0x10000: + def unichr(i): + if i < 0x10000: + return _unichr(i) + else: + return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) ) elif pyversion[:2] == '3.': import urllib.request as urllib_request - uniopen = open - unichr2 = chr + unichr = chr + +def unichr2( *args ): + return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args] + +def unichr3( *args ): + return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]] # DEFINE SF_MIRROR = 'easynews' @@ -28,14 +36,14 @@ SCIM_PINYIN_VER = '0.5.91' LIBTABE_VER = '0.2.3' # END OF DEFINE -def GetFileFromURL( url, dest ): - if os.path.isfile(dest): +def download( url, dest ): + if os.path.isfile( dest ): print( 'File %s up to date.' % dest ) return global islinux if islinux: # we use wget instead urlretrieve under Linux, - # because wget will display details like download progress + # because wget could display details like download progress os.system('wget %s' % url) else: print( 'Downloading from [%s] ...' % url ) @@ -43,191 +51,200 @@ def GetFileFromURL( url, dest ): print( 'Download complete.\n' ) return -def GetFileFromUnihan( path ): - print( 'Extracting files from %s ...' % path ) - text = zipfile.ZipFile(path).read('Unihan_Variants.txt') - uhfile = uniopen('Unihan_Variants.txt', 'w') - uhfile.write(text) - uhfile.close() - return +def uncompress( fp, member, encoding = 'U8' ): + name = member.rsplit( '/', 1 )[-1] + print( 'Extracting %s ...' % name ) + fp.extract( member ) + shutil.move( member, name ) + if '/' in member: + shutil.rmtree( member.split( '/', 1 )[0] ) + return open( name, 'rb', encoding, 'ignore' ) -def GetFileFromTar( path, member, rename ): - print( 'Extracting %s from %s ...' % (rename, path) ) - tarfile.open(path, 'r:gz').extract(member) - shutil.move(member, rename) - tree_rmv = member.split('/')[0] - shutil.rmtree(tree_rmv) - return - -def ReadBIG5File( dest ): - print( 'Reading and decoding %s ...' % dest ) - f1 = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' ) - text = f1.read() - text = text.replace( '\ufffd', '\n' ) - f1.close() - f2 = uniopen( dest, 'w', encoding='utf8' ) - f2.write(text) - f2.close() - return text +unzip = lambda path, member, encoding = 'U8': \ + uncompress( zf.ZipFile( path ), member, encoding ) -def ReadFile( dest ): - print( 'Reading and decoding %s ...' % dest ) - f = uniopen( dest, 'r', encoding='utf8' ) - ret = f.read() - f.close() - return ret +untargz = lambda path, member, encoding = 'U8': \ + uncompress( tf.open( path, 'r:gz' ), member, encoding ) -def ReadUnihanFile( dest ): - print( 'Reading and decoding %s ...' % dest ) - f = uniopen( dest, 'r', encoding='utf8' ) - t2s_code = [] - s2t_code = [] - while True: - line = f.readline() - if line: - if line.startswith('#'): - continue - elif not line.find('kSimplifiedVariant') == -1: - temp = line.split('kSimplifiedVariant') - t2s_code.append( ( temp[0].strip(), temp[1].strip() ) ) - elif not line.find('kTraditionalVariant') == -1: - temp = line.split('kTraditionalVariant') - s2t_code.append( ( temp[0].strip(), temp[1].strip() ) ) - else: +def parserCore( fp, pos, beginmark = None, endmark = None ): + if beginmark and endmark: + start = False + else: start = True + mlist = set() + for line in fp: + if beginmark and line.startswith( beginmark ): + start = True + continue + elif endmark and line.startswith( endmark ): break - f.close() - return ( t2s_code, s2t_code ) + if start and not line.startswith( '#' ): + elems = line.split() + if len( elems ) < 2: + continue + elif len( elems[0] ) > 1: + mlist.add( elems[pos] ) + return mlist -def RemoveRows( text, num ): - text = re.sub( '.*\s*', '', text, num) - return text +def tablesParser( path, name ): + """ Read file from scim-tables and parse it. """ + global SCIM_TABLES_VER + src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name ) + fp = untargz( path, src, 'U8' ) + return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) -def RemoveOneCharConv( text ): - preg = re.compile('^.\s*$', re.MULTILINE) - text = preg.sub( '', text ) - return text +ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' ) +wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' ) +zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' ) -def ConvertToChar( code ): - code = code.split('<')[0] - return unichr2( int( code[2:], 16 ) ) - -def GetDefaultTable( code_table ): - char_table = {} - for ( f, t ) in code_table: - if f and t: - from_char = ConvertToChar( f ) - to_chars = [ConvertToChar( code ) for code in t.split()] - char_table[from_char] = to_chars - return char_table - -def GetManualTable( dest ): - text = ReadFile( dest ) - temp1 = text.split() - char_table = {} - for elem in temp1: - elem = elem.strip('|') - if elem: - temp2 = elem.split( '|', 1 ) - from_char = unichr2( int( temp2[0][2:7], 16 ) ) - to_chars = [unichr2( int( code[2:7], 16 ) ) for code in temp2[1].split('|')] - char_table[from_char] = to_chars - return char_table - -def GetValidTable( src_table ): - valid_table = {} - for f, t in src_table.items(): - valid_table[f] = t[0] - return valid_table - -def GetToManyRules( src_table ): - tomany_table = {} - for f, t in src_table.items(): - for i in range(1, len(t)): - tomany_table[t[i]] = True - return tomany_table - -def RemoveRules( dest, table ): - text = ReadFile( dest ) - temp1 = text.split() - for elem in temp1: - f = '' - t = '' - elem = elem.strip().replace( '"', '' ).replace( '\'', '' ) - if '=>' in elem: - if elem.startswith( '=>' ): - t = elem.replace( '=>', '' ).strip() - elif elem.endswith( '=>' ): - f = elem.replace( '=>', '' ).strip() - else: - temp2 = elem.split( '=>' ) - f = temp2[0].strip() - t = temp2[1].strip() - try: - table.pop(f, t) - continue - except: - continue +def phraseParser( path ): + """ Read phrase_lib.txt and parse it. """ + global SCIM_PINYIN_VER + src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER + dst = 'phrase_lib.txt' + fp = untargz( path, src, 'U8' ) + return parserCore( fp, 0 ) + +def tsiParser( path ): + """ Read tsi.src and parse it. """ + src = 'libtabe/tsi-src/tsi.src' + dst = 'tsi.src' + fp = untargz( path, src, 'big5hkscs' ) + return parserCore( fp, 0 ) + +def unihanParser( path ): + """ Read Unihan_Variants.txt and parse it. """ + fp = unzip( path, 'Unihan_Variants.txt', 'U8' ) + t2s = dict() + s2t = dict() + for line in fp: + if line.startswith( '#' ): + continue else: - f = t = elem + elems = line.split() + if len( elems ) < 3: + continue + type = elems.pop( 1 ) + elems = unichr2( *elems ) + if type == 'kTraditionalVariant': + s2t[elems[0]] = elems[1:] + elif type == 'kSimplifiedVariant': + t2s[elems[0]] = elems[1:] + fp.close() + return ( t2s, s2t ) + +def applyExcludes( mlist, path ): + """ Apply exclude rules from path to mlist. """ + excludes = open( path, 'rb', 'U8' ).read().split() + excludes = [word.split( '#' )[0].strip() for word in excludes] + excludes = '|'.join( excludes ) + excptn = re.compile( '.*(?:%s).*' % excludes ) + diff = [mword for mword in mlist if excptn.search( mword )] + mlist.difference_update( diff ) + return mlist + +def charManualTable( path ): + fp = open( path, 'rb', 'U8' ) + ret = {} + for line in fp: + elems = line.split( '#' )[0].split( '|' ) + elems = unichr3( *elems ) + if len( elems ) > 1: + ret[elems[0]] = elems[1:] + return ret + +def toManyRules( src_table ): + tomany = set() + for ( f, t ) in src_table.iteritems(): + for i in range( 1, len( t ) ): + tomany.add( t[i] ) + return tomany + +def removeRules( path, table ): + fp = open( path, 'rb', 'U8' ) + texc = list() + for line in fp: + elems = line.split( '=>' ) + f = t = elems[0].strip() + if len( elems ) == 2: + t = elems[1].strip() + f = f.strip('"').strip("'") + t = t.strip('"').strip("'") if f: try: - table.pop(f) + table.pop( f ) except: - x = 1 + pass if t: - for temp_f, temp_t in table.copy().items(): - if temp_t == t: - table.pop(temp_f) + texc.append( t ) + texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) ) + for (tmp_f, tmp_t) in table.copy().iteritems(): + if texcptn.match( tmp_t ): + table.pop( tmp_f ) return table -def DictToSortedList1( src_table ): - return sorted( src_table.items(), key = lambda m: m[0] ) #sorted( temp_table, key = lambda m: len( m[0] ) ) +def customRules( path ): + fp = open( path, 'rb', 'U8' ) + ret = dict() + for line in fp: + elems = line.split( '#' )[0].split() + if len( elems ) > 1: + ret[elems[0]] = elems[1] + return ret -def DictToSortedList2( src_table ): - return sorted( src_table.items(), key = lambda m: m[1] ) +def dictToSortedList( src_table, pos ): + return sorted( src_table.items(), key = lambda m: m[pos] ) -def Converter( string, conv_table ): +def translate( text, conv_table ): i = 0 - while i < len(string): - for j in range(len(string) - i, 0, -1): - f = string[i:][:j] + while i < len( text ): + for j in range( len( text ) - i, 0, -1 ): + f = text[i:][:j] t = conv_table.get( f ) if t: - string = string[:i] + t + string[i:][j:] + text = text[:i] + t + text[i:][j:] i += len(t) - 1 break i += 1 - return string + return text + +def manualWordsTable( path, conv_table, reconv_table ): + fp = open( path, 'rb', 'U8' ) + reconv_table = {} + wordlist = [line.split( '#' )[0].strip() for line in fp] + wordlist = list( set( wordlist ) ) + wordlist.sort( key = len, reverse = True ) + while wordlist: + word = wordlist.pop() + new_word = translate( word, conv_table ) + rcv_word = translate( word, reconv_table ) + if word != rcv_word: + reconv_table[word] = word + reconv_table[new_word] = word + return reconv_table -def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): - wordlist = list( set( src_wordlist ) ) +def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): + wordlist = list( src_wordlist ) wordlist.sort( key = len, reverse = True ) word_conv_table = {} word_reconv_table = {} + conv_table = char_conv_table.copy() + reconv_table = char_reconv_table.copy() + tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) ) while wordlist: - conv_table = {} - reconv_table = {} conv_table.update( word_conv_table ) - conv_table.update( char_conv_table ) reconv_table.update( word_reconv_table ) - reconv_table.update( char_reconv_table ) word = wordlist.pop() - new_word_len = word_len = len(word) + new_word_len = word_len = len( word ) while new_word_len == word_len: - rvt_test = False - for char in word: - rvt_test = rvt_test or src_tomany.get(char) - test_word = Converter( word, reconv_table ) - new_word = Converter( word, conv_table ) - if not reconv_table.get( new_word ): - if not test_word == word: - word_conv_table[word] = new_word - word_reconv_table[new_word] = word - elif rvt_test: - rvt_word = Converter( new_word, reconv_table ) - if not rvt_word == word: - word_conv_table[word] = new_word - word_reconv_table[new_word] = word + add = False + test_word = translate( word, reconv_table ) + new_word = translate( word, conv_table ) + if not reconv_table.get( new_word ) \ + and ( test_word != word \ + or ( tomanyptn.search( word ) \ + and word != translate( new_word, reconv_table ) ) ): + word_conv_table[word] = new_word + word_reconv_table[new_word] = word try: word = wordlist.pop() except IndexError: @@ -235,205 +252,98 @@ def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv new_word_len = len(word) return word_reconv_table -def GetManualWordsTable( src_wordlist, conv_table ): - src_wordlist = [items.split('#')[0].strip() for items in src_wordlist] - wordlist = list( set( src_wordlist ) ) - wordlist.sort( key = len, reverse = True ) - reconv_table = {} - while wordlist: - word = wordlist.pop() - new_word = Converter( word, conv_table ) - reconv_table[new_word] = word - return reconv_table - -def CustomRules( dest ): - text = ReadFile( dest ) - temp = text.split() - ret = dict() - for i in range( 0, len( temp ), 2 ): - ret[temp[i]] = temp[i + 1] - return ret - -def GetPHPArray( table ): +def PHPArray( table ): lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t] - #lines = ['"%s"=>"%s",' % (f, t) for (f, t) in table] return '\n'.join(lines) -def RemoveSameChar( src_table ): - dst_table = {} - for f, t in src_table.items(): - if f != t: - dst_table[f] = t - return dst_table - def main(): #Get Unihan.zip: url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip' han_dest = 'Unihan.zip' - GetFileFromURL( url, han_dest ) + download( url, han_dest ) # Get scim-tables-$(SCIM_TABLES_VER).tar.gz: url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER ) tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER - GetFileFromURL( url, tbe_dest ) + download( url, tbe_dest ) # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz: url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER ) pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER - GetFileFromURL( url, pyn_dest ) + download( url, pyn_dest ) # Get libtabe-$(LIBTABE_VER).tgz: url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER ) lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER - GetFileFromURL( url, lbt_dest ) - - # Extract the file from a comressed files - - # Unihan.txt Simp. & Trad - GetFileFromUnihan( han_dest ) - - # Make word lists - t_wordlist = [] - s_wordlist = [] - - # EZ.txt.in Trad - src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER - dst = 'EZ.txt.in' - GetFileFromTar( tbe_dest, src, dst ) - text = ReadFile( dst ) - text = text.split( 'BEGIN_TABLE' )[1].strip() - text = text.split( 'END_TABLE' )[0].strip() - text = re.sub( '.*\t', '', text ) - text = RemoveOneCharConv( text ) - t_wordlist.extend( text.split() ) + download( url, lbt_dest ) - # Wubi.txt.in Simp - src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER - dst = 'Wubi.txt.in' - GetFileFromTar( tbe_dest, src, dst ) - text = ReadFile( dst ) - text = text.split( 'BEGIN_TABLE' )[1].strip() - text = text.split( 'END_TABLE' )[0].strip() - text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text ) - text = RemoveOneCharConv( text ) - s_wordlist.extend( text.split() ) - - # Ziranma.txt.in Simp - src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER - dst = 'Ziranma.txt.in' - GetFileFromTar( tbe_dest, src, dst ) - text = ReadFile( dst ) - text = text.split( 'BEGIN_TABLE' )[1].strip() - text = text.split( 'END_TABLE' )[0].strip() - text = re.sub( '.*\t(.*?)\t\d*', '\g<1>', text ) - text = RemoveOneCharConv( text ) - s_wordlist.extend( text.split() ) - - # phrase_lib.txt Simp - src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER - dst = 'phrase_lib.txt' - GetFileFromTar( pyn_dest, src, dst ) - text = ReadFile( 'phrase_lib.txt' ) - text = re.sub( '(.*)\t\d\d*.*', '\g<1>', text) - text = RemoveRows( text, 5 ) - text = RemoveOneCharConv( text ) - s_wordlist.extend( text.split() ) - - # tsi.src Trad - src = 'libtabe/tsi-src/tsi.src' - dst = 'tsi.src' - GetFileFromTar( lbt_dest, src, dst ) - text = ReadBIG5File( 'tsi.src' ) - text = re.sub( ' \d.*', '', text.replace('# ', '')) - text = RemoveOneCharConv( text ) - t_wordlist.extend( text.split() ) - - # remove duplicate elements - t_wordlist = list( set( t_wordlist ) ) - s_wordlist = list( set( s_wordlist ) ) - - # simpphrases_exclude.manual Simp - text = ReadFile( 'simpphrases_exclude.manual' ) - temp = text.split() - s_string = '\n'.join( s_wordlist ) - for elem in temp: - s_string = re.sub( '.*%s.*\n' % elem, '', s_string ) - s_wordlist = s_string.split('\n') + # Unihan.txt + ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest ) + + t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) ) + s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) ) - # tradphrases_exclude.manual Trad - text = ReadFile( 'tradphrases_exclude.manual' ) - temp = text.split() - t_string = '\n'.join( t_wordlist ) - for elem in temp: - t_string = re.sub( '.*%s.*\n' % elem, '', t_string ) - t_wordlist = t_string.split('\n') + t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] ) + s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] ) - # Make char to char convertion table - # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... } - ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan_Variants.txt' ) - # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... } - t2s_1tomany = {} - t2s_1tomany.update( GetDefaultTable( t2s_code ) ) - t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) ) - # dict s2t_1tomany - s2t_1tomany = {} - s2t_1tomany.update( GetDefaultTable( s2t_code ) ) - s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) ) - # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... } - t2s_1to1 = GetValidTable( t2s_1tomany ) - s_tomany = GetToManyRules( t2s_1tomany ) - # dict s2t_1to1; s2t_trans - s2t_1to1 = GetValidTable( s2t_1tomany ) - t_tomany = GetToManyRules( s2t_1tomany ) - # remove noconvert rules - t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 ) - s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 ) + s_tomany = toManyRules( t2s_1tomany ) + t_tomany = toManyRules( s2t_1tomany ) + + # noconvert rules + t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 ) + s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 ) - # Make word to word convertion table + # the supper set for word to word conversion t2s_1to1_supp = t2s_1to1.copy() s2t_1to1_supp = s2t_1to1.copy() - # trad2simp_supp_set.manual - t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) ) - # simp2trad_supp_set.manual - s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) ) - # simpphrases.manual - text = ReadFile( 'simpphrases.manual' ) - s_wordlist_manual = text.split('\n') - t2s_word2word_manual = GetManualWordsTable(s_wordlist_manual, s2t_1to1_supp) - t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) ) - # tradphrases.manual - text = ReadFile( 'tradphrases.manual' ) - t_wordlist_manual = text.split('\n') - s2t_word2word_manual = GetManualWordsTable(t_wordlist_manual, t2s_1to1_supp) - s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) ) - # t2s_word2word + t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) ) + s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) ) + + # word to word manual rules + t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp ) + t2s_word2word_manual.update( customRules( 'toSimp.manual' ) ) + s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp ) + s2t_word2word_manual.update( customRules( 'toTrad.manual' ) ) + + # word to word rules from input methods + t_wordlist = set() + s_wordlist = set() + t_wordlist.update( ezbigParser( tbe_dest ), + tsiParser( lbt_dest ) ) + s_wordlist.update( wubiParser( tbe_dest ), + zrmParser( tbe_dest ), + phraseParser( pyn_dest ) ) + + # exclude + s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' ) + t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' ) + s2t_supp = s2t_1to1_supp.copy() s2t_supp.update( s2t_word2word_manual ) t2s_supp = t2s_1to1_supp.copy() t2s_supp.update( t2s_word2word_manual ) - t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp ) - ## toSimp.manual + + # parse list to dict + t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp ) t2s_word2word.update( t2s_word2word_manual ) - # s2t_word2word - s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp ) - ## toTrad.manual + s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp ) s2t_word2word.update( s2t_word2word_manual ) # Final tables # sorted list toHans - t2s_1to1 = RemoveSameChar( t2s_1to1 ) - s2t_1to1 = RemoveSameChar( s2t_1to1 ) - toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word ) + t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] ) + toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 ) # sorted list toHant - toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word ) + s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] ) + toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 ) # sorted list toCN - toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) ) + toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 ) # sorted list toHK - toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) ) + toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 ) # sorted list toSG - toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) ) + toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 ) # sorted list toTW - toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) ) + toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 ) # Get PHP Array php = '''<?php @@ -442,30 +352,32 @@ def main(): * * Automatically generated using code and data in includes/zhtable/ * Do not modify directly! + * + * @file */ $zh2Hant = array(\n''' - php += GetPHPArray( toHant ) - php += '\n);\n\n$zh2Hans = array(\n' - php += GetPHPArray( toHans ) - php += '\n);\n\n$zh2TW = array(\n' - php += GetPHPArray( toTW ) - php += '\n);\n\n$zh2HK = array(\n' - php += GetPHPArray( toHK ) - php += '\n);\n\n$zh2CN = array(\n' - php += GetPHPArray( toCN ) - php += '\n);\n\n$zh2SG = array(\n' - php += GetPHPArray( toSG ) - php += '\n);' + php += PHPArray( toHant ) \ + + '\n);\n\n$zh2Hans = array(\n' \ + + PHPArray( toHans ) \ + + '\n);\n\n$zh2TW = array(\n' \ + + PHPArray( toTW ) \ + + '\n);\n\n$zh2HK = array(\n' \ + + PHPArray( toHK ) \ + + '\n);\n\n$zh2CN = array(\n' \ + + PHPArray( toCN ) \ + + '\n);\n\n$zh2SG = array(\n' \ + + PHPArray( toSG ) \ + + '\n);' - f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' ) + f = open( 'ZhConversion.php', 'wb', encoding = 'utf8' ) print ('Writing ZhConversion.php ... ') f.write( php ) f.close() #Remove temp files print ('Deleting temp files ... ') - os.remove('EZ.txt.in') + os.remove('EZ-Big.txt.in') os.remove('phrase_lib.txt') os.remove('tsi.src') os.remove('Unihan_Variants.txt') diff --git a/includes/zhtable/simp2trad.manual b/includes/zhtable/simp2trad.manual index bb4eb7ef..eb5fa396 100644 --- a/includes/zhtable/simp2trad.manual +++ b/includes/zhtable/simp2trad.manual @@ -10,6 +10,7 @@ U+04CA0䲠|U+09C06鰆| U+04CA1䲡|U+09C0C鰌| U+04CA2䲢|U+09C27鰧| U+04CA3䲣|U+04C77䱷| +U+04DAE䶮|U+09F91龑| U+04E07万|U+0842C萬|U+04E07万| U+04E0E与|U+08207與|U+04E0E与| U+04E11丑|U+04E11丑|U+0919C醜| @@ -41,7 +42,7 @@ U+051C6准|U+051C6准|U+06E96準| U+051E0几|U+05E7E幾|U+051E0几| U+051EB凫|U+09CE7鳧|U+09CEC鳬| U+051FA出|U+051FA出|U+09F63齣| -U+05212划|U+05212划|U+05283劃| +U+05212划|U+05283劃|U+05212划| U+0522B别|U+05225別|U+05F46彆| U+0522E刮|U+0522E刮|U+098B3颳| U+05236制|U+05236制|U+088FD製| @@ -64,7 +65,7 @@ U+05401吁|U+05401吁|U+07C72籲| U+05408合|U+05408合|U+095A4閤| U+0540A吊|U+0540A吊|U+05F14弔| U+0540C同|U+0540C同|U+08855衕| -U+0540E后|U+0540E后|U+05F8C後| +U+0540E后|U+05F8C後|U+0540E后| U+05411向|U+05411向|U+056AE嚮|U+066CF曏| U+0542F启|U+0555F啟|U+05553啓| U+05446呆|U+05446呆|U+07343獃| @@ -369,4 +370,4 @@ U+2A38A𪎊|U+09EA8麨| U+2A38B𪎋|U+04D34䴴| U+2A38C𪎌|U+09EB3麳| U+2A68F𪚏|U+2A600𪘀| -U+2A690𪚐|U+2A62F𪘯|
\ No newline at end of file +U+2A690𪚐|U+2A62F𪘯| diff --git a/includes/zhtable/simpphrases.manual b/includes/zhtable/simpphrases.manual index a015a34b..4b699e26 100644 --- a/includes/zhtable/simpphrases.manual +++ b/includes/zhtable/simpphrases.manual @@ -135,6 +135,15 @@ 乾忠 乾淳 李乾顺 +黄润乾 +男性为乾 +男为乾 +阳为乾 +乾一组 +乾一坛 +陈乾生 +陈公乾生 +字乾生 不着痕迹 不着边际 与着 @@ -2222,3 +2231,5 @@ 醯壶 苧烯 近角聪信 +米泽瑠美 +峯岸南
\ No newline at end of file diff --git a/includes/zhtable/simpphrases_exclude.manual b/includes/zhtable/simpphrases_exclude.manual index 4606041f..3e9d3ecc 100644 --- a/includes/zhtable/simpphrases_exclude.manual +++ b/includes/zhtable/simpphrases_exclude.manual @@ -17,4 +17,5 @@ 簑 樑 摺叠 -餗
\ No newline at end of file +餗 +安甯
\ No newline at end of file diff --git a/includes/zhtable/toCN.manual b/includes/zhtable/toCN.manual index e3c12d0b..54e95765 100644 --- a/includes/zhtable/toCN.manual +++ b/includes/zhtable/toCN.manual @@ -228,6 +228,7 @@ 夜学 夜校 华乐 民乐 中樂 民乐 +軍中樂園 军中乐园 华乐街 华乐街 屋价 房价 計程車 出租车 @@ -277,4 +278,4 @@ 矽钢 矽钢 侏儸紀 侏罗纪 甚麽 什么 -甚麼 什么
\ No newline at end of file +甚麼 什么 diff --git a/includes/zhtable/toHK.manual b/includes/zhtable/toHK.manual index 10a3dfcb..53b354c7 100644 --- a/includes/zhtable/toHK.manual +++ b/includes/zhtable/toHK.manual @@ -2239,3 +2239,4 @@ 分布 分佈 分布于 分佈於 分布於 分佈於 +想象 想像
\ No newline at end of file diff --git a/includes/zhtable/toSimp.manual b/includes/zhtable/toSimp.manual index f424ee73..da04b82e 100644 --- a/includes/zhtable/toSimp.manual +++ b/includes/zhtable/toSimp.manual @@ -41,10 +41,26 @@ 乾鹄 乾鹄 乾鹊 乾鹊 乾龙 乾龙 +张法乾 张法乾 +旋乾转坤 旋乾转坤 天道为乾 天道为乾 易经·乾 易经·乾 易经乾 易经乾 乾务 乾务 +黄润乾 黄润乾 +男性为乾 男性为乾 +男为乾 男为乾 +阳为乾 阳为乾 +男性为乾 男性为乾 +男性爲乾 男性为乾 +男为乾 男为乾 +男爲乾 男为乾 +阳为乾 阳为乾 +陽爲乾 阳为乾 +乾一组 乾一组 +乾一坛 乾一坛 +陈乾生 陈乾生 +陈公乾生 陈公乾生 柳诒徵 柳诒徵 於夫罗 於夫罗 於梨华 於梨华 @@ -86,6 +102,8 @@ 答覆 答复 反反覆覆 反反复复 重覆 重复 +覆核 复核 +覆查 复查 鬱姓 鬱姓 鬱氏 鬱氏 侏儸紀 侏罗纪 @@ -140,3 +158,5 @@ 標誌著 标志着 近角聪信 近角聪信 修鍊 修炼 +米泽瑠美 米泽瑠美 +太閤 太阁 diff --git a/includes/zhtable/toTW.manual b/includes/zhtable/toTW.manual index 2ce16f3f..a638e86b 100644 --- a/includes/zhtable/toTW.manual +++ b/includes/zhtable/toTW.manual @@ -50,7 +50,6 @@ 以太网 乙太網 位图 點陣圖 例程 常式 -信道 通道 光标 游標 光盘 光碟 光驱 光碟機 @@ -329,7 +328,6 @@ 平治 賓士 奔驰 賓士 積架 捷豹 -福士 福斯 雪铁龙 雪鐵龍 萬事得 馬自達 拿破仑 拿破崙 @@ -410,3 +408,5 @@ 館裏 館裡 系列裏 系列裡 村子裏 村子裡 +青霉素 青黴素 +想象 想像 diff --git a/includes/zhtable/toTrad.manual b/includes/zhtable/toTrad.manual index b3459054..0c79178f 100644 --- a/includes/zhtable/toTrad.manual +++ b/includes/zhtable/toTrad.manual @@ -137,6 +137,46 @@ 于韋斯屈萊 于韋斯屈萊 于克-蘭多縣 于克-蘭多縣 于斯納爾斯貝里 于斯納爾斯貝里 +夏于喬 夏于喬 涂澤民 涂澤民 涂長望 涂長望 -台历 枱曆
\ No newline at end of file +涂敏恆 涂敏恆 +台历 枱曆 +艷后 艷后 +廢后 廢后 +后髮座 后髮座 +后髮星系團 后髮星系團 +后髮FK型星 后髮FK型星 +后海灣 后海灣 +賈后 賈后 +賢后 賢后 +呂后 呂后 +蟻后 蟻后 +馬格里布 馬格里布 +佳里鎮 佳里鎮 +埔裡社撫墾局 埔裏社撫墾局 +埔裏社撫墾局 埔裏社撫墾局 +有只採 有只採 +任何表達 任何表達 +會干擾 會干擾 +党項 党項 +余三勝 余三勝 +簡筑翎 簡筑翎 +楊雅筑 楊雅筑 +杰威爾音樂 杰威爾音樂 +尸羅精舍 尸羅精舍 +索馬里 索馬里 +騰格里 騰格里 +村里長 村里長 +進制 進制 +模范三軍 模范三軍 +黃詩杰 黃詩杰 +陳冲 陳冲 +劉佳怜 劉佳怜 +范賢惠 范賢惠 +于國治 于國治 +于楓 于楓 +黎吉雲 黎吉雲 +于飛島 于飛島 +鄉愿 鄉愿 +奇迹 奇蹟 diff --git a/includes/zhtable/trad2simp.manual b/includes/zhtable/trad2simp.manual index 4aed7e1d..6cbc3ee5 100644 --- a/includes/zhtable/trad2simp.manual +++ b/includes/zhtable/trad2simp.manual @@ -90,12 +90,14 @@ U+071EC燬|U+06BC1毁| U+07343獃|U+05446呆| U+07515甕|U+074EE瓮| U+07526甦|U+082CF苏| +U+0752F甯|U+05B81宁| U+0756B畫|U+0753B画|U+05212划| U+07575畵|U+0753B画|U+05212划| U+075E0痠|U+09178酸| U+07652癒|U+06108愈| U+07661癡|U+075F4痴| U+076C3盃|U+0676F杯| +U+0771E眞|U+0771F真| U+077AD瞭|U+04E86了| U+077C7矇|U+08499蒙| U+07843硃|U+06731朱| @@ -228,6 +230,7 @@ U+09EF4黴|U+09709霉| U+09F15鼕|U+051AC冬| U+09F47齇|U+09F44齄| U+09F63齣|U+051FA出| +U+09F91龑|U+04DAE䶮| U+21ED5𡻕|U+05C81岁| U+26A99𦪙|U+0447D䑽| U+2895B𨥛|U+28C40𨱀| diff --git a/includes/zhtable/tradphrases.manual b/includes/zhtable/tradphrases.manual index f3a95335..5a832a60 100644 --- a/includes/zhtable/tradphrases.manual +++ b/includes/zhtable/tradphrases.manual @@ -30,6 +30,26 @@ 7隻 8隻 9隻 +0只支援 +1只支援 +2只支援 +3只支援 +4只支援 +5只支援 +6只支援 +7只支援 +8只支援 +9只支援 +0只支持 +1只支持 +2只支持 +3只支持 +4只支持 +5只支持 +6只支持 +7只支持 +8只支持 +9只支持 百隻 千隻 萬隻 @@ -53,6 +73,7 @@ 多只是 多只需 多只會 +多只用 大只能 大只可 大只在 @@ -109,6 +130,9 @@ 7天後 8天後 9天後 +天後來 +天後天 +天後半 後印 萬象 並存著 @@ -241,6 +265,7 @@ 幹的停當 乾巴 偎乾 +眼乾 偷雞不著 几絲 划著 @@ -431,6 +456,7 @@ 並發現 並發展 並發動 +並發布 火並非 舉手表 揮手表 @@ -649,6 +675,8 @@ 牽一髮 白發其事 后髮座 +后髮星系團 +后髮FK型星 波髮藻 辮髮 逋髮 @@ -698,6 +726,10 @@ 櫛髮工 鬒髮 模范棒棒堂 +模范三軍 +模范七棒 +模范14棒 +模范21棒 顏範 儀範 典範 @@ -734,6 +766,7 @@ 置言成範 吾爲之範我馳驅 天地為範 +範數 丰采 丰標不凡 丰神 @@ -874,6 +907,8 @@ 裏勾外連 裏手 水里鄉 +水里溪 +水里濁水溪 二里頭 年歷史 西歷史 @@ -881,6 +916,7 @@ 國歷代 國歷任 國歷屆 +國歷經 新歷史 夏歷史 百花曆 @@ -926,6 +962,14 @@ 格里高利曆 共和曆 掛曆 +曆獄 +天文曆表 +日心曆表 +地心曆表 +復活節曆表 +月球曆表 +伊爾汗曆表 +延曆 共和歷史 厤物之意 爰定祥厤 @@ -1001,7 +1045,6 @@ 一鍋麵 伊府麵 藥麵兒 -洋麵 意大利麵 湯下麵 茶麵 @@ -1032,6 +1075,7 @@ 太僕 僮僕 金僕姑 +僕婢 樸實 樸訥 樸念仁 @@ -1358,10 +1402,6 @@ 昇平 爾冬陞 澹臺 -涂謹申 -涂鴻欽 -涂壯勳 -涂醒哲 拜託 委託 輓曲 @@ -1860,6 +1900,7 @@ 批准的 核准的 為準 +準直 擺鐘 編鐘 碰鐘 @@ -2029,6 +2070,9 @@ 任何鐘錶 任何鐘 任何錶 +任何表示 +任何表達 +任何表演 選手表現 選手表達 選手表示 @@ -2081,7 +2125,6 @@ 銫鐘 數字鐘錶 數字鐘 -數字錶 顯示鐘錶 顯示鐘 顯示錶 @@ -2288,8 +2331,13 @@ 7餘 8餘 9餘 +余姓 余光生 余光中 +余思敏 +余威德 +余子明 +余三勝 崑山 崑曲 崑腔 @@ -2361,7 +2409,7 @@ 弔場 弔書 弔詞 -弔死 +弔死問孤 弔死問疾 弔撒 弔喪 @@ -2872,6 +2920,8 @@ 陽歷史 額我略歷史 黃歷史 +天曆 +天歷史 美醜 獻醜 出醜 @@ -2898,7 +2948,7 @@ 醜聞 醜語 母醜 -齣子 +一齣子 齣兒 賣獃 發獃 @@ -2935,6 +2985,7 @@ 普鼕鼕 鼕鼕鼓 令人髮指 +爆發指數 開發 剪其髮 吐哺捉髮 @@ -2964,7 +3015,7 @@ 細如髮 繫於一髮 膚髮 -華髮 +生華髮 蒼髮 被髮佯狂 被髮入山 @@ -3007,6 +3058,7 @@ 對表中 對表明 不準確 +並不準確 一伙頭 一伙食 一半只 @@ -3314,6 +3366,7 @@ 南宮适 大蜡 子云 +分子雲 小价 歲聿云暮 崖广 @@ -3690,6 +3743,17 @@ 灕水 點裡 這只是 +這只不 +這只容 +這只允 +這只採 +這只用 +有只是 +有只不 +有只容 +有只允 +有只採 +有只用 葉叶琹 胡子昂 包括 @@ -3807,6 +3871,8 @@ 于韋斯屈萊 于克-蘭多縣 于斯納爾斯貝里 +夏于喬 +涂姓 涂坤 涂天相 涂序瑄 @@ -3815,6 +3881,12 @@ 涂羽卿 涂逢年 涂長望 +涂謹申 +涂鴻欽 +涂壯勳 +涂醒哲 +涂善妮 +涂敏恆 總裁制 故云 強制作用 @@ -3894,8 +3966,294 @@ 注釋 月面 修杰楷 +修杰麟 學裡 獄裡 館裡 系列裡 村子裡 +艷后 +廢后 +妖后 +后海灣 +仙后 +賈后 +賢后 +蜂后 +皇后 +王后 +王侯后 +母后 +武后 +歌后 +影后 +封后 +太后 +天后 +呂后 +后里 +后街 +后羿 +后稷 +后座 +后平路 +后安路 +后土 +后北街 +后冠 +望后石 +后角 +蟻后 +后妃 +大周后 +小周后 +染殿后 +准三后 +風后 +風後, +人如風後入江雲 +中風後 +屏風後 +颱風後 +颳風後 +整風後 +打風後 +遇風後 +聞風後 +逆風後 +順風後 +大風後 +馬格里布 +劃入 +中庄子 +埔裏社撫墾局 +懸掛 +僱傭 +四捨六入 +宿舍 +會干擾 +代表 +高清愿 +瓷製 +竹製 +絲製 +莜麵 +劃入 +簡筑翎 +楊雅筑 +魔杰座 +杰威爾音樂 +彭于晏 +尸羅精舍 +索馬里 # (及以下)避免里海=>裏海的轉換 +西西里 +騰格里 +阿里 +村里長 +進制 +黃詩杰 +陳冲 +何杰 +劉佳怜 +于小惠 +于品海 +于耘婕 +于洋 +于澄 +于光新 +范賢惠 +于國治 +于楓 +于熙珍 +涂善妮 +邱于庭 +熊杰 +卜云吉 +黎吉雲 +于飛島 +代表 +水無怜奈 +傲遊 # 浏览器名 +夏于喬 +賭后 +后海灣 +立后綜 +甲后路 +劉芸后 +謝華后 +趙惠后 +趙威后 +聖后 +陳有后 +許虬 +網遊 +狄志杰 +伊適杰 +于冠華 +于台煙 +于雲鶴 +于忠肅集 +于友澤 +于和偉 +于來山 +于樂 +于天龍 +于謹 +于榮光 +電波鐘 +余三勝 +掛名 +啟發式 +舞后 +甄后 +郭后 +0年 # 協助分詞 +1年 +2年 +3年 +4年 +5年 +6年 +7年 +8年 +9年 +0年 +1年 +2年 +3年 +4年 +5年 +6年 +7年 +8年 +9年 +〇年 +零年 +一年 +兩年 +二年 +三年 +四年 +五年 +六年 +七年 +八年 +九年 +十年 +百年 +千年 +萬年 +億年 +周后 +0周後 +1周後 +2周後 +3周後 +4周後 +5周後 +6周後 +7周後 +8周後 +9周後 +0周後 +1周後 +2周後 +3周後 +4周後 +5周後 +6周後 +7周後 +8周後 +9周後 +零周後 +〇周後 +一周後 +二周後 +兩周後 +三周後 +四周後 +五周後 +六周後 +七周後 +八周後 +九周後 +十周後 +百周後 +千周後 +萬周後 +億周後 +幾周後 +多周後 +前往 +后瑞站 +帝后臺 +新井里美 +樗里子 +伊達里子 +濱田里佳子 +尊后 +叶志穗 +叶不二子 +于立成 +山谷道 +李志喜 +于欣 +于少保 +于海 +於海邊 +於海上 +于凌辰 +于魁智 +于鬯 +于仲文 +于再清 +于震 +於震前 +於震后 +於震中 +固定制 +毗婆尸佛 +尸棄佛 +划船 +划不來 +划拳 +划槳 +划動 +划艇 +划行 +划算 +總裁制 +恒生 +嚴云農 +手裏劍 +秦莊襄王 +伊東怜 +衛後莊公 +餘量 +並行 +郁郁青青 +協防 +對表格 +對表示 +對表達 +對表演 +對表明 +了然後 +戴表元 +張樂于張徐 +余力為 +葉叶琴 +万俟 +幾個 +澀谷區 +協調 +選手 +併發症 +併發重症 +併發模式 +併發型模式 +金色長髮 +紅色長髮 +一頭長髮 +的長髮 +黑色長髮 diff --git a/includes/zhtable/tradphrases_exclude.manual b/includes/zhtable/tradphrases_exclude.manual index 5fec98b2..6ed245c3 100644 --- a/includes/zhtable/tradphrases_exclude.manual +++ b/includes/zhtable/tradphrases_exclude.manual @@ -318,3 +318,12 @@ 註釋 浮遊 冶鍊 +裡子 +裡外 +單隻 +聯係 +那裏 +殺虫藥 +好家伙 +姦污 +併發 |