includes/normal/Utf8CaseGenerate.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

<?php
/**
 * This script generates Utf8Case.php from the Unicode Character Database
 * and supplementary files.
 *
 * Copyright © 2004,2008 Brion Vibber <brion@pobox.com>
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup UtfNormal
 */

// This generator reads and writes files in the current directory, so it is
// only meant to be started from a shell; bail out under any other SAPI.
if( 'cli' !== PHP_SAPI ) {
	exit( "Run me from the command line please.\n" );
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';

// Load the simple (1:1) case mappings from UnicodeData.txt, which is expected
// to sit in the current working directory.
$in = fopen("UnicodeData.txt", "rt" );
if( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}

// Both maps are keyed by the source character as produced by
// codepointToUtf8(); the value is the converted upper/lower case counterpart.
$wikiUpperChars = array();
$wikiLowerChars = array();

print "Reading character definitions...\n";
while( false !== ($line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	if( count( $columns ) < 14 ) {
		// Blank or truncated line: the mapping fields read below do not
		// exist, so skip it instead of indexing past the end of the array.
		continue;
	}

	// Semicolon-separated fields: 0 is the code point in hex, 12 and 13 are
	// the simple uppercase and simple lowercase mappings (hex, or empty when
	// the character has no such mapping).
	$codepoint = $columns[0];
	$simpleUpper = trim( $columns[12] );
	$simpleLower = trim( $columns[13] );

	$source = codepointToUtf8( hexdec( $codepoint ) );
	if( $simpleUpper !== '' ) {
		$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
	}
	if( $simpleLower !== '' ) {
		$wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
	}
}
fclose( $in );

// Write the collected mappings out as a PHP source file (Utf8Case.php in the
// current working directory) that defines $wikiUpperChars and $wikiLowerChars.
$out = fopen( "Utf8Case.php", "wt" );
if( !$out ) {
	print "Can't create file Utf8Case.php\n";
	exit(-1);
}

// Render each map as the source text of a PHP array literal.
$outUpperChars = escapeArray( $wikiUpperChars );
$outLowerChars = escapeArray( $wikiLowerChars );

// Template of the generated file. The escaped \$ names are emitted literally,
// while $outUpperChars / $outLowerChars are interpolated here.
// NOTE(review): the opening tag is assembled from two pieces, presumably so
// that tooling does not see a real PHP open tag inside this string.
$outdata = "<" . "?php
/**
 * Simple 1:1 upper/lowercase switching arrays for utf-8 text.
 * Won't get context-sensitive things yet.
 *
 * Hack for bugs in ucfirst() and company
 *
 * These are pulled from memcached if possible, as this is faster than filling
 * up a big array manually.
 *
 * @file
 * @ingroup Language
 */

/**
 * Translation array to get upper case character
 */
\$wikiUpperChars = $outUpperChars;

/**
 * Translation array to get lower case character
 */
\$wikiLowerChars = $outLowerChars;\n";

// fputs() returns the number of bytes written, or false on error. Report a
// failed or short write instead of claiming success over a truncated file.
$written = fputs( $out, $outdata );
fclose( $out );
if( $written === false || $written < strlen( $outdata ) ) {
	print "Can't write to file Utf8Case.php\n";
	exit(-1);
}
print "Wrote out Utf8Case.php\n";


/**
 * Render an associative array as the source text of a PHP array() literal,
 * one "key => value" entry per line.
 *
 * @param array $arr Map to serialise; each pair is formatted by escapeLine().
 * @return string PHP source for the array, without a trailing semicolon.
 */
function escapeArray( $arr ) {
	$entries = array();
	foreach( $arr as $from => $to ) {
		$entries[] = escapeLine( $from, $to );
	}
	$body = implode( ",\n", $entries );
	return "array(\n" . $body . "\n)";
}

/**
 * Format one key/value pair as a tab-indented, single-quoted array entry.
 *
 * @param string $key Array key; passed through escapeSingleString().
 * @param string $val Array value; passed through escapeSingleString().
 * @return string A line of the form <tab>'key' => 'value'.
 */
function escapeLine( $key, $val ) {
	$quotedKey = "'" . escapeSingleString( $key ) . "'";
	$quotedVal = "'" . escapeSingleString( $val ) . "'";
	return "\t" . $quotedKey . " => " . $quotedVal;
}