1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
<?php
/** Esperanto (Esperanto)
*
* @ingroup Language
* @author Brion Vibber <brion@pobox.com>
*/
class LanguageEo extends Language {
	/**
	 * Wrapper for charset conversions.
	 *
	 * For every charset pair except the two handled below, this defers to
	 * parent::iconv(). For Esperanto a pseudo-charset named "x" is added to
	 * convert accented characters to/from the ASCII-friendly "X" surrogate
	 * coding:
	 *
	 *   cx = ĉ   cxx = cx
	 *   gx = ĝ   gxx = gx
	 *   hx = ĥ   hxx = hx
	 *   jx = ĵ   jxx = jx
	 *   sx = ŝ   sxx = sx
	 *   ux = ŭ   uxx = ux
	 *
	 * An "x" is only doubled/undoubled when it directly follows one of the
	 * letters c, g, h, j, s, u (or, when encoding, one of the accented
	 * letters); an "x" anywhere else is passed through unchanged.
	 *
	 * http://en.wikipedia.org/wiki/Esperanto_orthography#X-system
	 * http://eo.wikipedia.org/wiki/X-sistemo
	 *
	 * X-conversion is applied, in either direction, between "utf-8" and "x"
	 * charsets (names compared case-insensitively); this comes into effect
	 * when input is run through $wgRequest->getText() and $wgEditEncoding is
	 * set to 'x'.
	 *
	 * In the long run, this should be moved out of here and into the
	 * client-side editor behavior; the original server-side translation system
	 * dates to 2002-2003 when many browsers with really bad Unicode support
	 * were still in use.
	 *
	 * @param string $in input character set
	 * @param string $out output character set
	 * @param string $string text to be converted
	 * @return string
	 */
	public function iconv( $in, $out, $string ) {
		if ( strcasecmp( $in, 'x' ) === 0 && strcasecmp( $out, 'utf-8' ) === 0 ) {
			# Group 1: a base letter with an optional "x" marker.
			# Group 2: an even-length run of escaped x's.
			# The negative lookahead forces the regex to account for the whole
			# run of x's, so "cxx" is read as "c" + escaped "x", not "ĉ" + "x".
			return preg_replace_callback(
				'/([cghjsu]x?)((?:xx)*)(?!x)/i',
				array( $this, 'strrtxuCallback' ),
				$string
			);
		}

		if ( strcasecmp( $in, 'UTF-8' ) === 0 && strcasecmp( $out, 'x' ) === 0 ) {
			# Double Xs only if they follow cxapelutaj literoj
			# (i.e. a base letter or an already-accented letter).
			return preg_replace_callback(
				'/((?:[cghjsu]|\xc4[\x88\x89\x9c\x9d\xa4\xa5\xb4\xb5]|\xc5[\x9c\x9d\xac\xad])x*)/i',
				array( $this, 'strrtuxCallback' ),
				$string
			);
		}

		return parent::iconv( $in, $out, $string );
	}

	/**
	 * preg_replace_callback() handler for the UTF-8 -> X direction.
	 *
	 * Rewrites the matched run: every accented letter becomes its base letter
	 * followed by "x", and every literal x/X in the run is doubled. Plain base
	 * letters are not in the table and pass through unchanged.
	 *
	 * @param $matches array regex match; only $matches[1] is used
	 * @return string
	 */
	public function strrtuxCallback( $matches ) {
		static $utf8ToX = array(
			'x' => 'xx', 'X' => 'Xx',
			"\xc4\x88" => "Cx", "\xc4\x89" => "cx",
			"\xc4\x9c" => "Gx", "\xc4\x9d" => "gx",
			"\xc4\xa4" => "Hx", "\xc4\xa5" => "hx",
			"\xc4\xb4" => "Jx", "\xc4\xb5" => "jx",
			"\xc5\x9c" => "Sx", "\xc5\x9d" => "sx",
			"\xc5\xac" => "Ux", "\xc5\xad" => "ux"
		);

		return strtr( $matches[1], $utf8ToX );
	}

	/**
	 * preg_replace_callback() handler for the X -> UTF-8 direction.
	 *
	 * $matches[1] is a base letter with an optional x marker, which maps to the
	 * accented letter when the marker is present; $matches[2] is a run of
	 * doubled x's, each pair of which collapses to a single x.
	 *
	 * @param $matches array regex match; $matches[1] and $matches[2] are used
	 * @return string
	 */
	public function strrtxuCallback( $matches ) {
		static $xToUtf8 = array(
			'xx' => 'x', 'xX' => 'x',
			'Xx' => 'X', 'XX' => 'X',
			"Cx" => "\xc4\x88", "CX" => "\xc4\x88",
			"cx" => "\xc4\x89", "cX" => "\xc4\x89",
			"Gx" => "\xc4\x9c", "GX" => "\xc4\x9c",
			"gx" => "\xc4\x9d", "gX" => "\xc4\x9d",
			"Hx" => "\xc4\xa4", "HX" => "\xc4\xa4",
			"hx" => "\xc4\xa5", "hX" => "\xc4\xa5",
			"Jx" => "\xc4\xb4", "JX" => "\xc4\xb4",
			"jx" => "\xc4\xb5", "jX" => "\xc4\xb5",
			"Sx" => "\xc5\x9c", "SX" => "\xc5\x9c",
			"sx" => "\xc5\x9d", "sX" => "\xc5\x9d",
			"Ux" => "\xc5\xac", "UX" => "\xc5\xac",
			"ux" => "\xc5\xad", "uX" => "\xc5\xad"
		);

		return strtr( $matches[1], $xToUtf8 ) . strtr( $matches[2], $xToUtf8 );
	}

	/**
	 * Normalise the encoding of an incoming title.
	 *
	 * If the string contains bytes >= 0x80 but does not match the UTF-8 byte
	 * pattern below, it is assumed to be Latin-1 and is converted to UTF-8.
	 * Anything else is returned unchanged; in particular no X-system
	 * conversion is applied to titles here.
	 *
	 * @param $s string
	 * @return string
	 */
	public function checkTitleEncoding( $s ) {
		$hasHighBytes = preg_match( '/[\x80-\xff]/', $s );
		$looksLikeUtf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );

		if ( $hasHighBytes && !$looksLikeUtf8 ) {
			# Assume Latin1
			return $this->latin1ToUtf8( $s );
		}

		return $s;
	}

	/**
	 * Convert an ISO-8859-1 byte string to UTF-8.
	 *
	 * Stand-in for utf8_encode(), which is deprecated as of PHP 8.2. Bytes
	 * below 0x80 are copied as-is; every other byte becomes the two-byte
	 * UTF-8 sequence for the code point of the same value.
	 *
	 * @param $s string
	 * @return string
	 */
	private function latin1ToUtf8( $s ) {
		$out = '';
		$len = strlen( $s );
		for ( $i = 0; $i < $len; $i++ ) {
			$byte = ord( $s[$i] );
			if ( $byte < 0x80 ) {
				$out .= $s[$i];
			} else {
				$out .= chr( 0xc0 | ( $byte >> 6 ) ) . chr( 0x80 | ( $byte & 0x3f ) );
			}
		}
		return $out;
	}

	/**
	 * Set the global $wgEditEncoding to the 'x' pseudo-charset.
	 */
	public function initEncoding() {
		global $wgEditEncoding;
		$wgEditEncoding = 'x';
	}
}
|