1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
|
<?php
/**
* Unicode normalization routines
*
* Copyright © 2004 Brion Vibber <brion@pobox.com>
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
/**
* @defgroup UtfNormal UtfNormal
*/
# Feature detection, evaluated once when this file is loaded:
#  - NORMALIZE_ICU is true when a utf8_normalize() function exists;
#  - NORMALIZE_INTL is true when normalizer_normalize() exists.
# The UtfNormal methods below test these flags to choose a native
# normalizer over the pure-PHP implementation.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
/**
* Unicode normalization routines for working with UTF-8 strings.
* Currently assumes that input strings are valid UTF-8!
*
* Not as fast as I'd like, but should be usable for most purposes.
* UtfNormal::toNFC() will bail early if given ASCII text or text
* it can quickly determine is already normalized.
*
* All functions can be called statically.
*
* See description of forms at http://www.unicode.org/reports/tr15/
*
* @ingroup UtfNormal
*/
class UtfNormal {
/**
* For using the ICU wrapper
*/
const UNORM_NONE = 1;
const UNORM_NFD = 2;
const UNORM_NFKD = 3;
const UNORM_NFC = 4;
const UNORM_NFKC = 5;
const UNORM_FCD = 6;
const UNORM_DEFAULT = self::UNORM_NFC;
static $utfCombiningClass = null;
static $utfCanonicalComp = null;
static $utfCanonicalDecomp = null;
# Load compatibility decompositions on demand if they are needed.
static $utfCompatibilityDecomp = null;
static $utfCheckNFC;
/**
 * Clean up invalid UTF-8 sequences and convert the result to normal
 * form C (canonical composition).
 *
 * Three strategies, tried in this order:
 *  - NORMALIZE_ICU: pre-filter the string, then call utf8_normalize().
 *  - NORMALIZE_INTL: pre-filter the string, then call
 *    normalizer_normalize(); if that reports failure, repair the string
 *    with quickIsNFCVerify() and normalize again only when still needed.
 *  - otherwise: pure PHP, through quickIsNFCVerify() and NFC().
 *
 * @param string $string a UTF-8 string
 * @return string a clean, normalized UTF-8 string
 */
static function cleanUp( $string ) {
	if ( NORMALIZE_ICU ) {
		$string = self::replaceForNativeNormalize( $string );
		# UnicodeString constructor fails if the string ends with a
		# head byte, so append a junk character and strip it afterwards.
		$normalized = utf8_normalize( $string . "\x01", self::UNORM_NFC );
		return rtrim( $normalized, "\x01" );
	}

	if ( NORMALIZE_INTL ) {
		$string = self::replaceForNativeNormalize( $string );
		$norm = normalizer_normalize( $string, Normalizer::FORM_C );
		if ( $norm !== null && $norm !== false ) {
			return $norm;
		}
		# normalizer_normalize signals invalid UTF-8 with false or null
		# (depending on which doc you read). quickIsNFCVerify repairs
		# $string in place and tells us whether it is already normal.
		$alreadyNormal = self::quickIsNFCVerify( $string );
		if ( $alreadyNormal ) {
			return $string;
		}
		# Now valid, but not normalized.
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	# Pure-PHP path. Side effect of the check: $string has had its
	# UTF-8 errors cleaned up.
	$alreadyNormal = self::quickIsNFCVerify( $string );
	return $alreadyNormal ? $string : self::NFC( $string );
}
/**
 * Convert a UTF-8 string to normal form C, canonical composition.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is set, else
 * utf8_normalize() when NORMALIZE_ICU is set. Otherwise falls back to
 * the pure-PHP path, which returns the input unchanged when
 * quickIsNFC() reports it as already normalized.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
static function toNFC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFC );
	}
	return self::quickIsNFC( $string ) ? $string : self::NFC( $string );
}
/**
 * Convert a UTF-8 string to normal form D, canonical decomposition.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is set, else
 * utf8_normalize() when NORMALIZE_ICU is set. In the pure-PHP path a
 * string without any byte in 0x80-0xff is returned unchanged.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form D
 */
static function toNFD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_D );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFD );
	}
	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );
	return $hasNonAscii ? self::NFD( $string ) : $string;
}
/**
 * Convert a UTF-8 string to normal form KC, compatibility composition.
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is set, else
 * utf8_normalize() when NORMALIZE_ICU is set. In the pure-PHP path a
 * string without any byte in 0x80-0xff is returned unchanged.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KC
 */
static function toNFKC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KC );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKC );
	}
	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );
	return $hasNonAscii ? self::NFKC( $string ) : $string;
}
/**
 * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is set, else
 * utf8_normalize() when NORMALIZE_ICU is set. In the pure-PHP path a
 * string without any byte in 0x80-0xff is returned unchanged.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 */
static function toNFKD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KD );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKD );
	}
	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );
	return $hasNonAscii ? self::NFKD( $string ) : $string;
}
/**
 * Pull in UtfNormalData.inc (located next to this file) unless
 * self::$utfCombiningClass has already been set.
 * Presumably that file fills the static lookup tables of this class;
 * it is not visible from here.
 * @private
 */
static function loadData() {
	if ( isset( self::$utfCombiningClass ) ) {
		# Tables are already in place; nothing to do.
		return;
	}
	require_once __DIR__ . '/UtfNormalData.inc';
}
/**
 * Cheap test for "already in NFC".
 *
 * Returns true if the string is _definitely_ in NFC, false if it is not
 * or if that cannot be decided quickly: any character listed in
 * self::$utfCheckNFC or self::$utfCombiningClass makes the answer false.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return bool
 */
static function quickIsNFC( $string ) {
	# Pure ASCII is always valid NFC; skip loading the tables for it.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	self::loadData();
	$length = strlen( $string );
	$pos = 0;
	while ( $pos < $length ) {
		$lead = ord( $string[$pos] );
		if ( $lead < 0x80 ) {
			$pos++;
			continue;
		}

		# Sequence width is taken from the head byte alone; a byte in
		# 0x80-0xbf is looked up on its own.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		$char = substr( $string, $pos, $width );
		$pos += $width;

		# NO/MAYBE in the quick-check table, or a combining character that
		# might need sorting: bail out and let the caller do the slow path.
		if ( isset( self::$utfCheckNFC[$char] )
			|| isset( self::$utfCombiningClass[$char] )
		) {
			return false;
		}
	}
	return true;
}
/**
* Validate a string as UTF-8, repairing it in place, and report whether it
* is definitely already in NFC.
*
* Control characters matched by the first pattern below, and byte sequences
* rejected by the scan, are replaced with UTF8_REPLACEMENT (stray tail bytes
* that follow an already-replaced broken sequence are dropped instead).
* The cleaned text is written back through the $string reference.
*
* The return value only reflects normalization: it is false when at least
* one well-formed sequence is listed in the utfCheckNFC or utfCombiningClass
* tables, and true otherwise -- including when invalid sequences were replaced.
*
* @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
* @return bool
*/
static function quickIsNFCVerify( &$string ) {
# Screen out some characters that eg won't be allowed in XML
$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
# ASCII is always valid NFC!
# If we're only ever given plain ASCII, we can avoid the overhead
# of initializing the decomposition tables by skipping out early.
if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
# Lookup tables are built on the first non-ASCII call and kept in
# function-static variables for later calls.
static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
if( !isset( $checkit ) ) {
# Load/build some scary lookup tables...
UtfNormal::loadData();
$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
# Head bytes for sequences which we should do further validity checks
$checkit = array_flip( array_map( 'chr',
array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
# Each UTF-8 head byte is followed by a certain
# number of tail bytes.
# Bytes below 0xc0 and the bytes 0xfe/0xff get 0, so they are never
# treated as head bytes by the scan below.
$tailBytes = array();
for( $n = 0; $n < 256; $n++ ) {
if( $n < 0xc0 ) {
$remaining = 0;
} elseif( $n < 0xe0 ) {
$remaining = 1;
} elseif( $n < 0xf0 ) {
$remaining = 2;
} elseif( $n < 0xf8 ) {
$remaining = 3;
} elseif( $n < 0xfc ) {
$remaining = 4;
} elseif( $n < 0xfe ) {
$remaining = 5;
} else {
$remaining = 0;
}
$tailBytes[chr($n)] = $remaining;
}
}
# Chop the text into pure-ASCII and non-ASCII areas;
# large ASCII parts can be handled much more quickly.
# Don't chop up Unicode areas for punctuation, though,
# that wastes energy.
# A non-ASCII chunk starts at a byte >= 0x80 and runs until the next byte
# in 0x41-0x5a or 0x60-0x7a (the ASCII letters, plus the backtick).
$matches = array();
preg_match_all(
'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
$string, $matches );
$looksNormal = true;
# $base is the byte offset of the current chunk within $string.
$base = 0;
# Each $replace entry is array( replacement text, start offset, length ),
# with offsets relative to $string; they are applied after the scan.
$replace = array();
foreach( $matches[1] as $str ) {
$chunk = strlen( $str );
if( $str[0] < "\x80" ) {
# ASCII chunk: guaranteed to be valid UTF-8
# and in normal form C, so skip over it.
$base += $chunk;
continue;
}
# We'll have to examine the chunk byte by byte to ensure
# that it consists of valid UTF-8 sequences, and to see
# if any of them might not be normalized.
#
# Since PHP is not the fastest language on earth, some of
# this code is a little ugly with inner loop optimizations.
# $head holds the head byte of the sequence in progress; it is reset
# to '' once a sequence has been accepted or rejected outright.
$head = '';
$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
# $i is the index of the byte being examined, $len the number of bytes
# left plus one; the two are stepped together throughout.
for( $i = -1; --$len; ) {
$remaining = $tailBytes[$c = $str[++$i]];
if( $remaining ) {
# UTF-8 head byte!
$sequence = $head = $c;
do {
# Look for the defined number of tail bytes...
if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
# Legal tail bytes are nice.
$sequence .= $c;
} else {
if( 0 == $len ) {
# Premature end of string!
# Drop a replacement character into output to
# represent the invalid UTF-8 sequence.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
# Leaves both the do-while and the per-byte for loop.
break 2;
} else {
# Illegal tail byte; abandon the sequence.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i - strlen( $sequence ),
strlen( $sequence ) );
# Back up and reprocess this byte; it may itself
# be a legal ASCII or UTF-8 sequence head.
# $head is left set here, which is how the stray-tail-byte
# branch below knows a replacement was already emitted.
--$i;
++$len;
continue 2;
}
}
} while( --$remaining );
if( isset( $checkit[$head] ) ) {
# Do some more detailed validity checks, for
# invalid characters and illegal sequences.
if( $head == "\xed" ) {
# 0xed is relatively frequent in Korean, which
# abuts the surrogate area, so we're doing
# this check separately to speed things up.
if( $sequence >= UTF8_SURROGATE_FIRST ) {
# Surrogates are legal only in UTF-16 code.
# They are totally forbidden here in UTF-8
# utopia.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
$head = '';
continue;
}
} else {
# Slower, but rarer checks...
$n = ord( $head );
if(
# "Overlong sequences" are those that are syntactically
# correct but use more UTF-8 bytes than are necessary to
# encode a character. Naïve string comparisons can be
# tricked into failing to see a match for an ASCII
# character, for instance, which can be a security hole
# if blacklist checks are being used.
($n < 0xc2 && $sequence <= UTF8_OVERLONG_A)
|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
# NOTE(review): && binds tighter than ||, so only the UTF8_FFFE
# comparison is guarded by $n == 0xef; the UTF8_FFFF comparison
# stands alone. Presumably harmless if UTF8_FFFF itself begins
# with 0xef -- confirm against the constant's definition.
|| ($n == 0xef &&
($sequence == UTF8_FFFE)
|| ($sequence == UTF8_FFFF) )
# Unicode has been limited to 21 bits; longer
# sequences are not allowed.
|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
$head = '';
continue;
}
}
}
if( isset( $utfCheckOrCombining[$sequence] ) ) {
# If it's NO or MAYBE, we'll have to rip
# the string apart and put it back together.
# That's going to be mighty slow.
$looksNormal = false;
}
# The sequence is legal!
$head = '';
} elseif( $c < "\x80" ) {
# ASCII byte.
$head = '';
} elseif( $c < "\xc0" ) {
# Illegal tail bytes
if( $head == '' ) {
# Out of the blue!
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
} else {
# Don't add if we're continuing a broken sequence;
# we already put a replacement character when we looked
# at the broken sequence.
$replace[] = array( '', $base + $i, 1 );
}
} else {
# Miscellaneous freaks.
# Reached for bytes >= 0xc0 whose tail count is 0, i.e. 0xfe and 0xff.
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
$head = '';
}
}
$base += $chunk;
}
if( count( $replace ) ) {
# There were illegal UTF-8 sequences we need to fix up.
# Rebuild the string by copying the untouched stretches between
# replacements; $last is the offset just past the previous replacement.
$out = '';
$last = 0;
foreach( $replace as $rep ) {
list( $replacement, $start, $length ) = $rep;
if( $last < $start ) {
$out .= substr( $string, $last, $start - $last );
}
$out .= $replacement;
$last = $start + $length;
}
if( $last < strlen( $string ) ) {
$out .= substr( $string, $last );
}
$string = $out;
}
return $looksNormal;
}
# These take a string and run the normalization on them, without
# checking for validity or any optimization etc. Input must be
# VALID UTF-8!
/**
 * Raw conversion to normal form C: canonical decomposition via NFD()
 * followed by fastCompose(). No validation, no shortcuts.
 *
 * @param $string string valid UTF-8
 * @return string
 * @private
 */
static function NFC( $string ) {
	$decomposed = self::NFD( $string );
	return self::fastCompose( $decomposed );
}
/**
 * Raw conversion to normal form D: decompose with the canonical map,
 * then put combining characters into canonical order.
 *
 * @param $string string valid UTF-8
 * @return string
 * @private
 */
static function NFD( $string ) {
	# The canonical decomposition map must be loaded before it is read.
	self::loadData();
	$decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );
	return self::fastCombiningSort( $decomposed );
}
/**
 * Raw conversion to normal form KC: compatibility decomposition via
 * NFKD() followed by fastCompose(). No validation, no shortcuts.
 *
 * @param $string string valid UTF-8
 * @return string
 * @private
 */
static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );
	return self::fastCompose( $decomposed );
}
/**
 * Raw conversion to normal form KD: decompose with the compatibility
 * map, then put combining characters into canonical order.
 *
 * The compatibility map is only pulled in the first time it is needed,
 * i.e. while self::$utfCompatibilityDecomp is still unset.
 *
 * @param $string string valid UTF-8
 * @return string
 * @private
 */
static function NFKD( $string ) {
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		# Resolve the data file relative to this file, the same way
		# loadData() does, instead of relying on include_path or the
		# current working directory.
		require_once __DIR__ . '/UtfNormalDataK.inc';
	}
	return self::fastCombiningSort(
		self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
}
/**
 * Decompose a UTF-8 string into D or KD form, depending on which
 * decomposition map is supplied. Characters found in the map are replaced
 * by their mapping; Hangul syllables are split into jamo arithmetically;
 * everything else is copied through.
 * Input is assumed to be *valid* UTF-8. Invalid code will break.
 * @private
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
static function fastDecompose( $string, $map ) {
	self::loadData();
	$length = strlen( $string );
	$decomposed = '';
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII never decomposes.
			$decomposed .= $char;
			$pos++;
			continue;
		}

		# Width of the sequence, judged from the head byte alone.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		if ( isset( $map[$char] ) ) {
			$decomposed .= $map[$char];
			continue;
		}

		$isHangulSyllable = ( $char >= UTF8_HANGUL_FIRST && $char <= UTF8_HANGUL_LAST );
		if ( !$isHangulSyllable ) {
			$decomposed .= $char;
			continue;
		}

		# Decompose a hangul syllable into jamo; hardcoded for a
		# three-byte UTF-8 sequence. A lookup table would be slightly
		# faster, but adds a lot of memory & disk needs.
		$codepoint = ( ( ord( $char[0] ) & 0x0f ) << 12 )
			| ( ( ord( $char[1] ) & 0x3f ) << 6 )
			| ( ord( $char[2] ) & 0x3f );
		$index = $codepoint - UNICODE_HANGUL_FIRST;
		$l = intval( $index / UNICODE_HANGUL_NCOUNT );
		$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
		$t = $index % UNICODE_HANGUL_TCOUNT;

		$decomposed .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
		if ( $t >= 25 ) {
			# Trailing jamo whose encoding falls under the next UTF-8 prefix.
			$decomposed .= "\xe1\x87" . chr( 0x80 + $t - 25 );
		} elseif ( $t ) {
			$decomposed .= "\xe1\x86" . chr( 0xa7 + $t );
		}
	}
	return $decomposed;
}
/**
 * Put combining characters into canonical order; the last step in
 * producing the decomposed normal forms D and KD.
 *
 * Combining characters are collected per combining class until the next
 * non-combining character, then emitted in ascending class order.
 * @private
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
static function fastCombiningSort( $string ) {
	self::loadData();
	$length = strlen( $string );
	$sorted = '';
	# Combining characters waiting to be emitted, keyed by combining class.
	$pending = array();
	$lastClass = -1;
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );
		$width = 1;
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		# Only non-ASCII characters are looked up in the class table.
		if ( $lead >= 0x80 && isset( self::$utfCombiningClass[$char] ) ) {
			$lastClass = self::$utfCombiningClass[$char];
			$soFar = isset( $pending[$lastClass] ) ? $pending[$lastClass] : '';
			$pending[$lastClass] = $soFar . $char;
			continue;
		}

		# A non-combining character: flush whatever was collected first.
		if ( $lastClass ) {
			ksort( $pending );
			$sorted .= implode( '', $pending );
			$pending = array();
		}
		$sorted .= $char;
		$lastClass = 0;
	}

	if ( $lastClass ) {
		ksort( $pending );
		$sorted .= implode( '', $pending );
	}
	return $sorted;
}
/**
* Produces canonically composed sequences, i.e. normal form C or KC.
*
* Walks the string one character at a time, keeping the current start
* character and the combining characters that could not be merged into it.
* Pairs found in self::$utfCanonicalComp are merged, and Hangul jamo are
* composed arithmetically; everything else is flushed to the output as-is.
*
* @private
* @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
* @return string a UTF-8 string with canonical precomposed characters used where possible
*/
static function fastCompose( $string ) {
UtfNormal::loadData();
$len = strlen( $string );
$out = '';
# Combining class of the previous character (0 after a start character).
$lastClass = -1;
# Set to 1 right after a trailing jamo has been merged into $startChar.
$lastHangul = 0;
# Pending start character, and the unmerged combining characters after it.
$startChar = '';
$combining = '';
# First bytes of the lowest and highest jamo constants consulted below; used
# as a cheap head-byte filter before the full string comparisons.
$x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
$x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
for( $i = 0; $i < $len; $i++ ) {
$c = $string[$i];
$n = ord( $c );
if( $n < 0x80 ) {
# No combining characters here...
$out .= $startChar;
$out .= $combining;
$startChar = $c;
$combining = '';
$lastClass = 0;
continue;
} elseif( $n >= 0xf0 ) {
$c = substr( $string, $i, 4 );
$i += 3;
} elseif( $n >= 0xe0 ) {
$c = substr( $string, $i, 3 );
$i += 2;
} elseif( $n >= 0xc0 ) {
$c = substr( $string, $i, 2 );
$i++;
}
# Candidate key for the canonical composition table.
$pair = $startChar . $c;
if( $n > 0x80 ) {
if( isset( self::$utfCombiningClass[$c] ) ) {
# A combining char; see what we can do with it
$class = self::$utfCombiningClass[$c];
# NOTE(review): empty( '0' ) is true in PHP, so a start character of
# ASCII "0" never takes this composition branch. Presumably harmless if
# no composition pair begins with "0" -- confirm against the data table.
if( !empty( $startChar ) &&
$lastClass < $class &&
$class > 0 &&
isset( self::$utfCanonicalComp[$pair] ) ) {
$startChar = self::$utfCanonicalComp[$pair];
$class = 0;
} else {
$combining .= $c;
}
$lastClass = $class;
$lastHangul = 0;
continue;
}
}
# New start char
if( $lastClass == 0 ) {
if( isset( self::$utfCanonicalComp[$pair] ) ) {
$startChar = self::$utfCanonicalComp[$pair];
$lastHangul = 0;
continue;
}
if( $n >= $x1 && $n <= $x2 ) {
# WARNING: Hangul code is painfully slow.
# I apologize for this ugly, ugly code; however
# performance is even more teh suck if we call
# out to nice clean functions. Lookup tables are
# marginally faster, but require a lot of space.
#
if( $c >= UTF8_HANGUL_VBASE &&
$c <= UTF8_HANGUL_VEND &&
$startChar >= UTF8_HANGUL_LBASE &&
$startChar <= UTF8_HANGUL_LEND ) {
# Leading jamo followed by a vowel jamo: build the syllable code point.
#
#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
$lIndex = ord( $startChar[2] ) - 0x80;
$vIndex = ord( $c[2] ) - 0xa1;
$hangulPoint = UNICODE_HANGUL_FIRST +
UNICODE_HANGUL_TCOUNT *
(UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
# Hardcode the limited-range UTF-8 conversion:
$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
chr( $hangulPoint & 0x3f | 0x80 );
$lastHangul = 0;
continue;
} elseif( $c >= UTF8_HANGUL_TBASE &&
$c <= UTF8_HANGUL_TEND &&
$startChar >= UTF8_HANGUL_FIRST &&
$startChar <= UTF8_HANGUL_LAST &&
!$lastHangul ) {
# Syllable followed by a trailing jamo: add the trailing index.
# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
$tIndex = ord( $c[2] ) - 0xa7;
# A negative value means the last byte is below 0xa7; the index is then
# recomputed from 0x80 with an offset of 0x11c0 - 0x11a7.
if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);
# Increment the code point by $tIndex, without
# the function overhead of decoding and recoding UTF-8
#
# Carry by hand: overflow past 0xbf in the last byte bumps the middle
# byte, which in turn may bump the first byte.
$tail = ord( $startChar[2] ) + $tIndex;
if( $tail > 0xbf ) {
$tail -= 0x40;
$mid = ord( $startChar[1] ) + 1;
if( $mid > 0xbf ) {
$startChar[0] = chr( ord( $startChar[0] ) + 1 );
$mid -= 0x40;
}
$startChar[1] = chr( $mid );
}
$startChar[2] = chr( $tail );
# If there's another jamo char after this, *don't* try to merge it.
$lastHangul = 1;
continue;
}
}
}
# Nothing could be merged: flush the pending start character and its
# combining characters, and make $c the new start character.
$out .= $startChar;
$out .= $combining;
$startChar = $c;
$combining = '';
$lastClass = 0;
$lastHangul = 0;
}
$out .= $startChar . $combining;
return $out;
}
/**
 * Benchmark baseline: copies the string one byte at a time, to measure how
 * long it takes merely to iterate through a string without doing anything
 * of substance.
 * @param $string string
 * @return string a byte-for-byte copy of the input
 */
static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}
	return $copy;
}
/**
 * Pre-filter for the native normalizers: swaps out characters that we
 * don't want but that most native normalize functions keep.
 *
 * Bytes matched by the control-character pattern, and every occurrence of
 * UTF8_FFFE or UTF8_FFFF, are replaced with UTF8_REPLACEMENT.
 *
 * @param string $string The string
 * @return String String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	$withoutControls = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		UTF8_REPLACEMENT,
		$string
	);
	# The search entries are applied one after the other, in array order.
	return str_replace(
		array( UTF8_FFFE, UTF8_FFFF ),
		UTF8_REPLACEMENT,
		$withoutControls
	);
}
}
|