1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
<?php
/**
* Test feeds random 16-byte strings to both the pure PHP and ICU-based
* UtfNormal::cleanUp() code paths, and checks to see if there's a
* difference. Will run forever until it finds one or you kill it.
*
* Copyright (C) 2004 Brion Vibber <brion@pobox.com>
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# This is a console-only tool: bail out with a hint when started under any
# SAPI other than the command-line one.
if ( PHP_SAPI !== 'cli' ) {
	exit( "Run me from the command line please.\n" );
}
/** */
require_once( 'UtfNormal.php' );
require_once( '../diff/DifferenceEngine.php' );
dl('php_utfnormal.so' );
# mt_srand( 99999 );
/**
 * Build a binary string of pseudo-random bytes, one mt_rand() draw per byte.
 *
 * @param $length Number of bytes to produce; the result is '' when this is not positive.
 * @param $nullOk When true the NUL byte (0x00) may occur; otherwise the lowest byte is 0x01.
 * @param $ascii  When true bytes are capped at 0x7f; otherwise the cap is 0xff.
 * @return string The generated bytes.
 */
function randomString( $length, $nullOk, $ascii = false ) {
	# The allowed byte range does not change during the call, so compute it once.
	$lowest = $nullOk ? 0 : 1;
	$highest = $ascii ? 127 : 255;

	$bytes = '';
	for ( $pos = 0; $pos < $length; ++$pos ) {
		$bytes .= chr( mt_rand( $lowest, $highest ) );
	}
	return $bytes;
}
/**
 * Duplicate of the cleanUp() path for ICU usage, so that path can be
 * compared against whatever UtfNormal::cleanUp() itself returns.
 *
 * NOTE(review): utf8_normalize() is presumably supplied by the
 * php_utfnormal.so extension loaded at the top of this script - confirm.
 *
 * @param $str Raw input bytes.
 * @return string Input with the excluded characters replaced by
 *   UTF8_REPLACEMENT, passed through utf8_normalize() in NFC mode.
 */
function donorm( $str ) {
	# We exclude a few chars that ICU would not: the C0 control bytes other
	# than tab, LF and CR, plus the UTF8_FFFE and UTF8_FFFF sequences.
	$filtered = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $str );
	$filtered = str_replace( array( UTF8_FFFE, UTF8_FFFF ), UTF8_REPLACEMENT, $filtered );

	# UnicodeString constructor fails if the string ends with a head byte.
	# Append a junk char before normalizing and strip it off afterwards.
	$sentinel = "\x01";
	$normalized = utf8_normalize( $filtered . $sentinel, UtfNormal::UNORM_NFC );
	return rtrim( $normalized, $sentinel );
}
/**
 * Echo the changed fragments between two strings, hex-encoded.
 *
 * Both inputs are split into lines (CRLF is first folded to LF), diffed with
 * Diff, and rendered through TableDiffFormatter. Every fragment the formatter
 * wrapped in <ins class="diffchange"> or <del class="diffchange"> is then
 * printed on its own tab-indented line as a hex dump of its bytes.
 *
 * @param $a First string (passed to Diff as the "from" side).
 * @param $b Second string (passed to Diff as the "to" side).
 */
function showDiffs( $a, $b ) {
	$fromLines = explode( "\n", str_replace( "\r\n", "\n", $a ) );
	$toLines = explode( "\n", str_replace( "\r\n", "\n", $b ) );

	$diff = new Diff( $fromLines, $toLines );
	$renderer = new TableDiffFormatter();
	$html = $renderer->format( $diff );

	# Capture group 1 holds the text between the ins/del tags.
	preg_match_all( '/<(?:ins|del) class="diffchange">(.*?)<\/(?:ins|del)>/', $html, $found );
	foreach ( $found[1] as $fragment ) {
		echo "\t" . bin2hex( $fragment ) . "\n";
	}
}
# Length, in bytes, of every random sample fed to both implementations.
$size = 16;
# Count of samples tried so far; printed before each comparison.
$n = 0;

# Fuzz loop: keeps going until the two results disagree or the process is killed.
while( true ) {
	$n++;
	echo "$n\n";

	# Full 0x00-0xff byte range, NUL bytes included.
	$str = randomString( $size, true );
	$clean = UtfNormal::cleanUp( $str );
	$norm = donorm( $str );
	echo strlen( $clean ) . ", " . strlen( $norm );

	# The comparison must be strict: with == PHP compares two numeric-looking
	# strings by value (e.g. "1e1" == "10"), which could report a match for
	# outputs whose bytes actually differ.
	if( $clean === $norm ) {
		echo " (match)\n";
	} else {
		echo " (FAIL)\n";
		echo "\traw: " . bin2hex( $str ) . "\n" .
			"\tphp: " . bin2hex( $clean ) . "\n" .
			"\ticu: " . bin2hex( $norm ) . "\n";
		echo "\n\tdiffs:\n";
		showDiffs( $clean, $norm );
		die();
	}

	# Drop the per-iteration buffers before the next round.
	$str = '';
	$clean = '';
	$norm = '';
}
|