diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
commit | 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch) | |
tree | 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/normal/UtfNormalTest2.php | |
parent | 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff) |
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/normal/UtfNormalTest2.php')
-rw-r--r-- | includes/normal/UtfNormalTest2.php | 239 |
1 files changed, 239 insertions, 0 deletions
#!/usr/bin/php
<?php
/**
 * Other tests for the unicode normalization module
 *
 * Reads NormalizationTest.txt (relative path) row by row and, for every data
 * row, compares the output of the UtfNormal conversions with the expected
 * columns of that row. Failed checks are printed and the run continues.
 *
 * @file
 * @ingroup UtfNormal
 */

if( php_sapi_name() != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";

// Anything after this character is a comment
define ( 'COMMENT', '#' );

// Semicolons are used to separate the columns
define ( 'SEPARATOR', ';' );

$f = fopen( $file, "r" );
if( $f === false ) {
	// Without the data file no test can run; do not pretend the run succeeded.
	fwrite( STDERR, "Unable to open $file\n" );
	exit( 1 );
}

/**
 * The following section will be used for testing different normalization methods.
 * - Pure PHP
 *   ~ no assertion errors
 *   ~ 6.25 minutes
 *
 * - php_utfnormal.so or intl extension: both are wrappers around
 *   libicu so we list the version of libicu when making the
 *   comparison
 *
 * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
 *   ~ 2200 assertion errors
 *   ~ 5 seconds
 *   ~ output: http://paste2.org/p/921566
 *
 * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
 *   ~ 1384 assertion errors
 *   ~ 15 seconds
 *   ~ output: http://paste2.org/p/921435
 *
 * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
 *   ~ no assertion errors
 *   ~ 13 seconds
 *
 * - Tests comparing pure PHP output with libicu output were added
 *   later and slow down the runtime.
 */

require_once("./UtfNormal.php");
function normalize_form_c($c) { return UtfNormal::toNFC($c); }
function normalize_form_d($c) { return UtfNormal::toNFD($c); }
function normalize_form_kc($c) { return UtfNormal::toNFKC($c); }
function normalize_form_kd($c) { return UtfNormal::toNFKD($c); }

/**
 * This set of functions is only useful if you've added a param to the
 * following functions to force pure PHP usage. I decided not to
 * commit that code since it might produce a slowdown in the UTF
 * normalization code just for the sake of these tests. -- hexmode
 */
function normalize_form_c_php($c) { return UtfNormal::toNFC($c, "php"); }
function normalize_form_d_php($c) { return UtfNormal::toNFD($c, "php"); }
function normalize_form_kc_php($c) { return UtfNormal::toNFKC($c, "php"); }
function normalize_form_kd_php($c) { return UtfNormal::toNFKD($c, "php"); }

/**
 * Report a failed check.
 *
 * Keeps the signature of an assert() callback; only $code is used.
 * Reads the current row ($col) and its line number ($lineNo) from the
 * global scope to give the failure some context.
 *
 * @param $file String: file the check was made in (unused)
 * @param $line Integer: line the check was made on (unused)
 * @param $code String: textual form of the condition that failed
 */
function my_assert( $file, $line, $code ) {
	global $col, $lineNo;
	echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
}

/**
 * Report a failure through my_assert() unless the condition holds.
 *
 * Used instead of assert() with a code string: string assertions are
 * deprecated as of PHP 7.2 and are no longer evaluated as of PHP 8.0, where
 * a non-empty string would simply count as a passing assertion.
 *
 * @param $ok Boolean: result of the comparison
 * @param $code String: textual form of the comparison, for the report
 */
function check_assertion( $ok, $code ) {
	if( !$ok ) {
		my_assert( __FILE__, __LINE__, $code );
	}
}

// Normalization forms in the order their results are computed and
// cross-checked against the "pure PHP" variants.
$forms = array( 'C', 'D', 'KD', 'KC' );

// Conformance rules: array( form, index of the expected column in $col,
// 1-based numbers of the columns whose normalization must equal it ).
$expectations = array(
	# c2 == NFC(c1) == NFC(c2) == NFC(c3)
	array( 'C', 1, array( 1, 2, 3 ) ),
	# c4 == NFC(c4) == NFC(c5)
	array( 'C', 3, array( 4, 5 ) ),
	# c3 == NFD(c1) == NFD(c2) == NFD(c3)
	array( 'D', 2, array( 1, 2, 3 ) ),
	# c5 == NFD(c4) == NFD(c5)
	array( 'D', 4, array( 4, 5 ) ),
	# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	array( 'KC', 3, array( 1, 2, 3, 4, 5 ) ),
	# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	array( 'KD', 4, array( 1, 2, 3, 4, 5 ) ),
);

$count = 0;
$lineNo = 0;
while( ( $col = getRow( $f ) ) !== false ) {
	$lineNo++;

	// Test rows have five data columns plus the trailing comment;
	// everything else (comments, part headers, blank lines) is skipped.
	if( count( $col ) != 6 ) {
		continue;
	}
	$count++;
	if( $count % 100 === 0 ) echo "Count: $count\n";

	# verify that the pure PHP version is correct
	$normalized = array();
	foreach( $forms as $form ) {
		$normalize = 'normalize_form_' . strtolower( $form );
		$normalizePhp = $normalize . '_php';
		for( $i = 1; $i <= 5; $i++ ) {
			$normalized[$form][$i] = $normalize( $col[$i - 1] );
			$pure = $normalizePhp( $col[$i - 1] );
			check_assertion(
				$normalized[$form][$i] === $pure,
				"\$NF{$form}c{$i} === \$NF{$form}c{$i}p"
			);
		}
	}

	# verify the results against the expected columns
	foreach( $expectations as $rule ) {
		list( $form, $expected, $sources ) = $rule;
		foreach( $sources as $i ) {
			check_assertion(
				$col[$expected] === $normalized[$form][$i],
				"\$col[{$expected}] === \$NF{$form}c{$i}"
			);
		}
	}
}
fclose( $f );
echo "done.\n";

/**
 * Encode a code point as UTF-8.
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param $c Integer: code point
 * @return String of one to four bytes, or false if $c is above 0x10FFFF
 */
function unichr($c) {
	if ($c <= 0x7F) {
		return chr($c);
	} else if ($c <= 0x7FF) {
		return chr(0xC0 | ($c >> 6)) . chr(0x80 | ($c & 0x3F));
	} else if ($c <= 0xFFFF) {
		return chr(0xE0 | ($c >> 12)) . chr(0x80 | (($c >> 6) & 0x3F))
			. chr(0x80 | ($c & 0x3F));
	} else if ($c <= 0x10FFFF) {
		return chr(0xF0 | ($c >> 18)) . chr(0x80 | (($c >> 12) & 0x3F))
			. chr(0x80 | (($c >> 6) & 0x3F))
			. chr(0x80 | ($c & 0x3F));
	} else {
		return false;
	}
}

/**
 * Convert a space separated list of hexadecimal code points to a UTF-8 string.
 *
 * @param $c String: e.g. "0044 0307"
 * @return String
 */
function unistr($c) {
	return implode("", array_map("unichr", array_map("hexdec", explode(" ", $c))));
}

/**
 * Read the next line of the test file and split it into columns.
 *
 * @param $f Resource: file handle to read from
 * @return Array or false at end of file. Lines starting with the comment
 *   character or with "@" are returned unparsed as a single element. For
 *   other lines the array holds the UTF-8 string of every non-empty column,
 *   followed by the comment text as last element.
 */
function getRow( $f ) {
	$row = fgets( $f );
	if( $row === false ) return false;
	$row = rtrim($row);
	$pos = strpos( $row, COMMENT );
	if( $pos === 0 ) return array($row);
	// Lines starting with "@" carry no code points; feeding them to hexdec()
	// only yields garbage (and warnings about invalid characters on newer PHP).
	if( $row !== "" && $row[0] === '@' ) return array($row);
	$c = "";

	if( $pos ) {
		// The comment looks like "# (samples...) NAME". Use the LAST closing
		// parenthesis: the samples may contain ")" themselves.
		$pos2 = strrpos( $row, ")" );
		if( $pos2 !== false && $pos2 > $pos ) $c = (string)substr( $row, $pos2 + 2 );
		else $c = substr( $row, $pos );
		$row = substr( $row, 0, $pos );
	}

	$ret = array();
	foreach( explode( SEPARATOR, $row ) as $ent ) {
		if( trim( $ent ) !== "" ) {
			$ret[] = unistr($ent);
		}
	}
	$ret[] = $c;

	return $ret;
}