diff options
author | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
---|---|---|
committer | Pierre Schmitz <pierre@archlinux.de> | 2011-06-22 11:28:20 +0200 |
commit | 9db190c7e736ec8d063187d4241b59feaf7dc2d1 (patch) | |
tree | 46d1a0dee7febef5c2d57a9f7b972be16a163b3d /includes/Collation.php | |
parent | 78677c7bbdcc9739f6c10c75935898a20e1acd9e (diff) |
update to MediaWiki 1.17.0
Diffstat (limited to 'includes/Collation.php')
-rw-r--r-- | includes/Collation.php | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/includes/Collation.php b/includes/Collation.php new file mode 100644 index 00000000..f00b568f --- /dev/null +++ b/includes/Collation.php @@ -0,0 +1,307 @@ +<?php + +abstract class Collation { + static $instance; + + static function singleton() { + if ( !self::$instance ) { + global $wgCategoryCollation; + self::$instance = self::factory( $wgCategoryCollation ); + } + return self::$instance; + } + + static function factory( $collationName ) { + switch( $collationName ) { + case 'uppercase': + return new UppercaseCollation; + case 'uca-default': + return new IcuCollation( 'root' ); + default: + throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" ); + } + } + + /** + * Given a string, convert it to a (hopefully short) key that can be used + * for efficient sorting. A binary sort according to the sortkeys + * corresponds to a logical sort of the corresponding strings. Current + * code expects that a line feed character should sort before all others, but + * has no other particular expectations (and that one can be changed if + * necessary). + * + * @param string $string UTF-8 string + * @return string Binary sortkey + */ + abstract function getSortKey( $string ); + + /** + * Given a string, return the logical "first letter" to be used for + * grouping on category pages and so on. This has to be coordinated + * carefully with convertToSortkey(), or else the sorted list might jump + * back and forth between the same "initial letters" or other pathological + * behavior. For instance, if you just return the first character, but "a" + * sorts the same as "A" based on getSortKey(), then you might get a + * list like + * + * == A == + * * [[Aardvark]] + * + * == a == + * * [[antelope]] + * + * == A == + * * [[Ape]] + * + * etc., assuming for the sake of argument that $wgCapitalLinks is false. + * + * @param string $string UTF-8 string + * @return string UTF-8 string corresponding to the first letter of input + */ + abstract function getFirstLetter( $string ); +} + +class UppercaseCollation extends Collation { + var $lang; + function __construct() { + // Get a language object so that we can use the generic UTF-8 uppercase + // function there + $this->lang = Language::factory( 'en' ); + } + + function getSortKey( $string ) { + return $this->lang->uc( $string ); + } + + function getFirstLetter( $string ) { + if ( $string[0] == "\0" ) { + $string = substr( $string, 1 ); + } + return $this->lang->ucfirst( $this->lang->firstChar( $string ) ); + } +} + +class IcuCollation extends Collation { + var $primaryCollator, $mainCollator, $locale; + var $firstLetterData; + + /** + * Unified CJK blocks. + * + * The same definition of a CJK block must be used for both Collation and + * generateCollationData.php. These blocks are omitted from the first + * letter data, as an optimisation measure and because the default UCA table + * is pretty useless for sorting Chinese text anyway. Japanese and Korean + * blocks are not included here, because they are smaller and more useful. + */ + static $cjkBlocks = array( + array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement + array( 0x2F00, 0x2FDF ), // Kangxi Radicals + array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters + array( 0x3000, 0x303F ), // CJK Symbols and Punctuation + array( 0x31C0, 0x31EF ), // CJK Strokes + array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months + array( 0x3300, 0x33FF ), // CJK Compatibility + array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A + array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs + array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs + array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms + array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B + array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C + array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D + array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement + ); + + const RECORD_LENGTH = 14; + + function __construct( $locale ) { + if ( !extension_loaded( 'intl' ) ) { + throw new MWException( 'An ICU collation was requested, ' . + 'but the intl extension is not available.' ); + } + $this->locale = $locale; + $this->mainCollator = Collator::create( $locale ); + if ( !$this->mainCollator ) { + throw new MWException( "Invalid ICU locale specified for collation: $locale" ); + } + + $this->primaryCollator = Collator::create( $locale ); + $this->primaryCollator->setStrength( Collator::PRIMARY ); + } + + function getSortKey( $string ) { + // intl extension produces non null-terminated + // strings. Appending '' fixes it so that it doesn't generate + // a warning on each access in debug php. + wfSuppressWarnings(); + $key = $this->mainCollator->getSortKey( $string ) . ''; + wfRestoreWarnings(); + return $key; + } + + function getPrimarySortKey( $string ) { + wfSuppressWarnings(); + $key = $this->primaryCollator->getSortKey( $string ) . ''; + wfRestoreWarnings(); + return $key; + } + + function getFirstLetter( $string ) { + $string = strval( $string ); + if ( $string === '' ) { + return ''; + } + + // Check for CJK + $firstChar = mb_substr( $string, 0, 1, 'UTF-8' ); + if ( ord( $firstChar ) > 0x7f + && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) + { + return $firstChar; + } + + $sortKey = $this->getPrimarySortKey( $string ); + + // Do a binary search to find the correct letter to sort under + $min = $this->findLowerBound( + array( $this, 'getSortKeyByLetterIndex' ), + $this->getFirstLetterCount(), + 'strcmp', + $sortKey ); + + if ( $min === false ) { + // Before the first letter + return ''; + } + return $this->getLetterByIndex( $min ); + } + + function getFirstLetterData() { + if ( $this->firstLetterData !== null ) { + return $this->firstLetterData; + } + + $cache = wfGetCache( CACHE_ANYTHING ); + $cacheKey = wfMemcKey( 'first-letters', $this->locale ); + $cacheEntry = $cache->get( $cacheKey ); + + if ( $cacheEntry ) { + $this->firstLetterData = $cacheEntry; + return $this->firstLetterData; + } + + // Generate data from serialized data file + + $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); + if ( $letters === false ) { + throw new MWException( "MediaWiki does not support ICU locale " . + "\"{$this->locale}\"" ); + } + + // Sort the letters. + // + // It's impossible to have the precompiled data file properly sorted, + // because the sort order changes depending on ICU version. If the + // array is not properly sorted, the binary search will return random + // results. + // + // We also take this opportunity to remove primary collisions. + $letterMap = array(); + foreach ( $letters as $letter ) { + $key = $this->getPrimarySortKey( $letter ); + if ( isset( $letterMap[$key] ) ) { + // Primary collision + // Keep whichever one sorts first in the main collator + if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) { + $letterMap[$key] = $letter; + } + } else { + $letterMap[$key] = $letter; + } + } + ksort( $letterMap, SORT_STRING ); + $data = array( + 'chars' => array_values( $letterMap ), + 'keys' => array_keys( $letterMap ) + ); + + // Reduce memory usage before caching + unset( $letterMap ); + + // Save to cache + $this->firstLetterData = $data; + $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ ); + return $data; + } + + function getLetterByIndex( $index ) { + if ( $this->firstLetterData === null ) { + $this->getFirstLetterData(); + } + return $this->firstLetterData['chars'][$index]; + } + + function getSortKeyByLetterIndex( $index ) { + if ( $this->firstLetterData === null ) { + $this->getFirstLetterData(); + } + return $this->firstLetterData['keys'][$index]; + } + + function getFirstLetterCount() { + if ( $this->firstLetterData === null ) { + $this->getFirstLetterData(); + } + return count( $this->firstLetterData['chars'] ); + } + + /** + * Do a binary search, and return the index of the largest item that sorts + * less than or equal to the target value. + * + * @param $valueCallback A function to call to get the value with + * a given array index. + * @param $valueCount The number of items accessible via $valueCallback, + * indexed from 0 to $valueCount - 1 + * @param $comparisonCallback A callback to compare two values, returning + * -1, 0 or 1 in the style of strcmp(). + * @param $target The target value to find. + * + * @return The item index of the lower bound, or false if the target value + * sorts before all items. + */ + function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) { + $min = 0; + $max = $valueCount - 1; + do { + $mid = $min + ( ( $max - $min ) >> 1 ); + $item = call_user_func( $valueCallback, $mid ); + $comparison = call_user_func( $comparisonCallback, $target, $item ); + if ( $comparison > 0 ) { + $min = $mid; + } elseif ( $comparison == 0 ) { + $min = $mid; + break; + } else { + $max = $mid; + } + } while ( $min < $max - 1 ); + + if ( $min == 0 && $max == 0 && $comparison > 0 ) { + // Before the first item + return false; + } else { + return $min; + } + } + + static function isCjk( $codepoint ) { + foreach ( self::$cjkBlocks as $block ) { + if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) { + return true; + } + } + return false; + } +} + |