Update to MediaWiki 1.24.1

author: Pierre Schmitz <pierre@archlinux.de> 2014-12-27 15:41:37 +0100
committer: Pierre Schmitz <pierre@archlinux.de> 2014-12-31 11:43:28 +0100
commit: c1f9b1f7b1b77776192048005dcc66dcf3df2bfb (patch)
tree: 2b38796e738dd74cb42ecd9bfd151803108386bc /includes/Collation.php
parent: b88ab0086858470dd1f644e64cb4e4f62bb2be9b (diff)
1 file changed, 106 insertions, 69 deletions
diff --git a/includes/Collation.php b/includes/Collation.php
index b0252c70..1c2c2db3 100644
--- a/includes/Collation.php
+++ b/includes/Collation.php
@@ -21,7 +21,7 @@
  */
 
 abstract class Collation {
-	static $instance;
+	private static $instance;
 
 	/**
 	 * @return Collation
@@ -36,7 +36,7 @@ abstract class Collation {
 
 	/**
 	 * @throws MWException
-	 * @param $collationName string
+	 * @param string $collationName
 	 * @return Collation
 	 */
 	static function factory( $collationName ) {
@@ -47,6 +47,10 @@ abstract class Collation {
 				return new IdentityCollation;
 			case 'uca-default':
 				return new IcuCollation( 'root' );
+			case 'xx-uca-ckb':
+				return new CollationCkb;
+			case 'xx-uca-et':
+				return new CollationEt;
 			default:
 				$match = array();
 				if ( preg_match( '/^uca-([a-z@=-]+)$/', $collationName, $match ) ) {
@@ -106,7 +110,8 @@ abstract class Collation {
 }
 
 class UppercaseCollation extends Collation {
-	var $lang;
+	private $lang;
+
 	function __construct() {
 		// Get a language object so that we can use the generic UTF-8 uppercase
 		// function there
@@ -149,10 +154,22 @@ class IdentityCollation extends Collation {
 }
 
 class IcuCollation extends Collation {
-	const FIRST_LETTER_VERSION = 1;
+	const FIRST_LETTER_VERSION = 2;
+
+	/** @var Collator */
+	private $primaryCollator;
+
+	/** @var Collator */
+	private $mainCollator;
+
+	/** @var string */
+	private $locale;
 
-	var $primaryCollator, $mainCollator, $locale;
-	var $firstLetterData;
+	/** @var Language */
+	protected $digitTransformLanguage;
+
+	/** @var array */
+	private $firstLetterData;
 
 	/**
 	 * Unified CJK blocks.
@@ -163,7 +180,7 @@ class IcuCollation extends Collation {
 	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
 	 * blocks are not included here, because they are smaller and more useful.
 	 */
-	static $cjkBlocks = array(
+	private static $cjkBlocks = array(
 		array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
 		array( 0x2F00, 0x2FDF ), // Kangxi Radicals
 		array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
@@ -202,14 +219,19 @@ class IcuCollation extends Collation {
 	 * Empty arrays are intended; this signifies that the data for the language is
 	 * available and that there are, in fact, no additional letters to consider.
 	 */
-	static $tailoringFirstLetters = array(
+	private static $tailoringFirstLetters = array(
 		// Verified by native speakers
 		'be' => array( "Ё" ),
 		'be-tarask' => array( "Ё" ),
+		'cy' => array( "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ),
 		'en' => array(),
+		'fa' => array( "آ", "ء", "ه" ),
 		'fi' => array( "Å", "Ä", "Ö" ),
+		'fr' => array(),
 		'hu' => array( "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ),
+		'is' => array( "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ),
 		'it' => array(),
+		'lv' => array( "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ),
 		'pl' => array( "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ),
 		'pt' => array(),
 		'ru' => array(),
@@ -227,18 +249,15 @@ class IcuCollation extends Collation {
 		'ca' => array(),
 		'co' => array(),
 		'cs' => array( "Č", "Ch", "Ř", "Š", "Ž" ),
-		'cy' => array( "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ),
 		'da' => array( "Æ", "Ø", "Å" ),
 		'de' => array(),
 		'dsb' => array( "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ),
 		'el' => array(),
 		'eo' => array( "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ),
 		'es' => array( "Ñ" ),
-		'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ),
+		'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ), // added W for CollationEt (xx-uca-et)
 		'eu' => array( "Ñ" ),
-		'fa' => array( "آ", "ء", "ه" ),
 		'fo' => array( "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ),
-		'fr' => array(),
 		'fur' => array( "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ),
 		'fy' => array(),
 		'ga' => array(),
@@ -246,7 +265,6 @@ class IcuCollation extends Collation {
 		'gl' => array( "Ch", "Ll", "Ñ" ),
 		'hr' => array( "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ),
 		'hsb' => array( "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ),
-		'is' => array( "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ),
 		'kk' => array( "Ү", "І" ),
 		'kl' => array( "Æ", "Ø", "Å" ),
 		'ku' => array( "Ç", "Ê", "Î", "Ş", "Û" ),
@@ -254,7 +272,6 @@ class IcuCollation extends Collation {
 		'la' => array(),
 		'lb' => array(),
 		'lt' => array( "Č", "Š", "Ž" ),
-		'lv' => array( "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ),
 		'mk' => array(),
 		'mo' => array( "Ă", "Â", "Î", "Ş", "Ţ" ),
 		'mt' => array( "Ċ", "Ġ", "Għ", "Ħ", "Ż" ),
@@ -284,7 +301,12 @@ class IcuCollation extends Collation {
 			throw new MWException( 'An ICU collation was requested, ' .
 				'but the intl extension is not available.' );
 		}
+
 		$this->locale = $locale;
+		// Drop everything after the '@' in locale's name
+		$localeParts = explode( '@', $locale );
+		$this->digitTransformLanguage = Language::factory( $locale === 'root' ? 'en' : $localeParts[0] );
+
 		$this->mainCollator = Collator::create( $locale );
 		if ( !$this->mainCollator ) {
 			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
@@ -319,16 +341,14 @@ class IcuCollation extends Collation {
 
 		// Check for CJK
 		$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
-		if ( ord( $firstChar ) > 0x7f
-			&& self::isCjk( utf8ToCodepoint( $firstChar ) ) )
-		{
+		if ( ord( $firstChar ) > 0x7f && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) {
 			return $firstChar;
 		}
 
 		$sortKey = $this->getPrimarySortKey( $string );
 
 		// Do a binary search to find the correct letter to sort under
-		$min = $this->findLowerBound(
+		$min = ArrayUtils::findLowerBound(
 			array( $this, 'getSortKeyByLetterIndex' ),
 			$this->getFirstLetterCount(),
 			'strcmp',
@@ -347,7 +367,12 @@ class IcuCollation extends Collation {
 		}
 
 		$cache = wfGetCache( CACHE_ANYTHING );
-		$cacheKey = wfMemcKey( 'first-letters', $this->locale );
+		$cacheKey = wfMemcKey(
+			'first-letters',
+			$this->locale,
+			$this->digitTransformLanguage->getCode(),
+			self::getICUVersion()
+		);
 		$cacheEntry = $cache->get( $cacheKey );
 
 		if ( $cacheEntry && isset( $cacheEntry['version'] )
@@ -367,6 +392,12 @@ class IcuCollation extends Collation {
 			if ( isset( self::$tailoringFirstLetters['-' . $this->locale] ) ) {
 				$letters = array_diff( $letters, self::$tailoringFirstLetters['-' . $this->locale] );
 			}
+			// Apply digit transforms
+			$digits = array( '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' );
+			$letters = array_diff( $letters, $digits );
+			foreach ( $digits as $digit ) {
+				$letters[] = $this->digitTransformLanguage->formatNum( $digit, true );
+			}
 		} else {
 			$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
 			if ( $letters === false ) {
@@ -459,7 +490,7 @@ class IcuCollation extends Collation {
 			$prev = $trimmedKey;
 		}
 		foreach ( $duplicatePrefixes as $badKey ) {
-			wfDebug( "Removing '{$letterMap[$badKey]}' from first letters." );
+			wfDebug( "Removing '{$letterMap[$badKey]}' from first letters.\n" );
 			unset( $letterMap[$badKey] );
 			// This code assumes that unsetting does not change sort order.
 		}
@@ -499,53 +530,6 @@ class IcuCollation extends Collation {
 		return count( $this->firstLetterData['chars'] );
 	}
 
-	/**
-	 * Do a binary search, and return the index of the largest item that sorts
-	 * less than or equal to the target value.
-	 *
-	 * @param array $valueCallback A function to call to get the value with
-	 *     a given array index.
-	 * @param int $valueCount The number of items accessible via $valueCallback,
-	 *     indexed from 0 to $valueCount - 1
-	 * @param array $comparisonCallback A callback to compare two values, returning
-	 *     -1, 0 or 1 in the style of strcmp().
-	 * @param string $target The target value to find.
-	 *
-	 * @return int|bool The item index of the lower bound, or false if the target value
-	 *     sorts before all items.
-	 */
-	function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
-		if ( $valueCount === 0 ) {
-			return false;
-		}
-
-		$min = 0;
-		$max = $valueCount;
-		do {
-			$mid = $min + ( ( $max - $min ) >> 1 );
-			$item = call_user_func( $valueCallback, $mid );
-			$comparison = call_user_func( $comparisonCallback, $target, $item );
-			if ( $comparison > 0 ) {
-				$min = $mid;
-			} elseif ( $comparison == 0 ) {
-				$min = $mid;
-				break;
-			} else {
-				$max = $mid;
-			}
-		} while ( $min < $max - 1 );
-
-		if ( $min == 0 ) {
-			$item = call_user_func( $valueCallback, $min );
-			$comparison = call_user_func( $comparisonCallback, $target, $item );
-			if ( $comparison < 0 ) {
-				// Before the first item
-				return false;
-			}
-		}
-		return $min;
-	}
-
 	static function isCjk( $codepoint ) {
 		foreach ( self::$cjkBlocks as $block ) {
 			if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
@@ -565,7 +549,7 @@ class IcuCollation extends Collation {
 	 * This function will return false on older PHPs.
 	 *
 	 * @since 1.21
-	 * @return string|false
+	 * @return string|bool
 	 */
 	static function getICUVersion() {
 		return defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : false;
@@ -576,7 +560,7 @@ class IcuCollation extends Collation {
 	 * currently in use, or false when it can't be determined.
 	 *
 	 * @since 1.21
-	 * @return string|false
+	 * @return string|bool
 	 */
 	static function getUnicodeVersionForICU() {
 		$icuVersion = IcuCollation::getICUVersion();
@@ -606,3 +590,56 @@ class IcuCollation extends Collation {
 		}
 	}
 }
+
+/**
+ * Workaround for ICU's lack of support for the Sorani Kurdish / Central Kurdish language ('ckb').
+ *
+ * Uses the collation rules of Persian / Farsi ('fa'), but formats digits with the 'ckb' language.
+ */
+class CollationCkb extends IcuCollation {
+	function __construct() {
+		// Passing 'fa' sets $locale and the collators, which determine the actual sorting order
+		parent::__construct( 'fa' );
+		// Override the 'fa' digit language set by the parent constructor; used by #getFirstLetterData()
+		$this->digitTransformLanguage = Language::factory( 'ckb' );
+	}
+}
+
+/**
+ * Workaround for incorrect collation of the Estonian language ('et') in ICU (bug 54168).
+ *
+ * 'W' and 'V' should not be considered the same letter for the purposes of collation in modern
+ * Estonian. We work around this by replacing 'W' and 'w' with 'ᴡ' (U+1D21 'LATIN LETTER SMALL
+ * CAPITAL W') before computing sort keys and first letters; it is collated like 'W' and is not
+ * tailored to have the same primary weight as 'V' in Estonian.
+ */
+class CollationEt extends IcuCollation {
+	function __construct() {
+		parent::__construct( 'et' );
+	}
+
+	private static function mangle( $string ) {
+		return str_replace(
+			array( 'w', 'W' ),
+			'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W'
+			$string
+		);
+	}
+
+	private static function unmangle( $string ) {
+		// Casing data is lost: 'w' and 'W' both come back as 'W' (as does any literal 'ᴡ')
+		return str_replace(
+			'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W'
+			'W',
+			$string
+		);
+	}
+
+	function getSortKey( $string ) {
+		return parent::getSortKey( self::mangle( $string ) );
+	}
+
+	function getFirstLetter( $string ) {
+		return self::unmangle( parent::getFirstLetter( self::mangle( $string ) ) );
+	}
+}
author	Pierre Schmitz <pierre@archlinux.de>	2014-12-27 15:41:37 +0100
committer	Pierre Schmitz <pierre@archlinux.de>	2014-12-31 11:43:28 +0100
commit	c1f9b1f7b1b77776192048005dcc66dcf3df2bfb (patch)
tree	2b38796e738dd74cb42ecd9bfd151803108386bc /includes/Collation.php
parent	b88ab0086858470dd1f644e64cb4e4f62bb2be9b (diff)