diff options
Diffstat (limited to 'includes/StringUtils.php')
-rw-r--r-- | includes/StringUtils.php | 73 |
1 files changed, 59 insertions, 14 deletions
diff --git a/includes/StringUtils.php b/includes/StringUtils.php index 43275a66..f4c98f1d 100644 --- a/includes/StringUtils.php +++ b/includes/StringUtils.php @@ -24,6 +24,51 @@ * A collection of static methods to play with strings. */ class StringUtils { + + /** + * Test whether a string is valid UTF-8. + * + * The function checks for invalid byte sequences and overlong encodings, but + * not for different normalisations. + * + * This relies internally on the mbstring function mb_check_encoding() + * hardcoded to check against UTF-8. Whenever the function is not available + * we fall back to a pure PHP implementation. Setting $disableMbstring to + * true will skip the use of mb_check_encoding; this is mostly intended for + * unit testing our internal implementation. + * + * @since 1.21 + * + * @param string $value String to check + * @param boolean $disableMbstring Whether to use the pure PHP + * implementation instead of trying mb_check_encoding. Intended for unit + * testing. Default: false + * + * @return boolean Whether the given $value is a valid UTF-8 encoded string + */ + static function isUtf8( $value, $disableMbstring = false ) { + + if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) { + # no high bit set, this is pure ASCII which is de facto + # valid UTF-8 + return true; + } + + if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { + return mb_check_encoding( $value, 'UTF-8' ); + } else { + $hasUtf8 = preg_match( '/^(?> + [\x00-\x7f] + | [\xc0-\xdf][\x80-\xbf] + | [\xe0-\xef][\x80-\xbf]{2} + | [\xf0-\xf7][\x80-\xbf]{3} + | [\xf8-\xfb][\x80-\xbf]{4} + | \xfc[\x84-\xbf][\x80-\xbf]{4} + )+$/x', $value ); + return ($hasUtf8 > 0 ); + } + } + /** * Perform an operation equivalent to * @@ -65,16 +110,16 @@ class StringUtils { * memory. The delimiters are literal strings, not regular expressions. * * If the start delimiter ends with an initial substring of the end delimiter, - * e.g.
in the case of C-style comments, the behaviour differs from the model + * e.g. in the case of C-style comments, the behavior differs from the model * regex. In this implementation, the end must share no characters with the * start, so e.g. /*\/ is not considered to be both the start and end of a * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/. * - * @param $startDelim String: start delimiter - * @param $endDelim String: end delimiter + * @param string $startDelim start delimiter + * @param string $endDelim end delimiter * @param $callback Callback: function to call on each match * @param $subject String - * @param $flags String: regular expression flags + * @param string $flags regular expression flags * @throws MWException * @return string */ @@ -90,12 +135,12 @@ class StringUtils { $m = array(); while ( $inputPos < strlen( $subject ) && - preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) ) + preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) ) { $tokenOffset = $m[0][1]; if ( $m[1][0] != '' ) { if ( $foundStart && - $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 ) + $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 ) { # An end match is present at the same location $tokenType = 'end'; @@ -155,12 +200,12 @@ class StringUtils { * * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject ) * - * @param $startDelim String: start delimiter regular expression - * @param $endDelim String: end delimiter regular expression - * @param $replace String: replacement string. May contain $1, which will be + * @param string $startDelim start delimiter regular expression + * @param string $endDelim end delimiter regular expression + * @param string $replace replacement string. 
May contain $1, which will be * replaced by the text between the delimiters - * @param $subject String to search - * @param $flags String: regular expression flags + * @param string $subject to search + * @param string $flags regular expression flags * @return String: The string with the matches replaced */ static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) { @@ -360,7 +405,7 @@ class ReplacementArray { /** * Set an element of the replacement array * @param $from string - * @param $to stromg + * @param $to string */ function setPair( $from, $to ) { $this->data[$from] = $to; @@ -387,7 +432,7 @@ class ReplacementArray { * @param $from string */ function removePair( $from ) { - unset($this->data[$from]); + unset( $this->data[$from] ); $this->fss = false; } @@ -424,7 +469,7 @@ class ReplacementArray { /** * An iterator which works exactly like: - * + * * foreach ( explode( $delim, $s ) as $element ) { * ... * } |