From 7d347a05ce025a9aef28bcf72089e1388dd48d13 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 22 Oct 2004 05:57:51 -0500 Subject: http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/ --- ConvertUTF.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'ConvertUTF.c') diff --git a/ConvertUTF.c b/ConvertUTF.c index 649fbc8..9b3deeb 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -32,6 +32,7 @@ to eliminate compiler warnings. July 2003: slight mods to back out aggressive FFFE detection. Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. See the header file "ConvertUTF.h" for complete documentation. @@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 ( } else { *target++ = (UTF16)ch; /* normal case */ } - } else if (ch > UNI_MAX_UTF16) { + } else if (ch > UNI_MAX_LEGAL_UTF32) { if (flags == strictConversion) { result = sourceIllegal; } else { @@ -166,6 +167,9 @@ if (result == sourceIllegal) { /* * Index into the table below with the first byte of a UTF-8 sequence to * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. */ static const char trailingBytesForUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed * into the first byte, depending on how many bytes follow. There are * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... six byte sequence.) + * (I.e., one byte sequence, two byte... etc.). 
Remember that sequences + * for *legal* UTF-8 will be 4 or fewer bytes total. */ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; @@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 ( if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; } @@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { /* no fall-through in this inner switch */ case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; case 0xF0: if (a < 0x90) return false; break; case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; + default: if (a < 0x80) return false; } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - if (*source > 0xF4) return false; + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } + if (*source > 0xF4) return false; return true; } @@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 ( * The cases all fall through. See "Note A" below. 
*/ switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; @@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 ( break; } } - /* Figure out how many bytes the result will require */ + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> 0x10FFFF, i.e. beyond Plane 16) into replacement chars. + */ if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; } target += bytesToWrite; @@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 ( source -= (extraBytesToRead+1); /* Back up the source pointer! */ result = targetExhausted; break; } - if (ch <= UNI_MAX_UTF32) { - /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * beyond Plane 16 (> 0x10FFFF) is illegal. + */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (flags == strictConversion) { source -= (extraBytesToRead+1); /* return to the illegal value itself */ @@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 ( } else { *target++ = ch; } - } else { /* i.e., ch > UNI_MAX_UTF32 */ + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; *target++ = UNI_REPLACEMENT_CHAR; } } -- cgit v1.2.3