From 7d347a05ce025a9aef28bcf72089e1388dd48d13 Mon Sep 17 00:00:00 2001
From: Luke Shumaker <lukeshu@lukeshu.com>
Date: Fri, 22 Oct 2004 05:57:51 -0500
Subject: 
 http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/

---
 ConvertUTF.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'ConvertUTF.c')

diff --git a/ConvertUTF.c b/ConvertUTF.c
index 649fbc8..9b3deeb 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -32,6 +32,7 @@
 	to eliminate compiler warnings.
     July 2003: slight mods to back out aggressive FFFE detection.
     Jan 2004: updated switches in from-UTF8 conversions.
+    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
 
     See the header file "ConvertUTF.h" for complete documentation.
 
@@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 (
 	    } else {
 		*target++ = (UTF16)ch; /* normal case */
 	    }
-	} else if (ch > UNI_MAX_UTF16) {
+	} else if (ch > UNI_MAX_LEGAL_UTF32) {
 	    if (flags == strictConversion) {
 		result = sourceIllegal;
 	    } else {
@@ -166,6 +167,9 @@ if (result == sourceIllegal) {
 /*
  * Index into the table below with the first byte of a UTF-8 sequence to
  * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
+ * table is left as-is for anyone who may want to do such conversion, which
+ * was allowed in earlier algorithms.
  */
 static const char trailingBytesForUTF8[256] = {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080
  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  * into the first byte, depending on how many bytes follow.  There are
  * as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
  */
 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 
@@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 (
 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
-	} else if (ch < (UTF32)0x200000) {  bytesToWrite = 4;
-	} else {			    bytesToWrite = 2;
+	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
+	} else {			    bytesToWrite = 3;
 					    ch = UNI_REPLACEMENT_CHAR;
 	}
 
@@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
 	switch (*source) {
 	    /* no fall-through in this inner switch */
 	    case 0xE0: if (a < 0xA0) return false; break;
+	    case 0xED: if (a > 0x9F) return false; break;
 	    case 0xF0: if (a < 0x90) return false; break;
 	    case 0xF4: if (a > 0x8F) return false; break;
-	    default:  if (a < 0x80) return false;
+	    default:   if (a < 0x80) return false;
 	}
-	case 1: if (*source >= 0x80 && *source < 0xC2) return false;
-	if (*source > 0xF4) return false;
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
     }
+    if (*source > 0xF4) return false;
     return true;
 }
 
@@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 (
 	 * The cases all fall through. See "Note A" below.
 	 */
 	switch (extraBytesToRead) {
-	    case 5: ch += *source++; ch <<= 6;
-	    case 4: ch += *source++; ch <<= 6;
+	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 	    case 3: ch += *source++; ch <<= 6;
 	    case 2: ch += *source++; ch <<= 6;
 	    case 1: ch += *source++; ch <<= 6;
@@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 (
 		break;
 	    }
 	}
-	/* Figure out how many bytes the result will require */
+	/*
+	 * Figure out how many bytes the result will require. Turn any
+	 * illegally large UTF32 values (> 0x10FFFF) into replacement chars.
+	 */
 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
-	} else if (ch < (UTF32)0x200000) {  bytesToWrite = 4;
-	} else {			    bytesToWrite = 2;
+	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
+	} else {			    bytesToWrite = 3;
 					    ch = UNI_REPLACEMENT_CHAR;
+					    result = sourceIllegal;
 	}
 	
 	target += bytesToWrite;
@@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 (
 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
 	    result = targetExhausted; break;
 	}
-	if (ch <= UNI_MAX_UTF32) {
-	    /* UTF-16 surrogate values are illegal in UTF-32 */
+	if (ch <= UNI_MAX_LEGAL_UTF32) {
+	    /*
+	     * UTF-16 surrogate values are illegal in UTF-32, and anything
+	     * beyond the last Unicode plane (> 0x10FFFF) is illegal.
+	     */
 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 		if (flags == strictConversion) {
 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
@@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 (
 	    } else {
 		*target++ = ch;
 	    }
-	} else { /* i.e., ch > UNI_MAX_UTF32 */
+	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+	    result = sourceIllegal;
 	    *target++ = UNI_REPLACEMENT_CHAR;
 	}
     }
-- 
cgit v1.2.3