http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/BETA/CVTUTF-1-3

author: Luke Shumaker <lukeshu@lukeshu.com> 2004-10-22 05:57:51 -0500
committer: Luke Shumaker <lukeshu@lukeshu.com> 2004-10-22 05:57:51 -0500
commit: 7d347a05ce025a9aef28bcf72089e1388dd48d13 (patch)
tree: f2eb4b50da34c4823dc4f0f2141323829ff924e0
parent: 766942acf8f0c0d9ef6c16ffbdedefdfda0af4b2 (diff)
6 files changed, 110 insertions, 36 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 4a86b4a..08cfecf 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,7 +1,7 @@
-CVTUTF7.C               2004-01-06 17:42
-CVTUTF7.H               2004-01-06 17:42
-ConvertUTF.c            2004-01-06 17:42
-ConvertUTF.h            2004-01-06 17:42
-ExpectedOutput.txt      2004-01-06 17:42
-harness.c               2004-01-06 17:42
-readme.txt              2004-01-06 17:42
+CVTUTF7.C               2004-10-19 16:05
+CVTUTF7.H               2004-10-19 16:05
+ConvertUTF.c            2004-10-19 16:05
+ConvertUTF.h            2004-10-19 16:05
+ExpectedOutput.txt      2004-10-19 16:05
+harness.c               2004-10-19 16:05
+readme.txt              2004-10-19 16:08
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 649fbc8..9b3deeb 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -32,6 +32,7 @@
 	to eliminate compiler warnings.
     July 2003: slight mods to back out aggressive FFFE detection.
     Jan 2004: updated switches in from-UTF8 conversions.
+    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
 
     See the header file "ConvertUTF.h" for complete documentation.
 
@@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 (
 	    } else {
 		*target++ = (UTF16)ch; /* normal case */
 	    }
-	} else if (ch > UNI_MAX_UTF16) {
+	} else if (ch > UNI_MAX_LEGAL_UTF32) {
 	    if (flags == strictConversion) {
 		result = sourceIllegal;
 	    } else {
@@ -166,6 +167,9 @@ if (result == sourceIllegal) {
 /*
  * Index into the table below with the first byte of a UTF-8 sequence to
  * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
+ * table is left as-is for anyone who may want to do such conversions, which
+ * were allowed in earlier algorithms.
  */
 static const char trailingBytesForUTF8[256] = {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080
  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  * into the first byte, depending on how many bytes follow.  There are
  * as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
  */
 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 
@@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 (
 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
-	} else if (ch < (UTF32)0x200000) {  bytesToWrite = 4;
-	} else {			    bytesToWrite = 2;
+	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
+	} else {			    bytesToWrite = 3;
 					    ch = UNI_REPLACEMENT_CHAR;
 	}
 
@@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
 	switch (*source) {
 	    /* no fall-through in this inner switch */
 	    case 0xE0: if (a < 0xA0) return false; break;
+	    case 0xED: if (a > 0x9F) return false; break;
 	    case 0xF0: if (a < 0x90) return false; break;
 	    case 0xF4: if (a > 0x8F) return false; break;
-	    default:  if (a < 0x80) return false;
+	    default:   if (a < 0x80) return false;
 	}
-	case 1: if (*source >= 0x80 && *source < 0xC2) return false;
-	if (*source > 0xF4) return false;
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
     }
+    if (*source > 0xF4) return false;
     return true;
 }
 
@@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 (
 	 * The cases all fall through. See "Note A" below.
 	 */
 	switch (extraBytesToRead) {
-	    case 5: ch += *source++; ch <<= 6;
-	    case 4: ch += *source++; ch <<= 6;
+	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 	    case 3: ch += *source++; ch <<= 6;
 	    case 2: ch += *source++; ch <<= 6;
 	    case 1: ch += *source++; ch <<= 6;
@@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 (
 		break;
 	    }
 	}
-	/* Figure out how many bytes the result will require */
+	/*
+	 * Figure out how many bytes the result will require. Turn any illegally
+	 * large UTF-32 values (> 0x10FFFF, past Plane 16) into replacement chars.
+	 */
 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
-	} else if (ch < (UTF32)0x200000) {  bytesToWrite = 4;
-	} else {			    bytesToWrite = 2;
+	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
+	} else {			    bytesToWrite = 3;
 					    ch = UNI_REPLACEMENT_CHAR;
+					    result = sourceIllegal;
 	}
 	
 	target += bytesToWrite;
@@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 (
 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
 	    result = targetExhausted; break;
 	}
-	if (ch <= UNI_MAX_UTF32) {
-	    /* UTF-16 surrogate values are illegal in UTF-32 */
+	if (ch <= UNI_MAX_LEGAL_UTF32) {
+	    /*
+	     * UTF-16 surrogate values are illegal in UTF-32, and anything
+	     * beyond Plane 16 (> 0x10FFFF) is illegal.
+	     */
 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 		if (flags == strictConversion) {
 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
@@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 (
 	    } else {
 		*target++ = ch;
 	    }
-	} else { /* i.e., ch > UNI_MAX_UTF32 */
+	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+	    result = sourceIllegal;
 	    *target++ = UNI_REPLACEMENT_CHAR;
 	}
     }
diff --git a/ConvertUTF.h b/ConvertUTF.h
index 429ab40..e264915 100644
--- a/ConvertUTF.h
+++ b/ConvertUTF.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
  * 
  * Disclaimer
  * 
@@ -97,6 +97,7 @@ typedef unsigned char	Boolean; /* 0 or 1 */
 #define UNI_MAX_BMP (UTF32)0x0000FFFF
 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
 
 typedef enum {
 	conversionOK, 		/* conversion successful */
diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt
index cf13a6a..e09d844 100644
--- a/ExpectedOutput.txt
+++ b/ExpectedOutput.txt
@@ -1,4 +1,5 @@
 Three tests of round-trip conversions will be performed.
+One test of illegal UTF-32 will be peroformed.
 Two illegal result messages are expected; one in test 02A; one in test 03A.
 These are for tests of Surrogate conversion.
 
@@ -15,3 +16,6 @@ sourceIllegal	Test03A for 55296 (0xd800); output ; result 3
 !!! Test03A: note expected illegal result for 0x0000D800
 ******** Test03 succeeded without error. ********
 
+Begin Test04
+******** Test04 succeeded without error. ********
+
diff --git a/harness.c b/harness.c
index 1e3dfb9..25b3e9e 100644
--- a/harness.c
+++ b/harness.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
  * 
  * Disclaimer
  * 
@@ -34,6 +34,8 @@
  * Sept 19, 2002: Corrected error on line 234:  utf16_buf[2] becomes utf16_result[2]
  * 	per report from Iain Murray.
  * July 3, 2003: Updated printout message.
+ * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
+ *	illegal surrogate use in UTF-8, per report from Frank Tang.
  *
  */
 
@@ -54,7 +56,9 @@
 	00-7F				  0000-  007F
 	C2-DF	80-BF			  0080-  07FF
 	E0	A0-BF	80-BF		  0800-  0FFF
-	E1-EF	80-BF	80-BF		  1000-  FFFF
+	E1-EC   80-BF   80-BF             1000-  CFFF
+	ED      80-9F   80-BF             D000-  D7FF
+	EE-EF   80-BF   80-BF             E000-  FFFF
 	F0	90-BF	80-BF	80-BF	 10000- 3FFFF
 	F1-F3	80-BF	80-BF	80-BF	 40000- FFFFF
 	F4	80-8F	80-BF	80-BF	100000-10FFFF
@@ -88,9 +92,16 @@ struct utf8_test utf8_testData[] = {
     { 0,	2,	{ 0xC0, 0xAF, 0x00, 0x00, 0x00 }},	/* 15 */
     { 0,	3,	{ 0xE0, 0x9F, 0x80, 0x00, 0x00 }},	/* 16 */
     { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC1, 0x00 }},	/* 17 */
-/* for all > 17 use "short" buffer lengths to detect over-run */
+
+    { 1,	3,	{ 0xED, 0x9F, 0xBF, 0x00, 0x00 }},	/* 18 */
+    { 1,	3,	{ 0xEE, 0x80, 0x80, 0x00, 0x00 }},	/* 19 */
+    { 0,	3,	{ 0xED, 0xA0, 0x80, 0x00, 0x00 }},	/* 20 */
+    { 0,	3,	{ 0xED, 0xBF, 0xBF, 0x00, 0x00 }},	/* 21 */
+
+/* for all > 21 use "short" buffer lengths to detect over-run */
     { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC3, 0x00 }},	/* 18 use short buflen */
-    { 0,	0,	{ 0x00, 0x00, 0x00, 0x00, 0x00 }}
+    { 0,	0,	{ 0x00, 0x00, 0x00, 0x00, 0x00 }},
+
 };
 
 int test01() {
@@ -103,8 +114,8 @@ int test01() {
 	for (i = 0; utf8_testData[i].utf8_len; i++) {
 		wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
 		gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
-		/* use truncated length for tests over 17 */
-		if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
+		/* use truncated length for tests over 21 */
+		if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
 		gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
 		if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
 			printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
@@ -380,10 +391,46 @@ int test03() {
 	return 1;
 }
 
+/* ---------------------------------------------------------------------
+	test04 - Test conversion of an illegal UTF-32 value (> 0x10FFFF) to UTF-8.
+	It becomes UNI_REPLACEMENT_CHAR; only the sourceIllegal result is checked.
+
+   --------------------------------------------------------------------- */
+
+int test04() {
+	int i, n;
+	ConversionResult result;
+	UTF32 utf32_buf[2];
+	UTF8 utf8_buf[8];
+	UTF32 *utf32SourceStart, *utf32TargetStart;
+	UTF8 *utf8SourceStart, *utf8TargetStart;
+
+	printf("Begin Test04\n"); fflush(stdout);
+
+	i = 0x10FFFF + 21; /* an arbitrary value > legal */
+
+	utf32_buf[0] = i; utf32_buf[1] = 0;
+	for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+	utf32SourceStart = utf32_buf;
+	utf8TargetStart = utf8_buf;
+
+	/*
+	 * Test UTF32 -> UTF8, with legality check on.
+	 */
+	result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+	if (result != sourceIllegal) {
+		fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+	}
+
+	return 1;
+}
+
 /* --------------------------------------------------------------------- */
 
 main() {
 	printf("Three tests of round-trip conversions will be performed.\n");
+	printf("One test of illegal UTF-32 will be peroformed.\n");
 	printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
 	printf("These are for tests of Surrogate conversion.\n\n");
 	fflush(stdout);
@@ -393,4 +440,6 @@ main() {
 	else { printf("-------- Test02 failed. --------\n\n"); }
 	if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
 	else { printf("-------- Test03 failed. --------\n\n"); }
+	if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
+	else { printf("-------- Test04 failed. --------\n\n"); }
 }
diff --git a/readme.txt b/readme.txt
index 722c6f4..b9f17fb 100644
--- a/readme.txt
+++ b/readme.txt
@@ -3,12 +3,12 @@ The accompanying C source code file "ConvertUTF.c" and the associated header
 file "ConvertUTF.h" provide for conversion between various transformation
 formats of Unicode characters.  The following conversions are supported:
 
-	UCS4 to UTF16
-	UCS4 to UTF8
-	UTF16 to UCS4
-	UTF16 to UTF8
-	UTF8 to UTF16
-	UTF8 to UCS4
+	UTF-32 to UTF-16
+	UTF-32 to UTF-8
+	UTF-16 to UTF-32
+	UTF-16 to UTF-8
+	UTF-8 to UTF-16
+	UTF-8 to UTF-32
 
 In addition, there is a test harness which runs various tests.
 
@@ -33,7 +33,11 @@ Version 1.2: corrected switch statements associated with "extraBytesToRead"
 	UTF8, but the table and this code has always catered for those,
 	cases since at one time they were legal.
 
+Version 1.3: Updated UTF-8 legality check;
+	updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions;
+	updated UTF-8 legality tests in harness.c.
+ 
 
-Last update: January 6, 2004
+Last update: October 19, 2004
author	Luke Shumaker <lukeshu@lukeshu.com>	2004-10-22 05:57:51 -0500
committer	Luke Shumaker <lukeshu@lukeshu.com>	2004-10-22 05:57:51 -0500
commit	7d347a05ce025a9aef28bcf72089e1388dd48d13 (patch)
tree	f2eb4b50da34c4823dc4f0f2141323829ff924e0
parent	766942acf8f0c0d9ef6c16ffbdedefdfda0af4b2 (diff)