summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2004-10-22 05:57:51 -0500
committerLuke Shumaker <lukeshu@lukeshu.com>2004-10-22 05:57:51 -0500
commit7d347a05ce025a9aef28bcf72089e1388dd48d13 (patch)
treef2eb4b50da34c4823dc4f0f2141323829ff924e0
parent766942acf8f0c0d9ef6c16ffbdedefdfda0af4b2 (diff)
http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/BETA/CVTUTF-1-3
-rw-r--r--.metadata.txt14
-rw-r--r--ConvertUTF.c46
-rw-r--r--ConvertUTF.h3
-rw-r--r--ExpectedOutput.txt4
-rw-r--r--harness.c61
-rw-r--r--readme.txt18
6 files changed, 110 insertions, 36 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 4a86b4a..08cfecf 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,7 +1,7 @@
-CVTUTF7.C 2004-01-06 17:42
-CVTUTF7.H 2004-01-06 17:42
-ConvertUTF.c 2004-01-06 17:42
-ConvertUTF.h 2004-01-06 17:42
-ExpectedOutput.txt 2004-01-06 17:42
-harness.c 2004-01-06 17:42
-readme.txt 2004-01-06 17:42
+CVTUTF7.C 2004-10-19 16:05
+CVTUTF7.H 2004-10-19 16:05
+ConvertUTF.c 2004-10-19 16:05
+ConvertUTF.h 2004-10-19 16:05
+ExpectedOutput.txt 2004-10-19 16:05
+harness.c 2004-10-19 16:05
+readme.txt 2004-10-19 16:08
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 649fbc8..9b3deeb 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -32,6 +32,7 @@
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
+ Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
See the header file "ConvertUTF.h" for complete documentation.
@@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 (
} else {
*target++ = (UTF16)ch; /* normal case */
}
- } else if (ch > UNI_MAX_UTF16) {
+ } else if (ch > UNI_MAX_LEGAL_UTF32) {
if (flags == strictConversion) {
result = sourceIllegal;
} else {
@@ -166,6 +167,9 @@ if (result == sourceIllegal) {
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
* into the first byte, depending on how many bytes follow. There are
* as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
*/
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
@@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 (
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
}
@@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
- default: if (a < 0x80) return false;
+ default: if (a < 0x80) return false;
}
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
- if (*source > 0xF4) return false;
+
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
}
+ if (*source > 0xF4) return false;
return true;
}
@@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 (
* The cases all fall through. See "Note A" below.
*/
switch (extraBytesToRead) {
- case 5: ch += *source++; ch <<= 6;
- case 4: ch += *source++; ch <<= 6;
+ case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
case 3: ch += *source++; ch <<= 6;
case 2: ch += *source++; ch <<= 6;
case 1: ch += *source++; ch <<= 6;
@@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 (
break;
}
}
- /* Figure out how many bytes the result will require */
+ /*
+ * Figure out how many bytes the result will require. Turn any illegally
+ * large UTF-32 values (> 0x10FFFF, beyond the 17 planes) into replacement chars.
+ */
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
+ } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
+ result = sourceIllegal;
}
target += bytesToWrite;
@@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 (
source -= (extraBytesToRead+1); /* Back up the source pointer! */
result = targetExhausted; break;
}
- if (ch <= UNI_MAX_UTF32) {
- /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch <= UNI_MAX_LEGAL_UTF32) {
+ /*
+ * UTF-16 surrogate values are illegal in UTF-32, and anything
+ * beyond the 17 Unicode planes (> 0x10FFFF) is illegal.
+ */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
if (flags == strictConversion) {
source -= (extraBytesToRead+1); /* return to the illegal value itself */
@@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 (
} else {
*target++ = ch;
}
- } else { /* i.e., ch > UNI_MAX_UTF32 */
+ } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+ result = sourceIllegal;
*target++ = UNI_REPLACEMENT_CHAR;
}
}
diff --git a/ConvertUTF.h b/ConvertUTF.h
index 429ab40..e264915 100644
--- a/ConvertUTF.h
+++ b/ConvertUTF.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
@@ -97,6 +97,7 @@ typedef unsigned char Boolean; /* 0 or 1 */
#define UNI_MAX_BMP (UTF32)0x0000FFFF
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
typedef enum {
conversionOK, /* conversion successful */
diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt
index cf13a6a..e09d844 100644
--- a/ExpectedOutput.txt
+++ b/ExpectedOutput.txt
@@ -1,4 +1,5 @@
Three tests of round-trip conversions will be performed.
+One test of illegal UTF-32 will be peroformed.
Two illegal result messages are expected; one in test 02A; one in test 03A.
These are for tests of Surrogate conversion.
@@ -15,3 +16,6 @@ sourceIllegal Test03A for 55296 (0xd800); output ; result 3
!!! Test03A: note expected illegal result for 0x0000D800
******** Test03 succeeded without error. ********
+Begin Test04
+******** Test04 succeeded without error. ********
+
diff --git a/harness.c b/harness.c
index 1e3dfb9..25b3e9e 100644
--- a/harness.c
+++ b/harness.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
@@ -34,6 +34,8 @@
* Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2]
* per report from Iain Murray.
* July 3, 2003: Updated printout message.
+ * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
+ * illegal surrogate use in UTF-8, per report from Frank Tang.
*
*/
@@ -54,7 +56,9 @@
00-7F 0000- 007F
C2-DF 80-BF 0080- 07FF
E0 A0-BF 80-BF 0800- 0FFF
- E1-EF 80-BF 80-BF 1000- FFFF
+ E1-EC 80-BF 80-BF 1000- CFFF
+ ED 80-9F 80-BF D000- D7FF
+ EE-EF 80-BF 80-BF E000- FFFF
F0 90-BF 80-BF 80-BF 10000- 3FFFF
F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
F4 80-8F 80-BF 80-BF 100000-10FFFF
@@ -88,9 +92,16 @@ struct utf8_test utf8_testData[] = {
{ 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */
{ 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */
-/* for all > 17 use "short" buffer lengths to detect over-run */
+
+ { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */
+ { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */
+ { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */
+ { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */
+
+/* for all > 21 use "short" buffer lengths to detect over-run */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */
- { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}
+ { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }},
+
};
int test01() {
@@ -103,8 +114,8 @@ int test01() {
for (i = 0; utf8_testData[i].utf8_len; i++) {
wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
- /* use truncated length for tests over 17 */
- if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
+ /* use truncated length for tests over 21 */
+ if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
@@ -380,10 +391,46 @@ int test03() {
return 1;
}
+/* ---------------------------------------------------------------------
+ test04 - Test conversion of an illegal UTF-32 value (> 0x10FFFF) to UTF-8.
+ Expect UNI_REPLACEMENT_CHAR output and sourceIllegal; only the result code is checked.
+
+ --------------------------------------------------------------------- */
+
+int test04() {
+ int i, n;
+ ConversionResult result;
+ UTF32 utf32_buf[2];
+ UTF8 utf8_buf[8];
+ UTF32 *utf32SourceStart, *utf32TargetStart;
+ UTF8 *utf8SourceStart, *utf8TargetStart;
+
+ printf("Begin Test04\n"); fflush(stdout);
+
+ i = 0x10FFFF + 21; /* an arbitrary value > legal */
+
+ utf32_buf[0] = i; utf32_buf[1] = 0;
+ for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+ utf32SourceStart = utf32_buf;
+ utf8TargetStart = utf8_buf;
+
+ /*
+ * Test UTF32 -> UTF8, with legality check on.
+ */
+ result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+ if (result != sourceIllegal) {
+ fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+ }
+
+ return 1;
+}
+
/* --------------------------------------------------------------------- */
main() {
printf("Three tests of round-trip conversions will be performed.\n");
+ printf("One test of illegal UTF-32 will be peroformed.\n");
printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
printf("These are for tests of Surrogate conversion.\n\n");
fflush(stdout);
@@ -393,4 +440,6 @@ main() {
else { printf("-------- Test02 failed. --------\n\n"); }
if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
else { printf("-------- Test03 failed. --------\n\n"); }
+ if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
+ else { printf("-------- Test04 failed. --------\n\n"); }
}
diff --git a/readme.txt b/readme.txt
index 722c6f4..b9f17fb 100644
--- a/readme.txt
+++ b/readme.txt
@@ -3,12 +3,12 @@ The accompanying C source code file "ConvertUTF.c" and the associated header
file "ConvertUTF.h" provide for conversion between various transformation
formats of Unicode characters. The following conversions are supported:
- UCS4 to UTF16
- UCS4 to UTF8
- UTF16 to UCS4
- UTF16 to UTF8
- UTF8 to UTF16
- UTF8 to UCS4
+ UTF-32 to UTF-16
+ UTF-32 to UTF-8
+ UTF-16 to UTF-32
+ UTF-16 to UTF-8
+ UTF-8 to UTF-16
+ UTF-8 to UTF-32
In addition, there is a test harness which runs various tests.
@@ -33,7 +33,11 @@ Version 1.2: corrected switch statements associated with "extraBytesToRead"
UTF8, but the table and this code has always catered for those,
cases since at one time they were legal.
+Version 1.3: Updated UTF-8 legality check;
+ updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions;
+ updated UTF-8 legality tests in harness.c.
+
-Last update: January 6, 2004
+Last update: October 19, 2004