summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2004-11-22 04:15:50 -0500
committerLuke Shumaker <lukeshu@lukeshu.com>2004-11-22 04:15:50 -0500
commite49ab49fe9202b6f875cf9cba6bad2f320e24fe4 (patch)
tree0aa2ee8c1c8b6188b6725a7ec1e4139d46fbfbdd
parent040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc (diff)
parent7d347a05ce025a9aef28bcf72089e1388dd48d13 (diff)
http://web.archive.org/web/20041122041550/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/HEADPROGRAMS/CVTUTF
-rw-r--r--.metadata.txt12
-rw-r--r--ConvertUTF.c708
-rw-r--r--ConvertUTF.h3
-rw-r--r--ExpectedOutput.txt21
-rw-r--r--harness.c72
-rw-r--r--readme.txt43
6 files changed, 521 insertions, 338 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 04a8f5d..65edd80 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,5 +1,7 @@
-CVTUTF7.C 2001-08-23 23:56
-CVTUTF7.H 2001-08-23 23:56
-ConvertUTF.c 2001-09-26 17:39
-ConvertUTF.h 2001-09-26 17:39
-harness.c 2001-09-26 17:39
+CVTUTF7.C 2004-11-02 15:08
+CVTUTF7.H 2004-11-02 15:08
+ConvertUTF.c 2004-11-02 15:08
+ConvertUTF.h 2004-11-02 15:08
+ExpectedOutput.txt 2004-11-02 15:08
+harness.c 2004-11-02 15:08
+readme.txt 2004-11-02 15:08
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 23834c4..9b3deeb 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
@@ -23,10 +23,16 @@
/* ---------------------------------------------------------------------
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
- Author: Mark E. Davis, 1994.
- Rev History: Rick McGowan, fixes & updates May 2001.
- Sept 2001: fixed const & error conditions per
- mods suggested by S. Parent & A. Lillich.
+ Author: Mark E. Davis, 1994.
+ Rev History: Rick McGowan, fixes & updates May 2001.
+ Sept 2001: fixed const & error conditions per
+ mods suggested by S. Parent & A. Lillich.
+ June 2002: Tim Dodd added detection and handling of incomplete
+ source sequences, enhanced error detection, added casts
+ to eliminate compiler warnings.
+ July 2003: slight mods to back out aggressive FFFE detection.
+ Jan 2004: updated switches in from-UTF8 conversions.
+ Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
See the header file "ConvertUTF.h" for complete documentation.
@@ -38,106 +44,122 @@
#include <stdio.h>
#endif
-static const int halfShift = 10; /* used for shifting by 10 bits */
+static const int halfShift = 10; /* used for shifting by 10 bits */
-static const UTF32 halfBase = 0x0010000UL;
-static const UTF32 halfMask = 0x3FFUL;
+static const UTF32 halfBase = 0x0010000UL;
+static const UTF32 halfMask = 0x3FFUL;
-#define UNI_SUR_HIGH_START (UTF32)0xD800
-#define UNI_SUR_HIGH_END (UTF32)0xDBFF
-#define UNI_SUR_LOW_START (UTF32)0xDC00
-#define UNI_SUR_LOW_END (UTF32)0xDFFF
-#define false 0
-#define true 1
+#define UNI_SUR_HIGH_START (UTF32)0xD800
+#define UNI_SUR_HIGH_END (UTF32)0xDBFF
+#define UNI_SUR_LOW_START (UTF32)0xDC00
+#define UNI_SUR_LOW_END (UTF32)0xDFFF
+#define false 0
+#define true 1
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF16 (
- const UTF32** sourceStart, const UTF32* sourceEnd,
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF32* source = *sourceStart;
- UTF16* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch;
- if (target >= targetEnd) {
- result = targetExhausted; break;
- }
- ch = *source++;
- if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
- if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- } else {
- *target++ = ch; /* normal case */
- }
- } else if (ch > UNI_MAX_UTF16) {
- if (flags == strictConversion) {
- result = sourceIllegal;
- } else {
- *target++ = UNI_REPLACEMENT_CHAR;
- }
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF32* source = *sourceStart;
+ UTF16* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ if (target >= targetEnd) {
+ result = targetExhausted; break;
+ }
+ ch = *source++;
+ if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+ /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
} else {
- /* target is a character in range 0xFFFF - 0x10FFFF. */
- if (target + 1 >= targetEnd) {
- --source; /* Back up source pointer! */
- result = targetExhausted; break;
- }
- ch -= halfBase;
- *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
- *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
+ *target++ = UNI_REPLACEMENT_CHAR;
}
+ } else {
+ *target++ = (UTF16)ch; /* normal case */
+ }
+ } else if (ch > UNI_MAX_LEGAL_UTF32) {
+ if (flags == strictConversion) {
+ result = sourceIllegal;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ /* target is a character in range 0xFFFF - 0x10FFFF. */
+ if (target + 1 >= targetEnd) {
+ --source; /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ ch -= halfBase;
+ *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+ *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
}
- *sourceStart = source;
- *targetStart = target;
- return result;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF32 (
- const UTF16** sourceStart, const UTF16* sourceEnd,
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF16* source = *sourceStart;
- UTF32* target = *targetStart;
- UTF32 ch, ch2;
- while (source < sourceEnd) {
- const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
- ch = *source++;
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
- ch2 = *source;
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
- ++source;
- } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
- /* an unpaired low surrogate */
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- if (target >= targetEnd) {
- source = oldSource; /* Back up source pointer! */
- result = targetExhausted; break;
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF16* source = *sourceStart;
+ UTF32* target = *targetStart;
+ UTF32 ch, ch2;
+ while (source < sourceEnd) {
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
+ ch = *source++;
+ /* If we have a surrogate pair, convert to UTF32 first. */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ /* If the 16 bits following the high surrogate are in the source buffer... */
+ if (source < sourceEnd) {
+ ch2 = *source;
+ /* If it's a low surrogate, convert to UTF32. */
+ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+ ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ + (ch2 - UNI_SUR_LOW_START) + halfBase;
+ ++source;
+ } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
}
- *target++ = ch;
+ } else { /* We don't have the 16 bits following the high surrogate. */
+ --source; /* return to the high surrogate */
+ result = sourceExhausted;
+ break;
+ }
+ } else if (flags == strictConversion) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ if (target >= targetEnd) {
+ source = oldSource; /* Back up source pointer! */
+ result = targetExhausted; break;
}
- *sourceStart = source;
- *targetStart = target;
+ *target++ = ch;
+ }
+ *sourceStart = source;
+ *targetStart = target;
#ifdef CVTUTF_DEBUG
if (result == sourceIllegal) {
fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
fflush(stderr);
}
#endif
- return result;
+ return result;
}
/* --------------------------------------------------------------------- */
@@ -145,16 +167,19 @@ if (result == sourceIllegal) {
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
@@ -163,13 +188,14 @@ static const char trailingBytesForUTF8[256] = {
* in a UTF-8 sequence.
*/
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
- 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+ 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
* into the first byte, depending on how many bytes follow. There are
* as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
*/
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
@@ -186,60 +212,71 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF8 (
- const UTF16** sourceStart, const UTF16* sourceEnd,
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF16* source = *sourceStart;
- UTF8* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch;
- unsigned short bytesToWrite = 0;
- const UTF32 byteMask = 0xBF;
- const UTF32 byteMark = 0x80;
- const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
- ch = *source++;
- /* If we have a surrogate pair, convert to UTF32 first. */
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
- UTF32 ch2 = *source;
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
- ++source;
- } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- /* Figure out how many bytes the result will require */
- if (ch < (UTF32)0x80) { bytesToWrite = 1;
- } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
- } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
- ch = UNI_REPLACEMENT_CHAR;
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF16* source = *sourceStart;
+ UTF8* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ unsigned short bytesToWrite = 0;
+ const UTF32 byteMask = 0xBF;
+ const UTF32 byteMark = 0x80;
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
+ ch = *source++;
+ /* If we have a surrogate pair, convert to UTF32 first. */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ /* If the 16 bits following the high surrogate are in the source buffer... */
+ if (source < sourceEnd) {
+ UTF32 ch2 = *source;
+ /* If it's a low surrogate, convert to UTF32. */
+ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+ ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ + (ch2 - UNI_SUR_LOW_START) + halfBase;
+ ++source;
+ } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
}
+ } else { /* We don't have the 16 bits following the high surrogate. */
+ --source; /* return to the high surrogate */
+ result = sourceExhausted;
+ break;
+ }
+ } else if (flags == strictConversion) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ /* Figure out how many bytes the result will require */
+ if (ch < (UTF32)0x80) { bytesToWrite = 1;
+ } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
+ } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
+ ch = UNI_REPLACEMENT_CHAR;
+ }
- target += bytesToWrite;
- if (target > targetEnd) {
- source = oldSource; /* Back up source pointer! */
- target -= bytesToWrite; result = targetExhausted; break;
- }
- switch (bytesToWrite) { /* note: everything falls through. */
- case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 1: *--target = ch | firstByteMark[bytesToWrite];
- }
- target += bytesToWrite;
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ source = oldSource; /* Back up source pointer! */
+ target -= bytesToWrite; result = targetExhausted; break;
+ }
+ switch (bytesToWrite) { /* note: everything falls through. */
+ case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
}
- *sourceStart = source;
- *targetStart = target;
- return result;
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
}
/* --------------------------------------------------------------------- */
@@ -248,7 +285,7 @@ ConversionResult ConvertUTF16toUTF8 (
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* If not calling this from ConvertUTF8to*, then the length can be set by:
- * length = trailingBytesForUTF8[*source]+1;
+ * length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns false. The Unicode
@@ -256,25 +293,28 @@ ConversionResult ConvertUTF16toUTF8 (
*/
static Boolean isLegalUTF8(const UTF8 *source, int length) {
- UTF8 a;
- const UTF8 *srcptr = source+length;
- switch (length) {
- default: return false;
- /* Everything else falls through when "true"... */
- case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 2: if ((a = (*--srcptr)) > 0xBF) return false;
- switch (*source) {
- /* no fall-through in this inner switch */
- case 0xE0: if (a < 0xA0) return false; break;
- case 0xF0: if (a < 0x90) return false; break;
- case 0xF4: if (a > 0x8F) return false; break;
- default: if (a < 0x80) return false;
- }
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
- if (*source > 0xF4) return false;
+ UTF8 a;
+ const UTF8 *srcptr = source+length;
+ switch (length) {
+ default: return false;
+ /* Everything else falls through when "true"... */
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+ switch (*source) {
+ /* no fall-through in this inner switch */
+ case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
+ case 0xF0: if (a < 0x90) return false; break;
+ case 0xF4: if (a > 0x8F) return false; break;
+ default: if (a < 0x80) return false;
}
- return true;
+
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+ }
+ if (*source > 0xF4) return false;
+ return true;
}
/* --------------------------------------------------------------------- */
@@ -284,190 +324,216 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
* This is not used here; it's just exported.
*/
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
- int length = trailingBytesForUTF8[*source]+1;
- if (source+length > sourceEnd) {
- return false;
- }
- return isLegalUTF8(source, length);
+ int length = trailingBytesForUTF8[*source]+1;
+ if (source+length > sourceEnd) {
+ return false;
+ }
+ return isLegalUTF8(source, length);
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF16 (
- const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF8* source = *sourceStart;
- UTF16* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch = 0;
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
- if (source + extraBytesToRead >= sourceEnd) {
- result = sourceExhausted; break;
- }
- /* Do this check whether lenient or strict */
- if (! isLegalUTF8(source, extraBytesToRead+1)) {
- result = sourceIllegal;
- break;
- }
- /*
- * The cases all fall through. See "Note A" below.
- */
- switch (extraBytesToRead) {
- case 3: ch += *source++; ch <<= 6;
- case 2: ch += *source++; ch <<= 6;
- case 1: ch += *source++; ch <<= 6;
- case 0: ch += *source++;
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF8* source = *sourceStart;
+ UTF16* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (source + extraBytesToRead >= sourceEnd) {
+ result = sourceExhausted; break;
+ }
+ /* Do this check whether lenient or strict */
+ if (! isLegalUTF8(source, extraBytesToRead+1)) {
+ result = sourceIllegal;
+ break;
+ }
+ /*
+ * The cases all fall through. See "Note A" below.
+ */
+ switch (extraBytesToRead) {
+ case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up source pointer! */
- result = targetExhausted; break;
- }
- if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
- if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
- source -= (extraBytesToRead+1); /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- } else {
- *target++ = ch; /* normal case */
- }
- } else if (ch > UNI_MAX_UTF16) {
- if (flags == strictConversion) {
- result = sourceIllegal;
- source -= (extraBytesToRead+1); /* return to the start */
- break; /* Bail out; shouldn't continue */
- } else {
- *target++ = UNI_REPLACEMENT_CHAR;
- }
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
} else {
- /* target is a character in range 0xFFFF - 0x10FFFF. */
- if (target + 1 >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up source pointer! */
- result = targetExhausted; break;
- }
- ch -= halfBase;
- *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
- *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
+ *target++ = UNI_REPLACEMENT_CHAR;
}
+ } else {
+ *target++ = (UTF16)ch; /* normal case */
+ }
+ } else if (ch > UNI_MAX_UTF16) {
+ if (flags == strictConversion) {
+ result = sourceIllegal;
+ source -= (extraBytesToRead+1); /* return to the start */
+ break; /* Bail out; shouldn't continue */
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ /* target is a character in range 0xFFFF - 0x10FFFF. */
+ if (target + 1 >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ ch -= halfBase;
+ *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+ *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
}
- *sourceStart = source;
- *targetStart = target;
- return result;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF8 (
- const UTF32** sourceStart, const UTF32* sourceEnd,
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF32* source = *sourceStart;
- UTF8* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch;
- unsigned short bytesToWrite = 0;
- const UTF32 byteMask = 0xBF;
- const UTF32 byteMark = 0x80;
- ch = *source++;
- /* surrogates of any stripe are not legal UTF32 characters */
- if (flags == strictConversion ) {
- if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
- --source; /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- }
- }
- /* Figure out how many bytes the result will require */
- if (ch < (UTF32)0x80) { bytesToWrite = 1;
- } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
- } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
- ch = UNI_REPLACEMENT_CHAR;
- }
-
- target += bytesToWrite;
- if (target > targetEnd) {
- --source; /* Back up source pointer! */
- target -= bytesToWrite; result = targetExhausted; break;
- }
- switch (bytesToWrite) { /* note: everything falls through. */
- case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
- case 1: *--target = ch | firstByteMark[bytesToWrite];
- }
- target += bytesToWrite;
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF32* source = *sourceStart;
+ UTF8* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ unsigned short bytesToWrite = 0;
+ const UTF32 byteMask = 0xBF;
+ const UTF32 byteMark = 0x80;
+ ch = *source++;
+ if (flags == strictConversion ) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ /*
+ * Figure out how many bytes the result will require. Turn any
+ * illegally large UTF32 things (> Plane 17) into replacement chars.
+ */
+ if (ch < (UTF32)0x80) { bytesToWrite = 1;
+ } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
+ } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
+ ch = UNI_REPLACEMENT_CHAR;
+ result = sourceIllegal;
+ }
+
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ --source; /* Back up source pointer! */
+ target -= bytesToWrite; result = targetExhausted; break;
}
- *sourceStart = source;
- *targetStart = target;
- return result;
+ switch (bytesToWrite) { /* note: everything falls through. */
+ case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
+ }
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF32 (
- const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF8* source = *sourceStart;
- UTF32* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch = 0;
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
- if (source + extraBytesToRead >= sourceEnd) {
- result = sourceExhausted; break;
- }
- /* Do this check whether lenient or strict */
- if (! isLegalUTF8(source, extraBytesToRead+1)) {
- result = sourceIllegal;
- break;
- }
- /*
- * The cases all fall through. See "Note A" below.
- */
- switch (extraBytesToRead) {
- case 3: ch += *source++; ch <<= 6;
- case 2: ch += *source++; ch <<= 6;
- case 1: ch += *source++; ch <<= 6;
- case 0: ch += *source++;
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF8* source = *sourceStart;
+ UTF32* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (source + extraBytesToRead >= sourceEnd) {
+ result = sourceExhausted; break;
+ }
+ /* Do this check whether lenient or strict */
+ if (! isLegalUTF8(source, extraBytesToRead+1)) {
+ result = sourceIllegal;
+ break;
+ }
+ /*
+ * The cases all fall through. See "Note A" below.
+ */
+ switch (extraBytesToRead) {
+ case 5: ch += *source++; ch <<= 6;
+ case 4: ch += *source++; ch <<= 6;
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up the source pointer! */
- result = targetExhausted; break;
- }
- if (ch <= UNI_MAX_UTF32) {
- *target++ = ch;
- } else { /* i.e., ch > UNI_MAX_UTF32 */
- *target++ = UNI_REPLACEMENT_CHAR;
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up the source pointer! */
+ result = targetExhausted; break;
+ }
+ if (ch <= UNI_MAX_LEGAL_UTF32) {
+ /*
+ * UTF-16 surrogate values are illegal in UTF-32, and anything
+ * over Plane 17 (> 0x10FFFF) is illegal.
+ */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
}
+ } else {
+ *target++ = ch;
+ }
+ } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+ result = sourceIllegal;
+ *target++ = UNI_REPLACEMENT_CHAR;
}
- *sourceStart = source;
- *targetStart = target;
- return result;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
}
/* ---------------------------------------------------------------------
- Note A.
- The fall-through switches in UTF-8 reading code save a
- temp variable, some decrements & conditionals. The switches
- are equivalent to the following loop:
- {
- int tmpBytesToRead = extraBytesToRead+1;
- do {
- ch += *source++;
- --tmpBytesToRead;
- if (tmpBytesToRead) ch <<= 6;
- } while (tmpBytesToRead > 0);
- }
- In UTF-8 writing code, the switches on "bytesToWrite" are
- similarly unrolled loops.
+ Note A.
+ The fall-through switches in UTF-8 reading code save a
+ temp variable, some decrements & conditionals. The switches
+ are equivalent to the following loop:
+ {
+ int tmpBytesToRead = extraBytesToRead+1;
+ do {
+ ch += *source++;
+ --tmpBytesToRead;
+ if (tmpBytesToRead) ch <<= 6;
+ } while (tmpBytesToRead > 0);
+ }
+ In UTF-8 writing code, the switches on "bytesToWrite" are
+ similarly unrolled loops.
--------------------------------------------------------------------- */
-
-
diff --git a/ConvertUTF.h b/ConvertUTF.h
index 429ab40..e264915 100644
--- a/ConvertUTF.h
+++ b/ConvertUTF.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
@@ -97,6 +97,7 @@ typedef unsigned char Boolean; /* 0 or 1 */
#define UNI_MAX_BMP (UTF32)0x0000FFFF
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
+#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
typedef enum {
conversionOK, /* conversion successful */
diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt
new file mode 100644
index 0000000..e09d844
--- /dev/null
+++ b/ExpectedOutput.txt
@@ -0,0 +1,21 @@
+Three tests of round-trip conversions will be performed.
+One test of illegal UTF-32 will be peroformed.
+Two illegal result messages are expected; one in test 02A; one in test 03A.
+These are for tests of Surrogate conversion.
+
+Begin Test01
+******** Test01 succeeded without error. ********
+
+Begin Test02
+Test02A for 55296, input 0000d800, output 0000,0000, result 3
+!!! Test02A: note expected illegal result for 0x0000D800
+******** Test02 succeeded without error. ********
+
+Begin Test03
+sourceIllegal Test03A for 55296 (0xd800); output ; result 3
+!!! Test03A: note expected illegal result for 0x0000D800
+******** Test03 succeeded without error. ********
+
+Begin Test04
+******** Test04 succeeded without error. ********
+
diff --git a/harness.c b/harness.c
index b3dd500..25b3e9e 100644
--- a/harness.c
+++ b/harness.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2001 Unicode, Inc.
+ * Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
@@ -31,6 +31,11 @@
* $ gcc -g harness.c -o harness
*
* Rev History: Rick McGowan, new file April 2001.
+ * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2]
+ * per report from Iain Murray.
+ * July 3, 2003: Updated printout message.
+ * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
+ * illegal surrogate use in UTF-8, per report from Frank Tang.
*
*/
@@ -51,7 +56,9 @@
00-7F 0000- 007F
C2-DF 80-BF 0080- 07FF
E0 A0-BF 80-BF 0800- 0FFF
- E1-EF 80-BF 80-BF 1000- FFFF
+ E1-EC 80-BF 80-BF 1000- CFFF
+ ED 80-9F 80-BF D000- D7FF
+ EE-EF 80-BF 80-BF E000- FFFF
F0 90-BF 80-BF 80-BF 10000- 3FFFF
F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
F4 80-8F 80-BF 80-BF 100000-10FFFF
@@ -85,9 +92,16 @@ struct utf8_test utf8_testData[] = {
{ 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */
{ 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */
-/* for all > 17 use "short" buffer lengths to detect over-run */
+
+ { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */
+ { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */
+ { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */
+ { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */
+
+/* for all > 21 use "short" buffer lengths to detect over-run */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */
- { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}
+ { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }},
+
};
int test01() {
@@ -100,8 +114,8 @@ int test01() {
for (i = 0; utf8_testData[i].utf8_len; i++) {
wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
- /* use truncated length for tests over 17 */
- if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
+ /* use truncated length for tests over 21 */
+ if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
@@ -195,7 +209,6 @@ int test02() {
case sourceIllegal: printf("sourceIllegal\t"); break;
}
if (result != conversionOK) {
-
printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) {
@@ -230,7 +243,7 @@ int test02() {
/*
* Test UTF8 -> UTF16, with legality check on.
*/
- result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+ result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion);
switch (result) {
default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
case conversionOK: break;
@@ -281,8 +294,6 @@ int test02() {
printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]);
return 0;
}
-
-
}
return 1;
}
@@ -380,11 +391,48 @@ int test03() {
return 1;
}
+/* ---------------------------------------------------------------------
+ test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8.
+ Expect it will be turned into UNI_REPLACEMENT_CHAR.
+
+ --------------------------------------------------------------------- */
+
+int test04() {
+ int i, n;
+ ConversionResult result;
+ UTF32 utf32_buf[2];
+ UTF8 utf8_buf[8];
+ UTF32 *utf32SourceStart, *utf32TargetStart;
+ UTF8 *utf8SourceStart, *utf8TargetStart;
+
+ printf("Begin Test04\n"); fflush(stdout);
+
+ i = 0x10FFFF + 21; /* an arbitrary value > legal */
+
+ utf32_buf[0] = i; utf32_buf[1] = 0;
+ for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+ utf32SourceStart = utf32_buf;
+ utf8TargetStart = utf8_buf;
+
+ /*
+ * Test UTF32 -> UTF8, with legality check on.
+ */
+ result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+ if (result != sourceIllegal) {
+ fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+ }
+
+ return 1;
+}
+
/* --------------------------------------------------------------------- */
main() {
printf("Three tests of round-trip conversions will be performed.\n");
- printf("Two illegal result messages are expected; one in test 02A; one in test 03A .\n\n");
+ printf("One test of illegal UTF-32 will be peroformed.\n");
+ printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
+ printf("These are for tests of Surrogate conversion.\n\n");
fflush(stdout);
if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); }
else { printf("-------- Test01 failed. --------\n\n"); }
@@ -392,4 +440,6 @@ main() {
else { printf("-------- Test02 failed. --------\n\n"); }
if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
else { printf("-------- Test03 failed. --------\n\n"); }
+ if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
+ else { printf("-------- Test04 failed. --------\n\n"); }
}
diff --git a/readme.txt b/readme.txt
new file mode 100644
index 0000000..b9f17fb
--- /dev/null
+++ b/readme.txt
@@ -0,0 +1,43 @@
+
+The accompanying C source code file "ConvertUTF.c" and the associated header
+file "ConvertUTF.h" provide for conversion between various transformation
+formats of Unicode characters. The following conversions are supported:
+
+ UTF-32 to UTF-16
+ UTF-32 to UTF-8
+ UTF-16 to UTF-32
+ UTF-16 to UTF-8
+ UTF-8 to UTF-16
+ UTF-8 to UTF-32
+
+In addition, there is a test harness which runs various tests.
+
+The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes
+only. They have not been updated to Unicode 3.0 or later and should be
+considered obsolescent. "CVTUTF7.C" contains two functions that can convert
+between UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are
+not supported, the code has not been tested, and should be considered
+unsuitable for general purpose use.
+
+Please submit any bug reports about these programs here:
+
+ http://www.unicode.org/unicode/reporting.html
+
+Version 1.0: initial version.
+
+Version 1.1: corrected some minor problems; added stricter checks.
+
+Version 1.2: corrected switch statements associated with "extraBytesToRead"
+ in 4 & 5 byte cases, in functions for conversion from UTF8.
+ Note: formally, the 4 & 5 byte cases are illegal in the latest
+ UTF8, but the table and this code has always catered for those,
+ cases since at one time they were legal.
+
+Version 1.3: Updated UTF-8 legality check;
+ updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions
+ Updated UTF-8 legality tests in harness.c
+
+
+Last update: October 19, 2004
+
+