From 040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 21 Aug 2002 10:22:04 -0500 Subject: http://web.archive.org/web/20020821102204/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- .metadata.txt | 6 +++--- ConvertUTF.c | 67 ++++++++++++++++++++++++++++++++--------------------------- ConvertUTF.h | 44 ++++++++++++++++++++++++--------------- harness.c | 14 ++++++------- readme.txt | 26 ----------------------- 5 files changed, 73 insertions(+), 84 deletions(-) delete mode 100644 readme.txt diff --git a/.metadata.txt b/.metadata.txt index 3c46226..04a8f5d 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,5 +1,5 @@ CVTUTF7.C 2001-08-23 23:56 CVTUTF7.H 2001-08-23 23:56 -ConvertUTF.c 2001-08-23 23:56 -ConvertUTF.h 2001-08-23 23:56 -harness.c 2001-08-23 23:56 +ConvertUTF.c 2001-09-26 17:39 +ConvertUTF.h 2001-09-26 17:39 +harness.c 2001-09-26 17:39 diff --git a/ConvertUTF.c b/ConvertUTF.c index 491fa14..23834c4 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -25,6 +25,8 @@ Conversions between UTF32, UTF-16, and UTF-8. Source code file. Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. See the header file "ConvertUTF.h" for complete documentation. @@ -51,10 +53,10 @@ static const UTF32 halfMask = 0x3FFUL; /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF16 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF32* source = *sourceStart; + const UTF32* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch; @@ -79,6 +81,7 @@ ConversionResult ConvertUTF32toUTF16 ( } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ result = targetExhausted; break; } ch -= halfBase; @@ -94,13 +97,14 @@ ConversionResult ConvertUTF32toUTF16 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF32 ( - UTF16** sourceStart, UTF16* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, const ConversionFlags flags) { + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF16* source = *sourceStart; + const UTF16* source = *sourceStart; UTF32* target = *targetStart; UTF32 ch, ch2; while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ ch = *source++; if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { ch2 = *source; @@ -120,6 +124,7 @@ ConversionResult ConvertUTF16toUTF32 ( break; } if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ result = targetExhausted; break; } *target++ = ch; @@ -181,16 +186,17 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF16* source = *sourceStart; + const UTF16* source = *sourceStart; UTF8* target = *targetStart; while (source < sourceEnd) { UTF32 ch; unsigned short bytesToWrite = 0; const UTF32 byteMask = 0xBF; const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ ch = *source++; /* If we have a surrogate pair, convert to UTF32 first. */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { @@ -220,6 +226,7 @@ ConversionResult ConvertUTF16toUTF8 ( target += bytesToWrite; if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ target -= bytesToWrite; result = targetExhausted; break; } switch (bytesToWrite) { /* note: everything falls through. */ @@ -248,9 +255,9 @@ ConversionResult ConvertUTF16toUTF8 ( * definition of UTF-8 goes up to 4-byte sequences. */ -static Boolean isLegalUTF8(UTF8 *source, int length) { +static Boolean isLegalUTF8(const UTF8 *source, int length) { UTF8 a; - UTF8 *srcptr = source+length; + const UTF8 *srcptr = source+length; switch (length) { default: return false; /* Everything else falls through when "true"... */ @@ -276,7 +283,7 @@ static Boolean isLegalUTF8(UTF8 *source, int length) { * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */ -Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) { +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { int length = trailingBytesForUTF8[*source]+1; if (source+length > sourceEnd) { return false; @@ -287,10 +294,10 @@ Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) { /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF8* source = *sourceStart; + const UTF8* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; @@ -315,11 +322,12 @@ ConversionResult ConvertUTF8toUTF16 ( ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { - --source; /* return to the illegal value itself */ + source -= (extraBytesToRead+1); /* return to the illegal value itself */ result = sourceIllegal; break; } else { @@ -328,13 +336,15 @@ ConversionResult ConvertUTF8toUTF16 ( } else if (ch > UNI_MAX_UTF16) { if (flags == strictConversion) { result = sourceIllegal; - source -= extraBytesToRead; /* return to the start */ + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } ch -= halfBase; @@ -350,10 +360,10 @@ ConversionResult ConvertUTF8toUTF16 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF8 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF32* source = *sourceStart; + const UTF32* source = *sourceStart; UTF8* target = *targetStart; while (source < sourceEnd) { UTF32 ch; @@ -380,6 +390,7 @@ ConversionResult ConvertUTF32toUTF8 ( target += bytesToWrite; if (target > targetEnd) { + --source; /* Back up source pointer! */ target -= bytesToWrite; result = targetExhausted; break; } switch (bytesToWrite) { /* note: everything falls through. */ @@ -398,10 +409,10 @@ ConversionResult ConvertUTF32toUTF8 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF32 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags) { + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF8* source = *sourceStart; + const UTF8* source = *sourceStart; UTF32* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; @@ -426,19 +437,13 @@ ConversionResult ConvertUTF8toUTF32 ( ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ result = targetExhausted; break; } if (ch <= UNI_MAX_UTF32) { *target++ = ch; - } else if (ch > UNI_MAX_UTF32) { + } else { /* i.e., ch > UNI_MAX_UTF32 */ *target++ = UNI_REPLACEMENT_CHAR; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; - *target++ = (ch & halfMask) + UNI_SUR_LOW_START; } } *sourceStart = source; diff --git a/ConvertUTF.h b/ConvertUTF.h index 6798183..429ab40 100644 --- a/ConvertUTF.h +++ b/ConvertUTF.h @@ -75,6 +75,7 @@ Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. ------------------------------------------------------------------------ */ @@ -109,30 +110,39 @@ typedef enum { lenientConversion } ConversionFlags; -ConversionResult ConvertUTF32toUTF16 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif -ConversionResult ConvertUTF16toUTF32 ( - UTF16** sourceStart, UTF16* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); ConversionResult ConvertUTF32toUTF8 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF8toUTF32 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); -Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd); +#ifdef __cplusplus +} +#endif /* --------------------------------------------------------------------- */ diff --git a/harness.c b/harness.c index a07792b..b3dd500 100644 --- a/harness.c +++ b/harness.c @@ -157,7 +157,7 @@ int test02() { /* * Test UTF32 -> UTF16 */ - result = ConvertUTF32toUTF16(&utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { /* skip result checking for all but 0000d800, which we know to be illegal */ switch (result) { @@ -186,7 +186,7 @@ int test02() { * for unpaired low surrogates. We do make one check that the lowest low * surrogate, when unpaired, is illegal. */ - result = ConvertUTF16toUTF8(&utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion); + result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion); switch (result) { default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; @@ -230,7 +230,7 @@ int test02() { /* * Test UTF8 -> UTF16, with legality check on. */ - result = ConvertUTF8toUTF16(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); switch (result) { default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; @@ -257,9 +257,9 @@ int test02() { * back to UTF32. */ if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) { - result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion); } else { - result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); } switch (result) { default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); @@ -320,7 +320,7 @@ int test03() { /* * Test UTF32 -> UTF8, with legality check on. */ - result = ConvertUTF32toUTF8(&utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); switch (result) { default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; @@ -355,7 +355,7 @@ int test03() { /* * Test UTF8 -> UTF32, with legality check on. */ - result = ConvertUTF8toUTF32(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); switch (result) { default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; diff --git a/readme.txt b/readme.txt deleted file mode 100644 index 9bb6a00..0000000 --- a/readme.txt +++ /dev/null @@ -1,26 +0,0 @@ - -The accompanying C source code file "ConvertUTF.c" and the associated header -file "ConvertUTF.h" provide for conversion between various transformation -formats of Unicode characters. The following conversions are supported: - - UCS4 to UTF16 - UCS4 to UTF8 - UTF16 to UCS4 - UTF16 to UTF8 - UTF8 to UTF16 - UTF8 to UCS4 - - -The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes -only. They have not been updated to Unicode 3.0 and should be considered -obsolescent. "CVTUTF7.C" contains two functions that can convert between -UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are not supported, -the code has not been tested, and should be considered unsuitable for general -purpose use. - -Please address any bug reports about these programs to: - http://www.unicode.org/unicode/reporting.html - -Last update: July 12, 2001 - - -- cgit v1.2.3