From 040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 21 Aug 2002 10:22:04 -0500 Subject: http://web.archive.org/web/20020821102204/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- ConvertUTF.c | 67 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) (limited to 'ConvertUTF.c') diff --git a/ConvertUTF.c b/ConvertUTF.c index 491fa14..23834c4 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -25,6 +25,8 @@ Conversions between UTF32, UTF-16, and UTF-8. Source code file. Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. See the header file "ConvertUTF.h" for complete documentation. @@ -51,10 +53,10 @@ static const UTF32 halfMask = 0x3FFUL; /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF16 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF32* source = *sourceStart; + const UTF32* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch; @@ -79,6 +81,7 @@ ConversionResult ConvertUTF32toUTF16 ( } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ result = targetExhausted; break; } ch -= halfBase; @@ -94,13 +97,14 @@ ConversionResult ConvertUTF32toUTF16 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF32 ( - UTF16** sourceStart, UTF16* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, const ConversionFlags flags) { + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF16* source = *sourceStart; + const UTF16* source = *sourceStart; UTF32* target = *targetStart; UTF32 ch, ch2; while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ ch = *source++; if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { ch2 = *source; @@ -120,6 +124,7 @@ ConversionResult ConvertUTF16toUTF32 ( break; } if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ result = targetExhausted; break; } *target++ = ch; @@ -181,16 +186,17 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF16* source = *sourceStart; + const UTF16* source = *sourceStart; UTF8* target = *targetStart; while (source < sourceEnd) { UTF32 ch; unsigned short bytesToWrite = 0; const UTF32 byteMask = 0xBF; const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ ch = *source++; /* If we have a surrogate pair, convert to UTF32 first. */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { @@ -220,6 +226,7 @@ ConversionResult ConvertUTF16toUTF8 ( target += bytesToWrite; if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ target -= bytesToWrite; result = targetExhausted; break; } switch (bytesToWrite) { /* note: everything falls through. */ @@ -248,9 +255,9 @@ ConversionResult ConvertUTF16toUTF8 ( * definition of UTF-8 goes up to 4-byte sequences. */ -static Boolean isLegalUTF8(UTF8 *source, int length) { +static Boolean isLegalUTF8(const UTF8 *source, int length) { UTF8 a; - UTF8 *srcptr = source+length; + const UTF8 *srcptr = source+length; switch (length) { default: return false; /* Everything else falls through when "true"... */ @@ -276,7 +283,7 @@ static Boolean isLegalUTF8(UTF8 *source, int length) { * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */ -Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) { +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { int length = trailingBytesForUTF8[*source]+1; if (source+length > sourceEnd) { return false; @@ -287,10 +294,10 @@ Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) { /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF8* source = *sourceStart; + const UTF8* source = *sourceStart; UTF16* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; @@ -315,11 +322,12 @@ ConversionResult ConvertUTF8toUTF16 ( ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { - --source; /* return to the illegal value itself */ + source -= (extraBytesToRead+1); /* return to the illegal value itself */ result = sourceIllegal; break; } else { @@ -328,13 +336,15 @@ ConversionResult ConvertUTF8toUTF16 ( } else if (ch > UNI_MAX_UTF16) { if (flags == strictConversion) { result = sourceIllegal; - source -= extraBytesToRead; /* return to the start */ + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ } else { *target++ = UNI_REPLACEMENT_CHAR; } } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ result = targetExhausted; break; } ch -= halfBase; @@ -350,10 +360,10 @@ ConversionResult ConvertUTF8toUTF16 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF8 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF32* source = *sourceStart; + const UTF32* source = *sourceStart; UTF8* target = *targetStart; while (source < sourceEnd) { UTF32 ch; @@ -380,6 +390,7 @@ ConversionResult ConvertUTF32toUTF8 ( target += bytesToWrite; if (target > targetEnd) { + --source; /* Back up source pointer! */ target -= bytesToWrite; result = targetExhausted; break; } switch (bytesToWrite) { /* note: everything falls through. */ @@ -398,10 +409,10 @@ ConversionResult ConvertUTF32toUTF8 ( /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF32 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags) { + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { ConversionResult result = conversionOK; - UTF8* source = *sourceStart; + const UTF8* source = *sourceStart; UTF32* target = *targetStart; while (source < sourceEnd) { UTF32 ch = 0; @@ -426,19 +437,13 @@ ConversionResult ConvertUTF8toUTF32 ( ch -= offsetsFromUTF8[extraBytesToRead]; if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ result = targetExhausted; break; } if (ch <= UNI_MAX_UTF32) { *target++ = ch; - } else if (ch > UNI_MAX_UTF32) { + } else { /* i.e., ch > UNI_MAX_UTF32 */ *target++ = UNI_REPLACEMENT_CHAR; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; - *target++ = (ch & halfMask) + UNI_SUR_LOW_START; } } *sourceStart = source; -- cgit v1.2.3