From 5a4d1539c965585c67503079c6a19d309bfcfeb8 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 6 Nov 2001 16:06:20 -0500 Subject: http://web.archive.org/web/20011106160620/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- ConvertUTF.h | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 ConvertUTF.h (limited to 'ConvertUTF.h') diff --git a/ConvertUTF.h b/ConvertUTF.h new file mode 100644 index 0000000..6798183 --- /dev/null +++ b/ConvertUTF.h @@ -0,0 +1,138 @@ +/* + * Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Header file. + + Several funtions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. + + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. + + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. + + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) + + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of last text successfully converted in + the respective buffers. + + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. + + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. + + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: , , + or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. + + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. + + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. + + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + +------------------------------------------------------------------------ */ + +/* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations. +------------------------------------------------------------------------ */ + +typedef unsigned long UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ + +/* Some fundamental constants */ +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF + +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + +typedef enum { + strictConversion = 0, + lenientConversion +} ConversionFlags; + +ConversionResult ConvertUTF32toUTF16 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + UTF16** sourceStart, UTF16* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 ( + UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF16 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd); + +/* --------------------------------------------------------------------- */ -- cgit v1.2.3 From 040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 21 Aug 2002 10:22:04 -0500 Subject: http://web.archive.org/web/20020821102204/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- ConvertUTF.h | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'ConvertUTF.h') diff --git a/ConvertUTF.h b/ConvertUTF.h index 6798183..429ab40 100644 --- a/ConvertUTF.h +++ b/ConvertUTF.h @@ -75,6 +75,7 @@ Author: Mark E. Davis, 1994. Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. ------------------------------------------------------------------------ */ @@ -109,30 +110,39 @@ typedef enum { lenientConversion } ConversionFlags; -ConversionResult ConvertUTF32toUTF16 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif -ConversionResult ConvertUTF16toUTF32 ( - UTF16** sourceStart, UTF16* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); ConversionResult ConvertUTF32toUTF8 ( - UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF8toUTF32 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); -Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd); +#ifdef __cplusplus +} +#endif /* --------------------------------------------------------------------- */ -- cgit v1.2.3