From 5a42c4a3c2a175170868b77aff9a92197f4a3bf9 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Mon, 21 Aug 2000 23:46:10 -0500 Subject: http://web.archive.org/web/20000821234610/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- .metadata.txt | 4 + CVTUTF.C | 331 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ CVTUTF.H | 106 +++++++++++++++++++ CVTUTF7.C | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++ CVTUTF7.H | 80 ++++++++++++++ 5 files changed, 821 insertions(+) create mode 100644 .metadata.txt create mode 100644 CVTUTF.C create mode 100644 CVTUTF.H create mode 100644 CVTUTF7.C create mode 100644 CVTUTF7.H diff --git a/.metadata.txt b/.metadata.txt new file mode 100644 index 0000000..348ca97 --- /dev/null +++ b/.metadata.txt @@ -0,0 +1,4 @@ +CVTUTF.C 1998-01-27 16:51 +CVTUTF.H 1998-01-27 16:51 +CVTUTF7.C 1998-01-27 16:51 +CVTUTF7.H 1998-01-27 16:51 diff --git a/CVTUTF.C b/CVTUTF.C new file mode 100644 index 0000000..94898bc --- /dev/null +++ b/CVTUTF.C @@ -0,0 +1,331 @@ +/* ================================================================ */ +/* +File: ConvertUTF.C +Author: Mark E. Davis +Copyright (C) 1994 Taligent, Inc. All rights reserved. + +This code is copyrighted. Under the copyright laws, this code may not +be copied, in whole or part, without prior written consent of Taligent. + +Taligent grants the right to use or reprint this code as long as this +ENTIRE copyright notice is reproduced in the code or reproduction. +The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, +EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
IN
+NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+#include "CVTUTF.H"
+
+/* ================================================================ */
+/* Surrogate arithmetic: a value above kMaximumUCS2 has halfBase
+   subtracted and is then split into two halfShift-bit halves, which
+   are added to the start of the high and low surrogate ranges. */
+
+const int halfShift = 10;
+const UCS4 halfBase = 0x0010000UL;
+const UCS4 halfMask = 0x3FFUL;
+const UCS4 kSurrogateHighStart = 0xD800UL;
+const UCS4 kSurrogateHighEnd = 0xDBFFUL;
+const UCS4 kSurrogateLowStart = 0xDC00UL;
+const UCS4 kSurrogateLowEnd = 0xDFFFUL;
+
+/* ================================================================ */
+/* Convert UCS-4 values in [*sourceStart, sourceEnd) to UTF-16 units in
+   [*targetStart, targetEnd).
+
+   Values up to kMaximumUCS2 are copied as a single unit, values up to
+   kMaximumUTF16 become a high/low surrogate pair, and anything larger
+   is replaced by kReplacementCharacter.
+
+   Returns ok, or targetExhausted when the next character does not fit.
+   On return *sourceStart and *targetStart point just past the last
+   character that was completely converted, so a caller can supply a
+   fresh target buffer and resume without losing a character. */
+
+ConversionResult ConvertUCS4toUTF16 (
+        UCS4** sourceStart, const UCS4* sourceEnd,
+        UTF16** targetStart, const UTF16* targetEnd) {
+    ConversionResult result = ok;
+    register UCS4* source = *sourceStart;
+    register UTF16* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch;
+        if (target >= targetEnd) {
+            result = targetExhausted; break;
+        }
+        ch = *source++;
+        if (ch <= kMaximumUCS2) {
+            *target++ = (UTF16)ch;
+        } else if (ch > kMaximumUTF16) {
+            *target++ = (UTF16)kReplacementCharacter;
+        } else {
+            if (target + 1 >= targetEnd) {
+                /* Room for only one unit: un-read the character so it
+                   is not silently dropped when the caller resumes. */
+                --source;
+                result = targetExhausted; break;
+            }
+            ch -= halfBase;
+            *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart);
+            *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart);
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ================================================================ */
+/* Convert UTF-16 units in [*sourceStart, sourceEnd) to UCS-4 values in
+   [*targetStart, targetEnd).
+
+   A high surrogate immediately followed by a low surrogate is combined
+   into one value.  Any other unit, including an unpaired surrogate or
+   a high surrogate that is the last unit of the source, is copied
+   through unchanged.
+
+   Returns ok, or targetExhausted when the target is full.  The room
+   check is made before anything is read, so on targetExhausted
+   *sourceStart still points at the first unconverted unit. */
+
+ConversionResult ConvertUTF16toUCS4 (
+        UTF16** sourceStart, UTF16* sourceEnd,
+        UCS4** targetStart, const UCS4* targetEnd) {
+    ConversionResult result = ok;
+    register UTF16* source = *sourceStart;
+    register UCS4* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch;
+        if (target >= targetEnd) {
+            result = targetExhausted; break;
+        }
+        ch = *source++;
+        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) {
+            register UCS4 ch2 = *source;
+            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+                ch = ((ch - kSurrogateHighStart) << halfShift)
+                    + (ch2 - kSurrogateLowStart) + halfBase;
+                ++source;
+            }
+        }
+        *target++ = ch;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ================================================================ */
+/* offsetsFromUTF8[n] is the sum of the marker bits (lead-byte prefix
+   and the 0x80 of each trail byte) that accumulates when a sequence
+   with n trail bytes is added up byte by byte; subtracting it leaves
+   the character value.  The last entry is that sum reduced modulo
+   2^32. */
+
+UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL,
+                           0x03C82080UL, 0xFA082080UL, 0x82082080UL};
+
+/* bytesFromUTF8[b] is the number of trail bytes that follow lead byte
+   b.  Bytes 0x80..0xBF map to 0, so a stray trail byte is treated as a
+   one-byte character. */
+
+char bytesFromUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};
+
+/* firstByteMark[n] is OR-ed into the lead byte of an n-byte sequence. */
+
+UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
+
+/* ================================================================ */
+/* This code
is similar in effect to making successive calls on the
+mbtowc and wctomb routines in FSS-UTF. However, it is considerably
+different in code:
+* it is adapted to be consistent with UTF16,
+* the interface converts a whole buffer to avoid function-call overhead
+* constants have been gathered.
+* loops & conditionals have been removed as much as possible for
+efficiency, in favor of drop-through switch statements.
+*/
+
+/* ================================================================ */
+/* Convert UTF-16 units in [*sourceStart, sourceEnd) to UTF-8 bytes in
+   [*targetStart, targetEnd).
+
+   A high surrogate immediately followed by a low surrogate is combined
+   before encoding; any other unit is encoded as it stands.
+
+   Returns ok, or targetExhausted when the next character does not fit.
+   In that case nothing is written for it and *sourceStart is left
+   pointing at it, so the caller can resume with a new target buffer. */
+
+ConversionResult ConvertUTF16toUTF8 (
+        UTF16** sourceStart, const UTF16* sourceEnd,
+        UTF8** targetStart, const UTF8* targetEnd)
+{
+    ConversionResult result = ok;
+    register UTF16* source = *sourceStart;
+    register UTF8* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch;
+        register unsigned short bytesToWrite = 0;
+        register const UCS4 byteMask = 0xBF;
+        register const UCS4 byteMark = 0x80;
+        UTF16* const oldSource = source;   /* rewind point if target is full */
+        ch = *source++;
+        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
+                && source < sourceEnd) {
+            register UCS4 ch2 = *source;
+            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+                ch = ((ch - kSurrogateHighStart) << halfShift)
+                    + (ch2 - kSurrogateLowStart) + halfBase;
+                ++source;
+            }
+        }
+        if (ch < 0x80) {                bytesToWrite = 1;
+        } else if (ch < 0x800) {        bytesToWrite = 2;
+        } else if (ch < 0x10000) {      bytesToWrite = 3;
+        } else if (ch < 0x200000) {     bytesToWrite = 4;
+        } else if (ch < 0x4000000) {    bytesToWrite = 5;
+        } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
+        } else {                        /* 0xFFFD lies in the 3-byte range */
+                                        bytesToWrite = 3;
+                                        ch = kReplacementCharacter;
+        }
+
+        /* Compare remaining room instead of advancing first, so no
+           pointer beyond targetEnd is ever formed. */
+        if (targetEnd - target < bytesToWrite) {
+            source = oldSource;
+            result = targetExhausted; break;
+        }
+        target += bytesToWrite;
+        switch (bytesToWrite) { /* note: code falls through cases! */
+            case 6: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 5: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ================================================================ */
+/* Convert UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 units in
+   [*targetStart, targetEnd).
+
+   The sequence length is taken from the lead byte via bytesFromUTF8;
+   trail bytes are not validated.  Values above kMaximumUTF16 are
+   replaced by kReplacementCharacter, values above kMaximumUCS2 become
+   a surrogate pair.
+
+   Returns ok, sourceExhausted when the last sequence is cut short by
+   sourceEnd, or targetExhausted when the next character does not fit.
+   In both failure cases *sourceStart is left at the lead byte of the
+   character that was not converted. */
+
+ConversionResult ConvertUTF8toUTF16 (
+        UTF8** sourceStart, UTF8* sourceEnd,
+        UTF16** targetStart, const UTF16* targetEnd)
+{
+    ConversionResult result = ok;
+    register UTF8* source = *sourceStart;
+    register UTF16* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch = 0;
+        register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
+        UTF8* const oldSource = source;    /* rewind point if target is full */
+        /* The sequence needs extraBytesToWrite + 1 bytes in total. */
+        if (sourceEnd - source <= extraBytesToWrite) {
+            result = sourceExhausted; break;
+        }
+        switch(extraBytesToWrite) { /* note: code falls through cases! */
+            case 5: ch += *source++; ch <<= 6;
+            case 4: ch += *source++; ch <<= 6;
+            case 3: ch += *source++; ch <<= 6;
+            case 2: ch += *source++; ch <<= 6;
+            case 1: ch += *source++; ch <<= 6;
+            case 0: ch += *source++;
+        }
+        ch -= offsetsFromUTF8[extraBytesToWrite];
+        /* The offsets assume 32-bit arithmetic; reduce explicitly in
+           case unsigned long is wider. */
+        ch &= 0xFFFFFFFFUL;
+
+        if (target >= targetEnd) {
+            source = oldSource;
+            result = targetExhausted; break;
+        }
+        if (ch <= kMaximumUCS2) {
+            *target++ = (UTF16)ch;
+        } else if (ch > kMaximumUTF16) {
+            *target++ = (UTF16)kReplacementCharacter;
+        } else {
+            if (target + 1 >= targetEnd) {
+                source = oldSource;
+                result = targetExhausted; break;
+            }
+            ch -= halfBase;
+            *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart);
+            *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart);
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ================================================================ */
+/* Convert UCS-4 values in [*sourceStart, sourceEnd) to UTF-8 bytes in
+   [*targetStart, targetEnd).
+
+   As in ConvertUTF16toUTF8, a value in the high-surrogate range that
+   is immediately followed by one in the low-surrogate range is
+   combined into a single character before encoding.  Values above
+   kMaximumUCS4 are replaced by kReplacementCharacter.
+
+   Returns ok, or targetExhausted when the next character does not fit.
+   In that case nothing is written for it and *sourceStart is left
+   pointing at it. */
+
+ConversionResult ConvertUCS4toUTF8 (
+        UCS4** sourceStart, const UCS4* sourceEnd,
+        UTF8** targetStart, const UTF8* targetEnd)
+{
+    ConversionResult result = ok;
+    register UCS4* source = *sourceStart;
+    register UTF8* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch;
+        register unsigned short bytesToWrite = 0;
+        register const UCS4 byteMask = 0xBF;
+        register const UCS4 byteMark = 0x80;
+        UCS4* const oldSource = source;    /* rewind point if target is full */
+        ch = *source++;
+        if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
+                && source < sourceEnd) {
+            register UCS4 ch2 = *source;
+            if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+                ch = ((ch - kSurrogateHighStart) << halfShift)
+                    + (ch2 - kSurrogateLowStart) + halfBase;
+                ++source;
+            }
+        }
+        if (ch < 0x80) {                bytesToWrite = 1;
+        } else if (ch < 0x800) {        bytesToWrite = 2;
+        } else if (ch < 0x10000) {      bytesToWrite = 3;
+        } else if (ch < 0x200000) {     bytesToWrite = 4;
+        } else if (ch < 0x4000000) {    bytesToWrite = 5;
+        } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
+        } else {                        /* 0xFFFD lies in the 3-byte range */
+                                        bytesToWrite = 3;
+                                        ch = kReplacementCharacter;
+        }
+
+        /* Compare remaining room instead of advancing first, so no
+           pointer beyond targetEnd is ever formed. */
+        if (targetEnd - target < bytesToWrite) {
+            source = oldSource;
+            result = targetExhausted; break;
+        }
+        target += bytesToWrite;
+        switch (bytesToWrite) { /* note: code falls through cases! */
+            case 6: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 5: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ================================================================ */
+/* Convert UTF-8 bytes in [*sourceStart, sourceEnd) to UCS-4 values in
+   [*targetStart, targetEnd).
+
+   The sequence length is taken from the lead byte via bytesFromUTF8;
+   trail bytes are not validated.  Every decoded character occupies
+   exactly one target element: values above kMaximumUCS4 are replaced
+   by kReplacementCharacter, all others are stored as they are (no
+   surrogate pairs are produced in a UCS-4 buffer).
+
+   Returns ok, sourceExhausted when the last sequence is cut short by
+   sourceEnd, or targetExhausted when the target is full.  In both
+   failure cases *sourceStart is left at the lead byte of the character
+   that was not converted. */
+
+ConversionResult ConvertUTF8toUCS4 (
+        UTF8** sourceStart, UTF8* sourceEnd,
+        UCS4** targetStart, const UCS4* targetEnd)
+{
+    ConversionResult result = ok;
+    register UTF8* source = *sourceStart;
+    register UCS4* target = *targetStart;
+    while (source < sourceEnd) {
+        register UCS4 ch = 0;
+        register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
+        UTF8* const oldSource = source;    /* rewind point if target is full */
+        /* The sequence needs extraBytesToWrite + 1 bytes in total. */
+        if (sourceEnd - source <= extraBytesToWrite) {
+            result = sourceExhausted; break;
+        }
+        switch(extraBytesToWrite) { /* note: code falls through cases! */
+            case 5: ch += *source++; ch <<= 6;
+            case 4: ch += *source++; ch <<= 6;
+            case 3: ch += *source++; ch <<= 6;
+            case 2: ch += *source++; ch <<= 6;
+            case 1: ch += *source++; ch <<= 6;
+            case 0: ch += *source++;
+        }
+        ch -= offsetsFromUTF8[extraBytesToWrite];
+        /* The offsets assume 32-bit arithmetic; reduce explicitly in
+           case unsigned long is wider. */
+        ch &= 0xFFFFFFFFUL;
+
+        if (target >= targetEnd) {
+            source = oldSource;
+            result = targetExhausted; break;
+        }
+        if (ch > kMaximumUCS4) {
+            *target++ = kReplacementCharacter;
+        } else {
+            *target++ = ch;
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
diff --git a/CVTUTF.H b/CVTUTF.H
new file mode 100644
index 0000000..85fd8ef
--- /dev/null
+++ b/CVTUTF.H
@@ -0,0 +1,106 @@
+/* ================================================================ */
+/*
+File: ConvertUTF.h
+Author: Mark E. Davis
+Copyright (C) 1994 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent.
+
+Taligent grants the right to use or reprint this code as long as this
+ENTIRE copyright notice is reproduced in the code or reproduction.
+The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
+EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN
+NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+/* NOTE(review): the header names on the #include lines below are
+   missing -- the <...> operands appear to have been stripped when this
+   patch was extracted.  Restore them from the original CVTUTF.H before
+   compiling. */
+#include
+#include
+// #include
+#include
+
+/* ================================================================ */
+/* The following 4 definitions are compiler-specific.
+    I would use wchar_t for UCS2/UTF16, except that the C standard
+    does not guarantee that it has at least 16 bits, so wchar_t is
+    no more portable than unsigned short!
+*/
+
+typedef unsigned long   UCS4;
+typedef unsigned short  UCS2;
+typedef unsigned short  UTF16;
+typedef unsigned char   UTF8;
+
+typedef enum {false, true} Boolean;
+
+
+/* Limits and the substitute character used by the converters.
+   These are defined (not merely declared) in a header, so they are
+   given internal linkage: without `static` every C source file that
+   includes this header would define the same external object and the
+   program would fail to link. */
+static const UCS4 kReplacementCharacter = 0x0000FFFDUL;
+static const UCS4 kMaximumUCS2          = 0x0000FFFFUL;
+static const UCS4 kMaximumUTF16         = 0x0010FFFFUL;
+static const UCS4 kMaximumUCS4          = 0x7FFFFFFFUL;
+
+/* ================================================================ */
+/* Each of these routines converts the text between *sourceStart and
+sourceEnd, putting the result into the buffer between *targetStart and
+targetEnd. Note: the end pointers are *after* the last item: e.g.
+*(sourceEnd - 1) is the last item.
+
+    The return result indicates whether the conversion was successful,
+and if not, whether the problem was in the source or target buffers.
+
+    After the conversion, *sourceStart and *targetStart are both
+updated to point to the end of last text successfully converted in
+the respective buffers.
+*/
+
+/* Status returned by every converter declared below. */
+typedef enum {
+    ok,                 /* conversion successful */
+    sourceExhausted,    /* partial character in source, but hit end */
+    targetExhausted     /* insuff. room in target for conversion */
+} ConversionResult;
+
+/* UCS-4 to UTF-16.  Values above kMaximumUCS2 are written as a
+   surrogate pair; values above kMaximumUTF16 are written as
+   kReplacementCharacter. */
+ConversionResult ConvertUCS4toUTF16 (
+        UCS4** sourceStart, const UCS4* sourceEnd,
+        UTF16** targetStart, const UTF16* targetEnd);
+
+/* UTF-16 to UCS-4.  A high surrogate immediately followed by a low
+   surrogate is combined into one value; other units are copied. */
+ConversionResult ConvertUTF16toUCS4 (
+        UTF16** sourceStart, UTF16* sourceEnd,
+        UCS4** targetStart, const UCS4* targetEnd);
+
+/* UTF-16 to UTF-8.  Surrogate pairs are combined before encoding. */
+ConversionResult ConvertUTF16toUTF8 (
+        UTF16** sourceStart, const UTF16* sourceEnd,
+        UTF8** targetStart, const UTF8* targetEnd);
+
+/* UTF-8 to UTF-16.  May also return sourceExhausted. */
+ConversionResult ConvertUTF8toUTF16 (
+        UTF8** sourceStart, UTF8* sourceEnd,
+        UTF16** targetStart, const UTF16* targetEnd);
+
+/* UCS-4 to UTF-8, using sequences of 1 to 6 bytes. */
+ConversionResult ConvertUCS4toUTF8 (
+        UCS4** sourceStart, const UCS4* sourceEnd,
+        UTF8** targetStart, const UTF8* targetEnd);
+
+/* UTF-8 to UCS-4.  May also return sourceExhausted. */
+ConversionResult ConvertUTF8toUCS4 (
+        UTF8** sourceStart, UTF8* sourceEnd,
+        UCS4** targetStart, const UCS4* targetEnd);
+
+/* ================================================================ */
diff --git a/CVTUTF7.C b/CVTUTF7.C
new file mode 100644
index 0000000..09583cb
--- /dev/null
+++ b/CVTUTF7.C
@@ -0,0 +1,300 @@
+/* ================================================================ */
+/*
+File: ConvertUTF7.c
+Author: David B. Goldsmith
+Copyright (C) 1994, 1996 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent.
+
+Taligent grants the right to use this code as long as this ENTIRE
+copyright notice is reproduced in the code. The code is provided
+AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
IN NO EVENT
+WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+
+#include "CVTUTF7.H"
+
+/* base64 maps a 6-bit value to its character; invbase64 is the reverse
+   map for 7-bit characters, with -1 for characters outside the set. */
+static char base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static short invbase64[128];
+
+static char direct[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?";
+static char optional[] = "!\"#$%&*;<=>@[]^_`{|}";
+static char spaces[] = " \011\015\012"; /* space, tab, return, line feed */
+/* mustshiftsafe[c] / mustshiftopt[c] are non-zero when 7-bit character
+   c has to be base64-encoded; the "opt" table additionally lets the
+   characters in optional[] through unencoded. */
+static char mustshiftsafe[128];
+static char mustshiftopt[128];
+
+static int needtables = 1;
+
+#define SHIFT_IN '+'
+#define SHIFT_OUT '-'
+
+/* Fill invbase64, mustshiftsafe and mustshiftopt from the character
+   lists above and clear needtables.  The lists are walked up to their
+   terminating NUL so no <string.h> declaration is required, and the
+   characters are converted to unsigned char before being used as
+   subscripts. */
+static void
+tabinit(void)
+{
+    int i;
+
+    for (i = 0; i < 128; ++i)
+    {
+        mustshiftopt[i] = mustshiftsafe[i] = 1;
+        invbase64[i] = -1;
+    }
+    for (i = 0; direct[i] != '\0'; ++i)
+        mustshiftopt[(unsigned char)direct[i]] = mustshiftsafe[(unsigned char)direct[i]] = 0;
+    for (i = 0; spaces[i] != '\0'; ++i)
+        mustshiftopt[(unsigned char)spaces[i]] = mustshiftsafe[(unsigned char)spaces[i]] = 0;
+    for (i = 0; optional[i] != '\0'; ++i)
+        mustshiftopt[(unsigned char)optional[i]] = 0;
+    for (i = 0; base64[i] != '\0'; ++i)
+        invbase64[(unsigned char)base64[i]] = (short)i;
+
+    needtables = 0;
+}
+
+/* Bit buffer: pending bits are kept left-aligned in the low 32 bits of
+   BITbuffer, bufferbits counts them.
+
+   WRITE_N_BITS appends the low n bits of x; READ_N_BITS removes and
+   yields the oldest n bits.  Both do nothing for n == 0 (a shift by 32
+   would otherwise be undefined where unsigned long has 32 bits), the
+   mask is built from an unsigned constant (left-shifting -1L is
+   undefined), and the buffer is masked to 32 bits after every read so
+   that stale bits cannot leak into later reads where unsigned long is
+   wider than 32 bits.
+
+   TARGETCHECK must stay a bare block: its `break` is meant to leave
+   the enclosing do/while loop of the converter that uses it. */
+#define DECLARE_BIT_BUFFER register unsigned long BITbuffer = 0, buffertemp = 0; int bufferbits = 0
+#define BITS_IN_BUFFER bufferbits
+#define WRITE_N_BITS(x, n) ((n) > 0 \
+    ? ((BITbuffer |= (((unsigned long)(x) & ((1UL << (n)) - 1UL)) << (32 - (n) - bufferbits))), bufferbits += (n)) \
+    : bufferbits)
+#define READ_N_BITS(n) ((n) > 0 \
+    ? ((buffertemp = ((BITbuffer & 0xFFFFFFFFUL) >> (32 - (n)))), \
+       (BITbuffer = (BITbuffer << (n)) & 0xFFFFFFFFUL), \
+       (bufferbits -= (n)), buffertemp) \
+    : 0UL)
+#define TARGETCHECK {if (target >= targetEnd) {result = targetExhausted; break;}}
+
+/* Encode the UCS-2 units in [*sourceStart, sourceEnd) as UTF-7
+   characters in [*targetStart, targetEnd).
+
+   optional: non-zero lets the characters listed in optional[] be
+             written directly instead of base64-encoded.
+   verbose:  non-zero always writes SHIFT_OUT at the end of a shifted
+             sequence; otherwise it is written only when the following
+             character is a base64 character or SHIFT_OUT itself.
+
+   Returns ok, or targetExhausted as soon as a character has to be
+   written and the target is full.  *sourceStart and *targetStart are
+   updated to the positions reached.  The bit buffer is local, so bits
+   still pending when targetExhausted is returned are not carried over
+   to a later call. */
+ConversionResult ConvertUCS2toUTF7(
+    UCS2** sourceStart, UCS2* sourceEnd,
+    char** targetStart, char* targetEnd,
+    int optional, int verbose)
+{
+    ConversionResult result = ok;
+    DECLARE_BIT_BUFFER;
+    int shifted = 0, needshift = 0, done = 0;
+    register UCS2 *source = *sourceStart;
+    register char *target = *targetStart;
+    char *mustshift;
+
+    if (needtables)
+        tabinit();
+
+    if (optional)
+        mustshift = mustshiftopt;
+    else
+        mustshift = mustshiftsafe;
+
+    do
+    {
+        /* r is only meaningful while !done; every use below is
+           guarded by needshift or !done. */
+        register UCS2 r = 0;
+
+        if (!(done = (source >= sourceEnd)))
+            r = *source++;
+        needshift = (!done && ((r > 0x7f) || mustshift[r]));
+
+        if (needshift && !shifted)
+        {
+            TARGETCHECK;
+            *target++ = SHIFT_IN;
+            /* Special case handling of the SHIFT_IN character */
+            if (r == (UCS2)SHIFT_IN) {
+                TARGETCHECK;
+                *target++ = SHIFT_OUT;
+            }
+            else
+                shifted = 1;
+        }
+
+        if (shifted)
+        {
+            /* Either write the character to the bit buffer, or pad
+               the bit buffer out to a full base64 character.
+             */
+            if (needshift)
+                WRITE_N_BITS(r, 16);
+            else
+                WRITE_N_BITS(0, (6 - (BITS_IN_BUFFER % 6))%6);
+
+            /* Flush out as many full base64 characters as possible
+               from the bit buffer.
+             */
+            while ((target < targetEnd) && BITS_IN_BUFFER >= 6)
+            {
+                *target++ = base64[READ_N_BITS(6)];
+            }
+
+            if (BITS_IN_BUFFER >= 6)
+                TARGETCHECK;
+
+            if (!needshift)
+            {
+                /* Write the explicit shift out character if
+                   1) The caller has requested we always do it, or
+                   2) The directly encoded character is in the
+                      base64 set, or
+                   3) The directly encoded character is SHIFT_OUT.
+                 */
+                if (verbose || ((!done) && (invbase64[r] >=0 || r == SHIFT_OUT)))
+                {
+                    TARGETCHECK;
+                    *target++ = SHIFT_OUT;
+                }
+                shifted = 0;
+            }
+        }
+
+        /* The character can be directly encoded as ASCII. */
+        if (!needshift && !done)
+        {
+            TARGETCHECK;
+            *target++ = (char) r;
+        }
+
+    }
+    while (!done);
+
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* Decode the UTF-7 characters in [*sourceStart, sourceEnd) into UCS-2
+   units in [*targetStart, targetEnd).
+
+   Returns ok, targetExhausted when a unit has to be written and the
+   target is full, or sourceCorrupt.  sourceCorrupt is recorded for an
+   empty shifted sequence, non-zero pad bits at the end of a shifted
+   sequence, or a directly encoded byte above 0x7f; decoding continues
+   after such an error.  *sourceStart and *targetStart are updated to
+   the positions reached.
+
+   Source bytes are read as unsigned char so that bytes above 0x7f are
+   seen as 0x80..0xFF whether or not plain char is signed. */
+ConversionResult ConvertUTF7toUCS2(
+    char** sourceStart, char* sourceEnd,
+    UCS2** targetStart, UCS2* targetEnd)
+{
+    ConversionResult result = ok;
+    DECLARE_BIT_BUFFER;
+    int shifted = 0, first = 0, wroteone = 0, base64EOF = 0, base64value = 0, done = 0;
+    unsigned int c = 0, prevc = 0;
+    unsigned long junk;
+    register char *source = *sourceStart;
+    register UCS2 *target = *targetStart;
+
+    if (needtables)
+        tabinit();
+
+    do
+    {
+        /* read an ASCII character c */
+        if (!(done = (source >= sourceEnd)))
+            c = (unsigned char)*source++;
+        if (shifted)
+        {
+            /* We're done with a base64 string if we hit EOF, it's not a valid
+               ASCII character, or it's not in the base64 set.
+             */
+            base64EOF = done || (c > 0x7f) || (base64value = invbase64[c]) < 0;
+            if (base64EOF)
+            {
+                shifted = 0;
+                /* If the character causing us to drop out was SHIFT_IN or
+                   SHIFT_OUT, it may be a special escape for SHIFT_IN. The
+                   test for SHIFT_IN is not necessary, but allows an alternate
+                   form of UTF-7 where SHIFT_IN is escaped by SHIFT_IN. This
+                   only works for some values of SHIFT_IN.
+                 */
+                if (!done && (c == SHIFT_IN || c == SHIFT_OUT))
+                {
+                    /* get another character c */
+                    prevc = c;
+                    if (!(done = (source >= sourceEnd)))
+                        c = (unsigned char)*source++;
+                    /* If no base64 characters were encountered, and the
+                       character terminating the shift sequence was
+                       SHIFT_OUT, then it's a special escape for SHIFT_IN.
+                     */
+                    if (first && prevc == SHIFT_OUT)
+                    {
+                        /* write SHIFT_IN unicode */
+                        TARGETCHECK;
+                        *target++ = (UCS2)SHIFT_IN;
+                    }
+                    else if (!wroteone)
+                    {
+                        /* empty shifted sequence */
+                        result = sourceCorrupt;
+                    }
+                }
+                else if (!wroteone)
+                {
+                    /* empty shifted sequence */
+                    result = sourceCorrupt;
+                }
+            }
+            else
+            {
+                /* Add another 6 bits of base64 to the bit buffer. */
+                WRITE_N_BITS(base64value, 6);
+                first = 0;
+            }
+
+            /* Extract as many full 16 bit characters as possible from the
+               bit buffer.
+             */
+            while (BITS_IN_BUFFER >= 16 && (target < targetEnd))
+            {
+                /* write a unicode */
+                *target++ = (UCS2)READ_N_BITS(16);
+                wroteone = 1;
+            }
+
+            if (BITS_IN_BUFFER >= 16)
+                TARGETCHECK;
+
+            if (base64EOF)
+            {
+                junk = READ_N_BITS(BITS_IN_BUFFER);
+                if (junk)
+                {
+                    /* non-zero pad bits */
+                    result = sourceCorrupt;
+                }
+            }
+        }
+
+        if (!shifted && !done)
+        {
+            if (c == SHIFT_IN)
+            {
+                shifted = 1;
+                first = 1;
+                wroteone = 0;
+            }
+            else
+            {
+                /* It must be a directly encoded character. */
+                if (c > 0x7f)
+                {
+                    /* non-ASCII byte outside a shifted sequence */
+                    result = sourceCorrupt;
+                }
+                /* write a unicode */
+                TARGETCHECK;
+                *target++ = (UCS2)c;
+            }
+        }
+    }
+    while (!done);
+
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
diff --git a/CVTUTF7.H b/CVTUTF7.H
new file mode 100644
index 0000000..788e0d1
--- /dev/null
+++ b/CVTUTF7.H
@@ -0,0 +1,80 @@
+/* ================================================================ */
+/*
+File: ConvertUTF7.h
+Author: David B. Goldsmith
+Copyright (C) 1994 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent.
+
+Taligent grants the right to use this code as long as this ENTIRE
+copyright notice is reproduced in the code. The code is provided
+AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+/* Include guard: the typedef and enum below must not be seen twice in
+   one translation unit. */
+#ifndef CVTUTF7_H
+#define CVTUTF7_H
+
+/* ================================================================ */
+/* The following definitions are compiler-specific.
+    I would use wchar_t for UCS2/UTF16, except that the C standard
+    does not guarantee that it has at least 16 bits, so wchar_t is
+    no more portable than unsigned short!
+*/
+
+typedef unsigned short  UCS2;
+
+/* ================================================================ */
+/* Each of these routines converts the text between *sourceStart and
+sourceEnd, putting the result into the buffer between *targetStart and
+targetEnd. Note: the end pointers are *after* the last item: e.g.
+*(sourceEnd - 1) is the last item.
+
+    The return result indicates whether the conversion was successful,
+and if not, whether the problem was in the source or target buffers.
+
+    After the conversion, *sourceStart and *targetStart are both
+updated to point to the end of last text successfully converted in
+the respective buffers.
+
+    In ConvertUCS2toUTF7, optional indicates whether UTF-7 optional
+characters should be directly encoded, and verbose controls whether the
+shift-out character, "-", is always emitted at the end of a shifted
+sequence.
+*/
+
+/* Status returned by both UTF-7 converters. */
+typedef enum {
+    ok,                 /* conversion successful */
+    sourceCorrupt,      /* source contains invalid UTF-7 */
+    targetExhausted     /* insuff. room in target for conversion */
+} ConversionResult;
+
+/* UCS-2 to UTF-7; see the comment above for optional and verbose. */
+extern ConversionResult ConvertUCS2toUTF7 (
+        UCS2** sourceStart, UCS2* sourceEnd,
+        char** targetStart, char* targetEnd,
+        int optional, int verbose);
+
+/* UTF-7 to UCS-2. */
+extern ConversionResult ConvertUTF7toUCS2 (
+        char** sourceStart, char* sourceEnd,
+        UCS2** targetStart, UCS2* targetEnd);
+
+#endif /* CVTUTF7_H */
+
+/* ================================================================ */
-- 
cgit v1.2.3