http://web.archive.org/web/20000821234610/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/

author: Luke Shumaker <lukeshu@lukeshu.com> 2000-08-21 23:46:10 -0500
committer: Luke Shumaker <lukeshu@lukeshu.com> 2000-08-21 23:46:10 -0500
commit: 5a42c4a3c2a175170868b77aff9a92197f4a3bf9 (patch)
tree: 4a3e4422220e4fb42a98718156f0019dba0f215f
5 files changed, 821 insertions, 0 deletions
diff --git a/.metadata.txt b/.metadata.txt
new file mode 100644
index 0000000..348ca97
--- /dev/null
+++ b/.metadata.txt
@@ -0,0 +1,4 @@
+CVTUTF.C                1998-01-27 16:51
+CVTUTF.H                1998-01-27 16:51
+CVTUTF7.C               1998-01-27 16:51
+CVTUTF7.H               1998-01-27 16:51
diff --git a/CVTUTF.C b/CVTUTF.C
new file mode 100644
index 0000000..94898bc
--- /dev/null
+++ b/CVTUTF.C
@@ -0,0 +1,331 @@
+/* ================================================================ */
+/*
+File:	ConvertUTF.C
+Author: Mark E. Davis
+Copyright (C) 1994 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent. 
+
+Taligent grants the right to use or reprint this code as long as this
+ENTIRE copyright notice is reproduced in the code or reproduction.
+The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
+EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
+NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+#include "CVTUTF.H"
+
+/* ================================================================ */
+
+const int halfShift				= 10;			/* bits carried by each surrogate half */
+const UCS4 halfBase				= 0x0010000UL;	/* subtracted before splitting into a pair, added back after joining */
+const UCS4 halfMask				= 0x3FFUL;		/* low halfShift bits, stored in the low surrogate */
+const UCS4 kSurrogateHighStart	= 0xD800UL;		/* first high (leading) surrogate */
+const UCS4 kSurrogateHighEnd	= 0xDBFFUL;		/* last high (leading) surrogate */
+const UCS4 kSurrogateLowStart	= 0xDC00UL;		/* first low (trailing) surrogate */
+const UCS4 kSurrogateLowEnd		= 0xDFFFUL;		/* last low (trailing) surrogate */
+
+/* ================================================================ */
+
+/* UCS-4 -> UTF-16.  Values above kMaximumUTF16 become kReplacementCharacter.
+   A character is consumed only once all of its output units have been
+   written, so after targetExhausted *sourceStart points at it. */
+ConversionResult	ConvertUCS4toUTF16 (
+		UCS4** sourceStart, const UCS4* sourceEnd, 
+		UTF16** targetStart, const UTF16* targetEnd) {
+	ConversionResult result = ok;
+	register UCS4* source = *sourceStart;
+	register UTF16* target = *targetStart;
+	while (source < sourceEnd) {
+		register UCS4 ch = *source;	/* peek; consumed at the loop bottom */
+		if (target >= targetEnd) { result = targetExhausted; break; }
+		if (ch <= kMaximumUCS2) {
+			*target++ = ch;
+		} else if (ch > kMaximumUTF16) {
+			*target++ = kReplacementCharacter;
+		} else if (target + 1 >= targetEnd) {	/* a pair needs two units */
+			result = targetExhausted; break;
+		} else {
+			ch -= halfBase;
+			*target++ = (ch >> halfShift) + kSurrogateHighStart;
+			*target++ = (ch & halfMask) + kSurrogateLowStart;
+		}
+		++source;
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
+
+/* ================================================================ */
+
+/* UTF-16 -> UCS-4.  A high surrogate followed by a low surrogate is joined
+   into one value; every other unit is copied through unchanged. */
+ConversionResult	ConvertUTF16toUCS4 (
+		UTF16** sourceStart, UTF16* sourceEnd, 
+		UCS4** targetStart, const UCS4* targetEnd) {
+	ConversionResult result = ok;
+	register UTF16* source = *sourceStart;
+	register UCS4* target = *targetStart;
+	while (source < sourceEnd) {
+		register UCS4 ch;
+		if (target >= targetEnd) { result = targetExhausted; break; }	/* before consuming input */
+		ch = *source++;
+		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) {
+			register UCS4 ch2 = *source;
+			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+				ch = ((ch - kSurrogateHighStart) << halfShift)
+					+ (ch2 - kSurrogateLowStart) + halfBase;
+				++source;
+			}
+		}
+		*target++ = ch;
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
+
+/* ================================================================ */
+
+UCS4 offsetsFromUTF8[6] =	{0x00000000UL, 0x00003080UL, 0x000E2080UL, 	/* indexed by trail-byte count; subtracted from the summed bytes */
+					 	 	 0x03C82080UL, 0xFA082080UL, 0x82082080UL};
+char bytesFromUTF8[256] = {	/* lead byte -> number of bytes that follow it */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x00-0x1F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x20-0x3F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x40-0x5F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x60-0x7F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0x80-0x9F: treated as single bytes */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 0xA0-0xBF: treated as single bytes */
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0xC0-0xDF */
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};	/* 0xE0-0xFF */
+/* firstByteMark[n]: bits OR-ed into the first byte of an n-byte sequence. */
+UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
+
+/* ================================================================ */
+/*	This code is similar in effect to making successive calls on the
+mbtowc and wctomb routines in FSS-UTF. However, it is considerably
+different in code:
+* it is adapted to be consistent with UTF16,
+* the interface converts a whole buffer to avoid function-call overhead
+* constants have been gathered.
+* loops & conditionals have been removed as much as possible for
+efficiency, in favor of drop-through switch statements.
+*/
+
+/* ================================================================ */
+/* UTF-16 -> UTF-8.  Surrogate pairs are joined before encoding; any other
+   unit is encoded as-is.  When a character does not fit in the target the
+   source is rewound to its first unit before targetExhausted is returned. */
+ConversionResult	ConvertUTF16toUTF8 (
+		UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF8** targetStart, const UTF8* targetEnd)
+{
+	ConversionResult result = ok;
+	register UTF16* source = *sourceStart;
+	register UTF8* target = *targetStart;
+	while (source < sourceEnd) {
+		UTF16* const charStart = source;	/* rewind point */
+		register UCS4 ch = *source++;
+		register unsigned short bytesToWrite = 0;
+		register const UCS4 byteMask = 0xBF;
+		register const UCS4 byteMark = 0x80;
+		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
+				&& source < sourceEnd) {
+			register UCS4 ch2 = *source;
+			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+				ch = ((ch - kSurrogateHighStart) << halfShift)
+					+ (ch2 - kSurrogateLowStart) + halfBase;
+				++source;
+			}
+		}
+		if (ch < 0x80) {				bytesToWrite = 1;
+		} else if (ch < 0x800) {		bytesToWrite = 2;
+		} else if (ch < 0x10000) {		bytesToWrite = 3;
+		} else if (ch < 0x200000) {		bytesToWrite = 4;
+		} else if (ch < 0x4000000) {	bytesToWrite = 5;
+		} else if (ch <= kMaximumUCS4){	bytesToWrite = 6;
+		} else {						bytesToWrite = 3; ch = kReplacementCharacter; }	/* U+FFFD is 3 bytes */
+		if (targetEnd - target < bytesToWrite) {	/* compare sizes; never form a pointer past targetEnd */
+			source = charStart; result = targetExhausted; break;
+		}
+		target += bytesToWrite;
+		switch (bytesToWrite) {	/* note: code falls through cases! */
+			case 6:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 5:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 4:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 3:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 2:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 1:	*--target =  ch | firstByteMark[bytesToWrite];
+		}
+		target += bytesToWrite;	/* the switch walked back to the first byte */
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
+
+/* ================================================================ */
+
+/* UTF-8 -> UTF-16.  bytesFromUTF8 gives the trail-byte count for each lead
+   byte; a sequence cut off by sourceEnd yields sourceExhausted.  A character
+   that does not fit in the target is left unconsumed in the source. */
+ConversionResult	ConvertUTF8toUTF16 (
+		UTF8** sourceStart, UTF8* sourceEnd, 
+		UTF16** targetStart, const UTF16* targetEnd)
+{
+	ConversionResult result = ok;
+	register UTF8* source = *sourceStart;
+	register UTF16* target = *targetStart;
+	while (source < sourceEnd) {
+		UTF8* const charStart = source;	/* rewind point */
+		register UCS4 ch = 0;
+		register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
+		if (extraBytesToWrite >= sourceEnd - source) {	/* sequence is extra + 1 bytes long */
+			result = sourceExhausted; break;
+		}
+		switch(extraBytesToWrite) {	/* note: code falls through cases! */
+			case 5:	ch += *source++; ch <<= 6;
+			case 4:	ch += *source++; ch <<= 6;
+			case 3:	ch += *source++; ch <<= 6;
+			case 2:	ch += *source++; ch <<= 6;
+			case 1:	ch += *source++; ch <<= 6;
+			case 0:	ch += *source++;
+		}
+		ch -= offsetsFromUTF8[extraBytesToWrite];
+		if (target >= targetEnd) { source = charStart; result = targetExhausted; break; }
+		if (ch <= kMaximumUCS2) {
+			*target++ = ch;
+		} else if (ch > kMaximumUTF16) {
+			*target++ = kReplacementCharacter;
+		} else if (target + 1 >= targetEnd) {	/* a pair needs two units */
+			source = charStart; result = targetExhausted; break;
+		} else {
+			ch -= halfBase;
+			*target++ = (ch >> halfShift) + kSurrogateHighStart;
+			*target++ = (ch & halfMask) + kSurrogateLowStart;
+		}
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
+
+/* ================================================================ */
+/* UCS-4 -> UTF-8.  A high/low surrogate pair found in the input is joined
+   first; values above kMaximumUCS4 become kReplacementCharacter.  When a
+   character does not fit in the target the source is rewound to its start. */
+ConversionResult	ConvertUCS4toUTF8 (
+		UCS4** sourceStart, const UCS4* sourceEnd, 
+		UTF8** targetStart, const UTF8* targetEnd)
+{
+	ConversionResult result = ok;
+	register UCS4* source = *sourceStart;
+	register UTF8* target = *targetStart;
+	while (source < sourceEnd) {
+		UCS4* const charStart = source;	/* rewind point */
+		register UCS4 ch = *source++;
+		register unsigned short bytesToWrite = 0;
+		register const UCS4 byteMask = 0xBF;
+		register const UCS4 byteMark = 0x80;
+		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
+				&& source < sourceEnd) {
+			register UCS4 ch2 = *source;
+			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
+				ch = ((ch - kSurrogateHighStart) << halfShift)
+					+ (ch2 - kSurrogateLowStart) + halfBase;
+				++source;
+			}
+		}
+		if (ch < 0x80) {				bytesToWrite = 1;
+		} else if (ch < 0x800) {		bytesToWrite = 2;
+		} else if (ch < 0x10000) {		bytesToWrite = 3;
+		} else if (ch < 0x200000) {		bytesToWrite = 4;
+		} else if (ch < 0x4000000) {	bytesToWrite = 5;
+		} else if (ch <= kMaximumUCS4){	bytesToWrite = 6;
+		} else {						bytesToWrite = 3; ch = kReplacementCharacter; }	/* U+FFFD is 3 bytes */
+		if (targetEnd - target < bytesToWrite) {	/* compare sizes; never form a pointer past targetEnd */
+			source = charStart; result = targetExhausted; break;
+		}
+		target += bytesToWrite;
+		switch (bytesToWrite) {	/* note: code falls through cases! */
+			case 6:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 5:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 4:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 3:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 2:	*--target = (ch | byteMark) & byteMask; ch >>= 6;
+			case 1:	*--target =  ch | firstByteMark[bytesToWrite];
+		}
+		target += bytesToWrite;	/* the switch walked back to the first byte */
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
+
+/* ================================================================ */
+
+/* UTF-8 -> UCS-4.  Each decoded value is stored as one UCS-4 unit; values
+   above kMaximumUCS4 become kReplacementCharacter.  A sequence cut off by
+   sourceEnd yields sourceExhausted.  Target room is checked before a
+   sequence is consumed, so targetExhausted never drops input. */
+ConversionResult	ConvertUTF8toUCS4 (
+		UTF8** sourceStart, UTF8* sourceEnd, 
+		UCS4** targetStart, const UCS4* targetEnd)
+{
+	ConversionResult result = ok;
+	register UTF8* source = *sourceStart;
+	register UCS4* target = *targetStart;
+	while (source < sourceEnd) {
+		register UCS4 ch = 0;
+		register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
+		/* The whole sequence is extraBytesToWrite + 1 bytes long. */
+		if (extraBytesToWrite >= sourceEnd - source) {
+			result = sourceExhausted; break;
+		}
+		if (target >= targetEnd) {	/* exactly one unit is written per character */
+			result = targetExhausted; break;
+		}
+
+		switch(extraBytesToWrite) {	/* note: code falls through cases! */
+			case 5:	ch += *source++; ch <<= 6;
+			case 4:	ch += *source++; ch <<= 6;
+			case 3:	ch += *source++; ch <<= 6;
+			case 2:	ch += *source++; ch <<= 6;
+			case 1:	ch += *source++; ch <<= 6;
+			case 0:	ch += *source++;
+		}
+		ch -= offsetsFromUTF8[extraBytesToWrite];
+
+		/* UCS-4 stores every value directly, so no surrogate pair is
+		   produced here (unlike the UTF-16 target routines). */
+		if (ch > kMaximumUCS4) {
+			ch = kReplacementCharacter;
+		}
+		*target++ = ch;
+	}
+	*sourceStart = source;
+	*targetStart = target;
+	return result;
+}
diff --git a/CVTUTF.H b/CVTUTF.H
new file mode 100644
index 0000000..85fd8ef
--- /dev/null
+++ b/CVTUTF.H
@@ -0,0 +1,106 @@
+/* ================================================================ */
+/*
+File:	ConvertUTF.h
+Author: Mark E. Davis
+Copyright (C) 1994 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent. 
+
+Taligent grants the right to use or reprint this code as long as this
+ENTIRE copyright notice is reproduced in the code or reproduction.
+The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
+EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
+NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+#include <stdio.h>
+#include <stdlib.h>
+// #include <types.h>
+#include <string.h>
+
+/* ================================================================ */
+/*	The following 4 definitions are compiler-specific.
+	I would use wchar_t for UCS2/UTF16, except that the C standard
+	does not guarantee that it has at least 16 bits, so wchar_t is
+	no more portable than unsigned short!
+*/
+
+typedef unsigned long	UCS4;	/* one UCS-4 code unit */
+typedef unsigned short	UCS2;	/* one UCS-2 code unit */
+typedef unsigned short	UTF16;	/* one UTF-16 code unit */
+typedef unsigned char	UTF8;	/* one UTF-8 byte */
+
+typedef enum {false, true} Boolean;
+
+/* static: a plain file-scope const in a C header is defined again in every file that includes it */
+static const UCS4 kReplacementCharacter =	0x0000FFFDUL;	/* substituted for values the target cannot hold */
+static const UCS4 kMaximumUCS2 =			0x0000FFFFUL;	/* largest value stored as a single 16-bit unit */
+static const UCS4 kMaximumUTF16 =			0x0010FFFFUL;	/* largest value written as a surrogate pair */
+static const UCS4 kMaximumUCS4 =			0x7FFFFFFFUL;	/* largest value encoded in 6 UTF-8 bytes */
+
+/* ================================================================ */
+/*	Each of these routines converts the text between *sourceStart and 
+sourceEnd, putting the result into the buffer between *targetStart and
+targetEnd. Note: the end pointers are *after* the last item: e.g. 
+*(sourceEnd - 1) is the last item.
+
+	The return result indicates whether the conversion was successful,
+and if not, whether the problem was in the source or target buffers.
+
+	After the conversion, *sourceStart and *targetStart are both
+updated to point to the end of last text successfully converted in
+the respective buffers.
+*/
+
+typedef enum {
+	ok, 				/* conversion successful */
+	sourceExhausted,	/* source ends in the middle of a character */
+	targetExhausted		/* insufficient room in target for conversion */
+} ConversionResult;	/* NOTE(review): CVTUTF7.H declares a different enum under this same name */
+
+ConversionResult	ConvertUCS4toUTF16 (	/* UCS-4 -> UTF-16 */
+		UCS4** sourceStart, const UCS4* sourceEnd, 
+		UTF16** targetStart, const UTF16* targetEnd);
+
+ConversionResult	ConvertUTF16toUCS4 (	/* UTF-16 -> UCS-4 */
+		UTF16** sourceStart, UTF16* sourceEnd, 
+		UCS4** targetStart, const UCS4* targetEnd);
+
+ConversionResult	ConvertUTF16toUTF8 (	/* UTF-16 -> UTF-8 */
+		UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF8** targetStart, const UTF8* targetEnd);
+		
+ConversionResult	ConvertUTF8toUTF16 (	/* UTF-8 -> UTF-16 */
+		UTF8** sourceStart, UTF8* sourceEnd, 
+		UTF16** targetStart, const UTF16* targetEnd);
+
+ConversionResult	ConvertUCS4toUTF8 (	/* UCS-4 -> UTF-8 */
+		UCS4** sourceStart, const UCS4* sourceEnd, 
+		UTF8** targetStart, const UTF8* targetEnd);
+		
+ConversionResult	ConvertUTF8toUCS4 (	/* UTF-8 -> UCS-4 */
+		UTF8** sourceStart, UTF8* sourceEnd, 
+		UCS4** targetStart, const UCS4* targetEnd);
+
+/* ================================================================ */
diff --git a/CVTUTF7.C b/CVTUTF7.C
new file mode 100644
index 0000000..09583cb
--- /dev/null
+++ b/CVTUTF7.C
@@ -0,0 +1,300 @@
+/* ================================================================ */
+/*
+File:   ConvertUTF7.c
+Author: David B. Goldsmith
+Copyright (C) 1994, 1996 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent. 
+
+Taligent grants the right to use this code as long as this ENTIRE
+copyright notice is reproduced in the code.  The code is provided
+AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN NO EVENT
+WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+
+#include "CVTUTF7.H"
+
+static char base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";	/* 6-bit value -> digit */
+static short invbase64[128];	/* ASCII code -> 6-bit value, or -1; filled by tabinit */
+
+static char direct[] = 	/* never shifted, in either table */
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?";
+static char optional[] = "!\"#$%&*;<=>@[]^_`{|}";	/* unshifted only in mustshiftopt */
+static char spaces[] = " \011\015\012";		/* space, tab, return, line feed */
+static char mustshiftsafe[128];	/* nonzero: code must be base64-encoded (optional set excluded) */
+static char mustshiftopt[128];	/* nonzero: code must be base64-encoded (optional set allowed direct) */
+
+static int needtables = 1;	/* cleared by tabinit once the tables are filled */
+
+#define SHIFT_IN '+'	/* opens a base64 run */
+#define SHIFT_OUT '-'	/* explicitly closes a base64 run */
+
+/* Fills invbase64, mustshiftsafe and mustshiftopt; callers run it once (see needtables). */
+static void
+tabinit(void)
+{
+	const char *p;	/* walks each NUL-terminated character set */
+	int i;
+
+	/* Start with every code marked must-shift and not a base64 digit. */
+	for (i = 0; i < 128; ++i)
+	{
+		mustshiftopt[i] = mustshiftsafe[i] = 1;
+		invbase64[i] = -1;
+	}
+	/* Direct characters and whitespace are unshifted in both tables. */
+	for (p = direct; *p != '\0'; ++p)
+		mustshiftopt[(unsigned char)*p] = mustshiftsafe[(unsigned char)*p] = 0;
+	for (p = spaces; *p != '\0'; ++p)
+		mustshiftopt[(unsigned char)*p] = mustshiftsafe[(unsigned char)*p] = 0;
+	for (p = optional; *p != '\0'; ++p)	/* optional set: unshifted only in the 'opt' table */
+		mustshiftopt[(unsigned char)*p] = 0;
+	for (p = base64; *p != '\0'; ++p)	/* digit -> 6-bit value */
+		invbase64[(unsigned char)*p] = (short)(p - base64);
+
+	needtables = 0;
+}
+
+#define DECLARE_BIT_BUFFER register unsigned long BITbuffer = 0, buffertemp = 0; int bufferbits = 0	/* 32-bit window filled from bit 31 downward */
+#define BITS_IN_BUFFER bufferbits	/* number of valid bits currently held */
+#define WRITE_N_BITS(x, n) ((n) > 0 ? ((BITbuffer |= ( (((unsigned long)(x)) & ~(~0UL<<(n))) << (32-(n)-bufferbits) ) ), bufferbits += (n) ) : bufferbits)	/* append low n bits of x; n == 0 is a no-op */
+#define READ_N_BITS(n) ((n) > 0 ? ((buffertemp = ((BITbuffer & 0xFFFFFFFFUL) >> (32-(n)))), (BITbuffer = (BITbuffer << (n)) & 0xFFFFFFFFUL), (bufferbits -= (n)), buffertemp) : 0UL)	/* remove and return the oldest n bits; masked so a wider unsigned long keeps no stale bits */
+#define TARGETCHECK  {if (target >= targetEnd) {result = targetExhausted; break;}}	/* leaves the enclosing loop when the target is full */
+
+ConversionResult ConvertUCS2toUTF7(
+                UCS2** sourceStart, UCS2* sourceEnd, 
+                char** targetStart, char* targetEnd,
+                int optional, int verbose)
+{	/* Encodes the UCS-2 text in [*sourceStart, sourceEnd) as UTF-7 into the target buffer. */
+	ConversionResult result = ok;
+	DECLARE_BIT_BUFFER;
+	int shifted = 0, needshift = 0, done = 0;	/* shifted: a base64 run is currently open */
+	register UCS2 *source = *sourceStart;
+	register char *target = *targetStart;
+	char *mustshift;	/* table selected by the 'optional' flag below */
+	/* Build the lookup tables on the first call. */
+	if (needtables)
+		tabinit();
+	/* NOTE: the 'optional' parameter hides the file-scope optional[] array in this function. */
+	if (optional)
+		mustshift = mustshiftopt;
+	else
+		mustshift = mustshiftsafe;
+
+	do
+	{
+		register UCS2 r;
+		/* One extra pass runs with done set so that an open base64 run gets closed. */
+		if (!(done = (source >= sourceEnd)))
+			r = *source++;
+		needshift = (!done && ((r > 0x7f) || mustshift[r]));	/* r is examined only when !done */
+
+		if (needshift && !shifted)
+		{
+			TARGETCHECK;
+			*target++ = SHIFT_IN;
+			/* Special case handling of the SHIFT_IN character */
+			if (r == (UCS2)SHIFT_IN) {
+				TARGETCHECK;
+				*target++ = SHIFT_OUT;
+			}
+			else
+				shifted = 1;
+		}
+
+		if (shifted)
+		{
+			/* Either write the character to the bit buffer, or pad
+			   the bit buffer out to a full base64 character.
+			 */
+			if (needshift)
+				WRITE_N_BITS(r, 16);
+			else
+				WRITE_N_BITS(0, (6 - (BITS_IN_BUFFER % 6))%6);
+
+			/* Flush out as many full base64 characters as possible
+			   from the bit buffer.
+			 */
+			while ((target < targetEnd) && BITS_IN_BUFFER >= 6)
+			{
+				*target++ = base64[READ_N_BITS(6)];
+			}
+
+			if (BITS_IN_BUFFER >= 6)	/* the flush loop stopped because the target is full */
+				TARGETCHECK;
+
+			if (!needshift)
+			{
+				/* Write the explicit shift out character if
+				   1) The caller has requested we always do it, or
+				   2) The directly encoded character is in the
+				   base64 set, or
+				   3) The directly encoded character is SHIFT_OUT.
+				 */
+				if (verbose || ((!done) && (invbase64[r] >=0 || r == SHIFT_OUT)))
+				{
+					TARGETCHECK;
+					*target++ = SHIFT_OUT;
+				}
+				shifted = 0;
+			}
+		}
+
+		/* The character can be directly encoded as ASCII. */
+		if (!needshift && !done)
+		{
+			TARGETCHECK;
+			*target++ = (char) r;
+		}
+
+	}
+	while (!done);
+	
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+ConversionResult ConvertUTF7toUCS2(
+                char** sourceStart, char* sourceEnd, 
+                UCS2** targetStart, UCS2* targetEnd)
+{	/* Decodes the UTF-7 text in [*sourceStart, sourceEnd) into UCS-2 in the target buffer. */
+	ConversionResult result = ok;
+	DECLARE_BIT_BUFFER;
+	int shifted = 0, first = 0, wroteone = 0, base64EOF, base64value, done;	/* first: no base64 digit seen since SHIFT_IN; wroteone: this run produced output */
+	unsigned int c, prevc;
+	unsigned long junk;	/* bits left in the buffer when a base64 run ends */
+	register char *source = *sourceStart;
+	register UCS2 *target = *targetStart;
+	/* Build the lookup tables on the first call. */
+	if (needtables)
+		tabinit();
+
+	do
+	{
+		/* read an ASCII character c */
+		if (!(done = (source >= sourceEnd)))
+			c = *source++;
+		if (shifted)
+		{
+			/* We're done with a base64 string if we hit EOF, it's not a valid
+			   ASCII character, or it's not in the base64 set.
+			 */
+			base64EOF = done || (c > 0x7f) || (base64value = invbase64[c]) < 0;
+			if (base64EOF)
+			{
+				shifted = 0;
+				/* If the character causing us to drop out was SHIFT_IN or
+				   SHIFT_OUT, it may be a special escape for SHIFT_IN. The
+				   test for SHIFT_IN is not necessary, but allows an alternate
+				   form of UTF-7 where SHIFT_IN is escaped by SHIFT_IN. This
+				   only works for some values of SHIFT_IN.
+				 */
+				if (!done && (c == SHIFT_IN || c == SHIFT_OUT))
+				{
+					/* get another character c */
+					prevc = c;
+					if (!(done = (source >= sourceEnd)))
+						c = *source++;
+					/* If no base64 characters were encountered, and the
+					   character terminating the shift sequence was
+					   SHIFT_OUT, then it's a special escape for SHIFT_IN.
+					 */
+					if (first && prevc == SHIFT_OUT)
+					{
+						/* write SHIFT_IN unicode */
+						TARGETCHECK;
+						*target++ = (UCS2)SHIFT_IN;
+					}
+					else if (!wroteone)
+					{
+						result = sourceCorrupt;
+						/* fprintf(stderr, "UTF7: empty sequence near byte %ld in input\n", source-sourceStart) */;
+					}
+				}
+				else if (!wroteone)
+				{
+					result = sourceCorrupt;
+					/* fprintf(stderr, "UTF7: empty sequence near byte %ld in input\n", source-sourceStart) */;
+				}
+			}
+			else
+			{
+				/* Add another 6 bits of base64 to the bit buffer. */
+				WRITE_N_BITS(base64value, 6);
+				first = 0;
+			}
+
+			/* Extract as many full 16 bit characters as possible from the
+			   bit buffer.
+			 */
+			while (BITS_IN_BUFFER >= 16 && (target < targetEnd))
+			{
+				/* write a unicode */
+				*target++ = READ_N_BITS(16);
+				wroteone = 1;
+			}
+
+			if (BITS_IN_BUFFER >= 16)	/* the extract loop stopped because the target is full */
+				TARGETCHECK;
+
+			if (base64EOF)
+			{
+				junk = READ_N_BITS(BITS_IN_BUFFER);	/* drain whatever bits remain */
+				if (junk)
+				{
+					result = sourceCorrupt;
+					/* fprintf(stderr, "UTF7: non-zero pad bits near byte %ld in input\n", source-sourceStart) */;
+				}
+			}
+		}
+		/* Outside a base64 run (possibly one that just ended above): c is SHIFT_IN or a direct character. */
+		if (!shifted && !done)
+		{
+			if (c == SHIFT_IN)
+			{
+				shifted = 1;
+				first = 1;
+				wroteone = 0;
+			}
+			else
+			{
+				/* It must be a directly encoded character. */
+				if (c > 0x7f)
+				{
+					result = sourceCorrupt;
+					/* fprintf(stderr, "UTF7: non-ASCII character near byte %ld in input\n", source-sourceStart) */;
+				}
+				/* write a unicode */
+				TARGETCHECK;
+				*target++ = c;	/* stored even when flagged corrupt just above */
+			}
+		}
+	}
+	while (!done);
+
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
diff --git a/CVTUTF7.H b/CVTUTF7.H
new file mode 100644
index 0000000..788e0d1
--- /dev/null
+++ b/CVTUTF7.H
@@ -0,0 +1,80 @@
+/* ================================================================ */
+/*
+File:   ConvertUTF7.h
+Author: David B. Goldsmith
+Copyright (C) 1994 Taligent, Inc. All rights reserved.
+
+This code is copyrighted. Under the copyright laws, this code may not
+be copied, in whole or part, without prior written consent of Taligent. 
+
+Taligent grants the right to use this code as long as this ENTIRE
+copyright notice is reproduced in the code.  The code is provided
+AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN NO EVENT
+WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
+WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
+INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
+LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
+IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
+LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
+LIMITATION MAY NOT APPLY TO YOU.
+
+RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
+government is subject to restrictions as set forth in subparagraph
+(c)(l)(ii) of the Rights in Technical Data and Computer Software
+clause at DFARS 252.227-7013 and FAR 52.227-19.
+
+This code may be protected by one or more U.S. and International
+Patents.
+
+TRADEMARKS: Taligent and the Taligent Design Mark are registered
+trademarks of Taligent, Inc.
+*/
+/* ================================================================ */
+
+/* ================================================================ */
+/*      The following definitions are compiler-specific.
+        I would use wchar_t for UCS2/UTF16, except that the C standard
+        does not guarantee that it has at least 16 bits, so wchar_t is
+        no more portable than unsigned short!
+*/
+
+typedef unsigned short  UCS2;	/* one UCS-2 code unit; same underlying type as the UCS2 typedef in CVTUTF.H */
+
+/* ================================================================ */
+/*      Each of these routines converts the text between *sourceStart and 
+sourceEnd, putting the result into the buffer between *targetStart and
+targetEnd. Note: the end pointers are *after* the last item: e.g. 
+*(sourceEnd - 1) is the last item.
+
+        The return result indicates whether the conversion was successful,
+and if not, whether the problem was in the source or target buffers.
+
+        After the conversion, *sourceStart and *targetStart are both
+updated to point to the end of last text successfully converted in
+the respective buffers.
+
+		In ConvertUCS2toUTF7, optional indicates whether UTF-7 optional
+characters should be directly encoded, and verbose controls whether the
+shift-out character, "-", is always emitted at the end of a shifted
+sequence.
+*/
+
+typedef enum {
+        ok,                             /* conversion successful */
+        sourceCorrupt,          /* source contains invalid UTF-7 */
+        targetExhausted         /* insufficient room in target for conversion */
+} ConversionResult;	/* NOTE(review): CVTUTF.H declares a different enum under this same name */
+
+extern ConversionResult        ConvertUCS2toUTF7 (	/* UCS-2 -> UTF-7 */
+                UCS2** sourceStart, UCS2* sourceEnd, 
+                char** targetStart, char* targetEnd,
+                int optional, int verbose);	/* optional: encode the optional set directly; verbose: always emit "-" */
+
+extern ConversionResult        ConvertUTF7toUCS2 (	/* UTF-7 -> UCS-2 */
+                char** sourceStart, char* sourceEnd, 
+                UCS2** targetStart, UCS2* targetEnd);
+
+/* ================================================================ */
author	Luke Shumaker <lukeshu@lukeshu.com>	2000-08-21 23:46:10 -0500
committer	Luke Shumaker <lukeshu@lukeshu.com>	2000-08-21 23:46:10 -0500
commit	5a42c4a3c2a175170868b77aff9a92197f4a3bf9 (patch)
tree	4a3e4422220e4fb42a98718156f0019dba0f215f