From 5a4d1539c965585c67503079c6a19d309bfcfeb8 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 6 Nov 2001 16:06:20 -0500 Subject: http://web.archive.org/web/20011106160620/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/ --- CVTUTF.C | 331 --------------------------------------------------------------- 1 file changed, 331 deletions(-) delete mode 100644 CVTUTF.C (limited to 'CVTUTF.C') diff --git a/CVTUTF.C b/CVTUTF.C deleted file mode 100644 index 94898bc..0000000 --- a/CVTUTF.C +++ /dev/null @@ -1,331 +0,0 @@ -/* ================================================================ */ -/* -File: ConvertUTF.C -Author: Mark E. Davis -Copyright (C) 1994 Taligent, Inc. All rights reserved. - -This code is copyrighted. Under the copyright laws, this code may not -be copied, in whole or part, without prior written consent of Taligent. - -Taligent grants the right to use or reprint this code as long as this -ENTIRE copyright notice is reproduced in the code or reproduction. -The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, -EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN -NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, -WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS -INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY -LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN -IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. -BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF -LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE -LIMITATION MAY NOT APPLY TO YOU. - -RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the -government is subject to restrictions as set forth in subparagraph -(c)(l)(ii) of the Rights in Technical Data and Computer Software -clause at DFARS 252.227-7013 and FAR 52.227-19. - -This code may be protected by one or more U.S. and International -Patents. - -TRADEMARKS: Taligent and the Taligent Design Mark are registered -trademarks of Taligent, Inc. -*/ -/* ================================================================ */ - -#include "CVTUTF.H" - -/* ================================================================ */ - -const int halfShift = 10; -const UCS4 halfBase = 0x0010000UL; -const UCS4 halfMask = 0x3FFUL; -const UCS4 kSurrogateHighStart = 0xD800UL; -const UCS4 kSurrogateHighEnd = 0xDBFFUL; -const UCS4 kSurrogateLowStart = 0xDC00UL; -const UCS4 kSurrogateLowEnd = 0xDFFFUL; - -/* ================================================================ */ - -ConversionResult ConvertUCS4toUTF16 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd) { - ConversionResult result = ok; - register UCS4* source = *sourceStart; - register UTF16* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - }; - ch = *source++; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUTF16) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF16toUCS4 ( - UTF16** sourceStart, UTF16* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd) { - ConversionResult result = ok; - register UTF16* source = *sourceStart; - register UCS4* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (target >= targetEnd) { - result = targetExhausted; break; - }; - *target++ = ch; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL}; -char bytesFromUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; - -UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; - -/* ================================================================ */ -/* This code is similar in effect to making successive calls on the -mbtowc and wctomb routines in FSS-UTF. However, it is considerably -different in code: -* it is adapted to be consistent with UTF16, -* the interface converts a whole buffer to avoid function-call overhead -* constants have been gathered. -* loops & conditionals have been removed as much as possible for -efficiency, in favor of drop-through switch statements. -*/ - -/* ================================================================ */ -ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd) -{ - ConversionResult result = ok; - register UTF16* source = *sourceStart; - register UTF8* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - register unsigned short bytesToWrite = 0; - register const UCS4 byteMask = 0xBF; - register const UCS4 byteMark = 0x80; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd - && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (ch < 0x80) { bytesToWrite = 1; - } else if (ch < 0x800) { bytesToWrite = 2; - } else if (ch < 0x10000) { bytesToWrite = 3; - } else if (ch < 0x200000) { bytesToWrite = 4; - } else if (ch < 0x4000000) { bytesToWrite = 5; - } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; - } else { bytesToWrite = 2; - ch = kReplacementCharacter; - }; /* I wish there were a smart way to avoid this conditional */ - - target += bytesToWrite; - if (target > targetEnd) { - target -= bytesToWrite; result = targetExhausted; break; - }; - switch (bytesToWrite) { /* note: code falls through cases! */ - case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - }; - target += bytesToWrite; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd) -{ - ConversionResult result = ok; - register UTF8* source = *sourceStart; - register UTF16* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch = 0; - register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; - if (source + extraBytesToWrite > sourceEnd) { - result = sourceExhausted; break; - }; - switch(extraBytesToWrite) { /* note: code falls through cases! */ - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - }; - ch -= offsetsFromUTF8[extraBytesToWrite]; - - if (target >= targetEnd) { - result = targetExhausted; break; - }; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUTF16) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ -ConversionResult ConvertUCS4toUTF8 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd) -{ - ConversionResult result = ok; - register UCS4* source = *sourceStart; - register UTF8* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - register unsigned short bytesToWrite = 0; - register const UCS4 byteMask = 0xBF; - register const UCS4 byteMark = 0x80; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd - && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (ch < 0x80) { bytesToWrite = 1; - } else if (ch < 0x800) { bytesToWrite = 2; - } else if (ch < 0x10000) { bytesToWrite = 3; - } else if (ch < 0x200000) { bytesToWrite = 4; - } else if (ch < 0x4000000) { bytesToWrite = 5; - } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; - } else { bytesToWrite = 2; - ch = kReplacementCharacter; - }; /* I wish there were a smart way to avoid this conditional */ - - target += bytesToWrite; - if (target > targetEnd) { - target -= bytesToWrite; result = targetExhausted; break; - }; - switch (bytesToWrite) { /* note: code falls through cases! */ - case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - }; - target += bytesToWrite; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF8toUCS4 ( - UTF8** sourceStart, UTF8* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd) -{ - ConversionResult result = ok; - register UTF8* source = *sourceStart; - register UCS4* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch = 0; - register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; - if (source + extraBytesToWrite > sourceEnd) { - result = sourceExhausted; break; - }; - switch(extraBytesToWrite) { /* note: code falls through cases! */ - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - }; - ch -= offsetsFromUTF8[extraBytesToWrite]; - - if (target >= targetEnd) { - result = targetExhausted; break; - }; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUCS4) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; -- cgit v1.2.3