Index: icu46/source/common/ucnv_u32.c |
=================================================================== |
--- icu46/source/common/ucnv_u32.c (revision 0) |
+++ icu46/source/common/ucnv_u32.c (revision 0) |
@@ -0,0 +1,1248 @@ |
+/* |
+********************************************************************** |
+* Copyright (C) 2002-2009, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+********************************************************************** |
+* file name: ucnv_u32.c |
+* encoding: US-ASCII |
+* tab size: 8 (not used) |
+* indentation:4 |
+* |
+* created on: 2002jul01 |
+* created by: Markus W. Scherer |
+* |
+* UTF-32 converter implementation. Used to be in ucnv_utf.c. |
+*/ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_CONVERSION |
+ |
+#include "unicode/ucnv.h" |
+#include "ucnv_bld.h" |
+#include "ucnv_cnv.h" |
+#include "cmemory.h" |
+ |
+#define MAXIMUM_UCS2 0x0000FFFF /* largest value that fits in one 16-bit code unit */ |
+#define MAXIMUM_UTF 0x0010FFFF /* largest code point the converters in this file accept */ |
+#define HALF_SHIFT 10 /* shift applied to the lead-surrogate part when combining a pair */ |
+#define HALF_BASE 0x0010000 |
+#define HALF_MASK 0x3FF |
+#define SURROGATE_HIGH_START 0xD800 |
+#define SURROGATE_LOW_START 0xDC00 |
+ |
+/* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 == 9216; added when combining a surrogate pair */ |
+#define SURROGATE_LOW_BASE 9216 |
+ |
+enum { |
+ UCNV_NEED_TO_WRITE_BOM=1 /* fromUnicodeStatus value meaning "the BOM has not been written yet" */ |
+}; |
+ |
+/* UTF-32BE ----------------------------------------------------------------- */ |
+ |
+static void |
+T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const unsigned char *mySource = (unsigned char *) args->source; |
+ UChar *myTarget = args->target; |
+ const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
+ const UChar *targetLimit = args->targetLimit; |
+ unsigned char *toUBytes = args->converter->toUBytes; |
+ uint32_t ch, i; |
+ /* Decodes big-endian UTF-32 from args->source into UTF-16 at args->target; an incomplete sequence is saved in the converter for the next call. */ |
+ /* Resume a 4-byte sequence that a previous call left incomplete */ |
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) { |
+ i = args->converter->toULength; /* restore # of bytes consumed */ |
+ args->converter->toULength = 0; |
+ |
+ ch = args->converter->toUnicodeStatus - 1;/* partial value saved by a previous call (it was stored with + 1) */ |
+ args->converter->toUnicodeStatus = 0; |
+ goto morebytes; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) { |
+ i = 0; |
+ ch = 0; |
+morebytes: |
+ while (i < sizeof(uint32_t)) { |
+ if (mySource < sourceLimit) { |
+ ch = (ch << 8) | (uint8_t)(*mySource); |
+ toUBytes[i++] = (char) *(mySource++); |
+ } |
+ else { |
+ /* input ended mid-sequence: store the partially accumulated value */ |
+ /* + 1 so that a partial value of 0 still leaves toUnicodeStatus nonzero */ |
+ args->converter->toUnicodeStatus = ch + 1; |
+ args->converter->toULength = (int8_t) i; |
+ goto donefornow; |
+ } |
+ } |
+ |
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
+ /* All four bytes were read and the value is in range and not a surrogate */ |
+ if (ch <= MAXIMUM_UCS2) |
+ { |
+ /* fits in 16 bits */ |
+ *(myTarget++) = (UChar) ch; |
+ } |
+ else { |
+ /* write out the surrogates */ |
+ *(myTarget++) = U16_LEAD(ch); |
+ ch = U16_TRAIL(ch); |
+ if (myTarget < targetLimit) { |
+ *(myTarget++) = (UChar)ch; |
+ } |
+ else { |
+ /* No room for the trail surrogate: park it in UCharErrorBuffer and report overflow */ |
+ args->converter->UCharErrorBuffer[0] = (UChar) ch; |
+ args->converter->UCharErrorBufferLength = 1; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ break; |
+ } |
+ } |
+ } |
+ else { |
+ args->converter->toULength = (int8_t)i; /* toUBytes holds the i bytes of the rejected sequence */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+donefornow: |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
+ /* Target is full while input remains */ |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = myTarget; |
+ args->source = (const char *) mySource; |
+} |
+ |
+static void |
+T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const unsigned char *mySource = (unsigned char *) args->source; |
+ UChar *myTarget = args->target; |
+ int32_t *myOffsets = args->offsets; |
+ const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
+ const UChar *targetLimit = args->targetLimit; |
+ unsigned char *toUBytes = args->converter->toUBytes; |
+ uint32_t ch, i; |
+ int32_t offsetNum = 0; /* value written to the offsets array for each UTF-16 unit produced */ |
+ /* Same decoding as the big-endian function without offsets, but also writes one entry to args->offsets per UTF-16 unit written. */ |
+ /* Resume a 4-byte sequence that a previous call left incomplete */ |
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) { |
+ i = args->converter->toULength; /* restore # of bytes consumed */ |
+ args->converter->toULength = 0; |
+ |
+ ch = args->converter->toUnicodeStatus - 1;/* partial value saved by a previous call (it was stored with + 1) */ |
+ args->converter->toUnicodeStatus = 0; |
+ goto morebytes; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) { |
+ i = 0; |
+ ch = 0; |
+morebytes: |
+ while (i < sizeof(uint32_t)) { |
+ if (mySource < sourceLimit) { |
+ ch = (ch << 8) | (uint8_t)(*mySource); |
+ toUBytes[i++] = (char) *(mySource++); |
+ } |
+ else { |
+ /* input ended mid-sequence: store the partially accumulated value */ |
+ /* + 1 so that a partial value of 0 still leaves toUnicodeStatus nonzero */ |
+ args->converter->toUnicodeStatus = ch + 1; |
+ args->converter->toULength = (int8_t) i; |
+ goto donefornow; |
+ } |
+ } |
+ |
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
+ /* All four bytes were read and the value is in range and not a surrogate */ |
+ if (ch <= MAXIMUM_UCS2) { |
+ /* fits in 16 bits */ |
+ *(myTarget++) = (UChar) ch; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else { |
+ /* write out the surrogates; both halves get the same offset */ |
+ *(myTarget++) = U16_LEAD(ch); |
+ *myOffsets++ = offsetNum; |
+ ch = U16_TRAIL(ch); |
+ if (myTarget < targetLimit) |
+ { |
+ *(myTarget++) = (UChar)ch; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else { |
+ /* No room for the trail surrogate: park it in UCharErrorBuffer and report overflow */ |
+ args->converter->UCharErrorBuffer[0] = (UChar) ch; |
+ args->converter->UCharErrorBufferLength = 1; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ break; |
+ } |
+ } |
+ } |
+ else { |
+ args->converter->toULength = (int8_t)i; /* toUBytes holds the i bytes of the rejected sequence */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ offsetNum += i; /* i is 4 here: advance past the sequence just converted */ |
+ } |
+ |
+donefornow: |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
+ { |
+ /* Target is full while input remains */ |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = myTarget; |
+ args->source = (const char *) mySource; |
+ args->offsets = myOffsets; |
+} |
+ |
+static void |
+T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const UChar *mySource = args->source; |
+ unsigned char *myTarget; |
+ const UChar *sourceLimit = args->sourceLimit; |
+ const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
+ UChar32 ch, ch2; |
+ unsigned int indexToWrite; |
+ unsigned char temp[sizeof(uint32_t)]; |
+ /* Encodes UTF-16 from args->source as big-endian UTF-32 at args->target, writing a BOM first when fromUnicodeStatus requests one. */ |
+ if(mySource >= sourceLimit) { |
+ /* no input, nothing to do */ |
+ return; |
+ } |
+ |
+ /* write the BOM if necessary */ |
+ if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
+ static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; |
+ ucnv_fromUWriteBytes(args->converter, |
+ bom, 4, |
+ &args->target, args->targetLimit, |
+ &args->offsets, -1, |
+ err); |
+ args->converter->fromUnicodeStatus=0; |
+ } |
+ /* read args->target only now: the BOM write above may have advanced it */ |
+ myTarget = (unsigned char *) args->target; |
+ temp[0] = 0; /* most significant output byte; never changed below */ |
+ |
+ if (args->converter->fromUChar32) { |
+ ch = args->converter->fromUChar32; /* code unit saved by a previous call; continue at the pairing step */ |
+ args->converter->fromUChar32 = 0; |
+ goto lowsurogate; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) { |
+ ch = *(mySource++); |
+ |
+ if (UTF_IS_SURROGATE(ch)) { |
+ if (U_IS_LEAD(ch)) { |
+lowsurogate: |
+ if (mySource < sourceLimit) { |
+ ch2 = *mySource; |
+ if (U_IS_TRAIL(ch2)) { |
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
+ mySource++; |
+ } |
+ else { |
+ /* this is an unmatched lead code unit (1st surrogate): the next unit is not a trail */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ else { |
+ /* ran out of source */ |
+ args->converter->fromUChar32 = ch; |
+ if (args->flush) { |
+ /* this is an unmatched lead code unit (1st surrogate) at the end of flushed input */ |
+ /* callback(illegal) */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ } |
+ break; |
+ } |
+ } |
+ else { |
+ /* this is an unmatched trail code unit (2nd surrogate) */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+ /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
+ temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
+ temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
+ temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
+ |
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
+ if (myTarget < targetLimit) { |
+ *(myTarget++) = temp[indexToWrite]; |
+ } |
+ else { /* target is full: spill the remaining bytes into charErrorBuffer */ |
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ } |
+ } |
+ |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = (char *) myTarget; |
+ args->source = mySource; |
+} |
+ |
+static void |
+T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const UChar *mySource = args->source; |
+ unsigned char *myTarget; |
+ int32_t *myOffsets; |
+ const UChar *sourceLimit = args->sourceLimit; |
+ const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
+ UChar32 ch, ch2; |
+ int32_t offsetNum = 0; /* value written to the offsets array for each output byte */ |
+ unsigned int indexToWrite; |
+ unsigned char temp[sizeof(uint32_t)]; |
+ /* Same big-endian encoding as the function without offsets, but also writes one entry to args->offsets per byte stored in the target. */ |
+ if(mySource >= sourceLimit) { |
+ /* no input, nothing to do */ |
+ return; |
+ } |
+ |
+ /* write the BOM if necessary */ |
+ if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
+ static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; |
+ ucnv_fromUWriteBytes(args->converter, |
+ bom, 4, |
+ &args->target, args->targetLimit, |
+ &args->offsets, -1, |
+ err); |
+ args->converter->fromUnicodeStatus=0; |
+ } |
+ /* read target and offsets only now: the BOM write above may have advanced them */ |
+ myTarget = (unsigned char *) args->target; |
+ myOffsets = args->offsets; |
+ temp[0] = 0; /* most significant output byte; never changed below */ |
+ |
+ if (args->converter->fromUChar32) { |
+ ch = args->converter->fromUChar32; /* code unit saved by a previous call; continue at the pairing step */ |
+ args->converter->fromUChar32 = 0; |
+ goto lowsurogate; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) { |
+ ch = *(mySource++); |
+ |
+ if (UTF_IS_SURROGATE(ch)) { |
+ if (U_IS_LEAD(ch)) { |
+lowsurogate: |
+ if (mySource < sourceLimit) { |
+ ch2 = *mySource; |
+ if (U_IS_TRAIL(ch2)) { |
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
+ mySource++; |
+ } |
+ else { |
+ /* this is an unmatched lead code unit (1st surrogate): the next unit is not a trail */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ else { |
+ /* ran out of source */ |
+ args->converter->fromUChar32 = ch; |
+ if (args->flush) { |
+ /* this is an unmatched lead code unit (1st surrogate) at the end of flushed input */ |
+ /* callback(illegal) */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ } |
+ break; |
+ } |
+ } |
+ else { |
+ /* this is an unmatched trail code unit (2nd surrogate) */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+ /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
+ temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
+ temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
+ temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
+ |
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
+ if (myTarget < targetLimit) { |
+ *(myTarget++) = temp[indexToWrite]; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else { /* target is full: spill the remaining bytes into charErrorBuffer (no offsets for those) */ |
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ } |
+ offsetNum = offsetNum + 1 + (temp[1] != 0); /* advance by 2 when the code point is above 0xFFFF (temp[1] != 0), else by 1 */ |
+ } |
+ |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = (char *) myTarget; |
+ args->source = mySource; |
+ args->offsets = myOffsets; |
+} |
+ |
+static UChar32 |
+T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, |
+ UErrorCode* err) |
+{ |
+ const uint8_t *mySource; |
+ UChar32 myUChar; |
+ int32_t length; |
+ /* Reads one big-endian UTF-32 code point from args->source; on any failure sets *err and returns 0xffff. */ |
+ mySource = (const uint8_t *)args->source; |
+ if (mySource >= (const uint8_t *)args->sourceLimit) |
+ { |
+ /* no input */ |
+ *err = U_INDEX_OUTOFBOUNDS_ERROR; |
+ return 0xffff; |
+ } |
+ |
+ length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
+ if (length < 4) |
+ { |
+ /* got a partial character: keep its bytes in toUBytes and consume them */ |
+ uprv_memcpy(args->converter->toUBytes, mySource, length); |
+ args->converter->toULength = (int8_t)length; |
+ args->source = (const char *)(mySource + length); |
+ *err = U_TRUNCATED_CHAR_FOUND; |
+ return 0xffff; |
+ } |
+ |
+ /* Assemble byte by byte (the address may be odd); shift as uint32_t so a first byte >= 0x80 cannot overflow a signed int. */ |
+ myUChar = (UChar32)(((uint32_t)mySource[0] << 24) |
+ | ((uint32_t)mySource[1] << 16) |
+ | ((uint32_t)mySource[2] << 8) |
+ | ((uint32_t)mySource[3])); |
+ |
+ args->source = (const char *)(mySource + 4); |
+ if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
+ return myUChar; |
+ } |
+ /* out of range or a surrogate: report the four offending bytes */ |
+ uprv_memcpy(args->converter->toUBytes, mySource, 4); |
+ args->converter->toULength = 4; |
+ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ return 0xffff; |
+} |
+ |
+static const UConverterImpl _UTF32BEImpl = { |
+ UCNV_UTF32_BigEndian, |
+ /* NOTE(review): the NULL entries are slots this converter leaves empty; slot meanings are declared with UConverterImpl, not in this file */ |
+ NULL, |
+ NULL, |
+ |
+ NULL, |
+ NULL, |
+ NULL, |
+ /* conversion routines: to-Unicode, to-Unicode with offsets, from-Unicode, from-Unicode with offsets, get-next-UChar */ |
+ T_UConverter_toUnicode_UTF32_BE, |
+ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, |
+ T_UConverter_fromUnicode_UTF32_BE, |
+ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
+ T_UConverter_getNextUChar_UTF32_BE, |
+ |
+ NULL, |
+ NULL, |
+ NULL, |
+ NULL, |
+ ucnv_getNonSurrogateUnicodeSet |
+}; |
+ |
+/* CCSID 1232 is the value registered here for the big-endian form of UTF-32 (any version of Unicode) */ |
+static const UConverterStaticData _UTF32BEStaticData = { |
+ sizeof(UConverterStaticData), |
+ "UTF-32BE", |
+ 1232, |
+ UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, /* NOTE(review): 4, 4 presumably min/max bytes per character - confirm against UConverterStaticData */ |
+ { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, /* 00 00 FF FD is U+FFFD in big-endian UTF-32; presumably the substitution sequence and its length - confirm */ |
+ 0, |
+ 0, |
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
+}; |
+ |
+const UConverterSharedData _UTF32BEData = { |
+ sizeof(UConverterSharedData), ~((uint32_t) 0), /* NOTE(review): the all-ones second field looks like a pinned reference count - confirm */ |
+ NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, /* links the static data and the function table for UTF-32BE */ |
+ 0 |
+}; |
+ |
+/* UTF-32LE ---------------------------------------------------------- */ |
+ |
+static void |
+T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const unsigned char *mySource = (unsigned char *) args->source; |
+ UChar *myTarget = args->target; |
+ const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
+ const UChar *targetLimit = args->targetLimit; |
+ unsigned char *toUBytes = args->converter->toUBytes; |
+ uint32_t ch, i; |
+ /* Decodes little-endian UTF-32 from args->source into UTF-16 at args->target; an incomplete sequence is saved in the converter for the next call. */ |
+ /* Resume a 4-byte sequence that a previous call left incomplete */ |
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) |
+ { |
+ i = args->converter->toULength; /* restore # of bytes consumed */ |
+ args->converter->toULength = 0; |
+ |
+ /* partial value saved by a previous call (it was stored with + 1) */ |
+ ch = args->converter->toUnicodeStatus - 1; |
+ args->converter->toUnicodeStatus = 0; |
+ goto morebytes; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) |
+ { |
+ i = 0; |
+ ch = 0; |
+morebytes: |
+ while (i < sizeof(uint32_t)) |
+ { |
+ if (mySource < sourceLimit) |
+ { |
+ ch |= ((uint32_t)(*mySource)) << (i * 8); /* widen to uint32_t first: shifting a promoted int by 24 overflows for bytes >= 0x80 */ |
+ toUBytes[i++] = (char) *(mySource++); |
+ } |
+ else |
+ { |
+ /* input ended mid-sequence: store the partially accumulated value */ |
+ /* + 1 so that a partial value of 0 still leaves toUnicodeStatus nonzero */ |
+ args->converter->toUnicodeStatus = ch + 1; |
+ args->converter->toULength = (int8_t) i; |
+ goto donefornow; |
+ } |
+ } |
+ |
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
+ /* All four bytes were read and the value is in range and not a surrogate */ |
+ if (ch <= MAXIMUM_UCS2) { |
+ /* fits in 16 bits */ |
+ *(myTarget++) = (UChar) ch; |
+ } |
+ else { |
+ /* write out the surrogates */ |
+ *(myTarget++) = U16_LEAD(ch); |
+ ch = U16_TRAIL(ch); |
+ if (myTarget < targetLimit) { |
+ *(myTarget++) = (UChar)ch; |
+ } |
+ else { |
+ /* No room for the trail surrogate: park it in UCharErrorBuffer and report overflow */ |
+ args->converter->UCharErrorBuffer[0] = (UChar) ch; |
+ args->converter->UCharErrorBufferLength = 1; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ break; |
+ } |
+ } |
+ } |
+ else { |
+ args->converter->toULength = (int8_t)i; /* toUBytes holds the i bytes of the rejected sequence */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+donefornow: |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
+ { |
+ /* Target is full while input remains */ |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = myTarget; |
+ args->source = (const char *) mySource; |
+} |
+ |
+static void |
+T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const unsigned char *mySource = (unsigned char *) args->source; |
+ UChar *myTarget = args->target; |
+ int32_t *myOffsets = args->offsets; |
+ const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
+ const UChar *targetLimit = args->targetLimit; |
+ unsigned char *toUBytes = args->converter->toUBytes; |
+ uint32_t ch, i; |
+ int32_t offsetNum = 0; /* value written to the offsets array for each UTF-16 unit produced */ |
+ /* Same decoding as the little-endian function without offsets, but also writes one entry to args->offsets per UTF-16 unit written. */ |
+ /* Resume a 4-byte sequence that a previous call left incomplete */ |
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) |
+ { |
+ i = args->converter->toULength; /* restore # of bytes consumed */ |
+ args->converter->toULength = 0; |
+ |
+ /* partial value saved by a previous call (it was stored with + 1) */ |
+ ch = args->converter->toUnicodeStatus - 1; |
+ args->converter->toUnicodeStatus = 0; |
+ goto morebytes; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) |
+ { |
+ i = 0; |
+ ch = 0; |
+morebytes: |
+ while (i < sizeof(uint32_t)) |
+ { |
+ if (mySource < sourceLimit) |
+ { |
+ ch |= ((uint32_t)(*mySource)) << (i * 8); /* widen to uint32_t first: shifting a promoted int by 24 overflows for bytes >= 0x80 */ |
+ toUBytes[i++] = (char) *(mySource++); |
+ } |
+ else |
+ { |
+ /* input ended mid-sequence: store the partially accumulated value */ |
+ /* + 1 so that a partial value of 0 still leaves toUnicodeStatus nonzero */ |
+ args->converter->toUnicodeStatus = ch + 1; |
+ args->converter->toULength = (int8_t) i; |
+ goto donefornow; |
+ } |
+ } |
+ |
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) |
+ { |
+ /* All four bytes were read and the value is in range and not a surrogate */ |
+ if (ch <= MAXIMUM_UCS2) |
+ { |
+ /* fits in 16 bits */ |
+ *(myTarget++) = (UChar) ch; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else { |
+ /* write out the surrogates; both halves get the same offset */ |
+ *(myTarget++) = U16_LEAD(ch); |
+ *(myOffsets++) = offsetNum; |
+ ch = U16_TRAIL(ch); |
+ if (myTarget < targetLimit) |
+ { |
+ *(myTarget++) = (UChar)ch; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else |
+ { |
+ /* No room for the trail surrogate: park it in UCharErrorBuffer and report overflow */ |
+ args->converter->UCharErrorBuffer[0] = (UChar) ch; |
+ args->converter->UCharErrorBufferLength = 1; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ break; |
+ } |
+ } |
+ } |
+ else |
+ { |
+ args->converter->toULength = (int8_t)i; /* toUBytes holds the i bytes of the rejected sequence */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ offsetNum += i; /* i is 4 here: advance past the sequence just converted */ |
+ } |
+ |
+donefornow: |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
+ { |
+ /* Target is full while input remains */ |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = myTarget; |
+ args->source = (const char *) mySource; |
+ args->offsets = myOffsets; |
+} |
+ |
+static void |
+T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const UChar *mySource = args->source; |
+ unsigned char *myTarget; |
+ const UChar *sourceLimit = args->sourceLimit; |
+ const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
+ UChar32 ch, ch2; |
+ unsigned int indexToWrite; |
+ unsigned char temp[sizeof(uint32_t)]; |
+ /* Encodes UTF-16 from args->source as little-endian UTF-32 at args->target, writing a BOM first when fromUnicodeStatus requests one. */ |
+ if(mySource >= sourceLimit) { |
+ /* no input, nothing to do */ |
+ return; |
+ } |
+ |
+ /* write the BOM if necessary */ |
+ if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
+ static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; |
+ ucnv_fromUWriteBytes(args->converter, |
+ bom, 4, |
+ &args->target, args->targetLimit, |
+ &args->offsets, -1, |
+ err); |
+ args->converter->fromUnicodeStatus=0; |
+ } |
+ /* read args->target only now: the BOM write above may have advanced it */ |
+ myTarget = (unsigned char *) args->target; |
+ temp[3] = 0; /* most significant output byte (written last); never changed below */ |
+ |
+ if (args->converter->fromUChar32) |
+ { |
+ ch = args->converter->fromUChar32; /* code unit saved by a previous call; continue at the pairing step */ |
+ args->converter->fromUChar32 = 0; |
+ goto lowsurogate; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) |
+ { |
+ ch = *(mySource++); |
+ |
+ if (UTF_IS_SURROGATE(ch)) { |
+ if (U_IS_LEAD(ch)) |
+ { |
+lowsurogate: |
+ if (mySource < sourceLimit) |
+ { |
+ ch2 = *mySource; |
+ if (U_IS_TRAIL(ch2)) { |
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
+ mySource++; |
+ } |
+ else { |
+ /* this is an unmatched lead code unit (1st surrogate): the next unit is not a trail */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ else { |
+ /* ran out of source */ |
+ args->converter->fromUChar32 = ch; |
+ if (args->flush) { |
+ /* this is an unmatched lead code unit (1st surrogate) at the end of flushed input */ |
+ /* callback(illegal) */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ } |
+ break; |
+ } |
+ } |
+ else { |
+ /* this is an unmatched trail code unit (2nd surrogate) */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+ /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
+ temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
+ temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
+ temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
+ |
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
+ { |
+ if (myTarget < targetLimit) |
+ { |
+ *(myTarget++) = temp[indexToWrite]; |
+ } |
+ else /* target is full: spill the remaining bytes into charErrorBuffer */ |
+ { |
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ } |
+ } |
+ |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
+ { |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = (char *) myTarget; |
+ args->source = mySource; |
+} |
+ |
+static void |
+T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
+ UErrorCode * err) |
+{ |
+ const UChar *mySource = args->source; |
+ unsigned char *myTarget; |
+ int32_t *myOffsets; |
+ const UChar *sourceLimit = args->sourceLimit; |
+ const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
+ UChar32 ch, ch2; |
+ unsigned int indexToWrite; |
+ unsigned char temp[sizeof(uint32_t)]; |
+ int32_t offsetNum = 0; /* value written to the offsets array for each output byte */ |
+ /* Same little-endian encoding as the function without offsets, but also writes one entry to args->offsets per byte stored in the target. */ |
+ if(mySource >= sourceLimit) { |
+ /* no input, nothing to do */ |
+ return; |
+ } |
+ |
+ /* write the BOM if necessary */ |
+ if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
+ static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; |
+ ucnv_fromUWriteBytes(args->converter, |
+ bom, 4, |
+ &args->target, args->targetLimit, |
+ &args->offsets, -1, |
+ err); |
+ args->converter->fromUnicodeStatus=0; |
+ } |
+ /* read target and offsets only now: the BOM write above may have advanced them */ |
+ myTarget = (unsigned char *) args->target; |
+ myOffsets = args->offsets; |
+ temp[3] = 0; /* most significant output byte (written last); never changed below */ |
+ |
+ if (args->converter->fromUChar32) |
+ { |
+ ch = args->converter->fromUChar32; /* code unit saved by a previous call; continue at the pairing step */ |
+ args->converter->fromUChar32 = 0; |
+ goto lowsurogate; |
+ } |
+ |
+ while (mySource < sourceLimit && myTarget < targetLimit) |
+ { |
+ ch = *(mySource++); |
+ |
+ if (UTF_IS_SURROGATE(ch)) { |
+ if (U_IS_LEAD(ch)) |
+ { |
+lowsurogate: |
+ if (mySource < sourceLimit) |
+ { |
+ ch2 = *mySource; |
+ if (U_IS_TRAIL(ch2)) |
+ { |
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
+ mySource++; |
+ } |
+ else { |
+ /* this is an unmatched lead code unit (1st surrogate): the next unit is not a trail */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ else { |
+ /* ran out of source */ |
+ args->converter->fromUChar32 = ch; |
+ if (args->flush) { |
+ /* this is an unmatched lead code unit (1st surrogate) at the end of flushed input */ |
+ /* callback(illegal) */ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ } |
+ break; |
+ } |
+ } |
+ else { |
+ /* this is an unmatched trail code unit (2nd surrogate) */ |
+ /* callback(illegal) */ |
+ args->converter->fromUChar32 = ch; |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ break; |
+ } |
+ } |
+ |
+ /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
+ temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
+ temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
+ temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
+ |
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
+ { |
+ if (myTarget < targetLimit) |
+ { |
+ *(myTarget++) = temp[indexToWrite]; |
+ *(myOffsets++) = offsetNum; |
+ } |
+ else /* target is full: spill the remaining bytes into charErrorBuffer (no offsets for those) */ |
+ { |
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ } |
+ offsetNum = offsetNum + 1 + (temp[2] != 0); /* advance by 2 when the code point is above 0xFFFF (temp[2] != 0), else by 1 */ |
+ } |
+ |
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
+ { |
+ *err = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ /* publish the advanced positions back to the caller */ |
+ args->target = (char *) myTarget; |
+ args->source = mySource; |
+ args->offsets = myOffsets; |
+} |
+ |
+static UChar32 |
+T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, |
+ UErrorCode* err) |
+{ |
+ const uint8_t *mySource; |
+ UChar32 myUChar; |
+ int32_t length; |
+ /* Reads one little-endian UTF-32 code point from args->source; on any failure sets *err and returns 0xffff. */ |
+ mySource = (const uint8_t *)args->source; |
+ if (mySource >= (const uint8_t *)args->sourceLimit) |
+ { |
+ /* no input */ |
+ *err = U_INDEX_OUTOFBOUNDS_ERROR; |
+ return 0xffff; |
+ } |
+ |
+ length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
+ if (length < 4) |
+ { |
+ /* got a partial character: keep its bytes in toUBytes and consume them */ |
+ uprv_memcpy(args->converter->toUBytes, mySource, length); |
+ args->converter->toULength = (int8_t)length; |
+ args->source = (const char *)(mySource + length); |
+ *err = U_TRUNCATED_CHAR_FOUND; |
+ return 0xffff; |
+ } |
+ |
+ /* Assemble byte by byte (the address may be odd); shift as uint32_t so a last byte >= 0x80 cannot overflow a signed int. */ |
+ myUChar = (UChar32)(((uint32_t)mySource[3] << 24) |
+ | ((uint32_t)mySource[2] << 16) |
+ | ((uint32_t)mySource[1] << 8) |
+ | ((uint32_t)mySource[0])); |
+ |
+ args->source = (const char *)(mySource + 4); |
+ if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
+ return myUChar; |
+ } |
+ /* out of range or a surrogate: report the four offending bytes */ |
+ uprv_memcpy(args->converter->toUBytes, mySource, 4); |
+ args->converter->toULength = 4; |
+ |
+ *err = U_ILLEGAL_CHAR_FOUND; |
+ return 0xffff; |
+} |
+ |
+static const UConverterImpl _UTF32LEImpl = { |
+ UCNV_UTF32_LittleEndian, |
+ /* NOTE(review): the NULL entries are slots this converter leaves empty; slot meanings are declared with UConverterImpl, not in this file */ |
+ NULL, |
+ NULL, |
+ |
+ NULL, |
+ NULL, |
+ NULL, |
+ /* conversion routines: to-Unicode, to-Unicode with offsets, from-Unicode, from-Unicode with offsets, get-next-UChar */ |
+ T_UConverter_toUnicode_UTF32_LE, |
+ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, |
+ T_UConverter_fromUnicode_UTF32_LE, |
+ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
+ T_UConverter_getNextUChar_UTF32_LE, |
+ |
+ NULL, |
+ NULL, |
+ NULL, |
+ NULL, |
+ ucnv_getNonSurrogateUnicodeSet |
+}; |
+ |
+/* CCSID 1234 is the value registered here for the little-endian form of UTF-32 (any version of Unicode) */ |
+static const UConverterStaticData _UTF32LEStaticData = { |
+ sizeof(UConverterStaticData), |
+ "UTF-32LE", |
+ 1234, |
+ UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, /* NOTE(review): 4, 4 presumably min/max bytes per character - confirm against UConverterStaticData */ |
+ { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, /* FD FF 00 00 is U+FFFD in little-endian UTF-32; presumably the substitution sequence and its length - confirm */ |
+ 0, |
+ 0, |
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
+}; |
+ |
+ |
+const UConverterSharedData _UTF32LEData = { |
+ sizeof(UConverterSharedData), ~((uint32_t) 0), /* NOTE(review): the all-ones second field looks like a pinned reference count - confirm */ |
+ NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, /* links the static data and the function table for UTF-32LE */ |
+ 0 |
+}; |
+ |
+/* UTF-32 (Detect BOM) ------------------------------------------------------ */ |
+ |
+/* |
+ * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE |
+ * accordingly. |
+ * |
+ * State values: |
+ * 0 initial state |
+ * 1 saw 00 |
+ * 2 saw 00 00 |
+ * 3 saw 00 00 FE |
+ * 4 - |
+ * 5 saw FF |
+ * 6 saw FF FE |
+ * 7 saw FF FE 00 |
+ * 8 UTF-32BE mode |
+ * 9 UTF-32LE mode |
+ * |
+ * During detection: state&3==number of matching bytes so far. |
+ * |
+ * On output, emit U+FEFF as the first code point. |
+ */ |
+ |
+static void |
+_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { |
+ if(UCNV_RESET_TO_UNICODE!=choice) { |
+ /* the fromUnicode direction is being reset: the next output must begin with a BOM again */ |
+ cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
+ } |
+ if(UCNV_RESET_TO_UNICODE>=choice) { |
+ /* the toUnicode direction is being reset: put the BOM detector back into state 0 */ |
+ cnv->mode=0; |
+ } |
+} |
+ |
+static void |
+_UTF32Open(UConverter *cnv, |
+ UConverterLoadArgs *pArgs, /* not used by this function */ |
+ UErrorCode *pErrorCode) { /* not used by this function; it is never set here */ |
+ _UTF32Reset(cnv, UCNV_RESET_BOTH); /* opening only needs the same initial state that a full reset produces */ |
+} |
+ |
+static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 }; /* bytes 0-3: BE BOM, bytes 4-7: LE BOM; indexed by the detection state */ |
+ |
+static void |
+_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
+ UErrorCode *pErrorCode) { |
+ UConverter *cnv=pArgs->converter; |
+ const char *source=pArgs->source; |
+ const char *sourceLimit=pArgs->sourceLimit; |
+ int32_t *offsets=pArgs->offsets; /* start of this call's offsets, kept for the BOM adjustment at the end */ |
+ /* Runs the BOM-detection state machine kept in cnv->mode, then hands the input to the UTF-32BE or UTF-32LE converter. */ |
+ int32_t state, offsetDelta; |
+ char b; |
+ |
+ state=cnv->mode; |
+ |
+ /* |
+ * If we detect a BOM in this buffer, then we must add the BOM size to the |
+ * offsets because the actual converter function will not see and count the BOM. |
+ * offsetDelta will have the number of the BOM bytes that are in the current buffer. |
+ */ |
+ offsetDelta=0; |
+ |
+ while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { |
+ switch(state) { |
+ case 0: |
+ b=*source; |
+ if(b==0) { |
+ state=1; /* could be 00 00 FE FF */ |
+ } else if(b==(char)0xff) { |
+ state=5; /* could be FF FE 00 00 */ |
+ } else { |
+ state=8; /* default to UTF-32BE */ |
+ continue; |
+ } |
+ ++source; |
+ break; |
+ case 1: |
+ case 2: |
+ case 3: |
+ case 5: |
+ case 6: |
+ case 7: |
+ if(*source==utf32BOM[state]) { |
+ ++state; |
+ ++source; |
+ if(state==4) { |
+ state=8; /* detect UTF-32BE */ |
+ offsetDelta=(int32_t)(source-pArgs->source); |
+ } else if(state==8) { |
+ state=9; /* detect UTF-32LE */ |
+ offsetDelta=(int32_t)(source-pArgs->source); |
+ } |
+ } else { |
+ /* switch to UTF-32BE and pass the previous bytes */ |
+ int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ |
+ |
+ /* reset the source */ |
+ source=pArgs->source; |
+ |
+ if(count==(state&3)) { |
+ /* simple: all in the same buffer, just reset source */ |
+ } else { |
+ UBool oldFlush=pArgs->flush; |
+ |
+ /* some of the bytes are from a previous buffer, replay those first */ |
+ pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ |
+ pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ |
+ pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ |
+ |
+ /* no offsets: bytes from previous buffer, and not enough for output */ |
+ T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
+ |
+ /* restore real pointers; pArgs->source will be set in case 8/9 */ |
+ pArgs->sourceLimit=sourceLimit; |
+ pArgs->flush=oldFlush; |
+ } |
+ state=8; |
+ continue; |
+ } |
+ break; |
+ case 8: |
+ /* call UTF-32BE */ |
+ pArgs->source=source; |
+ if(offsets==NULL) { |
+ T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
+ } else { |
+ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); |
+ } |
+ source=pArgs->source; |
+ break; |
+ case 9: |
+ /* call UTF-32LE */ |
+ pArgs->source=source; |
+ if(offsets==NULL) { |
+ T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); |
+ } else { |
+ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); |
+ } |
+ source=pArgs->source; |
+ break; |
+ default: |
+ break; /* does not occur */ |
+ } |
+ } |
+ |
+ /* add BOM size to offsets - see comment at the offsetDelta initialization */ |
+ if(offsets!=NULL && offsetDelta!=0) { |
+ int32_t *offsetsLimit=pArgs->offsets; /* one past the last offset written during this call */ |
+ while(offsets<offsetsLimit) { |
+ *offsets++ += offsetDelta; |
+ } |
+ } |
+ |
+ pArgs->source=source; |
+ |
+ if(source==sourceLimit && pArgs->flush) { |
+ /* handle truncated input */ |
+ switch(state) { |
+ case 0: |
+ break; /* no input at all, nothing to do */ |
+ case 8: |
+ T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
+ break; |
+ case 9: |
+ T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); |
+ break; |
+ default: |
+ /* handle 0<state<8: call UTF-32BE with too-short input */ |
+ pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ |
+ pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ |
+ |
+ /* no offsets: not enough for output */ |
+ T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); |
+ pArgs->source=source; |
+ pArgs->sourceLimit=sourceLimit; |
+ state=8; |
+ break; |
+ } |
+ } |
+ /* remember the detection state for the next call */ |
+ cnv->mode=state; |
+} |
+ |
+static UChar32 |
+_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, |
+ UErrorCode *pErrorCode) { |
+ /* mode 8 / 9 mean the byte order is already known: delegate to that converter */ |
+ if(pArgs->converter->mode==8) { |
+ return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); |
+ } |
+ if(pArgs->converter->mode==9) { |
+ return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); |
+ } |
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U; /* any other mode: detection is not finished */ |
+} |
+ |
+static const UConverterImpl _UTF32Impl = { |
+ UCNV_UTF32, |
+ /* NOTE(review): the NULL entries are slots this converter leaves empty; slot meanings are declared with UConverterImpl, not in this file */ |
+ NULL, |
+ NULL, |
+ /* open and reset hooks, with an empty slot between them */ |
+ _UTF32Open, |
+ NULL, |
+ _UTF32Reset, |
+ /* the same BOM-detecting function fills both to-Unicode slots; it checks pArgs->offsets itself */ |
+ _UTF32ToUnicodeWithOffsets, |
+ _UTF32ToUnicodeWithOffsets, |
+#if U_IS_BIG_ENDIAN |
+ T_UConverter_fromUnicode_UTF32_BE, |
+ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
+#else |
+ T_UConverter_fromUnicode_UTF32_LE, |
+ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
+#endif |
+ _UTF32GetNextUChar, |
+ /* output direction above follows the platform byte order chosen by U_IS_BIG_ENDIAN */ |
+ NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
+ NULL, |
+ NULL, |
+ NULL, |
+ ucnv_getNonSurrogateUnicodeSet |
+}; |
+ |
+/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */ |
+static const UConverterStaticData _UTF32StaticData = { |
+ sizeof(UConverterStaticData), |
+ "UTF-32", |
+ 1236, |
+ UCNV_IBM, UCNV_UTF32, 4, 4, /* NOTE(review): 4, 4 presumably min/max bytes per character - confirm against UConverterStaticData */ |
+#if U_IS_BIG_ENDIAN |
+ { 0, 0, 0xff, 0xfd }, 4, /* U+FFFD in big-endian byte order */ |
+#else |
+ { 0xfd, 0xff, 0, 0 }, 4, /* U+FFFD in little-endian byte order */ |
+#endif |
+ FALSE, FALSE, |
+ 0, |
+ 0, |
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
+}; |
+ |
+const UConverterSharedData _UTF32Data = { |
+ sizeof(UConverterSharedData), ~((uint32_t) 0), /* NOTE(review): the all-ones second field looks like a pinned reference count - confirm */ |
+ NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, /* links the static data and the function table for BOM-detecting UTF-32 */ |
+ 0 |
+}; |
+ |
+#endif |
Property changes on: icu46/source/common/ucnv_u32.c |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |