Index: icu46/source/common/ustrtrns.c |
=================================================================== |
--- icu46/source/common/ustrtrns.c (revision 0) |
+++ icu46/source/common/ustrtrns.c (revision 0) |
@@ -0,0 +1,1640 @@ |
+/* |
+****************************************************************************** |
+* |
+* Copyright (C) 2001-2010, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+* |
+****************************************************************************** |
+* |
+* File ustrtrns.c |
+* |
+* Modification History: |
+* |
+* Date Name Description |
+* 9/10/2001 Ram Creation. |
+****************************************************************************** |
+*/ |
+ |
+/******************************************************************************* |
+ * |
+ * u_strTo* and u_strFrom* APIs |
+ * WCS functions moved to ustr_wcs.c for better modularization |
+ * |
+ ******************************************************************************* |
+ */ |
+ |
+ |
+#include "unicode/putil.h" |
+#include "unicode/ustring.h" |
+#include "cstring.h" |
+#include "cmemory.h" |
+#include "ustr_imp.h" |
+ |
+U_CAPI UChar* U_EXPORT2 |
+u_strFromUTF32WithSub(UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar32 *src, |
+ int32_t srcLength, |
+ UChar32 subchar, int32_t *pNumSubstitutions, |
+ UErrorCode *pErrorCode) { |
+ const UChar32 *srcLimit; |
+ UChar32 ch; |
+ UChar *destLimit; |
+ UChar *pDest; |
+ int32_t reqLength; |
+ int32_t numSubstitutions; |
+ |
+ /* args check */ |
+ if(U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
+ subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(pNumSubstitutions != NULL) { |
+ *pNumSubstitutions = 0; |
+ } |
+ |
+ pDest = dest; |
+ destLimit = dest + destCapacity; |
+ reqLength = 0; |
+ numSubstitutions = 0; |
+ |
+ if(srcLength < 0) { |
+ /* simple loop for conversion of a NUL-terminated BMP string */ |
+ while((ch=*src) != 0 && |
+ ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { |
+ ++src; |
+ if(pDest < destLimit) { |
+ *pDest++ = (UChar)ch; |
+ } else { |
+ ++reqLength; |
+ } |
+ } |
+ srcLimit = src; |
+ if(ch != 0) { |
+ /* "complicated" case, find the end of the remaining string */ |
+ while(*++srcLimit != 0) {} |
+ } |
+ } else { |
+ srcLimit = src + srcLength; |
+ } |
+ |
+ /* convert with length */ |
+ while(src < srcLimit) { |
+ ch = *src++; |
+ do { |
+ /* usually "loops" once; twice only for writing subchar */ |
+ if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { |
+ if(pDest < destLimit) { |
+ *pDest++ = (UChar)ch; |
+ } else { |
+ ++reqLength; |
+ } |
+ break; |
+ } else if(0x10000 <= ch && ch <= 0x10ffff) { |
+ if((pDest + 2) <= destLimit) { |
+ *pDest++ = U16_LEAD(ch); |
+ *pDest++ = U16_TRAIL(ch); |
+ } else { |
+ reqLength += 2; |
+ } |
+ break; |
+ } else if((ch = subchar) < 0) { |
+ /* surrogate code point, or not a Unicode code point at all */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else { |
+ ++numSubstitutions; |
+ } |
+ } while(TRUE); |
+ } |
+ |
+ reqLength += (int32_t)(pDest - dest); |
+ if(pDestLength) { |
+ *pDestLength = reqLength; |
+ } |
+ if(pNumSubstitutions != NULL) { |
+ *pNumSubstitutions = numSubstitutions; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
+ |
+ return dest; |
+} |
+ |
+U_CAPI UChar* U_EXPORT2 |
+u_strFromUTF32(UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar32 *src, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode) { |
+ return u_strFromUTF32WithSub( |
+ dest, destCapacity, pDestLength, |
+ src, srcLength, |
+ U_SENTINEL, NULL, |
+ pErrorCode); |
+} |
+ |
+U_CAPI UChar32* U_EXPORT2 |
+u_strToUTF32WithSub(UChar32 *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar *src, |
+ int32_t srcLength, |
+ UChar32 subchar, int32_t *pNumSubstitutions, |
+ UErrorCode *pErrorCode) { |
+ const UChar *srcLimit; |
+ UChar32 ch; |
+ UChar ch2; |
+ UChar32 *destLimit; |
+ UChar32 *pDest; |
+ int32_t reqLength; |
+ int32_t numSubstitutions; |
+ |
+ /* args check */ |
+ if(U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
+ subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(pNumSubstitutions != NULL) { |
+ *pNumSubstitutions = 0; |
+ } |
+ |
+ pDest = dest; |
+ destLimit = dest + destCapacity; |
+ reqLength = 0; |
+ numSubstitutions = 0; |
+ |
+ if(srcLength < 0) { |
+ /* simple loop for conversion of a NUL-terminated BMP string */ |
+ while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { |
+ ++src; |
+ if(pDest < destLimit) { |
+ *pDest++ = ch; |
+ } else { |
+ ++reqLength; |
+ } |
+ } |
+ srcLimit = src; |
+ if(ch != 0) { |
+ /* "complicated" case, find the end of the remaining string */ |
+ while(*++srcLimit != 0) {} |
+ } |
+ } else { |
+ srcLimit = src + srcLength; |
+ } |
+ |
+ /* convert with length */ |
+ while(src < srcLimit) { |
+ ch = *src++; |
+ if(!U16_IS_SURROGATE(ch)) { |
+ /* write or count ch below */ |
+ } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { |
+ ++src; |
+ ch = U16_GET_SUPPLEMENTARY(ch, ch2); |
+ } else if((ch = subchar) < 0) { |
+ /* unpaired surrogate */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else { |
+ ++numSubstitutions; |
+ } |
+ if(pDest < destLimit) { |
+ *pDest++ = ch; |
+ } else { |
+ ++reqLength; |
+ } |
+ } |
+ |
+ reqLength += (int32_t)(pDest - dest); |
+ if(pDestLength) { |
+ *pDestLength = reqLength; |
+ } |
+ if(pNumSubstitutions != NULL) { |
+ *pNumSubstitutions = numSubstitutions; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); |
+ |
+ return dest; |
+} |
+ |
+U_CAPI UChar32* U_EXPORT2 |
+u_strToUTF32(UChar32 *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar *src, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode) { |
+ return u_strToUTF32WithSub( |
+ dest, destCapacity, pDestLength, |
+ src, srcLength, |
+ U_SENTINEL, NULL, |
+ pErrorCode); |
+} |
+ |
+/* for utf8_nextCharSafeBodyTerminated() */ |
+static const UChar32 |
+utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; |
+ |
+/* |
+ * Version of utf8_nextCharSafeBody() with the following differences: |
+ * - checks for NUL termination instead of length |
+ * - works with pointers instead of indexes |
+ * - always strict (strict==-1) |
+ * |
+ * *ps points to after the lead byte and will be moved to after the last trail byte. |
+ * c is the lead byte. |
+ * @return the code point, or U_SENTINEL |
+ */ |
+static UChar32 |
+utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { |
+ const uint8_t *s=*ps; |
+ uint8_t trail, illegal=0; |
+ uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); |
+ UTF8_MASK_LEAD_BYTE((c), count); |
+ /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ |
+ switch(count) { |
+ /* each branch falls through to the next one */ |
+ case 5: |
+ case 4: |
+ /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
+ illegal=1; |
+ break; |
+ case 3: |
+ trail=(uint8_t)(*s++ - 0x80); |
+ c=(c<<6)|trail; |
+ if(trail>0x3f || c>=0x110) { |
+ /* not a trail byte, or code point>0x10ffff (outside Unicode) */ |
+ illegal=1; |
+ break; |
+ } |
+ case 2: |
+ trail=(uint8_t)(*s++ - 0x80); |
+ if(trail>0x3f) { |
+ /* not a trail byte */ |
+ illegal=1; |
+ break; |
+ } |
+ c=(c<<6)|trail; |
+ case 1: |
+ trail=(uint8_t)(*s++ - 0x80); |
+ if(trail>0x3f) { |
+ /* not a trail byte */ |
+ illegal=1; |
+ } |
+ c=(c<<6)|trail; |
+ break; |
+ case 0: |
+ return U_SENTINEL; |
+ /* no default branch to optimize switch() - all values are covered */ |
+ } |
+ |
+ /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
+ /* illegal is also set if count>=4 */ |
+ if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { |
+ /* error handling */ |
+ /* don't go beyond this sequence */ |
+ s=*ps; |
+ while(count>0 && UTF8_IS_TRAIL(*s)) { |
+ ++s; |
+ --count; |
+ } |
+ c=U_SENTINEL; |
+ } |
+ *ps=s; |
+ return c; |
+} |
+ |
+/* |
+ * Version of utf8_nextCharSafeBody() with the following differences: |
+ * - works with pointers instead of indexes |
+ * - always strict (strict==-1) |
+ * |
+ * *ps points to after the lead byte and will be moved to after the last trail byte. |
+ * c is the lead byte. |
+ * @return the code point, or U_SENTINEL |
+ */ |
+static UChar32 |
+utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { |
+ const uint8_t *s=*ps; |
+ uint8_t trail, illegal=0; |
+ uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); |
+ if((limit-s)>=count) { |
+ UTF8_MASK_LEAD_BYTE((c), count); |
+ /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ |
+ switch(count) { |
+ /* each branch falls through to the next one */ |
+ case 5: |
+ case 4: |
+ /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
+ illegal=1; |
+ break; |
+ case 3: |
+ trail=*s++; |
+ c=(c<<6)|(trail&0x3f); |
+ if(c<0x110) { |
+ illegal|=(trail&0xc0)^0x80; |
+ } else { |
+ /* code point>0x10ffff, outside Unicode */ |
+ illegal=1; |
+ break; |
+ } |
+ case 2: |
+ trail=*s++; |
+ c=(c<<6)|(trail&0x3f); |
+ illegal|=(trail&0xc0)^0x80; |
+ case 1: |
+ trail=*s++; |
+ c=(c<<6)|(trail&0x3f); |
+ illegal|=(trail&0xc0)^0x80; |
+ break; |
+ case 0: |
+ return U_SENTINEL; |
+ /* no default branch to optimize switch() - all values are covered */ |
+ } |
+ } else { |
+ illegal=1; /* too few bytes left */ |
+ } |
+ |
+ /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
+ /* illegal is also set if count>=4 */ |
+ if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { |
+ /* error handling */ |
+ /* don't go beyond this sequence */ |
+ s=*ps; |
+ while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) { |
+ ++s; |
+ --count; |
+ } |
+ c=U_SENTINEL; |
+ } |
+ *ps=s; |
+ return c; |
+} |
+ |
+U_CAPI UChar* U_EXPORT2 |
+u_strFromUTF8WithSub(UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const char* src, |
+ int32_t srcLength, |
+ UChar32 subchar, int32_t *pNumSubstitutions, |
+ UErrorCode *pErrorCode){ |
+ UChar *pDest = dest; |
+ UChar *pDestLimit = dest+destCapacity; |
+ UChar32 ch; |
+ int32_t reqLength = 0; |
+ const uint8_t* pSrc = (const uint8_t*) src; |
+ uint8_t t1, t2; /* trail bytes */ |
+ int32_t numSubstitutions; |
+ |
+ /* args check */ |
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
+ subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=0; |
+ } |
+ numSubstitutions=0; |
+ |
+ /* |
+ * Inline processing of UTF-8 byte sequences: |
+ * |
+ * Byte sequences for the most common characters are handled inline in |
+ * the conversion loops. In order to reduce the path lengths for those |
+ * characters, the tests are arranged in a kind of binary search. |
+ * ASCII (<=0x7f) is checked first, followed by the dividing point |
+ * between 2- and 3-byte sequences (0xe0). |
+ * The 3-byte branch is tested first to speed up CJK text. |
+ * The compiler should combine the subtractions for the two tests for 0xe0. |
+ * Each branch then tests for the other end of its range. |
+ */ |
+ |
+ if(srcLength < 0){ |
+ /* |
+ * Transform a NUL-terminated string. |
+ * The code explicitly checks for NULs only in the lead byte position. |
+ * A NUL byte in the trail byte position fails the trail byte range check anyway. |
+ */ |
+ while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
+ if(ch <= 0x7f){ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } else { |
+ if(ch > 0xe0) { |
+ if( /* handle U+1000..U+CFFF inline */ |
+ ch <= 0xec && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
+ ) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else if(ch < 0xe0) { |
+ if( /* handle U+0080..U+07FF inline */ |
+ ch >= 0xc2 && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
+ ) { |
+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ /* function call for "complicated" and error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
+ if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else if(ch<=0xFFFF) { |
+ *(pDest++)=(UChar)ch; |
+ } else { |
+ *(pDest++)=UTF16_LEAD(ch); |
+ if(pDest<pDestLimit) { |
+ *(pDest++)=UTF16_TRAIL(ch); |
+ } else { |
+ reqLength++; |
+ break; |
+ } |
+ } |
+ } |
+ } |
+ |
+ /* Pre-flight the rest of the string. */ |
+ while((ch = *pSrc) != 0) { |
+ if(ch <= 0x7f){ |
+ ++reqLength; |
+ ++pSrc; |
+ } else { |
+ if(ch > 0xe0) { |
+ if( /* handle U+1000..U+CFFF inline */ |
+ ch <= 0xec && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
+ (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
+ ) { |
+ ++reqLength; |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else if(ch < 0xe0) { |
+ if( /* handle U+0080..U+07FF inline */ |
+ ch >= 0xc2 && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
+ ) { |
+ ++reqLength; |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ /* function call for "complicated" and error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
+ if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ reqLength += U16_LENGTH(ch); |
+ } |
+ } |
+ } else /* srcLength >= 0 */ { |
+ const uint8_t *pSrcLimit = pSrc + srcLength; |
+ int32_t count; |
+ |
+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
+ for(;;) { |
+ /* |
+ * Each iteration of the inner loop progresses by at most 3 UTF-8 |
+ * bytes and one UChar, for most characters. |
+ * For supplementary code points (4 & 2), which are rare, |
+ * there is an additional adjustment. |
+ */ |
+ count = (int32_t)(pDestLimit - pDest); |
+ srcLength = (int32_t)((pSrcLimit - pSrc) / 3); |
+ if(count > srcLength) { |
+ count = srcLength; /* min(remaining dest, remaining src/3) */ |
+ } |
+ if(count < 3) { |
+ /* |
+ * Too much overhead if we get near the end of the string, |
+ * continue with the next loop. |
+ */ |
+ break; |
+ } |
+ |
+ do { |
+ ch = *pSrc; |
+ if(ch <= 0x7f){ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } else { |
+ if(ch > 0xe0) { |
+ if( /* handle U+1000..U+CFFF inline */ |
+ ch <= 0xec && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
+ ) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else if(ch < 0xe0) { |
+ if( /* handle U+0080..U+07FF inline */ |
+ ch >= 0xc2 && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
+ ) { |
+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ if(ch >= 0xf0 || subchar > 0xffff) { |
+ /* |
+ * We may read up to six bytes and write up to two UChars, |
+ * which we didn't account for with computing count, |
+ * so we adjust it here. |
+ */ |
+ if(--count == 0) { |
+ break; |
+ } |
+ } |
+ |
+ /* function call for "complicated" and error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ }else if(ch<=0xFFFF){ |
+ *(pDest++)=(UChar)ch; |
+ }else{ |
+ *(pDest++)=UTF16_LEAD(ch); |
+ *(pDest++)=UTF16_TRAIL(ch); |
+ } |
+ } |
+ } while(--count > 0); |
+ } |
+ |
+ while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
+ ch = *pSrc; |
+ if(ch <= 0x7f){ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } else { |
+ if(ch > 0xe0) { |
+ if( /* handle U+1000..U+CFFF inline */ |
+ ch <= 0xec && |
+ ((pSrcLimit - pSrc) >= 3) && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
+ ) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else if(ch < 0xe0) { |
+ if( /* handle U+0080..U+07FF inline */ |
+ ch >= 0xc2 && |
+ ((pSrcLimit - pSrc) >= 2) && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
+ ) { |
+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ /* function call for "complicated" and error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ }else if(ch<=0xFFFF){ |
+ *(pDest++)=(UChar)ch; |
+ }else{ |
+ *(pDest++)=UTF16_LEAD(ch); |
+ if(pDest<pDestLimit){ |
+ *(pDest++)=UTF16_TRAIL(ch); |
+ }else{ |
+ reqLength++; |
+ break; |
+ } |
+ } |
+ } |
+ } |
+ /* do not fill the dest buffer just count the UChars needed */ |
+ while(pSrc < pSrcLimit){ |
+ ch = *pSrc; |
+ if(ch <= 0x7f){ |
+ reqLength++; |
+ ++pSrc; |
+ } else { |
+ if(ch > 0xe0) { |
+ if( /* handle U+1000..U+CFFF inline */ |
+ ch <= 0xec && |
+ ((pSrcLimit - pSrc) >= 3) && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
+ (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
+ ) { |
+ reqLength++; |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else if(ch < 0xe0) { |
+ if( /* handle U+0080..U+07FF inline */ |
+ ch >= 0xc2 && |
+ ((pSrcLimit - pSrc) >= 2) && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
+ ) { |
+ reqLength++; |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ /* function call for "complicated" and error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ reqLength+=UTF_CHAR_LENGTH(ch); |
+ } |
+ } |
+ } |
+ |
+ reqLength+=(int32_t)(pDest - dest); |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=numSubstitutions; |
+ } |
+ |
+ if(pDestLength){ |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
+ |
+ return dest; |
+} |
+ |
+U_CAPI UChar* U_EXPORT2 |
+u_strFromUTF8(UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const char* src, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode){ |
+ return u_strFromUTF8WithSub( |
+ dest, destCapacity, pDestLength, |
+ src, srcLength, |
+ U_SENTINEL, NULL, |
+ pErrorCode); |
+} |
+ |
+U_CAPI UChar * U_EXPORT2 |
+u_strFromUTF8Lenient(UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const char *src, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode) { |
+ UChar *pDest = dest; |
+ UChar32 ch; |
+ int32_t reqLength = 0; |
+ uint8_t* pSrc = (uint8_t*) src; |
+ |
+ /* args check */ |
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (destCapacity<0) || (dest == NULL && destCapacity > 0) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(srcLength < 0) { |
+ /* Transform a NUL-terminated string. */ |
+ UChar *pDestLimit = dest+destCapacity; |
+ uint8_t t1, t2, t3; /* trail bytes */ |
+ |
+ while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
+ if(ch < 0xc0) { |
+ /* |
+ * ASCII, or a trail byte in lead position which is treated like |
+ * a single-byte sequence for better character boundary |
+ * resynchronization after illegal sequences. |
+ */ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ continue; |
+ } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
+ if((t1 = pSrc[1]) != 0) { |
+ /* 0x3080 = (0xc0 << 6) + 0x80 */ |
+ *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
+ if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ /* 0x2080 = (0x80 << 6) + 0x80 */ |
+ *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
+ if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { |
+ pSrc += 4; |
+ /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
+ ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; |
+ *(pDest++) = U16_LEAD(ch); |
+ if(pDest < pDestLimit) { |
+ *(pDest++) = U16_TRAIL(ch); |
+ } else { |
+ reqLength = 1; |
+ break; |
+ } |
+ continue; |
+ } |
+ } |
+ |
+ /* truncated character at the end */ |
+ *pDest++ = 0xfffd; |
+ while(*++pSrc != 0) {} |
+ break; |
+ } |
+ |
+ /* Pre-flight the rest of the string. */ |
+ while((ch = *pSrc) != 0) { |
+ if(ch < 0xc0) { |
+ /* |
+ * ASCII, or a trail byte in lead position which is treated like |
+ * a single-byte sequence for better character boundary |
+ * resynchronization after illegal sequences. |
+ */ |
+ ++reqLength; |
+ ++pSrc; |
+ continue; |
+ } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
+ if(pSrc[1] != 0) { |
+ ++reqLength; |
+ pSrc += 2; |
+ continue; |
+ } |
+ } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
+ if(pSrc[1] != 0 && pSrc[2] != 0) { |
+ ++reqLength; |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
+ if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { |
+ reqLength += 2; |
+ pSrc += 4; |
+ continue; |
+ } |
+ } |
+ |
+ /* truncated character at the end */ |
+ ++reqLength; |
+ break; |
+ } |
+ } else /* srcLength >= 0 */ { |
+ const uint8_t *pSrcLimit = pSrc + srcLength; |
+ |
+ /* |
+ * This function requires that if srcLength is given, then it must be |
+ * destCapatity >= srcLength so that we need not check for |
+ * destination buffer overflow in the loop. |
+ */ |
+ if(destCapacity < srcLength) { |
+ if(pDestLength != NULL) { |
+ *pDestLength = srcLength; /* this likely overestimates the true destLength! */ |
+ } |
+ *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
+ return NULL; |
+ } |
+ |
+ if((pSrcLimit - pSrc) >= 4) { |
+ pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ |
+ |
+ /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ |
+ do { |
+ ch = *pSrc++; |
+ if(ch < 0xc0) { |
+ /* |
+ * ASCII, or a trail byte in lead position which is treated like |
+ * a single-byte sequence for better character boundary |
+ * resynchronization after illegal sequences. |
+ */ |
+ *pDest++=(UChar)ch; |
+ } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
+ /* 0x3080 = (0xc0 << 6) + 0x80 */ |
+ *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
+ } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ /* 0x2080 = (0x80 << 6) + 0x80 */ |
+ ch = (ch << 12) + (*pSrc++ << 6); |
+ *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
+ } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
+ /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
+ ch = (ch << 18) + (*pSrc++ << 12); |
+ ch += *pSrc++ << 6; |
+ ch += *pSrc++ - 0x3c82080; |
+ *(pDest++) = U16_LEAD(ch); |
+ *(pDest++) = U16_TRAIL(ch); |
+ } |
+ } while(pSrc < pSrcLimit); |
+ |
+ pSrcLimit += 3; /* restore original pSrcLimit */ |
+ } |
+ |
+ while(pSrc < pSrcLimit) { |
+ ch = *pSrc++; |
+ if(ch < 0xc0) { |
+ /* |
+ * ASCII, or a trail byte in lead position which is treated like |
+ * a single-byte sequence for better character boundary |
+ * resynchronization after illegal sequences. |
+ */ |
+ *pDest++=(UChar)ch; |
+ continue; |
+ } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
+ if(pSrc < pSrcLimit) { |
+ /* 0x3080 = (0xc0 << 6) + 0x80 */ |
+ *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
+ continue; |
+ } |
+ } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
+ if((pSrcLimit - pSrc) >= 2) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ /* 0x2080 = (0x80 << 6) + 0x80 */ |
+ ch = (ch << 12) + (*pSrc++ << 6); |
+ *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
+ if((pSrcLimit - pSrc) >= 3) { |
+ /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
+ ch = (ch << 18) + (*pSrc++ << 12); |
+ ch += *pSrc++ << 6; |
+ ch += *pSrc++ - 0x3c82080; |
+ *(pDest++) = U16_LEAD(ch); |
+ *(pDest++) = U16_TRAIL(ch); |
+ pSrc += 4; |
+ continue; |
+ } |
+ } |
+ |
+ /* truncated character at the end */ |
+ *pDest++ = 0xfffd; |
+ break; |
+ } |
+ } |
+ |
+ reqLength+=(int32_t)(pDest - dest); |
+ |
+ if(pDestLength){ |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
+ |
+ return dest; |
+} |
+ |
+static U_INLINE uint8_t * |
+_appendUTF8(uint8_t *pDest, UChar32 c) { |
+ /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ |
+ if((c)<=0x7f) { |
+ *pDest++=(uint8_t)c; |
+ } else if(c<=0x7ff) { |
+ *pDest++=(uint8_t)((c>>6)|0xc0); |
+ *pDest++=(uint8_t)((c&0x3f)|0x80); |
+ } else if(c<=0xffff) { |
+ *pDest++=(uint8_t)((c>>12)|0xe0); |
+ *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
+ } else /* if((uint32_t)(c)<=0x10ffff) */ { |
+ *pDest++=(uint8_t)(((c)>>18)|0xf0); |
+ *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
+ } |
+ return pDest; |
+} |
+ |
+ |
+U_CAPI char* U_EXPORT2 |
+u_strToUTF8WithSub(char *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar *pSrc, |
+ int32_t srcLength, |
+ UChar32 subchar, int32_t *pNumSubstitutions, |
+ UErrorCode *pErrorCode){ |
+ int32_t reqLength=0; |
+ uint32_t ch=0,ch2=0; |
+ uint8_t *pDest = (uint8_t *)dest; |
+ uint8_t *pDestLimit = pDest + destCapacity; |
+ int32_t numSubstitutions; |
+ |
+ /* args check */ |
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ |
+ if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || |
+ (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
+ subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=0; |
+ } |
+ numSubstitutions=0; |
+ |
+ if(srcLength==-1) { |
+ while((ch=*pSrc)!=0) { |
+ ++pSrc; |
+ if(ch <= 0x7f) { |
+ if(pDest<pDestLimit) { |
+ *pDest++ = (uint8_t)ch; |
+ } else { |
+ reqLength = 1; |
+ break; |
+ } |
+ } else if(ch <= 0x7ff) { |
+ if((pDestLimit - pDest) >= 2) { |
+ *pDest++=(uint8_t)((ch>>6)|0xc0); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 2; |
+ break; |
+ } |
+ } else if(ch <= 0xd7ff || ch >= 0xe000) { |
+ if((pDestLimit - pDest) >= 3) { |
+ *pDest++=(uint8_t)((ch>>12)|0xe0); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 3; |
+ break; |
+ } |
+ } else /* ch is a surrogate */ { |
+ int32_t length; |
+ |
+ /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/ |
+ if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
+ ++pSrc; |
+ ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
+ } else if(subchar>=0) { |
+ ch=subchar; |
+ ++numSubstitutions; |
+ } else { |
+ /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ |
+ length = U8_LENGTH(ch); |
+ if((pDestLimit - pDest) >= length) { |
+ /* convert and append*/ |
+ pDest=_appendUTF8(pDest, ch); |
+ } else { |
+ reqLength = length; |
+ break; |
+ } |
+ } |
+ } |
+ while((ch=*pSrc++)!=0) { |
+ if(ch<=0x7f) { |
+ ++reqLength; |
+ } else if(ch<=0x7ff) { |
+ reqLength+=2; |
+ } else if(!UTF_IS_SURROGATE(ch)) { |
+ reqLength+=3; |
+ } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
+ ++pSrc; |
+ reqLength+=4; |
+ } else if(subchar>=0) { |
+ reqLength+=U8_LENGTH(subchar); |
+ ++numSubstitutions; |
+ } else { |
+ /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ } |
+ } else { |
+ const UChar *pSrcLimit = pSrc+srcLength; |
+ int32_t count; |
+ |
+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
+ for(;;) { |
+ /* |
+ * Each iteration of the inner loop progresses by at most 3 UTF-8 |
+ * bytes and one UChar, for most characters. |
+ * For supplementary code points (4 & 2), which are rare, |
+ * there is an additional adjustment. |
+ */ |
+ count = (int32_t)((pDestLimit - pDest) / 3); |
+ srcLength = (int32_t)(pSrcLimit - pSrc); |
+ if(count > srcLength) { |
+ count = srcLength; /* min(remaining dest/3, remaining src) */ |
+ } |
+ if(count < 3) { |
+ /* |
+ * Too much overhead if we get near the end of the string, |
+ * continue with the next loop. |
+ */ |
+ break; |
+ } |
+ do { |
+ ch=*pSrc++; |
+ if(ch <= 0x7f) { |
+ *pDest++ = (uint8_t)ch; |
+ } else if(ch <= 0x7ff) { |
+ *pDest++=(uint8_t)((ch>>6)|0xc0); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else if(ch <= 0xd7ff || ch >= 0xe000) { |
+ *pDest++=(uint8_t)((ch>>12)|0xe0); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else /* ch is a surrogate */ { |
+ /* |
+ * We will read two UChars and probably output four bytes, |
+ * which we didn't account for with computing count, |
+ * so we adjust it here. |
+ */ |
+ if(--count == 0) { |
+ --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ |
+ break; /* recompute count */ |
+ } |
+ |
+ if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
+ ++pSrc; |
+ ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
+ |
+ /* writing 4 bytes per 2 UChars is ok */ |
+ *pDest++=(uint8_t)((ch>>18)|0xf0); |
+ *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
+ if(subchar>=0) { |
+ ch=subchar; |
+ ++numSubstitutions; |
+ } else { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ |
+ /* convert and append*/ |
+ pDest=_appendUTF8(pDest, ch); |
+ } |
+ } |
+ } while(--count > 0); |
+ } |
+ |
+ while(pSrc<pSrcLimit) { |
+ ch=*pSrc++; |
+ if(ch <= 0x7f) { |
+ if(pDest<pDestLimit) { |
+ *pDest++ = (uint8_t)ch; |
+ } else { |
+ reqLength = 1; |
+ break; |
+ } |
+ } else if(ch <= 0x7ff) { |
+ if((pDestLimit - pDest) >= 2) { |
+ *pDest++=(uint8_t)((ch>>6)|0xc0); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 2; |
+ break; |
+ } |
+ } else if(ch <= 0xd7ff || ch >= 0xe000) { |
+ if((pDestLimit - pDest) >= 3) { |
+ *pDest++=(uint8_t)((ch>>12)|0xe0); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 3; |
+ break; |
+ } |
+ } else /* ch is a surrogate */ { |
+ int32_t length; |
+ |
+ if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { |
+ ++pSrc; |
+ ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
+ } else if(subchar>=0) { |
+ ch=subchar; |
+ ++numSubstitutions; |
+ } else { |
+ /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ |
+ length = U8_LENGTH(ch); |
+ if((pDestLimit - pDest) >= length) { |
+ /* convert and append*/ |
+ pDest=_appendUTF8(pDest, ch); |
+ } else { |
+ reqLength = length; |
+ break; |
+ } |
+ } |
+ } |
+ while(pSrc<pSrcLimit) { |
+ ch=*pSrc++; |
+ if(ch<=0x7f) { |
+ ++reqLength; |
+ } else if(ch<=0x7ff) { |
+ reqLength+=2; |
+ } else if(!UTF_IS_SURROGATE(ch)) { |
+ reqLength+=3; |
+ } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { |
+ ++pSrc; |
+ reqLength+=4; |
+ } else if(subchar>=0) { |
+ reqLength+=U8_LENGTH(subchar); |
+ ++numSubstitutions; |
+ } else { |
+ /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } |
+ } |
+ } |
+ |
+ reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=numSubstitutions; |
+ } |
+ |
+ if(pDestLength){ |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
+ return dest; |
+} |
+ |
+U_CAPI char* U_EXPORT2 |
+u_strToUTF8(char *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar *pSrc, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode){ |
+ return u_strToUTF8WithSub( |
+ dest, destCapacity, pDestLength, |
+ pSrc, srcLength, |
+ U_SENTINEL, NULL, |
+ pErrorCode); |
+} |
+ |
+U_CAPI UChar* U_EXPORT2 |
+u_strFromJavaModifiedUTF8WithSub( |
+ UChar *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const char *src, |
+ int32_t srcLength, |
+ UChar32 subchar, int32_t *pNumSubstitutions, |
+ UErrorCode *pErrorCode) { |
+ UChar *pDest = dest; |
+ UChar *pDestLimit = dest+destCapacity; |
+ UChar32 ch; |
+ int32_t reqLength = 0; |
+ const uint8_t* pSrc = (const uint8_t*) src; |
+ const uint8_t *pSrcLimit; |
+ int32_t count; |
+ uint8_t t1, t2; /* trail bytes */ |
+ int32_t numSubstitutions; |
+ |
+ /* args check */ |
+ if(U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (dest==NULL && destCapacity!=0) || destCapacity<0 || |
+ subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=0; |
+ } |
+ numSubstitutions=0; |
+ |
+ if(srcLength < 0) { |
+ /* |
+ * Transform a NUL-terminated ASCII string. |
+ * Handle non-ASCII strings with slower code. |
+ */ |
+ while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } |
+ if(ch == 0) { |
+ reqLength=(int32_t)(pDest - dest); |
+ if(pDestLength) { |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
+ return dest; |
+ } |
+ srcLength = uprv_strlen((const char *)pSrc); |
+ } |
+ |
+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
+ pSrcLimit = pSrc + srcLength; |
+ for(;;) { |
+ count = (int32_t)(pDestLimit - pDest); |
+ srcLength = (int32_t)(pSrcLimit - pSrc); |
+ if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { |
+ /* fast ASCII loop */ |
+ const uint8_t *prevSrc = pSrc; |
+ int32_t delta; |
+ while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } |
+ delta = (int32_t)(pSrc - prevSrc); |
+ count -= delta; |
+ srcLength -= delta; |
+ } |
+ /* |
+ * Each iteration of the inner loop progresses by at most 3 UTF-8 |
+ * bytes and one UChar. |
+ */ |
+ srcLength /= 3; |
+ if(count > srcLength) { |
+ count = srcLength; /* min(remaining dest, remaining src/3) */ |
+ } |
+ if(count < 3) { |
+ /* |
+ * Too much overhead if we get near the end of the string, |
+ * continue with the next loop. |
+ */ |
+ break; |
+ } |
+ do { |
+ ch = *pSrc; |
+ if(ch <= 0x7f){ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } else { |
+ if(ch >= 0xe0) { |
+ if( /* handle U+0000..U+FFFF inline */ |
+ ch <= 0xef && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
+ ) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else { |
+ if( /* handle U+0000..U+07FF inline */ |
+ ch >= 0xc0 && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
+ ) { |
+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ if(subchar < 0) { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else if(subchar > 0xffff && --count == 0) { |
+ /* |
+ * We need to write two UChars, adjusted count for that, |
+ * and ran out of space. |
+ */ |
+ break; |
+ } else { |
+ /* function call for error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ ++numSubstitutions; |
+ if(subchar<=0xFFFF) { |
+ *(pDest++)=(UChar)subchar; |
+ } else { |
+ *(pDest++)=U16_LEAD(subchar); |
+ *(pDest++)=U16_TRAIL(subchar); |
+ } |
+ } |
+ } |
+ } while(--count > 0); |
+ } |
+ |
+ while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
+ ch = *pSrc; |
+ if(ch <= 0x7f){ |
+ *pDest++=(UChar)ch; |
+ ++pSrc; |
+ } else { |
+ if(ch >= 0xe0) { |
+ if( /* handle U+0000..U+FFFF inline */ |
+ ch <= 0xef && |
+ ((pSrcLimit - pSrc) >= 3) && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
+ ) { |
+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else { |
+ if( /* handle U+0000..U+07FF inline */ |
+ ch >= 0xc0 && |
+ ((pSrcLimit - pSrc) >= 2) && |
+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
+ ) { |
+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ if(subchar < 0) { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else { |
+ /* function call for error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ ++numSubstitutions; |
+ if(subchar<=0xFFFF) { |
+ *(pDest++)=(UChar)subchar; |
+ } else { |
+ *(pDest++)=U16_LEAD(subchar); |
+ if(pDest<pDestLimit) { |
+ *(pDest++)=U16_TRAIL(subchar); |
+ } else { |
+ reqLength++; |
+ break; |
+ } |
+ } |
+ } |
+ } |
+ } |
+ |
+ /* do not fill the dest buffer just count the UChars needed */ |
+ while(pSrc < pSrcLimit){ |
+ ch = *pSrc; |
+ if(ch <= 0x7f) { |
+ reqLength++; |
+ ++pSrc; |
+ } else { |
+ if(ch >= 0xe0) { |
+ if( /* handle U+0000..U+FFFF inline */ |
+ ch <= 0xef && |
+ ((pSrcLimit - pSrc) >= 3) && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
+ (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
+ ) { |
+ reqLength++; |
+ pSrc += 3; |
+ continue; |
+ } |
+ } else { |
+ if( /* handle U+0000..U+07FF inline */ |
+ ch >= 0xc0 && |
+ ((pSrcLimit - pSrc) >= 2) && |
+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
+ ) { |
+ reqLength++; |
+ pSrc += 2; |
+ continue; |
+ } |
+ } |
+ |
+ if(subchar < 0) { |
+ *pErrorCode = U_INVALID_CHAR_FOUND; |
+ return NULL; |
+ } else { |
+ /* function call for error cases */ |
+ ++pSrc; /* continue after the lead byte */ |
+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
+ ++numSubstitutions; |
+ reqLength+=U16_LENGTH(ch); |
+ } |
+ } |
+ } |
+ |
+ if(pNumSubstitutions!=NULL) { |
+ *pNumSubstitutions=numSubstitutions; |
+ } |
+ |
+ reqLength+=(int32_t)(pDest - dest); |
+ if(pDestLength) { |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
+ return dest; |
+} |
+ |
+U_CAPI char* U_EXPORT2 |
+u_strToJavaModifiedUTF8( |
+ char *dest, |
+ int32_t destCapacity, |
+ int32_t *pDestLength, |
+ const UChar *src, |
+ int32_t srcLength, |
+ UErrorCode *pErrorCode) { |
+ int32_t reqLength=0; |
+ uint32_t ch=0; |
+ uint8_t *pDest = (uint8_t *)dest; |
+ uint8_t *pDestLimit = pDest + destCapacity; |
+ const UChar *pSrcLimit; |
+ int32_t count; |
+ |
+ /* args check */ |
+ if(U_FAILURE(*pErrorCode)){ |
+ return NULL; |
+ } |
+ if( (src==NULL && srcLength!=0) || srcLength < -1 || |
+ (dest==NULL && destCapacity!=0) || destCapacity<0 |
+ ) { |
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
+ return NULL; |
+ } |
+ |
+ if(srcLength==-1) { |
+ /* Convert NUL-terminated ASCII, then find the string length. */ |
+ while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { |
+ *pDest++ = (uint8_t)ch; |
+ ++src; |
+ } |
+ if(ch == 0) { |
+ reqLength=(int32_t)(pDest - (uint8_t *)dest); |
+ if(pDestLength) { |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
+ return dest; |
+ } |
+ srcLength = u_strlen(src); |
+ } |
+ |
+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
+ pSrcLimit = src+srcLength; |
+ for(;;) { |
+ count = (int32_t)(pDestLimit - pDest); |
+ srcLength = (int32_t)(pSrcLimit - src); |
+ if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { |
+ /* fast ASCII loop */ |
+ const UChar *prevSrc = src; |
+ int32_t delta; |
+ while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { |
+ *pDest++=(uint8_t)ch; |
+ ++src; |
+ } |
+ delta = (int32_t)(src - prevSrc); |
+ count -= delta; |
+ srcLength -= delta; |
+ } |
+ /* |
+ * Each iteration of the inner loop progresses by at most 3 UTF-8 |
+ * bytes and one UChar. |
+ */ |
+ count /= 3; |
+ if(count > srcLength) { |
+ count = srcLength; /* min(remaining dest/3, remaining src) */ |
+ } |
+ if(count < 3) { |
+ /* |
+ * Too much overhead if we get near the end of the string, |
+ * continue with the next loop. |
+ */ |
+ break; |
+ } |
+ do { |
+ ch=*src++; |
+ if(ch <= 0x7f && ch != 0) { |
+ *pDest++ = (uint8_t)ch; |
+ } else if(ch <= 0x7ff) { |
+ *pDest++=(uint8_t)((ch>>6)|0xc0); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ *pDest++=(uint8_t)((ch>>12)|0xe0); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } |
+ } while(--count > 0); |
+ } |
+ |
+ while(src<pSrcLimit) { |
+ ch=*src++; |
+ if(ch <= 0x7f && ch != 0) { |
+ if(pDest<pDestLimit) { |
+ *pDest++ = (uint8_t)ch; |
+ } else { |
+ reqLength = 1; |
+ break; |
+ } |
+ } else if(ch <= 0x7ff) { |
+ if((pDestLimit - pDest) >= 2) { |
+ *pDest++=(uint8_t)((ch>>6)|0xc0); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 2; |
+ break; |
+ } |
+ } else { |
+ if((pDestLimit - pDest) >= 3) { |
+ *pDest++=(uint8_t)((ch>>12)|0xe0); |
+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
+ *pDest++=(uint8_t)((ch&0x3f)|0x80); |
+ } else { |
+ reqLength = 3; |
+ break; |
+ } |
+ } |
+ } |
+ while(src<pSrcLimit) { |
+ ch=*src++; |
+ if(ch <= 0x7f && ch != 0) { |
+ ++reqLength; |
+ } else if(ch<=0x7ff) { |
+ reqLength+=2; |
+ } else { |
+ reqLength+=3; |
+ } |
+ } |
+ |
+ reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
+ if(pDestLength){ |
+ *pDestLength = reqLength; |
+ } |
+ |
+ /* Terminate the buffer */ |
+ u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
+ return dest; |
+} |
Property changes on: icu46/source/common/ustrtrns.c |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |