icu46/source/common/ustrtrns.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/common/ustrtrns.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/common/ustrtrns.c

===================================================================

--- icu46/source/common/ustrtrns.c (revision 0)

+++ icu46/source/common/ustrtrns.c (revision 0)

@@ -0,0 +1,1640 @@

+/*

+******************************************************************************

+* File ustrtrns.c

+* Modification History:

+* Date Name Description

+* 9/10/2001 Ram Creation.

+******************************************************************************

+*/

+/*******************************************************************************

+ *

+ * u_strTo* and u_strFrom* APIs

+ * WCS functions moved to ustr_wcs.c for better modularization

+ *

+ *******************************************************************************

+ */

+#include "unicode/putil.h"

+#include "unicode/ustring.h"

+#include "cstring.h"

+#include "cmemory.h"

+#include "ustr_imp.h"

+/*
+ * Convert a UTF-32 string to UTF-16, optionally replacing invalid input.
+ *
+ * dest/destCapacity  output buffer and its size in UChars (dest may be NULL
+ *                    only if destCapacity is 0, i.e. for pre-flighting)
+ * pDestLength        if not NULL, receives the number of UChars required
+ * src/srcLength      input; srcLength==-1 means NUL-terminated
+ * subchar            code point written in place of each surrogate or
+ *                    out-of-range input value; a negative value means
+ *                    "do not substitute, fail with U_INVALID_CHAR_FOUND"
+ * pNumSubstitutions  if not NULL, receives the number of substitutions
+ * pErrorCode         in/out ICU error code; must not be NULL
+ *
+ * Returns dest, or NULL on argument or conversion errors.
+ */
+U_CAPI UChar* U_EXPORT2
+u_strFromUTF32WithSub(UChar *dest,
+                      int32_t destCapacity,
+                      int32_t *pDestLength,
+                      const UChar32 *src,
+                      int32_t srcLength,
+                      UChar32 subchar, int32_t *pNumSubstitutions,
+                      UErrorCode *pErrorCode) {
+    const UChar32 *srcLimit;
+    UChar32 ch;
+    UChar *destLimit;
+    UChar *pDest;
+    int32_t reqLength;
+    int32_t numSubstitutions;
+
+    /* args check; the NULL test matches the UTF-8 functions in this file */
+    if(pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
+        return NULL;
+    }
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
+    pDest = dest;
+    destLimit = dest + destCapacity;
+    reqLength = 0;
+    numSubstitutions = 0;
+
+    if(srcLength < 0) {
+        /* simple loop for conversion of a NUL-terminated BMP string */
+        while((ch=*src) != 0 &&
+              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
+            ++src;
+            if(pDest < destLimit) {
+                *pDest++ = (UChar)ch;
+            } else {
+                ++reqLength;
+            }
+        }
+        srcLimit = src;
+        if(ch != 0) {
+            /* "complicated" case, find the end of the remaining string */
+            while(*++srcLimit != 0) {}
+        }
+    } else {
+        srcLimit = src + srcLength;
+    }
+
+    /* convert with length */
+    while(src < srcLimit) {
+        ch = *src++;
+        do {
+            /* usually "loops" once; twice only for writing subchar */
+            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
+                /* BMP code point other than a surrogate: one UChar */
+                if(pDest < destLimit) {
+                    *pDest++ = (UChar)ch;
+                } else {
+                    ++reqLength;
+                }
+                break;
+            } else if(0x10000 <= ch && ch <= 0x10ffff) {
+                /*
+                 * Supplementary code point: a surrogate pair.
+                 * Compare the remaining room instead of forming pDest+2,
+                 * which could point beyond one-past-the-end of dest.
+                 */
+                if((destLimit - pDest) >= 2) {
+                    *pDest++ = U16_LEAD(ch);
+                    *pDest++ = U16_TRAIL(ch);
+                } else {
+                    reqLength += 2;
+                }
+                break;
+            } else if((ch = subchar) < 0) {
+                /* surrogate code point, or not a Unicode code point at all */
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            } else {
+                /* ch now holds subchar; go around once more to write it */
+                ++numSubstitutions;
+            }
+        } while(TRUE);
+    }
+
+    /* units that did not fit were counted in reqLength; add the ones written */
+    reqLength += (int32_t)(pDest - dest);
+    if(pDestLength) {
+        *pDestLength = reqLength;
+    }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
+
+    return dest;
+}

+/*
+ * Strict UTF-32 to UTF-16 conversion: forwards to u_strFromUTF32WithSub()
+ * with subchar=U_SENTINEL and no substitution counter.
+ */
+U_CAPI UChar* U_EXPORT2
+u_strFromUTF32(UChar *dest,
+               int32_t destCapacity,
+               int32_t *pDestLength,
+               const UChar32 *src,
+               int32_t srcLength,
+               UErrorCode *pErrorCode) {
+    return u_strFromUTF32WithSub(
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}

+/*
+ * Convert a UTF-16 string to UTF-32, optionally replacing unpaired surrogates.
+ *
+ * dest/destCapacity  output buffer and its size in UChar32 units (dest may be
+ *                    NULL only if destCapacity is 0, i.e. for pre-flighting)
+ * pDestLength        if not NULL, receives the number of UChar32s required
+ * src/srcLength      input; srcLength==-1 means NUL-terminated
+ * subchar            code point written for each unpaired surrogate; a
+ *                    negative value means "fail with U_INVALID_CHAR_FOUND"
+ * pNumSubstitutions  if not NULL, receives the number of substitutions
+ * pErrorCode         in/out ICU error code; must not be NULL
+ *
+ * Returns dest, or NULL on argument or conversion errors.
+ */
+U_CAPI UChar32* U_EXPORT2
+u_strToUTF32WithSub(UChar32 *dest,
+                    int32_t destCapacity,
+                    int32_t *pDestLength,
+                    const UChar *src,
+                    int32_t srcLength,
+                    UChar32 subchar, int32_t *pNumSubstitutions,
+                    UErrorCode *pErrorCode) {
+    const UChar *srcLimit;
+    UChar32 ch;
+    UChar ch2;
+    UChar32 *destLimit;
+    UChar32 *pDest;
+    int32_t reqLength;
+    int32_t numSubstitutions;
+
+    /* args check; the NULL test matches the UTF-8 functions in this file */
+    if(pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
+        return NULL;
+    }
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
+    pDest = dest;
+    destLimit = dest + destCapacity;
+    reqLength = 0;
+    numSubstitutions = 0;
+
+    if(srcLength < 0) {
+        /* simple loop for conversion of a NUL-terminated BMP string */
+        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
+            ++src;
+            if(pDest < destLimit) {
+                *pDest++ = ch;
+            } else {
+                ++reqLength;
+            }
+        }
+        srcLimit = src;
+        if(ch != 0) {
+            /* "complicated" case, find the end of the remaining string */
+            while(*++srcLimit != 0) {}
+        }
+    } else {
+        srcLimit = src + srcLength;
+    }
+
+    /* convert with length */
+    while(src < srcLimit) {
+        ch = *src++;
+        if(!U16_IS_SURROGATE(ch)) {
+            /* write or count ch below */
+        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
+            /* well-formed pair: combine into one supplementary code point */
+            ++src;
+            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
+        } else if((ch = subchar) < 0) {
+            /* unpaired surrogate */
+            *pErrorCode = U_INVALID_CHAR_FOUND;
+            return NULL;
+        } else {
+            /* ch now holds subchar; it is written or counted below */
+            ++numSubstitutions;
+        }
+        if(pDest < destLimit) {
+            *pDest++ = ch;
+        } else {
+            ++reqLength;
+        }
+    }
+
+    /* units that did not fit were counted in reqLength; add the ones written */
+    reqLength += (int32_t)(pDest - dest);
+    if(pDestLength) {
+        *pDestLength = reqLength;
+    }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
+
+    return dest;
+}

+/*
+ * Strict UTF-16 to UTF-32 conversion: forwards to u_strToUTF32WithSub()
+ * with subchar=U_SENTINEL and no substitution counter.
+ */
+U_CAPI UChar32* U_EXPORT2
+u_strToUTF32(UChar32 *dest,
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src,
+             int32_t srcLength,
+             UErrorCode *pErrorCode) {
+    return u_strToUTF32WithSub(
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}

+/*
+ * For utf8_nextCharSafeBodyTerminated() and utf8_nextCharSafeBodyPointer():
+ * indexed by the number of trail bytes (0..3); both functions reject a
+ * decoded code point that is smaller than the entry for its trail byte count.
+ */
+static const UChar32
+utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

+/*
+ * Version of utf8_nextCharSafeBody() with the following differences:
+ * - checks for NUL termination instead of length
+ * - works with pointers instead of indexes
+ * - always strict (strict==-1)
+ *
+ * *ps points to after the lead byte and will be moved to after the last trail byte.
+ * c is the lead byte.
+ * A NUL in a trail byte position fails the trail byte range test, so decoding
+ * stops there; the error path re-scans from *ps and only skips bytes
+ * that UTF8_IS_TRAIL() accepts.
+ * @return the code point, or U_SENTINEL
+ */
+static UChar32
+utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
+    const uint8_t *s=*ps;
+    uint8_t trail, illegal=0;
+    uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
+    UTF8_MASK_LEAD_BYTE((c), count);
+    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+    switch(count) {
+    /* each branch falls through to the next one */
+    case 5:
+    case 4:
+        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+        illegal=1;
+        break;
+    case 3:
+        trail=(uint8_t)(*s++ - 0x80);
+        c=(c<<6)|trail;
+        if(trail>0x3f || c>=0x110) {
+            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
+            illegal=1;
+            break;
+        }
+        /* fall through */
+    case 2:
+        trail=(uint8_t)(*s++ - 0x80);
+        if(trail>0x3f) {
+            /* not a trail byte */
+            illegal=1;
+            break;
+        }
+        c=(c<<6)|trail;
+        /* fall through */
+    case 1:
+        trail=(uint8_t)(*s++ - 0x80);
+        if(trail>0x3f) {
+            /* not a trail byte */
+            illegal=1;
+        }
+        c=(c<<6)|trail;
+        break;
+    case 0:
+        return U_SENTINEL;
+    /* no default branch to optimize switch() - all values are covered */
+    }
+
+    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+    /* illegal is also set if count>=4 */
+    if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
+        /* error handling */
+        /* don't go beyond this sequence */
+        s=*ps;
+        while(count>0 && UTF8_IS_TRAIL(*s)) {
+            ++s;
+            --count;
+        }
+        c=U_SENTINEL;
+    }
+    *ps=s;
+    return c;
+}

+/*
+ * Version of utf8_nextCharSafeBody() with the following differences:
+ * - works with pointers instead of indexes
+ * - always strict (strict==-1)
+ *
+ * *ps points to after the lead byte and will be moved to after the last trail byte.
+ * limit points to after the last readable input byte.
+ * c is the lead byte.
+ * @return the code point, or U_SENTINEL
+ */
+static UChar32
+utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
+    const uint8_t *s=*ps;
+    uint8_t trail, illegal=0;
+    uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
+    if((limit-s)>=count) {
+        UTF8_MASK_LEAD_BYTE((c), count);
+        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+        switch(count) {
+        /* each branch falls through to the next one */
+        case 5:
+        case 4:
+            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+            illegal=1;
+            break;
+        case 3:
+            trail=*s++;
+            c=(c<<6)|(trail&0x3f);
+            if(c<0x110) {
+                /* non-zero unless the top two bits of trail are 10 */
+                illegal|=(trail&0xc0)^0x80;
+            } else {
+                /* code point>0x10ffff, outside Unicode */
+                illegal=1;
+                break;
+            }
+            /* fall through */
+        case 2:
+            trail=*s++;
+            c=(c<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+            /* fall through */
+        case 1:
+            trail=*s++;
+            c=(c<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+            break;
+        case 0:
+            return U_SENTINEL;
+        /* no default branch to optimize switch() - all values are covered */
+        }
+    } else {
+        illegal=1; /* too few bytes left */
+    }
+
+    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+    /* illegal is also set if count>=4 */
+    if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
+        /* error handling */
+        /* don't go beyond this sequence */
+        s=*ps;
+        while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
+            ++s;
+            --count;
+        }
+        c=U_SENTINEL;
+    }
+    *ps=s;
+    return c;
+}

+/*
+ * Convert a UTF-8 string to UTF-16, optionally replacing illegal sequences.
+ *
+ * dest/destCapacity  output buffer and its size in UChars (dest may be NULL
+ *                    only if destCapacity is 0, i.e. for pre-flighting)
+ * pDestLength        if not NULL, receives the number of UChars required
+ * src/srcLength      input bytes; srcLength==-1 means NUL-terminated
+ * subchar            code point written for each illegal byte sequence; a
+ *                    negative value means "fail with U_INVALID_CHAR_FOUND"
+ * pNumSubstitutions  if not NULL, receives the number of substitutions
+ * pErrorCode         in/out ICU error code
+ *
+ * Each input form (NUL-terminated, or with length) first runs a converting
+ * loop until the input or the destination is exhausted, then a pre-flighting
+ * loop that only counts the UChars still needed in reqLength.
+ *
+ * Returns dest, or NULL on argument or conversion errors.
+ */
+U_CAPI UChar* U_EXPORT2
+u_strFromUTF8WithSub(UChar *dest,
+              int32_t destCapacity,
+              int32_t *pDestLength,
+              const char* src,
+              int32_t srcLength,
+              UChar32 subchar, int32_t *pNumSubstitutions,
+              UErrorCode *pErrorCode){
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    UChar32 ch;
+    int32_t reqLength = 0;
+    const uint8_t* pSrc = (const uint8_t*) src;
+    uint8_t t1, t2; /* trail bytes */
+    int32_t numSubstitutions;
+
+    /* args check */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+        return NULL;
+    }
+
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
+    numSubstitutions=0;
+
+    /*
+     * Inline processing of UTF-8 byte sequences:
+     *
+     * Byte sequences for the most common characters are handled inline in
+     * the conversion loops. In order to reduce the path lengths for those
+     * characters, the tests are arranged in a kind of binary search.
+     * ASCII (<=0x7f) is checked first, followed by the dividing point
+     * between 2- and 3-byte sequences (0xe0).
+     * The 3-byte branch is tested first to speed up CJK text.
+     * The compiler should combine the subtractions for the two tests for 0xe0.
+     * Each branch then tests for the other end of its range.
+     */
+
+    if(srcLength < 0){
+        /*
+         * Transform a NUL-terminated string.
+         * The code explicitly checks for NULs only in the lead byte position.
+         * A NUL byte in the trail byte position fails the trail byte range check anyway.
+         */
+        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
+            if(ch <= 0x7f){
+                *pDest++=(UChar)ch;
+                ++pSrc;
+            } else {
+                if(ch > 0xe0) {
+                    if( /* handle U+1000..U+CFFF inline */
+                        ch <= 0xec &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    ) {
+                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
+                        pSrc += 3;
+                        continue;
+                    }
+                } else if(ch < 0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        ch >= 0xc2 &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    ) {
+                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
+                        pSrc += 2;
+                        continue;
+                    }
+                }
+
+                /* function call for "complicated" and error cases */
+                ++pSrc; /* continue after the lead byte */
+                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
+                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                } else if(ch<=0xFFFF) {
+                    *(pDest++)=(UChar)ch;
+                } else {
+                    *(pDest++)=UTF16_LEAD(ch);
+                    if(pDest<pDestLimit) {
+                        *(pDest++)=UTF16_TRAIL(ch);
+                    } else {
+                        /* no room for the trail surrogate: count it and start pre-flighting */
+                        reqLength++;
+                        break;
+                    }
+                }
+            }
+        }
+
+        /* Pre-flight the rest of the string. */
+        while((ch = *pSrc) != 0) {
+            if(ch <= 0x7f){
+                ++reqLength;
+                ++pSrc;
+            } else {
+                if(ch > 0xe0) {
+                    if( /* handle U+1000..U+CFFF inline */
+                        ch <= 0xec &&
+                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
+                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                    ) {
+                        ++reqLength;
+                        pSrc += 3;
+                        continue;
+                    }
+                } else if(ch < 0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        ch >= 0xc2 &&
+                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                    ) {
+                        ++reqLength;
+                        pSrc += 2;
+                        continue;
+                    }
+                }
+
+                /* function call for "complicated" and error cases */
+                ++pSrc; /* continue after the lead byte */
+                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
+                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                }
+                reqLength += U16_LENGTH(ch);
+            }
+        }
+    } else /* srcLength >= 0 */ {
+        const uint8_t *pSrcLimit = pSrc + srcLength;
+        int32_t count;
+
+        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+        for(;;) {
+            /*
+             * Each iteration of the inner loop progresses by at most 3 UTF-8
+             * bytes and one UChar, for most characters.
+             * For supplementary code points (4 & 2), which are rare,
+             * there is an additional adjustment.
+             */
+            count = (int32_t)(pDestLimit - pDest);
+            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
+            if(count > srcLength) {
+                count = srcLength; /* min(remaining dest, remaining src/3) */
+            }
+            if(count < 3) {
+                /*
+                 * Too much overhead if we get near the end of the string,
+                 * continue with the next loop.
+                 */
+                break;
+            }
+
+            do {
+                ch = *pSrc;
+                if(ch <= 0x7f){
+                    *pDest++=(UChar)ch;
+                    ++pSrc;
+                } else {
+                    if(ch > 0xe0) {
+                        if( /* handle U+1000..U+CFFF inline */
+                            ch <= 0xec &&
+                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
+                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                        ) {
+                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
+                            pSrc += 3;
+                            continue;
+                        }
+                    } else if(ch < 0xe0) {
+                        if( /* handle U+0080..U+07FF inline */
+                            ch >= 0xc2 &&
+                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                        ) {
+                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
+                            pSrc += 2;
+                            continue;
+                        }
+                    }
+
+                    if(ch >= 0xf0 || subchar > 0xffff) {
+                        /*
+                         * We may read up to six bytes and write up to two UChars,
+                         * which we didn't account for with computing count,
+                         * so we adjust it here.
+                         */
+                        if(--count == 0) {
+                            break;
+                        }
+                    }
+
+                    /* function call for "complicated" and error cases */
+                    ++pSrc; /* continue after the lead byte */
+                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    }else if(ch<=0xFFFF){
+                        *(pDest++)=(UChar)ch;
+                    }else{
+                        *(pDest++)=UTF16_LEAD(ch);
+                        *(pDest++)=UTF16_TRAIL(ch);
+                    }
+                }
+            } while(--count > 0);
+        }
+
+        /* careful loop near the end: checks both limits for every character */
+        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
+            ch = *pSrc;
+            if(ch <= 0x7f){
+                *pDest++=(UChar)ch;
+                ++pSrc;
+            } else {
+                if(ch > 0xe0) {
+                    if( /* handle U+1000..U+CFFF inline */
+                        ch <= 0xec &&
+                        ((pSrcLimit - pSrc) >= 3) &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    ) {
+                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
+                        pSrc += 3;
+                        continue;
+                    }
+                } else if(ch < 0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        ch >= 0xc2 &&
+                        ((pSrcLimit - pSrc) >= 2) &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    ) {
+                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
+                        pSrc += 2;
+                        continue;
+                    }
+                }
+
+                /* function call for "complicated" and error cases */
+                ++pSrc; /* continue after the lead byte */
+                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                }else if(ch<=0xFFFF){
+                    *(pDest++)=(UChar)ch;
+                }else{
+                    *(pDest++)=UTF16_LEAD(ch);
+                    if(pDest<pDestLimit){
+                        *(pDest++)=UTF16_TRAIL(ch);
+                    }else{
+                        /* no room for the trail surrogate: count it and start pre-flighting */
+                        reqLength++;
+                        break;
+                    }
+                }
+            }
+        }
+
+        /* do not fill the dest buffer just count the UChars needed */
+        while(pSrc < pSrcLimit){
+            ch = *pSrc;
+            if(ch <= 0x7f){
+                reqLength++;
+                ++pSrc;
+            } else {
+                if(ch > 0xe0) {
+                    if( /* handle U+1000..U+CFFF inline */
+                        ch <= 0xec &&
+                        ((pSrcLimit - pSrc) >= 3) &&
+                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
+                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                    ) {
+                        reqLength++;
+                        pSrc += 3;
+                        continue;
+                    }
+                } else if(ch < 0xe0) {
+                    if( /* handle U+0080..U+07FF inline */
+                        ch >= 0xc2 &&
+                        ((pSrcLimit - pSrc) >= 2) &&
+                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                    ) {
+                        reqLength++;
+                        pSrc += 2;
+                        continue;
+                    }
+                }
+
+                /* function call for "complicated" and error cases */
+                ++pSrc; /* continue after the lead byte */
+                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                }
+                reqLength+=UTF_CHAR_LENGTH(ch);
+            }
+        }
+    }
+
+    /* units that did not fit were counted in reqLength; add the ones written */
+    reqLength+=(int32_t)(pDest - dest);
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=numSubstitutions;
+    }
+
+    if(pDestLength){
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
+
+    return dest;
+}

+/*
+ * Strict UTF-8 to UTF-16 conversion: forwards to u_strFromUTF8WithSub()
+ * with subchar=U_SENTINEL and no substitution counter.
+ */
+U_CAPI UChar* U_EXPORT2
+u_strFromUTF8(UChar *dest,
+              int32_t destCapacity,
+              int32_t *pDestLength,
+              const char* src,
+              int32_t srcLength,
+              UErrorCode *pErrorCode){
+    return u_strFromUTF8WithSub(
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}

+/*
+ * Convert UTF-8 to UTF-16 without validating the input.
+ *
+ * The lead byte alone selects the sequence length; trail bytes are not
+ * range-checked (the NUL-terminated path only tests them for NUL).
+ * A multi-byte sequence cut short by the end of the input is written
+ * as U+FFFD and ends the conversion.
+ *
+ * dest/destCapacity  output buffer and its size in UChars
+ * pDestLength        if not NULL, receives the resulting/required length
+ * src/srcLength      input bytes; srcLength==-1 means NUL-terminated.
+ *                    With srcLength>=0 the function requires
+ *                    destCapacity>=srcLength and otherwise fails with
+ *                    U_BUFFER_OVERFLOW_ERROR before converting anything.
+ * pErrorCode         in/out ICU error code
+ *
+ * Returns dest, or NULL on argument errors or the overflow case above.
+ */
+U_CAPI UChar * U_EXPORT2
+u_strFromUTF8Lenient(UChar *dest,
+                     int32_t destCapacity,
+                     int32_t *pDestLength,
+                     const char *src,
+                     int32_t srcLength,
+                     UErrorCode *pErrorCode) {
+    UChar *pDest = dest;
+    UChar32 ch;
+    int32_t reqLength = 0;
+    /* the input is only read, so keep the const qualifier */
+    const uint8_t* pSrc = (const uint8_t*) src;
+
+    /* args check */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+        return NULL;
+    }
+
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(srcLength < 0) {
+        /* Transform a NUL-terminated string. */
+        UChar *pDestLimit = dest+destCapacity;
+        uint8_t t1, t2, t3; /* trail bytes */
+
+        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
+            if(ch < 0xc0) {
+                /*
+                 * ASCII, or a trail byte in lead position which is treated like
+                 * a single-byte sequence for better character boundary
+                 * resynchronization after illegal sequences.
+                 */
+                *pDest++=(UChar)ch;
+                ++pSrc;
+                continue;
+            } else if(ch < 0xe0) { /* U+0080..U+07FF */
+                if((t1 = pSrc[1]) != 0) {
+                    /* 0x3080 = (0xc0 << 6) + 0x80 */
+                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
+                    pSrc += 2;
+                    continue;
+                }
+            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
+                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
+                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                    /* 0x2080 = (0x80 << 6) + 0x80 */
+                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
+                    pSrc += 3;
+                    continue;
+                }
+            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
+                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
+                    pSrc += 4;
+                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
+                    *(pDest++) = U16_LEAD(ch);
+                    if(pDest < pDestLimit) {
+                        *(pDest++) = U16_TRAIL(ch);
+                    } else {
+                        /* no room for the trail surrogate: count it and start pre-flighting */
+                        reqLength = 1;
+                        break;
+                    }
+                    continue;
+                }
+            }
+
+            /* truncated character at the end */
+            *pDest++ = 0xfffd;
+            while(*++pSrc != 0) {}
+            break;
+        }
+
+        /* Pre-flight the rest of the string. */
+        while((ch = *pSrc) != 0) {
+            if(ch < 0xc0) {
+                /*
+                 * ASCII, or a trail byte in lead position which is treated like
+                 * a single-byte sequence for better character boundary
+                 * resynchronization after illegal sequences.
+                 */
+                ++reqLength;
+                ++pSrc;
+                continue;
+            } else if(ch < 0xe0) { /* U+0080..U+07FF */
+                if(pSrc[1] != 0) {
+                    ++reqLength;
+                    pSrc += 2;
+                    continue;
+                }
+            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
+                if(pSrc[1] != 0 && pSrc[2] != 0) {
+                    ++reqLength;
+                    pSrc += 3;
+                    continue;
+                }
+            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
+                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
+                    reqLength += 2;
+                    pSrc += 4;
+                    continue;
+                }
+            }
+
+            /* truncated character at the end */
+            ++reqLength;
+            break;
+        }
+    } else /* srcLength >= 0 */ {
+        const uint8_t *pSrcLimit = pSrc + srcLength;
+
+        /*
+         * This function requires that if srcLength is given, then it must be
+         * destCapacity >= srcLength so that we need not check for
+         * destination buffer overflow in the loop.
+         */
+        if(destCapacity < srcLength) {
+            if(pDestLength != NULL) {
+                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
+            }
+            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
+            return NULL;
+        }
+
+        if((pSrcLimit - pSrc) >= 4) {
+            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
+
+            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
+            do {
+                ch = *pSrc++;
+                if(ch < 0xc0) {
+                    /*
+                     * ASCII, or a trail byte in lead position which is treated like
+                     * a single-byte sequence for better character boundary
+                     * resynchronization after illegal sequences.
+                     */
+                    *pDest++=(UChar)ch;
+                } else if(ch < 0xe0) { /* U+0080..U+07FF */
+                    /* 0x3080 = (0xc0 << 6) + 0x80 */
+                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
+                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
+                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                    /* 0x2080 = (0x80 << 6) + 0x80 */
+                    ch = (ch << 12) + (*pSrc++ << 6);
+                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
+                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
+                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+                    ch = (ch << 18) + (*pSrc++ << 12);
+                    ch += *pSrc++ << 6;
+                    ch += *pSrc++ - 0x3c82080;
+                    *(pDest++) = U16_LEAD(ch);
+                    *(pDest++) = U16_TRAIL(ch);
+                }
+            } while(pSrc < pSrcLimit);
+
+            pSrcLimit += 3; /* restore original pSrcLimit */
+        }
+
+        /* at most a few bytes are left: check the remaining length per sequence */
+        while(pSrc < pSrcLimit) {
+            ch = *pSrc++;
+            if(ch < 0xc0) {
+                /*
+                 * ASCII, or a trail byte in lead position which is treated like
+                 * a single-byte sequence for better character boundary
+                 * resynchronization after illegal sequences.
+                 */
+                *pDest++=(UChar)ch;
+                continue;
+            } else if(ch < 0xe0) { /* U+0080..U+07FF */
+                if(pSrc < pSrcLimit) {
+                    /* 0x3080 = (0xc0 << 6) + 0x80 */
+                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
+                    continue;
+                }
+            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
+                if((pSrcLimit - pSrc) >= 2) {
+                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                    /* 0x2080 = (0x80 << 6) + 0x80 */
+                    ch = (ch << 12) + (*pSrc++ << 6);
+                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
+                    /*
+                     * Both trail bytes were consumed by the post-increments above;
+                     * do not advance pSrc again, that would move it beyond pSrcLimit.
+                     */
+                    continue;
+                }
+            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
+                if((pSrcLimit - pSrc) >= 3) {
+                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
+                    ch = (ch << 18) + (*pSrc++ << 12);
+                    ch += *pSrc++ << 6;
+                    ch += *pSrc++ - 0x3c82080;
+                    *(pDest++) = U16_LEAD(ch);
+                    *(pDest++) = U16_TRAIL(ch);
+                    /* all three trail bytes were already consumed above */
+                    continue;
+                }
+            }
+
+            /* truncated character at the end */
+            *pDest++ = 0xfffd;
+            break;
+        }
+    }
+
+    /* units that did not fit were counted in reqLength; add the ones written */
+    reqLength+=(int32_t)(pDest - dest);
+
+    if(pDestLength){
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
+
+    return dest;
+}

+/*
+ * Write the UTF-8 encoding of c (1 to 4 bytes) at pDest and
+ * return the pointer to just after the last byte written.
+ * No validation and no capacity check: the caller must pass a code point
+ * in range and have made sure that enough room is available.
+ */
+static U_INLINE uint8_t *
+_appendUTF8(uint8_t *pDest, UChar32 c) {
+    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
+    if((c)<=0x7f) {
+        *pDest++=(uint8_t)c;
+    } else if(c<=0x7ff) {
+        *pDest++=(uint8_t)((c>>6)|0xc0);
+        *pDest++=(uint8_t)((c&0x3f)|0x80);
+    } else if(c<=0xffff) {
+        *pDest++=(uint8_t)((c>>12)|0xe0);
+        *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
+        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
+    } else /* if((uint32_t)(c)<=0x10ffff) */ {
+        *pDest++=(uint8_t)(((c)>>18)|0xf0);
+        *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
+        *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
+        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
+    }
+    return pDest;
+}

+/*
+ * Convert a UTF-16 string to UTF-8, optionally replacing unpaired surrogates.
+ *
+ * dest/destCapacity  output buffer and its size in bytes (dest may be NULL
+ *                    only if destCapacity is 0, i.e. for pre-flighting)
+ * pDestLength        if not NULL, receives the number of bytes required
+ * pSrc/srcLength     input; srcLength==-1 means NUL-terminated
+ * subchar            code point written for each unpaired surrogate; a
+ *                    negative value means "fail with U_INVALID_CHAR_FOUND"
+ * pNumSubstitutions  if not NULL, receives the number of substitutions
+ * pErrorCode         in/out ICU error code
+ *
+ * Each input form first runs a converting loop; when a character does not
+ * fit, its byte length is stored in reqLength and a pre-flighting loop
+ * counts the bytes needed for the rest of the input.
+ *
+ * Returns dest, or NULL on argument or conversion errors.
+ */
+U_CAPI char* U_EXPORT2
+u_strToUTF8WithSub(char *dest,
+            int32_t destCapacity,
+            int32_t *pDestLength,
+            const UChar *pSrc,
+            int32_t srcLength,
+            UChar32 subchar, int32_t *pNumSubstitutions,
+            UErrorCode *pErrorCode){
+    int32_t reqLength=0;
+    uint32_t ch=0,ch2=0;
+    uint8_t *pDest = (uint8_t *)dest;
+    uint8_t *pDestLimit = pDest + destCapacity;
+    int32_t numSubstitutions;
+
+    /* args check */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+        return NULL;
+    }
+
+    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
+    numSubstitutions=0;
+
+    if(srcLength==-1) {
+        /* convert a NUL-terminated string until the input or the destination runs out */
+        while((ch=*pSrc)!=0) {
+            ++pSrc;
+            if(ch <= 0x7f) {
+                if(pDest<pDestLimit) {
+                    *pDest++ = (uint8_t)ch;
+                } else {
+                    reqLength = 1;
+                    break;
+                }
+            } else if(ch <= 0x7ff) {
+                if((pDestLimit - pDest) >= 2) {
+                    *pDest++=(uint8_t)((ch>>6)|0xc0);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else {
+                    reqLength = 2;
+                    break;
+                }
+            } else if(ch <= 0xd7ff || ch >= 0xe000) {
+                if((pDestLimit - pDest) >= 3) {
+                    *pDest++=(uint8_t)((ch>>12)|0xe0);
+                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else {
+                    reqLength = 3;
+                    break;
+                }
+            } else /* ch is a surrogate */ {
+                int32_t length;
+
+                /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
+                if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
+                    ++pSrc;
+                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
+                } else if(subchar>=0) {
+                    ch=subchar;
+                    ++numSubstitutions;
+                } else {
+                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                }
+
+                length = U8_LENGTH(ch);
+                if((pDestLimit - pDest) >= length) {
+                    /* convert and append*/
+                    pDest=_appendUTF8(pDest, ch);
+                } else {
+                    reqLength = length;
+                    break;
+                }
+            }
+        }
+        /* pre-flight: count the bytes needed for the rest of the string */
+        while((ch=*pSrc++)!=0) {
+            if(ch<=0x7f) {
+                ++reqLength;
+            } else if(ch<=0x7ff) {
+                reqLength+=2;
+            } else if(!UTF_IS_SURROGATE(ch)) {
+                reqLength+=3;
+            } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
+                ++pSrc;
+                reqLength+=4;
+            } else if(subchar>=0) {
+                reqLength+=U8_LENGTH(subchar);
+                ++numSubstitutions;
+            } else {
+                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            }
+        }
+    } else {
+        const UChar *pSrcLimit = pSrc+srcLength;
+        int32_t count;
+
+        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+        for(;;) {
+            /*
+             * Each iteration of the inner loop progresses by at most 3 UTF-8
+             * bytes and one UChar, for most characters.
+             * For supplementary code points (4 & 2), which are rare,
+             * there is an additional adjustment.
+             */
+            count = (int32_t)((pDestLimit - pDest) / 3);
+            srcLength = (int32_t)(pSrcLimit - pSrc);
+            if(count > srcLength) {
+                count = srcLength; /* min(remaining dest/3, remaining src) */
+            }
+            if(count < 3) {
+                /*
+                 * Too much overhead if we get near the end of the string,
+                 * continue with the next loop.
+                 */
+                break;
+            }
+            do {
+                ch=*pSrc++;
+                if(ch <= 0x7f) {
+                    *pDest++ = (uint8_t)ch;
+                } else if(ch <= 0x7ff) {
+                    *pDest++=(uint8_t)((ch>>6)|0xc0);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else if(ch <= 0xd7ff || ch >= 0xe000) {
+                    *pDest++=(uint8_t)((ch>>12)|0xe0);
+                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else /* ch is a surrogate */ {
+                    /*
+                     * We will read two UChars and probably output four bytes,
+                     * which we didn't account for with computing count,
+                     * so we adjust it here.
+                     */
+                    if(--count == 0) {
+                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
+                        break;  /* recompute count */
+                    }
+
+                    if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
+                        ++pSrc;
+                        ch=UTF16_GET_PAIR_VALUE(ch, ch2);
+
+                        /* writing 4 bytes per 2 UChars is ok */
+                        *pDest++=(uint8_t)((ch>>18)|0xf0);
+                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
+                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                    } else {
+                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
+                        if(subchar>=0) {
+                            ch=subchar;
+                            ++numSubstitutions;
+                        } else {
+                            *pErrorCode = U_INVALID_CHAR_FOUND;
+                            return NULL;
+                        }
+
+                        /* convert and append*/
+                        pDest=_appendUTF8(pDest, ch);
+                    }
+                }
+            } while(--count > 0);
+        }
+
+        /* careful loop near the end: checks the destination for every character */
+        while(pSrc<pSrcLimit) {
+            ch=*pSrc++;
+            if(ch <= 0x7f) {
+                if(pDest<pDestLimit) {
+                    *pDest++ = (uint8_t)ch;
+                } else {
+                    reqLength = 1;
+                    break;
+                }
+            } else if(ch <= 0x7ff) {
+                if((pDestLimit - pDest) >= 2) {
+                    *pDest++=(uint8_t)((ch>>6)|0xc0);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else {
+                    reqLength = 2;
+                    break;
+                }
+            } else if(ch <= 0xd7ff || ch >= 0xe000) {
+                if((pDestLimit - pDest) >= 3) {
+                    *pDest++=(uint8_t)((ch>>12)|0xe0);
+                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
+                } else {
+                    reqLength = 3;
+                    break;
+                }
+            } else /* ch is a surrogate */ {
+                int32_t length;
+
+                if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
+                    ++pSrc;
+                    ch=UTF16_GET_PAIR_VALUE(ch, ch2);
+                } else if(subchar>=0) {
+                    ch=subchar;
+                    ++numSubstitutions;
+                } else {
+                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                }
+
+                length = U8_LENGTH(ch);
+                if((pDestLimit - pDest) >= length) {
+                    /* convert and append*/
+                    pDest=_appendUTF8(pDest, ch);
+                } else {
+                    reqLength = length;
+                    break;
+                }
+            }
+        }
+        /* pre-flight: count the bytes needed for the rest of the string */
+        while(pSrc<pSrcLimit) {
+            ch=*pSrc++;
+            if(ch<=0x7f) {
+                ++reqLength;
+            } else if(ch<=0x7ff) {
+                reqLength+=2;
+            } else if(!UTF_IS_SURROGATE(ch)) {
+                reqLength+=3;
+            } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
+                ++pSrc;
+                reqLength+=4;
+            } else if(subchar>=0) {
+                reqLength+=U8_LENGTH(subchar);
+                ++numSubstitutions;
+            } else {
+                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            }
+        }
+    }
+
+    /* bytes that did not fit were counted in reqLength; add the ones written */
+    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=numSubstitutions;
+    }
+
+    if(pDestLength){
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
+    return dest;
+}

+/*
+ * Strict UTF-16 to UTF-8 conversion: forwards to u_strToUTF8WithSub()
+ * with subchar=U_SENTINEL and no substitution counter.
+ */
+U_CAPI char* U_EXPORT2
+u_strToUTF8(char *dest,
+            int32_t destCapacity,
+            int32_t *pDestLength,
+            const UChar *pSrc,
+            int32_t srcLength,
+            UErrorCode *pErrorCode){
+    return u_strToUTF8WithSub(
+            dest, destCapacity, pDestLength,
+            pSrc, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}

+/*
+ * Convert a Java "modified UTF-8" byte string to UTF-16.
+ *
+ * One-, two- and three-byte sequences are decoded inline, each to a single
+ * UChar (so the two-byte form C0 80 yields U+0000). Any other byte sequence
+ * is treated as an error: it is replaced by subchar when subchar>=0,
+ * otherwise the function fails with U_INVALID_CHAR_FOUND.
+ *
+ * dest, destCapacity  output buffer and its capacity in UChars; dest may be
+ *                     NULL only when destCapacity is 0
+ * pDestLength         if not NULL, receives the number of UChars required
+ *                     for the whole output, even if it did not fit
+ * src, srcLength      input bytes; srcLength -1 means NUL-terminated
+ * subchar             substitution code point, or negative for none; must
+ *                     not be a surrogate or greater than 0x10ffff
+ * pNumSubstitutions   if not NULL, receives the number of substitutions
+ * pErrorCode          in/out error code; nothing is done if it already
+ *                     indicates a failure
+ *
+ * Returns dest, or NULL if *pErrorCode already indicated a failure, an
+ * argument is invalid, or an illegal sequence is found and subchar<0.
+ */

+U_CAPI UChar* U_EXPORT2

+u_strFromJavaModifiedUTF8WithSub(

+ UChar *dest,

+ int32_t destCapacity,

+ int32_t *pDestLength,

+ const char *src,

+ int32_t srcLength,

+ UChar32 subchar, int32_t *pNumSubstitutions,

+ UErrorCode *pErrorCode) {

+ UChar *pDest = dest;

+ UChar *pDestLimit = dest+destCapacity;

+ UChar32 ch;

+ int32_t reqLength = 0; /* UChars counted but not written; pDest-dest is added at the end */

+ const uint8_t* pSrc = (const uint8_t*) src;

+ const uint8_t *pSrcLimit;

+ int32_t count;

+ uint8_t t1, t2; /* trail bytes */

+ int32_t numSubstitutions;

+ /* args check */

+ if(U_FAILURE(*pErrorCode)){

+ return NULL;

+ }

+ if( (src==NULL && srcLength!=0) || srcLength < -1 ||

+ (dest==NULL && destCapacity!=0) || destCapacity<0 ||

+ subchar > 0x10ffff || U_IS_SURROGATE(subchar)

+ ) {

+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ if(pNumSubstitutions!=NULL) {

+ *pNumSubstitutions=0;

+ }

+ numSubstitutions=0;

+ if(srcLength < 0) {

+ /*

+ * Transform a NUL-terminated ASCII string.

+ * Handle non-ASCII strings with slower code.

+ */

+ while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {

+ *pDest++=(UChar)ch;

+ ++pSrc;

+ }

+ if(ch == 0) {

+ /* the whole input was ASCII and fit: finish without the general loops */

+ reqLength=(int32_t)(pDest - dest);

+ if(pDestLength) {

+ *pDestLength = reqLength;

+ }

+ /* Terminate the buffer */

+ u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

+ return dest;

+ }

+ srcLength = uprv_strlen((const char *)pSrc); /* length of the part not yet consumed */

+ }

+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

+ pSrcLimit = pSrc + srcLength;

+ for(;;) {

+ count = (int32_t)(pDestLimit - pDest);

+ srcLength = (int32_t)(pSrcLimit - pSrc);

+ if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {

+ /* fast ASCII loop */

+ const uint8_t *prevSrc = pSrc;

+ int32_t delta;

+ while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {

+ *pDest++=(UChar)ch;

+ ++pSrc;

+ }

+ delta = (int32_t)(pSrc - prevSrc);

+ count -= delta;

+ srcLength -= delta;

+ }

+ /*

+ * Each iteration of the inner loop progresses by at most 3 UTF-8

+ * bytes and one UChar.

+ */

+ srcLength /= 3;

+ if(count > srcLength) {

+ count = srcLength; /* min(remaining dest, remaining src/3) */

+ }

+ if(count < 3) {

+ /*

+ * Too much overhead if we get near the end of the string,

+ * continue with the next loop.

+ */

+ break;

+ }

+ do {

+ ch = *pSrc;

+ if(ch <= 0x7f){

+ *pDest++=(UChar)ch;

+ ++pSrc;

+ } else {

+ if(ch >= 0xe0) {

+ if( /* handle U+0000..U+FFFF inline */

+ ch <= 0xef &&

+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

+ ) {

+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */

+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);

+ pSrc += 3;

+ continue;

+ }

+ } else {

+ if( /* handle U+0000..U+07FF inline */

+ ch >= 0xc0 &&

+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

+ ) {

+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);

+ pSrc += 2;

+ continue;

+ }

+ if(subchar < 0) {

+ *pErrorCode = U_INVALID_CHAR_FOUND;

+ return NULL;

+ } else if(subchar > 0xffff && --count == 0) {

+ /*

+ * We need to write two UChars, adjusted count for that,

+ * and ran out of space.

+ */

+ break;

+ } else {

+ /* function call for error cases */

+ ++pSrc; /* continue after the lead byte */

+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); /* result unused: called only to advance pSrc */

+ ++numSubstitutions;

+ if(subchar<=0xFFFF) {

+ *(pDest++)=(UChar)subchar;

+ } else {

+ *(pDest++)=U16_LEAD(subchar);

+ *(pDest++)=U16_TRAIL(subchar);

+ }

+ } while(--count > 0);

+ }

+ /* Careful loop: checks both the source and the destination limit on every step. */

+ while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {

+ ch = *pSrc;

+ if(ch <= 0x7f){

+ *pDest++=(UChar)ch;

+ ++pSrc;

+ } else {

+ if(ch >= 0xe0) {

+ if( /* handle U+0000..U+FFFF inline */

+ ch <= 0xef &&

+ ((pSrcLimit - pSrc) >= 3) &&

+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

+ (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

+ ) {

+ /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */

+ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);

+ pSrc += 3;

+ continue;

+ }

+ } else {

+ if( /* handle U+0000..U+07FF inline */

+ ch >= 0xc0 &&

+ ((pSrcLimit - pSrc) >= 2) &&

+ (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

+ ) {

+ *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);

+ pSrc += 2;

+ continue;

+ }

+ if(subchar < 0) {

+ *pErrorCode = U_INVALID_CHAR_FOUND;

+ return NULL;

+ } else {

+ /* function call for error cases */

+ ++pSrc; /* continue after the lead byte */

+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); /* result unused: called only to advance pSrc */

+ ++numSubstitutions;

+ if(subchar<=0xFFFF) {

+ *(pDest++)=(UChar)subchar;

+ } else {

+ *(pDest++)=U16_LEAD(subchar);

+ if(pDest<pDestLimit) {

+ *(pDest++)=U16_TRAIL(subchar);

+ } else {

+ reqLength++; /* the trail surrogate did not fit: count it instead */

+ break;

+ }

+ /* do not fill the dest buffer just count the UChars needed */

+ while(pSrc < pSrcLimit){

+ ch = *pSrc;

+ if(ch <= 0x7f) {

+ reqLength++;

+ ++pSrc;

+ } else {

+ if(ch >= 0xe0) {

+ if( /* handle U+0000..U+FFFF inline */

+ ch <= 0xef &&

+ ((pSrcLimit - pSrc) >= 3) &&

+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&

+ (uint8_t)(pSrc[2] - 0x80) <= 0x3f

+ ) {

+ reqLength++;

+ pSrc += 3;

+ continue;

+ }

+ } else {

+ if( /* handle U+0000..U+07FF inline */

+ ch >= 0xc0 &&

+ ((pSrcLimit - pSrc) >= 2) &&

+ (uint8_t)(pSrc[1] - 0x80) <= 0x3f

+ ) {

+ reqLength++;

+ pSrc += 2;

+ continue;

+ }

+ if(subchar < 0) {

+ *pErrorCode = U_INVALID_CHAR_FOUND;

+ return NULL;

+ } else {

+ /* function call for error cases */

+ ++pSrc; /* continue after the lead byte */

+ utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); /* result unused: called only to advance pSrc */

+ ++numSubstitutions;

+ /* ch is still the lead byte here; count what the copying loops write for a substitution (two UChars for a supplementary subchar) */

+ reqLength+=U16_LENGTH(subchar);

+ }

+ if(pNumSubstitutions!=NULL) {

+ *pNumSubstitutions=numSubstitutions;

+ }

+ reqLength+=(int32_t)(pDest - dest); /* add the UChars actually written */

+ if(pDestLength) {

+ *pDestLength = reqLength;

+ }

+ /* Terminate the buffer */

+ u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

+ return dest;

+U_CAPI char* U_EXPORT2 /* Encode UTF-16 as Java "modified UTF-8": every UChar is encoded on its own as 1, 2 or 3 bytes. */

+u_strToJavaModifiedUTF8(

+ char *dest, /* output buffer; may be NULL only if destCapacity is 0 */

+ int32_t destCapacity, /* capacity of dest in bytes; must not be negative */

+ int32_t *pDestLength, /* if not NULL, receives the number of bytes required for the whole output */

+ const UChar *src, /* input UTF-16 text; may be NULL only if srcLength is 0 */

+ int32_t srcLength, /* number of UChars in src, or -1 if src is NUL-terminated */

+ UErrorCode *pErrorCode) { /* in/out error code; NULL is returned at once if it already indicates a failure */

+ int32_t reqLength=0; /* bytes counted but not written; pDest-dest is added at the end */

+ uint32_t ch=0;

+ uint8_t *pDest = (uint8_t *)dest;

+ uint8_t *pDestLimit = pDest + destCapacity;

+ const UChar *pSrcLimit;

+ int32_t count;

+ /* args check */

+ if(U_FAILURE(*pErrorCode)){

+ return NULL;

+ }

+ if( (src==NULL && srcLength!=0) || srcLength < -1 ||

+ (dest==NULL && destCapacity!=0) || destCapacity<0

+ ) {

+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

+ return NULL;

+ }

+ if(srcLength==-1) {

+ /* Convert NUL-terminated ASCII, then find the string length. */

+ while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {

+ *pDest++ = (uint8_t)ch;

+ ++src;

+ }

+ if(ch == 0) { /* the whole input was ASCII and fit: finish without the general loops */

+ reqLength=(int32_t)(pDest - (uint8_t *)dest);

+ if(pDestLength) {

+ *pDestLength = reqLength;

+ }

+ /* Terminate the buffer */

+ u_terminateChars(dest, destCapacity, reqLength, pErrorCode);

+ return dest;

+ }

+ srcLength = u_strlen(src); /* length of the part not yet consumed */

+ }

+ /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

+ pSrcLimit = src+srcLength;

+ for(;;) {

+ count = (int32_t)(pDestLimit - pDest);

+ srcLength = (int32_t)(pSrcLimit - src);

+ if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {

+ /* fast ASCII loop */

+ const UChar *prevSrc = src;

+ int32_t delta;

+ while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { /* stops at U+0000, which needs the two-byte form */

+ *pDest++=(uint8_t)ch;

+ ++src;

+ }

+ delta = (int32_t)(src - prevSrc);

+ count -= delta;

+ srcLength -= delta;

+ }

+ /*

+ * Each iteration of the inner loop progresses by at most 3 UTF-8

+ * bytes and one UChar.

+ */

+ count /= 3;

+ if(count > srcLength) {

+ count = srcLength; /* min(remaining dest/3, remaining src) */

+ }

+ if(count < 3) {

+ /*

+ * Too much overhead if we get near the end of the string,

+ * continue with the next loop.

+ */

+ break;

+ }

+ do {

+ ch=*src++;

+ if(ch <= 0x7f && ch != 0) { /* U+0001..U+007F: one byte */

+ *pDest++ = (uint8_t)ch;

+ } else if(ch <= 0x7ff) { /* U+0000 and U+0080..U+07FF: two bytes */

+ *pDest++=(uint8_t)((ch>>6)|0xc0);

+ *pDest++=(uint8_t)((ch&0x3f)|0x80);

+ } else { /* every other UChar, surrogates included: three bytes */

+ *pDest++=(uint8_t)((ch>>12)|0xe0);

+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);

+ *pDest++=(uint8_t)((ch&0x3f)|0x80);

+ }

+ } while(--count > 0);

+ }

+ while(src<pSrcLimit) { /* careful loop: checks the remaining destination space before each write */

+ ch=*src++;

+ if(ch <= 0x7f && ch != 0) {

+ if(pDest<pDestLimit) {

+ *pDest++ = (uint8_t)ch;

+ } else {

+ reqLength = 1; /* no room: record this character's size, the rest is counted below */

+ break;

+ }

+ } else if(ch <= 0x7ff) {

+ if((pDestLimit - pDest) >= 2) {

+ *pDest++=(uint8_t)((ch>>6)|0xc0);

+ *pDest++=(uint8_t)((ch&0x3f)|0x80);

+ } else {

+ reqLength = 2; /* no room: record this character's size, the rest is counted below */

+ break;

+ }

+ } else {

+ if((pDestLimit - pDest) >= 3) {

+ *pDest++=(uint8_t)((ch>>12)|0xe0);

+ *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);

+ *pDest++=(uint8_t)((ch&0x3f)|0x80);

+ } else {

+ reqLength = 3; /* no room: record this character's size, the rest is counted below */

+ break;

+ }

+ while(src<pSrcLimit) { /* count-only loop: nothing more is written, only the needed bytes are added up */

+ ch=*src++;

+ if(ch <= 0x7f && ch != 0) {

+ ++reqLength;

+ } else if(ch<=0x7ff) {

+ reqLength+=2;

+ } else {

+ reqLength+=3;

+ }

+ reqLength+=(int32_t)(pDest - (uint8_t *)dest); /* add the bytes actually written */

+ if(pDestLength){

+ *pDestLength = reqLength;

+ }

+ /* Terminate the buffer */

+ u_terminateChars(dest, destCapacity, reqLength, pErrorCode);

+ return dest;

Property changes on: icu46/source/common/ustrtrns.c

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/common/ustring.c ('k') | icu46/source/common/utext.cpp » ('j') | no next file with comments »