icu46/source/common/ustrtrns.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ustrtrns.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2001-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 *

	9 * File ustrtrns.c

	10 *

	11 * Modification History:

	12 *

	13 * Date Name Description

	14 * 9/10/2001 Ram Creation.

	15 ******************************************************************************

	16 */

	17

	18 /*******************************************************************************

	19 *

	20 * u_strTo* and u_strFrom* APIs

	21 * WCS functions moved to ustr_wcs.c for better modularization

	22 *

	23 *******************************************************************************

	24 */

	25

	26

	27 #include "unicode/putil.h"

	28 #include "unicode/ustring.h"

	29 #include "cstring.h"

	30 #include "cmemory.h"

	31 #include "ustr_imp.h"

	32

	33 U_CAPI UChar* U_EXPORT2

	34 u_strFromUTF32WithSub(UChar *dest,

	35 int32_t destCapacity,

	36 int32_t *pDestLength,

	37 const UChar32 *src,

	38 int32_t srcLength,

	39 UChar32 subchar, int32_t *pNumSubstitutions,

	40 UErrorCode *pErrorCode) {

	41 const UChar32 *srcLimit;

	42 UChar32 ch;

	43 UChar *destLimit;

	44 UChar *pDest;

	45 int32_t reqLength;

	46 int32_t numSubstitutions;

	47

	48 /* args check */

	49 if(U_FAILURE(*pErrorCode)){

	50 return NULL;

	51 }

	52 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	53 (destCapacity<0) \|\| (dest == NULL && destCapacity > 0) \|\|

	54 subchar > 0x10ffff \|\| U_IS_SURROGATE(subchar)

	55 ) {

	56 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	57 return NULL;

	58 }

	59

	60 if(pNumSubstitutions != NULL) {

	61 *pNumSubstitutions = 0;

	62 }

	63

	64 pDest = dest;

	65 destLimit = dest + destCapacity;

	66 reqLength = 0;

	67 numSubstitutions = 0;

	68

	69 if(srcLength < 0) {

	70 /* simple loop for conversion of a NUL-terminated BMP string */

	71 while((ch=*src) != 0 &&

	72 ((uint32_t)ch < 0xd800 \|\| (0xe000 <= ch && ch <= 0xffff))) {

	73 ++src;

	74 if(pDest < destLimit) {

	75 *pDest++ = (UChar)ch;

	76 } else {

	77 ++reqLength;

	78 }

	79 }

	80 srcLimit = src;

	81 if(ch != 0) {

	82 /* "complicated" case, find the end of the remaining string */

	83 while(*++srcLimit != 0) {}

	84 }

	85 } else {

	86 srcLimit = src + srcLength;

	87 }

	88

	89 /* convert with length */

	90 while(src < srcLimit) {

	91 ch = *src++;

	92 do {

	93 /* usually "loops" once; twice only for writing subchar */

	94 if((uint32_t)ch < 0xd800 \|\| (0xe000 <= ch && ch <= 0xffff)) {

	95 if(pDest < destLimit) {

	96 *pDest++ = (UChar)ch;

	97 } else {

	98 ++reqLength;

	99 }

	100 break;

	101 } else if(0x10000 <= ch && ch <= 0x10ffff) {

	102 if((pDest + 2) <= destLimit) {

	103 *pDest++ = U16_LEAD(ch);

	104 *pDest++ = U16_TRAIL(ch);

	105 } else {

	106 reqLength += 2;

	107 }

	108 break;

	109 } else if((ch = subchar) < 0) {

	110 /* surrogate code point, or not a Unicode code point at all */

	111 *pErrorCode = U_INVALID_CHAR_FOUND;

	112 return NULL;

	113 } else {

	114 ++numSubstitutions;

	115 }

	116 } while(TRUE);

	117 }

	118

	119 reqLength += (int32_t)(pDest - dest);

	120 if(pDestLength) {

	121 *pDestLength = reqLength;

	122 }

	123 if(pNumSubstitutions != NULL) {

	124 *pNumSubstitutions = numSubstitutions;

	125 }

	126

	127 /* Terminate the buffer */

	128 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

	129

	130 return dest;

	131 }

	132

	133 U_CAPI UChar* U_EXPORT2

	134 u_strFromUTF32(UChar *dest,

	135 int32_t destCapacity,

	136 int32_t *pDestLength,

	137 const UChar32 *src,

	138 int32_t srcLength,

	139 UErrorCode *pErrorCode) {

	140 return u_strFromUTF32WithSub(

	141 dest, destCapacity, pDestLength,

	142 src, srcLength,

	143 U_SENTINEL, NULL,

	144 pErrorCode);

	145 }

	146

	147 U_CAPI UChar32* U_EXPORT2

	148 u_strToUTF32WithSub(UChar32 *dest,

	149 int32_t destCapacity,

	150 int32_t *pDestLength,

	151 const UChar *src,

	152 int32_t srcLength,

	153 UChar32 subchar, int32_t *pNumSubstitutions,

	154 UErrorCode *pErrorCode) {

	155 const UChar *srcLimit;

	156 UChar32 ch;

	157 UChar ch2;

	158 UChar32 *destLimit;

	159 UChar32 *pDest;

	160 int32_t reqLength;

	161 int32_t numSubstitutions;

	162

	163 /* args check */

	164 if(U_FAILURE(*pErrorCode)){

	165 return NULL;

	166 }

	167 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	168 (destCapacity<0) \|\| (dest == NULL && destCapacity > 0) \|\|

	169 subchar > 0x10ffff \|\| U_IS_SURROGATE(subchar)

	170 ) {

	171 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	172 return NULL;

	173 }

	174

	175 if(pNumSubstitutions != NULL) {

	176 *pNumSubstitutions = 0;

	177 }

	178

	179 pDest = dest;

	180 destLimit = dest + destCapacity;

	181 reqLength = 0;

	182 numSubstitutions = 0;

	183

	184 if(srcLength < 0) {

	185 /* simple loop for conversion of a NUL-terminated BMP string */

	186 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {

	187 ++src;

	188 if(pDest < destLimit) {

	189 *pDest++ = ch;

	190 } else {

	191 ++reqLength;

	192 }

	193 }

	194 srcLimit = src;

	195 if(ch != 0) {

	196 /* "complicated" case, find the end of the remaining string */

	197 while(*++srcLimit != 0) {}

	198 }

	199 } else {

	200 srcLimit = src + srcLength;

	201 }

	202

	203 /* convert with length */

	204 while(src < srcLimit) {

	205 ch = *src++;

	206 if(!U16_IS_SURROGATE(ch)) {

	207 /* write or count ch below */

	208 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch 2 = *src)) {

	209 ++src;

	210 ch = U16_GET_SUPPLEMENTARY(ch, ch2);

	211 } else if((ch = subchar) < 0) {

	212 /* unpaired surrogate */

	213 *pErrorCode = U_INVALID_CHAR_FOUND;

	214 return NULL;

	215 } else {

	216 ++numSubstitutions;

	217 }

	218 if(pDest < destLimit) {

	219 *pDest++ = ch;

	220 } else {

	221 ++reqLength;

	222 }

	223 }

	224

	225 reqLength += (int32_t)(pDest - dest);

	226 if(pDestLength) {

	227 *pDestLength = reqLength;

	228 }

	229 if(pNumSubstitutions != NULL) {

	230 *pNumSubstitutions = numSubstitutions;

	231 }

	232

	233 /* Terminate the buffer */

	234 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);

	235

	236 return dest;

	237 }

	238

	239 U_CAPI UChar32* U_EXPORT2

	240 u_strToUTF32(UChar32 *dest,

	241 int32_t destCapacity,

	242 int32_t *pDestLength,

	243 const UChar *src,

	244 int32_t srcLength,

	245 UErrorCode *pErrorCode) {

	246 return u_strToUTF32WithSub(

	247 dest, destCapacity, pDestLength,

	248 src, srcLength,

	249 U_SENTINEL, NULL,

	250 pErrorCode);

	251 }

	252

	253 /* for utf8_nextCharSafeBodyTerminated() */

	254 static const UChar32

	255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

	256

	257 /*

	258 * Version of utf8_nextCharSafeBody() with the following differences:

	259 * - checks for NUL termination instead of length

	260 * - works with pointers instead of indexes

	261 * - always strict (strict==-1)

	262 *

	263 * *ps points to after the lead byte and will be moved to after the last trail b yte.

	264 * c is the lead byte.

	265 * @return the code point, or U_SENTINEL

	266 */

	267 static UChar32

	268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {

	269 const uint8_t s=ps;

	270 uint8_t trail, illegal=0;

	271 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);

	272 UTF8_MASK_LEAD_BYTE((c), count);

	273 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */

	274 switch(count) {

	275 /* each branch falls through to the next one */

	276 case 5:

	277 case 4:

	278 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's U TF-8 */

	279 illegal=1;

	280 break;

	281 case 3:

	282 trail=(uint8_t)(*s++ - 0x80);

	283 c=(c<<6)\|trail;

	284 if(trail>0x3f \|\| c>=0x110) {

	285 /* not a trail byte, or code point>0x10ffff (outside Unicode) */

	286 illegal=1;

	287 break;

	288 }

	289 case 2:

	290 trail=(uint8_t)(*s++ - 0x80);

	291 if(trail>0x3f) {

	292 /* not a trail byte */

	293 illegal=1;

	294 break;

	295 }

	296 c=(c<<6)\|trail;

	297 case 1:

	298 trail=(uint8_t)(*s++ - 0x80);

	299 if(trail>0x3f) {

	300 /* not a trail byte */

	301 illegal=1;

	302 }

	303 c=(c<<6)\|trail;

	304 break;

	305 case 0:

	306 return U_SENTINEL;

	307 /* no default branch to optimize switch() - all values are covered */

	308 }

	309

	310 /* correct sequence - all trail bytes have (b7..b6)==(10)? */

	311 /* illegal is also set if count>=4 */

	312 if(illegal \|\| c<utf8_minLegal[count] \|\| UTF_IS_SURROGATE(c)) {

	313 /* error handling */

	314 /* don't go beyond this sequence */

	315 s=*ps;

	316 while(count>0 && UTF8_IS_TRAIL(*s)) {

	317 ++s;

	318 --count;

	319 }

	320 c=U_SENTINEL;

	321 }

	322 *ps=s;

	323 return c;

	324 }

	325

	326 /*

	327 * Version of utf8_nextCharSafeBody() with the following differences:

	328 * - works with pointers instead of indexes

	329 * - always strict (strict==-1)

	330 *

	331 * *ps points to after the lead byte and will be moved to after the last trail b yte.

	332 * c is the lead byte.

	333 * @return the code point, or U_SENTINEL

	334 */

	335 static UChar32

	336 utf8_nextCharSafeBodyPointer(const uint8_t *ps, const uint8_t limit, UChar32 c ) {

	337 const uint8_t s=ps;

	338 uint8_t trail, illegal=0;

	339 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);

	340 if((limit-s)>=count) {

	341 UTF8_MASK_LEAD_BYTE((c), count);

	342 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */

	343 switch(count) {

	344 /* each branch falls through to the next one */

	345 case 5:

	346 case 4:

	347 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode 's UTF-8 */

	348 illegal=1;

	349 break;

	350 case 3:

	351 trail=*s++;

	352 c=(c<<6)\|(trail&0x3f);

	353 if(c<0x110) {

	354 illegal\|=(trail&0xc0)^0x80;

	355 } else {

	356 /* code point>0x10ffff, outside Unicode */

	357 illegal=1;

	358 break;

	359 }

	360 case 2:

	361 trail=*s++;

	362 c=(c<<6)\|(trail&0x3f);

	363 illegal\|=(trail&0xc0)^0x80;

	364 case 1:

	365 trail=*s++;

	366 c=(c<<6)\|(trail&0x3f);

	367 illegal\|=(trail&0xc0)^0x80;

	368 break;

	369 case 0:

	370 return U_SENTINEL;

	371 /* no default branch to optimize switch() - all values are covered */

	372 }

	373 } else {

	374 illegal=1; /* too few bytes left */

	375 }

	376

	377 /* correct sequence - all trail bytes have (b7..b6)==(10)? */

	378 /* illegal is also set if count>=4 */

	379 if(illegal \|\| c<utf8_minLegal[count] \|\| UTF_IS_SURROGATE(c)) {

	380 /* error handling */

	381 /* don't go beyond this sequence */

	382 s=*ps;

	383 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {

	384 ++s;

	385 --count;

	386 }

	387 c=U_SENTINEL;

	388 }

	389 *ps=s;

	390 return c;

	391 }

	392

	393 U_CAPI UChar* U_EXPORT2

	394 u_strFromUTF8WithSub(UChar *dest,

	395 int32_t destCapacity,

	396 int32_t *pDestLength,

	397 const char* src,

	398 int32_t srcLength,

	399 UChar32 subchar, int32_t *pNumSubstitutions,

	400 UErrorCode *pErrorCode){

	401 UChar *pDest = dest;

	402 UChar *pDestLimit = dest+destCapacity;

	403 UChar32 ch;

	404 int32_t reqLength = 0;

	405 const uint8_t* pSrc = (const uint8_t*) src;

	406 uint8_t t1, t2; /* trail bytes */

	407 int32_t numSubstitutions;

	408

	409 /* args check */

	410 if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)){

	411 return NULL;

	412 }

	413

	414 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	415 (destCapacity<0) \|\| (dest == NULL && destCapacity > 0) \|\|

	416 subchar > 0x10ffff \|\| U_IS_SURROGATE(subchar)

	417 ) {

	418 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	419 return NULL;

	420 }

	421

	422 if(pNumSubstitutions!=NULL) {

	423 *pNumSubstitutions=0;

	424 }

	425 numSubstitutions=0;

	426

	427 /*

	428 * Inline processing of UTF-8 byte sequences:

	429 *

	430 * Byte sequences for the most common characters are handled inline in

	431 * the conversion loops. In order to reduce the path lengths for those

	432 * characters, the tests are arranged in a kind of binary search.

	433 * ASCII (<=0x7f) is checked first, followed by the dividing point

	434 * between 2- and 3-byte sequences (0xe0).

	435 * The 3-byte branch is tested first to speed up CJK text.

	436 * The compiler should combine the subtractions for the two tests for 0xe0.

	437 * Each branch then tests for the other end of its range.

	438 */

	439

	440 if(srcLength < 0){

	441 /*

	442 * Transform a NUL-terminated string.

	443 * The code explicitly checks for NULs only in the lead byte position.

	444 * A NUL byte in the trail byte position fails the trail byte range chec k anyway.

	445 */

	446 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {

	447 if(ch <= 0x7f){

	448 *pDest++=(UChar)ch;

	449 ++pSrc;

	450 } else {

	451 if(ch > 0xe0) {

	452 if( /* handle U+1000..U+CFFF inline */

	453 ch <= 0xec &&

	454 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

	455 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

	456 ) {

	457 /* no need for (ch & 0xf) because the upper bits are tru ncated after <<12 in the cast to (UChar) */

	458 *pDest++ = (UChar)((ch << 12) \| (t1 << 6) \| t2);

	459 pSrc += 3;

	460 continue;

	461 }

	462 } else if(ch < 0xe0) {

	463 if( /* handle U+0080..U+07FF inline */

	464 ch >= 0xc2 &&

	465 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

	466 ) {

	467 *pDest++ = (UChar)(((ch & 0x1f) << 6) \| t1);

	468 pSrc += 2;

	469 continue;

	470 }

	471 }

	472

	473 /* function call for "complicated" and error cases */

	474 ++pSrc; /* continue after the lead byte */

	475 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);

	476 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {

	477 *pErrorCode = U_INVALID_CHAR_FOUND;

	478 return NULL;

	479 } else if(ch<=0xFFFF) {

	480 *(pDest++)=(UChar)ch;

	481 } else {

	482 *(pDest++)=UTF16_LEAD(ch);

	483 if(pDest<pDestLimit) {

	484 *(pDest++)=UTF16_TRAIL(ch);

	485 } else {

	486 reqLength++;

	487 break;

	488 }

	489 }

	490 }

	491 }

	492

	493 /* Pre-flight the rest of the string. */

	494 while((ch = *pSrc) != 0) {

	495 if(ch <= 0x7f){

	496 ++reqLength;

	497 ++pSrc;

	498 } else {

	499 if(ch > 0xe0) {

	500 if( /* handle U+1000..U+CFFF inline */

	501 ch <= 0xec &&

	502 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&

	503 (uint8_t)(pSrc[2] - 0x80) <= 0x3f

	504 ) {

	505 ++reqLength;

	506 pSrc += 3;

	507 continue;

	508 }

	509 } else if(ch < 0xe0) {

	510 if( /* handle U+0080..U+07FF inline */

	511 ch >= 0xc2 &&

	512 (uint8_t)(pSrc[1] - 0x80) <= 0x3f

	513 ) {

	514 ++reqLength;

	515 pSrc += 2;

	516 continue;

	517 }

	518 }

	519

	520 /* function call for "complicated" and error cases */

	521 ++pSrc; /* continue after the lead byte */

	522 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);

	523 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {

	524 *pErrorCode = U_INVALID_CHAR_FOUND;

	525 return NULL;

	526 }

	527 reqLength += U16_LENGTH(ch);

	528 }

	529 }

	530 } else /* srcLength >= 0 */ {

	531 const uint8_t *pSrcLimit = pSrc + srcLength;

	532 int32_t count;

	533

	534 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

	535 for(;;) {

	536 /*

	537 * Each iteration of the inner loop progresses by at most 3 UTF-8

	538 * bytes and one UChar, for most characters.

	539 * For supplementary code points (4 & 2), which are rare,

	540 * there is an additional adjustment.

	541 */

	542 count = (int32_t)(pDestLimit - pDest);

	543 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);

	544 if(count > srcLength) {

	545 count = srcLength; /* min(remaining dest, remaining src/3) */

	546 }

	547 if(count < 3) {

	548 /*

	549 * Too much overhead if we get near the end of the string,

	550 * continue with the next loop.

	551 */

	552 break;

	553 }

	554

	555 do {

	556 ch = *pSrc;

	557 if(ch <= 0x7f){

	558 *pDest++=(UChar)ch;

	559 ++pSrc;

	560 } else {

	561 if(ch > 0xe0) {

	562 if( /* handle U+1000..U+CFFF inline */

	563 ch <= 0xec &&

	564 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

	565 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

	566 ) {

	567 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */

	568 *pDest++ = (UChar)((ch << 12) \| (t1 << 6) \| t2);

	569 pSrc += 3;

	570 continue;

	571 }

	572 } else if(ch < 0xe0) {

	573 if( /* handle U+0080..U+07FF inline */

	574 ch >= 0xc2 &&

	575 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

	576 ) {

	577 *pDest++ = (UChar)(((ch & 0x1f) << 6) \| t1);

	578 pSrc += 2;

	579 continue;

	580 }

	581 }

	582

	583 if(ch >= 0xf0 \|\| subchar > 0xffff) {

	584 /*

	585 * We may read up to six bytes and write up to two UChar s,

	586 * which we didn't account for with computing count,

	587 * so we adjust it here.

	588 */

	589 if(--count == 0) {

	590 break;

	591 }

	592 }

	593

	594 /* function call for "complicated" and error cases */

	595 ++pSrc; /* continue after the lead byte */

	596 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	597 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){

	598 *pErrorCode = U_INVALID_CHAR_FOUND;

	599 return NULL;

	600 }else if(ch<=0xFFFF){

	601 *(pDest++)=(UChar)ch;

	602 }else{

	603 *(pDest++)=UTF16_LEAD(ch);

	604 *(pDest++)=UTF16_TRAIL(ch);

	605 }

	606 }

	607 } while(--count > 0);

	608 }

	609

	610 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {

	611 ch = *pSrc;

	612 if(ch <= 0x7f){

	613 *pDest++=(UChar)ch;

	614 ++pSrc;

	615 } else {

	616 if(ch > 0xe0) {

	617 if( /* handle U+1000..U+CFFF inline */

	618 ch <= 0xec &&

	619 ((pSrcLimit - pSrc) >= 3) &&

	620 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

	621 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

	622 ) {

	623 /* no need for (ch & 0xf) because the upper bits are tru ncated after <<12 in the cast to (UChar) */

	624 *pDest++ = (UChar)((ch << 12) \| (t1 << 6) \| t2);

	625 pSrc += 3;

	626 continue;

	627 }

	628 } else if(ch < 0xe0) {

	629 if( /* handle U+0080..U+07FF inline */

	630 ch >= 0xc2 &&

	631 ((pSrcLimit - pSrc) >= 2) &&

	632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

	633 ) {

	634 *pDest++ = (UChar)(((ch & 0x1f) << 6) \| t1);

	635 pSrc += 2;

	636 continue;

	637 }

	638 }

	639

	640 /* function call for "complicated" and error cases */

	641 ++pSrc; /* continue after the lead byte */

	642 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	643 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){

	644 *pErrorCode = U_INVALID_CHAR_FOUND;

	645 return NULL;

	646 }else if(ch<=0xFFFF){

	647 *(pDest++)=(UChar)ch;

	648 }else{

	649 *(pDest++)=UTF16_LEAD(ch);

	650 if(pDest<pDestLimit){

	651 *(pDest++)=UTF16_TRAIL(ch);

	652 }else{

	653 reqLength++;

	654 break;

	655 }

	656 }

	657 }

	658 }

	659 /* do not fill the dest buffer just count the UChars needed */

	660 while(pSrc < pSrcLimit){

	661 ch = *pSrc;

	662 if(ch <= 0x7f){

	663 reqLength++;

	664 ++pSrc;

	665 } else {

	666 if(ch > 0xe0) {

	667 if( /* handle U+1000..U+CFFF inline */

	668 ch <= 0xec &&

	669 ((pSrcLimit - pSrc) >= 3) &&

	670 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&

	671 (uint8_t)(pSrc[2] - 0x80) <= 0x3f

	672 ) {

	673 reqLength++;

	674 pSrc += 3;

	675 continue;

	676 }

	677 } else if(ch < 0xe0) {

	678 if( /* handle U+0080..U+07FF inline */

	679 ch >= 0xc2 &&

	680 ((pSrcLimit - pSrc) >= 2) &&

	681 (uint8_t)(pSrc[1] - 0x80) <= 0x3f

	682 ) {

	683 reqLength++;

	684 pSrc += 2;

	685 continue;

	686 }

	687 }

	688

	689 /* function call for "complicated" and error cases */

	690 ++pSrc; /* continue after the lead byte */

	691 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	692 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){

	693 *pErrorCode = U_INVALID_CHAR_FOUND;

	694 return NULL;

	695 }

	696 reqLength+=UTF_CHAR_LENGTH(ch);

	697 }

	698 }

	699 }

	700

	701 reqLength+=(int32_t)(pDest - dest);

	702

	703 if(pNumSubstitutions!=NULL) {

	704 *pNumSubstitutions=numSubstitutions;

	705 }

	706

	707 if(pDestLength){

	708 *pDestLength = reqLength;

	709 }

	710

	711 /* Terminate the buffer */

	712 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

	713

	714 return dest;

	715 }

	716

	717 U_CAPI UChar* U_EXPORT2

	718 u_strFromUTF8(UChar *dest,

	719 int32_t destCapacity,

	720 int32_t *pDestLength,

	721 const char* src,

	722 int32_t srcLength,

	723 UErrorCode *pErrorCode){

	724 return u_strFromUTF8WithSub(

	725 dest, destCapacity, pDestLength,

	726 src, srcLength,

	727 U_SENTINEL, NULL,

	728 pErrorCode);

	729 }

	730

	731 U_CAPI UChar * U_EXPORT2

	732 u_strFromUTF8Lenient(UChar *dest,

	733 int32_t destCapacity,

	734 int32_t *pDestLength,

	735 const char *src,

	736 int32_t srcLength,

	737 UErrorCode *pErrorCode) {

	738 UChar *pDest = dest;

	739 UChar32 ch;

	740 int32_t reqLength = 0;

	741 uint8_t* pSrc = (uint8_t*) src;

	742

	743 /* args check */

	744 if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)){

	745 return NULL;

	746 }

	747

	748 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	749 (destCapacity<0) \|\| (dest == NULL && destCapacity > 0)

	750 ) {

	751 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	752 return NULL;

	753 }

	754

	755 if(srcLength < 0) {

	756 /* Transform a NUL-terminated string. */

	757 UChar *pDestLimit = dest+destCapacity;

	758 uint8_t t1, t2, t3; /* trail bytes */

	759

	760 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {

	761 if(ch < 0xc0) {

	762 /*

	763 * ASCII, or a trail byte in lead position which is treated like

	764 * a single-byte sequence for better character boundary

	765 * resynchronization after illegal sequences.

	766 */

	767 *pDest++=(UChar)ch;

	768 ++pSrc;

	769 continue;

	770 } else if(ch < 0xe0) { /* U+0080..U+07FF */

	771 if((t1 = pSrc[1]) != 0) {

	772 /* 0x3080 = (0xc0 << 6) + 0x80 */

	773 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);

	774 pSrc += 2;

	775 continue;

	776 }

	777 } else if(ch < 0xf0) { /* U+0800..U+FFFF */

	778 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {

	779 /* no need for (ch & 0xf) because the upper bits are truncat ed after <<12 in the cast to (UChar) */

	780 /* 0x2080 = (0x80 << 6) + 0x80 */

	781 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);

	782 pSrc += 3;

	783 continue;

	784 }

	785 } else /* f0..f4 / { / U+10000..U+10FFFF */

	786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {

	787 pSrc += 4;

	788 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0 x80 */

	789 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;

	790 *(pDest++) = U16_LEAD(ch);

	791 if(pDest < pDestLimit) {

	792 *(pDest++) = U16_TRAIL(ch);

	793 } else {

	794 reqLength = 1;

	795 break;

	796 }

	797 continue;

	798 }

	799 }

	800

	801 /* truncated character at the end */

	802 *pDest++ = 0xfffd;

	803 while(*++pSrc != 0) {}

	804 break;

	805 }

	806

	807 /* Pre-flight the rest of the string. */

	808 while((ch = *pSrc) != 0) {

	809 if(ch < 0xc0) {

	810 /*

	811 * ASCII, or a trail byte in lead position which is treated like

	812 * a single-byte sequence for better character boundary

	813 * resynchronization after illegal sequences.

	814 */

	815 ++reqLength;

	816 ++pSrc;

	817 continue;

	818 } else if(ch < 0xe0) { /* U+0080..U+07FF */

	819 if(pSrc[1] != 0) {

	820 ++reqLength;

	821 pSrc += 2;

	822 continue;

	823 }

	824 } else if(ch < 0xf0) { /* U+0800..U+FFFF */

	825 if(pSrc[1] != 0 && pSrc[2] != 0) {

	826 ++reqLength;

	827 pSrc += 3;

	828 continue;

	829 }

	830 } else /* f0..f4 / { / U+10000..U+10FFFF */

	831 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {

	832 reqLength += 2;

	833 pSrc += 4;

	834 continue;

	835 }

	836 }

	837

	838 /* truncated character at the end */

	839 ++reqLength;

	840 break;

	841 }

	842 } else /* srcLength >= 0 */ {

	843 const uint8_t *pSrcLimit = pSrc + srcLength;

	844

	845 /*

	846 * This function requires that if srcLength is given, then it must be

	847 * destCapatity >= srcLength so that we need not check for

	848 * destination buffer overflow in the loop.

	849 */

	850 if(destCapacity < srcLength) {

	851 if(pDestLength != NULL) {

	852 pDestLength = srcLength; / this likely overestimates the true destLength! */

	853 }

	854 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;

	855 return NULL;

	856 }

	857

	858 if((pSrcLimit - pSrc) >= 4) {

	859 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */

	860

	861 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */

	862 do {

	863 ch = *pSrc++;

	864 if(ch < 0xc0) {

	865 /*

	866 * ASCII, or a trail byte in lead position which is treated like

	867 * a single-byte sequence for better character boundary

	868 * resynchronization after illegal sequences.

	869 */

	870 *pDest++=(UChar)ch;

	871 } else if(ch < 0xe0) { /* U+0080..U+07FF */

	872 /* 0x3080 = (0xc0 << 6) + 0x80 */

	873 pDest++ = (UChar)((ch << 6) + pSrc++ - 0x3080);

	874 } else if(ch < 0xf0) { /* U+0800..U+FFFF */

	875 /* no need for (ch & 0xf) because the upper bits are truncat ed after <<12 in the cast to (UChar) */

	876 /* 0x2080 = (0x80 << 6) + 0x80 */

	877 ch = (ch << 12) + (*pSrc++ << 6);

	878 pDest++ = (UChar)(ch + pSrc++ - 0x2080);

	879 } else /* f0..f4 / { / U+10000..U+10FFFF */

	880 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0 x80 */

	881 ch = (ch << 18) + (*pSrc++ << 12);

	882 ch += *pSrc++ << 6;

	883 ch += *pSrc++ - 0x3c82080;

	884 *(pDest++) = U16_LEAD(ch);

	885 *(pDest++) = U16_TRAIL(ch);

	886 }

	887 } while(pSrc < pSrcLimit);

	888

	889 pSrcLimit += 3; /* restore original pSrcLimit */

	890 }

	891

	892 while(pSrc < pSrcLimit) {

	893 ch = *pSrc++;

	894 if(ch < 0xc0) {

	895 /*

	896 * ASCII, or a trail byte in lead position which is treated like

	897 * a single-byte sequence for better character boundary

	898 * resynchronization after illegal sequences.

	899 */

	900 *pDest++=(UChar)ch;

	901 continue;

	902 } else if(ch < 0xe0) { /* U+0080..U+07FF */

	903 if(pSrc < pSrcLimit) {

	904 /* 0x3080 = (0xc0 << 6) + 0x80 */

	905 pDest++ = (UChar)((ch << 6) + pSrc++ - 0x3080);

	906 continue;

	907 }

	908 } else if(ch < 0xf0) { /* U+0800..U+FFFF */

	909 if((pSrcLimit - pSrc) >= 2) {

	910 /* no need for (ch & 0xf) because the upper bits are truncat ed after <<12 in the cast to (UChar) */

	911 /* 0x2080 = (0x80 << 6) + 0x80 */

	912 ch = (ch << 12) + (*pSrc++ << 6);

	913 pDest++ = (UChar)(ch + pSrc++ - 0x2080);

	914 pSrc += 3;

	915 continue;

	916 }

	917 } else /* f0..f4 / { / U+10000..U+10FFFF */

	918 if((pSrcLimit - pSrc) >= 3) {

	919 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0 x80 */

	920 ch = (ch << 18) + (*pSrc++ << 12);

	921 ch += *pSrc++ << 6;

	922 ch += *pSrc++ - 0x3c82080;

	923 *(pDest++) = U16_LEAD(ch);

	924 *(pDest++) = U16_TRAIL(ch);

	925 pSrc += 4;

	926 continue;

	927 }

	928 }

	929

	930 /* truncated character at the end */

	931 *pDest++ = 0xfffd;

	932 break;

	933 }

	934 }

	935

	936 reqLength+=(int32_t)(pDest - dest);

	937

	938 if(pDestLength){

	939 *pDestLength = reqLength;

	940 }

	941

	942 /* Terminate the buffer */

	943 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

	944

	945 return dest;

	946 }

	947

	948 static U_INLINE uint8_t *

	949 _appendUTF8(uint8_t *pDest, UChar32 c) {

	950 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating functi on */

	951 if((c)<=0x7f) {

	952 *pDest++=(uint8_t)c;

	953 } else if(c<=0x7ff) {

	954 *pDest++=(uint8_t)((c>>6)\|0xc0);

	955 *pDest++=(uint8_t)((c&0x3f)\|0x80);

	956 } else if(c<=0xffff) {

	957 *pDest++=(uint8_t)((c>>12)\|0xe0);

	958 *pDest++=(uint8_t)(((c>>6)&0x3f)\|0x80);

	959 *pDest++=(uint8_t)(((c)&0x3f)\|0x80);

	960 } else /* if((uint32_t)(c)<=0x10ffff) */ {

	961 *pDest++=(uint8_t)(((c)>>18)\|0xf0);

	962 *pDest++=(uint8_t)((((c)>>12)&0x3f)\|0x80);

	963 *pDest++=(uint8_t)((((c)>>6)&0x3f)\|0x80);

	964 *pDest++=(uint8_t)(((c)&0x3f)\|0x80);

	965 }

	966 return pDest;

	967 }

	968

	969

	970 U_CAPI char* U_EXPORT2

	971 u_strToUTF8WithSub(char *dest,

	972 int32_t destCapacity,

	973 int32_t *pDestLength,

	974 const UChar *pSrc,

	975 int32_t srcLength,

	976 UChar32 subchar, int32_t *pNumSubstitutions,

	977 UErrorCode *pErrorCode){

	978 int32_t reqLength=0;

	979 uint32_t ch=0,ch2=0;

	980 uint8_t pDest = (uint8_t )dest;

	981 uint8_t *pDestLimit = pDest + destCapacity;

	982 int32_t numSubstitutions;

	983

	984 /* args check */

	985 if(pErrorCode==NULL \|\| U_FAILURE(*pErrorCode)){

	986 return NULL;

	987 }

	988

	989 if( (pSrc==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	990 (destCapacity<0) \|\| (dest == NULL && destCapacity > 0) \|\|

	991 subchar > 0x10ffff \|\| U_IS_SURROGATE(subchar)

	992 ) {

	993 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	994 return NULL;

	995 }

	996

	997 if(pNumSubstitutions!=NULL) {

	998 *pNumSubstitutions=0;

	999 }

	1000 numSubstitutions=0;

	1001

	1002 if(srcLength==-1) {

	1003 while((ch=*pSrc)!=0) {

	1004 ++pSrc;

	1005 if(ch <= 0x7f) {

	1006 if(pDest<pDestLimit) {

	1007 *pDest++ = (uint8_t)ch;

	1008 } else {

	1009 reqLength = 1;

	1010 break;

	1011 }

	1012 } else if(ch <= 0x7ff) {

	1013 if((pDestLimit - pDest) >= 2) {

	1014 *pDest++=(uint8_t)((ch>>6)\|0xc0);

	1015 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1016 } else {

	1017 reqLength = 2;

	1018 break;

	1019 }

	1020 } else if(ch <= 0xd7ff \|\| ch >= 0xe000) {

	1021 if((pDestLimit - pDest) >= 3) {

	1022 *pDest++=(uint8_t)((ch>>12)\|0xe0);

	1023 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1024 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1025 } else {

	1026 reqLength = 3;

	1027 break;

	1028 }

	1029 } else /* ch is a surrogate */ {

	1030 int32_t length;

	1031

	1032 /need not check for NUL because NUL fails UTF_IS_TRAIL() anyway /

	1033 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {

	1034 ++pSrc;

	1035 ch=UTF16_GET_PAIR_VALUE(ch, ch2);

	1036 } else if(subchar>=0) {

	1037 ch=subchar;

	1038 ++numSubstitutions;

	1039 } else {

	1040 /* Unicode 3.2 forbids surrogate code points in UTF-8 */

	1041 *pErrorCode = U_INVALID_CHAR_FOUND;

	1042 return NULL;

	1043 }

	1044

	1045 length = U8_LENGTH(ch);

	1046 if((pDestLimit - pDest) >= length) {

	1047 /* convert and append*/

	1048 pDest=_appendUTF8(pDest, ch);

	1049 } else {

	1050 reqLength = length;

	1051 break;

	1052 }

	1053 }

	1054 }

	1055 while((ch=*pSrc++)!=0) {

	1056 if(ch<=0x7f) {

	1057 ++reqLength;

	1058 } else if(ch<=0x7ff) {

	1059 reqLength+=2;

	1060 } else if(!UTF_IS_SURROGATE(ch)) {

	1061 reqLength+=3;

	1062 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {

	1063 ++pSrc;

	1064 reqLength+=4;

	1065 } else if(subchar>=0) {

	1066 reqLength+=U8_LENGTH(subchar);

	1067 ++numSubstitutions;

	1068 } else {

	1069 /* Unicode 3.2 forbids surrogate code points in UTF-8 */

	1070 *pErrorCode = U_INVALID_CHAR_FOUND;

	1071 return NULL;

	1072 }

	1073 }

	1074 } else {

	1075 const UChar *pSrcLimit = pSrc+srcLength;

	1076 int32_t count;

	1077

	1078 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

	1079 for(;;) {

	1080 /*

	1081 * Each iteration of the inner loop progresses by at most 3 UTF-8

	1082 * bytes and one UChar, for most characters.

	1083 * For supplementary code points (4 & 2), which are rare,

	1084 * there is an additional adjustment.

	1085 */

	1086 count = (int32_t)((pDestLimit - pDest) / 3);

	1087 srcLength = (int32_t)(pSrcLimit - pSrc);

	1088 if(count > srcLength) {

	1089 count = srcLength; /* min(remaining dest/3, remaining src) */

	1090 }

	1091 if(count < 3) {

	1092 /*

	1093 * Too much overhead if we get near the end of the string,

	1094 * continue with the next loop.

	1095 */

	1096 break;

	1097 }

	1098 do {

	1099 ch=*pSrc++;

	1100 if(ch <= 0x7f) {

	1101 *pDest++ = (uint8_t)ch;

	1102 } else if(ch <= 0x7ff) {

	1103 *pDest++=(uint8_t)((ch>>6)\|0xc0);

	1104 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1105 } else if(ch <= 0xd7ff \|\| ch >= 0xe000) {

	1106 *pDest++=(uint8_t)((ch>>12)\|0xe0);

	1107 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1108 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1109 } else /* ch is a surrogate */ {

	1110 /*

	1111 * We will read two UChars and probably output four bytes,

	1112 * which we didn't account for with computing count,

	1113 * so we adjust it here.

	1114 */

	1115 if(--count == 0) {

	1116 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */

	1117 break; /* recompute count */

	1118 }

	1119

	1120 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {

	1121 ++pSrc;

	1122 ch=UTF16_GET_PAIR_VALUE(ch, ch2);

	1123

	1124 /* writing 4 bytes per 2 UChars is ok */

	1125 *pDest++=(uint8_t)((ch>>18)\|0xf0);

	1126 *pDest++=(uint8_t)(((ch>>12)&0x3f)\|0x80);

	1127 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1128 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1129 } else {

	1130 /* Unicode 3.2 forbids surrogate code points in UTF-8 */

	1131 if(subchar>=0) {

	1132 ch=subchar;

	1133 ++numSubstitutions;

	1134 } else {

	1135 *pErrorCode = U_INVALID_CHAR_FOUND;

	1136 return NULL;

	1137 }

	1138

	1139 /* convert and append*/

	1140 pDest=_appendUTF8(pDest, ch);

	1141 }

	1142 }

	1143 } while(--count > 0);

	1144 }

	1145

	1146 while(pSrc<pSrcLimit) {

	1147 ch=*pSrc++;

	1148 if(ch <= 0x7f) {

	1149 if(pDest<pDestLimit) {

	1150 *pDest++ = (uint8_t)ch;

	1151 } else {

	1152 reqLength = 1;

	1153 break;

	1154 }

	1155 } else if(ch <= 0x7ff) {

	1156 if((pDestLimit - pDest) >= 2) {

	1157 *pDest++=(uint8_t)((ch>>6)\|0xc0);

	1158 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1159 } else {

	1160 reqLength = 2;

	1161 break;

	1162 }

	1163 } else if(ch <= 0xd7ff \|\| ch >= 0xe000) {

	1164 if((pDestLimit - pDest) >= 3) {

	1165 *pDest++=(uint8_t)((ch>>12)\|0xe0);

	1166 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1167 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1168 } else {

	1169 reqLength = 3;

	1170 break;

	1171 }

	1172 } else /* ch is a surrogate */ {

	1173 int32_t length;

	1174

	1175 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL( ch2=*pSrc)) {

	1176 ++pSrc;

	1177 ch=UTF16_GET_PAIR_VALUE(ch, ch2);

	1178 } else if(subchar>=0) {

	1179 ch=subchar;

	1180 ++numSubstitutions;

	1181 } else {

	1182 /* Unicode 3.2 forbids surrogate code points in UTF-8 */

	1183 *pErrorCode = U_INVALID_CHAR_FOUND;

	1184 return NULL;

	1185 }

	1186

	1187 length = U8_LENGTH(ch);

	1188 if((pDestLimit - pDest) >= length) {

	1189 /* convert and append*/

	1190 pDest=_appendUTF8(pDest, ch);

	1191 } else {

	1192 reqLength = length;

	1193 break;

	1194 }

	1195 }

	1196 }

	1197 while(pSrc<pSrcLimit) {

	1198 ch=*pSrc++;

	1199 if(ch<=0x7f) {

	1200 ++reqLength;

	1201 } else if(ch<=0x7ff) {

	1202 reqLength+=2;

	1203 } else if(!UTF_IS_SURROGATE(ch)) {

	1204 reqLength+=3;

	1205 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRA IL(ch2=*pSrc)) {

	1206 ++pSrc;

	1207 reqLength+=4;

	1208 } else if(subchar>=0) {

	1209 reqLength+=U8_LENGTH(subchar);

	1210 ++numSubstitutions;

	1211 } else {

	1212 /* Unicode 3.2 forbids surrogate code points in UTF-8 */

	1213 *pErrorCode = U_INVALID_CHAR_FOUND;

	1214 return NULL;

	1215 }

	1216 }

	1217 }

	1218

	1219 reqLength+=(int32_t)(pDest - (uint8_t *)dest);

	1220

	1221 if(pNumSubstitutions!=NULL) {

	1222 *pNumSubstitutions=numSubstitutions;

	1223 }

	1224

	1225 if(pDestLength){

	1226 *pDestLength = reqLength;

	1227 }

	1228

	1229 /* Terminate the buffer */

	1230 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);

	1231 return dest;

	1232 }

	1233

	1234 U_CAPI char* U_EXPORT2

	1235 u_strToUTF8(char *dest,

	1236 int32_t destCapacity,

	1237 int32_t *pDestLength,

	1238 const UChar *pSrc,

	1239 int32_t srcLength,

	1240 UErrorCode *pErrorCode){

	1241 return u_strToUTF8WithSub(

	1242 dest, destCapacity, pDestLength,

	1243 pSrc, srcLength,

	1244 U_SENTINEL, NULL,

	1245 pErrorCode);

	1246 }

	1247

	1248 U_CAPI UChar* U_EXPORT2

	1249 u_strFromJavaModifiedUTF8WithSub(

	1250 UChar *dest,

	1251 int32_t destCapacity,

	1252 int32_t *pDestLength,

	1253 const char *src,

	1254 int32_t srcLength,

	1255 UChar32 subchar, int32_t *pNumSubstitutions,

	1256 UErrorCode *pErrorCode) {

	1257 UChar *pDest = dest;

	1258 UChar *pDestLimit = dest+destCapacity;

	1259 UChar32 ch;

	1260 int32_t reqLength = 0;

	1261 const uint8_t* pSrc = (const uint8_t*) src;

	1262 const uint8_t *pSrcLimit;

	1263 int32_t count;

	1264 uint8_t t1, t2; /* trail bytes */

	1265 int32_t numSubstitutions;

	1266

	1267 /* args check */

	1268 if(U_FAILURE(*pErrorCode)){

	1269 return NULL;

	1270 }

	1271 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	1272 (dest==NULL && destCapacity!=0) \|\| destCapacity<0 \|\|

	1273 subchar > 0x10ffff \|\| U_IS_SURROGATE(subchar)

	1274 ) {

	1275 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	1276 return NULL;

	1277 }

	1278

	1279 if(pNumSubstitutions!=NULL) {

	1280 *pNumSubstitutions=0;

	1281 }

	1282 numSubstitutions=0;

	1283

	1284 if(srcLength < 0) {

	1285 /*

	1286 * Transform a NUL-terminated ASCII string.

	1287 * Handle non-ASCII strings with slower code.

	1288 */

	1289 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {

	1290 *pDest++=(UChar)ch;

	1291 ++pSrc;

	1292 }

	1293 if(ch == 0) {

	1294 reqLength=(int32_t)(pDest - dest);

	1295 if(pDestLength) {

	1296 *pDestLength = reqLength;

	1297 }

	1298

	1299 /* Terminate the buffer */

	1300 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

	1301 return dest;

	1302 }

	1303 srcLength = uprv_strlen((const char *)pSrc);

	1304 }

	1305

	1306 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

	1307 pSrcLimit = pSrc + srcLength;

	1308 for(;;) {

	1309 count = (int32_t)(pDestLimit - pDest);

	1310 srcLength = (int32_t)(pSrcLimit - pSrc);

	1311 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {

	1312 /* fast ASCII loop */

	1313 const uint8_t *prevSrc = pSrc;

	1314 int32_t delta;

	1315 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {

	1316 *pDest++=(UChar)ch;

	1317 ++pSrc;

	1318 }

	1319 delta = (int32_t)(pSrc - prevSrc);

	1320 count -= delta;

	1321 srcLength -= delta;

	1322 }

	1323 /*

	1324 * Each iteration of the inner loop progresses by at most 3 UTF-8

	1325 * bytes and one UChar.

	1326 */

	1327 srcLength /= 3;

	1328 if(count > srcLength) {

	1329 count = srcLength; /* min(remaining dest, remaining src/3) */

	1330 }

	1331 if(count < 3) {

	1332 /*

	1333 * Too much overhead if we get near the end of the string,

	1334 * continue with the next loop.

	1335 */

	1336 break;

	1337 }

	1338 do {

	1339 ch = *pSrc;

	1340 if(ch <= 0x7f){

	1341 *pDest++=(UChar)ch;

	1342 ++pSrc;

	1343 } else {

	1344 if(ch >= 0xe0) {

	1345 if( /* handle U+0000..U+FFFF inline */

	1346 ch <= 0xef &&

	1347 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

	1348 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

	1349 ) {

	1350 /* no need for (ch & 0xf) because the upper bits are tru ncated after <<12 in the cast to (UChar) */

	1351 *pDest++ = (UChar)((ch << 12) \| (t1 << 6) \| t2);

	1352 pSrc += 3;

	1353 continue;

	1354 }

	1355 } else {

	1356 if( /* handle U+0000..U+07FF inline */

	1357 ch >= 0xc0 &&

	1358 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

	1359 ) {

	1360 *pDest++ = (UChar)(((ch & 0x1f) << 6) \| t1);

	1361 pSrc += 2;

	1362 continue;

	1363 }

	1364 }

	1365

	1366 if(subchar < 0) {

	1367 *pErrorCode = U_INVALID_CHAR_FOUND;

	1368 return NULL;

	1369 } else if(subchar > 0xffff && --count == 0) {

	1370 /*

	1371 * We need to write two UChars, adjusted count for that,

	1372 * and ran out of space.

	1373 */

	1374 break;

	1375 } else {

	1376 /* function call for error cases */

	1377 ++pSrc; /* continue after the lead byte */

	1378 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	1379 ++numSubstitutions;

	1380 if(subchar<=0xFFFF) {

	1381 *(pDest++)=(UChar)subchar;

	1382 } else {

	1383 *(pDest++)=U16_LEAD(subchar);

	1384 *(pDest++)=U16_TRAIL(subchar);

	1385 }

	1386 }

	1387 }

	1388 } while(--count > 0);

	1389 }

	1390

	1391 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {

	1392 ch = *pSrc;

	1393 if(ch <= 0x7f){

	1394 *pDest++=(UChar)ch;

	1395 ++pSrc;

	1396 } else {

	1397 if(ch >= 0xe0) {

	1398 if( /* handle U+0000..U+FFFF inline */

	1399 ch <= 0xef &&

	1400 ((pSrcLimit - pSrc) >= 3) &&

	1401 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&

	1402 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f

	1403 ) {

	1404 /* no need for (ch & 0xf) because the upper bits are truncat ed after <<12 in the cast to (UChar) */

	1405 *pDest++ = (UChar)((ch << 12) \| (t1 << 6) \| t2);

	1406 pSrc += 3;

	1407 continue;

	1408 }

	1409 } else {

	1410 if( /* handle U+0000..U+07FF inline */

	1411 ch >= 0xc0 &&

	1412 ((pSrcLimit - pSrc) >= 2) &&

	1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f

	1414 ) {

	1415 *pDest++ = (UChar)(((ch & 0x1f) << 6) \| t1);

	1416 pSrc += 2;

	1417 continue;

	1418 }

	1419 }

	1420

	1421 if(subchar < 0) {

	1422 *pErrorCode = U_INVALID_CHAR_FOUND;

	1423 return NULL;

	1424 } else {

	1425 /* function call for error cases */

	1426 ++pSrc; /* continue after the lead byte */

	1427 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	1428 ++numSubstitutions;

	1429 if(subchar<=0xFFFF) {

	1430 *(pDest++)=(UChar)subchar;

	1431 } else {

	1432 *(pDest++)=U16_LEAD(subchar);

	1433 if(pDest<pDestLimit) {

	1434 *(pDest++)=U16_TRAIL(subchar);

	1435 } else {

	1436 reqLength++;

	1437 break;

	1438 }

	1439 }

	1440 }

	1441 }

	1442 }

	1443

	1444 /* do not fill the dest buffer just count the UChars needed */

	1445 while(pSrc < pSrcLimit){

	1446 ch = *pSrc;

	1447 if(ch <= 0x7f) {

	1448 reqLength++;

	1449 ++pSrc;

	1450 } else {

	1451 if(ch >= 0xe0) {

	1452 if( /* handle U+0000..U+FFFF inline */

	1453 ch <= 0xef &&

	1454 ((pSrcLimit - pSrc) >= 3) &&

	1455 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&

	1456 (uint8_t)(pSrc[2] - 0x80) <= 0x3f

	1457 ) {

	1458 reqLength++;

	1459 pSrc += 3;

	1460 continue;

	1461 }

	1462 } else {

	1463 if( /* handle U+0000..U+07FF inline */

	1464 ch >= 0xc0 &&

	1465 ((pSrcLimit - pSrc) >= 2) &&

	1466 (uint8_t)(pSrc[1] - 0x80) <= 0x3f

	1467 ) {

	1468 reqLength++;

	1469 pSrc += 2;

	1470 continue;

	1471 }

	1472 }

	1473

	1474 if(subchar < 0) {

	1475 *pErrorCode = U_INVALID_CHAR_FOUND;

	1476 return NULL;

	1477 } else {

	1478 /* function call for error cases */

	1479 ++pSrc; /* continue after the lead byte */

	1480 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);

	1481 ++numSubstitutions;

	1482 reqLength+=U16_LENGTH(ch);

	1483 }

	1484 }

	1485 }

	1486

	1487 if(pNumSubstitutions!=NULL) {

	1488 *pNumSubstitutions=numSubstitutions;

	1489 }

	1490

	1491 reqLength+=(int32_t)(pDest - dest);

	1492 if(pDestLength) {

	1493 *pDestLength = reqLength;

	1494 }

	1495

	1496 /* Terminate the buffer */

	1497 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

	1498 return dest;

	1499 }

	1500

	1501 U_CAPI char* U_EXPORT2

	1502 u_strToJavaModifiedUTF8(

	1503 char *dest,

	1504 int32_t destCapacity,

	1505 int32_t *pDestLength,

	1506 const UChar *src,

	1507 int32_t srcLength,

	1508 UErrorCode *pErrorCode) {

	1509 int32_t reqLength=0;

	1510 uint32_t ch=0;

	1511 uint8_t pDest = (uint8_t )dest;

	1512 uint8_t *pDestLimit = pDest + destCapacity;

	1513 const UChar *pSrcLimit;

	1514 int32_t count;

	1515

	1516 /* args check */

	1517 if(U_FAILURE(*pErrorCode)){

	1518 return NULL;

	1519 }

	1520 if( (src==NULL && srcLength!=0) \|\| srcLength < -1 \|\|

	1521 (dest==NULL && destCapacity!=0) \|\| destCapacity<0

	1522 ) {

	1523 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;

	1524 return NULL;

	1525 }

	1526

	1527 if(srcLength==-1) {

	1528 /* Convert NUL-terminated ASCII, then find the string length. */

	1529 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {

	1530 *pDest++ = (uint8_t)ch;

	1531 ++src;

	1532 }

	1533 if(ch == 0) {

	1534 reqLength=(int32_t)(pDest - (uint8_t *)dest);

	1535 if(pDestLength) {

	1536 *pDestLength = reqLength;

	1537 }

	1538

	1539 /* Terminate the buffer */

	1540 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);

	1541 return dest;

	1542 }

	1543 srcLength = u_strlen(src);

	1544 }

	1545

	1546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */

	1547 pSrcLimit = src+srcLength;

	1548 for(;;) {

	1549 count = (int32_t)(pDestLimit - pDest);

	1550 srcLength = (int32_t)(pSrcLimit - src);

	1551 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {

	1552 /* fast ASCII loop */

	1553 const UChar *prevSrc = src;

	1554 int32_t delta;

	1555 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {

	1556 *pDest++=(uint8_t)ch;

	1557 ++src;

	1558 }

	1559 delta = (int32_t)(src - prevSrc);

	1560 count -= delta;

	1561 srcLength -= delta;

	1562 }

	1563 /*

	1564 * Each iteration of the inner loop progresses by at most 3 UTF-8

	1565 * bytes and one UChar.

	1566 */

	1567 count /= 3;

	1568 if(count > srcLength) {

	1569 count = srcLength; /* min(remaining dest/3, remaining src) */

	1570 }

	1571 if(count < 3) {

	1572 /*

	1573 * Too much overhead if we get near the end of the string,

	1574 * continue with the next loop.

	1575 */

	1576 break;

	1577 }

	1578 do {

	1579 ch=*src++;

	1580 if(ch <= 0x7f && ch != 0) {

	1581 *pDest++ = (uint8_t)ch;

	1582 } else if(ch <= 0x7ff) {

	1583 *pDest++=(uint8_t)((ch>>6)\|0xc0);

	1584 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1585 } else {

	1586 *pDest++=(uint8_t)((ch>>12)\|0xe0);

	1587 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1588 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1589 }

	1590 } while(--count > 0);

	1591 }

	1592

	1593 while(src<pSrcLimit) {

	1594 ch=*src++;

	1595 if(ch <= 0x7f && ch != 0) {

	1596 if(pDest<pDestLimit) {

	1597 *pDest++ = (uint8_t)ch;

	1598 } else {

	1599 reqLength = 1;

	1600 break;

	1601 }

	1602 } else if(ch <= 0x7ff) {

	1603 if((pDestLimit - pDest) >= 2) {

	1604 *pDest++=(uint8_t)((ch>>6)\|0xc0);

	1605 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1606 } else {

	1607 reqLength = 2;

	1608 break;

	1609 }

	1610 } else {

	1611 if((pDestLimit - pDest) >= 3) {

	1612 *pDest++=(uint8_t)((ch>>12)\|0xe0);

	1613 *pDest++=(uint8_t)(((ch>>6)&0x3f)\|0x80);

	1614 *pDest++=(uint8_t)((ch&0x3f)\|0x80);

	1615 } else {

	1616 reqLength = 3;

	1617 break;

	1618 }

	1619 }

	1620 }

	1621 while(src<pSrcLimit) {

	1622 ch=*src++;

	1623 if(ch <= 0x7f && ch != 0) {

	1624 ++reqLength;

	1625 } else if(ch<=0x7ff) {

	1626 reqLength+=2;

	1627 } else {

	1628 reqLength+=3;

	1629 }

	1630 }

	1631

	1632 reqLength+=(int32_t)(pDest - (uint8_t *)dest);

	1633 if(pDestLength){

	1634 *pDestLength = reqLength;

	1635 }

	1636

	1637 /* Terminate the buffer */

	1638 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);

	1639 return dest;

	1640 }

OLD	NEW

« no previous file with comments | « icu46/source/common/ustring.c ('k') | icu46/source/common/utext.cpp » ('j') | no next file with comments »