OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2001-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * |
| 9 * File ustrtrns.c |
| 10 * |
| 11 * Modification History: |
| 12 * |
| 13 * Date Name Description |
| 14 * 9/10/2001 Ram Creation. |
| 15 ****************************************************************************** |
| 16 */ |
| 17 |
| 18 /******************************************************************************* |
| 19 * |
| 20 * u_strTo* and u_strFrom* APIs |
| 21 * WCS functions moved to ustr_wcs.c for better modularization |
| 22 * |
| 23 ******************************************************************************* |
| 24 */ |
| 25 |
| 26 |
| 27 #include "unicode/putil.h" |
| 28 #include "unicode/ustring.h" |
| 29 #include "cstring.h" |
| 30 #include "cmemory.h" |
| 31 #include "ustr_imp.h" |
| 32 |
| 33 U_CAPI UChar* U_EXPORT2 |
| 34 u_strFromUTF32WithSub(UChar *dest, |
| 35 int32_t destCapacity, |
| 36 int32_t *pDestLength, |
| 37 const UChar32 *src, |
| 38 int32_t srcLength, |
| 39 UChar32 subchar, int32_t *pNumSubstitutions, |
| 40 UErrorCode *pErrorCode) { |
| 41 const UChar32 *srcLimit; |
| 42 UChar32 ch; |
| 43 UChar *destLimit; |
| 44 UChar *pDest; |
| 45 int32_t reqLength; |
| 46 int32_t numSubstitutions; |
| 47 |
| 48 /* args check */ |
| 49 if(U_FAILURE(*pErrorCode)){ |
| 50 return NULL; |
| 51 } |
| 52 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 53 (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| 54 subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| 55 ) { |
| 56 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 57 return NULL; |
| 58 } |
| 59 |
| 60 if(pNumSubstitutions != NULL) { |
| 61 *pNumSubstitutions = 0; |
| 62 } |
| 63 |
| 64 pDest = dest; |
| 65 destLimit = dest + destCapacity; |
| 66 reqLength = 0; |
| 67 numSubstitutions = 0; |
| 68 |
| 69 if(srcLength < 0) { |
| 70 /* simple loop for conversion of a NUL-terminated BMP string */ |
| 71 while((ch=*src) != 0 && |
| 72 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { |
| 73 ++src; |
| 74 if(pDest < destLimit) { |
| 75 *pDest++ = (UChar)ch; |
| 76 } else { |
| 77 ++reqLength; |
| 78 } |
| 79 } |
| 80 srcLimit = src; |
| 81 if(ch != 0) { |
| 82 /* "complicated" case, find the end of the remaining string */ |
| 83 while(*++srcLimit != 0) {} |
| 84 } |
| 85 } else { |
| 86 srcLimit = src + srcLength; |
| 87 } |
| 88 |
| 89 /* convert with length */ |
| 90 while(src < srcLimit) { |
| 91 ch = *src++; |
| 92 do { |
| 93 /* usually "loops" once; twice only for writing subchar */ |
| 94 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { |
| 95 if(pDest < destLimit) { |
| 96 *pDest++ = (UChar)ch; |
| 97 } else { |
| 98 ++reqLength; |
| 99 } |
| 100 break; |
| 101 } else if(0x10000 <= ch && ch <= 0x10ffff) { |
| 102 if((pDest + 2) <= destLimit) { |
| 103 *pDest++ = U16_LEAD(ch); |
| 104 *pDest++ = U16_TRAIL(ch); |
| 105 } else { |
| 106 reqLength += 2; |
| 107 } |
| 108 break; |
| 109 } else if((ch = subchar) < 0) { |
| 110 /* surrogate code point, or not a Unicode code point at all */ |
| 111 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 112 return NULL; |
| 113 } else { |
| 114 ++numSubstitutions; |
| 115 } |
| 116 } while(TRUE); |
| 117 } |
| 118 |
| 119 reqLength += (int32_t)(pDest - dest); |
| 120 if(pDestLength) { |
| 121 *pDestLength = reqLength; |
| 122 } |
| 123 if(pNumSubstitutions != NULL) { |
| 124 *pNumSubstitutions = numSubstitutions; |
| 125 } |
| 126 |
| 127 /* Terminate the buffer */ |
| 128 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| 129 |
| 130 return dest; |
| 131 } |
| 132 |
| 133 U_CAPI UChar* U_EXPORT2 |
| 134 u_strFromUTF32(UChar *dest, |
| 135 int32_t destCapacity, |
| 136 int32_t *pDestLength, |
| 137 const UChar32 *src, |
| 138 int32_t srcLength, |
| 139 UErrorCode *pErrorCode) { |
| 140 return u_strFromUTF32WithSub( |
| 141 dest, destCapacity, pDestLength, |
| 142 src, srcLength, |
| 143 U_SENTINEL, NULL, |
| 144 pErrorCode); |
| 145 } |
| 146 |
| 147 U_CAPI UChar32* U_EXPORT2 |
| 148 u_strToUTF32WithSub(UChar32 *dest, |
| 149 int32_t destCapacity, |
| 150 int32_t *pDestLength, |
| 151 const UChar *src, |
| 152 int32_t srcLength, |
| 153 UChar32 subchar, int32_t *pNumSubstitutions, |
| 154 UErrorCode *pErrorCode) { |
| 155 const UChar *srcLimit; |
| 156 UChar32 ch; |
| 157 UChar ch2; |
| 158 UChar32 *destLimit; |
| 159 UChar32 *pDest; |
| 160 int32_t reqLength; |
| 161 int32_t numSubstitutions; |
| 162 |
| 163 /* args check */ |
| 164 if(U_FAILURE(*pErrorCode)){ |
| 165 return NULL; |
| 166 } |
| 167 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 168 (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| 169 subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| 170 ) { |
| 171 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 172 return NULL; |
| 173 } |
| 174 |
| 175 if(pNumSubstitutions != NULL) { |
| 176 *pNumSubstitutions = 0; |
| 177 } |
| 178 |
| 179 pDest = dest; |
| 180 destLimit = dest + destCapacity; |
| 181 reqLength = 0; |
| 182 numSubstitutions = 0; |
| 183 |
| 184 if(srcLength < 0) { |
| 185 /* simple loop for conversion of a NUL-terminated BMP string */ |
| 186 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { |
| 187 ++src; |
| 188 if(pDest < destLimit) { |
| 189 *pDest++ = ch; |
| 190 } else { |
| 191 ++reqLength; |
| 192 } |
| 193 } |
| 194 srcLimit = src; |
| 195 if(ch != 0) { |
| 196 /* "complicated" case, find the end of the remaining string */ |
| 197 while(*++srcLimit != 0) {} |
| 198 } |
| 199 } else { |
| 200 srcLimit = src + srcLength; |
| 201 } |
| 202 |
| 203 /* convert with length */ |
| 204 while(src < srcLimit) { |
| 205 ch = *src++; |
| 206 if(!U16_IS_SURROGATE(ch)) { |
| 207 /* write or count ch below */ |
| 208 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch
2 = *src)) { |
| 209 ++src; |
| 210 ch = U16_GET_SUPPLEMENTARY(ch, ch2); |
| 211 } else if((ch = subchar) < 0) { |
| 212 /* unpaired surrogate */ |
| 213 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 214 return NULL; |
| 215 } else { |
| 216 ++numSubstitutions; |
| 217 } |
| 218 if(pDest < destLimit) { |
| 219 *pDest++ = ch; |
| 220 } else { |
| 221 ++reqLength; |
| 222 } |
| 223 } |
| 224 |
| 225 reqLength += (int32_t)(pDest - dest); |
| 226 if(pDestLength) { |
| 227 *pDestLength = reqLength; |
| 228 } |
| 229 if(pNumSubstitutions != NULL) { |
| 230 *pNumSubstitutions = numSubstitutions; |
| 231 } |
| 232 |
| 233 /* Terminate the buffer */ |
| 234 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); |
| 235 |
| 236 return dest; |
| 237 } |
| 238 |
| 239 U_CAPI UChar32* U_EXPORT2 |
| 240 u_strToUTF32(UChar32 *dest, |
| 241 int32_t destCapacity, |
| 242 int32_t *pDestLength, |
| 243 const UChar *src, |
| 244 int32_t srcLength, |
| 245 UErrorCode *pErrorCode) { |
| 246 return u_strToUTF32WithSub( |
| 247 dest, destCapacity, pDestLength, |
| 248 src, srcLength, |
| 249 U_SENTINEL, NULL, |
| 250 pErrorCode); |
| 251 } |
| 252 |
| 253 /* for utf8_nextCharSafeBodyTerminated() */ |
| 254 static const UChar32 |
| 255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; |
| 256 |
| 257 /* |
| 258 * Version of utf8_nextCharSafeBody() with the following differences: |
| 259 * - checks for NUL termination instead of length |
| 260 * - works with pointers instead of indexes |
| 261 * - always strict (strict==-1) |
| 262 * |
| 263 * *ps points to after the lead byte and will be moved to after the last trail b
yte. |
| 264 * c is the lead byte. |
| 265 * @return the code point, or U_SENTINEL |
| 266 */ |
| 267 static UChar32 |
| 268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { |
| 269 const uint8_t *s=*ps; |
| 270 uint8_t trail, illegal=0; |
| 271 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); |
| 272 UTF8_MASK_LEAD_BYTE((c), count); |
| 273 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and
0xff */ |
| 274 switch(count) { |
| 275 /* each branch falls through to the next one */ |
| 276 case 5: |
| 277 case 4: |
| 278 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's U
TF-8 */ |
| 279 illegal=1; |
| 280 break; |
| 281 case 3: |
| 282 trail=(uint8_t)(*s++ - 0x80); |
| 283 c=(c<<6)|trail; |
| 284 if(trail>0x3f || c>=0x110) { |
| 285 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ |
| 286 illegal=1; |
| 287 break; |
| 288 } |
| 289 case 2: |
| 290 trail=(uint8_t)(*s++ - 0x80); |
| 291 if(trail>0x3f) { |
| 292 /* not a trail byte */ |
| 293 illegal=1; |
| 294 break; |
| 295 } |
| 296 c=(c<<6)|trail; |
| 297 case 1: |
| 298 trail=(uint8_t)(*s++ - 0x80); |
| 299 if(trail>0x3f) { |
| 300 /* not a trail byte */ |
| 301 illegal=1; |
| 302 } |
| 303 c=(c<<6)|trail; |
| 304 break; |
| 305 case 0: |
| 306 return U_SENTINEL; |
| 307 /* no default branch to optimize switch() - all values are covered */ |
| 308 } |
| 309 |
| 310 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
| 311 /* illegal is also set if count>=4 */ |
| 312 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { |
| 313 /* error handling */ |
| 314 /* don't go beyond this sequence */ |
| 315 s=*ps; |
| 316 while(count>0 && UTF8_IS_TRAIL(*s)) { |
| 317 ++s; |
| 318 --count; |
| 319 } |
| 320 c=U_SENTINEL; |
| 321 } |
| 322 *ps=s; |
| 323 return c; |
| 324 } |
| 325 |
| 326 /* |
| 327 * Version of utf8_nextCharSafeBody() with the following differences: |
| 328 * - works with pointers instead of indexes |
| 329 * - always strict (strict==-1) |
| 330 * |
| 331 * *ps points to after the lead byte and will be moved to after the last trail b
yte. |
| 332 * c is the lead byte. |
| 333 * @return the code point, or U_SENTINEL |
| 334 */ |
| 335 static UChar32 |
| 336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c
) { |
| 337 const uint8_t *s=*ps; |
| 338 uint8_t trail, illegal=0; |
| 339 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); |
| 340 if((limit-s)>=count) { |
| 341 UTF8_MASK_LEAD_BYTE((c), count); |
| 342 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe
and 0xff */ |
| 343 switch(count) { |
| 344 /* each branch falls through to the next one */ |
| 345 case 5: |
| 346 case 4: |
| 347 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode
's UTF-8 */ |
| 348 illegal=1; |
| 349 break; |
| 350 case 3: |
| 351 trail=*s++; |
| 352 c=(c<<6)|(trail&0x3f); |
| 353 if(c<0x110) { |
| 354 illegal|=(trail&0xc0)^0x80; |
| 355 } else { |
| 356 /* code point>0x10ffff, outside Unicode */ |
| 357 illegal=1; |
| 358 break; |
| 359 } |
| 360 case 2: |
| 361 trail=*s++; |
| 362 c=(c<<6)|(trail&0x3f); |
| 363 illegal|=(trail&0xc0)^0x80; |
| 364 case 1: |
| 365 trail=*s++; |
| 366 c=(c<<6)|(trail&0x3f); |
| 367 illegal|=(trail&0xc0)^0x80; |
| 368 break; |
| 369 case 0: |
| 370 return U_SENTINEL; |
| 371 /* no default branch to optimize switch() - all values are covered */ |
| 372 } |
| 373 } else { |
| 374 illegal=1; /* too few bytes left */ |
| 375 } |
| 376 |
| 377 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
| 378 /* illegal is also set if count>=4 */ |
| 379 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { |
| 380 /* error handling */ |
| 381 /* don't go beyond this sequence */ |
| 382 s=*ps; |
| 383 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) { |
| 384 ++s; |
| 385 --count; |
| 386 } |
| 387 c=U_SENTINEL; |
| 388 } |
| 389 *ps=s; |
| 390 return c; |
| 391 } |
| 392 |
| 393 U_CAPI UChar* U_EXPORT2 |
| 394 u_strFromUTF8WithSub(UChar *dest, |
| 395 int32_t destCapacity, |
| 396 int32_t *pDestLength, |
| 397 const char* src, |
| 398 int32_t srcLength, |
| 399 UChar32 subchar, int32_t *pNumSubstitutions, |
| 400 UErrorCode *pErrorCode){ |
| 401 UChar *pDest = dest; |
| 402 UChar *pDestLimit = dest+destCapacity; |
| 403 UChar32 ch; |
| 404 int32_t reqLength = 0; |
| 405 const uint8_t* pSrc = (const uint8_t*) src; |
| 406 uint8_t t1, t2; /* trail bytes */ |
| 407 int32_t numSubstitutions; |
| 408 |
| 409 /* args check */ |
| 410 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
| 411 return NULL; |
| 412 } |
| 413 |
| 414 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 415 (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| 416 subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| 417 ) { |
| 418 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 419 return NULL; |
| 420 } |
| 421 |
| 422 if(pNumSubstitutions!=NULL) { |
| 423 *pNumSubstitutions=0; |
| 424 } |
| 425 numSubstitutions=0; |
| 426 |
| 427 /* |
| 428 * Inline processing of UTF-8 byte sequences: |
| 429 * |
| 430 * Byte sequences for the most common characters are handled inline in |
| 431 * the conversion loops. In order to reduce the path lengths for those |
| 432 * characters, the tests are arranged in a kind of binary search. |
| 433 * ASCII (<=0x7f) is checked first, followed by the dividing point |
| 434 * between 2- and 3-byte sequences (0xe0). |
| 435 * The 3-byte branch is tested first to speed up CJK text. |
| 436 * The compiler should combine the subtractions for the two tests for 0xe0. |
| 437 * Each branch then tests for the other end of its range. |
| 438 */ |
| 439 |
| 440 if(srcLength < 0){ |
| 441 /* |
| 442 * Transform a NUL-terminated string. |
| 443 * The code explicitly checks for NULs only in the lead byte position. |
| 444 * A NUL byte in the trail byte position fails the trail byte range chec
k anyway. |
| 445 */ |
| 446 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
| 447 if(ch <= 0x7f){ |
| 448 *pDest++=(UChar)ch; |
| 449 ++pSrc; |
| 450 } else { |
| 451 if(ch > 0xe0) { |
| 452 if( /* handle U+1000..U+CFFF inline */ |
| 453 ch <= 0xec && |
| 454 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
| 455 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
| 456 ) { |
| 457 /* no need for (ch & 0xf) because the upper bits are tru
ncated after <<12 in the cast to (UChar) */ |
| 458 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| 459 pSrc += 3; |
| 460 continue; |
| 461 } |
| 462 } else if(ch < 0xe0) { |
| 463 if( /* handle U+0080..U+07FF inline */ |
| 464 ch >= 0xc2 && |
| 465 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
| 466 ) { |
| 467 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| 468 pSrc += 2; |
| 469 continue; |
| 470 } |
| 471 } |
| 472 |
| 473 /* function call for "complicated" and error cases */ |
| 474 ++pSrc; /* continue after the lead byte */ |
| 475 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
| 476 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
| 477 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 478 return NULL; |
| 479 } else if(ch<=0xFFFF) { |
| 480 *(pDest++)=(UChar)ch; |
| 481 } else { |
| 482 *(pDest++)=UTF16_LEAD(ch); |
| 483 if(pDest<pDestLimit) { |
| 484 *(pDest++)=UTF16_TRAIL(ch); |
| 485 } else { |
| 486 reqLength++; |
| 487 break; |
| 488 } |
| 489 } |
| 490 } |
| 491 } |
| 492 |
| 493 /* Pre-flight the rest of the string. */ |
| 494 while((ch = *pSrc) != 0) { |
| 495 if(ch <= 0x7f){ |
| 496 ++reqLength; |
| 497 ++pSrc; |
| 498 } else { |
| 499 if(ch > 0xe0) { |
| 500 if( /* handle U+1000..U+CFFF inline */ |
| 501 ch <= 0xec && |
| 502 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
| 503 (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
| 504 ) { |
| 505 ++reqLength; |
| 506 pSrc += 3; |
| 507 continue; |
| 508 } |
| 509 } else if(ch < 0xe0) { |
| 510 if( /* handle U+0080..U+07FF inline */ |
| 511 ch >= 0xc2 && |
| 512 (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
| 513 ) { |
| 514 ++reqLength; |
| 515 pSrc += 2; |
| 516 continue; |
| 517 } |
| 518 } |
| 519 |
| 520 /* function call for "complicated" and error cases */ |
| 521 ++pSrc; /* continue after the lead byte */ |
| 522 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
| 523 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
| 524 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 525 return NULL; |
| 526 } |
| 527 reqLength += U16_LENGTH(ch); |
| 528 } |
| 529 } |
| 530 } else /* srcLength >= 0 */ { |
| 531 const uint8_t *pSrcLimit = pSrc + srcLength; |
| 532 int32_t count; |
| 533 |
| 534 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
| 535 for(;;) { |
| 536 /* |
| 537 * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| 538 * bytes and one UChar, for most characters. |
| 539 * For supplementary code points (4 & 2), which are rare, |
| 540 * there is an additional adjustment. |
| 541 */ |
| 542 count = (int32_t)(pDestLimit - pDest); |
| 543 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); |
| 544 if(count > srcLength) { |
| 545 count = srcLength; /* min(remaining dest, remaining src/3) */ |
| 546 } |
| 547 if(count < 3) { |
| 548 /* |
| 549 * Too much overhead if we get near the end of the string, |
| 550 * continue with the next loop. |
| 551 */ |
| 552 break; |
| 553 } |
| 554 |
| 555 do { |
| 556 ch = *pSrc; |
| 557 if(ch <= 0x7f){ |
| 558 *pDest++=(UChar)ch; |
| 559 ++pSrc; |
| 560 } else { |
| 561 if(ch > 0xe0) { |
| 562 if( /* handle U+1000..U+CFFF inline */ |
| 563 ch <= 0xec && |
| 564 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
| 565 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
| 566 ) { |
| 567 /* no need for (ch & 0xf) because the upper bits are
truncated after <<12 in the cast to (UChar) */ |
| 568 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| 569 pSrc += 3; |
| 570 continue; |
| 571 } |
| 572 } else if(ch < 0xe0) { |
| 573 if( /* handle U+0080..U+07FF inline */ |
| 574 ch >= 0xc2 && |
| 575 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
| 576 ) { |
| 577 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| 578 pSrc += 2; |
| 579 continue; |
| 580 } |
| 581 } |
| 582 |
| 583 if(ch >= 0xf0 || subchar > 0xffff) { |
| 584 /* |
| 585 * We may read up to six bytes and write up to two UChar
s, |
| 586 * which we didn't account for with computing count, |
| 587 * so we adjust it here. |
| 588 */ |
| 589 if(--count == 0) { |
| 590 break; |
| 591 } |
| 592 } |
| 593 |
| 594 /* function call for "complicated" and error cases */ |
| 595 ++pSrc; /* continue after the lead byte */ |
| 596 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 597 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
| 598 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 599 return NULL; |
| 600 }else if(ch<=0xFFFF){ |
| 601 *(pDest++)=(UChar)ch; |
| 602 }else{ |
| 603 *(pDest++)=UTF16_LEAD(ch); |
| 604 *(pDest++)=UTF16_TRAIL(ch); |
| 605 } |
| 606 } |
| 607 } while(--count > 0); |
| 608 } |
| 609 |
| 610 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
| 611 ch = *pSrc; |
| 612 if(ch <= 0x7f){ |
| 613 *pDest++=(UChar)ch; |
| 614 ++pSrc; |
| 615 } else { |
| 616 if(ch > 0xe0) { |
| 617 if( /* handle U+1000..U+CFFF inline */ |
| 618 ch <= 0xec && |
| 619 ((pSrcLimit - pSrc) >= 3) && |
| 620 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
| 621 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
| 622 ) { |
| 623 /* no need for (ch & 0xf) because the upper bits are tru
ncated after <<12 in the cast to (UChar) */ |
| 624 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| 625 pSrc += 3; |
| 626 continue; |
| 627 } |
| 628 } else if(ch < 0xe0) { |
| 629 if( /* handle U+0080..U+07FF inline */ |
| 630 ch >= 0xc2 && |
| 631 ((pSrcLimit - pSrc) >= 2) && |
| 632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
| 633 ) { |
| 634 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| 635 pSrc += 2; |
| 636 continue; |
| 637 } |
| 638 } |
| 639 |
| 640 /* function call for "complicated" and error cases */ |
| 641 ++pSrc; /* continue after the lead byte */ |
| 642 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 643 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
| 644 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 645 return NULL; |
| 646 }else if(ch<=0xFFFF){ |
| 647 *(pDest++)=(UChar)ch; |
| 648 }else{ |
| 649 *(pDest++)=UTF16_LEAD(ch); |
| 650 if(pDest<pDestLimit){ |
| 651 *(pDest++)=UTF16_TRAIL(ch); |
| 652 }else{ |
| 653 reqLength++; |
| 654 break; |
| 655 } |
| 656 } |
| 657 } |
| 658 } |
| 659 /* do not fill the dest buffer just count the UChars needed */ |
| 660 while(pSrc < pSrcLimit){ |
| 661 ch = *pSrc; |
| 662 if(ch <= 0x7f){ |
| 663 reqLength++; |
| 664 ++pSrc; |
| 665 } else { |
| 666 if(ch > 0xe0) { |
| 667 if( /* handle U+1000..U+CFFF inline */ |
| 668 ch <= 0xec && |
| 669 ((pSrcLimit - pSrc) >= 3) && |
| 670 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
| 671 (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
| 672 ) { |
| 673 reqLength++; |
| 674 pSrc += 3; |
| 675 continue; |
| 676 } |
| 677 } else if(ch < 0xe0) { |
| 678 if( /* handle U+0080..U+07FF inline */ |
| 679 ch >= 0xc2 && |
| 680 ((pSrcLimit - pSrc) >= 2) && |
| 681 (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
| 682 ) { |
| 683 reqLength++; |
| 684 pSrc += 2; |
| 685 continue; |
| 686 } |
| 687 } |
| 688 |
| 689 /* function call for "complicated" and error cases */ |
| 690 ++pSrc; /* continue after the lead byte */ |
| 691 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 692 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
| 693 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 694 return NULL; |
| 695 } |
| 696 reqLength+=UTF_CHAR_LENGTH(ch); |
| 697 } |
| 698 } |
| 699 } |
| 700 |
| 701 reqLength+=(int32_t)(pDest - dest); |
| 702 |
| 703 if(pNumSubstitutions!=NULL) { |
| 704 *pNumSubstitutions=numSubstitutions; |
| 705 } |
| 706 |
| 707 if(pDestLength){ |
| 708 *pDestLength = reqLength; |
| 709 } |
| 710 |
| 711 /* Terminate the buffer */ |
| 712 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
| 713 |
| 714 return dest; |
| 715 } |
| 716 |
| 717 U_CAPI UChar* U_EXPORT2 |
| 718 u_strFromUTF8(UChar *dest, |
| 719 int32_t destCapacity, |
| 720 int32_t *pDestLength, |
| 721 const char* src, |
| 722 int32_t srcLength, |
| 723 UErrorCode *pErrorCode){ |
| 724 return u_strFromUTF8WithSub( |
| 725 dest, destCapacity, pDestLength, |
| 726 src, srcLength, |
| 727 U_SENTINEL, NULL, |
| 728 pErrorCode); |
| 729 } |
| 730 |
| 731 U_CAPI UChar * U_EXPORT2 |
| 732 u_strFromUTF8Lenient(UChar *dest, |
| 733 int32_t destCapacity, |
| 734 int32_t *pDestLength, |
| 735 const char *src, |
| 736 int32_t srcLength, |
| 737 UErrorCode *pErrorCode) { |
| 738 UChar *pDest = dest; |
| 739 UChar32 ch; |
| 740 int32_t reqLength = 0; |
| 741 uint8_t* pSrc = (uint8_t*) src; |
| 742 |
| 743 /* args check */ |
| 744 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
| 745 return NULL; |
| 746 } |
| 747 |
| 748 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 749 (destCapacity<0) || (dest == NULL && destCapacity > 0) |
| 750 ) { |
| 751 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 752 return NULL; |
| 753 } |
| 754 |
| 755 if(srcLength < 0) { |
| 756 /* Transform a NUL-terminated string. */ |
| 757 UChar *pDestLimit = dest+destCapacity; |
| 758 uint8_t t1, t2, t3; /* trail bytes */ |
| 759 |
| 760 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
| 761 if(ch < 0xc0) { |
| 762 /* |
| 763 * ASCII, or a trail byte in lead position which is treated like |
| 764 * a single-byte sequence for better character boundary |
| 765 * resynchronization after illegal sequences. |
| 766 */ |
| 767 *pDest++=(UChar)ch; |
| 768 ++pSrc; |
| 769 continue; |
| 770 } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| 771 if((t1 = pSrc[1]) != 0) { |
| 772 /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| 773 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); |
| 774 pSrc += 2; |
| 775 continue; |
| 776 } |
| 777 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| 778 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { |
| 779 /* no need for (ch & 0xf) because the upper bits are truncat
ed after <<12 in the cast to (UChar) */ |
| 780 /* 0x2080 = (0x80 << 6) + 0x80 */ |
| 781 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); |
| 782 pSrc += 3; |
| 783 continue; |
| 784 } |
| 785 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| 786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3])
!= 0) { |
| 787 pSrc += 4; |
| 788 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0
x80 */ |
| 789 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; |
| 790 *(pDest++) = U16_LEAD(ch); |
| 791 if(pDest < pDestLimit) { |
| 792 *(pDest++) = U16_TRAIL(ch); |
| 793 } else { |
| 794 reqLength = 1; |
| 795 break; |
| 796 } |
| 797 continue; |
| 798 } |
| 799 } |
| 800 |
| 801 /* truncated character at the end */ |
| 802 *pDest++ = 0xfffd; |
| 803 while(*++pSrc != 0) {} |
| 804 break; |
| 805 } |
| 806 |
| 807 /* Pre-flight the rest of the string. */ |
| 808 while((ch = *pSrc) != 0) { |
| 809 if(ch < 0xc0) { |
| 810 /* |
| 811 * ASCII, or a trail byte in lead position which is treated like |
| 812 * a single-byte sequence for better character boundary |
| 813 * resynchronization after illegal sequences. |
| 814 */ |
| 815 ++reqLength; |
| 816 ++pSrc; |
| 817 continue; |
| 818 } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| 819 if(pSrc[1] != 0) { |
| 820 ++reqLength; |
| 821 pSrc += 2; |
| 822 continue; |
| 823 } |
| 824 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| 825 if(pSrc[1] != 0 && pSrc[2] != 0) { |
| 826 ++reqLength; |
| 827 pSrc += 3; |
| 828 continue; |
| 829 } |
| 830 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| 831 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { |
| 832 reqLength += 2; |
| 833 pSrc += 4; |
| 834 continue; |
| 835 } |
| 836 } |
| 837 |
| 838 /* truncated character at the end */ |
| 839 ++reqLength; |
| 840 break; |
| 841 } |
| 842 } else /* srcLength >= 0 */ { |
| 843 const uint8_t *pSrcLimit = pSrc + srcLength; |
| 844 |
| 845 /* |
| 846 * This function requires that if srcLength is given, then it must be |
| 847 * destCapatity >= srcLength so that we need not check for |
| 848 * destination buffer overflow in the loop. |
| 849 */ |
| 850 if(destCapacity < srcLength) { |
| 851 if(pDestLength != NULL) { |
| 852 *pDestLength = srcLength; /* this likely overestimates the true
destLength! */ |
| 853 } |
| 854 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
| 855 return NULL; |
| 856 } |
| 857 |
| 858 if((pSrcLimit - pSrc) >= 4) { |
| 859 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ |
| 860 |
| 861 /* in this loop, we can always access at least 4 bytes, up to pSrc+3
*/ |
| 862 do { |
| 863 ch = *pSrc++; |
| 864 if(ch < 0xc0) { |
| 865 /* |
| 866 * ASCII, or a trail byte in lead position which is treated
like |
| 867 * a single-byte sequence for better character boundary |
| 868 * resynchronization after illegal sequences. |
| 869 */ |
| 870 *pDest++=(UChar)ch; |
| 871 } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| 872 /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| 873 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
| 874 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| 875 /* no need for (ch & 0xf) because the upper bits are truncat
ed after <<12 in the cast to (UChar) */ |
| 876 /* 0x2080 = (0x80 << 6) + 0x80 */ |
| 877 ch = (ch << 12) + (*pSrc++ << 6); |
| 878 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
| 879 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| 880 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0
x80 */ |
| 881 ch = (ch << 18) + (*pSrc++ << 12); |
| 882 ch += *pSrc++ << 6; |
| 883 ch += *pSrc++ - 0x3c82080; |
| 884 *(pDest++) = U16_LEAD(ch); |
| 885 *(pDest++) = U16_TRAIL(ch); |
| 886 } |
| 887 } while(pSrc < pSrcLimit); |
| 888 |
| 889 pSrcLimit += 3; /* restore original pSrcLimit */ |
| 890 } |
| 891 |
| 892 while(pSrc < pSrcLimit) { |
| 893 ch = *pSrc++; |
| 894 if(ch < 0xc0) { |
| 895 /* |
| 896 * ASCII, or a trail byte in lead position which is treated like |
| 897 * a single-byte sequence for better character boundary |
| 898 * resynchronization after illegal sequences. |
| 899 */ |
| 900 *pDest++=(UChar)ch; |
| 901 continue; |
| 902 } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
| 903 if(pSrc < pSrcLimit) { |
| 904 /* 0x3080 = (0xc0 << 6) + 0x80 */ |
| 905 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
| 906 continue; |
| 907 } |
| 908 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
| 909 if((pSrcLimit - pSrc) >= 2) { |
| 910 /* no need for (ch & 0xf) because the upper bits are truncat
ed after <<12 in the cast to (UChar) */ |
| 911 /* 0x2080 = (0x80 << 6) + 0x80 */ |
| 912 ch = (ch << 12) + (*pSrc++ << 6); |
| 913 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
| 914 pSrc += 3; |
| 915 continue; |
| 916 } |
| 917 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
| 918 if((pSrcLimit - pSrc) >= 3) { |
| 919 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0
x80 */ |
| 920 ch = (ch << 18) + (*pSrc++ << 12); |
| 921 ch += *pSrc++ << 6; |
| 922 ch += *pSrc++ - 0x3c82080; |
| 923 *(pDest++) = U16_LEAD(ch); |
| 924 *(pDest++) = U16_TRAIL(ch); |
| 925 pSrc += 4; |
| 926 continue; |
| 927 } |
| 928 } |
| 929 |
| 930 /* truncated character at the end */ |
| 931 *pDest++ = 0xfffd; |
| 932 break; |
| 933 } |
| 934 } |
| 935 |
| 936 reqLength+=(int32_t)(pDest - dest); |
| 937 |
| 938 if(pDestLength){ |
| 939 *pDestLength = reqLength; |
| 940 } |
| 941 |
| 942 /* Terminate the buffer */ |
| 943 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
| 944 |
| 945 return dest; |
| 946 } |
| 947 |
| 948 static U_INLINE uint8_t * |
| 949 _appendUTF8(uint8_t *pDest, UChar32 c) { |
| 950 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating functi
on */ |
| 951 if((c)<=0x7f) { |
| 952 *pDest++=(uint8_t)c; |
| 953 } else if(c<=0x7ff) { |
| 954 *pDest++=(uint8_t)((c>>6)|0xc0); |
| 955 *pDest++=(uint8_t)((c&0x3f)|0x80); |
| 956 } else if(c<=0xffff) { |
| 957 *pDest++=(uint8_t)((c>>12)|0xe0); |
| 958 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); |
| 959 *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
| 960 } else /* if((uint32_t)(c)<=0x10ffff) */ { |
| 961 *pDest++=(uint8_t)(((c)>>18)|0xf0); |
| 962 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); |
| 963 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); |
| 964 *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
| 965 } |
| 966 return pDest; |
| 967 } |
| 968 |
| 969 |
| 970 U_CAPI char* U_EXPORT2 |
| 971 u_strToUTF8WithSub(char *dest, |
| 972 int32_t destCapacity, |
| 973 int32_t *pDestLength, |
| 974 const UChar *pSrc, |
| 975 int32_t srcLength, |
| 976 UChar32 subchar, int32_t *pNumSubstitutions, |
| 977 UErrorCode *pErrorCode){ |
| 978 int32_t reqLength=0; |
| 979 uint32_t ch=0,ch2=0; |
| 980 uint8_t *pDest = (uint8_t *)dest; |
| 981 uint8_t *pDestLimit = pDest + destCapacity; |
| 982 int32_t numSubstitutions; |
| 983 |
| 984 /* args check */ |
| 985 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
| 986 return NULL; |
| 987 } |
| 988 |
| 989 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || |
| 990 (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
| 991 subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| 992 ) { |
| 993 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 994 return NULL; |
| 995 } |
| 996 |
| 997 if(pNumSubstitutions!=NULL) { |
| 998 *pNumSubstitutions=0; |
| 999 } |
| 1000 numSubstitutions=0; |
| 1001 |
| 1002 if(srcLength==-1) { |
| 1003 while((ch=*pSrc)!=0) { |
| 1004 ++pSrc; |
| 1005 if(ch <= 0x7f) { |
| 1006 if(pDest<pDestLimit) { |
| 1007 *pDest++ = (uint8_t)ch; |
| 1008 } else { |
| 1009 reqLength = 1; |
| 1010 break; |
| 1011 } |
| 1012 } else if(ch <= 0x7ff) { |
| 1013 if((pDestLimit - pDest) >= 2) { |
| 1014 *pDest++=(uint8_t)((ch>>6)|0xc0); |
| 1015 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1016 } else { |
| 1017 reqLength = 2; |
| 1018 break; |
| 1019 } |
| 1020 } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| 1021 if((pDestLimit - pDest) >= 3) { |
| 1022 *pDest++=(uint8_t)((ch>>12)|0xe0); |
| 1023 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1024 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1025 } else { |
| 1026 reqLength = 3; |
| 1027 break; |
| 1028 } |
| 1029 } else /* ch is a surrogate */ { |
| 1030 int32_t length; |
| 1031 |
| 1032 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway
*/ |
| 1033 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
| 1034 ++pSrc; |
| 1035 ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
| 1036 } else if(subchar>=0) { |
| 1037 ch=subchar; |
| 1038 ++numSubstitutions; |
| 1039 } else { |
| 1040 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| 1041 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1042 return NULL; |
| 1043 } |
| 1044 |
| 1045 length = U8_LENGTH(ch); |
| 1046 if((pDestLimit - pDest) >= length) { |
| 1047 /* convert and append*/ |
| 1048 pDest=_appendUTF8(pDest, ch); |
| 1049 } else { |
| 1050 reqLength = length; |
| 1051 break; |
| 1052 } |
| 1053 } |
| 1054 } |
| 1055 while((ch=*pSrc++)!=0) { |
| 1056 if(ch<=0x7f) { |
| 1057 ++reqLength; |
| 1058 } else if(ch<=0x7ff) { |
| 1059 reqLength+=2; |
| 1060 } else if(!UTF_IS_SURROGATE(ch)) { |
| 1061 reqLength+=3; |
| 1062 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
| 1063 ++pSrc; |
| 1064 reqLength+=4; |
| 1065 } else if(subchar>=0) { |
| 1066 reqLength+=U8_LENGTH(subchar); |
| 1067 ++numSubstitutions; |
| 1068 } else { |
| 1069 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| 1070 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1071 return NULL; |
| 1072 } |
| 1073 } |
| 1074 } else { |
| 1075 const UChar *pSrcLimit = pSrc+srcLength; |
| 1076 int32_t count; |
| 1077 |
| 1078 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
| 1079 for(;;) { |
| 1080 /* |
| 1081 * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| 1082 * bytes and one UChar, for most characters. |
| 1083 * For supplementary code points (4 & 2), which are rare, |
| 1084 * there is an additional adjustment. |
| 1085 */ |
| 1086 count = (int32_t)((pDestLimit - pDest) / 3); |
| 1087 srcLength = (int32_t)(pSrcLimit - pSrc); |
| 1088 if(count > srcLength) { |
| 1089 count = srcLength; /* min(remaining dest/3, remaining src) */ |
| 1090 } |
| 1091 if(count < 3) { |
| 1092 /* |
| 1093 * Too much overhead if we get near the end of the string, |
| 1094 * continue with the next loop. |
| 1095 */ |
| 1096 break; |
| 1097 } |
| 1098 do { |
| 1099 ch=*pSrc++; |
| 1100 if(ch <= 0x7f) { |
| 1101 *pDest++ = (uint8_t)ch; |
| 1102 } else if(ch <= 0x7ff) { |
| 1103 *pDest++=(uint8_t)((ch>>6)|0xc0); |
| 1104 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1105 } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| 1106 *pDest++=(uint8_t)((ch>>12)|0xe0); |
| 1107 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1108 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1109 } else /* ch is a surrogate */ { |
| 1110 /* |
| 1111 * We will read two UChars and probably output four bytes, |
| 1112 * which we didn't account for with computing count, |
| 1113 * so we adjust it here. |
| 1114 */ |
| 1115 if(--count == 0) { |
| 1116 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ |
| 1117 break; /* recompute count */ |
| 1118 } |
| 1119 |
| 1120 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { |
| 1121 ++pSrc; |
| 1122 ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
| 1123 |
| 1124 /* writing 4 bytes per 2 UChars is ok */ |
| 1125 *pDest++=(uint8_t)((ch>>18)|0xf0); |
| 1126 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); |
| 1127 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1128 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1129 } else { |
| 1130 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| 1131 if(subchar>=0) { |
| 1132 ch=subchar; |
| 1133 ++numSubstitutions; |
| 1134 } else { |
| 1135 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1136 return NULL; |
| 1137 } |
| 1138 |
| 1139 /* convert and append*/ |
| 1140 pDest=_appendUTF8(pDest, ch); |
| 1141 } |
| 1142 } |
| 1143 } while(--count > 0); |
| 1144 } |
| 1145 |
| 1146 while(pSrc<pSrcLimit) { |
| 1147 ch=*pSrc++; |
| 1148 if(ch <= 0x7f) { |
| 1149 if(pDest<pDestLimit) { |
| 1150 *pDest++ = (uint8_t)ch; |
| 1151 } else { |
| 1152 reqLength = 1; |
| 1153 break; |
| 1154 } |
| 1155 } else if(ch <= 0x7ff) { |
| 1156 if((pDestLimit - pDest) >= 2) { |
| 1157 *pDest++=(uint8_t)((ch>>6)|0xc0); |
| 1158 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1159 } else { |
| 1160 reqLength = 2; |
| 1161 break; |
| 1162 } |
| 1163 } else if(ch <= 0xd7ff || ch >= 0xe000) { |
| 1164 if((pDestLimit - pDest) >= 3) { |
| 1165 *pDest++=(uint8_t)((ch>>12)|0xe0); |
| 1166 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1167 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1168 } else { |
| 1169 reqLength = 3; |
| 1170 break; |
| 1171 } |
| 1172 } else /* ch is a surrogate */ { |
| 1173 int32_t length; |
| 1174 |
| 1175 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(
ch2=*pSrc)) { |
| 1176 ++pSrc; |
| 1177 ch=UTF16_GET_PAIR_VALUE(ch, ch2); |
| 1178 } else if(subchar>=0) { |
| 1179 ch=subchar; |
| 1180 ++numSubstitutions; |
| 1181 } else { |
| 1182 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| 1183 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1184 return NULL; |
| 1185 } |
| 1186 |
| 1187 length = U8_LENGTH(ch); |
| 1188 if((pDestLimit - pDest) >= length) { |
| 1189 /* convert and append*/ |
| 1190 pDest=_appendUTF8(pDest, ch); |
| 1191 } else { |
| 1192 reqLength = length; |
| 1193 break; |
| 1194 } |
| 1195 } |
| 1196 } |
| 1197 while(pSrc<pSrcLimit) { |
| 1198 ch=*pSrc++; |
| 1199 if(ch<=0x7f) { |
| 1200 ++reqLength; |
| 1201 } else if(ch<=0x7ff) { |
| 1202 reqLength+=2; |
| 1203 } else if(!UTF_IS_SURROGATE(ch)) { |
| 1204 reqLength+=3; |
| 1205 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRA
IL(ch2=*pSrc)) { |
| 1206 ++pSrc; |
| 1207 reqLength+=4; |
| 1208 } else if(subchar>=0) { |
| 1209 reqLength+=U8_LENGTH(subchar); |
| 1210 ++numSubstitutions; |
| 1211 } else { |
| 1212 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
| 1213 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1214 return NULL; |
| 1215 } |
| 1216 } |
| 1217 } |
| 1218 |
| 1219 reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
| 1220 |
| 1221 if(pNumSubstitutions!=NULL) { |
| 1222 *pNumSubstitutions=numSubstitutions; |
| 1223 } |
| 1224 |
| 1225 if(pDestLength){ |
| 1226 *pDestLength = reqLength; |
| 1227 } |
| 1228 |
| 1229 /* Terminate the buffer */ |
| 1230 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
| 1231 return dest; |
| 1232 } |
| 1233 |
| 1234 U_CAPI char* U_EXPORT2 |
| 1235 u_strToUTF8(char *dest, |
| 1236 int32_t destCapacity, |
| 1237 int32_t *pDestLength, |
| 1238 const UChar *pSrc, |
| 1239 int32_t srcLength, |
| 1240 UErrorCode *pErrorCode){ |
| 1241 return u_strToUTF8WithSub( |
| 1242 dest, destCapacity, pDestLength, |
| 1243 pSrc, srcLength, |
| 1244 U_SENTINEL, NULL, |
| 1245 pErrorCode); |
| 1246 } |
| 1247 |
| 1248 U_CAPI UChar* U_EXPORT2 |
| 1249 u_strFromJavaModifiedUTF8WithSub( |
| 1250 UChar *dest, |
| 1251 int32_t destCapacity, |
| 1252 int32_t *pDestLength, |
| 1253 const char *src, |
| 1254 int32_t srcLength, |
| 1255 UChar32 subchar, int32_t *pNumSubstitutions, |
| 1256 UErrorCode *pErrorCode) { |
| 1257 UChar *pDest = dest; |
| 1258 UChar *pDestLimit = dest+destCapacity; |
| 1259 UChar32 ch; |
| 1260 int32_t reqLength = 0; |
| 1261 const uint8_t* pSrc = (const uint8_t*) src; |
| 1262 const uint8_t *pSrcLimit; |
| 1263 int32_t count; |
| 1264 uint8_t t1, t2; /* trail bytes */ |
| 1265 int32_t numSubstitutions; |
| 1266 |
| 1267 /* args check */ |
| 1268 if(U_FAILURE(*pErrorCode)){ |
| 1269 return NULL; |
| 1270 } |
| 1271 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 1272 (dest==NULL && destCapacity!=0) || destCapacity<0 || |
| 1273 subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
| 1274 ) { |
| 1275 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 1276 return NULL; |
| 1277 } |
| 1278 |
| 1279 if(pNumSubstitutions!=NULL) { |
| 1280 *pNumSubstitutions=0; |
| 1281 } |
| 1282 numSubstitutions=0; |
| 1283 |
| 1284 if(srcLength < 0) { |
| 1285 /* |
| 1286 * Transform a NUL-terminated ASCII string. |
| 1287 * Handle non-ASCII strings with slower code. |
| 1288 */ |
| 1289 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { |
| 1290 *pDest++=(UChar)ch; |
| 1291 ++pSrc; |
| 1292 } |
| 1293 if(ch == 0) { |
| 1294 reqLength=(int32_t)(pDest - dest); |
| 1295 if(pDestLength) { |
| 1296 *pDestLength = reqLength; |
| 1297 } |
| 1298 |
| 1299 /* Terminate the buffer */ |
| 1300 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| 1301 return dest; |
| 1302 } |
| 1303 srcLength = uprv_strlen((const char *)pSrc); |
| 1304 } |
| 1305 |
| 1306 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
| 1307 pSrcLimit = pSrc + srcLength; |
| 1308 for(;;) { |
| 1309 count = (int32_t)(pDestLimit - pDest); |
| 1310 srcLength = (int32_t)(pSrcLimit - pSrc); |
| 1311 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { |
| 1312 /* fast ASCII loop */ |
| 1313 const uint8_t *prevSrc = pSrc; |
| 1314 int32_t delta; |
| 1315 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { |
| 1316 *pDest++=(UChar)ch; |
| 1317 ++pSrc; |
| 1318 } |
| 1319 delta = (int32_t)(pSrc - prevSrc); |
| 1320 count -= delta; |
| 1321 srcLength -= delta; |
| 1322 } |
| 1323 /* |
| 1324 * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| 1325 * bytes and one UChar. |
| 1326 */ |
| 1327 srcLength /= 3; |
| 1328 if(count > srcLength) { |
| 1329 count = srcLength; /* min(remaining dest, remaining src/3) */ |
| 1330 } |
| 1331 if(count < 3) { |
| 1332 /* |
| 1333 * Too much overhead if we get near the end of the string, |
| 1334 * continue with the next loop. |
| 1335 */ |
| 1336 break; |
| 1337 } |
| 1338 do { |
| 1339 ch = *pSrc; |
| 1340 if(ch <= 0x7f){ |
| 1341 *pDest++=(UChar)ch; |
| 1342 ++pSrc; |
| 1343 } else { |
| 1344 if(ch >= 0xe0) { |
| 1345 if( /* handle U+0000..U+FFFF inline */ |
| 1346 ch <= 0xef && |
| 1347 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
| 1348 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
| 1349 ) { |
| 1350 /* no need for (ch & 0xf) because the upper bits are tru
ncated after <<12 in the cast to (UChar) */ |
| 1351 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| 1352 pSrc += 3; |
| 1353 continue; |
| 1354 } |
| 1355 } else { |
| 1356 if( /* handle U+0000..U+07FF inline */ |
| 1357 ch >= 0xc0 && |
| 1358 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
| 1359 ) { |
| 1360 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| 1361 pSrc += 2; |
| 1362 continue; |
| 1363 } |
| 1364 } |
| 1365 |
| 1366 if(subchar < 0) { |
| 1367 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1368 return NULL; |
| 1369 } else if(subchar > 0xffff && --count == 0) { |
| 1370 /* |
| 1371 * We need to write two UChars, adjusted count for that, |
| 1372 * and ran out of space. |
| 1373 */ |
| 1374 break; |
| 1375 } else { |
| 1376 /* function call for error cases */ |
| 1377 ++pSrc; /* continue after the lead byte */ |
| 1378 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 1379 ++numSubstitutions; |
| 1380 if(subchar<=0xFFFF) { |
| 1381 *(pDest++)=(UChar)subchar; |
| 1382 } else { |
| 1383 *(pDest++)=U16_LEAD(subchar); |
| 1384 *(pDest++)=U16_TRAIL(subchar); |
| 1385 } |
| 1386 } |
| 1387 } |
| 1388 } while(--count > 0); |
| 1389 } |
| 1390 |
| 1391 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
| 1392 ch = *pSrc; |
| 1393 if(ch <= 0x7f){ |
| 1394 *pDest++=(UChar)ch; |
| 1395 ++pSrc; |
| 1396 } else { |
| 1397 if(ch >= 0xe0) { |
| 1398 if( /* handle U+0000..U+FFFF inline */ |
| 1399 ch <= 0xef && |
| 1400 ((pSrcLimit - pSrc) >= 3) && |
| 1401 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
| 1402 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
| 1403 ) { |
| 1404 /* no need for (ch & 0xf) because the upper bits are truncat
ed after <<12 in the cast to (UChar) */ |
| 1405 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
| 1406 pSrc += 3; |
| 1407 continue; |
| 1408 } |
| 1409 } else { |
| 1410 if( /* handle U+0000..U+07FF inline */ |
| 1411 ch >= 0xc0 && |
| 1412 ((pSrcLimit - pSrc) >= 2) && |
| 1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
| 1414 ) { |
| 1415 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
| 1416 pSrc += 2; |
| 1417 continue; |
| 1418 } |
| 1419 } |
| 1420 |
| 1421 if(subchar < 0) { |
| 1422 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1423 return NULL; |
| 1424 } else { |
| 1425 /* function call for error cases */ |
| 1426 ++pSrc; /* continue after the lead byte */ |
| 1427 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 1428 ++numSubstitutions; |
| 1429 if(subchar<=0xFFFF) { |
| 1430 *(pDest++)=(UChar)subchar; |
| 1431 } else { |
| 1432 *(pDest++)=U16_LEAD(subchar); |
| 1433 if(pDest<pDestLimit) { |
| 1434 *(pDest++)=U16_TRAIL(subchar); |
| 1435 } else { |
| 1436 reqLength++; |
| 1437 break; |
| 1438 } |
| 1439 } |
| 1440 } |
| 1441 } |
| 1442 } |
| 1443 |
| 1444 /* do not fill the dest buffer just count the UChars needed */ |
| 1445 while(pSrc < pSrcLimit){ |
| 1446 ch = *pSrc; |
| 1447 if(ch <= 0x7f) { |
| 1448 reqLength++; |
| 1449 ++pSrc; |
| 1450 } else { |
| 1451 if(ch >= 0xe0) { |
| 1452 if( /* handle U+0000..U+FFFF inline */ |
| 1453 ch <= 0xef && |
| 1454 ((pSrcLimit - pSrc) >= 3) && |
| 1455 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
| 1456 (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
| 1457 ) { |
| 1458 reqLength++; |
| 1459 pSrc += 3; |
| 1460 continue; |
| 1461 } |
| 1462 } else { |
| 1463 if( /* handle U+0000..U+07FF inline */ |
| 1464 ch >= 0xc0 && |
| 1465 ((pSrcLimit - pSrc) >= 2) && |
| 1466 (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
| 1467 ) { |
| 1468 reqLength++; |
| 1469 pSrc += 2; |
| 1470 continue; |
| 1471 } |
| 1472 } |
| 1473 |
| 1474 if(subchar < 0) { |
| 1475 *pErrorCode = U_INVALID_CHAR_FOUND; |
| 1476 return NULL; |
| 1477 } else { |
| 1478 /* function call for error cases */ |
| 1479 ++pSrc; /* continue after the lead byte */ |
| 1480 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
| 1481 ++numSubstitutions; |
| 1482 reqLength+=U16_LENGTH(ch); |
| 1483 } |
| 1484 } |
| 1485 } |
| 1486 |
| 1487 if(pNumSubstitutions!=NULL) { |
| 1488 *pNumSubstitutions=numSubstitutions; |
| 1489 } |
| 1490 |
| 1491 reqLength+=(int32_t)(pDest - dest); |
| 1492 if(pDestLength) { |
| 1493 *pDestLength = reqLength; |
| 1494 } |
| 1495 |
| 1496 /* Terminate the buffer */ |
| 1497 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
| 1498 return dest; |
| 1499 } |
| 1500 |
| 1501 U_CAPI char* U_EXPORT2 |
| 1502 u_strToJavaModifiedUTF8( |
| 1503 char *dest, |
| 1504 int32_t destCapacity, |
| 1505 int32_t *pDestLength, |
| 1506 const UChar *src, |
| 1507 int32_t srcLength, |
| 1508 UErrorCode *pErrorCode) { |
| 1509 int32_t reqLength=0; |
| 1510 uint32_t ch=0; |
| 1511 uint8_t *pDest = (uint8_t *)dest; |
| 1512 uint8_t *pDestLimit = pDest + destCapacity; |
| 1513 const UChar *pSrcLimit; |
| 1514 int32_t count; |
| 1515 |
| 1516 /* args check */ |
| 1517 if(U_FAILURE(*pErrorCode)){ |
| 1518 return NULL; |
| 1519 } |
| 1520 if( (src==NULL && srcLength!=0) || srcLength < -1 || |
| 1521 (dest==NULL && destCapacity!=0) || destCapacity<0 |
| 1522 ) { |
| 1523 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| 1524 return NULL; |
| 1525 } |
| 1526 |
| 1527 if(srcLength==-1) { |
| 1528 /* Convert NUL-terminated ASCII, then find the string length. */ |
| 1529 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { |
| 1530 *pDest++ = (uint8_t)ch; |
| 1531 ++src; |
| 1532 } |
| 1533 if(ch == 0) { |
| 1534 reqLength=(int32_t)(pDest - (uint8_t *)dest); |
| 1535 if(pDestLength) { |
| 1536 *pDestLength = reqLength; |
| 1537 } |
| 1538 |
| 1539 /* Terminate the buffer */ |
| 1540 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
| 1541 return dest; |
| 1542 } |
| 1543 srcLength = u_strlen(src); |
| 1544 } |
| 1545 |
| 1546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
| 1547 pSrcLimit = src+srcLength; |
| 1548 for(;;) { |
| 1549 count = (int32_t)(pDestLimit - pDest); |
| 1550 srcLength = (int32_t)(pSrcLimit - src); |
| 1551 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { |
| 1552 /* fast ASCII loop */ |
| 1553 const UChar *prevSrc = src; |
| 1554 int32_t delta; |
| 1555 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { |
| 1556 *pDest++=(uint8_t)ch; |
| 1557 ++src; |
| 1558 } |
| 1559 delta = (int32_t)(src - prevSrc); |
| 1560 count -= delta; |
| 1561 srcLength -= delta; |
| 1562 } |
| 1563 /* |
| 1564 * Each iteration of the inner loop progresses by at most 3 UTF-8 |
| 1565 * bytes and one UChar. |
| 1566 */ |
| 1567 count /= 3; |
| 1568 if(count > srcLength) { |
| 1569 count = srcLength; /* min(remaining dest/3, remaining src) */ |
| 1570 } |
| 1571 if(count < 3) { |
| 1572 /* |
| 1573 * Too much overhead if we get near the end of the string, |
| 1574 * continue with the next loop. |
| 1575 */ |
| 1576 break; |
| 1577 } |
| 1578 do { |
| 1579 ch=*src++; |
| 1580 if(ch <= 0x7f && ch != 0) { |
| 1581 *pDest++ = (uint8_t)ch; |
| 1582 } else if(ch <= 0x7ff) { |
| 1583 *pDest++=(uint8_t)((ch>>6)|0xc0); |
| 1584 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1585 } else { |
| 1586 *pDest++=(uint8_t)((ch>>12)|0xe0); |
| 1587 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1588 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1589 } |
| 1590 } while(--count > 0); |
| 1591 } |
| 1592 |
| 1593 while(src<pSrcLimit) { |
| 1594 ch=*src++; |
| 1595 if(ch <= 0x7f && ch != 0) { |
| 1596 if(pDest<pDestLimit) { |
| 1597 *pDest++ = (uint8_t)ch; |
| 1598 } else { |
| 1599 reqLength = 1; |
| 1600 break; |
| 1601 } |
| 1602 } else if(ch <= 0x7ff) { |
| 1603 if((pDestLimit - pDest) >= 2) { |
| 1604 *pDest++=(uint8_t)((ch>>6)|0xc0); |
| 1605 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1606 } else { |
| 1607 reqLength = 2; |
| 1608 break; |
| 1609 } |
| 1610 } else { |
| 1611 if((pDestLimit - pDest) >= 3) { |
| 1612 *pDest++=(uint8_t)((ch>>12)|0xe0); |
| 1613 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
| 1614 *pDest++=(uint8_t)((ch&0x3f)|0x80); |
| 1615 } else { |
| 1616 reqLength = 3; |
| 1617 break; |
| 1618 } |
| 1619 } |
| 1620 } |
| 1621 while(src<pSrcLimit) { |
| 1622 ch=*src++; |
| 1623 if(ch <= 0x7f && ch != 0) { |
| 1624 ++reqLength; |
| 1625 } else if(ch<=0x7ff) { |
| 1626 reqLength+=2; |
| 1627 } else { |
| 1628 reqLength+=3; |
| 1629 } |
| 1630 } |
| 1631 |
| 1632 reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
| 1633 if(pDestLength){ |
| 1634 *pDestLength = reqLength; |
| 1635 } |
| 1636 |
| 1637 /* Terminate the buffer */ |
| 1638 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
| 1639 return dest; |
| 1640 } |
OLD | NEW |