OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2002-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: ucnv_u16.c |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2002jul01 |
| 12 * created by: Markus W. Scherer |
| 13 * |
| 14 * UTF-16 converter implementation. Used to be in ucnv_utf.c. |
| 15 */ |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 |
| 19 #if !UCONFIG_NO_CONVERSION |
| 20 |
| 21 #include "unicode/ucnv.h" |
| 22 #include "ucnv_bld.h" |
| 23 #include "ucnv_cnv.h" |
| 24 #include "cmemory.h" |
| 25 |
| 26 enum { |
| 27 UCNV_NEED_TO_WRITE_BOM=1 |
| 28 }; |
| 29 |
| 30 /* |
| 31 * The UTF-16 toUnicode implementation is also used for the Java-specific |
| 32 * "with BOM" variants of UTF-16BE and UTF-16LE. |
| 33 */ |
| 34 static void |
| 35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 36 UErrorCode *pErrorCode); |
| 37 |
| 38 /* UTF-16BE ----------------------------------------------------------------- */ |
| 39 |
| 40 #if U_IS_BIG_ENDIAN |
| 41 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets |
| 42 #else |
| 43 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets |
| 44 #endif |
| 45 |
| 46 |
| 47 static void |
| 48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
| 49 UErrorCode *pErrorCode) { |
| 50 UConverter *cnv; |
| 51 const UChar *source; |
| 52 char *target; |
| 53 int32_t *offsets; |
| 54 |
| 55 uint32_t targetCapacity, length, sourceIndex; |
| 56 UChar c, trail; |
| 57 char overflow[4]; |
| 58 |
| 59 source=pArgs->source; |
| 60 length=(int32_t)(pArgs->sourceLimit-source); |
| 61 if(length<=0) { |
| 62 /* no input, nothing to do */ |
| 63 return; |
| 64 } |
| 65 |
| 66 cnv=pArgs->converter; |
| 67 |
| 68 /* write the BOM if necessary */ |
| 69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
| 70 static const char bom[]={ (char)0xfe, (char)0xff }; |
| 71 ucnv_fromUWriteBytes(cnv, |
| 72 bom, 2, |
| 73 &pArgs->target, pArgs->targetLimit, |
| 74 &pArgs->offsets, -1, |
| 75 pErrorCode); |
| 76 cnv->fromUnicodeStatus=0; |
| 77 } |
| 78 |
| 79 target=pArgs->target; |
| 80 if(target >= pArgs->targetLimit) { |
| 81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 82 return; |
| 83 } |
| 84 |
| 85 targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
| 86 offsets=pArgs->offsets; |
| 87 sourceIndex=0; |
| 88 |
| 89 /* c!=0 indicates in several places outside the main loops that a surrogate
was found */ |
| 90 |
| 91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCa
pacity>=4) { |
| 92 /* the last buffer ended with a lead surrogate, output the surrogate pai
r */ |
| 93 ++source; |
| 94 --length; |
| 95 target[0]=(uint8_t)(c>>8); |
| 96 target[1]=(uint8_t)c; |
| 97 target[2]=(uint8_t)(trail>>8); |
| 98 target[3]=(uint8_t)trail; |
| 99 target+=4; |
| 100 targetCapacity-=4; |
| 101 if(offsets!=NULL) { |
| 102 *offsets++=-1; |
| 103 *offsets++=-1; |
| 104 *offsets++=-1; |
| 105 *offsets++=-1; |
| 106 } |
| 107 sourceIndex=1; |
| 108 cnv->fromUChar32=c=0; |
| 109 } |
| 110 |
| 111 if(c==0) { |
| 112 /* copy an even number of bytes for complete UChars */ |
| 113 uint32_t count=2*length; |
| 114 if(count>targetCapacity) { |
| 115 count=targetCapacity&~1; |
| 116 } |
| 117 /* count is even */ |
| 118 targetCapacity-=count; |
| 119 count>>=1; |
| 120 length-=count; |
| 121 |
| 122 if(offsets==NULL) { |
| 123 while(count>0) { |
| 124 c=*source++; |
| 125 if(U16_IS_SINGLE(c)) { |
| 126 target[0]=(uint8_t)(c>>8); |
| 127 target[1]=(uint8_t)c; |
| 128 target+=2; |
| 129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t
rail=*source)) { |
| 130 ++source; |
| 131 --count; |
| 132 target[0]=(uint8_t)(c>>8); |
| 133 target[1]=(uint8_t)c; |
| 134 target[2]=(uint8_t)(trail>>8); |
| 135 target[3]=(uint8_t)trail; |
| 136 target+=4; |
| 137 } else { |
| 138 break; |
| 139 } |
| 140 --count; |
| 141 } |
| 142 } else { |
| 143 while(count>0) { |
| 144 c=*source++; |
| 145 if(U16_IS_SINGLE(c)) { |
| 146 target[0]=(uint8_t)(c>>8); |
| 147 target[1]=(uint8_t)c; |
| 148 target+=2; |
| 149 *offsets++=sourceIndex; |
| 150 *offsets++=sourceIndex++; |
| 151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t
rail=*source)) { |
| 152 ++source; |
| 153 --count; |
| 154 target[0]=(uint8_t)(c>>8); |
| 155 target[1]=(uint8_t)c; |
| 156 target[2]=(uint8_t)(trail>>8); |
| 157 target[3]=(uint8_t)trail; |
| 158 target+=4; |
| 159 *offsets++=sourceIndex; |
| 160 *offsets++=sourceIndex; |
| 161 *offsets++=sourceIndex; |
| 162 *offsets++=sourceIndex; |
| 163 sourceIndex+=2; |
| 164 } else { |
| 165 break; |
| 166 } |
| 167 --count; |
| 168 } |
| 169 } |
| 170 |
| 171 if(count==0) { |
| 172 /* done with the loop for complete UChars */ |
| 173 if(length>0 && targetCapacity>0) { |
| 174 /* |
| 175 * there is more input and some target capacity - |
| 176 * it must be targetCapacity==1 because otherwise |
| 177 * the above would have copied more; |
| 178 * prepare for overflow output |
| 179 */ |
| 180 if(U16_IS_SINGLE(c=*source++)) { |
| 181 overflow[0]=(char)(c>>8); |
| 182 overflow[1]=(char)c; |
| 183 length=2; /* 2 bytes to output */ |
| 184 c=0; |
| 185 /* } else { keep c for surrogate handling, length will be set th
ere */ |
| 186 } |
| 187 } else { |
| 188 length=0; |
| 189 c=0; |
| 190 } |
| 191 } else { |
| 192 /* keep c for surrogate handling, length will be set there */ |
| 193 targetCapacity+=2*count; |
| 194 } |
| 195 } else { |
| 196 length=0; /* from here on, length counts the bytes in overflow[] */ |
| 197 } |
| 198 |
| 199 if(c!=0) { |
| 200 /* |
| 201 * c is a surrogate, and |
| 202 * - source or target too short |
| 203 * - or the surrogate is unmatched |
| 204 */ |
| 205 length=0; |
| 206 if(U16_IS_SURROGATE_LEAD(c)) { |
| 207 if(source<pArgs->sourceLimit) { |
| 208 if(U16_IS_TRAIL(trail=*source)) { |
| 209 /* output the surrogate pair, will overflow (see conditions
comment above) */ |
| 210 ++source; |
| 211 overflow[0]=(char)(c>>8); |
| 212 overflow[1]=(char)c; |
| 213 overflow[2]=(char)(trail>>8); |
| 214 overflow[3]=(char)trail; |
| 215 length=4; /* 4 bytes to output */ |
| 216 c=0; |
| 217 } else { |
| 218 /* unmatched lead surrogate */ |
| 219 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 220 } |
| 221 } else { |
| 222 /* see if the trail surrogate is in the next buffer */ |
| 223 } |
| 224 } else { |
| 225 /* unmatched trail surrogate */ |
| 226 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 227 } |
| 228 cnv->fromUChar32=c; |
| 229 } |
| 230 |
| 231 if(length>0) { |
| 232 /* output length bytes with overflow (length>targetCapacity>0) */ |
| 233 ucnv_fromUWriteBytes(cnv, |
| 234 overflow, length, |
| 235 (char **)&target, pArgs->targetLimit, |
| 236 &offsets, sourceIndex, |
| 237 pErrorCode); |
| 238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
| 239 } |
| 240 |
| 241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0)
{ |
| 242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 243 } |
| 244 |
| 245 /* write back the updated pointers */ |
| 246 pArgs->source=source; |
| 247 pArgs->target=(char *)target; |
| 248 pArgs->offsets=offsets; |
| 249 } |
| 250 |
| 251 static void |
| 252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 253 UErrorCode *pErrorCode) { |
| 254 UConverter *cnv; |
| 255 const uint8_t *source; |
| 256 UChar *target; |
| 257 int32_t *offsets; |
| 258 |
| 259 uint32_t targetCapacity, length, count, sourceIndex; |
| 260 UChar c, trail; |
| 261 |
| 262 if(pArgs->converter->mode<8) { |
| 263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
| 264 return; |
| 265 } |
| 266 |
| 267 cnv=pArgs->converter; |
| 268 source=(const uint8_t *)pArgs->source; |
| 269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
| 270 if(length<=0 && cnv->toUnicodeStatus==0) { |
| 271 /* no input, nothing to do */ |
| 272 return; |
| 273 } |
| 274 |
| 275 target=pArgs->target; |
| 276 if(target >= pArgs->targetLimit) { |
| 277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 278 return; |
| 279 } |
| 280 |
| 281 targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
| 282 offsets=pArgs->offsets; |
| 283 sourceIndex=0; |
| 284 c=0; |
| 285 |
| 286 /* complete a partial UChar or pair from the last call */ |
| 287 if(cnv->toUnicodeStatus!=0) { |
| 288 /* |
| 289 * special case: single byte from a previous buffer, |
| 290 * where the byte turned out not to belong to a trail surrogate |
| 291 * and the preceding, unmatched lead surrogate was put into toUBytes[] |
| 292 * for error handling |
| 293 */ |
| 294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
| 295 cnv->toULength=1; |
| 296 cnv->toUnicodeStatus=0; |
| 297 } |
| 298 if((count=cnv->toULength)!=0) { |
| 299 uint8_t *p=cnv->toUBytes; |
| 300 do { |
| 301 p[count++]=*source++; |
| 302 ++sourceIndex; |
| 303 --length; |
| 304 if(count==2) { |
| 305 c=((UChar)p[0]<<8)|p[1]; |
| 306 if(U16_IS_SINGLE(c)) { |
| 307 /* output the BMP code point */ |
| 308 *target++=c; |
| 309 if(offsets!=NULL) { |
| 310 *offsets++=-1; |
| 311 } |
| 312 --targetCapacity; |
| 313 count=0; |
| 314 c=0; |
| 315 break; |
| 316 } else if(U16_IS_SURROGATE_LEAD(c)) { |
| 317 /* continue collecting bytes for the trail surrogate */ |
| 318 c=0; /* avoid unnecessary surrogate handling below */ |
| 319 } else { |
| 320 /* fall through to error handling for an unmatched trail sur
rogate */ |
| 321 break; |
| 322 } |
| 323 } else if(count==4) { |
| 324 c=((UChar)p[0]<<8)|p[1]; |
| 325 trail=((UChar)p[2]<<8)|p[3]; |
| 326 if(U16_IS_TRAIL(trail)) { |
| 327 /* output the surrogate pair */ |
| 328 *target++=c; |
| 329 if(targetCapacity>=2) { |
| 330 *target++=trail; |
| 331 if(offsets!=NULL) { |
| 332 *offsets++=-1; |
| 333 *offsets++=-1; |
| 334 } |
| 335 targetCapacity-=2; |
| 336 } else /* targetCapacity==1 */ { |
| 337 targetCapacity=0; |
| 338 cnv->UCharErrorBuffer[0]=trail; |
| 339 cnv->UCharErrorBufferLength=1; |
| 340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 341 } |
| 342 count=0; |
| 343 c=0; |
| 344 break; |
| 345 } else { |
| 346 /* unmatched lead surrogate, handle here for consistent toUB
ytes[] */ |
| 347 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 348 |
| 349 /* back out reading the code unit after it */ |
| 350 if(((const uint8_t *)pArgs->source-source)>=2) { |
| 351 source-=2; |
| 352 } else { |
| 353 /* |
| 354 * if the trail unit's first byte was in a previous buff
er, then |
| 355 * we need to put it into a special place because toUByt
es[] will be |
| 356 * used for the lead unit's bytes |
| 357 */ |
| 358 cnv->toUnicodeStatus=0x100|p[2]; |
| 359 --source; |
| 360 } |
| 361 cnv->toULength=2; |
| 362 |
| 363 /* write back the updated pointers */ |
| 364 pArgs->source=(const char *)source; |
| 365 pArgs->target=target; |
| 366 pArgs->offsets=offsets; |
| 367 return; |
| 368 } |
| 369 } |
| 370 } while(length>0); |
| 371 cnv->toULength=(int8_t)count; |
| 372 } |
| 373 |
| 374 /* copy an even number of bytes for complete UChars */ |
| 375 count=2*targetCapacity; |
| 376 if(count>length) { |
| 377 count=length&~1; |
| 378 } |
| 379 if(c==0 && count>0) { |
| 380 length-=count; |
| 381 count>>=1; |
| 382 targetCapacity-=count; |
| 383 if(offsets==NULL) { |
| 384 do { |
| 385 c=((UChar)source[0]<<8)|source[1]; |
| 386 source+=2; |
| 387 if(U16_IS_SINGLE(c)) { |
| 388 *target++=c; |
| 389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
| 390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
| 391 ) { |
| 392 source+=2; |
| 393 --count; |
| 394 *target++=c; |
| 395 *target++=trail; |
| 396 } else { |
| 397 break; |
| 398 } |
| 399 } while(--count>0); |
| 400 } else { |
| 401 do { |
| 402 c=((UChar)source[0]<<8)|source[1]; |
| 403 source+=2; |
| 404 if(U16_IS_SINGLE(c)) { |
| 405 *target++=c; |
| 406 *offsets++=sourceIndex; |
| 407 sourceIndex+=2; |
| 408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
| 409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
| 410 ) { |
| 411 source+=2; |
| 412 --count; |
| 413 *target++=c; |
| 414 *target++=trail; |
| 415 *offsets++=sourceIndex; |
| 416 *offsets++=sourceIndex; |
| 417 sourceIndex+=4; |
| 418 } else { |
| 419 break; |
| 420 } |
| 421 } while(--count>0); |
| 422 } |
| 423 |
| 424 if(count==0) { |
| 425 /* done with the loop for complete UChars */ |
| 426 c=0; |
| 427 } else { |
| 428 /* keep c for surrogate handling, trail will be set there */ |
| 429 length+=2*(count-1); /* one more byte pair was consumed than count d
ecremented */ |
| 430 targetCapacity+=count; |
| 431 } |
| 432 } |
| 433 |
| 434 if(c!=0) { |
| 435 /* |
| 436 * c is a surrogate, and |
| 437 * - source or target too short |
| 438 * - or the surrogate is unmatched |
| 439 */ |
| 440 cnv->toUBytes[0]=(uint8_t)(c>>8); |
| 441 cnv->toUBytes[1]=(uint8_t)c; |
| 442 cnv->toULength=2; |
| 443 |
| 444 if(U16_IS_SURROGATE_LEAD(c)) { |
| 445 if(length>=2) { |
| 446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { |
| 447 /* output the surrogate pair, will overflow (see conditions
comment above) */ |
| 448 source+=2; |
| 449 length-=2; |
| 450 *target++=c; |
| 451 if(offsets!=NULL) { |
| 452 *offsets++=sourceIndex; |
| 453 } |
| 454 cnv->UCharErrorBuffer[0]=trail; |
| 455 cnv->UCharErrorBufferLength=1; |
| 456 cnv->toULength=0; |
| 457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 458 } else { |
| 459 /* unmatched lead surrogate */ |
| 460 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 461 } |
| 462 } else { |
| 463 /* see if the trail surrogate is in the next buffer */ |
| 464 } |
| 465 } else { |
| 466 /* unmatched trail surrogate */ |
| 467 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 468 } |
| 469 } |
| 470 |
| 471 if(U_SUCCESS(*pErrorCode)) { |
| 472 /* check for a remaining source byte */ |
| 473 if(length>0) { |
| 474 if(targetCapacity==0) { |
| 475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 476 } else { |
| 477 /* it must be length==1 because otherwise the above would have c
opied more */ |
| 478 cnv->toUBytes[cnv->toULength++]=*source++; |
| 479 } |
| 480 } |
| 481 } |
| 482 |
| 483 /* write back the updated pointers */ |
| 484 pArgs->source=(const char *)source; |
| 485 pArgs->target=target; |
| 486 pArgs->offsets=offsets; |
| 487 } |
| 488 |
| 489 static UChar32 |
| 490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
| 491 const uint8_t *s, *sourceLimit; |
| 492 UChar32 c; |
| 493 |
| 494 if(pArgs->converter->mode<8) { |
| 495 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
| 496 } |
| 497 |
| 498 s=(const uint8_t *)pArgs->source; |
| 499 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| 500 |
| 501 if(s>=sourceLimit) { |
| 502 /* no input */ |
| 503 *err=U_INDEX_OUTOFBOUNDS_ERROR; |
| 504 return 0xffff; |
| 505 } |
| 506 |
| 507 if(s+2>sourceLimit) { |
| 508 /* only one byte: truncated UChar */ |
| 509 pArgs->converter->toUBytes[0]=*s++; |
| 510 pArgs->converter->toULength=1; |
| 511 pArgs->source=(const char *)s; |
| 512 *err = U_TRUNCATED_CHAR_FOUND; |
| 513 return 0xffff; |
| 514 } |
| 515 |
| 516 /* get one UChar */ |
| 517 c=((UChar32)*s<<8)|s[1]; |
| 518 s+=2; |
| 519 |
| 520 /* check for a surrogate pair */ |
| 521 if(U_IS_SURROGATE(c)) { |
| 522 if(U16_IS_SURROGATE_LEAD(c)) { |
| 523 if(s+2<=sourceLimit) { |
| 524 UChar trail; |
| 525 |
| 526 /* get a second UChar and see if it is a trail surrogate */ |
| 527 trail=((UChar)*s<<8)|s[1]; |
| 528 if(U16_IS_TRAIL(trail)) { |
| 529 c=U16_GET_SUPPLEMENTARY(c, trail); |
| 530 s+=2; |
| 531 } else { |
| 532 /* unmatched lead surrogate */ |
| 533 c=-2; |
| 534 } |
| 535 } else { |
| 536 /* too few (2 or 3) bytes for a surrogate pair: truncated code p
oint */ |
| 537 uint8_t *bytes=pArgs->converter->toUBytes; |
| 538 s-=2; |
| 539 pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
| 540 do { |
| 541 *bytes++=*s++; |
| 542 } while(s<sourceLimit); |
| 543 |
| 544 c=0xffff; |
| 545 *err=U_TRUNCATED_CHAR_FOUND; |
| 546 } |
| 547 } else { |
| 548 /* unmatched trail surrogate */ |
| 549 c=-2; |
| 550 } |
| 551 |
| 552 if(c<0) { |
| 553 /* write the unmatched surrogate */ |
| 554 uint8_t *bytes=pArgs->converter->toUBytes; |
| 555 pArgs->converter->toULength=2; |
| 556 *bytes=*(s-2); |
| 557 bytes[1]=*(s-1); |
| 558 |
| 559 c=0xffff; |
| 560 *err=U_ILLEGAL_CHAR_FOUND; |
| 561 } |
| 562 } |
| 563 |
| 564 pArgs->source=(const char *)s; |
| 565 return c; |
| 566 } |
| 567 |
| 568 static void |
| 569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { |
| 570 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 571 /* reset toUnicode state */ |
| 572 if(UCNV_GET_VERSION(cnv)==0) { |
| 573 cnv->mode=8; /* no BOM handling */ |
| 574 } else { |
| 575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM
*/ |
| 576 } |
| 577 } |
| 578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
| 579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BO
M */ |
| 580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
| 581 } |
| 582 } |
| 583 |
| 584 static void |
| 585 _UTF16BEOpen(UConverter *cnv, |
| 586 UConverterLoadArgs *pArgs, |
| 587 UErrorCode *pErrorCode) { |
| 588 if(UCNV_GET_VERSION(cnv)<=1) { |
| 589 _UTF16BEReset(cnv, UCNV_RESET_BOTH); |
| 590 } else { |
| 591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 592 } |
| 593 } |
| 594 |
| 595 static const char * |
| 596 _UTF16BEGetName(const UConverter *cnv) { |
| 597 if(UCNV_GET_VERSION(cnv)==0) { |
| 598 return "UTF-16BE"; |
| 599 } else { |
| 600 return "UTF-16BE,version=1"; |
| 601 } |
| 602 } |
| 603 |
| 604 static const UConverterImpl _UTF16BEImpl={ |
| 605 UCNV_UTF16_BigEndian, |
| 606 |
| 607 NULL, |
| 608 NULL, |
| 609 |
| 610 _UTF16BEOpen, |
| 611 NULL, |
| 612 _UTF16BEReset, |
| 613 |
| 614 _UTF16BEToUnicodeWithOffsets, |
| 615 _UTF16BEToUnicodeWithOffsets, |
| 616 _UTF16BEFromUnicodeWithOffsets, |
| 617 _UTF16BEFromUnicodeWithOffsets, |
| 618 _UTF16BEGetNextUChar, |
| 619 |
| 620 NULL, |
| 621 _UTF16BEGetName, |
| 622 NULL, |
| 623 NULL, |
| 624 ucnv_getNonSurrogateUnicodeSet |
| 625 }; |
| 626 |
| 627 static const UConverterStaticData _UTF16BEStaticData={ |
| 628 sizeof(UConverterStaticData), |
| 629 "UTF-16BE", |
| 630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, |
| 631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, |
| 632 0, |
| 633 0, |
| 634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 635 }; |
| 636 |
| 637 |
| 638 const UConverterSharedData _UTF16BEData={ |
| 639 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, |
| 641 0 |
| 642 }; |
| 643 |
| 644 /* UTF-16LE ----------------------------------------------------------------- */ |
| 645 |
| 646 static void |
| 647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
| 648 UErrorCode *pErrorCode) { |
| 649 UConverter *cnv; |
| 650 const UChar *source; |
| 651 char *target; |
| 652 int32_t *offsets; |
| 653 |
| 654 uint32_t targetCapacity, length, sourceIndex; |
| 655 UChar c, trail; |
| 656 char overflow[4]; |
| 657 |
| 658 source=pArgs->source; |
| 659 length=(int32_t)(pArgs->sourceLimit-source); |
| 660 if(length<=0) { |
| 661 /* no input, nothing to do */ |
| 662 return; |
| 663 } |
| 664 |
| 665 cnv=pArgs->converter; |
| 666 |
| 667 /* write the BOM if necessary */ |
| 668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
| 669 static const char bom[]={ (char)0xff, (char)0xfe }; |
| 670 ucnv_fromUWriteBytes(cnv, |
| 671 bom, 2, |
| 672 &pArgs->target, pArgs->targetLimit, |
| 673 &pArgs->offsets, -1, |
| 674 pErrorCode); |
| 675 cnv->fromUnicodeStatus=0; |
| 676 } |
| 677 |
| 678 target=pArgs->target; |
| 679 if(target >= pArgs->targetLimit) { |
| 680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 681 return; |
| 682 } |
| 683 |
| 684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
| 685 offsets=pArgs->offsets; |
| 686 sourceIndex=0; |
| 687 |
| 688 /* c!=0 indicates in several places outside the main loops that a surrogate
was found */ |
| 689 |
| 690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCa
pacity>=4) { |
| 691 /* the last buffer ended with a lead surrogate, output the surrogate pai
r */ |
| 692 ++source; |
| 693 --length; |
| 694 target[0]=(uint8_t)c; |
| 695 target[1]=(uint8_t)(c>>8); |
| 696 target[2]=(uint8_t)trail; |
| 697 target[3]=(uint8_t)(trail>>8); |
| 698 target+=4; |
| 699 targetCapacity-=4; |
| 700 if(offsets!=NULL) { |
| 701 *offsets++=-1; |
| 702 *offsets++=-1; |
| 703 *offsets++=-1; |
| 704 *offsets++=-1; |
| 705 } |
| 706 sourceIndex=1; |
| 707 cnv->fromUChar32=c=0; |
| 708 } |
| 709 |
| 710 if(c==0) { |
| 711 /* copy an even number of bytes for complete UChars */ |
| 712 uint32_t count=2*length; |
| 713 if(count>targetCapacity) { |
| 714 count=targetCapacity&~1; |
| 715 } |
| 716 /* count is even */ |
| 717 targetCapacity-=count; |
| 718 count>>=1; |
| 719 length-=count; |
| 720 |
| 721 if(offsets==NULL) { |
| 722 while(count>0) { |
| 723 c=*source++; |
| 724 if(U16_IS_SINGLE(c)) { |
| 725 target[0]=(uint8_t)c; |
| 726 target[1]=(uint8_t)(c>>8); |
| 727 target+=2; |
| 728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t
rail=*source)) { |
| 729 ++source; |
| 730 --count; |
| 731 target[0]=(uint8_t)c; |
| 732 target[1]=(uint8_t)(c>>8); |
| 733 target[2]=(uint8_t)trail; |
| 734 target[3]=(uint8_t)(trail>>8); |
| 735 target+=4; |
| 736 } else { |
| 737 break; |
| 738 } |
| 739 --count; |
| 740 } |
| 741 } else { |
| 742 while(count>0) { |
| 743 c=*source++; |
| 744 if(U16_IS_SINGLE(c)) { |
| 745 target[0]=(uint8_t)c; |
| 746 target[1]=(uint8_t)(c>>8); |
| 747 target+=2; |
| 748 *offsets++=sourceIndex; |
| 749 *offsets++=sourceIndex++; |
| 750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t
rail=*source)) { |
| 751 ++source; |
| 752 --count; |
| 753 target[0]=(uint8_t)c; |
| 754 target[1]=(uint8_t)(c>>8); |
| 755 target[2]=(uint8_t)trail; |
| 756 target[3]=(uint8_t)(trail>>8); |
| 757 target+=4; |
| 758 *offsets++=sourceIndex; |
| 759 *offsets++=sourceIndex; |
| 760 *offsets++=sourceIndex; |
| 761 *offsets++=sourceIndex; |
| 762 sourceIndex+=2; |
| 763 } else { |
| 764 break; |
| 765 } |
| 766 --count; |
| 767 } |
| 768 } |
| 769 |
| 770 if(count==0) { |
| 771 /* done with the loop for complete UChars */ |
| 772 if(length>0 && targetCapacity>0) { |
| 773 /* |
| 774 * there is more input and some target capacity - |
| 775 * it must be targetCapacity==1 because otherwise |
| 776 * the above would have copied more; |
| 777 * prepare for overflow output |
| 778 */ |
| 779 if(U16_IS_SINGLE(c=*source++)) { |
| 780 overflow[0]=(char)c; |
| 781 overflow[1]=(char)(c>>8); |
| 782 length=2; /* 2 bytes to output */ |
| 783 c=0; |
| 784 /* } else { keep c for surrogate handling, length will be set th
ere */ |
| 785 } |
| 786 } else { |
| 787 length=0; |
| 788 c=0; |
| 789 } |
| 790 } else { |
| 791 /* keep c for surrogate handling, length will be set there */ |
| 792 targetCapacity+=2*count; |
| 793 } |
| 794 } else { |
| 795 length=0; /* from here on, length counts the bytes in overflow[] */ |
| 796 } |
| 797 |
| 798 if(c!=0) { |
| 799 /* |
| 800 * c is a surrogate, and |
| 801 * - source or target too short |
| 802 * - or the surrogate is unmatched |
| 803 */ |
| 804 length=0; |
| 805 if(U16_IS_SURROGATE_LEAD(c)) { |
| 806 if(source<pArgs->sourceLimit) { |
| 807 if(U16_IS_TRAIL(trail=*source)) { |
| 808 /* output the surrogate pair, will overflow (see conditions
comment above) */ |
| 809 ++source; |
| 810 overflow[0]=(char)c; |
| 811 overflow[1]=(char)(c>>8); |
| 812 overflow[2]=(char)trail; |
| 813 overflow[3]=(char)(trail>>8); |
| 814 length=4; /* 4 bytes to output */ |
| 815 c=0; |
| 816 } else { |
| 817 /* unmatched lead surrogate */ |
| 818 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 819 } |
| 820 } else { |
| 821 /* see if the trail surrogate is in the next buffer */ |
| 822 } |
| 823 } else { |
| 824 /* unmatched trail surrogate */ |
| 825 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 826 } |
| 827 cnv->fromUChar32=c; |
| 828 } |
| 829 |
| 830 if(length>0) { |
| 831 /* output length bytes with overflow (length>targetCapacity>0) */ |
| 832 ucnv_fromUWriteBytes(cnv, |
| 833 overflow, length, |
| 834 &target, pArgs->targetLimit, |
| 835 &offsets, sourceIndex, |
| 836 pErrorCode); |
| 837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
| 838 } |
| 839 |
| 840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0)
{ |
| 841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 842 } |
| 843 |
| 844 /* write back the updated pointers */ |
| 845 pArgs->source=source; |
| 846 pArgs->target=target; |
| 847 pArgs->offsets=offsets; |
| 848 } |
| 849 |
| 850 static void |
| 851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 852 UErrorCode *pErrorCode) { |
| 853 UConverter *cnv; |
| 854 const uint8_t *source; |
| 855 UChar *target; |
| 856 int32_t *offsets; |
| 857 |
| 858 uint32_t targetCapacity, length, count, sourceIndex; |
| 859 UChar c, trail; |
| 860 |
| 861 if(pArgs->converter->mode<8) { |
| 862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
| 863 return; |
| 864 } |
| 865 |
| 866 cnv=pArgs->converter; |
| 867 source=(const uint8_t *)pArgs->source; |
| 868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
| 869 if(length<=0 && cnv->toUnicodeStatus==0) { |
| 870 /* no input, nothing to do */ |
| 871 return; |
| 872 } |
| 873 |
| 874 target=pArgs->target; |
| 875 if(target >= pArgs->targetLimit) { |
| 876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 877 return; |
| 878 } |
| 879 |
| 880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
| 881 offsets=pArgs->offsets; |
| 882 sourceIndex=0; |
| 883 c=0; |
| 884 |
| 885 /* complete a partial UChar or pair from the last call */ |
| 886 if(cnv->toUnicodeStatus!=0) { |
| 887 /* |
| 888 * special case: single byte from a previous buffer, |
| 889 * where the byte turned out not to belong to a trail surrogate |
| 890 * and the preceding, unmatched lead surrogate was put into toUBytes[] |
| 891 * for error handling |
| 892 */ |
| 893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
| 894 cnv->toULength=1; |
| 895 cnv->toUnicodeStatus=0; |
| 896 } |
| 897 if((count=cnv->toULength)!=0) { |
| 898 uint8_t *p=cnv->toUBytes; |
| 899 do { |
| 900 p[count++]=*source++; |
| 901 ++sourceIndex; |
| 902 --length; |
| 903 if(count==2) { |
| 904 c=((UChar)p[1]<<8)|p[0]; |
| 905 if(U16_IS_SINGLE(c)) { |
| 906 /* output the BMP code point */ |
| 907 *target++=c; |
| 908 if(offsets!=NULL) { |
| 909 *offsets++=-1; |
| 910 } |
| 911 --targetCapacity; |
| 912 count=0; |
| 913 c=0; |
| 914 break; |
| 915 } else if(U16_IS_SURROGATE_LEAD(c)) { |
| 916 /* continue collecting bytes for the trail surrogate */ |
| 917 c=0; /* avoid unnecessary surrogate handling below */ |
| 918 } else { |
| 919 /* fall through to error handling for an unmatched trail sur
rogate */ |
| 920 break; |
| 921 } |
| 922 } else if(count==4) { |
| 923 c=((UChar)p[1]<<8)|p[0]; |
| 924 trail=((UChar)p[3]<<8)|p[2]; |
| 925 if(U16_IS_TRAIL(trail)) { |
| 926 /* output the surrogate pair */ |
| 927 *target++=c; |
| 928 if(targetCapacity>=2) { |
| 929 *target++=trail; |
| 930 if(offsets!=NULL) { |
| 931 *offsets++=-1; |
| 932 *offsets++=-1; |
| 933 } |
| 934 targetCapacity-=2; |
| 935 } else /* targetCapacity==1 */ { |
| 936 targetCapacity=0; |
| 937 cnv->UCharErrorBuffer[0]=trail; |
| 938 cnv->UCharErrorBufferLength=1; |
| 939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 940 } |
| 941 count=0; |
| 942 c=0; |
| 943 break; |
| 944 } else { |
| 945 /* unmatched lead surrogate, handle here for consistent toUB
ytes[] */ |
| 946 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 947 |
| 948 /* back out reading the code unit after it */ |
| 949 if(((const uint8_t *)pArgs->source-source)>=2) { |
| 950 source-=2; |
| 951 } else { |
| 952 /* |
| 953 * if the trail unit's first byte was in a previous buff
er, then |
| 954 * we need to put it into a special place because toUByt
es[] will be |
| 955 * used for the lead unit's bytes |
| 956 */ |
| 957 cnv->toUnicodeStatus=0x100|p[2]; |
| 958 --source; |
| 959 } |
| 960 cnv->toULength=2; |
| 961 |
| 962 /* write back the updated pointers */ |
| 963 pArgs->source=(const char *)source; |
| 964 pArgs->target=target; |
| 965 pArgs->offsets=offsets; |
| 966 return; |
| 967 } |
| 968 } |
| 969 } while(length>0); |
| 970 cnv->toULength=(int8_t)count; |
| 971 } |
| 972 |
| 973 /* copy an even number of bytes for complete UChars */ |
| 974 count=2*targetCapacity; |
| 975 if(count>length) { |
| 976 count=length&~1; |
| 977 } |
| 978 if(c==0 && count>0) { |
| 979 length-=count; |
| 980 count>>=1; |
| 981 targetCapacity-=count; |
| 982 if(offsets==NULL) { |
| 983 do { |
| 984 c=((UChar)source[1]<<8)|source[0]; |
| 985 source+=2; |
| 986 if(U16_IS_SINGLE(c)) { |
| 987 *target++=c; |
| 988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
| 989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
| 990 ) { |
| 991 source+=2; |
| 992 --count; |
| 993 *target++=c; |
| 994 *target++=trail; |
| 995 } else { |
| 996 break; |
| 997 } |
| 998 } while(--count>0); |
| 999 } else { |
| 1000 do { |
| 1001 c=((UChar)source[1]<<8)|source[0]; |
| 1002 source+=2; |
| 1003 if(U16_IS_SINGLE(c)) { |
| 1004 *target++=c; |
| 1005 *offsets++=sourceIndex; |
| 1006 sourceIndex+=2; |
| 1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
| 1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
| 1009 ) { |
| 1010 source+=2; |
| 1011 --count; |
| 1012 *target++=c; |
| 1013 *target++=trail; |
| 1014 *offsets++=sourceIndex; |
| 1015 *offsets++=sourceIndex; |
| 1016 sourceIndex+=4; |
| 1017 } else { |
| 1018 break; |
| 1019 } |
| 1020 } while(--count>0); |
| 1021 } |
| 1022 |
| 1023 if(count==0) { |
| 1024 /* done with the loop for complete UChars */ |
| 1025 c=0; |
| 1026 } else { |
| 1027 /* keep c for surrogate handling, trail will be set there */ |
| 1028 length+=2*(count-1); /* one more byte pair was consumed than count d
ecremented */ |
| 1029 targetCapacity+=count; |
| 1030 } |
| 1031 } |
| 1032 |
| 1033 if(c!=0) { |
| 1034 /* |
| 1035 * c is a surrogate, and |
| 1036 * - source or target too short |
| 1037 * - or the surrogate is unmatched |
| 1038 */ |
| 1039 cnv->toUBytes[0]=(uint8_t)c; |
| 1040 cnv->toUBytes[1]=(uint8_t)(c>>8); |
| 1041 cnv->toULength=2; |
| 1042 |
| 1043 if(U16_IS_SURROGATE_LEAD(c)) { |
| 1044 if(length>=2) { |
| 1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { |
| 1046 /* output the surrogate pair, will overflow (see conditions
comment above) */ |
| 1047 source+=2; |
| 1048 length-=2; |
| 1049 *target++=c; |
| 1050 if(offsets!=NULL) { |
| 1051 *offsets++=sourceIndex; |
| 1052 } |
| 1053 cnv->UCharErrorBuffer[0]=trail; |
| 1054 cnv->UCharErrorBufferLength=1; |
| 1055 cnv->toULength=0; |
| 1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1057 } else { |
| 1058 /* unmatched lead surrogate */ |
| 1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1060 } |
| 1061 } else { |
| 1062 /* see if the trail surrogate is in the next buffer */ |
| 1063 } |
| 1064 } else { |
| 1065 /* unmatched trail surrogate */ |
| 1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1067 } |
| 1068 } |
| 1069 |
| 1070 if(U_SUCCESS(*pErrorCode)) { |
| 1071 /* check for a remaining source byte */ |
| 1072 if(length>0) { |
| 1073 if(targetCapacity==0) { |
| 1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1075 } else { |
| 1076 /* it must be length==1 because otherwise the above would have c
opied more */ |
| 1077 cnv->toUBytes[cnv->toULength++]=*source++; |
| 1078 } |
| 1079 } |
| 1080 } |
| 1081 |
| 1082 /* write back the updated pointers */ |
| 1083 pArgs->source=(const char *)source; |
| 1084 pArgs->target=target; |
| 1085 pArgs->offsets=offsets; |
| 1086 } |
| 1087 |
| 1088 static UChar32 |
| 1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
| 1090 const uint8_t *s, *sourceLimit; |
| 1091 UChar32 c; |
| 1092 |
| 1093 if(pArgs->converter->mode<8) { |
| 1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
| 1095 } |
| 1096 |
| 1097 s=(const uint8_t *)pArgs->source; |
| 1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| 1099 |
| 1100 if(s>=sourceLimit) { |
| 1101 /* no input */ |
| 1102 *err=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1103 return 0xffff; |
| 1104 } |
| 1105 |
| 1106 if(s+2>sourceLimit) { |
| 1107 /* only one byte: truncated UChar */ |
| 1108 pArgs->converter->toUBytes[0]=*s++; |
| 1109 pArgs->converter->toULength=1; |
| 1110 pArgs->source=(const char *)s; |
| 1111 *err = U_TRUNCATED_CHAR_FOUND; |
| 1112 return 0xffff; |
| 1113 } |
| 1114 |
| 1115 /* get one UChar */ |
| 1116 c=((UChar32)s[1]<<8)|*s; |
| 1117 s+=2; |
| 1118 |
| 1119 /* check for a surrogate pair */ |
| 1120 if(U_IS_SURROGATE(c)) { |
| 1121 if(U16_IS_SURROGATE_LEAD(c)) { |
| 1122 if(s+2<=sourceLimit) { |
| 1123 UChar trail; |
| 1124 |
| 1125 /* get a second UChar and see if it is a trail surrogate */ |
| 1126 trail=((UChar)s[1]<<8)|*s; |
| 1127 if(U16_IS_TRAIL(trail)) { |
| 1128 c=U16_GET_SUPPLEMENTARY(c, trail); |
| 1129 s+=2; |
| 1130 } else { |
| 1131 /* unmatched lead surrogate */ |
| 1132 c=-2; |
| 1133 } |
| 1134 } else { |
| 1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code p
oint */ |
| 1136 uint8_t *bytes=pArgs->converter->toUBytes; |
| 1137 s-=2; |
| 1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
| 1139 do { |
| 1140 *bytes++=*s++; |
| 1141 } while(s<sourceLimit); |
| 1142 |
| 1143 c=0xffff; |
| 1144 *err=U_TRUNCATED_CHAR_FOUND; |
| 1145 } |
| 1146 } else { |
| 1147 /* unmatched trail surrogate */ |
| 1148 c=-2; |
| 1149 } |
| 1150 |
| 1151 if(c<0) { |
| 1152 /* write the unmatched surrogate */ |
| 1153 uint8_t *bytes=pArgs->converter->toUBytes; |
| 1154 pArgs->converter->toULength=2; |
| 1155 *bytes=*(s-2); |
| 1156 bytes[1]=*(s-1); |
| 1157 |
| 1158 c=0xffff; |
| 1159 *err=U_ILLEGAL_CHAR_FOUND; |
| 1160 } |
| 1161 } |
| 1162 |
| 1163 pArgs->source=(const char *)s; |
| 1164 return c; |
| 1165 } |
| 1166 |
| 1167 static void |
| 1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { |
| 1169 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 1170 /* reset toUnicode state */ |
| 1171 if(UCNV_GET_VERSION(cnv)==0) { |
| 1172 cnv->mode=8; /* no BOM handling */ |
| 1173 } else { |
| 1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no
BOM */ |
| 1175 } |
| 1176 } |
| 1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
| 1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE
BOM */ |
| 1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
| 1180 } |
| 1181 } |
| 1182 |
| 1183 static void |
| 1184 _UTF16LEOpen(UConverter *cnv, |
| 1185 UConverterLoadArgs *pArgs, |
| 1186 UErrorCode *pErrorCode) { |
| 1187 if(UCNV_GET_VERSION(cnv)<=1) { |
| 1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH); |
| 1189 } else { |
| 1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1191 } |
| 1192 } |
| 1193 |
| 1194 static const char * |
| 1195 _UTF16LEGetName(const UConverter *cnv) { |
| 1196 if(UCNV_GET_VERSION(cnv)==0) { |
| 1197 return "UTF-16LE"; |
| 1198 } else { |
| 1199 return "UTF-16LE,version=1"; |
| 1200 } |
| 1201 } |
| 1202 |
| 1203 static const UConverterImpl _UTF16LEImpl={ |
| 1204 UCNV_UTF16_LittleEndian, |
| 1205 |
| 1206 NULL, |
| 1207 NULL, |
| 1208 |
| 1209 _UTF16LEOpen, |
| 1210 NULL, |
| 1211 _UTF16LEReset, |
| 1212 |
| 1213 _UTF16LEToUnicodeWithOffsets, |
| 1214 _UTF16LEToUnicodeWithOffsets, |
| 1215 _UTF16LEFromUnicodeWithOffsets, |
| 1216 _UTF16LEFromUnicodeWithOffsets, |
| 1217 _UTF16LEGetNextUChar, |
| 1218 |
| 1219 NULL, |
| 1220 _UTF16LEGetName, |
| 1221 NULL, |
| 1222 NULL, |
| 1223 ucnv_getNonSurrogateUnicodeSet |
| 1224 }; |
| 1225 |
| 1226 |
| 1227 static const UConverterStaticData _UTF16LEStaticData={ |
| 1228 sizeof(UConverterStaticData), |
| 1229 "UTF-16LE", |
| 1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, |
| 1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, |
| 1232 0, |
| 1233 0, |
| 1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 1235 }; |
| 1236 |
| 1237 |
| 1238 const UConverterSharedData _UTF16LEData={ |
| 1239 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, |
| 1241 0 |
| 1242 }; |
| 1243 |
| 1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */ |
| 1245 |
/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the UTF-32 converter, with
 * fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw first byte
 * 2..5 (unused)
 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state==number of initial bytes seen so far.
 *
 * On output, emit U+FEFF as the first code point.
 *
 * Variants:
 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
 */
| 1269 |
| 1270 static void |
| 1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { |
| 1272 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 1273 /* reset toUnicode: state=0 */ |
| 1274 cnv->mode=0; |
| 1275 } |
| 1276 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */ |
| 1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
| 1279 } |
| 1280 } |
| 1281 |
| 1282 static const UConverterSharedData _UTF16v2Data; |
| 1283 |
| 1284 static void |
| 1285 _UTF16Open(UConverter *cnv, |
| 1286 UConverterLoadArgs *pArgs, |
| 1287 UErrorCode *pErrorCode) { |
| 1288 if(UCNV_GET_VERSION(cnv)<=2) { |
| 1289 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { |
| 1290 /* |
| 1291 * Switch implementation, and switch the staticData that's different |
| 1292 * and was copied into the UConverter. |
| 1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) |
| 1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte st
ream. |
| 1295 */ |
| 1296 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; |
| 1297 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MA
X_SUBCHAR_LEN); |
| 1298 } |
| 1299 _UTF16Reset(cnv, UCNV_RESET_BOTH); |
| 1300 } else { |
| 1301 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1302 } |
| 1303 } |
| 1304 |
| 1305 static const char * |
| 1306 _UTF16GetName(const UConverter *cnv) { |
| 1307 if(UCNV_GET_VERSION(cnv)==0) { |
| 1308 return "UTF-16"; |
| 1309 } else if(UCNV_GET_VERSION(cnv)==1) { |
| 1310 return "UTF-16,version=1"; |
| 1311 } else { |
| 1312 return "UTF-16,version=2"; |
| 1313 } |
| 1314 } |
| 1315 |
| 1316 const UConverterSharedData _UTF16Data; |
| 1317 |
| 1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) |
| 1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) |
| 1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UT
F16v2Data) |
| 1321 |
| 1322 static void |
| 1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 1324 UErrorCode *pErrorCode) { |
| 1325 UConverter *cnv=pArgs->converter; |
| 1326 const char *source=pArgs->source; |
| 1327 const char *sourceLimit=pArgs->sourceLimit; |
| 1328 int32_t *offsets=pArgs->offsets; |
| 1329 |
| 1330 int32_t state, offsetDelta; |
| 1331 uint8_t b; |
| 1332 |
| 1333 state=cnv->mode; |
| 1334 |
| 1335 /* |
| 1336 * If we detect a BOM in this buffer, then we must add the BOM size to the |
| 1337 * offsets because the actual converter function will not see and count the
BOM. |
| 1338 * offsetDelta will have the number of the BOM bytes that are in the current
buffer. |
| 1339 */ |
| 1340 offsetDelta=0; |
| 1341 |
| 1342 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { |
| 1343 switch(state) { |
| 1344 case 0: |
| 1345 cnv->toUBytes[0]=(uint8_t)*source++; |
| 1346 cnv->toULength=1; |
| 1347 state=1; |
| 1348 break; |
| 1349 case 1: |
| 1350 /* |
| 1351 * Only inside this switch case can the state variable |
| 1352 * temporarily take two additional values: |
| 1353 * 6: BOM error, continue with BE |
| 1354 * 7: BOM error, continue with LE |
| 1355 */ |
| 1356 b=*source; |
| 1357 if(cnv->toUBytes[0]==0xfe && b==0xff) { |
| 1358 if(IS_UTF16LE(cnv)) { |
| 1359 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ |
| 1360 } else { |
| 1361 state=8; /* detect UTF-16BE */ |
| 1362 } |
| 1363 } else if(cnv->toUBytes[0]==0xff && b==0xfe) { |
| 1364 if(IS_UTF16BE(cnv)) { |
| 1365 state=6; /* illegal reverse BOM for Java "UnicodeBig" */ |
| 1366 } else { |
| 1367 state=9; /* detect UTF-16LE */ |
| 1368 } |
| 1369 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { |
| 1370 state=6; /* illegal missing BOM for Java "Unicode" */ |
| 1371 } |
| 1372 if(state>=8) { |
| 1373 /* BOM detected, consume it */ |
| 1374 ++source; |
| 1375 cnv->toULength=0; |
| 1376 offsetDelta=(int32_t)(source-pArgs->source); |
| 1377 } else if(state<6) { |
| 1378 /* ok: no BOM, and not a reverse BOM */ |
| 1379 if(source!=pArgs->source) { |
| 1380 /* reset the source for a correct first offset */ |
| 1381 source=pArgs->source; |
| 1382 cnv->toULength=0; |
| 1383 } |
| 1384 if(IS_UTF16LE(cnv)) { |
| 1385 /* Make Java "UnicodeLittle" default to LE. */ |
| 1386 state=9; |
| 1387 } else { |
| 1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE.
*/ |
| 1389 state=8; |
| 1390 } |
| 1391 } else { |
| 1392 /* |
| 1393 * error: missing BOM, or reverse BOM |
| 1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM. |
| 1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE
BOM or no BOM. |
| 1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an
LE BOM or no BOM. |
| 1397 */ |
| 1398 /* report the non-BOM or reverse BOM as an illegal sequence */ |
| 1399 cnv->toUBytes[1]=b; |
| 1400 cnv->toULength=2; |
| 1401 pArgs->source=source+1; |
| 1402 /* continue with conversion if the callback resets the error */ |
| 1403 /* |
| 1404 * Make Java "Unicode" default to BE like standard UTF-16. |
| 1405 * Make Java "UnicodeBig" and "UnicodeLittle" default |
| 1406 * to their normal endiannesses. |
| 1407 */ |
| 1408 cnv->mode=state+2; |
| 1409 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1410 return; |
| 1411 } |
| 1412 /* convert the rest of the stream */ |
| 1413 cnv->mode=state; |
| 1414 continue; |
| 1415 case 8: |
| 1416 /* call UTF-16BE */ |
| 1417 pArgs->source=source; |
| 1418 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
| 1419 source=pArgs->source; |
| 1420 break; |
| 1421 case 9: |
| 1422 /* call UTF-16LE */ |
| 1423 pArgs->source=source; |
| 1424 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
| 1425 source=pArgs->source; |
| 1426 break; |
| 1427 default: |
| 1428 break; /* does not occur */ |
| 1429 } |
| 1430 } |
| 1431 |
| 1432 /* add BOM size to offsets - see comment at offsetDelta declaration */ |
| 1433 if(offsets!=NULL && offsetDelta!=0) { |
| 1434 int32_t *offsetsLimit=pArgs->offsets; |
| 1435 while(offsets<offsetsLimit) { |
| 1436 *offsets++ += offsetDelta; |
| 1437 } |
| 1438 } |
| 1439 |
| 1440 pArgs->source=source; |
| 1441 |
| 1442 if(source==sourceLimit && pArgs->flush) { |
| 1443 /* handle truncated input */ |
| 1444 switch(state) { |
| 1445 case 0: |
| 1446 break; /* no input at all, nothing to do */ |
| 1447 case 8: |
| 1448 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
| 1449 break; |
| 1450 case 9: |
| 1451 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
| 1452 break; |
| 1453 default: |
| 1454 /* 0<state<8: framework will report truncation, nothing to do here *
/ |
| 1455 break; |
| 1456 } |
| 1457 } |
| 1458 |
| 1459 cnv->mode=state; |
| 1460 } |
| 1461 |
| 1462 static UChar32 |
| 1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, |
| 1464 UErrorCode *pErrorCode) { |
| 1465 switch(pArgs->converter->mode) { |
| 1466 case 8: |
| 1467 return _UTF16BEGetNextUChar(pArgs, pErrorCode); |
| 1468 case 9: |
| 1469 return _UTF16LEGetNextUChar(pArgs, pErrorCode); |
| 1470 default: |
| 1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
| 1472 } |
| 1473 } |
| 1474 |
| 1475 static const UConverterImpl _UTF16Impl = { |
| 1476 UCNV_UTF16, |
| 1477 |
| 1478 NULL, |
| 1479 NULL, |
| 1480 |
| 1481 _UTF16Open, |
| 1482 NULL, |
| 1483 _UTF16Reset, |
| 1484 |
| 1485 _UTF16ToUnicodeWithOffsets, |
| 1486 _UTF16ToUnicodeWithOffsets, |
| 1487 _UTF16PEFromUnicodeWithOffsets, |
| 1488 _UTF16PEFromUnicodeWithOffsets, |
| 1489 _UTF16GetNextUChar, |
| 1490 |
| 1491 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
| 1492 _UTF16GetName, |
| 1493 NULL, |
| 1494 NULL, |
| 1495 ucnv_getNonSurrogateUnicodeSet |
| 1496 }; |
| 1497 |
| 1498 static const UConverterStaticData _UTF16StaticData = { |
| 1499 sizeof(UConverterStaticData), |
| 1500 "UTF-16", |
| 1501 1204, /* CCSID for BOM sensitive UTF-16 */ |
| 1502 UCNV_IBM, UCNV_UTF16, 2, 2, |
| 1503 #if U_IS_BIG_ENDIAN |
| 1504 { 0xff, 0xfd, 0, 0 }, 2, |
| 1505 #else |
| 1506 { 0xfd, 0xff, 0, 0 }, 2, |
| 1507 #endif |
| 1508 FALSE, FALSE, |
| 1509 0, |
| 1510 0, |
| 1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 1512 }; |
| 1513 |
| 1514 const UConverterSharedData _UTF16Data = { |
| 1515 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 1516 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, |
| 1517 0 |
| 1518 }; |
| 1519 |
| 1520 static const UConverterImpl _UTF16v2Impl = { |
| 1521 UCNV_UTF16, |
| 1522 |
| 1523 NULL, |
| 1524 NULL, |
| 1525 |
| 1526 _UTF16Open, |
| 1527 NULL, |
| 1528 _UTF16Reset, |
| 1529 |
| 1530 _UTF16ToUnicodeWithOffsets, |
| 1531 _UTF16ToUnicodeWithOffsets, |
| 1532 _UTF16BEFromUnicodeWithOffsets, |
| 1533 _UTF16BEFromUnicodeWithOffsets, |
| 1534 _UTF16GetNextUChar, |
| 1535 |
| 1536 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
| 1537 _UTF16GetName, |
| 1538 NULL, |
| 1539 NULL, |
| 1540 ucnv_getNonSurrogateUnicodeSet |
| 1541 }; |
| 1542 |
| 1543 static const UConverterStaticData _UTF16v2StaticData = { |
| 1544 sizeof(UConverterStaticData), |
| 1545 "UTF-16,version=2", |
| 1546 1204, /* CCSID for BOM sensitive UTF-16 */ |
| 1547 UCNV_IBM, UCNV_UTF16, 2, 2, |
| 1548 { 0xff, 0xfd, 0, 0 }, 2, |
| 1549 FALSE, FALSE, |
| 1550 0, |
| 1551 0, |
| 1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 1553 }; |
| 1554 |
| 1555 static const UConverterSharedData _UTF16v2Data = { |
| 1556 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 1557 NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, |
| 1558 0 |
| 1559 }; |
| 1560 |
| 1561 #endif |
OLD | NEW |