icu46/source/common/ucnv_u16.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucnv_u16.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2002-2010, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * file name: ucnv_u16.c

	7 * encoding: US-ASCII

	8 * tab size: 8 (not used)

	9 * indentation:4

	10 *

	11 * created on: 2002jul01

	12 * created by: Markus W. Scherer

	13 *

	14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.

	15 */

	16

	17 #include "unicode/utypes.h"

	18

	19 #if !UCONFIG_NO_CONVERSION

	20

	21 #include "unicode/ucnv.h"

	22 #include "ucnv_bld.h"

	23 #include "ucnv_cnv.h"

	24 #include "cmemory.h"

	25

	26 enum {

	27 UCNV_NEED_TO_WRITE_BOM=1

	28 };

	29

	30 /*

	31 * The UTF-16 toUnicode implementation is also used for the Java-specific

	32 * "with BOM" variants of UTF-16BE and UTF-16LE.

	33 */

	34 static void

	35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	36 UErrorCode *pErrorCode);

	37

	38 /* UTF-16BE ----------------------------------------------------------------- */

	39

	40 #if U_IS_BIG_ENDIAN

	41 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets

	42 #else

	43 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets

	44 #endif

	45

	46

	47 static void

	48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	49 UErrorCode *pErrorCode) {

	50 UConverter *cnv;

	51 const UChar *source;

	52 char *target;

	53 int32_t *offsets;

	54

	55 uint32_t targetCapacity, length, sourceIndex;

	56 UChar c, trail;

	57 char overflow[4];

	58

	59 source=pArgs->source;

	60 length=(int32_t)(pArgs->sourceLimit-source);

	61 if(length<=0) {

	62 /* no input, nothing to do */

	63 return;

	64 }

	65

	66 cnv=pArgs->converter;

	67

	68 /* write the BOM if necessary */

	69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	70 static const char bom[]={ (char)0xfe, (char)0xff };

	71 ucnv_fromUWriteBytes(cnv,

	72 bom, 2,

	73 &pArgs->target, pArgs->targetLimit,

	74 &pArgs->offsets, -1,

	75 pErrorCode);

	76 cnv->fromUnicodeStatus=0;

	77 }

	78

	79 target=pArgs->target;

	80 if(target >= pArgs->targetLimit) {

	81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	82 return;

	83 }

	84

	85 targetCapacity=(uint32_t)(pArgs->targetLimit-target);

	86 offsets=pArgs->offsets;

	87 sourceIndex=0;

	88

	89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */

	90

	91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCa pacity>=4) {

	92 /* the last buffer ended with a lead surrogate, output the surrogate pai r */

	93 ++source;

	94 --length;

	95 target[0]=(uint8_t)(c>>8);

	96 target[1]=(uint8_t)c;

	97 target[2]=(uint8_t)(trail>>8);

	98 target[3]=(uint8_t)trail;

	99 target+=4;

	100 targetCapacity-=4;

	101 if(offsets!=NULL) {

	102 *offsets++=-1;

	103 *offsets++=-1;

	104 *offsets++=-1;

	105 *offsets++=-1;

	106 }

	107 sourceIndex=1;

	108 cnv->fromUChar32=c=0;

	109 }

	110

	111 if(c==0) {

	112 /* copy an even number of bytes for complete UChars */

	113 uint32_t count=2*length;

	114 if(count>targetCapacity) {

	115 count=targetCapacity&~1;

	116 }

	117 /* count is even */

	118 targetCapacity-=count;

	119 count>>=1;

	120 length-=count;

	121

	122 if(offsets==NULL) {

	123 while(count>0) {

	124 c=*source++;

	125 if(U16_IS_SINGLE(c)) {

	126 target[0]=(uint8_t)(c>>8);

	127 target[1]=(uint8_t)c;

	128 target+=2;

	129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t rail=*source)) {

	130 ++source;

	131 --count;

	132 target[0]=(uint8_t)(c>>8);

	133 target[1]=(uint8_t)c;

	134 target[2]=(uint8_t)(trail>>8);

	135 target[3]=(uint8_t)trail;

	136 target+=4;

	137 } else {

	138 break;

	139 }

	140 --count;

	141 }

	142 } else {

	143 while(count>0) {

	144 c=*source++;

	145 if(U16_IS_SINGLE(c)) {

	146 target[0]=(uint8_t)(c>>8);

	147 target[1]=(uint8_t)c;

	148 target+=2;

	149 *offsets++=sourceIndex;

	150 *offsets++=sourceIndex++;

	151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t rail=*source)) {

	152 ++source;

	153 --count;

	154 target[0]=(uint8_t)(c>>8);

	155 target[1]=(uint8_t)c;

	156 target[2]=(uint8_t)(trail>>8);

	157 target[3]=(uint8_t)trail;

	158 target+=4;

	159 *offsets++=sourceIndex;

	160 *offsets++=sourceIndex;

	161 *offsets++=sourceIndex;

	162 *offsets++=sourceIndex;

	163 sourceIndex+=2;

	164 } else {

	165 break;

	166 }

	167 --count;

	168 }

	169 }

	170

	171 if(count==0) {

	172 /* done with the loop for complete UChars */

	173 if(length>0 && targetCapacity>0) {

	174 /*

	175 * there is more input and some target capacity -

	176 * it must be targetCapacity==1 because otherwise

	177 * the above would have copied more;

	178 * prepare for overflow output

	179 */

	180 if(U16_IS_SINGLE(c=*source++)) {

	181 overflow[0]=(char)(c>>8);

	182 overflow[1]=(char)c;

	183 length=2; /* 2 bytes to output */

	184 c=0;

	185 /* } else { keep c for surrogate handling, length will be set th ere */

	186 }

	187 } else {

	188 length=0;

	189 c=0;

	190 }

	191 } else {

	192 /* keep c for surrogate handling, length will be set there */

	193 targetCapacity+=2*count;

	194 }

	195 } else {

	196 length=0; /* from here on, length counts the bytes in overflow[] */

	197 }

	198

	199 if(c!=0) {

	200 /*

	201 * c is a surrogate, and

	202 * - source or target too short

	203 * - or the surrogate is unmatched

	204 */

	205 length=0;

	206 if(U16_IS_SURROGATE_LEAD(c)) {

	207 if(source<pArgs->sourceLimit) {

	208 if(U16_IS_TRAIL(trail=*source)) {

	209 /* output the surrogate pair, will overflow (see conditions comment above) */

	210 ++source;

	211 overflow[0]=(char)(c>>8);

	212 overflow[1]=(char)c;

	213 overflow[2]=(char)(trail>>8);

	214 overflow[3]=(char)trail;

	215 length=4; /* 4 bytes to output */

	216 c=0;

	217 } else {

	218 /* unmatched lead surrogate */

	219 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	220 }

	221 } else {

	222 /* see if the trail surrogate is in the next buffer */

	223 }

	224 } else {

	225 /* unmatched trail surrogate */

	226 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	227 }

	228 cnv->fromUChar32=c;

	229 }

	230

	231 if(length>0) {

	232 /* output length bytes with overflow (length>targetCapacity>0) */

	233 ucnv_fromUWriteBytes(cnv,

	234 overflow, length,

	235 (char **)&target, pArgs->targetLimit,

	236 &offsets, sourceIndex,

	237 pErrorCode);

	238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);

	239 }

	240

	241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {

	242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	243 }

	244

	245 /* write back the updated pointers */

	246 pArgs->source=source;

	247 pArgs->target=(char *)target;

	248 pArgs->offsets=offsets;

	249 }

	250

	251 static void

	252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	253 UErrorCode *pErrorCode) {

	254 UConverter *cnv;

	255 const uint8_t *source;

	256 UChar *target;

	257 int32_t *offsets;

	258

	259 uint32_t targetCapacity, length, count, sourceIndex;

	260 UChar c, trail;

	261

	262 if(pArgs->converter->mode<8) {

	263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);

	264 return;

	265 }

	266

	267 cnv=pArgs->converter;

	268 source=(const uint8_t *)pArgs->source;

	269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);

	270 if(length<=0 && cnv->toUnicodeStatus==0) {

	271 /* no input, nothing to do */

	272 return;

	273 }

	274

	275 target=pArgs->target;

	276 if(target >= pArgs->targetLimit) {

	277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	278 return;

	279 }

	280

	281 targetCapacity=(uint32_t)(pArgs->targetLimit-target);

	282 offsets=pArgs->offsets;

	283 sourceIndex=0;

	284 c=0;

	285

	286 /* complete a partial UChar or pair from the last call */

	287 if(cnv->toUnicodeStatus!=0) {

	288 /*

	289 * special case: single byte from a previous buffer,

	290 * where the byte turned out not to belong to a trail surrogate

	291 * and the preceding, unmatched lead surrogate was put into toUBytes[]

	292 * for error handling

	293 */

	294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;

	295 cnv->toULength=1;

	296 cnv->toUnicodeStatus=0;

	297 }

	298 if((count=cnv->toULength)!=0) {

	299 uint8_t *p=cnv->toUBytes;

	300 do {

	301 p[count++]=*source++;

	302 ++sourceIndex;

	303 --length;

	304 if(count==2) {

	305 c=((UChar)p[0]<<8)\|p[1];

	306 if(U16_IS_SINGLE(c)) {

	307 /* output the BMP code point */

	308 *target++=c;

	309 if(offsets!=NULL) {

	310 *offsets++=-1;

	311 }

	312 --targetCapacity;

	313 count=0;

	314 c=0;

	315 break;

	316 } else if(U16_IS_SURROGATE_LEAD(c)) {

	317 /* continue collecting bytes for the trail surrogate */

	318 c=0; /* avoid unnecessary surrogate handling below */

	319 } else {

	320 /* fall through to error handling for an unmatched trail sur rogate */

	321 break;

	322 }

	323 } else if(count==4) {

	324 c=((UChar)p[0]<<8)\|p[1];

	325 trail=((UChar)p[2]<<8)\|p[3];

	326 if(U16_IS_TRAIL(trail)) {

	327 /* output the surrogate pair */

	328 *target++=c;

	329 if(targetCapacity>=2) {

	330 *target++=trail;

	331 if(offsets!=NULL) {

	332 *offsets++=-1;

	333 *offsets++=-1;

	334 }

	335 targetCapacity-=2;

	336 } else /* targetCapacity==1 */ {

	337 targetCapacity=0;

	338 cnv->UCharErrorBuffer[0]=trail;

	339 cnv->UCharErrorBufferLength=1;

	340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	341 }

	342 count=0;

	343 c=0;

	344 break;

	345 } else {

	346 /* unmatched lead surrogate, handle here for consistent toUB ytes[] */

	347 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	348

	349 /* back out reading the code unit after it */

	350 if(((const uint8_t *)pArgs->source-source)>=2) {

	351 source-=2;

	352 } else {

	353 /*

	354 * if the trail unit's first byte was in a previous buff er, then

	355 * we need to put it into a special place because toUByt es[] will be

	356 * used for the lead unit's bytes

	357 */

	358 cnv->toUnicodeStatus=0x100\|p[2];

	359 --source;

	360 }

	361 cnv->toULength=2;

	362

	363 /* write back the updated pointers */

	364 pArgs->source=(const char *)source;

	365 pArgs->target=target;

	366 pArgs->offsets=offsets;

	367 return;

	368 }

	369 }

	370 } while(length>0);

	371 cnv->toULength=(int8_t)count;

	372 }

	373

	374 /* copy an even number of bytes for complete UChars */

	375 count=2*targetCapacity;

	376 if(count>length) {

	377 count=length&~1;

	378 }

	379 if(c==0 && count>0) {

	380 length-=count;

	381 count>>=1;

	382 targetCapacity-=count;

	383 if(offsets==NULL) {

	384 do {

	385 c=((UChar)source[0]<<8)\|source[1];

	386 source+=2;

	387 if(U16_IS_SINGLE(c)) {

	388 *target++=c;

	389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

	390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])

	391 ) {

	392 source+=2;

	393 --count;

	394 *target++=c;

	395 *target++=trail;

	396 } else {

	397 break;

	398 }

	399 } while(--count>0);

	400 } else {

	401 do {

	402 c=((UChar)source[0]<<8)\|source[1];

	403 source+=2;

	404 if(U16_IS_SINGLE(c)) {

	405 *target++=c;

	406 *offsets++=sourceIndex;

	407 sourceIndex+=2;

	408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

	409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])

	410 ) {

	411 source+=2;

	412 --count;

	413 *target++=c;

	414 *target++=trail;

	415 *offsets++=sourceIndex;

	416 *offsets++=sourceIndex;

	417 sourceIndex+=4;

	418 } else {

	419 break;

	420 }

	421 } while(--count>0);

	422 }

	423

	424 if(count==0) {

	425 /* done with the loop for complete UChars */

	426 c=0;

	427 } else {

	428 /* keep c for surrogate handling, trail will be set there */

	429 length+=2(count-1); / one more byte pair was consumed than count d ecremented */

	430 targetCapacity+=count;

	431 }

	432 }

	433

	434 if(c!=0) {

	435 /*

	436 * c is a surrogate, and

	437 * - source or target too short

	438 * - or the surrogate is unmatched

	439 */

	440 cnv->toUBytes[0]=(uint8_t)(c>>8);

	441 cnv->toUBytes[1]=(uint8_t)c;

	442 cnv->toULength=2;

	443

	444 if(U16_IS_SURROGATE_LEAD(c)) {

	445 if(length>=2) {

	446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)\|source[1])) {

	447 /* output the surrogate pair, will overflow (see conditions comment above) */

	448 source+=2;

	449 length-=2;

	450 *target++=c;

	451 if(offsets!=NULL) {

	452 *offsets++=sourceIndex;

	453 }

	454 cnv->UCharErrorBuffer[0]=trail;

	455 cnv->UCharErrorBufferLength=1;

	456 cnv->toULength=0;

	457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	458 } else {

	459 /* unmatched lead surrogate */

	460 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	461 }

	462 } else {

	463 /* see if the trail surrogate is in the next buffer */

	464 }

	465 } else {

	466 /* unmatched trail surrogate */

	467 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	468 }

	469 }

	470

	471 if(U_SUCCESS(*pErrorCode)) {

	472 /* check for a remaining source byte */

	473 if(length>0) {

	474 if(targetCapacity==0) {

	475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	476 } else {

	477 /* it must be length==1 because otherwise the above would have c opied more */

	478 cnv->toUBytes[cnv->toULength++]=*source++;

	479 }

	480 }

	481 }

	482

	483 /* write back the updated pointers */

	484 pArgs->source=(const char *)source;

	485 pArgs->target=target;

	486 pArgs->offsets=offsets;

	487 }

	488

	489 static UChar32

	490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {

	491 const uint8_t s, sourceLimit;

	492 UChar32 c;

	493

	494 if(pArgs->converter->mode<8) {

	495 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	496 }

	497

	498 s=(const uint8_t *)pArgs->source;

	499 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	500

	501 if(s>=sourceLimit) {

	502 /* no input */

	503 *err=U_INDEX_OUTOFBOUNDS_ERROR;

	504 return 0xffff;

	505 }

	506

	507 if(s+2>sourceLimit) {

	508 /* only one byte: truncated UChar */

	509 pArgs->converter->toUBytes[0]=*s++;

	510 pArgs->converter->toULength=1;

	511 pArgs->source=(const char *)s;

	512 *err = U_TRUNCATED_CHAR_FOUND;

	513 return 0xffff;

	514 }

	515

	516 /* get one UChar */

	517 c=((UChar32)*s<<8)\|s[1];

	518 s+=2;

	519

	520 /* check for a surrogate pair */

	521 if(U_IS_SURROGATE(c)) {

	522 if(U16_IS_SURROGATE_LEAD(c)) {

	523 if(s+2<=sourceLimit) {

	524 UChar trail;

	525

	526 /* get a second UChar and see if it is a trail surrogate */

	527 trail=((UChar)*s<<8)\|s[1];

	528 if(U16_IS_TRAIL(trail)) {

	529 c=U16_GET_SUPPLEMENTARY(c, trail);

	530 s+=2;

	531 } else {

	532 /* unmatched lead surrogate */

	533 c=-2;

	534 }

	535 } else {

	536 /* too few (2 or 3) bytes for a surrogate pair: truncated code p oint */

	537 uint8_t *bytes=pArgs->converter->toUBytes;

	538 s-=2;

	539 pArgs->converter->toULength=(int8_t)(sourceLimit-s);

	540 do {

	541 bytes++=s++;

	542 } while(s<sourceLimit);

	543

	544 c=0xffff;

	545 *err=U_TRUNCATED_CHAR_FOUND;

	546 }

	547 } else {

	548 /* unmatched trail surrogate */

	549 c=-2;

	550 }

	551

	552 if(c<0) {

	553 /* write the unmatched surrogate */

	554 uint8_t *bytes=pArgs->converter->toUBytes;

	555 pArgs->converter->toULength=2;

	556 bytes=(s-2);

	557 bytes[1]=*(s-1);

	558

	559 c=0xffff;

	560 *err=U_ILLEGAL_CHAR_FOUND;

	561 }

	562 }

	563

	564 pArgs->source=(const char *)s;

	565 return c;

	566 }

	567

	568 static void

	569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {

	570 if(choice<=UCNV_RESET_TO_UNICODE) {

	571 /* reset toUnicode state */

	572 if(UCNV_GET_VERSION(cnv)==0) {

	573 cnv->mode=8; /* no BOM handling */

	574 } else {

	575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */

	576 }

	577 }

	578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {

	579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BO M */

	580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

	581 }

	582 }

	583

	584 static void

	585 _UTF16BEOpen(UConverter *cnv,

	586 UConverterLoadArgs *pArgs,

	587 UErrorCode *pErrorCode) {

	588 if(UCNV_GET_VERSION(cnv)<=1) {

	589 _UTF16BEReset(cnv, UCNV_RESET_BOTH);

	590 } else {

	591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	592 }

	593 }

	594

	595 static const char *

	596 _UTF16BEGetName(const UConverter *cnv) {

	597 if(UCNV_GET_VERSION(cnv)==0) {

	598 return "UTF-16BE";

	599 } else {

	600 return "UTF-16BE,version=1";

	601 }

	602 }

	603

	604 static const UConverterImpl _UTF16BEImpl={

	605 UCNV_UTF16_BigEndian,

	606

	607 NULL,

	608 NULL,

	609

	610 _UTF16BEOpen,

	611 NULL,

	612 _UTF16BEReset,

	613

	614 _UTF16BEToUnicodeWithOffsets,

	615 _UTF16BEToUnicodeWithOffsets,

	616 _UTF16BEFromUnicodeWithOffsets,

	617 _UTF16BEFromUnicodeWithOffsets,

	618 _UTF16BEGetNextUChar,

	619

	620 NULL,

	621 _UTF16BEGetName,

	622 NULL,

	623 NULL,

	624 ucnv_getNonSurrogateUnicodeSet

	625 };

	626

	627 static const UConverterStaticData _UTF16BEStaticData={

	628 sizeof(UConverterStaticData),

	629 "UTF-16BE",

	630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,

	631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,

	632 0,

	633 0,

	634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	635 };

	636

	637

	638 const UConverterSharedData _UTF16BEData={

	639 sizeof(UConverterSharedData), ~((uint32_t) 0),

	640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,

	641 0

	642 };

	643

	644 /* UTF-16LE ----------------------------------------------------------------- */

	645

	646 static void

	647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	648 UErrorCode *pErrorCode) {

	649 UConverter *cnv;

	650 const UChar *source;

	651 char *target;

	652 int32_t *offsets;

	653

	654 uint32_t targetCapacity, length, sourceIndex;

	655 UChar c, trail;

	656 char overflow[4];

	657

	658 source=pArgs->source;

	659 length=(int32_t)(pArgs->sourceLimit-source);

	660 if(length<=0) {

	661 /* no input, nothing to do */

	662 return;

	663 }

	664

	665 cnv=pArgs->converter;

	666

	667 /* write the BOM if necessary */

	668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	669 static const char bom[]={ (char)0xff, (char)0xfe };

	670 ucnv_fromUWriteBytes(cnv,

	671 bom, 2,

	672 &pArgs->target, pArgs->targetLimit,

	673 &pArgs->offsets, -1,

	674 pErrorCode);

	675 cnv->fromUnicodeStatus=0;

	676 }

	677

	678 target=pArgs->target;

	679 if(target >= pArgs->targetLimit) {

	680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	681 return;

	682 }

	683

	684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);

	685 offsets=pArgs->offsets;

	686 sourceIndex=0;

	687

	688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */

	689

	690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCa pacity>=4) {

	691 /* the last buffer ended with a lead surrogate, output the surrogate pai r */

	692 ++source;

	693 --length;

	694 target[0]=(uint8_t)c;

	695 target[1]=(uint8_t)(c>>8);

	696 target[2]=(uint8_t)trail;

	697 target[3]=(uint8_t)(trail>>8);

	698 target+=4;

	699 targetCapacity-=4;

	700 if(offsets!=NULL) {

	701 *offsets++=-1;

	702 *offsets++=-1;

	703 *offsets++=-1;

	704 *offsets++=-1;

	705 }

	706 sourceIndex=1;

	707 cnv->fromUChar32=c=0;

	708 }

	709

	710 if(c==0) {

	711 /* copy an even number of bytes for complete UChars */

	712 uint32_t count=2*length;

	713 if(count>targetCapacity) {

	714 count=targetCapacity&~1;

	715 }

	716 /* count is even */

	717 targetCapacity-=count;

	718 count>>=1;

	719 length-=count;

	720

	721 if(offsets==NULL) {

	722 while(count>0) {

	723 c=*source++;

	724 if(U16_IS_SINGLE(c)) {

	725 target[0]=(uint8_t)c;

	726 target[1]=(uint8_t)(c>>8);

	727 target+=2;

	728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t rail=*source)) {

	729 ++source;

	730 --count;

	731 target[0]=(uint8_t)c;

	732 target[1]=(uint8_t)(c>>8);

	733 target[2]=(uint8_t)trail;

	734 target[3]=(uint8_t)(trail>>8);

	735 target+=4;

	736 } else {

	737 break;

	738 }

	739 --count;

	740 }

	741 } else {

	742 while(count>0) {

	743 c=*source++;

	744 if(U16_IS_SINGLE(c)) {

	745 target[0]=(uint8_t)c;

	746 target[1]=(uint8_t)(c>>8);

	747 target+=2;

	748 *offsets++=sourceIndex;

	749 *offsets++=sourceIndex++;

	750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(t rail=*source)) {

	751 ++source;

	752 --count;

	753 target[0]=(uint8_t)c;

	754 target[1]=(uint8_t)(c>>8);

	755 target[2]=(uint8_t)trail;

	756 target[3]=(uint8_t)(trail>>8);

	757 target+=4;

	758 *offsets++=sourceIndex;

	759 *offsets++=sourceIndex;

	760 *offsets++=sourceIndex;

	761 *offsets++=sourceIndex;

	762 sourceIndex+=2;

	763 } else {

	764 break;

	765 }

	766 --count;

	767 }

	768 }

	769

	770 if(count==0) {

	771 /* done with the loop for complete UChars */

	772 if(length>0 && targetCapacity>0) {

	773 /*

	774 * there is more input and some target capacity -

	775 * it must be targetCapacity==1 because otherwise

	776 * the above would have copied more;

	777 * prepare for overflow output

	778 */

	779 if(U16_IS_SINGLE(c=*source++)) {

	780 overflow[0]=(char)c;

	781 overflow[1]=(char)(c>>8);

	782 length=2; /* 2 bytes to output */

	783 c=0;

	784 /* } else { keep c for surrogate handling, length will be set th ere */

	785 }

	786 } else {

	787 length=0;

	788 c=0;

	789 }

	790 } else {

	791 /* keep c for surrogate handling, length will be set there */

	792 targetCapacity+=2*count;

	793 }

	794 } else {

	795 length=0; /* from here on, length counts the bytes in overflow[] */

	796 }

	797

	798 if(c!=0) {

	799 /*

	800 * c is a surrogate, and

	801 * - source or target too short

	802 * - or the surrogate is unmatched

	803 */

	804 length=0;

	805 if(U16_IS_SURROGATE_LEAD(c)) {

	806 if(source<pArgs->sourceLimit) {

	807 if(U16_IS_TRAIL(trail=*source)) {

	808 /* output the surrogate pair, will overflow (see conditions comment above) */

	809 ++source;

	810 overflow[0]=(char)c;

	811 overflow[1]=(char)(c>>8);

	812 overflow[2]=(char)trail;

	813 overflow[3]=(char)(trail>>8);

	814 length=4; /* 4 bytes to output */

	815 c=0;

	816 } else {

	817 /* unmatched lead surrogate */

	818 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	819 }

	820 } else {

	821 /* see if the trail surrogate is in the next buffer */

	822 }

	823 } else {

	824 /* unmatched trail surrogate */

	825 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	826 }

	827 cnv->fromUChar32=c;

	828 }

	829

	830 if(length>0) {

	831 /* output length bytes with overflow (length>targetCapacity>0) */

	832 ucnv_fromUWriteBytes(cnv,

	833 overflow, length,

	834 &target, pArgs->targetLimit,

	835 &offsets, sourceIndex,

	836 pErrorCode);

	837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);

	838 }

	839

	840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {

	841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	842 }

	843

	844 /* write back the updated pointers */

	845 pArgs->source=source;

	846 pArgs->target=target;

	847 pArgs->offsets=offsets;

	848 }

	849

	850 static void

	851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	852 UErrorCode *pErrorCode) {

	853 UConverter *cnv;

	854 const uint8_t *source;

	855 UChar *target;

	856 int32_t *offsets;

	857

	858 uint32_t targetCapacity, length, count, sourceIndex;

	859 UChar c, trail;

	860

	861 if(pArgs->converter->mode<8) {

	862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);

	863 return;

	864 }

	865

	866 cnv=pArgs->converter;

	867 source=(const uint8_t *)pArgs->source;

	868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);

	869 if(length<=0 && cnv->toUnicodeStatus==0) {

	870 /* no input, nothing to do */

	871 return;

	872 }

	873

	874 target=pArgs->target;

	875 if(target >= pArgs->targetLimit) {

	876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	877 return;

	878 }

	879

	880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);

	881 offsets=pArgs->offsets;

	882 sourceIndex=0;

	883 c=0;

	884

	885 /* complete a partial UChar or pair from the last call */

	886 if(cnv->toUnicodeStatus!=0) {

	887 /*

	888 * special case: single byte from a previous buffer,

	889 * where the byte turned out not to belong to a trail surrogate

	890 * and the preceding, unmatched lead surrogate was put into toUBytes[]

	891 * for error handling

	892 */

	893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;

	894 cnv->toULength=1;

	895 cnv->toUnicodeStatus=0;

	896 }

	897 if((count=cnv->toULength)!=0) {

	898 uint8_t *p=cnv->toUBytes;

	899 do {

	900 p[count++]=*source++;

	901 ++sourceIndex;

	902 --length;

	903 if(count==2) {

	904 c=((UChar)p[1]<<8)\|p[0];

	905 if(U16_IS_SINGLE(c)) {

	906 /* output the BMP code point */

	907 *target++=c;

	908 if(offsets!=NULL) {

	909 *offsets++=-1;

	910 }

	911 --targetCapacity;

	912 count=0;

	913 c=0;

	914 break;

	915 } else if(U16_IS_SURROGATE_LEAD(c)) {

	916 /* continue collecting bytes for the trail surrogate */

	917 c=0; /* avoid unnecessary surrogate handling below */

	918 } else {

	919 /* fall through to error handling for an unmatched trail sur rogate */

	920 break;

	921 }

	922 } else if(count==4) {

	923 c=((UChar)p[1]<<8)\|p[0];

	924 trail=((UChar)p[3]<<8)\|p[2];

	925 if(U16_IS_TRAIL(trail)) {

	926 /* output the surrogate pair */

	927 *target++=c;

	928 if(targetCapacity>=2) {

	929 *target++=trail;

	930 if(offsets!=NULL) {

	931 *offsets++=-1;

	932 *offsets++=-1;

	933 }

	934 targetCapacity-=2;

	935 } else /* targetCapacity==1 */ {

	936 targetCapacity=0;

	937 cnv->UCharErrorBuffer[0]=trail;

	938 cnv->UCharErrorBufferLength=1;

	939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	940 }

	941 count=0;

	942 c=0;

	943 break;

	944 } else {

	945 /* unmatched lead surrogate, handle here for consistent toUB ytes[] */

	946 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	947

	948 /* back out reading the code unit after it */

	949 if(((const uint8_t *)pArgs->source-source)>=2) {

	950 source-=2;

	951 } else {

	952 /*

	953 * if the trail unit's first byte was in a previous buff er, then

	954 * we need to put it into a special place because toUByt es[] will be

	955 * used for the lead unit's bytes

	956 */

	957 cnv->toUnicodeStatus=0x100\|p[2];

	958 --source;

	959 }

	960 cnv->toULength=2;

	961

	962 /* write back the updated pointers */

	963 pArgs->source=(const char *)source;

	964 pArgs->target=target;

	965 pArgs->offsets=offsets;

	966 return;

	967 }

	968 }

	969 } while(length>0);

	970 cnv->toULength=(int8_t)count;

	971 }

	972

	973 /* copy an even number of bytes for complete UChars */

	974 count=2*targetCapacity;

	975 if(count>length) {

	976 count=length&~1;

	977 }

	978 if(c==0 && count>0) {

	979 length-=count;

	980 count>>=1;

	981 targetCapacity-=count;

	982 if(offsets==NULL) {

	983 do {

	984 c=((UChar)source[1]<<8)\|source[0];

	985 source+=2;

	986 if(U16_IS_SINGLE(c)) {

	987 *target++=c;

	988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

	989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])

	990 ) {

	991 source+=2;

	992 --count;

	993 *target++=c;

	994 *target++=trail;

	995 } else {

	996 break;

	997 }

	998 } while(--count>0);

	999 } else {

	1000 do {

	1001 c=((UChar)source[1]<<8)\|source[0];

	1002 source+=2;

	1003 if(U16_IS_SINGLE(c)) {

	1004 *target++=c;

	1005 *offsets++=sourceIndex;

	1006 sourceIndex+=2;

	1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

	1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])

	1009 ) {

	1010 source+=2;

	1011 --count;

	1012 *target++=c;

	1013 *target++=trail;

	1014 *offsets++=sourceIndex;

	1015 *offsets++=sourceIndex;

	1016 sourceIndex+=4;

	1017 } else {

	1018 break;

	1019 }

	1020 } while(--count>0);

	1021 }

	1022

	1023 if(count==0) {

	1024 /* done with the loop for complete UChars */

	1025 c=0;

	1026 } else {

	1027 /* keep c for surrogate handling, trail will be set there */

	1028 length+=2(count-1); / one more byte pair was consumed than count d ecremented */

	1029 targetCapacity+=count;

	1030 }

	1031 }

	1032

	1033 if(c!=0) {

	1034 /*

	1035 * c is a surrogate, and

	1036 * - source or target too short

	1037 * - or the surrogate is unmatched

	1038 */

	1039 cnv->toUBytes[0]=(uint8_t)c;

	1040 cnv->toUBytes[1]=(uint8_t)(c>>8);

	1041 cnv->toULength=2;

	1042

	1043 if(U16_IS_SURROGATE_LEAD(c)) {

	1044 if(length>=2) {

	1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)\|source[0])) {

	1046 /* output the surrogate pair, will overflow (see conditions comment above) */

	1047 source+=2;

	1048 length-=2;

	1049 *target++=c;

	1050 if(offsets!=NULL) {

	1051 *offsets++=sourceIndex;

	1052 }

	1053 cnv->UCharErrorBuffer[0]=trail;

	1054 cnv->UCharErrorBufferLength=1;

	1055 cnv->toULength=0;

	1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1057 } else {

	1058 /* unmatched lead surrogate */

	1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1060 }

	1061 } else {

	1062 /* see if the trail surrogate is in the next buffer */

	1063 }

	1064 } else {

	1065 /* unmatched trail surrogate */

	1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1067 }

	1068 }

	1069

	1070 if(U_SUCCESS(*pErrorCode)) {

	1071 /* check for a remaining source byte */

	1072 if(length>0) {

	1073 if(targetCapacity==0) {

	1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1075 } else {

	1076 /* it must be length==1 because otherwise the above would have c opied more */

	1077 cnv->toUBytes[cnv->toULength++]=*source++;

	1078 }

	1079 }

	1080 }

	1081

	1082 /* write back the updated pointers */

	1083 pArgs->source=(const char *)source;

	1084 pArgs->target=target;

	1085 pArgs->offsets=offsets;

	1086 }

	1087

	1088 static UChar32

	1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs pArgs, UErrorCode err) {

	1090 const uint8_t s, sourceLimit;

	1091 UChar32 c;

	1092

	1093 if(pArgs->converter->mode<8) {

	1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	1095 }

	1096

	1097 s=(const uint8_t *)pArgs->source;

	1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	1099

	1100 if(s>=sourceLimit) {

	1101 /* no input */

	1102 *err=U_INDEX_OUTOFBOUNDS_ERROR;

	1103 return 0xffff;

	1104 }

	1105

	1106 if(s+2>sourceLimit) {

	1107 /* only one byte: truncated UChar */

	1108 pArgs->converter->toUBytes[0]=*s++;

	1109 pArgs->converter->toULength=1;

	1110 pArgs->source=(const char *)s;

	1111 *err = U_TRUNCATED_CHAR_FOUND;

	1112 return 0xffff;

	1113 }

	1114

	1115 /* get one UChar */

	1116 c=((UChar32)s[1]<<8)\|*s;

	1117 s+=2;

	1118

	1119 /* check for a surrogate pair */

	1120 if(U_IS_SURROGATE(c)) {

	1121 if(U16_IS_SURROGATE_LEAD(c)) {

	1122 if(s+2<=sourceLimit) {

	1123 UChar trail;

	1124

	1125 /* get a second UChar and see if it is a trail surrogate */

	1126 trail=((UChar)s[1]<<8)\|*s;

	1127 if(U16_IS_TRAIL(trail)) {

	1128 c=U16_GET_SUPPLEMENTARY(c, trail);

	1129 s+=2;

	1130 } else {

	1131 /* unmatched lead surrogate */

	1132 c=-2;

	1133 }

	1134 } else {

	1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code p oint */

	1136 uint8_t *bytes=pArgs->converter->toUBytes;

	1137 s-=2;

	1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s);

	1139 do {

	1140 bytes++=s++;

	1141 } while(s<sourceLimit);

	1142

	1143 c=0xffff;

	1144 *err=U_TRUNCATED_CHAR_FOUND;

	1145 }

	1146 } else {

	1147 /* unmatched trail surrogate */

	1148 c=-2;

	1149 }

	1150

	1151 if(c<0) {

	1152 /* write the unmatched surrogate */

	1153 uint8_t *bytes=pArgs->converter->toUBytes;

	1154 pArgs->converter->toULength=2;

	1155 bytes=(s-2);

	1156 bytes[1]=*(s-1);

	1157

	1158 c=0xffff;

	1159 *err=U_ILLEGAL_CHAR_FOUND;

	1160 }

	1161 }

	1162

	1163 pArgs->source=(const char *)s;

	1164 return c;

	1165 }

	1166

	1167 static void

	1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {

	1169 if(choice<=UCNV_RESET_TO_UNICODE) {

	1170 /* reset toUnicode state */

	1171 if(UCNV_GET_VERSION(cnv)==0) {

	1172 cnv->mode=8; /* no BOM handling */

	1173 } else {

	1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */

	1175 }

	1176 }

	1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {

	1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */

	1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

	1180 }

	1181 }

	1182

	1183 static void

	1184 _UTF16LEOpen(UConverter *cnv,

	1185 UConverterLoadArgs *pArgs,

	1186 UErrorCode *pErrorCode) {

	1187 if(UCNV_GET_VERSION(cnv)<=1) {

	1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH);

	1189 } else {

	1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	1191 }

	1192 }

	1193

	1194 static const char *

	1195 _UTF16LEGetName(const UConverter *cnv) {

	1196 if(UCNV_GET_VERSION(cnv)==0) {

	1197 return "UTF-16LE";

	1198 } else {

	1199 return "UTF-16LE,version=1";

	1200 }

	1201 }

	1202

	1203 static const UConverterImpl _UTF16LEImpl={

	1204 UCNV_UTF16_LittleEndian,

	1205

	1206 NULL,

	1207 NULL,

	1208

	1209 _UTF16LEOpen,

	1210 NULL,

	1211 _UTF16LEReset,

	1212

	1213 _UTF16LEToUnicodeWithOffsets,

	1214 _UTF16LEToUnicodeWithOffsets,

	1215 _UTF16LEFromUnicodeWithOffsets,

	1216 _UTF16LEFromUnicodeWithOffsets,

	1217 _UTF16LEGetNextUChar,

	1218

	1219 NULL,

	1220 _UTF16LEGetName,

	1221 NULL,

	1222 NULL,

	1223 ucnv_getNonSurrogateUnicodeSet

	1224 };

	1225

	1226

	1227 static const UConverterStaticData _UTF16LEStaticData={

	1228 sizeof(UConverterStaticData),

	1229 "UTF-16LE",

	1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,

	1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,

	1232 0,

	1233 0,

	1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1235 };

	1236

	1237

	1238 const UConverterSharedData _UTF16LEData={

	1239 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,

	1241 0

	1242 };

	1243

	1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */

	1245

	1246 /*

	1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE

	1248 * accordingly.

	1249 * This is a simpler version of the UTF-32 converter, with

	1250 * fewer states for shorter BOMs.

	1251 *

	1252 * State values:

	1253 * 0 initial state

	1254 * 1 saw first byte

	1255 * 2..5 -

	1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1

	1257 * 8 UTF-16BE mode

	1258 * 9 UTF-16LE mode

	1259 *

	1260 * During detection: state==number of initial bytes seen so far.

	1261 *

	1262 * On output, emit U+FEFF as the first code point.

	1263 *

	1264 * Variants:

	1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.

	1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and

	1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.

	1268 */

	1269

	1270 static void

	1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {

	1272 if(choice<=UCNV_RESET_TO_UNICODE) {

	1273 /* reset toUnicode: state=0 */

	1274 cnv->mode=0;

	1275 }

	1276 if(choice!=UCNV_RESET_TO_UNICODE) {

	1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */

	1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

	1279 }

	1280 }

	1281

	1282 static const UConverterSharedData _UTF16v2Data;

	1283

	1284 static void

	1285 _UTF16Open(UConverter *cnv,

	1286 UConverterLoadArgs *pArgs,

	1287 UErrorCode *pErrorCode) {

	1288 if(UCNV_GET_VERSION(cnv)<=2) {

	1289 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {

	1290 /*

	1291 * Switch implementation, and switch the staticData that's different

	1292 * and was copied into the UConverter.

	1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)

	1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte st ream.

	1295 */

	1296 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;

	1297 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MA X_SUBCHAR_LEN);

	1298 }

	1299 _UTF16Reset(cnv, UCNV_RESET_BOTH);

	1300 } else {

	1301 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	1302 }

	1303 }

	1304

	1305 static const char *

	1306 _UTF16GetName(const UConverter *cnv) {

	1307 if(UCNV_GET_VERSION(cnv)==0) {

	1308 return "UTF-16";

	1309 } else if(UCNV_GET_VERSION(cnv)==1) {

	1310 return "UTF-16,version=1";

	1311 } else {

	1312 return "UTF-16,version=2";

	1313 }

	1314 }

	1315

	1316 const UConverterSharedData _UTF16Data;

	1317

	1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)

	1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)

	1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data \|\| (cnv)->sharedData==&_UT F16v2Data)

	1321

	1322 static void

	1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	1324 UErrorCode *pErrorCode) {

	1325 UConverter *cnv=pArgs->converter;

	1326 const char *source=pArgs->source;

	1327 const char *sourceLimit=pArgs->sourceLimit;

	1328 int32_t *offsets=pArgs->offsets;

	1329

	1330 int32_t state, offsetDelta;

	1331 uint8_t b;

	1332

	1333 state=cnv->mode;

	1334

	1335 /*

	1336 * If we detect a BOM in this buffer, then we must add the BOM size to the

	1337 * offsets because the actual converter function will not see and count the BOM.

	1338 * offsetDelta will have the number of the BOM bytes that are in the current buffer.

	1339 */

	1340 offsetDelta=0;

	1341

	1342 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {

	1343 switch(state) {

	1344 case 0:

	1345 cnv->toUBytes[0]=(uint8_t)*source++;

	1346 cnv->toULength=1;

	1347 state=1;

	1348 break;

	1349 case 1:

	1350 /*

	1351 * Only inside this switch case can the state variable

	1352 * temporarily take two additional values:

	1353 * 6: BOM error, continue with BE

	1354 * 7: BOM error, continue with LE

	1355 */

	1356 b=*source;

	1357 if(cnv->toUBytes[0]==0xfe && b==0xff) {

	1358 if(IS_UTF16LE(cnv)) {

	1359 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */

	1360 } else {

	1361 state=8; /* detect UTF-16BE */

	1362 }

	1363 } else if(cnv->toUBytes[0]==0xff && b==0xfe) {

	1364 if(IS_UTF16BE(cnv)) {

	1365 state=6; /* illegal reverse BOM for Java "UnicodeBig" */

	1366 } else {

	1367 state=9; /* detect UTF-16LE */

	1368 }

	1369 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {

	1370 state=6; /* illegal missing BOM for Java "Unicode" */

	1371 }

	1372 if(state>=8) {

	1373 /* BOM detected, consume it */

	1374 ++source;

	1375 cnv->toULength=0;

	1376 offsetDelta=(int32_t)(source-pArgs->source);

	1377 } else if(state<6) {

	1378 /* ok: no BOM, and not a reverse BOM */

	1379 if(source!=pArgs->source) {

	1380 /* reset the source for a correct first offset */

	1381 source=pArgs->source;

	1382 cnv->toULength=0;

	1383 }

	1384 if(IS_UTF16LE(cnv)) {

	1385 /* Make Java "UnicodeLittle" default to LE. */

	1386 state=9;

	1387 } else {

	1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */

	1389 state=8;

	1390 }

	1391 } else {

	1392 /*

	1393 * error: missing BOM, or reverse BOM

	1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.

	1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.

	1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.

	1397 */

	1398 /* report the non-BOM or reverse BOM as an illegal sequence */

	1399 cnv->toUBytes[1]=b;

	1400 cnv->toULength=2;

	1401 pArgs->source=source+1;

	1402 /* continue with conversion if the callback resets the error */

	1403 /*

	1404 * Make Java "Unicode" default to BE like standard UTF-16.

	1405 * Make Java "UnicodeBig" and "UnicodeLittle" default

	1406 * to their normal endiannesses.

	1407 */

	1408 cnv->mode=state+2;

	1409 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;

	1410 return;

	1411 }

	1412 /* convert the rest of the stream */

	1413 cnv->mode=state;

	1414 continue;

	1415 case 8:

	1416 /* call UTF-16BE */

	1417 pArgs->source=source;

	1418 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);

	1419 source=pArgs->source;

	1420 break;

	1421 case 9:

	1422 /* call UTF-16LE */

	1423 pArgs->source=source;

	1424 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);

	1425 source=pArgs->source;

	1426 break;

	1427 default:

	1428 break; /* does not occur */

	1429 }

	1430 }

	1431

	1432 /* add BOM size to offsets - see comment at offsetDelta declaration */

	1433 if(offsets!=NULL && offsetDelta!=0) {

	1434 int32_t *offsetsLimit=pArgs->offsets;

	1435 while(offsets<offsetsLimit) {

	1436 *offsets++ += offsetDelta;

	1437 }

	1438 }

	1439

	1440 pArgs->source=source;

	1441

	1442 if(source==sourceLimit && pArgs->flush) {

	1443 /* handle truncated input */

	1444 switch(state) {

	1445 case 0:

	1446 break; /* no input at all, nothing to do */

	1447 case 8:

	1448 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);

	1449 break;

	1450 case 9:

	1451 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);

	1452 break;

	1453 default:

	1454 /* 0<state<8: framework will report truncation, nothing to do here * /

	1455 break;

	1456 }

	1457 }

	1458

	1459 cnv->mode=state;

	1460 }

	1461

	1462 static UChar32

	1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,

	1464 UErrorCode *pErrorCode) {

	1465 switch(pArgs->converter->mode) {

	1466 case 8:

	1467 return _UTF16BEGetNextUChar(pArgs, pErrorCode);

	1468 case 9:

	1469 return _UTF16LEGetNextUChar(pArgs, pErrorCode);

	1470 default:

	1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	1472 }

	1473 }

	1474

	1475 static const UConverterImpl _UTF16Impl = {

	1476 UCNV_UTF16,

	1477

	1478 NULL,

	1479 NULL,

	1480

	1481 _UTF16Open,

	1482 NULL,

	1483 _UTF16Reset,

	1484

	1485 _UTF16ToUnicodeWithOffsets,

	1486 _UTF16ToUnicodeWithOffsets,

	1487 _UTF16PEFromUnicodeWithOffsets,

	1488 _UTF16PEFromUnicodeWithOffsets,

	1489 _UTF16GetNextUChar,

	1490

	1491 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

	1492 _UTF16GetName,

	1493 NULL,

	1494 NULL,

	1495 ucnv_getNonSurrogateUnicodeSet

	1496 };

	1497

	1498 static const UConverterStaticData _UTF16StaticData = {

	1499 sizeof(UConverterStaticData),

	1500 "UTF-16",

	1501 1204, /* CCSID for BOM sensitive UTF-16 */

	1502 UCNV_IBM, UCNV_UTF16, 2, 2,

	1503 #if U_IS_BIG_ENDIAN

	1504 { 0xff, 0xfd, 0, 0 }, 2,

	1505 #else

	1506 { 0xfd, 0xff, 0, 0 }, 2,

	1507 #endif

	1508 FALSE, FALSE,

	1509 0,

	1510 0,

	1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1512 };

	1513

	1514 const UConverterSharedData _UTF16Data = {

	1515 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1516 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,

	1517 0

	1518 };

	1519

	1520 static const UConverterImpl _UTF16v2Impl = {

	1521 UCNV_UTF16,

	1522

	1523 NULL,

	1524 NULL,

	1525

	1526 _UTF16Open,

	1527 NULL,

	1528 _UTF16Reset,

	1529

	1530 _UTF16ToUnicodeWithOffsets,

	1531 _UTF16ToUnicodeWithOffsets,

	1532 _UTF16BEFromUnicodeWithOffsets,

	1533 _UTF16BEFromUnicodeWithOffsets,

	1534 _UTF16GetNextUChar,

	1535

	1536 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

	1537 _UTF16GetName,

	1538 NULL,

	1539 NULL,

	1540 ucnv_getNonSurrogateUnicodeSet

	1541 };

	1542

	1543 static const UConverterStaticData _UTF16v2StaticData = {

	1544 sizeof(UConverterStaticData),

	1545 "UTF-16,version=2",

	1546 1204, /* CCSID for BOM sensitive UTF-16 */

	1547 UCNV_IBM, UCNV_UTF16, 2, 2,

	1548 { 0xff, 0xfd, 0, 0 }, 2,

	1549 FALSE, FALSE,

	1550 0,

	1551 0,

	1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1553 };

	1554

	1555 static const UConverterSharedData _UTF16v2Data = {

	1556 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1557 NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,

	1558 0

	1559 };

	1560

	1561 #endif

OLD	NEW

« no previous file with comments | « icu46/source/common/ucnv_set.c ('k') | icu46/source/common/ucnv_u32.c » ('j') | no next file with comments »