icu46/source/common/ucnv_u32.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucnv_u32.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2002-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * file name: ucnv_u32.c

	7 * encoding: US-ASCII

	8 * tab size: 8 (not used)

	9 * indentation:4

	10 *

	11 * created on: 2002jul01

	12 * created by: Markus W. Scherer

	13 *

	14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.

	15 */

	16

	17 #include "unicode/utypes.h"

	18

	19 #if !UCONFIG_NO_CONVERSION

	20

	21 #include "unicode/ucnv.h"

	22 #include "ucnv_bld.h"

	23 #include "ucnv_cnv.h"

	24 #include "cmemory.h"

	25

	26 #define MAXIMUM_UCS2 0x0000FFFF

	27 #define MAXIMUM_UTF 0x0010FFFF

	28 #define HALF_SHIFT 10

	29 #define HALF_BASE 0x0010000

	30 #define HALF_MASK 0x3FF

	31 #define SURROGATE_HIGH_START 0xD800

	32 #define SURROGATE_LOW_START 0xDC00

	33

	34 /* -SURROGATE_LOW_START + HALF_BASE */

	35 #define SURROGATE_LOW_BASE 9216

	36

	37 enum {

	38 UCNV_NEED_TO_WRITE_BOM=1

	39 };

	40

	41 /* UTF-32BE ----------------------------------------------------------------- */

	42

	43 static void

	44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,

	45 UErrorCode * err)

	46 {

	47 const unsigned char mySource = (unsigned char ) args->source;

	48 UChar *myTarget = args->target;

	49 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	50 const UChar *targetLimit = args->targetLimit;

	51 unsigned char *toUBytes = args->converter->toUBytes;

	52 uint32_t ch, i;

	53

	54 /* Restore state of current sequence */

	55 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {

	56 i = args->converter->toULength; /* restore # of bytes consumed */

	57 args->converter->toULength = 0;

	58

	59 ch = args->converter->toUnicodeStatus - 1;/Stores the previously calcul ated ch from a previous call/

	60 args->converter->toUnicodeStatus = 0;

	61 goto morebytes;

	62 }

	63

	64 while (mySource < sourceLimit && myTarget < targetLimit) {

	65 i = 0;

	66 ch = 0;

	67 morebytes:

	68 while (i < sizeof(uint32_t)) {

	69 if (mySource < sourceLimit) {

	70 ch = (ch << 8) \| (uint8_t)(*mySource);

	71 toUBytes[i++] = (char) *(mySource++);

	72 }

	73 else {

	74 /* stores a partially calculated target*/

	75 /* + 1 to make 0 a valid character */

	76 args->converter->toUnicodeStatus = ch + 1;

	77 args->converter->toULength = (int8_t) i;

	78 goto donefornow;

	79 }

	80 }

	81

	82 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

	83 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

	84 if (ch <= MAXIMUM_UCS2)

	85 {

	86 /* fits in 16 bits */

	87 *(myTarget++) = (UChar) ch;

	88 }

	89 else {

	90 /* write out the surrogates */

	91 *(myTarget++) = U16_LEAD(ch);

	92 ch = U16_TRAIL(ch);

	93 if (myTarget < targetLimit) {

	94 *(myTarget++) = (UChar)ch;

	95 }

	96 else {

	97 /* Put in overflow buffer (not handled here) */

	98 args->converter->UCharErrorBuffer[0] = (UChar) ch;

	99 args->converter->UCharErrorBufferLength = 1;

	100 *err = U_BUFFER_OVERFLOW_ERROR;

	101 break;

	102 }

	103 }

	104 }

	105 else {

	106 args->converter->toULength = (int8_t)i;

	107 *err = U_ILLEGAL_CHAR_FOUND;

	108 break;

	109 }

	110 }

	111

	112 donefornow:

	113 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

	114 /* End of target buffer */

	115 *err = U_BUFFER_OVERFLOW_ERROR;

	116 }

	117

	118 args->target = myTarget;

	119 args->source = (const char *) mySource;

	120 }

	121

	122 static void

	123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,

	124 UErrorCode * err)

	125 {

	126 const unsigned char mySource = (unsigned char ) args->source;

	127 UChar *myTarget = args->target;

	128 int32_t *myOffsets = args->offsets;

	129 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	130 const UChar *targetLimit = args->targetLimit;

	131 unsigned char *toUBytes = args->converter->toUBytes;

	132 uint32_t ch, i;

	133 int32_t offsetNum = 0;

	134

	135 /* Restore state of current sequence */

	136 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {

	137 i = args->converter->toULength; /* restore # of bytes consumed */

	138 args->converter->toULength = 0;

	139

	140 ch = args->converter->toUnicodeStatus - 1;/Stores the previously calcul ated ch from a previous call/

	141 args->converter->toUnicodeStatus = 0;

	142 goto morebytes;

	143 }

	144

	145 while (mySource < sourceLimit && myTarget < targetLimit) {

	146 i = 0;

	147 ch = 0;

	148 morebytes:

	149 while (i < sizeof(uint32_t)) {

	150 if (mySource < sourceLimit) {

	151 ch = (ch << 8) \| (uint8_t)(*mySource);

	152 toUBytes[i++] = (char) *(mySource++);

	153 }

	154 else {

	155 /* stores a partially calculated target*/

	156 /* + 1 to make 0 a valid character */

	157 args->converter->toUnicodeStatus = ch + 1;

	158 args->converter->toULength = (int8_t) i;

	159 goto donefornow;

	160 }

	161 }

	162

	163 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

	164 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

	165 if (ch <= MAXIMUM_UCS2) {

	166 /* fits in 16 bits */

	167 *(myTarget++) = (UChar) ch;

	168 *(myOffsets++) = offsetNum;

	169 }

	170 else {

	171 /* write out the surrogates */

	172 *(myTarget++) = U16_LEAD(ch);

	173 *myOffsets++ = offsetNum;

	174 ch = U16_TRAIL(ch);

	175 if (myTarget < targetLimit)

	176 {

	177 *(myTarget++) = (UChar)ch;

	178 *(myOffsets++) = offsetNum;

	179 }

	180 else {

	181 /* Put in overflow buffer (not handled here) */

	182 args->converter->UCharErrorBuffer[0] = (UChar) ch;

	183 args->converter->UCharErrorBufferLength = 1;

	184 *err = U_BUFFER_OVERFLOW_ERROR;

	185 break;

	186 }

	187 }

	188 }

	189 else {

	190 args->converter->toULength = (int8_t)i;

	191 *err = U_ILLEGAL_CHAR_FOUND;

	192 break;

	193 }

	194 offsetNum += i;

	195 }

	196

	197 donefornow:

	198 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	199 {

	200 /* End of target buffer */

	201 *err = U_BUFFER_OVERFLOW_ERROR;

	202 }

	203

	204 args->target = myTarget;

	205 args->source = (const char *) mySource;

	206 args->offsets = myOffsets;

	207 }

	208

	209 static void

	210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,

	211 UErrorCode * err)

	212 {

	213 const UChar *mySource = args->source;

	214 unsigned char *myTarget;

	215 const UChar *sourceLimit = args->sourceLimit;

	216 const unsigned char targetLimit = (unsigned char ) args->targetLimit;

	217 UChar32 ch, ch2;

	218 unsigned int indexToWrite;

	219 unsigned char temp[sizeof(uint32_t)];

	220

	221 if(mySource >= sourceLimit) {

	222 /* no input, nothing to do */

	223 return;

	224 }

	225

	226 /* write the BOM if necessary */

	227 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	228 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };

	229 ucnv_fromUWriteBytes(args->converter,

	230 bom, 4,

	231 &args->target, args->targetLimit,

	232 &args->offsets, -1,

	233 err);

	234 args->converter->fromUnicodeStatus=0;

	235 }

	236

	237 myTarget = (unsigned char *) args->target;

	238 temp[0] = 0;

	239

	240 if (args->converter->fromUChar32) {

	241 ch = args->converter->fromUChar32;

	242 args->converter->fromUChar32 = 0;

	243 goto lowsurogate;

	244 }

	245

	246 while (mySource < sourceLimit && myTarget < targetLimit) {

	247 ch = *(mySource++);

	248

	249 if (UTF_IS_SURROGATE(ch)) {

	250 if (U_IS_LEAD(ch)) {

	251 lowsurogate:

	252 if (mySource < sourceLimit) {

	253 ch2 = *mySource;

	254 if (U_IS_TRAIL(ch2)) {

	255 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

	256 mySource++;

	257 }

	258 else {

	259 /* this is an unmatched trail code unit (2nd surrogate) */

	260 /* callback(illegal) */

	261 args->converter->fromUChar32 = ch;

	262 *err = U_ILLEGAL_CHAR_FOUND;

	263 break;

	264 }

	265 }

	266 else {

	267 /* ran out of source */

	268 args->converter->fromUChar32 = ch;

	269 if (args->flush) {

	270 /* this is an unmatched trail code unit (2nd surrogate) */

	271 /* callback(illegal) */

	272 *err = U_ILLEGAL_CHAR_FOUND;

	273 }

	274 break;

	275 }

	276 }

	277 else {

	278 /* this is an unmatched trail code unit (2nd surrogate) */

	279 /* callback(illegal) */

	280 args->converter->fromUChar32 = ch;

	281 *err = U_ILLEGAL_CHAR_FOUND;

	282 break;

	283 }

	284 }

	285

	286 /* We cannot get any larger than 10FFFF because we are coming from UTF-1 6 */

	287 temp[1] = (uint8_t) (ch >> 16 & 0x1F);

	288 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & F F) */

	289 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & F F) */

	290

	291 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrit e++) {

	292 if (myTarget < targetLimit) {

	293 *(myTarget++) = temp[indexToWrite];

	294 }

	295 else {

	296 args->converter->charErrorBuffer[args->converter->charErrorBuffe rLength++] = temp[indexToWrite];

	297 *err = U_BUFFER_OVERFLOW_ERROR;

	298 }

	299 }

	300 }

	301

	302 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

	303 *err = U_BUFFER_OVERFLOW_ERROR;

	304 }

	305

	306 args->target = (char *) myTarget;

	307 args->source = mySource;

	308 }

	309

	310 static void

	311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,

	312 UErrorCode * err)

	313 {

	314 const UChar *mySource = args->source;

	315 unsigned char *myTarget;

	316 int32_t *myOffsets;

	317 const UChar *sourceLimit = args->sourceLimit;

	318 const unsigned char targetLimit = (unsigned char ) args->targetLimit;

	319 UChar32 ch, ch2;

	320 int32_t offsetNum = 0;

	321 unsigned int indexToWrite;

	322 unsigned char temp[sizeof(uint32_t)];

	323

	324 if(mySource >= sourceLimit) {

	325 /* no input, nothing to do */

	326 return;

	327 }

	328

	329 /* write the BOM if necessary */

	330 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	331 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };

	332 ucnv_fromUWriteBytes(args->converter,

	333 bom, 4,

	334 &args->target, args->targetLimit,

	335 &args->offsets, -1,

	336 err);

	337 args->converter->fromUnicodeStatus=0;

	338 }

	339

	340 myTarget = (unsigned char *) args->target;

	341 myOffsets = args->offsets;

	342 temp[0] = 0;

	343

	344 if (args->converter->fromUChar32) {

	345 ch = args->converter->fromUChar32;

	346 args->converter->fromUChar32 = 0;

	347 goto lowsurogate;

	348 }

	349

	350 while (mySource < sourceLimit && myTarget < targetLimit) {

	351 ch = *(mySource++);

	352

	353 if (UTF_IS_SURROGATE(ch)) {

	354 if (U_IS_LEAD(ch)) {

	355 lowsurogate:

	356 if (mySource < sourceLimit) {

	357 ch2 = *mySource;

	358 if (U_IS_TRAIL(ch2)) {

	359 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

	360 mySource++;

	361 }

	362 else {

	363 /* this is an unmatched trail code unit (2nd surrogate) */

	364 /* callback(illegal) */

	365 args->converter->fromUChar32 = ch;

	366 *err = U_ILLEGAL_CHAR_FOUND;

	367 break;

	368 }

	369 }

	370 else {

	371 /* ran out of source */

	372 args->converter->fromUChar32 = ch;

	373 if (args->flush) {

	374 /* this is an unmatched trail code unit (2nd surrogate) */

	375 /* callback(illegal) */

	376 *err = U_ILLEGAL_CHAR_FOUND;

	377 }

	378 break;

	379 }

	380 }

	381 else {

	382 /* this is an unmatched trail code unit (2nd surrogate) */

	383 /* callback(illegal) */

	384 args->converter->fromUChar32 = ch;

	385 *err = U_ILLEGAL_CHAR_FOUND;

	386 break;

	387 }

	388 }

	389

	390 /* We cannot get any larger than 10FFFF because we are coming from UTF-1 6 */

	391 temp[1] = (uint8_t) (ch >> 16 & 0x1F);

	392 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & F F) */

	393 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & F F) */

	394

	395 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrit e++) {

	396 if (myTarget < targetLimit) {

	397 *(myTarget++) = temp[indexToWrite];

	398 *(myOffsets++) = offsetNum;

	399 }

	400 else {

	401 args->converter->charErrorBuffer[args->converter->charErrorBuffe rLength++] = temp[indexToWrite];

	402 *err = U_BUFFER_OVERFLOW_ERROR;

	403 }

	404 }

	405 offsetNum = offsetNum + 1 + (temp[1] != 0);

	406 }

	407

	408 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {

	409 *err = U_BUFFER_OVERFLOW_ERROR;

	410 }

	411

	412 args->target = (char *) myTarget;

	413 args->source = mySource;

	414 args->offsets = myOffsets;

	415 }

	416

	417 static UChar32

	418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,

	419 UErrorCode* err)

	420 {

	421 const uint8_t *mySource;

	422 UChar32 myUChar;

	423 int32_t length;

	424

	425 mySource = (const uint8_t *)args->source;

	426 if (mySource >= (const uint8_t *)args->sourceLimit)

	427 {

	428 /* no input */

	429 *err = U_INDEX_OUTOFBOUNDS_ERROR;

	430 return 0xffff;

	431 }

	432

	433 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);

	434 if (length < 4)

	435 {

	436 /* got a partial character */

	437 uprv_memcpy(args->converter->toUBytes, mySource, length);

	438 args->converter->toULength = (int8_t)length;

	439 args->source = (const char *)(mySource + length);

	440 *err = U_TRUNCATED_CHAR_FOUND;

	441 return 0xffff;

	442 }

	443

	444 /* Don't even try to do a direct cast because the value may be on an odd add ress. */

	445 myUChar = ((UChar32)mySource[0] << 24)

	446 \| ((UChar32)mySource[1] << 16)

	447 \| ((UChar32)mySource[2] << 8)

	448 \| ((UChar32)mySource[3]);

	449

	450 args->source = (const char *)(mySource + 4);

	451 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {

	452 return myUChar;

	453 }

	454

	455 uprv_memcpy(args->converter->toUBytes, mySource, 4);

	456 args->converter->toULength = 4;

	457

	458 *err = U_ILLEGAL_CHAR_FOUND;

	459 return 0xffff;

	460 }

	461

	462 static const UConverterImpl _UTF32BEImpl = {

	463 UCNV_UTF32_BigEndian,

	464

	465 NULL,

	466 NULL,

	467

	468 NULL,

	469 NULL,

	470 NULL,

	471

	472 T_UConverter_toUnicode_UTF32_BE,

	473 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,

	474 T_UConverter_fromUnicode_UTF32_BE,

	475 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,

	476 T_UConverter_getNextUChar_UTF32_BE,

	477

	478 NULL,

	479 NULL,

	480 NULL,

	481 NULL,

	482 ucnv_getNonSurrogateUnicodeSet

	483 };

	484

	485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */

	486 static const UConverterStaticData _UTF32BEStaticData = {

	487 sizeof(UConverterStaticData),

	488 "UTF-32BE",

	489 1232,

	490 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,

	491 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,

	492 0,

	493 0,

	494 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	495 };

	496

	497 const UConverterSharedData _UTF32BEData = {

	498 sizeof(UConverterSharedData), ~((uint32_t) 0),

	499 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,

	500 0

	501 };

	502

	503 /* UTF-32LE ---------------------------------------------------------- */

	504

	505 static void

	506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,

	507 UErrorCode * err)

	508 {

	509 const unsigned char mySource = (unsigned char ) args->source;

	510 UChar *myTarget = args->target;

	511 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	512 const UChar *targetLimit = args->targetLimit;

	513 unsigned char *toUBytes = args->converter->toUBytes;

	514 uint32_t ch, i;

	515

	516 /* Restore state of current sequence */

	517 if (args->converter->toUnicodeStatus && myTarget < targetLimit)

	518 {

	519 i = args->converter->toULength; /* restore # of bytes consumed */

	520 args->converter->toULength = 0;

	521

	522 /* Stores the previously calculated ch from a previous call*/

	523 ch = args->converter->toUnicodeStatus - 1;

	524 args->converter->toUnicodeStatus = 0;

	525 goto morebytes;

	526 }

	527

	528 while (mySource < sourceLimit && myTarget < targetLimit)

	529 {

	530 i = 0;

	531 ch = 0;

	532 morebytes:

	533 while (i < sizeof(uint32_t))

	534 {

	535 if (mySource < sourceLimit)

	536 {

	537 ch \|= ((uint8_t)(mySource)) << (i 8);

	538 toUBytes[i++] = (char) *(mySource++);

	539 }

	540 else

	541 {

	542 /* stores a partially calculated target*/

	543 /* + 1 to make 0 a valid character */

	544 args->converter->toUnicodeStatus = ch + 1;

	545 args->converter->toULength = (int8_t) i;

	546 goto donefornow;

	547 }

	548 }

	549

	550 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {

	551 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

	552 if (ch <= MAXIMUM_UCS2) {

	553 /* fits in 16 bits */

	554 *(myTarget++) = (UChar) ch;

	555 }

	556 else {

	557 /* write out the surrogates */

	558 *(myTarget++) = U16_LEAD(ch);

	559 ch = U16_TRAIL(ch);

	560 if (myTarget < targetLimit) {

	561 *(myTarget++) = (UChar)ch;

	562 }

	563 else {

	564 /* Put in overflow buffer (not handled here) */

	565 args->converter->UCharErrorBuffer[0] = (UChar) ch;

	566 args->converter->UCharErrorBufferLength = 1;

	567 *err = U_BUFFER_OVERFLOW_ERROR;

	568 break;

	569 }

	570 }

	571 }

	572 else {

	573 args->converter->toULength = (int8_t)i;

	574 *err = U_ILLEGAL_CHAR_FOUND;

	575 break;

	576 }

	577 }

	578

	579 donefornow:

	580 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	581 {

	582 /* End of target buffer */

	583 *err = U_BUFFER_OVERFLOW_ERROR;

	584 }

	585

	586 args->target = myTarget;

	587 args->source = (const char *) mySource;

	588 }

	589

	590 static void

	591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,

	592 UErrorCode * err)

	593 {

	594 const unsigned char mySource = (unsigned char ) args->source;

	595 UChar *myTarget = args->target;

	596 int32_t *myOffsets = args->offsets;

	597 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	598 const UChar *targetLimit = args->targetLimit;

	599 unsigned char *toUBytes = args->converter->toUBytes;

	600 uint32_t ch, i;

	601 int32_t offsetNum = 0;

	602

	603 /* Restore state of current sequence */

	604 if (args->converter->toUnicodeStatus && myTarget < targetLimit)

	605 {

	606 i = args->converter->toULength; /* restore # of bytes consumed */

	607 args->converter->toULength = 0;

	608

	609 /* Stores the previously calculated ch from a previous call*/

	610 ch = args->converter->toUnicodeStatus - 1;

	611 args->converter->toUnicodeStatus = 0;

	612 goto morebytes;

	613 }

	614

	615 while (mySource < sourceLimit && myTarget < targetLimit)

	616 {

	617 i = 0;

	618 ch = 0;

	619 morebytes:

	620 while (i < sizeof(uint32_t))

	621 {

	622 if (mySource < sourceLimit)

	623 {

	624 ch \|= ((uint8_t)(mySource)) << (i 8);

	625 toUBytes[i++] = (char) *(mySource++);

	626 }

	627 else

	628 {

	629 /* stores a partially calculated target*/

	630 /* + 1 to make 0 a valid character */

	631 args->converter->toUnicodeStatus = ch + 1;

	632 args->converter->toULength = (int8_t) i;

	633 goto donefornow;

	634 }

	635 }

	636

	637 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))

	638 {

	639 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */

	640 if (ch <= MAXIMUM_UCS2)

	641 {

	642 /* fits in 16 bits */

	643 *(myTarget++) = (UChar) ch;

	644 *(myOffsets++) = offsetNum;

	645 }

	646 else {

	647 /* write out the surrogates */

	648 *(myTarget++) = U16_LEAD(ch);

	649 *(myOffsets++) = offsetNum;

	650 ch = U16_TRAIL(ch);

	651 if (myTarget < targetLimit)

	652 {

	653 *(myTarget++) = (UChar)ch;

	654 *(myOffsets++) = offsetNum;

	655 }

	656 else

	657 {

	658 /* Put in overflow buffer (not handled here) */

	659 args->converter->UCharErrorBuffer[0] = (UChar) ch;

	660 args->converter->UCharErrorBufferLength = 1;

	661 *err = U_BUFFER_OVERFLOW_ERROR;

	662 break;

	663 }

	664 }

	665 }

	666 else

	667 {

	668 args->converter->toULength = (int8_t)i;

	669 *err = U_ILLEGAL_CHAR_FOUND;

	670 break;

	671 }

	672 offsetNum += i;

	673 }

	674

	675 donefornow:

	676 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	677 {

	678 /* End of target buffer */

	679 *err = U_BUFFER_OVERFLOW_ERROR;

	680 }

	681

	682 args->target = myTarget;

	683 args->source = (const char *) mySource;

	684 args->offsets = myOffsets;

	685 }

	686

	687 static void

	688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,

	689 UErrorCode * err)

	690 {

	691 const UChar *mySource = args->source;

	692 unsigned char *myTarget;

	693 const UChar *sourceLimit = args->sourceLimit;

	694 const unsigned char targetLimit = (unsigned char ) args->targetLimit;

	695 UChar32 ch, ch2;

	696 unsigned int indexToWrite;

	697 unsigned char temp[sizeof(uint32_t)];

	698

	699 if(mySource >= sourceLimit) {

	700 /* no input, nothing to do */

	701 return;

	702 }

	703

	704 /* write the BOM if necessary */

	705 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	706 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };

	707 ucnv_fromUWriteBytes(args->converter,

	708 bom, 4,

	709 &args->target, args->targetLimit,

	710 &args->offsets, -1,

	711 err);

	712 args->converter->fromUnicodeStatus=0;

	713 }

	714

	715 myTarget = (unsigned char *) args->target;

	716 temp[3] = 0;

	717

	718 if (args->converter->fromUChar32)

	719 {

	720 ch = args->converter->fromUChar32;

	721 args->converter->fromUChar32 = 0;

	722 goto lowsurogate;

	723 }

	724

	725 while (mySource < sourceLimit && myTarget < targetLimit)

	726 {

	727 ch = *(mySource++);

	728

	729 if (UTF_IS_SURROGATE(ch)) {

	730 if (U_IS_LEAD(ch))

	731 {

	732 lowsurogate:

	733 if (mySource < sourceLimit)

	734 {

	735 ch2 = *mySource;

	736 if (U_IS_TRAIL(ch2)) {

	737 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

	738 mySource++;

	739 }

	740 else {

	741 /* this is an unmatched trail code unit (2nd surrogate) */

	742 /* callback(illegal) */

	743 args->converter->fromUChar32 = ch;

	744 *err = U_ILLEGAL_CHAR_FOUND;

	745 break;

	746 }

	747 }

	748 else {

	749 /* ran out of source */

	750 args->converter->fromUChar32 = ch;

	751 if (args->flush) {

	752 /* this is an unmatched trail code unit (2nd surrogate) */

	753 /* callback(illegal) */

	754 *err = U_ILLEGAL_CHAR_FOUND;

	755 }

	756 break;

	757 }

	758 }

	759 else {

	760 /* this is an unmatched trail code unit (2nd surrogate) */

	761 /* callback(illegal) */

	762 args->converter->fromUChar32 = ch;

	763 *err = U_ILLEGAL_CHAR_FOUND;

	764 break;

	765 }

	766 }

	767

	768 /* We cannot get any larger than 10FFFF because we are coming from UTF-1 6 */

	769 temp[2] = (uint8_t) (ch >> 16 & 0x1F);

	770 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & F F) */

	771 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & F F) */

	772

	773 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrit e++)

	774 {

	775 if (myTarget < targetLimit)

	776 {

	777 *(myTarget++) = temp[indexToWrite];

	778 }

	779 else

	780 {

	781 args->converter->charErrorBuffer[args->converter->charErrorBuffe rLength++] = temp[indexToWrite];

	782 *err = U_BUFFER_OVERFLOW_ERROR;

	783 }

	784 }

	785 }

	786

	787 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	788 {

	789 *err = U_BUFFER_OVERFLOW_ERROR;

	790 }

	791

	792 args->target = (char *) myTarget;

	793 args->source = mySource;

	794 }

	795

	796 static void

	797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,

	798 UErrorCode * err)

	799 {

	800 const UChar *mySource = args->source;

	801 unsigned char *myTarget;

	802 int32_t *myOffsets;

	803 const UChar *sourceLimit = args->sourceLimit;

	804 const unsigned char targetLimit = (unsigned char ) args->targetLimit;

	805 UChar32 ch, ch2;

	806 unsigned int indexToWrite;

	807 unsigned char temp[sizeof(uint32_t)];

	808 int32_t offsetNum = 0;

	809

	810 if(mySource >= sourceLimit) {

	811 /* no input, nothing to do */

	812 return;

	813 }

	814

	815 /* write the BOM if necessary */

	816 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

	817 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };

	818 ucnv_fromUWriteBytes(args->converter,

	819 bom, 4,

	820 &args->target, args->targetLimit,

	821 &args->offsets, -1,

	822 err);

	823 args->converter->fromUnicodeStatus=0;

	824 }

	825

	826 myTarget = (unsigned char *) args->target;

	827 myOffsets = args->offsets;

	828 temp[3] = 0;

	829

	830 if (args->converter->fromUChar32)

	831 {

	832 ch = args->converter->fromUChar32;

	833 args->converter->fromUChar32 = 0;

	834 goto lowsurogate;

	835 }

	836

	837 while (mySource < sourceLimit && myTarget < targetLimit)

	838 {

	839 ch = *(mySource++);

	840

	841 if (UTF_IS_SURROGATE(ch)) {

	842 if (U_IS_LEAD(ch))

	843 {

	844 lowsurogate:

	845 if (mySource < sourceLimit)

	846 {

	847 ch2 = *mySource;

	848 if (U_IS_TRAIL(ch2))

	849 {

	850 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;

	851 mySource++;

	852 }

	853 else {

	854 /* this is an unmatched trail code unit (2nd surrogate) */

	855 /* callback(illegal) */

	856 args->converter->fromUChar32 = ch;

	857 *err = U_ILLEGAL_CHAR_FOUND;

	858 break;

	859 }

	860 }

	861 else {

	862 /* ran out of source */

	863 args->converter->fromUChar32 = ch;

	864 if (args->flush) {

	865 /* this is an unmatched trail code unit (2nd surrogate) */

	866 /* callback(illegal) */

	867 *err = U_ILLEGAL_CHAR_FOUND;

	868 }

	869 break;

	870 }

	871 }

	872 else {

	873 /* this is an unmatched trail code unit (2nd surrogate) */

	874 /* callback(illegal) */

	875 args->converter->fromUChar32 = ch;

	876 *err = U_ILLEGAL_CHAR_FOUND;

	877 break;

	878 }

	879 }

	880

	881 /* We cannot get any larger than 10FFFF because we are coming from UTF-1 6 */

	882 temp[2] = (uint8_t) (ch >> 16 & 0x1F);

	883 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & F F) */

	884 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & F F) */

	885

	886 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrit e++)

	887 {

	888 if (myTarget < targetLimit)

	889 {

	890 *(myTarget++) = temp[indexToWrite];

	891 *(myOffsets++) = offsetNum;

	892 }

	893 else

	894 {

	895 args->converter->charErrorBuffer[args->converter->charErrorBuffe rLength++] = temp[indexToWrite];

	896 *err = U_BUFFER_OVERFLOW_ERROR;

	897 }

	898 }

	899 offsetNum = offsetNum + 1 + (temp[2] != 0);

	900 }

	901

	902 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	903 {

	904 *err = U_BUFFER_OVERFLOW_ERROR;

	905 }

	906

	907 args->target = (char *) myTarget;

	908 args->source = mySource;

	909 args->offsets = myOffsets;

	910 }

	911

	912 static UChar32

	913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,

	914 UErrorCode* err)

	915 {

	916 const uint8_t *mySource;

	917 UChar32 myUChar;

	918 int32_t length;

	919

	920 mySource = (const uint8_t *)args->source;

	921 if (mySource >= (const uint8_t *)args->sourceLimit)

	922 {

	923 /* no input */

	924 *err = U_INDEX_OUTOFBOUNDS_ERROR;

	925 return 0xffff;

	926 }

	927

	928 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);

	929 if (length < 4)

	930 {

	931 /* got a partial character */

	932 uprv_memcpy(args->converter->toUBytes, mySource, length);

	933 args->converter->toULength = (int8_t)length;

	934 args->source = (const char *)(mySource + length);

	935 *err = U_TRUNCATED_CHAR_FOUND;

	936 return 0xffff;

	937 }

	938

	939 /* Don't even try to do a direct cast because the value may be on an odd add ress. */

	940 myUChar = ((UChar32)mySource[3] << 24)

	941 \| ((UChar32)mySource[2] << 16)

	942 \| ((UChar32)mySource[1] << 8)

	943 \| ((UChar32)mySource[0]);

	944

	945 args->source = (const char *)(mySource + 4);

	946 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {

	947 return myUChar;

	948 }

	949

	950 uprv_memcpy(args->converter->toUBytes, mySource, 4);

	951 args->converter->toULength = 4;

	952

	953 *err = U_ILLEGAL_CHAR_FOUND;

	954 return 0xffff;

	955 }

	956

	957 static const UConverterImpl _UTF32LEImpl = {

	958 UCNV_UTF32_LittleEndian,

	959

	960 NULL,

	961 NULL,

	962

	963 NULL,

	964 NULL,

	965 NULL,

	966

	967 T_UConverter_toUnicode_UTF32_LE,

	968 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,

	969 T_UConverter_fromUnicode_UTF32_LE,

	970 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,

	971 T_UConverter_getNextUChar_UTF32_LE,

	972

	973 NULL,

	974 NULL,

	975 NULL,

	976 NULL,

	977 ucnv_getNonSurrogateUnicodeSet

	978 };

	979

	980 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */

	981 static const UConverterStaticData _UTF32LEStaticData = {

	982 sizeof(UConverterStaticData),

	983 "UTF-32LE",

	984 1234,

	985 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,

	986 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,

	987 0,

	988 0,

	989 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	990 };

	991

	992

	993 const UConverterSharedData _UTF32LEData = {

	994 sizeof(UConverterSharedData), ~((uint32_t) 0),

	995 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,

	996 0

	997 };

	998

	999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */

	1000

	1001 /*

	1002 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE

	1003 * accordingly.

	1004 *

	1005 * State values:

	1006 * 0 initial state

	1007 * 1 saw 00

	1008 * 2 saw 00 00

	1009 * 3 saw 00 00 FE

	1010 * 4 -

	1011 * 5 saw FF

	1012 * 6 saw FF FE

	1013 * 7 saw FF FE 00

	1014 * 8 UTF-32BE mode

	1015 * 9 UTF-32LE mode

	1016 *

	1017 * During detection: state&3==number of matching bytes so far.

	1018 *

	1019 * On output, emit U+FEFF as the first code point.

	1020 */

	1021

	1022 static void

	1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {

	1024 if(choice<=UCNV_RESET_TO_UNICODE) {

	1025 /* reset toUnicode: state=0 */

	1026 cnv->mode=0;

	1027 }

	1028 if(choice!=UCNV_RESET_TO_UNICODE) {

	1029 /* reset fromUnicode: prepare to output the UTF-32PE BOM */

	1030 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

	1031 }

	1032 }

	1033

	1034 static void

	1035 _UTF32Open(UConverter *cnv,

	1036 UConverterLoadArgs *pArgs,

	1037 UErrorCode *pErrorCode) {

	1038 _UTF32Reset(cnv, UCNV_RESET_BOTH);

	1039 }

	1040

	1041 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (ch ar)0xfe, 0, 0 };

	1042

	1043 static void

	1044 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	1045 UErrorCode *pErrorCode) {

	1046 UConverter *cnv=pArgs->converter;

	1047 const char *source=pArgs->source;

	1048 const char *sourceLimit=pArgs->sourceLimit;

	1049 int32_t *offsets=pArgs->offsets;

	1050

	1051 int32_t state, offsetDelta;

	1052 char b;

	1053

	1054 state=cnv->mode;

	1055

	1056 /*

	1057 * If we detect a BOM in this buffer, then we must add the BOM size to the

	1058 * offsets because the actual converter function will not see and count the BOM.

	1059 * offsetDelta will have the number of the BOM bytes that are in the current buffer.

	1060 */

	1061 offsetDelta=0;

	1062

	1063 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {

	1064 switch(state) {

	1065 case 0:

	1066 b=*source;

	1067 if(b==0) {

	1068 state=1; /* could be 00 00 FE FF */

	1069 } else if(b==(char)0xff) {

	1070 state=5; /* could be FF FE 00 00 */

	1071 } else {

	1072 state=8; /* default to UTF-32BE */

	1073 continue;

	1074 }

	1075 ++source;

	1076 break;

	1077 case 1:

	1078 case 2:

	1079 case 3:

	1080 case 5:

	1081 case 6:

	1082 case 7:

	1083 if(*source==utf32BOM[state]) {

	1084 ++state;

	1085 ++source;

	1086 if(state==4) {

	1087 state=8; /* detect UTF-32BE */

	1088 offsetDelta=(int32_t)(source-pArgs->source);

	1089 } else if(state==8) {

	1090 state=9; /* detect UTF-32LE */

	1091 offsetDelta=(int32_t)(source-pArgs->source);

	1092 }

	1093 } else {

	1094 /* switch to UTF-32BE and pass the previous bytes */

	1095 int32_t count=(int32_t)(source-pArgs->source); /* number of byte s from this buffer */

	1096

	1097 /* reset the source */

	1098 source=pArgs->source;

	1099

	1100 if(count==(state&3)) {

	1101 /* simple: all in the same buffer, just reset source */

	1102 } else {

	1103 UBool oldFlush=pArgs->flush;

	1104

	1105 /* some of the bytes are from a previous buffer, replay thos e first */

	1106 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */

	1107 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* repla y previous bytes */

	1108 pArgs->flush=FALSE; /* this sourceLimit is not the real sour ce stream limit */

	1109

	1110 /* no offsets: bytes from previous buffer, and not enough fo r output */

	1111 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

	1112

	1113 /* restore real pointers; pArgs->source will be set in case 8/9 */

	1114 pArgs->sourceLimit=sourceLimit;

	1115 pArgs->flush=oldFlush;

	1116 }

	1117 state=8;

	1118 continue;

	1119 }

	1120 break;

	1121 case 8:

	1122 /* call UTF-32BE */

	1123 pArgs->source=source;

	1124 if(offsets==NULL) {

	1125 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

	1126 } else {

	1127 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);

	1128 }

	1129 source=pArgs->source;

	1130 break;

	1131 case 9:

	1132 /* call UTF-32LE */

	1133 pArgs->source=source;

	1134 if(offsets==NULL) {

	1135 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);

	1136 } else {

	1137 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);

	1138 }

	1139 source=pArgs->source;

	1140 break;

	1141 default:

	1142 break; /* does not occur */

	1143 }

	1144 }

	1145

	1146 /* add BOM size to offsets - see comment at offsetDelta declaration */

	1147 if(offsets!=NULL && offsetDelta!=0) {

	1148 int32_t *offsetsLimit=pArgs->offsets;

	1149 while(offsets<offsetsLimit) {

	1150 *offsets++ += offsetDelta;

	1151 }

	1152 }

	1153

	1154 pArgs->source=source;

	1155

	1156 if(source==sourceLimit && pArgs->flush) {

	1157 /* handle truncated input */

	1158 switch(state) {

	1159 case 0:

	1160 break; /* no input at all, nothing to do */

	1161 case 8:

	1162 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

	1163 break;

	1164 case 9:

	1165 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);

	1166 break;

	1167 default:

	1168 /* handle 0<state<8: call UTF-32BE with too-short input */

	1169 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */

	1170 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

	1171

	1172 /* no offsets: not enough for output */

	1173 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

	1174 pArgs->source=source;

	1175 pArgs->sourceLimit=sourceLimit;

	1176 state=8;

	1177 break;

	1178 }

	1179 }

	1180

	1181 cnv->mode=state;

	1182 }

	1183

	1184 static UChar32

	1185 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,

	1186 UErrorCode *pErrorCode) {

	1187 switch(pArgs->converter->mode) {

	1188 case 8:

	1189 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);

	1190 case 9:

	1191 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);

	1192 default:

	1193 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	1194 }

	1195 }

	1196

	1197 static const UConverterImpl _UTF32Impl = {

	1198 UCNV_UTF32,

	1199

	1200 NULL,

	1201 NULL,

	1202

	1203 _UTF32Open,

	1204 NULL,

	1205 _UTF32Reset,

	1206

	1207 _UTF32ToUnicodeWithOffsets,

	1208 _UTF32ToUnicodeWithOffsets,

	1209 #if U_IS_BIG_ENDIAN

	1210 T_UConverter_fromUnicode_UTF32_BE,

	1211 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,

	1212 #else

	1213 T_UConverter_fromUnicode_UTF32_LE,

	1214 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,

	1215 #endif

	1216 _UTF32GetNextUChar,

	1217

	1218 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

	1219 NULL,

	1220 NULL,

	1221 NULL,

	1222 ucnv_getNonSurrogateUnicodeSet

	1223 };

	1224

	1225 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianes s of UTF-32 */

	1226 static const UConverterStaticData _UTF32StaticData = {

	1227 sizeof(UConverterStaticData),

	1228 "UTF-32",

	1229 1236,

	1230 UCNV_IBM, UCNV_UTF32, 4, 4,

	1231 #if U_IS_BIG_ENDIAN

	1232 { 0, 0, 0xff, 0xfd }, 4,

	1233 #else

	1234 { 0xfd, 0xff, 0, 0 }, 4,

	1235 #endif

	1236 FALSE, FALSE,

	1237 0,

	1238 0,

	1239 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1240 };

	1241

	1242 const UConverterSharedData _UTF32Data = {

	1243 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1244 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,

	1245 0

	1246 };

	1247

	1248 #endif

OLD	NEW

« no previous file with comments | « icu46/source/common/ucnv_u16.c ('k') | icu46/source/common/ucnv_u7.c » ('j') | no next file with comments »