icu46/source/common/ucnv_u8.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucnv_u8.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2002-2007, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * file name: ucnv_u8.c

	7 * encoding: US-ASCII

	8 * tab size: 8 (not used)

	9 * indentation:4

	10 *

	11 * created on: 2002jul01

	12 * created by: Markus W. Scherer

	13 *

	14 * UTF-8 converter implementation. Used to be in ucnv_utf.c.

	15 *

	16 * Also, CESU-8 implementation, see UTR 26.

	17 * The CESU-8 converter uses all the same functions as the

	18 * UTF-8 converter, with a branch for converting supplementary code points.

	19 */

	20

	21 #include "unicode/utypes.h"

	22

	23 #if !UCONFIG_NO_CONVERSION

	24

	25 #include "unicode/ucnv.h"

	26 #include "ucnv_bld.h"

	27 #include "ucnv_cnv.h"

	28 #include "cmemory.h"

	29

	30 /* Prototypes --------------------------------------------------------------- */

	31

	32 /* Keep these here to make finicky compilers happy */

	33

	34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,

	35 UErrorCode *err);

	36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args ,

	37 UErrorCode *err);

	38

	39

	40 /* UTF-8 -------------------------------------------------------------------- */

	41

	42 /* UTF-8 Conversion DATA

	43 * for more information see Unicode Standard 2.0, Transformation Formats Appen dix A-9

	44 */

	45 /static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;/

	46 #define MAXIMUM_UCS2 0x0000FFFF

	47 #define MAXIMUM_UTF 0x0010FFFF

	48 #define MAXIMUM_UCS4 0x7FFFFFFF

	49 #define HALF_SHIFT 10

	50 #define HALF_BASE 0x0010000

	51 #define HALF_MASK 0x3FF

	52 #define SURROGATE_HIGH_START 0xD800

	53 #define SURROGATE_HIGH_END 0xDBFF

	54 #define SURROGATE_LOW_START 0xDC00

	55 #define SURROGATE_LOW_END 0xDFFF

	56

	57 /* -SURROGATE_LOW_START + HALF_BASE */

	58 #define SURROGATE_LOW_BASE 9216

	59

	60 static const uint32_t offsetsFromUTF8[7] = {0,

	61 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,

	62 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080

	63 };

	64

	65 /* END OF UTF-8 Conversion DATA */

	66

	67 static const int8_t bytesFromUTF8[256] = {

	68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0

	76 };

	77

	78 /*

	79 * Starting with Unicode 3.0.1:

	80 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_m inChar32[N];

	81 * byte sequences with more than 4 bytes are illegal in UTF-8,

	82 * which is tested with impossible values for them

	83 */

	84 static const uint32_t

	85 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };

	86

	87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,

	88 UErrorCode * err)

	89 {

	90 UConverter *cnv = args->converter;

	91 const unsigned char mySource = (unsigned char ) args->source;

	92 UChar *myTarget = args->target;

	93 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	94 const UChar *targetLimit = args->targetLimit;

	95 unsigned char *toUBytes = cnv->toUBytes;

	96 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);

	97 uint32_t ch, ch2 = 0;

	98 int32_t i, inBytes;

	99

	100 /* Restore size of current sequence */

	101 if (cnv->toUnicodeStatus && myTarget < targetLimit)

	102 {

	103 inBytes = cnv->mode; /* restore # of bytes to consume */

	104 i = cnv->toULength; /* restore # of bytes consumed */

	105 cnv->toULength = 0;

	106

	107 ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a p revious call/

	108 cnv->toUnicodeStatus = 0;

	109 goto morebytes;

	110 }

	111

	112

	113 while (mySource < sourceLimit && myTarget < targetLimit)

	114 {

	115 ch = *(mySource++);

	116 if (ch < 0x80) /* Simple case */

	117 {

	118 *(myTarget++) = (UChar) ch;

	119 }

	120 else

	121 {

	122 /* store the first char */

	123 toUBytes[0] = (char)ch;

	124 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */

	125 i = 1;

	126

	127 morebytes:

	128 while (i < inBytes)

	129 {

	130 if (mySource < sourceLimit)

	131 {

	132 toUBytes[i] = (char) (ch2 = *mySource);

	133 if (!UTF8_IS_TRAIL(ch2))

	134 {

	135 break; /* i < inBytes */

	136 }

	137 ch = (ch << 6) + ch2;

	138 ++mySource;

	139 i++;

	140 }

	141 else

	142 {

	143 /* stores a partially calculated target*/

	144 cnv->toUnicodeStatus = ch;

	145 cnv->mode = inBytes;

	146 cnv->toULength = (int8_t) i;

	147 goto donefornow;

	148 }

	149 }

	150

	151 /* Remove the accumulated high bits */

	152 ch -= offsetsFromUTF8[inBytes];

	153

	154 /*

	155 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

	156 * - use only trail bytes after a lead byte (checked above)

	157 * - use the right number of trail bytes for a given lead byte

	158 * - encode a code point <= U+10ffff

	159 * - use the fewest possible number of bytes for their code points

	160 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

	161 *

	162 * Starting with Unicode 3.2, surrogate code points must not be enco ded in UTF-8.

	163 * There are no irregular sequences any more.

	164 * In CESU-8, only surrogates, not supplementary code points, are en coded directly.

	165 */

	166 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&

	167 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))

	168 {

	169 /* Normal valid byte when the loop has not prematurely terminate d (i < inBytes) */

	170 if (ch <= MAXIMUM_UCS2)

	171 {

	172 /* fits in 16 bits */

	173 *(myTarget++) = (UChar) ch;

	174 }

	175 else

	176 {

	177 /* write out the surrogates */

	178 ch -= HALF_BASE;

	179 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH _START);

	180 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;

	181 if (myTarget < targetLimit)

	182 {

	183 *(myTarget++) = (UChar)ch;

	184 }

	185 else

	186 {

	187 /* Put in overflow buffer (not handled here) */

	188 cnv->UCharErrorBuffer[0] = (UChar) ch;

	189 cnv->UCharErrorBufferLength = 1;

	190 *err = U_BUFFER_OVERFLOW_ERROR;

	191 break;

	192 }

	193 }

	194 }

	195 else

	196 {

	197 cnv->toULength = (int8_t)i;

	198 *err = U_ILLEGAL_CHAR_FOUND;

	199 break;

	200 }

	201 }

	202 }

	203

	204 donefornow:

	205 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	206 {

	207 /* End of target buffer */

	208 *err = U_BUFFER_OVERFLOW_ERROR;

	209 }

	210

	211 args->target = myTarget;

	212 args->source = (const char *) mySource;

	213 }

	214

	215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,

	216 UErrorCode * err)

	217 {

	218 UConverter *cnv = args->converter;

	219 const unsigned char mySource = (unsigned char ) args->source;

	220 UChar *myTarget = args->target;

	221 int32_t *myOffsets = args->offsets;

	222 int32_t offsetNum = 0;

	223 const unsigned char sourceLimit = (unsigned char ) args->sourceLimit;

	224 const UChar *targetLimit = args->targetLimit;

	225 unsigned char *toUBytes = cnv->toUBytes;

	226 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);

	227 uint32_t ch, ch2 = 0;

	228 int32_t i, inBytes;

	229

	230 /* Restore size of current sequence */

	231 if (cnv->toUnicodeStatus && myTarget < targetLimit)

	232 {

	233 inBytes = cnv->mode; /* restore # of bytes to consume */

	234 i = cnv->toULength; /* restore # of bytes consumed */

	235 cnv->toULength = 0;

	236

	237 ch = cnv->toUnicodeStatus;/Stores the previously calculated ch from a p revious call/

	238 cnv->toUnicodeStatus = 0;

	239 goto morebytes;

	240 }

	241

	242 while (mySource < sourceLimit && myTarget < targetLimit)

	243 {

	244 ch = *(mySource++);

	245 if (ch < 0x80) /* Simple case */

	246 {

	247 *(myTarget++) = (UChar) ch;

	248 *(myOffsets++) = offsetNum++;

	249 }

	250 else

	251 {

	252 toUBytes[0] = (char)ch;

	253 inBytes = bytesFromUTF8[ch];

	254 i = 1;

	255

	256 morebytes:

	257 while (i < inBytes)

	258 {

	259 if (mySource < sourceLimit)

	260 {

	261 toUBytes[i] = (char) (ch2 = *mySource);

	262 if (!UTF8_IS_TRAIL(ch2))

	263 {

	264 break; /* i < inBytes */

	265 }

	266 ch = (ch << 6) + ch2;

	267 ++mySource;

	268 i++;

	269 }

	270 else

	271 {

	272 cnv->toUnicodeStatus = ch;

	273 cnv->mode = inBytes;

	274 cnv->toULength = (int8_t)i;

	275 goto donefornow;

	276 }

	277 }

	278

	279 /* Remove the accumulated high bits */

	280 ch -= offsetsFromUTF8[inBytes];

	281

	282 /*

	283 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

	284 * - use only trail bytes after a lead byte (checked above)

	285 * - use the right number of trail bytes for a given lead byte

	286 * - encode a code point <= U+10ffff

	287 * - use the fewest possible number of bytes for their code points

	288 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

	289 *

	290 * Starting with Unicode 3.2, surrogate code points must not be enco ded in UTF-8.

	291 * There are no irregular sequences any more.

	292 * In CESU-8, only surrogates, not supplementary code points, are en coded directly.

	293 */

	294 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&

	295 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))

	296 {

	297 /* Normal valid byte when the loop has not prematurely terminate d (i < inBytes) */

	298 if (ch <= MAXIMUM_UCS2)

	299 {

	300 /* fits in 16 bits */

	301 *(myTarget++) = (UChar) ch;

	302 *(myOffsets++) = offsetNum;

	303 }

	304 else

	305 {

	306 /* write out the surrogates */

	307 ch -= HALF_BASE;

	308 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH _START);

	309 *(myOffsets++) = offsetNum;

	310 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;

	311 if (myTarget < targetLimit)

	312 {

	313 *(myTarget++) = (UChar)ch;

	314 *(myOffsets++) = offsetNum;

	315 }

	316 else

	317 {

	318 cnv->UCharErrorBuffer[0] = (UChar) ch;

	319 cnv->UCharErrorBufferLength = 1;

	320 *err = U_BUFFER_OVERFLOW_ERROR;

	321 }

	322 }

	323 offsetNum += i;

	324 }

	325 else

	326 {

	327 cnv->toULength = (int8_t)i;

	328 *err = U_ILLEGAL_CHAR_FOUND;

	329 break;

	330 }

	331 }

	332 }

	333

	334 donefornow:

	335 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	336 { /* End of target buffer */

	337 *err = U_BUFFER_OVERFLOW_ERROR;

	338 }

	339

	340 args->target = myTarget;

	341 args->source = (const char *) mySource;

	342 args->offsets = myOffsets;

	343 }

	344

	345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,

	346 UErrorCode * err)

	347 {

	348 UConverter *cnv = args->converter;

	349 const UChar *mySource = args->source;

	350 const UChar *sourceLimit = args->sourceLimit;

	351 uint8_t myTarget = (uint8_t ) args->target;

	352 const uint8_t targetLimit = (uint8_t ) args->targetLimit;

	353 uint8_t *tempPtr;

	354 UChar32 ch;

	355 uint8_t tempBuf[4];

	356 int32_t indexToWrite;

	357 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);

	358

	359 if (cnv->fromUChar32 && myTarget < targetLimit)

	360 {

	361 ch = cnv->fromUChar32;

	362 cnv->fromUChar32 = 0;

	363 goto lowsurrogate;

	364 }

	365

	366 while (mySource < sourceLimit && myTarget < targetLimit)

	367 {

	368 ch = *(mySource++);

	369

	370 if (ch < 0x80) /* Single byte */

	371 {

	372 *(myTarget++) = (uint8_t) ch;

	373 }

	374 else if (ch < 0x800) /* Double byte */

	375 {

	376 *(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);

	377 if (myTarget < targetLimit)

	378 {

	379 *(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);

	380 }

	381 else

	382 {

	383 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);

	384 cnv->charErrorBufferLength = 1;

	385 *err = U_BUFFER_OVERFLOW_ERROR;

	386 }

	387 }

	388 else {

	389 /* Check for surrogates */

	390 if(UTF_IS_SURROGATE(ch) && isNotCESU8) {

	391 lowsurrogate:

	392 if (mySource < sourceLimit) {

	393 /* test both code units */

	394 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*my Source)) {

	395 /* convert and consume this supplementary code point */

	396 ch=UTF16_GET_PAIR_VALUE(ch, *mySource);

	397 ++mySource;

	398 /* exit this condition tree */

	399 }

	400 else {

	401 /* this is an unpaired trail or lead code unit */

	402 /* callback(illegal) */

	403 cnv->fromUChar32 = ch;

	404 *err = U_ILLEGAL_CHAR_FOUND;

	405 break;

	406 }

	407 }

	408 else {

	409 /* no more input */

	410 cnv->fromUChar32 = ch;

	411 break;

	412 }

	413 }

	414

	415 /* Do we write the buffer directly for speed,

	416 or do we have to be careful about target buffer space? */

	417 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

	418

	419 if (ch <= MAXIMUM_UCS2) {

	420 indexToWrite = 2;

	421 tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);

	422 }

	423 else {

	424 indexToWrite = 3;

	425 tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);

	426 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);

	427 }

	428 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);

	429 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);

	430

	431 if (tempPtr == myTarget) {

	432 /* There was enough space to write the codepoint directly. */

	433 myTarget += (indexToWrite + 1);

	434 }

	435 else {

	436 /* We might run out of room soon. Write it slowly. */

	437 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {

	438 if (myTarget < targetLimit) {

	439 (myTarget++) = tempPtr;

	440 }

	441 else {

	442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *te mpPtr;

	443 *err = U_BUFFER_OVERFLOW_ERROR;

	444 }

	445 }

	446 }

	447 }

	448 }

	449

	450 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	451 {

	452 *err = U_BUFFER_OVERFLOW_ERROR;

	453 }

	454

	455 args->target = (char *) myTarget;

	456 args->source = mySource;

	457 }

	458

	459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * ar gs,

	460 UErrorCode * err)

	461 {

	462 UConverter *cnv = args->converter;

	463 const UChar *mySource = args->source;

	464 int32_t *myOffsets = args->offsets;

	465 const UChar *sourceLimit = args->sourceLimit;

	466 uint8_t myTarget = (uint8_t ) args->target;

	467 const uint8_t targetLimit = (uint8_t ) args->targetLimit;

	468 uint8_t *tempPtr;

	469 UChar32 ch;

	470 int32_t offsetNum, nextSourceIndex;

	471 int32_t indexToWrite;

	472 uint8_t tempBuf[4];

	473 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);

	474

	475 if (cnv->fromUChar32 && myTarget < targetLimit)

	476 {

	477 ch = cnv->fromUChar32;

	478 cnv->fromUChar32 = 0;

	479 offsetNum = -1;

	480 nextSourceIndex = 0;

	481 goto lowsurrogate;

	482 } else {

	483 offsetNum = 0;

	484 }

	485

	486 while (mySource < sourceLimit && myTarget < targetLimit)

	487 {

	488 ch = *(mySource++);

	489

	490 if (ch < 0x80) /* Single byte */

	491 {

	492 *(myOffsets++) = offsetNum++;

	493 *(myTarget++) = (char) ch;

	494 }

	495 else if (ch < 0x800) /* Double byte */

	496 {

	497 *(myOffsets++) = offsetNum;

	498 *(myTarget++) = (uint8_t) ((ch >> 6) \| 0xc0);

	499 if (myTarget < targetLimit)

	500 {

	501 *(myOffsets++) = offsetNum++;

	502 *(myTarget++) = (uint8_t) ((ch & 0x3f) \| 0x80);

	503 }

	504 else

	505 {

	506 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) \| 0x80);

	507 cnv->charErrorBufferLength = 1;

	508 *err = U_BUFFER_OVERFLOW_ERROR;

	509 }

	510 }

	511 else

	512 /* Check for surrogates */

	513 {

	514 nextSourceIndex = offsetNum + 1;

	515

	516 if(UTF_IS_SURROGATE(ch) && isNotCESU8) {

	517 lowsurrogate:

	518 if (mySource < sourceLimit) {

	519 /* test both code units */

	520 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*my Source)) {

	521 /* convert and consume this supplementary code point */

	522 ch=UTF16_GET_PAIR_VALUE(ch, *mySource);

	523 ++mySource;

	524 ++nextSourceIndex;

	525 /* exit this condition tree */

	526 }

	527 else {

	528 /* this is an unpaired trail or lead code unit */

	529 /* callback(illegal) */

	530 cnv->fromUChar32 = ch;

	531 *err = U_ILLEGAL_CHAR_FOUND;

	532 break;

	533 }

	534 }

	535 else {

	536 /* no more input */

	537 cnv->fromUChar32 = ch;

	538 break;

	539 }

	540 }

	541

	542 /* Do we write the buffer directly for speed,

	543 or do we have to be careful about target buffer space? */

	544 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

	545

	546 if (ch <= MAXIMUM_UCS2) {

	547 indexToWrite = 2;

	548 tempPtr[0] = (uint8_t) ((ch >> 12) \| 0xe0);

	549 }

	550 else {

	551 indexToWrite = 3;

	552 tempPtr[0] = (uint8_t) ((ch >> 18) \| 0xf0);

	553 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) \| 0x80);

	554 }

	555 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) \| 0x80);

	556 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) \| 0x80);

	557

	558 if (tempPtr == myTarget) {

	559 /* There was enough space to write the codepoint directly. */

	560 myTarget += (indexToWrite + 1);

	561 myOffsets[0] = offsetNum;

	562 myOffsets[1] = offsetNum;

	563 myOffsets[2] = offsetNum;

	564 if (indexToWrite >= 3) {

	565 myOffsets[3] = offsetNum;

	566 }

	567 myOffsets += (indexToWrite + 1);

	568 }

	569 else {

	570 /* We might run out of room soon. Write it slowly. */

	571 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {

	572 if (myTarget < targetLimit)

	573 {

	574 *(myOffsets++) = offsetNum;

	575 (myTarget++) = tempPtr;

	576 }

	577 else

	578 {

	579 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *te mpPtr;

	580 *err = U_BUFFER_OVERFLOW_ERROR;

	581 }

	582 }

	583 }

	584 offsetNum = nextSourceIndex;

	585 }

	586 }

	587

	588 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))

	589 {

	590 *err = U_BUFFER_OVERFLOW_ERROR;

	591 }

	592

	593 args->target = (char *) myTarget;

	594 args->source = mySource;

	595 args->offsets = myOffsets;

	596 }

	597

	598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,

	599 UErrorCode *err) {

	600 UConverter *cnv;

	601 const uint8_t *sourceInitial;

	602 const uint8_t *source;

	603 uint16_t extraBytesToWrite;

	604 uint8_t myByte;

	605 UChar32 ch;

	606 int8_t i, isLegalSequence;

	607

	608 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */

	609

	610 cnv = args->converter;

	611 sourceInitial = source = (const uint8_t *)args->source;

	612 if (source >= (const uint8_t *)args->sourceLimit)

	613 {

	614 /* no input */

	615 *err = U_INDEX_OUTOFBOUNDS_ERROR;

	616 return 0xffff;

	617 }

	618

	619 myByte = (uint8_t)*(source++);

	620 if (myByte < 0x80)

	621 {

	622 args->source = (const char *)source;

	623 return (UChar32)myByte;

	624 }

	625

	626 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];

	627 if (extraBytesToWrite == 0) {

	628 cnv->toUBytes[0] = myByte;

	629 cnv->toULength = 1;

	630 *err = U_ILLEGAL_CHAR_FOUND;

	631 args->source = (const char *)source;

	632 return 0xffff;

	633 }

	634

	635 /The byte sequence is longer than the buffer area passed/

	636 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)

	637 {

	638 /* check if all of the remaining bytes are trail bytes */

	639 cnv->toUBytes[0] = myByte;

	640 i = 1;

	641 *err = U_TRUNCATED_CHAR_FOUND;

	642 while(source < (const uint8_t *)args->sourceLimit) {

	643 if(U8_IS_TRAIL(myByte = *source)) {

	644 cnv->toUBytes[i++] = myByte;

	645 ++source;

	646 } else {

	647 /* error even before we run out of input */

	648 *err = U_ILLEGAL_CHAR_FOUND;

	649 break;

	650 }

	651 }

	652 cnv->toULength = i;

	653 args->source = (const char *)source;

	654 return 0xffff;

	655 }

	656

	657 isLegalSequence = 1;

	658 ch = myByte << 6;

	659 switch(extraBytesToWrite)

	660 {

	661 /* note: code falls through cases! (sic)*/

	662 case 6:

	663 ch += (myByte = *source);

	664 ch <<= 6;

	665 if (!UTF8_IS_TRAIL(myByte))

	666 {

	667 isLegalSequence = 0;

	668 break;

	669 }

	670 ++source;

	671 case 5:

	672 ch += (myByte = *source);

	673 ch <<= 6;

	674 if (!UTF8_IS_TRAIL(myByte))

	675 {

	676 isLegalSequence = 0;

	677 break;

	678 }

	679 ++source;

	680 case 4:

	681 ch += (myByte = *source);

	682 ch <<= 6;

	683 if (!UTF8_IS_TRAIL(myByte))

	684 {

	685 isLegalSequence = 0;

	686 break;

	687 }

	688 ++source;

	689 case 3:

	690 ch += (myByte = *source);

	691 ch <<= 6;

	692 if (!UTF8_IS_TRAIL(myByte))

	693 {

	694 isLegalSequence = 0;

	695 break;

	696 }

	697 ++source;

	698 case 2:

	699 ch += (myByte = *source);

	700 if (!UTF8_IS_TRAIL(myByte))

	701 {

	702 isLegalSequence = 0;

	703 break;

	704 }

	705 ++source;

	706 };

	707 ch -= offsetsFromUTF8[extraBytesToWrite];

	708 args->source = (const char *)source;

	709

	710 /*

	711 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:

	712 * - use only trail bytes after a lead byte (checked above)

	713 * - use the right number of trail bytes for a given lead byte

	714 * - encode a code point <= U+10ffff

	715 * - use the fewest possible number of bytes for their code points

	716 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])

	717 *

	718 * Starting with Unicode 3.2, surrogate code points must not be encoded in U TF-8.

	719 * There are no irregular sequences any more.

	720 */

	721 if (isLegalSequence &&

	722 (uint32_t)ch <= MAXIMUM_UTF &&

	723 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&

	724 !U_IS_SURROGATE(ch)

	725 ) {

	726 return ch; /* return the code point */

	727 }

	728

	729 for(i = 0; sourceInitial < source; ++i) {

	730 cnv->toUBytes[i] = *sourceInitial++;

	731 }

	732 cnv->toULength = i;

	733 *err = U_ILLEGAL_CHAR_FOUND;

	734 return 0xffff;

	735 }

	736

	737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */

	738

	739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */

	740 static const UChar32

	741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

	742

	743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail )<<6+trail... */

	744 static const UChar32

	745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

	746

	747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8( ). */

	748 static void

	749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

	750 UConverterToUnicodeArgs *pToUArgs,

	751 UErrorCode *pErrorCode) {

	752 UConverter utf8, cnv;

	753 const uint8_t source, sourceLimit;

	754 uint8_t *target;

	755 int32_t targetCapacity;

	756 int32_t count;

	757

	758 int8_t oldToULength, toULength, toULimit;

	759

	760 UChar32 c;

	761 uint8_t b, t1, t2;

	762

	763 /* set up the local pointers */

	764 utf8=pToUArgs->converter;

	765 cnv=pFromUArgs->converter;

	766 source=(uint8_t *)pToUArgs->source;

	767 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

	768 target=(uint8_t *)pFromUArgs->target;

	769 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

	770

	771 /* get the converter state from the UTF-8 UConverter */

	772 c=(UChar32)utf8->toUnicodeStatus;

	773 if(c!=0) {

	774 toULength=oldToULength=utf8->toULength;

	775 toULimit=(int8_t)utf8->mode;

	776 } else {

	777 toULength=oldToULength=toULimit=0;

	778 }

	779

	780 count=(int32_t)(sourceLimit-source)+oldToULength;

	781 if(count<toULimit) {

	782 /*

	783 * Not enough input to complete the partial character.

	784 * Jump to moreBytes below - it will not output to target.

	785 */

	786 } else if(targetCapacity<toULimit) {

	787 /*

	788 * Not enough target capacity to output the partial character.

	789 * Let the standard converter handle this.

	790 */

	791 *pErrorCode=U_USING_DEFAULT_WARNING;

	792 return;

	793 } else {

	794 /*

	795 * Use a single counter for source and target, counting the minimum of

	796 * the source length and the target capacity.

	797 * As a result, the source length is checked only once per multi-byte

	798 * character instead of twice.

	799 *

	800 * Make sure that the last byte sequence is complete, or else

	801 * stop just before it.

	802 * (The longest legal byte sequence has 3 trail bytes.)

	803 * Count oldToULength (number of source bytes from a previous buffer)

	804 * into the source length but reduce the source index by toULimit

	805 * while going back over trail bytes in order to not go back into

	806 * the bytes that will be read for finishing a partial

	807 * sequence from the previous buffer.

	808 * Let the standard converter handle edge cases.

	809 */

	810 int32_t i;

	811

	812 if(count>targetCapacity) {

	813 count=targetCapacity;

	814 }

	815

	816 i=0;

	817 while(i<3 && i<(count-toULimit)) {

	818 b=source[count-oldToULength-i-1];

	819 if(U8_IS_TRAIL(b)) {

	820 ++i;

	821 } else {

	822 if(i<utf8_countTrailBytes[b]) {

	823 /* stop converting before the lead byte if there are not eno ugh trail bytes for it */

	824 count-=i+1;

	825 }

	826 break;

	827 }

	828 }

	829 }

	830

	831 if(c!=0) {

	832 utf8->toUnicodeStatus=0;

	833 utf8->toULength=0;

	834 goto moreBytes;

	835 /* See note in ucnv_SBCSFromUTF8() about this goto. */

	836 }

	837

	838 /* conversion loop */

	839 while(count>0) {

	840 b=*source++;

	841 if((int8_t)b>=0) {

	842 /* convert ASCII */

	843 *target++=b;

	844 --count;

	845 continue;

	846 } else {

	847 if(b>0xe0) {

	848 if( /* handle U+1000..U+D7FF inline */

	849 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) \|\|

	850 (b==0xed && (t1 <= 0x9f))) &&

	851 (t2=source[1]) >= 0x80 && t2 <= 0xbf

	852 ) {

	853 source+=2;

	854 *target++=b;

	855 *target++=t1;

	856 *target++=t2;

	857 count-=3;

	858 continue;

	859 }

	860 } else if(b<0xe0) {

	861 if( /* handle U+0080..U+07FF inline */

	862 b>=0xc2 &&

	863 (t1=*source) >= 0x80 && t1 <= 0xbf

	864 ) {

	865 ++source;

	866 *target++=b;

	867 *target++=t1;

	868 count-=2;

	869 continue;

	870 }

	871 } else if(b==0xe0) {

	872 if( /* handle U+0800..U+0FFF inline */

	873 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&

	874 (t2=source[1]) >= 0x80 && t2 <= 0xbf

	875 ) {

	876 source+=2;

	877 *target++=b;

	878 *target++=t1;

	879 *target++=t2;

	880 count-=3;

	881 continue;

	882 }

	883 }

	884

	885 /* handle "complicated" and error cases, and continuing partial char acters */

	886 oldToULength=0;

	887 toULength=1;

	888 toULimit=utf8_countTrailBytes[b]+1;

	889 c=b;

	890 moreBytes:

	891 while(toULength<toULimit) {

	892 if(source<sourceLimit) {

	893 b=*source;

	894 if(U8_IS_TRAIL(b)) {

	895 ++source;

	896 ++toULength;

	897 c=(c<<6)+b;

	898 } else {

	899 break; /* sequence too short, stop with toULength<toULim it */

	900 }

	901 } else {

	902 /* store the partial UTF-8 character, compatible with the re gular UTF-8 converter */

	903 source-=(toULength-oldToULength);

	904 while(oldToULength<toULength) {

	905 utf8->toUBytes[oldToULength++]=*source++;

	906 }

	907 utf8->toUnicodeStatus=c;

	908 utf8->toULength=toULength;

	909 utf8->mode=toULimit;

	910 pToUArgs->source=(char *)source;

	911 pFromUArgs->target=(char *)target;

	912 return;

	913 }

	914 }

	915

	916 if( toULength==toULimit && /* consumed all trail bytes */

	917 (toULength==3 \|\| toULength==2) && /* BMP */

	918 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&

	919 (c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */

	920 ) {

	921 /* legal byte sequence for BMP code point */

	922 } else if(

	923 toULength==toULimit && toULength==4 &&

	924 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

	925 ) {

	926 /* legal byte sequence for supplementary code point */

	927 } else {

	928 /* error handling: illegal UTF-8 byte sequence */

	929 source-=(toULength-oldToULength);

	930 while(oldToULength<toULength) {

	931 utf8->toUBytes[oldToULength++]=*source++;

	932 }

	933 utf8->toULength=toULength;

	934 pToUArgs->source=(char *)source;

	935 pFromUArgs->target=(char *)target;

	936 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	937 return;

	938 }

	939

	940 /* copy the legal byte sequence to the target */

	941 {

	942 int8_t i;

	943

	944 for(i=0; i<oldToULength; ++i) {

	945 *target++=utf8->toUBytes[i];

	946 }

	947 source-=(toULength-oldToULength);

	948 for(; i<toULength; ++i) {

	949 target++=source++;

	950 }

	951 count-=toULength;

	952 }

	953 }

	954 }

	955

	956 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {

	957 if(target==(const uint8_t *)pFromUArgs->targetLimit) {

	958 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	959 } else {

	960 b=*source;

	961 toULimit=utf8_countTrailBytes[b]+1;

	962 if(toULimit>(sourceLimit-source)) {

	963 /* collect a truncated byte sequence */

	964 toULength=0;

	965 c=b;

	966 for(;;) {

	967 utf8->toUBytes[toULength++]=b;

	968 if(++source==sourceLimit) {

	969 /* partial byte sequence at end of source */

	970 utf8->toUnicodeStatus=c;

	971 utf8->toULength=toULength;

	972 utf8->mode=toULimit;

	973 break;

	974 } else if(!U8_IS_TRAIL(b=*source)) {

	975 /* lead byte in trail byte position */

	976 utf8->toULength=toULength;

	977 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	978 break;

	979 }

	980 c=(c<<6)+b;

	981 }

	982 } else {

	983 /* partial-sequence target overflow: fall back to the pivoting i mplementation */

	984 *pErrorCode=U_USING_DEFAULT_WARNING;

	985 }

	986 }

	987 }

	988

	989 /* write back the updated pointers */

	990 pToUArgs->source=(char *)source;

	991 pFromUArgs->target=(char *)target;

	992 }

	993

	994 /* UTF-8 converter data ----------------------------------------------------- */

	995

	996 static const UConverterImpl _UTF8Impl={

	997 UCNV_UTF8,

	998

	999 NULL,

	1000 NULL,

	1001

	1002 NULL,

	1003 NULL,

	1004 NULL,

	1005

	1006 ucnv_toUnicode_UTF8,

	1007 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,

	1008 ucnv_fromUnicode_UTF8,

	1009 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,

	1010 ucnv_getNextUChar_UTF8,

	1011

	1012 NULL,

	1013 NULL,

	1014 NULL,

	1015 NULL,

	1016 ucnv_getNonSurrogateUnicodeSet,

	1017

	1018 ucnv_UTF8FromUTF8,

	1019 ucnv_UTF8FromUTF8

	1020 };

	1021

	1022 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */

	1023 static const UConverterStaticData _UTF8StaticData={

	1024 sizeof(UConverterStaticData),

	1025 "UTF-8",

	1026 1208, UCNV_IBM, UCNV_UTF8,

	1027 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */

	1028 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,

	1029 0,

	1030 0,

	1031 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1032 };

	1033

	1034

	1035 const UConverterSharedData _UTF8Data={

	1036 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1037 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,

	1038 0

	1039 };

	1040

	1041 /* CESU-8 converter data ---------------------------------------------------- */

	1042

	1043 static const UConverterImpl _CESU8Impl={

	1044 UCNV_CESU8,

	1045

	1046 NULL,

	1047 NULL,

	1048

	1049 NULL,

	1050 NULL,

	1051 NULL,

	1052

	1053 ucnv_toUnicode_UTF8,

	1054 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,

	1055 ucnv_fromUnicode_UTF8,

	1056 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,

	1057 NULL,

	1058

	1059 NULL,

	1060 NULL,

	1061 NULL,

	1062 NULL,

	1063 ucnv_getCompleteUnicodeSet

	1064 };

	1065

	1066 static const UConverterStaticData _CESU8StaticData={

	1067 sizeof(UConverterStaticData),

	1068 "CESU-8",

	1069 9400, /* CCSID for CESU-8 */

	1070 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,

	1071 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,

	1072 0,

	1073 0,

	1074 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1075 };

	1076

	1077

	1078 const UConverterSharedData _CESU8Data={

	1079 sizeof(UConverterSharedData), ~((uint32_t) 0),

	1080 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,

	1081 0

	1082 };

	1083

	1084 #endif

OLD	NEW

« no previous file with comments | « icu46/source/common/ucnv_u7.c ('k') | icu46/source/common/ucnvbocu.c » ('j') | no next file with comments »