OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2002-2007, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: ucnv_u8.c |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2002jul01 |
| 12 * created by: Markus W. Scherer |
| 13 * |
| 14 * UTF-8 converter implementation. Used to be in ucnv_utf.c. |
| 15 * |
| 16 * Also, CESU-8 implementation, see UTR 26. |
| 17 * The CESU-8 converter uses all the same functions as the |
| 18 * UTF-8 converter, with a branch for converting supplementary code points. |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 |
| 23 #if !UCONFIG_NO_CONVERSION |
| 24 |
| 25 #include "unicode/ucnv.h" |
| 26 #include "ucnv_bld.h" |
| 27 #include "ucnv_cnv.h" |
| 28 #include "cmemory.h" |
| 29 |
| 30 /* Prototypes --------------------------------------------------------------- */ |
| 31 |
| 32 /* Keep these here to make finicky compilers happy */ |
| 33 |
| 34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, |
| 35 UErrorCode *err); |
| 36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args
, |
| 37 UErrorCode *err); |
| 38 |
| 39 |
| 40 /* UTF-8 -------------------------------------------------------------------- */ |
| 41 |
/* UTF-8 Conversion DATA
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
 */
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2            0x0000FFFF  /* largest value that fits in one 16-bit code unit */
#define MAXIMUM_UTF             0x0010FFFF  /* largest code point accepted by the converters below */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* bits carried by each half of a surrogate pair */
#define HALF_BASE               0x0010000   /* first supplementary code point */
#define HALF_MASK               0x3FF       /* low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE  (0x10000 - 0xDC00 == 0x2400 == 9216) */
#define SURROGATE_LOW_BASE      9216
| 59 |
/*
 * offsetsFromUTF8[n] is the value to subtract from an n-byte sequence that was
 * accumulated as ((lead << 6) + trail) << 6 + trail ... without masking:
 * it is the sum of the marker bits of the lead byte and of each trail byte
 * (0x80), each shifted into the position it ended up in, modulo 2^32.
 * Index 0 is a placeholder so that the table can be indexed by byte count.
 */
static const uint32_t offsetsFromUTF8[7] = {0,
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
};
| 64 |
| 65 /* END OF UTF-8 Conversion DATA */ |
| 66 |
/*
 * bytesFromUTF8[b] is the total length of the byte sequence introduced by
 * byte b, or 0 if b cannot start a sequence (trail bytes 0x80..0xBF and
 * 0xFE/0xFF). Lengths 5 and 6 are still listed for the old lead bytes
 * 0xF8..0xFD; the converters reject such sequences after reading them.
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20..0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40..0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60..0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0x9F: trail bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0..0xBF: trail bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: 2-byte leads */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: invalid */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
| 77 |
/*
 * Starting with Unicode 3.0.1:
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
 * byte sequences with more than 4 bytes are illegal in UTF-8,
 * which is tested with impossible values for them
 * (0xffffffff can never be <= the decoded value once that value has also
 * been checked against MAXIMUM_UTF).
 */
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
| 86 |
| 87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, |
| 88 UErrorCode * err) |
| 89 { |
| 90 UConverter *cnv = args->converter; |
| 91 const unsigned char *mySource = (unsigned char *) args->source; |
| 92 UChar *myTarget = args->target; |
| 93 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
| 94 const UChar *targetLimit = args->targetLimit; |
| 95 unsigned char *toUBytes = cnv->toUBytes; |
| 96 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); |
| 97 uint32_t ch, ch2 = 0; |
| 98 int32_t i, inBytes; |
| 99 |
| 100 /* Restore size of current sequence */ |
| 101 if (cnv->toUnicodeStatus && myTarget < targetLimit) |
| 102 { |
| 103 inBytes = cnv->mode; /* restore # of bytes to consume */ |
| 104 i = cnv->toULength; /* restore # of bytes consumed */ |
| 105 cnv->toULength = 0; |
| 106 |
| 107 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a p
revious call*/ |
| 108 cnv->toUnicodeStatus = 0; |
| 109 goto morebytes; |
| 110 } |
| 111 |
| 112 |
| 113 while (mySource < sourceLimit && myTarget < targetLimit) |
| 114 { |
| 115 ch = *(mySource++); |
| 116 if (ch < 0x80) /* Simple case */ |
| 117 { |
| 118 *(myTarget++) = (UChar) ch; |
| 119 } |
| 120 else |
| 121 { |
| 122 /* store the first char */ |
| 123 toUBytes[0] = (char)ch; |
| 124 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ |
| 125 i = 1; |
| 126 |
| 127 morebytes: |
| 128 while (i < inBytes) |
| 129 { |
| 130 if (mySource < sourceLimit) |
| 131 { |
| 132 toUBytes[i] = (char) (ch2 = *mySource); |
| 133 if (!UTF8_IS_TRAIL(ch2)) |
| 134 { |
| 135 break; /* i < inBytes */ |
| 136 } |
| 137 ch = (ch << 6) + ch2; |
| 138 ++mySource; |
| 139 i++; |
| 140 } |
| 141 else |
| 142 { |
| 143 /* stores a partially calculated target*/ |
| 144 cnv->toUnicodeStatus = ch; |
| 145 cnv->mode = inBytes; |
| 146 cnv->toULength = (int8_t) i; |
| 147 goto donefornow; |
| 148 } |
| 149 } |
| 150 |
| 151 /* Remove the accumulated high bits */ |
| 152 ch -= offsetsFromUTF8[inBytes]; |
| 153 |
| 154 /* |
| 155 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
| 156 * - use only trail bytes after a lead byte (checked above) |
| 157 * - use the right number of trail bytes for a given lead byte |
| 158 * - encode a code point <= U+10ffff |
| 159 * - use the fewest possible number of bytes for their code points |
| 160 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
| 161 * |
| 162 * Starting with Unicode 3.2, surrogate code points must not be enco
ded in UTF-8. |
| 163 * There are no irregular sequences any more. |
| 164 * In CESU-8, only surrogates, not supplementary code points, are en
coded directly. |
| 165 */ |
| 166 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && |
| 167 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) |
| 168 { |
| 169 /* Normal valid byte when the loop has not prematurely terminate
d (i < inBytes) */ |
| 170 if (ch <= MAXIMUM_UCS2) |
| 171 { |
| 172 /* fits in 16 bits */ |
| 173 *(myTarget++) = (UChar) ch; |
| 174 } |
| 175 else |
| 176 { |
| 177 /* write out the surrogates */ |
| 178 ch -= HALF_BASE; |
| 179 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH
_START); |
| 180 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; |
| 181 if (myTarget < targetLimit) |
| 182 { |
| 183 *(myTarget++) = (UChar)ch; |
| 184 } |
| 185 else |
| 186 { |
| 187 /* Put in overflow buffer (not handled here) */ |
| 188 cnv->UCharErrorBuffer[0] = (UChar) ch; |
| 189 cnv->UCharErrorBufferLength = 1; |
| 190 *err = U_BUFFER_OVERFLOW_ERROR; |
| 191 break; |
| 192 } |
| 193 } |
| 194 } |
| 195 else |
| 196 { |
| 197 cnv->toULength = (int8_t)i; |
| 198 *err = U_ILLEGAL_CHAR_FOUND; |
| 199 break; |
| 200 } |
| 201 } |
| 202 } |
| 203 |
| 204 donefornow: |
| 205 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
| 206 { |
| 207 /* End of target buffer */ |
| 208 *err = U_BUFFER_OVERFLOW_ERROR; |
| 209 } |
| 210 |
| 211 args->target = myTarget; |
| 212 args->source = (const char *) mySource; |
| 213 } |
| 214 |
| 215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, |
| 216 UErrorCode * err) |
| 217 { |
| 218 UConverter *cnv = args->converter; |
| 219 const unsigned char *mySource = (unsigned char *) args->source; |
| 220 UChar *myTarget = args->target; |
| 221 int32_t *myOffsets = args->offsets; |
| 222 int32_t offsetNum = 0; |
| 223 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
| 224 const UChar *targetLimit = args->targetLimit; |
| 225 unsigned char *toUBytes = cnv->toUBytes; |
| 226 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); |
| 227 uint32_t ch, ch2 = 0; |
| 228 int32_t i, inBytes; |
| 229 |
| 230 /* Restore size of current sequence */ |
| 231 if (cnv->toUnicodeStatus && myTarget < targetLimit) |
| 232 { |
| 233 inBytes = cnv->mode; /* restore # of bytes to consume */ |
| 234 i = cnv->toULength; /* restore # of bytes consumed */ |
| 235 cnv->toULength = 0; |
| 236 |
| 237 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a p
revious call*/ |
| 238 cnv->toUnicodeStatus = 0; |
| 239 goto morebytes; |
| 240 } |
| 241 |
| 242 while (mySource < sourceLimit && myTarget < targetLimit) |
| 243 { |
| 244 ch = *(mySource++); |
| 245 if (ch < 0x80) /* Simple case */ |
| 246 { |
| 247 *(myTarget++) = (UChar) ch; |
| 248 *(myOffsets++) = offsetNum++; |
| 249 } |
| 250 else |
| 251 { |
| 252 toUBytes[0] = (char)ch; |
| 253 inBytes = bytesFromUTF8[ch]; |
| 254 i = 1; |
| 255 |
| 256 morebytes: |
| 257 while (i < inBytes) |
| 258 { |
| 259 if (mySource < sourceLimit) |
| 260 { |
| 261 toUBytes[i] = (char) (ch2 = *mySource); |
| 262 if (!UTF8_IS_TRAIL(ch2)) |
| 263 { |
| 264 break; /* i < inBytes */ |
| 265 } |
| 266 ch = (ch << 6) + ch2; |
| 267 ++mySource; |
| 268 i++; |
| 269 } |
| 270 else |
| 271 { |
| 272 cnv->toUnicodeStatus = ch; |
| 273 cnv->mode = inBytes; |
| 274 cnv->toULength = (int8_t)i; |
| 275 goto donefornow; |
| 276 } |
| 277 } |
| 278 |
| 279 /* Remove the accumulated high bits */ |
| 280 ch -= offsetsFromUTF8[inBytes]; |
| 281 |
| 282 /* |
| 283 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
| 284 * - use only trail bytes after a lead byte (checked above) |
| 285 * - use the right number of trail bytes for a given lead byte |
| 286 * - encode a code point <= U+10ffff |
| 287 * - use the fewest possible number of bytes for their code points |
| 288 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
| 289 * |
| 290 * Starting with Unicode 3.2, surrogate code points must not be enco
ded in UTF-8. |
| 291 * There are no irregular sequences any more. |
| 292 * In CESU-8, only surrogates, not supplementary code points, are en
coded directly. |
| 293 */ |
| 294 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && |
| 295 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) |
| 296 { |
| 297 /* Normal valid byte when the loop has not prematurely terminate
d (i < inBytes) */ |
| 298 if (ch <= MAXIMUM_UCS2) |
| 299 { |
| 300 /* fits in 16 bits */ |
| 301 *(myTarget++) = (UChar) ch; |
| 302 *(myOffsets++) = offsetNum; |
| 303 } |
| 304 else |
| 305 { |
| 306 /* write out the surrogates */ |
| 307 ch -= HALF_BASE; |
| 308 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH
_START); |
| 309 *(myOffsets++) = offsetNum; |
| 310 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; |
| 311 if (myTarget < targetLimit) |
| 312 { |
| 313 *(myTarget++) = (UChar)ch; |
| 314 *(myOffsets++) = offsetNum; |
| 315 } |
| 316 else |
| 317 { |
| 318 cnv->UCharErrorBuffer[0] = (UChar) ch; |
| 319 cnv->UCharErrorBufferLength = 1; |
| 320 *err = U_BUFFER_OVERFLOW_ERROR; |
| 321 } |
| 322 } |
| 323 offsetNum += i; |
| 324 } |
| 325 else |
| 326 { |
| 327 cnv->toULength = (int8_t)i; |
| 328 *err = U_ILLEGAL_CHAR_FOUND; |
| 329 break; |
| 330 } |
| 331 } |
| 332 } |
| 333 |
| 334 donefornow: |
| 335 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
| 336 { /* End of target buffer */ |
| 337 *err = U_BUFFER_OVERFLOW_ERROR; |
| 338 } |
| 339 |
| 340 args->target = myTarget; |
| 341 args->source = (const char *) mySource; |
| 342 args->offsets = myOffsets; |
| 343 } |
| 344 |
| 345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, |
| 346 UErrorCode * err) |
| 347 { |
| 348 UConverter *cnv = args->converter; |
| 349 const UChar *mySource = args->source; |
| 350 const UChar *sourceLimit = args->sourceLimit; |
| 351 uint8_t *myTarget = (uint8_t *) args->target; |
| 352 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; |
| 353 uint8_t *tempPtr; |
| 354 UChar32 ch; |
| 355 uint8_t tempBuf[4]; |
| 356 int32_t indexToWrite; |
| 357 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); |
| 358 |
| 359 if (cnv->fromUChar32 && myTarget < targetLimit) |
| 360 { |
| 361 ch = cnv->fromUChar32; |
| 362 cnv->fromUChar32 = 0; |
| 363 goto lowsurrogate; |
| 364 } |
| 365 |
| 366 while (mySource < sourceLimit && myTarget < targetLimit) |
| 367 { |
| 368 ch = *(mySource++); |
| 369 |
| 370 if (ch < 0x80) /* Single byte */ |
| 371 { |
| 372 *(myTarget++) = (uint8_t) ch; |
| 373 } |
| 374 else if (ch < 0x800) /* Double byte */ |
| 375 { |
| 376 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
| 377 if (myTarget < targetLimit) |
| 378 { |
| 379 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
| 380 } |
| 381 else |
| 382 { |
| 383 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
| 384 cnv->charErrorBufferLength = 1; |
| 385 *err = U_BUFFER_OVERFLOW_ERROR; |
| 386 } |
| 387 } |
| 388 else { |
| 389 /* Check for surrogates */ |
| 390 if(UTF_IS_SURROGATE(ch) && isNotCESU8) { |
| 391 lowsurrogate: |
| 392 if (mySource < sourceLimit) { |
| 393 /* test both code units */ |
| 394 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*my
Source)) { |
| 395 /* convert and consume this supplementary code point */ |
| 396 ch=UTF16_GET_PAIR_VALUE(ch, *mySource); |
| 397 ++mySource; |
| 398 /* exit this condition tree */ |
| 399 } |
| 400 else { |
| 401 /* this is an unpaired trail or lead code unit */ |
| 402 /* callback(illegal) */ |
| 403 cnv->fromUChar32 = ch; |
| 404 *err = U_ILLEGAL_CHAR_FOUND; |
| 405 break; |
| 406 } |
| 407 } |
| 408 else { |
| 409 /* no more input */ |
| 410 cnv->fromUChar32 = ch; |
| 411 break; |
| 412 } |
| 413 } |
| 414 |
| 415 /* Do we write the buffer directly for speed, |
| 416 or do we have to be careful about target buffer space? */ |
| 417 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); |
| 418 |
| 419 if (ch <= MAXIMUM_UCS2) { |
| 420 indexToWrite = 2; |
| 421 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
| 422 } |
| 423 else { |
| 424 indexToWrite = 3; |
| 425 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
| 426 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); |
| 427 } |
| 428 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
| 429 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); |
| 430 |
| 431 if (tempPtr == myTarget) { |
| 432 /* There was enough space to write the codepoint directly. */ |
| 433 myTarget += (indexToWrite + 1); |
| 434 } |
| 435 else { |
| 436 /* We might run out of room soon. Write it slowly. */ |
| 437 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { |
| 438 if (myTarget < targetLimit) { |
| 439 *(myTarget++) = *tempPtr; |
| 440 } |
| 441 else { |
| 442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *te
mpPtr; |
| 443 *err = U_BUFFER_OVERFLOW_ERROR; |
| 444 } |
| 445 } |
| 446 } |
| 447 } |
| 448 } |
| 449 |
| 450 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
| 451 { |
| 452 *err = U_BUFFER_OVERFLOW_ERROR; |
| 453 } |
| 454 |
| 455 args->target = (char *) myTarget; |
| 456 args->source = mySource; |
| 457 } |
| 458 |
| 459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * ar
gs, |
| 460 UErrorCode * err) |
| 461 { |
| 462 UConverter *cnv = args->converter; |
| 463 const UChar *mySource = args->source; |
| 464 int32_t *myOffsets = args->offsets; |
| 465 const UChar *sourceLimit = args->sourceLimit; |
| 466 uint8_t *myTarget = (uint8_t *) args->target; |
| 467 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; |
| 468 uint8_t *tempPtr; |
| 469 UChar32 ch; |
| 470 int32_t offsetNum, nextSourceIndex; |
| 471 int32_t indexToWrite; |
| 472 uint8_t tempBuf[4]; |
| 473 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); |
| 474 |
| 475 if (cnv->fromUChar32 && myTarget < targetLimit) |
| 476 { |
| 477 ch = cnv->fromUChar32; |
| 478 cnv->fromUChar32 = 0; |
| 479 offsetNum = -1; |
| 480 nextSourceIndex = 0; |
| 481 goto lowsurrogate; |
| 482 } else { |
| 483 offsetNum = 0; |
| 484 } |
| 485 |
| 486 while (mySource < sourceLimit && myTarget < targetLimit) |
| 487 { |
| 488 ch = *(mySource++); |
| 489 |
| 490 if (ch < 0x80) /* Single byte */ |
| 491 { |
| 492 *(myOffsets++) = offsetNum++; |
| 493 *(myTarget++) = (char) ch; |
| 494 } |
| 495 else if (ch < 0x800) /* Double byte */ |
| 496 { |
| 497 *(myOffsets++) = offsetNum; |
| 498 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); |
| 499 if (myTarget < targetLimit) |
| 500 { |
| 501 *(myOffsets++) = offsetNum++; |
| 502 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); |
| 503 } |
| 504 else |
| 505 { |
| 506 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); |
| 507 cnv->charErrorBufferLength = 1; |
| 508 *err = U_BUFFER_OVERFLOW_ERROR; |
| 509 } |
| 510 } |
| 511 else |
| 512 /* Check for surrogates */ |
| 513 { |
| 514 nextSourceIndex = offsetNum + 1; |
| 515 |
| 516 if(UTF_IS_SURROGATE(ch) && isNotCESU8) { |
| 517 lowsurrogate: |
| 518 if (mySource < sourceLimit) { |
| 519 /* test both code units */ |
| 520 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*my
Source)) { |
| 521 /* convert and consume this supplementary code point */ |
| 522 ch=UTF16_GET_PAIR_VALUE(ch, *mySource); |
| 523 ++mySource; |
| 524 ++nextSourceIndex; |
| 525 /* exit this condition tree */ |
| 526 } |
| 527 else { |
| 528 /* this is an unpaired trail or lead code unit */ |
| 529 /* callback(illegal) */ |
| 530 cnv->fromUChar32 = ch; |
| 531 *err = U_ILLEGAL_CHAR_FOUND; |
| 532 break; |
| 533 } |
| 534 } |
| 535 else { |
| 536 /* no more input */ |
| 537 cnv->fromUChar32 = ch; |
| 538 break; |
| 539 } |
| 540 } |
| 541 |
| 542 /* Do we write the buffer directly for speed, |
| 543 or do we have to be careful about target buffer space? */ |
| 544 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); |
| 545 |
| 546 if (ch <= MAXIMUM_UCS2) { |
| 547 indexToWrite = 2; |
| 548 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); |
| 549 } |
| 550 else { |
| 551 indexToWrite = 3; |
| 552 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); |
| 553 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); |
| 554 } |
| 555 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); |
| 556 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); |
| 557 |
| 558 if (tempPtr == myTarget) { |
| 559 /* There was enough space to write the codepoint directly. */ |
| 560 myTarget += (indexToWrite + 1); |
| 561 myOffsets[0] = offsetNum; |
| 562 myOffsets[1] = offsetNum; |
| 563 myOffsets[2] = offsetNum; |
| 564 if (indexToWrite >= 3) { |
| 565 myOffsets[3] = offsetNum; |
| 566 } |
| 567 myOffsets += (indexToWrite + 1); |
| 568 } |
| 569 else { |
| 570 /* We might run out of room soon. Write it slowly. */ |
| 571 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { |
| 572 if (myTarget < targetLimit) |
| 573 { |
| 574 *(myOffsets++) = offsetNum; |
| 575 *(myTarget++) = *tempPtr; |
| 576 } |
| 577 else |
| 578 { |
| 579 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *te
mpPtr; |
| 580 *err = U_BUFFER_OVERFLOW_ERROR; |
| 581 } |
| 582 } |
| 583 } |
| 584 offsetNum = nextSourceIndex; |
| 585 } |
| 586 } |
| 587 |
| 588 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
| 589 { |
| 590 *err = U_BUFFER_OVERFLOW_ERROR; |
| 591 } |
| 592 |
| 593 args->target = (char *) myTarget; |
| 594 args->source = mySource; |
| 595 args->offsets = myOffsets; |
| 596 } |
| 597 |
| 598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, |
| 599 UErrorCode *err) { |
| 600 UConverter *cnv; |
| 601 const uint8_t *sourceInitial; |
| 602 const uint8_t *source; |
| 603 uint16_t extraBytesToWrite; |
| 604 uint8_t myByte; |
| 605 UChar32 ch; |
| 606 int8_t i, isLegalSequence; |
| 607 |
| 608 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs
*/ |
| 609 |
| 610 cnv = args->converter; |
| 611 sourceInitial = source = (const uint8_t *)args->source; |
| 612 if (source >= (const uint8_t *)args->sourceLimit) |
| 613 { |
| 614 /* no input */ |
| 615 *err = U_INDEX_OUTOFBOUNDS_ERROR; |
| 616 return 0xffff; |
| 617 } |
| 618 |
| 619 myByte = (uint8_t)*(source++); |
| 620 if (myByte < 0x80) |
| 621 { |
| 622 args->source = (const char *)source; |
| 623 return (UChar32)myByte; |
| 624 } |
| 625 |
| 626 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; |
| 627 if (extraBytesToWrite == 0) { |
| 628 cnv->toUBytes[0] = myByte; |
| 629 cnv->toULength = 1; |
| 630 *err = U_ILLEGAL_CHAR_FOUND; |
| 631 args->source = (const char *)source; |
| 632 return 0xffff; |
| 633 } |
| 634 |
| 635 /*The byte sequence is longer than the buffer area passed*/ |
| 636 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) |
| 637 { |
| 638 /* check if all of the remaining bytes are trail bytes */ |
| 639 cnv->toUBytes[0] = myByte; |
| 640 i = 1; |
| 641 *err = U_TRUNCATED_CHAR_FOUND; |
| 642 while(source < (const uint8_t *)args->sourceLimit) { |
| 643 if(U8_IS_TRAIL(myByte = *source)) { |
| 644 cnv->toUBytes[i++] = myByte; |
| 645 ++source; |
| 646 } else { |
| 647 /* error even before we run out of input */ |
| 648 *err = U_ILLEGAL_CHAR_FOUND; |
| 649 break; |
| 650 } |
| 651 } |
| 652 cnv->toULength = i; |
| 653 args->source = (const char *)source; |
| 654 return 0xffff; |
| 655 } |
| 656 |
| 657 isLegalSequence = 1; |
| 658 ch = myByte << 6; |
| 659 switch(extraBytesToWrite) |
| 660 { |
| 661 /* note: code falls through cases! (sic)*/ |
| 662 case 6: |
| 663 ch += (myByte = *source); |
| 664 ch <<= 6; |
| 665 if (!UTF8_IS_TRAIL(myByte)) |
| 666 { |
| 667 isLegalSequence = 0; |
| 668 break; |
| 669 } |
| 670 ++source; |
| 671 case 5: |
| 672 ch += (myByte = *source); |
| 673 ch <<= 6; |
| 674 if (!UTF8_IS_TRAIL(myByte)) |
| 675 { |
| 676 isLegalSequence = 0; |
| 677 break; |
| 678 } |
| 679 ++source; |
| 680 case 4: |
| 681 ch += (myByte = *source); |
| 682 ch <<= 6; |
| 683 if (!UTF8_IS_TRAIL(myByte)) |
| 684 { |
| 685 isLegalSequence = 0; |
| 686 break; |
| 687 } |
| 688 ++source; |
| 689 case 3: |
| 690 ch += (myByte = *source); |
| 691 ch <<= 6; |
| 692 if (!UTF8_IS_TRAIL(myByte)) |
| 693 { |
| 694 isLegalSequence = 0; |
| 695 break; |
| 696 } |
| 697 ++source; |
| 698 case 2: |
| 699 ch += (myByte = *source); |
| 700 if (!UTF8_IS_TRAIL(myByte)) |
| 701 { |
| 702 isLegalSequence = 0; |
| 703 break; |
| 704 } |
| 705 ++source; |
| 706 }; |
| 707 ch -= offsetsFromUTF8[extraBytesToWrite]; |
| 708 args->source = (const char *)source; |
| 709 |
| 710 /* |
| 711 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
| 712 * - use only trail bytes after a lead byte (checked above) |
| 713 * - use the right number of trail bytes for a given lead byte |
| 714 * - encode a code point <= U+10ffff |
| 715 * - use the fewest possible number of bytes for their code points |
| 716 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
| 717 * |
| 718 * Starting with Unicode 3.2, surrogate code points must not be encoded in U
TF-8. |
| 719 * There are no irregular sequences any more. |
| 720 */ |
| 721 if (isLegalSequence && |
| 722 (uint32_t)ch <= MAXIMUM_UTF && |
| 723 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && |
| 724 !U_IS_SURROGATE(ch) |
| 725 ) { |
| 726 return ch; /* return the code point */ |
| 727 } |
| 728 |
| 729 for(i = 0; sourceInitial < source; ++i) { |
| 730 cnv->toUBytes[i] = *sourceInitial++; |
| 731 } |
| 732 cnv->toULength = i; |
| 733 *err = U_ILLEGAL_CHAR_FOUND; |
| 734 return 0xffff; |
| 735 } |
| 736 |
| 737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ |
| 738 |
| 739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ |
| 740 static const UChar32 |
| 741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; |
| 742 |
| 743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail
)<<6+trail... */ |
| 744 static const UChar32 |
| 745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; |
| 746 |
| 747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(
). */ |
| 748 static void |
| 749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
| 750 UConverterToUnicodeArgs *pToUArgs, |
| 751 UErrorCode *pErrorCode) { |
| 752 UConverter *utf8, *cnv; |
| 753 const uint8_t *source, *sourceLimit; |
| 754 uint8_t *target; |
| 755 int32_t targetCapacity; |
| 756 int32_t count; |
| 757 |
| 758 int8_t oldToULength, toULength, toULimit; |
| 759 |
| 760 UChar32 c; |
| 761 uint8_t b, t1, t2; |
| 762 |
| 763 /* set up the local pointers */ |
| 764 utf8=pToUArgs->converter; |
| 765 cnv=pFromUArgs->converter; |
| 766 source=(uint8_t *)pToUArgs->source; |
| 767 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
| 768 target=(uint8_t *)pFromUArgs->target; |
| 769 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
| 770 |
| 771 /* get the converter state from the UTF-8 UConverter */ |
| 772 c=(UChar32)utf8->toUnicodeStatus; |
| 773 if(c!=0) { |
| 774 toULength=oldToULength=utf8->toULength; |
| 775 toULimit=(int8_t)utf8->mode; |
| 776 } else { |
| 777 toULength=oldToULength=toULimit=0; |
| 778 } |
| 779 |
| 780 count=(int32_t)(sourceLimit-source)+oldToULength; |
| 781 if(count<toULimit) { |
| 782 /* |
| 783 * Not enough input to complete the partial character. |
| 784 * Jump to moreBytes below - it will not output to target. |
| 785 */ |
| 786 } else if(targetCapacity<toULimit) { |
| 787 /* |
| 788 * Not enough target capacity to output the partial character. |
| 789 * Let the standard converter handle this. |
| 790 */ |
| 791 *pErrorCode=U_USING_DEFAULT_WARNING; |
| 792 return; |
| 793 } else { |
| 794 /* |
| 795 * Use a single counter for source and target, counting the minimum of |
| 796 * the source length and the target capacity. |
| 797 * As a result, the source length is checked only once per multi-byte |
| 798 * character instead of twice. |
| 799 * |
| 800 * Make sure that the last byte sequence is complete, or else |
| 801 * stop just before it. |
| 802 * (The longest legal byte sequence has 3 trail bytes.) |
| 803 * Count oldToULength (number of source bytes from a previous buffer) |
| 804 * into the source length but reduce the source index by toULimit |
| 805 * while going back over trail bytes in order to not go back into |
| 806 * the bytes that will be read for finishing a partial |
| 807 * sequence from the previous buffer. |
| 808 * Let the standard converter handle edge cases. |
| 809 */ |
| 810 int32_t i; |
| 811 |
| 812 if(count>targetCapacity) { |
| 813 count=targetCapacity; |
| 814 } |
| 815 |
| 816 i=0; |
| 817 while(i<3 && i<(count-toULimit)) { |
| 818 b=source[count-oldToULength-i-1]; |
| 819 if(U8_IS_TRAIL(b)) { |
| 820 ++i; |
| 821 } else { |
| 822 if(i<utf8_countTrailBytes[b]) { |
| 823 /* stop converting before the lead byte if there are not eno
ugh trail bytes for it */ |
| 824 count-=i+1; |
| 825 } |
| 826 break; |
| 827 } |
| 828 } |
| 829 } |
| 830 |
| 831 if(c!=0) { |
| 832 utf8->toUnicodeStatus=0; |
| 833 utf8->toULength=0; |
| 834 goto moreBytes; |
| 835 /* See note in ucnv_SBCSFromUTF8() about this goto. */ |
| 836 } |
| 837 |
| 838 /* conversion loop */ |
| 839 while(count>0) { |
| 840 b=*source++; |
| 841 if((int8_t)b>=0) { |
| 842 /* convert ASCII */ |
| 843 *target++=b; |
| 844 --count; |
| 845 continue; |
| 846 } else { |
| 847 if(b>0xe0) { |
| 848 if( /* handle U+1000..U+D7FF inline */ |
| 849 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || |
| 850 (b==0xed && (t1 <= 0x9f))) && |
| 851 (t2=source[1]) >= 0x80 && t2 <= 0xbf |
| 852 ) { |
| 853 source+=2; |
| 854 *target++=b; |
| 855 *target++=t1; |
| 856 *target++=t2; |
| 857 count-=3; |
| 858 continue; |
| 859 } |
| 860 } else if(b<0xe0) { |
| 861 if( /* handle U+0080..U+07FF inline */ |
| 862 b>=0xc2 && |
| 863 (t1=*source) >= 0x80 && t1 <= 0xbf |
| 864 ) { |
| 865 ++source; |
| 866 *target++=b; |
| 867 *target++=t1; |
| 868 count-=2; |
| 869 continue; |
| 870 } |
| 871 } else if(b==0xe0) { |
| 872 if( /* handle U+0800..U+0FFF inline */ |
| 873 (t1=source[0]) >= 0xa0 && t1 <= 0xbf && |
| 874 (t2=source[1]) >= 0x80 && t2 <= 0xbf |
| 875 ) { |
| 876 source+=2; |
| 877 *target++=b; |
| 878 *target++=t1; |
| 879 *target++=t2; |
| 880 count-=3; |
| 881 continue; |
| 882 } |
| 883 } |
| 884 |
| 885 /* handle "complicated" and error cases, and continuing partial char
acters */ |
| 886 oldToULength=0; |
| 887 toULength=1; |
| 888 toULimit=utf8_countTrailBytes[b]+1; |
| 889 c=b; |
| 890 moreBytes: |
| 891 while(toULength<toULimit) { |
| 892 if(source<sourceLimit) { |
| 893 b=*source; |
| 894 if(U8_IS_TRAIL(b)) { |
| 895 ++source; |
| 896 ++toULength; |
| 897 c=(c<<6)+b; |
| 898 } else { |
| 899 break; /* sequence too short, stop with toULength<toULim
it */ |
| 900 } |
| 901 } else { |
| 902 /* store the partial UTF-8 character, compatible with the re
gular UTF-8 converter */ |
| 903 source-=(toULength-oldToULength); |
| 904 while(oldToULength<toULength) { |
| 905 utf8->toUBytes[oldToULength++]=*source++; |
| 906 } |
| 907 utf8->toUnicodeStatus=c; |
| 908 utf8->toULength=toULength; |
| 909 utf8->mode=toULimit; |
| 910 pToUArgs->source=(char *)source; |
| 911 pFromUArgs->target=(char *)target; |
| 912 return; |
| 913 } |
| 914 } |
| 915 |
| 916 if( toULength==toULimit && /* consumed all trail bytes */ |
| 917 (toULength==3 || toULength==2) && /* BMP */ |
| 918 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && |
| 919 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ |
| 920 ) { |
| 921 /* legal byte sequence for BMP code point */ |
| 922 } else if( |
| 923 toULength==toULimit && toULength==4 && |
| 924 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) |
| 925 ) { |
| 926 /* legal byte sequence for supplementary code point */ |
| 927 } else { |
| 928 /* error handling: illegal UTF-8 byte sequence */ |
| 929 source-=(toULength-oldToULength); |
| 930 while(oldToULength<toULength) { |
| 931 utf8->toUBytes[oldToULength++]=*source++; |
| 932 } |
| 933 utf8->toULength=toULength; |
| 934 pToUArgs->source=(char *)source; |
| 935 pFromUArgs->target=(char *)target; |
| 936 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 937 return; |
| 938 } |
| 939 |
| 940 /* copy the legal byte sequence to the target */ |
| 941 { |
| 942 int8_t i; |
| 943 |
| 944 for(i=0; i<oldToULength; ++i) { |
| 945 *target++=utf8->toUBytes[i]; |
| 946 } |
| 947 source-=(toULength-oldToULength); |
| 948 for(; i<toULength; ++i) { |
| 949 *target++=*source++; |
| 950 } |
| 951 count-=toULength; |
| 952 } |
| 953 } |
| 954 } |
| 955 |
| 956 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { |
| 957 if(target==(const uint8_t *)pFromUArgs->targetLimit) { |
| 958 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 959 } else { |
| 960 b=*source; |
| 961 toULimit=utf8_countTrailBytes[b]+1; |
| 962 if(toULimit>(sourceLimit-source)) { |
| 963 /* collect a truncated byte sequence */ |
| 964 toULength=0; |
| 965 c=b; |
| 966 for(;;) { |
| 967 utf8->toUBytes[toULength++]=b; |
| 968 if(++source==sourceLimit) { |
| 969 /* partial byte sequence at end of source */ |
| 970 utf8->toUnicodeStatus=c; |
| 971 utf8->toULength=toULength; |
| 972 utf8->mode=toULimit; |
| 973 break; |
| 974 } else if(!U8_IS_TRAIL(b=*source)) { |
| 975 /* lead byte in trail byte position */ |
| 976 utf8->toULength=toULength; |
| 977 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 978 break; |
| 979 } |
| 980 c=(c<<6)+b; |
| 981 } |
| 982 } else { |
| 983 /* partial-sequence target overflow: fall back to the pivoting i
mplementation */ |
| 984 *pErrorCode=U_USING_DEFAULT_WARNING; |
| 985 } |
| 986 } |
| 987 } |
| 988 |
| 989 /* write back the updated pointers */ |
| 990 pToUArgs->source=(char *)source; |
| 991 pFromUArgs->target=(char *)target; |
| 992 } |
| 993 |
| 994 /* UTF-8 converter data ----------------------------------------------------- */ |
| 995 |
| 996 static const UConverterImpl _UTF8Impl={ |
| 997 UCNV_UTF8, |
| 998 |
| 999 NULL, |
| 1000 NULL, |
| 1001 |
| 1002 NULL, |
| 1003 NULL, |
| 1004 NULL, |
| 1005 |
| 1006 ucnv_toUnicode_UTF8, |
| 1007 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
| 1008 ucnv_fromUnicode_UTF8, |
| 1009 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
| 1010 ucnv_getNextUChar_UTF8, |
| 1011 |
| 1012 NULL, |
| 1013 NULL, |
| 1014 NULL, |
| 1015 NULL, |
| 1016 ucnv_getNonSurrogateUnicodeSet, |
| 1017 |
| 1018 ucnv_UTF8FromUTF8, |
| 1019 ucnv_UTF8FromUTF8 |
| 1020 }; |
| 1021 |
| 1022 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ |
| 1023 static const UConverterStaticData _UTF8StaticData={ |
| 1024 sizeof(UConverterStaticData), |
| 1025 "UTF-8", |
| 1026 1208, UCNV_IBM, UCNV_UTF8, |
| 1027 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
| 1028 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
| 1029 0, |
| 1030 0, |
| 1031 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 1032 }; |
| 1033 |
| 1034 |
| 1035 const UConverterSharedData _UTF8Data={ |
| 1036 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 1037 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, |
| 1038 0 |
| 1039 }; |
| 1040 |
| 1041 /* CESU-8 converter data ---------------------------------------------------- */ |
| 1042 |
| 1043 static const UConverterImpl _CESU8Impl={ |
| 1044 UCNV_CESU8, |
| 1045 |
| 1046 NULL, |
| 1047 NULL, |
| 1048 |
| 1049 NULL, |
| 1050 NULL, |
| 1051 NULL, |
| 1052 |
| 1053 ucnv_toUnicode_UTF8, |
| 1054 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
| 1055 ucnv_fromUnicode_UTF8, |
| 1056 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
| 1057 NULL, |
| 1058 |
| 1059 NULL, |
| 1060 NULL, |
| 1061 NULL, |
| 1062 NULL, |
| 1063 ucnv_getCompleteUnicodeSet |
| 1064 }; |
| 1065 |
| 1066 static const UConverterStaticData _CESU8StaticData={ |
| 1067 sizeof(UConverterStaticData), |
| 1068 "CESU-8", |
| 1069 9400, /* CCSID for CESU-8 */ |
| 1070 UCNV_UNKNOWN, UCNV_CESU8, 1, 3, |
| 1071 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
| 1072 0, |
| 1073 0, |
| 1074 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 1075 }; |
| 1076 |
| 1077 |
| 1078 const UConverterSharedData _CESU8Data={ |
| 1079 sizeof(UConverterSharedData), ~((uint32_t) 0), |
| 1080 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, |
| 1081 0 |
| 1082 }; |
| 1083 |
| 1084 #endif |
OLD | NEW |