| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * | 3 * |
| 4 * Copyright (C) 1999-2013, International Business Machines | 4 * Copyright (C) 1999-2014, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
| 6 * | 6 * |
| 7 ******************************************************************************* | 7 ******************************************************************************* |
| 8 * file name: utf8.h | 8 * file name: utf8.h |
| 9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
| 11 * indentation:4 | 11 * indentation:4 |
| 12 * | 12 * |
| 13 * created on: 1999sep13 | 13 * created on: 1999sep13 |
| 14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 71 * leadByte is evaluated multiple times. | 71 * leadByte is evaluated multiple times. |
| 72 * | 72 * |
| 73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: | 73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: |
| 74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) | 74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) |
| 75 * leadByte was evaluated exactly once. | 75 * leadByte was evaluated exactly once. |
| 76 * | 76 * |
| 77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. | 77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. |
| 78 * @internal | 78 * @internal |
| 79 */ | 79 */ |
| 80 #define U8_COUNT_TRAIL_BYTES(leadByte) \ | 80 #define U8_COUNT_TRAIL_BYTES(leadByte) \ |
| 81 ((leadByte)<0xf0 ? \ | 81 ((uint8_t)(leadByte)<0xf0 ? \ |
| 82 ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \ | 82 ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \ |
| 83 (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0) | 83 (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(lead
Byte)>=0xfc) : 0) |
| 84 | 84 |
| 85 /** | 85 /** |
| 86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. | 86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. |
| 87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. | 87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. |
| 88 * leadByte might be evaluated multiple times. | 88 * leadByte might be evaluated multiple times. |
| 89 * | 89 * |
| 90 * This is internal since it is not meant to be called directly by external clients; | 90 * This is internal since it is not meant to be called directly by external clients; |
| 91 * however it is called by public macros in this file and thus must remain stable. | 91 * however it is called by public macros in this file and thus must remain stable. |
| 92 * | 92 * |
| 93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. | 93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. |
| (...skipping 152 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 246 * @param c output UChar32 variable, set to <0 in case of an error | 246 * @param c output UChar32 variable, set to <0 in case of an error |
| 247 * @see U8_GET_UNSAFE | 247 * @see U8_GET_UNSAFE |
| 248 * @stable ICU 2.4 | 248 * @stable ICU 2.4 |
| 249 */ | 249 */ |
| 250 #define U8_GET(s, start, i, length, c) { \ | 250 #define U8_GET(s, start, i, length, c) { \ |
| 251 int32_t _u8_get_index=(i); \ | 251 int32_t _u8_get_index=(i); \ |
| 252 U8_SET_CP_START(s, start, _u8_get_index); \ | 252 U8_SET_CP_START(s, start, _u8_get_index); \ |
| 253 U8_NEXT(s, _u8_get_index, length, c); \ | 253 U8_NEXT(s, _u8_get_index, length, c); \ |
| 254 } | 254 } |
| 255 | 255 |
| 256 #ifndef U_HIDE_DRAFT_API | |
| 257 /** | 256 /** |
| 258 * Get a code point from a string at a random-access offset, | 257 * Get a code point from a string at a random-access offset, |
| 259 * without changing the offset. | 258 * without changing the offset. |
| 260 * The offset may point to either the lead byte or one of the trail bytes | 259 * The offset may point to either the lead byte or one of the trail bytes |
| 261 * for a code point, in which case the macro will read all of the bytes | 260 * for a code point, in which case the macro will read all of the bytes |
| 262 * for the code point. | 261 * for the code point. |
| 263 * | 262 * |
| 264 * The length can be negative for a NUL-terminated string. | 263 * The length can be negative for a NUL-terminated string. |
| 265 * | 264 * |
| 266 * If the offset points to an illegal UTF-8 byte sequence, then | 265 * If the offset points to an illegal UTF-8 byte sequence, then |
| 267 * c is set to U+FFFD. | 266 * c is set to U+FFFD. |
| 268 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. | 267 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. |
| 269 * | 268 * |
| 270 * This macro does not distinguish between a real U+FFFD in the text | 269 * This macro does not distinguish between a real U+FFFD in the text |
| 271 * and U+FFFD returned for an ill-formed sequence. | 270 * and U+FFFD returned for an ill-formed sequence. |
| 272 * Use U8_GET() if that distinction is important. | 271 * Use U8_GET() if that distinction is important. |
| 273 * | 272 * |
| 274 * @param s const uint8_t * string | 273 * @param s const uint8_t * string |
| 275 * @param start int32_t starting string offset | 274 * @param start int32_t starting string offset |
| 276 * @param i int32_t string offset, must be start<=i<length | 275 * @param i int32_t string offset, must be start<=i<length |
| 277 * @param length int32_t string length | 276 * @param length int32_t string length |
| 278 * @param c output UChar32 variable, set to U+FFFD in case of an error | 277 * @param c output UChar32 variable, set to U+FFFD in case of an error |
| 279 * @see U8_GET | 278 * @see U8_GET |
| 280 * @draft ICU 51 | 279 * @stable ICU 51 |
| 281 */ | 280 */ |
| 282 #define U8_GET_OR_FFFD(s, start, i, length, c) { \ | 281 #define U8_GET_OR_FFFD(s, start, i, length, c) { \ |
| 283 int32_t _u8_get_index=(i); \ | 282 int32_t _u8_get_index=(i); \ |
| 284 U8_SET_CP_START(s, start, _u8_get_index); \ | 283 U8_SET_CP_START(s, start, _u8_get_index); \ |
| 285 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ | 284 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ |
| 286 } | 285 } |
| 287 #endif /* U_HIDE_DRAFT_API */ | |
| 288 | 286 |
| 289 /* definitions with forward iteration --------------------------------------- */ | 287 /* definitions with forward iteration --------------------------------------- */ |
| 290 | 288 |
| 291 /** | 289 /** |
| 292 * Get a code point from a string at a code point boundary offset, | 290 * Get a code point from a string at a code point boundary offset, |
| 293 * and advance the offset to the next code point boundary. | 291 * and advance the offset to the next code point boundary. |
| 294 * (Post-incrementing forward iteration.) | 292 * (Post-incrementing forward iteration.) |
| 295 * "Unsafe" macro, assumes well-formed UTF-8. | 293 * "Unsafe" macro, assumes well-formed UTF-8. |
| 296 * | 294 * |
| 297 * The offset may point to the lead byte of a multi-byte sequence, | 295 * The offset may point to the lead byte of a multi-byte sequence, |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 361 ) { \ | 359 ) { \ |
| 362 (c)=(((c)&0x1f)<<6)|__t1; \ | 360 (c)=(((c)&0x1f)<<6)|__t1; \ |
| 363 ++(i); \ | 361 ++(i); \ |
| 364 } else { \ | 362 } else { \ |
| 365 /* function call for "complicated" and error cases */ \ | 363 /* function call for "complicated" and error cases */ \ |
| 366 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1)
; \ | 364 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1)
; \ |
| 367 } \ | 365 } \ |
| 368 } \ | 366 } \ |
| 369 } | 367 } |
| 370 | 368 |
| 371 #ifndef U_HIDE_DRAFT_API | |
| 372 /** | 369 /** |
| 373 * Get a code point from a string at a code point boundary offset, | 370 * Get a code point from a string at a code point boundary offset, |
| 374 * and advance the offset to the next code point boundary. | 371 * and advance the offset to the next code point boundary. |
| 375 * (Post-incrementing forward iteration.) | 372 * (Post-incrementing forward iteration.) |
| 376 * "Safe" macro, checks for illegal sequences and for string boundaries. | 373 * "Safe" macro, checks for illegal sequences and for string boundaries. |
| 377 * | 374 * |
| 378 * The length can be negative for a NUL-terminated string. | 375 * The length can be negative for a NUL-terminated string. |
| 379 * | 376 * |
| 380 * The offset may point to the lead byte of a multi-byte sequence, | 377 * The offset may point to the lead byte of a multi-byte sequence, |
| 381 * in which case the macro will read the whole sequence. | 378 * in which case the macro will read the whole sequence. |
| 382 * If the offset points to a trail byte or an illegal UTF-8 sequence, then | 379 * If the offset points to a trail byte or an illegal UTF-8 sequence, then |
| 383 * c is set to U+FFFD. | 380 * c is set to U+FFFD. |
| 384 * | 381 * |
| 385 * This macro does not distinguish between a real U+FFFD in the text | 382 * This macro does not distinguish between a real U+FFFD in the text |
| 386 * and U+FFFD returned for an ill-formed sequence. | 383 * and U+FFFD returned for an ill-formed sequence. |
| 387 * Use U8_NEXT() if that distinction is important. | 384 * Use U8_NEXT() if that distinction is important. |
| 388 * | 385 * |
| 389 * @param s const uint8_t * string | 386 * @param s const uint8_t * string |
| 390 * @param i int32_t string offset, must be i<length | 387 * @param i int32_t string offset, must be i<length |
| 391 * @param length int32_t string length | 388 * @param length int32_t string length |
| 392 * @param c output UChar32 variable, set to U+FFFD in case of an error | 389 * @param c output UChar32 variable, set to U+FFFD in case of an error |
| 393 * @see U8_NEXT | 390 * @see U8_NEXT |
| 394 * @draft ICU 51 | 391 * @stable ICU 51 |
| 395 */ | 392 */ |
| 396 #define U8_NEXT_OR_FFFD(s, i, length, c) { \ | 393 #define U8_NEXT_OR_FFFD(s, i, length, c) { \ |
| 397 (c)=(uint8_t)(s)[(i)++]; \ | 394 (c)=(uint8_t)(s)[(i)++]; \ |
| 398 if((c)>=0x80) { \ | 395 if((c)>=0x80) { \ |
| 399 uint8_t __t1, __t2; \ | 396 uint8_t __t1, __t2; \ |
| 400 if( /* handle U+1000..U+CFFF inline */ \ | 397 if( /* handle U+1000..U+CFFF inline */ \ |
| 401 (0xe0<(c) && (c)<=0xec) && \ | 398 (0xe0<(c) && (c)<=0xec) && \ |
| 402 (((i)+1)<(length) || (length)<0) && \ | 399 (((i)+1)<(length) || (length)<0) && \ |
| 403 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ | 400 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ |
| 404 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ | 401 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ |
| 405 ) { \ | 402 ) { \ |
| 406 /* no need for (c&0xf) because the upper bits are truncated after <<
12 in the cast to (UChar) */ \ | 403 /* no need for (c&0xf) because the upper bits are truncated after <<
12 in the cast to (UChar) */ \ |
| 407 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ | 404 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ |
| 408 (i)+=2; \ | 405 (i)+=2; \ |
| 409 } else if( /* handle U+0080..U+07FF inline */ \ | 406 } else if( /* handle U+0080..U+07FF inline */ \ |
| 410 ((c)<0xe0 && (c)>=0xc2) && \ | 407 ((c)<0xe0 && (c)>=0xc2) && \ |
| 411 ((i)!=(length)) && \ | 408 ((i)!=(length)) && \ |
| 412 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ | 409 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ |
| 413 ) { \ | 410 ) { \ |
| 414 (c)=(((c)&0x1f)<<6)|__t1; \ | 411 (c)=(((c)&0x1f)<<6)|__t1; \ |
| 415 ++(i); \ | 412 ++(i); \ |
| 416 } else { \ | 413 } else { \ |
| 417 /* function call for "complicated" and error cases */ \ | 414 /* function call for "complicated" and error cases */ \ |
| 418 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3)
; \ | 415 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3)
; \ |
| 419 } \ | 416 } \ |
| 420 } \ | 417 } \ |
| 421 } | 418 } |
| 422 #endif /* U_HIDE_DRAFT_API */ | |
| 423 | 419 |
| 424 /** | 420 /** |
| 425 * Append a code point to a string, overwriting 1 to 4 bytes. | 421 * Append a code point to a string, overwriting 1 to 4 bytes. |
| 426 * The offset points to the current end of the string contents | 422 * The offset points to the current end of the string contents |
| 427 * and is advanced (post-increment). | 423 * and is advanced (post-increment). |
| 428 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. | 424 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. |
| 429 * Otherwise, the result is undefined. | 425 * Otherwise, the result is undefined. |
| 430 * | 426 * |
| 431 * @param s const uint8_t * string buffer | 427 * @param s const uint8_t * string buffer |
| 432 * @param i string offset | 428 * @param i string offset |
| (...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 669 * @see U8_PREV_UNSAFE | 665 * @see U8_PREV_UNSAFE |
| 670 * @stable ICU 2.4 | 666 * @stable ICU 2.4 |
| 671 */ | 667 */ |
| 672 #define U8_PREV(s, start, i, c) { \ | 668 #define U8_PREV(s, start, i, c) { \ |
| 673 (c)=(uint8_t)(s)[--(i)]; \ | 669 (c)=(uint8_t)(s)[--(i)]; \ |
| 674 if((c)>=0x80) { \ | 670 if((c)>=0x80) { \ |
| 675 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ | 671 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ |
| 676 } \ | 672 } \ |
| 677 } | 673 } |
| 678 | 674 |
| 679 #ifndef U_HIDE_DRAFT_API | |
| 680 /** | 675 /** |
| 681 * Move the string offset from one code point boundary to the previous one | 676 * Move the string offset from one code point boundary to the previous one |
| 682 * and get the code point between them. | 677 * and get the code point between them. |
| 683 * (Pre-decrementing backward iteration.) | 678 * (Pre-decrementing backward iteration.) |
| 684 * "Safe" macro, checks for illegal sequences and for string boundaries. | 679 * "Safe" macro, checks for illegal sequences and for string boundaries. |
| 685 * | 680 * |
| 686 * The input offset may be the same as the string length. | 681 * The input offset may be the same as the string length. |
| 687 * If the offset is behind a multi-byte sequence, then the macro will read | 682 * If the offset is behind a multi-byte sequence, then the macro will read |
| 688 * the whole sequence. | 683 * the whole sequence. |
| 689 * If the offset is behind a lead byte, then that itself | 684 * If the offset is behind a lead byte, then that itself |
| 690 * will be returned as the code point. | 685 * will be returned as the code point. |
| 691 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. | 686 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. |
| 692 * | 687 * |
| 693 * This macro does not distinguish between a real U+FFFD in the text | 688 * This macro does not distinguish between a real U+FFFD in the text |
| 694 * and U+FFFD returned for an ill-formed sequence. | 689 * and U+FFFD returned for an ill-formed sequence. |
| 695 * Use U8_PREV() if that distinction is important. | 690 * Use U8_PREV() if that distinction is important. |
| 696 * | 691 * |
| 697 * @param s const uint8_t * string | 692 * @param s const uint8_t * string |
| 698 * @param start int32_t starting string offset (usually 0) | 693 * @param start int32_t starting string offset (usually 0) |
| 699 * @param i int32_t string offset, must be start<i | 694 * @param i int32_t string offset, must be start<i |
| 700 * @param c output UChar32 variable, set to U+FFFD in case of an error | 695 * @param c output UChar32 variable, set to U+FFFD in case of an error |
| 701 * @see U8_PREV | 696 * @see U8_PREV |
| 702 * @draft ICU 51 | 697 * @stable ICU 51 |
| 703 */ | 698 */ |
| 704 #define U8_PREV_OR_FFFD(s, start, i, c) { \ | 699 #define U8_PREV_OR_FFFD(s, start, i, c) { \ |
| 705 (c)=(uint8_t)(s)[--(i)]; \ | 700 (c)=(uint8_t)(s)[--(i)]; \ |
| 706 if((c)>=0x80) { \ | 701 if((c)>=0x80) { \ |
| 707 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ | 702 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ |
| 708 } \ | 703 } \ |
| 709 } | 704 } |
| 710 #endif /* U_HIDE_DRAFT_API */ | |
| 711 | 705 |
| 712 /** | 706 /** |
| 713 * Move the string offset from one code point boundary to the previous one. | 707 * Move the string offset from one code point boundary to the previous one. |
| 714 * (Pre-decrementing backward iteration.) | 708 * (Pre-decrementing backward iteration.) |
| 715 * The input offset may be the same as the string length. | 709 * The input offset may be the same as the string length. |
| 716 * "Unsafe" macro, assumes well-formed UTF-8. | 710 * "Unsafe" macro, assumes well-formed UTF-8. |
| 717 * | 711 * |
| 718 * @param s const uint8_t * string | 712 * @param s const uint8_t * string |
| 719 * @param i string offset | 713 * @param i string offset |
| 720 * @see U8_BACK_1 | 714 * @see U8_BACK_1 |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 821 * @stable ICU 2.4 | 815 * @stable ICU 2.4 |
| 822 */ | 816 */ |
| 823 #define U8_SET_CP_LIMIT(s, start, i, length) { \ | 817 #define U8_SET_CP_LIMIT(s, start, i, length) { \ |
| 824 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ | 818 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ |
| 825 U8_BACK_1(s, start, i); \ | 819 U8_BACK_1(s, start, i); \ |
| 826 U8_FWD_1(s, i, length); \ | 820 U8_FWD_1(s, i, length); \ |
| 827 } \ | 821 } \ |
| 828 } | 822 } |
| 829 | 823 |
| 830 #endif | 824 #endif |
| OLD | NEW |