OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 1999-2013, International Business Machines | 4 * Copyright (C) 1999-2014, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: utf8.h | 8 * file name: utf8.h |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 1999sep13 | 13 * created on: 1999sep13 |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 * leadByte is evaluated multiple times. | 71 * leadByte is evaluated multiple times. |
72 * | 72 * |
73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: | 73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: |
74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) | 74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) |
75 * leadByte was evaluated exactly once. | 75 * leadByte was evaluated exactly once. |
76 * | 76 * |
77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. | 77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. |
78 * @internal | 78 * @internal |
79 */ | 79 */ |
80 #define U8_COUNT_TRAIL_BYTES(leadByte) \ | 80 #define U8_COUNT_TRAIL_BYTES(leadByte) \ |
81 ((leadByte)<0xf0 ? \ | 81 ((uint8_t)(leadByte)<0xf0 ? \ |
82 ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \ | 82 ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \ |
83 (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0) | 83 (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(lead
Byte)>=0xfc) : 0) |
84 | 84 |
85 /** | 85 /** |
86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. | 86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. |
87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. | 87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. |
88 * leadByte might be evaluated multiple times. | 88 * leadByte might be evaluated multiple times. |
89 * | 89 * |
90 * This is internal since it is not meant to be called directly by external clients; | 90 * This is internal since it is not meant to be called directly by external clients; |
91 * however it is called by public macros in this file and thus must remain stable. | 91 * however it is called by public macros in this file and thus must remain stable. |
92 * | 92 * |
93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. | 93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. |
(...skipping 152 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
246 * @param c output UChar32 variable, set to <0 in case of an error | 246 * @param c output UChar32 variable, set to <0 in case of an error |
247 * @see U8_GET_UNSAFE | 247 * @see U8_GET_UNSAFE |
248 * @stable ICU 2.4 | 248 * @stable ICU 2.4 |
249 */ | 249 */ |
250 #define U8_GET(s, start, i, length, c) { \ | 250 #define U8_GET(s, start, i, length, c) { \ |
251 int32_t _u8_get_index=(i); \ | 251 int32_t _u8_get_index=(i); \ |
252 U8_SET_CP_START(s, start, _u8_get_index); \ | 252 U8_SET_CP_START(s, start, _u8_get_index); \ |
253 U8_NEXT(s, _u8_get_index, length, c); \ | 253 U8_NEXT(s, _u8_get_index, length, c); \ |
254 } | 254 } |
255 | 255 |
256 #ifndef U_HIDE_DRAFT_API | |
257 /** | 256 /** |
258 * Get a code point from a string at a random-access offset, | 257 * Get a code point from a string at a random-access offset, |
259 * without changing the offset. | 258 * without changing the offset. |
260 * The offset may point to either the lead byte or one of the trail bytes | 259 * The offset may point to either the lead byte or one of the trail bytes |
261 * for a code point, in which case the macro will read all of the bytes | 260 * for a code point, in which case the macro will read all of the bytes |
262 * for the code point. | 261 * for the code point. |
263 * | 262 * |
264 * The length can be negative for a NUL-terminated string. | 263 * The length can be negative for a NUL-terminated string. |
265 * | 264 * |
266 * If the offset points to an illegal UTF-8 byte sequence, then | 265 * If the offset points to an illegal UTF-8 byte sequence, then |
267 * c is set to U+FFFD. | 266 * c is set to U+FFFD. |
268 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. | 267 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. |
269 * | 268 * |
270 * This macro does not distinguish between a real U+FFFD in the text | 269 * This macro does not distinguish between a real U+FFFD in the text |
271 * and U+FFFD returned for an ill-formed sequence. | 270 * and U+FFFD returned for an ill-formed sequence. |
272 * Use U8_GET() if that distinction is important. | 271 * Use U8_GET() if that distinction is important. |
273 * | 272 * |
274 * @param s const uint8_t * string | 273 * @param s const uint8_t * string |
275 * @param start int32_t starting string offset | 274 * @param start int32_t starting string offset |
276 * @param i int32_t string offset, must be start<=i<length | 275 * @param i int32_t string offset, must be start<=i<length |
277 * @param length int32_t string length | 276 * @param length int32_t string length |
278 * @param c output UChar32 variable, set to U+FFFD in case of an error | 277 * @param c output UChar32 variable, set to U+FFFD in case of an error |
279 * @see U8_GET | 278 * @see U8_GET |
280 * @draft ICU 51 | 279 * @stable ICU 51 |
281 */ | 280 */ |
282 #define U8_GET_OR_FFFD(s, start, i, length, c) { \ | 281 #define U8_GET_OR_FFFD(s, start, i, length, c) { \ |
283 int32_t _u8_get_index=(i); \ | 282 int32_t _u8_get_index=(i); \ |
284 U8_SET_CP_START(s, start, _u8_get_index); \ | 283 U8_SET_CP_START(s, start, _u8_get_index); \ |
285 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ | 284 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ |
286 } | 285 } |
287 #endif /* U_HIDE_DRAFT_API */ | |
288 | 286 |
289 /* definitions with forward iteration --------------------------------------- */ | 287 /* definitions with forward iteration --------------------------------------- */ |
290 | 288 |
291 /** | 289 /** |
292 * Get a code point from a string at a code point boundary offset, | 290 * Get a code point from a string at a code point boundary offset, |
293 * and advance the offset to the next code point boundary. | 291 * and advance the offset to the next code point boundary. |
294 * (Post-incrementing forward iteration.) | 292 * (Post-incrementing forward iteration.) |
295 * "Unsafe" macro, assumes well-formed UTF-8. | 293 * "Unsafe" macro, assumes well-formed UTF-8. |
296 * | 294 * |
297 * The offset may point to the lead byte of a multi-byte sequence, | 295 * The offset may point to the lead byte of a multi-byte sequence, |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
361 ) { \ | 359 ) { \ |
362 (c)=(((c)&0x1f)<<6)|__t1; \ | 360 (c)=(((c)&0x1f)<<6)|__t1; \ |
363 ++(i); \ | 361 ++(i); \ |
364 } else { \ | 362 } else { \ |
365 /* function call for "complicated" and error cases */ \ | 363 /* function call for "complicated" and error cases */ \ |
366 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1)
; \ | 364 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1)
; \ |
367 } \ | 365 } \ |
368 } \ | 366 } \ |
369 } | 367 } |
370 | 368 |
371 #ifndef U_HIDE_DRAFT_API | |
372 /** | 369 /** |
373 * Get a code point from a string at a code point boundary offset, | 370 * Get a code point from a string at a code point boundary offset, |
374 * and advance the offset to the next code point boundary. | 371 * and advance the offset to the next code point boundary. |
375 * (Post-incrementing forward iteration.) | 372 * (Post-incrementing forward iteration.) |
376 * "Safe" macro, checks for illegal sequences and for string boundaries. | 373 * "Safe" macro, checks for illegal sequences and for string boundaries. |
377 * | 374 * |
378 * The length can be negative for a NUL-terminated string. | 375 * The length can be negative for a NUL-terminated string. |
379 * | 376 * |
380 * The offset may point to the lead byte of a multi-byte sequence, | 377 * The offset may point to the lead byte of a multi-byte sequence, |
381 * in which case the macro will read the whole sequence. | 378 * in which case the macro will read the whole sequence. |
382 * If the offset points to a trail byte or an illegal UTF-8 sequence, then | 379 * If the offset points to a trail byte or an illegal UTF-8 sequence, then |
383 * c is set to U+FFFD. | 380 * c is set to U+FFFD. |
384 * | 381 * |
385 * This macro does not distinguish between a real U+FFFD in the text | 382 * This macro does not distinguish between a real U+FFFD in the text |
386 * and U+FFFD returned for an ill-formed sequence. | 383 * and U+FFFD returned for an ill-formed sequence. |
387 * Use U8_NEXT() if that distinction is important. | 384 * Use U8_NEXT() if that distinction is important. |
388 * | 385 * |
389 * @param s const uint8_t * string | 386 * @param s const uint8_t * string |
390 * @param i int32_t string offset, must be i<length | 387 * @param i int32_t string offset, must be i<length |
391 * @param length int32_t string length | 388 * @param length int32_t string length |
392 * @param c output UChar32 variable, set to U+FFFD in case of an error | 389 * @param c output UChar32 variable, set to U+FFFD in case of an error |
393 * @see U8_NEXT | 390 * @see U8_NEXT |
394 * @draft ICU 51 | 391 * @stable ICU 51 |
395 */ | 392 */ |
396 #define U8_NEXT_OR_FFFD(s, i, length, c) { \ | 393 #define U8_NEXT_OR_FFFD(s, i, length, c) { \ |
397 (c)=(uint8_t)(s)[(i)++]; \ | 394 (c)=(uint8_t)(s)[(i)++]; \ |
398 if((c)>=0x80) { \ | 395 if((c)>=0x80) { \ |
399 uint8_t __t1, __t2; \ | 396 uint8_t __t1, __t2; \ |
400 if( /* handle U+1000..U+CFFF inline */ \ | 397 if( /* handle U+1000..U+CFFF inline */ \ |
401 (0xe0<(c) && (c)<=0xec) && \ | 398 (0xe0<(c) && (c)<=0xec) && \ |
402 (((i)+1)<(length) || (length)<0) && \ | 399 (((i)+1)<(length) || (length)<0) && \ |
403 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ | 400 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ |
404 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ | 401 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ |
405 ) { \ | 402 ) { \ |
406 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ | 403 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ |
407 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ | 404 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ |
408 (i)+=2; \ | 405 (i)+=2; \ |
409 } else if( /* handle U+0080..U+07FF inline */ \ | 406 } else if( /* handle U+0080..U+07FF inline */ \ |
410 ((c)<0xe0 && (c)>=0xc2) && \ | 407 ((c)<0xe0 && (c)>=0xc2) && \ |
411 ((i)!=(length)) && \ | 408 ((i)!=(length)) && \ |
412 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ | 409 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ |
413 ) { \ | 410 ) { \ |
414 (c)=(((c)&0x1f)<<6)|__t1; \ | 411 (c)=(((c)&0x1f)<<6)|__t1; \ |
415 ++(i); \ | 412 ++(i); \ |
416 } else { \ | 413 } else { \ |
417 /* function call for "complicated" and error cases */ \ | 414 /* function call for "complicated" and error cases */ \ |
418 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3)
; \ | 415 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3)
; \ |
419 } \ | 416 } \ |
420 } \ | 417 } \ |
421 } | 418 } |
422 #endif /* U_HIDE_DRAFT_API */ | |
423 | 419 |
424 /** | 420 /** |
425 * Append a code point to a string, overwriting 1 to 4 bytes. | 421 * Append a code point to a string, overwriting 1 to 4 bytes. |
426 * The offset points to the current end of the string contents | 422 * The offset points to the current end of the string contents |
427 * and is advanced (post-increment). | 423 * and is advanced (post-increment). |
428 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. | 424 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. |
429 * Otherwise, the result is undefined. | 425 * Otherwise, the result is undefined. |
430 * | 426 * |
431 * @param s const uint8_t * string buffer | 427 * @param s const uint8_t * string buffer |
432 * @param i string offset | 428 * @param i string offset |
(...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
669 * @see U8_PREV_UNSAFE | 665 * @see U8_PREV_UNSAFE |
670 * @stable ICU 2.4 | 666 * @stable ICU 2.4 |
671 */ | 667 */ |
672 #define U8_PREV(s, start, i, c) { \ | 668 #define U8_PREV(s, start, i, c) { \ |
673 (c)=(uint8_t)(s)[--(i)]; \ | 669 (c)=(uint8_t)(s)[--(i)]; \ |
674 if((c)>=0x80) { \ | 670 if((c)>=0x80) { \ |
675 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ | 671 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ |
676 } \ | 672 } \ |
677 } | 673 } |
678 | 674 |
679 #ifndef U_HIDE_DRAFT_API | |
680 /** | 675 /** |
681 * Move the string offset from one code point boundary to the previous one | 676 * Move the string offset from one code point boundary to the previous one |
682 * and get the code point between them. | 677 * and get the code point between them. |
683 * (Pre-decrementing backward iteration.) | 678 * (Pre-decrementing backward iteration.) |
684 * "Safe" macro, checks for illegal sequences and for string boundaries. | 679 * "Safe" macro, checks for illegal sequences and for string boundaries. |
685 * | 680 * |
686 * The input offset may be the same as the string length. | 681 * The input offset may be the same as the string length. |
687 * If the offset is behind a multi-byte sequence, then the macro will read | 682 * If the offset is behind a multi-byte sequence, then the macro will read |
688 * the whole sequence. | 683 * the whole sequence. |
689 * If the offset is behind a lead byte, then that itself | 684 * If the offset is behind a lead byte, then that itself |
690 * will be returned as the code point. | 685 * will be returned as the code point. |
691 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. | 686 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. |
692 * | 687 * |
693 * This macro does not distinguish between a real U+FFFD in the text | 688 * This macro does not distinguish between a real U+FFFD in the text |
694 * and U+FFFD returned for an ill-formed sequence. | 689 * and U+FFFD returned for an ill-formed sequence. |
695 * Use U8_PREV() if that distinction is important. | 690 * Use U8_PREV() if that distinction is important. |
696 * | 691 * |
697 * @param s const uint8_t * string | 692 * @param s const uint8_t * string |
698 * @param start int32_t starting string offset (usually 0) | 693 * @param start int32_t starting string offset (usually 0) |
699 * @param i int32_t string offset, must be start<i | 694 * @param i int32_t string offset, must be start<i |
700 * @param c output UChar32 variable, set to U+FFFD in case of an error | 695 * @param c output UChar32 variable, set to U+FFFD in case of an error |
701 * @see U8_PREV | 696 * @see U8_PREV |
702 * @draft ICU 51 | 697 * @stable ICU 51 |
703 */ | 698 */ |
704 #define U8_PREV_OR_FFFD(s, start, i, c) { \ | 699 #define U8_PREV_OR_FFFD(s, start, i, c) { \ |
705 (c)=(uint8_t)(s)[--(i)]; \ | 700 (c)=(uint8_t)(s)[--(i)]; \ |
706 if((c)>=0x80) { \ | 701 if((c)>=0x80) { \ |
707 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ | 702 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ |
708 } \ | 703 } \ |
709 } | 704 } |
710 #endif /* U_HIDE_DRAFT_API */ | |
711 | 705 |
712 /** | 706 /** |
713 * Move the string offset from one code point boundary to the previous one. | 707 * Move the string offset from one code point boundary to the previous one. |
714 * (Pre-decrementing backward iteration.) | 708 * (Pre-decrementing backward iteration.) |
715 * The input offset may be the same as the string length. | 709 * The input offset may be the same as the string length. |
716 * "Unsafe" macro, assumes well-formed UTF-8. | 710 * "Unsafe" macro, assumes well-formed UTF-8. |
717 * | 711 * |
718 * @param s const uint8_t * string | 712 * @param s const uint8_t * string |
719 * @param i string offset | 713 * @param i string offset |
720 * @see U8_BACK_1 | 714 * @see U8_BACK_1 |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
821 * @stable ICU 2.4 | 815 * @stable ICU 2.4 |
822 */ | 816 */ |
823 #define U8_SET_CP_LIMIT(s, start, i, length) { \ | 817 #define U8_SET_CP_LIMIT(s, start, i, length) { \ |
824 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ | 818 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ |
825 U8_BACK_1(s, start, i); \ | 819 U8_BACK_1(s, start, i); \ |
826 U8_FWD_1(s, i, length); \ | 820 U8_FWD_1(s, i, length); \ |
827 } \ | 821 } \ |
828 } | 822 } |
829 | 823 |
830 #endif | 824 #endif |
OLD | NEW |