source/common/unicode/utf8.h - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/common/unicode/utf8.h

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 *******************************************************************************	2 *******************************************************************************

3 *	3 *

4 * Copyright (C) 1999-2013, International Business Machines	4 * Copyright (C) 1999-2014, International Business Machines

5 * Corporation and others. All Rights Reserved.	5 * Corporation and others. All Rights Reserved.

6 *	6 *

7 *******************************************************************************	7 *******************************************************************************

8 * file name: utf8.h	8 * file name: utf8.h

9 * encoding: US-ASCII	9 * encoding: US-ASCII

10 * tab size: 8 (not used)	10 * tab size: 8 (not used)

11 * indentation:4	11 * indentation:4

12 *	12 *

13 * created on: 1999sep13	13 * created on: 1999sep13

14 * created by: Markus W. Scherer	14 * created by: Markus W. Scherer

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
71 * leadByte is evaluated multiple times.	71 * leadByte is evaluated multiple times.

72 *	72 *

73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:	73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:

74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])	74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])

75 * leadByte was evaluated exactly once.	75 * leadByte was evaluated exactly once.

76 *	76 *

77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.	77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.

78 * @internal	78 * @internal

79 */	79 */

80 #define U8_COUNT_TRAIL_BYTES(leadByte) \	80 #define U8_COUNT_TRAIL_BYTES(leadByte) \

81 ((leadByte)<0xf0 ? \	81 ((uint8_t)(leadByte)<0xf0 ? \

82 ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \	82 ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \

83 (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0)	83 (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)

84	84

85 /**	85 /**

86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.	86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.

87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.	87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.

88 * leadByte might be evaluated multiple times.	88 * leadByte might be evaluated multiple times.

89 *	89 *

90 * This is internal since it is not meant to be called directly by external clients;	90 * This is internal since it is not meant to be called directly by external clients;

91 * however it is called by public macros in this file and thus must remain stable.	91 * however it is called by public macros in this file and thus must remain stable.

92 *	92 *

93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.	93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.

(...skipping 152 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
246 * @param c output UChar32 variable, set to <0 in case of an error	246 * @param c output UChar32 variable, set to <0 in case of an error

247 * @see U8_GET_UNSAFE	247 * @see U8_GET_UNSAFE

248 * @stable ICU 2.4	248 * @stable ICU 2.4

249 */	249 */

250 #define U8_GET(s, start, i, length, c) { \	250 #define U8_GET(s, start, i, length, c) { \

251 int32_t _u8_get_index=(i); \	251 int32_t _u8_get_index=(i); \

252 U8_SET_CP_START(s, start, _u8_get_index); \	252 U8_SET_CP_START(s, start, _u8_get_index); \

253 U8_NEXT(s, _u8_get_index, length, c); \	253 U8_NEXT(s, _u8_get_index, length, c); \

254 }	254 }

255	255

256 #ifndef U_HIDE_DRAFT_API

257 /**	256 /**

258 * Get a code point from a string at a random-access offset,	257 * Get a code point from a string at a random-access offset,

259 * without changing the offset.	258 * without changing the offset.

260 * The offset may point to either the lead byte or one of the trail bytes	259 * The offset may point to either the lead byte or one of the trail bytes

261 * for a code point, in which case the macro will read all of the bytes	260 * for a code point, in which case the macro will read all of the bytes

262 * for the code point.	261 * for the code point.

263 *	262 *

264 * The length can be negative for a NUL-terminated string.	263 * The length can be negative for a NUL-terminated string.

265 *	264 *

266 * If the offset points to an illegal UTF-8 byte sequence, then	265 * If the offset points to an illegal UTF-8 byte sequence, then

267 * c is set to U+FFFD.	266 * c is set to U+FFFD.

268 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.	267 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.

269 *	268 *

270 * This macro does not distinguish between a real U+FFFD in the text	269 * This macro does not distinguish between a real U+FFFD in the text

271 * and U+FFFD returned for an ill-formed sequence.	270 * and U+FFFD returned for an ill-formed sequence.

272 * Use U8_GET() if that distinction is important.	271 * Use U8_GET() if that distinction is important.

273 *	272 *

274 * @param s const uint8_t * string	273 * @param s const uint8_t * string

275 * @param start int32_t starting string offset	274 * @param start int32_t starting string offset

276 * @param i int32_t string offset, must be start<=i<length	275 * @param i int32_t string offset, must be start<=i<length

277 * @param length int32_t string length	276 * @param length int32_t string length

278 * @param c output UChar32 variable, set to U+FFFD in case of an error	277 * @param c output UChar32 variable, set to U+FFFD in case of an error

279 * @see U8_GET	278 * @see U8_GET

280 * @draft ICU 51	279 * @stable ICU 51

281 */	280 */

282 #define U8_GET_OR_FFFD(s, start, i, length, c) { \	281 #define U8_GET_OR_FFFD(s, start, i, length, c) { \

283 int32_t _u8_get_index=(i); \	282 int32_t _u8_get_index=(i); \

284 U8_SET_CP_START(s, start, _u8_get_index); \	283 U8_SET_CP_START(s, start, _u8_get_index); \

285 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \	284 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \

286 }	285 }

287 #endif /* U_HIDE_DRAFT_API */

288	286

289 /* definitions with forward iteration --------------------------------------- */	287 /* definitions with forward iteration --------------------------------------- */

290	288

291 /**	289 /**

292 * Get a code point from a string at a code point boundary offset,	290 * Get a code point from a string at a code point boundary offset,

293 * and advance the offset to the next code point boundary.	291 * and advance the offset to the next code point boundary.

294 * (Post-incrementing forward iteration.)	292 * (Post-incrementing forward iteration.)

295 * "Unsafe" macro, assumes well-formed UTF-8.	293 * "Unsafe" macro, assumes well-formed UTF-8.

296 *	294 *

297 * The offset may point to the lead byte of a multi-byte sequence,	295 * The offset may point to the lead byte of a multi-byte sequence,

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
361 ) { \	359 ) { \

362 (c)=(((c)&0x1f)<<6)|__t1; \	360 (c)=(((c)&0x1f)<<6)|__t1; \

363 ++(i); \	361 ++(i); \

364 } else { \	362 } else { \

365 /* function call for "complicated" and error cases */ \	363 /* function call for "complicated" and error cases */ \

366 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \	364 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \

367 } \	365 } \

368 } \	366 } \

369 }	367 }

370	368

371 #ifndef U_HIDE_DRAFT_API

372 /**	369 /**

373 * Get a code point from a string at a code point boundary offset,	370 * Get a code point from a string at a code point boundary offset,

374 * and advance the offset to the next code point boundary.	371 * and advance the offset to the next code point boundary.

375 * (Post-incrementing forward iteration.)	372 * (Post-incrementing forward iteration.)

376 * "Safe" macro, checks for illegal sequences and for string boundaries.	373 * "Safe" macro, checks for illegal sequences and for string boundaries.

377 *	374 *

378 * The length can be negative for a NUL-terminated string.	375 * The length can be negative for a NUL-terminated string.

379 *	376 *

380 * The offset may point to the lead byte of a multi-byte sequence,	377 * The offset may point to the lead byte of a multi-byte sequence,

381 * in which case the macro will read the whole sequence.	378 * in which case the macro will read the whole sequence.

382 * If the offset points to a trail byte or an illegal UTF-8 sequence, then	379 * If the offset points to a trail byte or an illegal UTF-8 sequence, then

383 * c is set to U+FFFD.	380 * c is set to U+FFFD.

384 *	381 *

385 * This macro does not distinguish between a real U+FFFD in the text	382 * This macro does not distinguish between a real U+FFFD in the text

386 * and U+FFFD returned for an ill-formed sequence.	383 * and U+FFFD returned for an ill-formed sequence.

387 * Use U8_NEXT() if that distinction is important.	384 * Use U8_NEXT() if that distinction is important.

388 *	385 *

389 * @param s const uint8_t * string	386 * @param s const uint8_t * string

390 * @param i int32_t string offset, must be i<length	387 * @param i int32_t string offset, must be i<length

391 * @param length int32_t string length	388 * @param length int32_t string length

392 * @param c output UChar32 variable, set to U+FFFD in case of an error	389 * @param c output UChar32 variable, set to U+FFFD in case of an error

393 * @see U8_NEXT	390 * @see U8_NEXT

394 * @draft ICU 51	391 * @stable ICU 51

395 */	392 */

396 #define U8_NEXT_OR_FFFD(s, i, length, c) { \	393 #define U8_NEXT_OR_FFFD(s, i, length, c) { \

397 (c)=(uint8_t)(s)[(i)++]; \	394 (c)=(uint8_t)(s)[(i)++]; \

398 if((c)>=0x80) { \	395 if((c)>=0x80) { \

399 uint8_t __t1, __t2; \	396 uint8_t __t1, __t2; \

400 if( /* handle U+1000..U+CFFF inline */ \	397 if( /* handle U+1000..U+CFFF inline */ \

401 (0xe0<(c) && (c)<=0xec) && \	398 (0xe0<(c) && (c)<=0xec) && \

402 (((i)+1)<(length) || (length)<0) && \	399 (((i)+1)<(length) || (length)<0) && \

403 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \	400 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \

404 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \	401 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \

405 ) { \	402 ) { \

406 /* no need for (c&0xf) because the upper bits are truncated after << 12 in the cast to (UChar) */ \	403 /* no need for (c&0xf) because the upper bits are truncated after << 12 in the cast to (UChar) */ \

407 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \	404 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \

408 (i)+=2; \	405 (i)+=2; \

409 } else if( /* handle U+0080..U+07FF inline */ \	406 } else if( /* handle U+0080..U+07FF inline */ \

410 ((c)<0xe0 && (c)>=0xc2) && \	407 ((c)<0xe0 && (c)>=0xc2) && \

411 ((i)!=(length)) && \	408 ((i)!=(length)) && \

412 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \	409 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \

413 ) { \	410 ) { \

414 (c)=(((c)&0x1f)<<6)|__t1; \	411 (c)=(((c)&0x1f)<<6)|__t1; \

415 ++(i); \	412 ++(i); \

416 } else { \	413 } else { \

417 /* function call for "complicated" and error cases */ \	414 /* function call for "complicated" and error cases */ \

418 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \	415 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \

419 } \	416 } \

420 } \	417 } \

421 }	418 }

422 #endif /* U_HIDE_DRAFT_API */

423	419

424 /**	420 /**

425 * Append a code point to a string, overwriting 1 to 4 bytes.	421 * Append a code point to a string, overwriting 1 to 4 bytes.

426 * The offset points to the current end of the string contents	422 * The offset points to the current end of the string contents

427 * and is advanced (post-increment).	423 * and is advanced (post-increment).

428 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.	424 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.

429 * Otherwise, the result is undefined.	425 * Otherwise, the result is undefined.

430 *	426 *

431 * @param s const uint8_t * string buffer	427 * @param s const uint8_t * string buffer

432 * @param i string offset	428 * @param i string offset

(...skipping 236 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
669 * @see U8_PREV_UNSAFE	665 * @see U8_PREV_UNSAFE

670 * @stable ICU 2.4	666 * @stable ICU 2.4

671 */	667 */

672 #define U8_PREV(s, start, i, c) { \	668 #define U8_PREV(s, start, i, c) { \

673 (c)=(uint8_t)(s)[--(i)]; \	669 (c)=(uint8_t)(s)[--(i)]; \

674 if((c)>=0x80) { \	670 if((c)>=0x80) { \

675 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \	671 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \

676 } \	672 } \

677 }	673 }

678	674

679 #ifndef U_HIDE_DRAFT_API

680 /**	675 /**

681 * Move the string offset from one code point boundary to the previous one	676 * Move the string offset from one code point boundary to the previous one

682 * and get the code point between them.	677 * and get the code point between them.

683 * (Pre-decrementing backward iteration.)	678 * (Pre-decrementing backward iteration.)

684 * "Safe" macro, checks for illegal sequences and for string boundaries.	679 * "Safe" macro, checks for illegal sequences and for string boundaries.

685 *	680 *

686 * The input offset may be the same as the string length.	681 * The input offset may be the same as the string length.

687 * If the offset is behind a multi-byte sequence, then the macro will read	682 * If the offset is behind a multi-byte sequence, then the macro will read

688 * the whole sequence.	683 * the whole sequence.

689 * If the offset is behind a lead byte, then that itself	684 * If the offset is behind a lead byte, then that itself

690 * will be returned as the code point.	685 * will be returned as the code point.

691 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.	686 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.

692 *	687 *

693 * This macro does not distinguish between a real U+FFFD in the text	688 * This macro does not distinguish between a real U+FFFD in the text

694 * and U+FFFD returned for an ill-formed sequence.	689 * and U+FFFD returned for an ill-formed sequence.

695 * Use U8_PREV() if that distinction is important.	690 * Use U8_PREV() if that distinction is important.

696 *	691 *

697 * @param s const uint8_t * string	692 * @param s const uint8_t * string

698 * @param start int32_t starting string offset (usually 0)	693 * @param start int32_t starting string offset (usually 0)

699 * @param i int32_t string offset, must be start<i	694 * @param i int32_t string offset, must be start<i

700 * @param c output UChar32 variable, set to U+FFFD in case of an error	695 * @param c output UChar32 variable, set to U+FFFD in case of an error

701 * @see U8_PREV	696 * @see U8_PREV

702 * @draft ICU 51	697 * @stable ICU 51

703 */	698 */

704 #define U8_PREV_OR_FFFD(s, start, i, c) { \	699 #define U8_PREV_OR_FFFD(s, start, i, c) { \

705 (c)=(uint8_t)(s)[--(i)]; \	700 (c)=(uint8_t)(s)[--(i)]; \

706 if((c)>=0x80) { \	701 if((c)>=0x80) { \

707 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \	702 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \

708 } \	703 } \

709 }	704 }

710 #endif /* U_HIDE_DRAFT_API */

711	705

712 /**	706 /**

713 * Move the string offset from one code point boundary to the previous one.	707 * Move the string offset from one code point boundary to the previous one.

714 * (Pre-decrementing backward iteration.)	708 * (Pre-decrementing backward iteration.)

715 * The input offset may be the same as the string length.	709 * The input offset may be the same as the string length.

716 * "Unsafe" macro, assumes well-formed UTF-8.	710 * "Unsafe" macro, assumes well-formed UTF-8.

717 *	711 *

718 * @param s const uint8_t * string	712 * @param s const uint8_t * string

719 * @param i string offset	713 * @param i string offset

720 * @see U8_BACK_1	714 * @see U8_BACK_1

(...skipping 100 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
821 * @stable ICU 2.4	815 * @stable ICU 2.4

822 */	816 */

823 #define U8_SET_CP_LIMIT(s, start, i, length) { \	817 #define U8_SET_CP_LIMIT(s, start, i, length) { \

824 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \	818 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \

825 U8_BACK_1(s, start, i); \	819 U8_BACK_1(s, start, i); \

826 U8_FWD_1(s, i, length); \	820 U8_FWD_1(s, i, length); \

827 } \	821 } \

828 }	822 }

829	823

830 #endif	824 #endif

OLD	NEW

« no previous file with comments | « source/common/unicode/ustring.h ('k') | source/common/unicode/utypes.h » ('j') | no next file with comments »