source/i18n/collationfcd.h - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/collationfcd.h

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 * Copyright (C) 2012-2014, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 *******************************************************************************

	6 * collationfcd.h

	7 *

	8 * created on: 2012aug18

	9 * created by: Markus W. Scherer

	10 */

	11

	12 #ifndef __COLLATIONFCD_H__

	13 #define __COLLATIONFCD_H__

	14

	15 #include "unicode/utypes.h"

	16

	17 #if !UCONFIG_NO_COLLATION

	18

	19 #include "unicode/utf16.h"

	20

	21 U_NAMESPACE_BEGIN

	22

	23 /**

	24 * Data and functions for the FCD check fast path.

	25 *

	26 * The fast path looks at a pair of 16-bit code units and checks

	27 * whether there is an FCD boundary between them;

	28 * there is if the first unit has a trailing ccc=0 (!hasTccc(first))

	29 * or the second unit has a leading ccc=0 (!hasLccc(second)),

	30 * or both.

	31 * When the fast path finds a possible non-boundary,

	32 * then the FCD check slow path looks at the actual sequence of FCD values.

	33 *

	34 * This is a pure optimization.

	35 * The fast path must at least find all possible non-boundaries.

	36 * If the fast path is too pessimistic, it costs performance.

	37 *

	38 * For a pair of BMP characters, the fast path tests are precise (1 bit per character).

	39 *

	40 * For a supplementary code point, the two units are its lead and trail surrogates.

	41 * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points

	42 * has lccc!=0 or tccc!=0.

	43 * We set hasLccc(trail)=true for all trail surrogates.

	44 * As a result, we leave the fast path if the lead surrogate might start a

	45 * supplementary code point that is not FCD-inert.

	46 * (So the fast path need not detect that there is a surrogate pair,

	47 * nor look ahead to the next full code point.)

	48 *

	49 * hasLccc(lead)=true if any of its 1024 associated supplementary code points

	50 * has lccc!=0, for fast boundary checking between BMP & supplementary.

	51 *

	52 * hasTccc(trail)=false:

	53 * It should only be tested for unpaired trail surrogates which are FCD-inert.

	54 */

	55 class U_I18N_API CollationFCD {

	56 public:

	57 static inline UBool hasLccc(UChar32 c) {

	58 // assert c <= 0xffff

	59 // c can be negative, e.g., U_SENTINEL from UCharIterator;

	60 // that is handled in the first test.

	61 int32_t i;

	62 return

	63 // U+0300 is the first character with lccc!=0.

	64 c >= 0x300 &&

	65 (i = lcccIndex[c >> 5]) != 0 &&

	66 (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;

	67 }

	68

	69 static inline UBool hasTccc(UChar32 c) {

	70 // assert c <= 0xffff

	71 // c can be negative, e.g., U_SENTINEL from UCharIterator;

	72 // that is handled in the first test.

	73 int32_t i;

	74 return

	75 // U+00C0 is the first character with tccc!=0.

	76 c >= 0xc0 &&

	77 (i = tcccIndex[c >> 5]) != 0 &&

	78 (tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;

	79 }

	80

	81 static inline UBool mayHaveLccc(UChar32 c) {

	82 // Handles all of Unicode 0..10FFFF.

	83 // c can be negative, e.g., U_SENTINEL.

	84 // U+0300 is the first character with lccc!=0.

	85 if(c < 0x300) { return FALSE; }

	86 if(c > 0xffff) { c = U16_LEAD(c); }

	87 int32_t i;

	88 return

	89 (i = lcccIndex[c >> 5]) != 0 &&

	90 (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;

	91 }

	92

	93 /**

	94 * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)

	95 * must be decomposed before reaching the core collation code,

	96 * or else some sequences including them, even ones passing the FCD check,

	97 * do not yield canonically equivalent results.

	98 *

	99 * This is a fast and imprecise test.

	100 *

	101 * @param c a code point

	102 * @return TRUE if c is U+0F73, U+0F75 or U+0F81 or one of several other Tib etan characters

	103 */

	104 static inline UBool maybeTibetanCompositeVowel(UChar32 c) {

	105 return (c & 0x1fff01) == 0xf01;

	106 }

	107

	108 /**

	109 * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)

	110 * must be decomposed before reaching the core collation code,

	111 * or else some sequences including them, even ones passing the FCD check,

	112 * do not yield canonically equivalent results.

	113 *

	114 * They have distinct lccc/tccc combinations: 129/130 or 129/132.

	115 *

	116 * @param fcd16 the FCD value (lccc/tccc combination) of a code point

	117 * @return TRUE if fcd16 is from U+0F73, U+0F75 or U+0F81

	118 */

	119 static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {

	120 return fcd16 == 0x8182 \|\| fcd16 == 0x8184;

	121 }

	122

	123 private:

	124 CollationFCD(); // No instantiation.

	125

	126 static const uint8_t lcccIndex[2048];

	127 static const uint8_t tcccIndex[2048];

	128 static const uint32_t lcccBits[];

	129 static const uint32_t tcccBits[];

	130 };

	131

	132 U_NAMESPACE_END

	133

	134 #endif // !UCONFIG_NO_COLLATION

	135 #endif // __COLLATIONFCD_H__

OLD	NEW

« no previous file with comments | « source/i18n/collationfastlatinbuilder.cpp ('k') | source/i18n/collationfcd.cpp » ('j') | no next file with comments »