OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2012-2014, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * collationdatabuilder.h |
| 7 * |
| 8 * created on: 2012apr01 |
| 9 * created by: Markus W. Scherer |
| 10 */ |
| 11 |
| 12 #ifndef __COLLATIONDATABUILDER_H__ |
| 13 #define __COLLATIONDATABUILDER_H__ |
| 14 |
| 15 #include "unicode/utypes.h" |
| 16 |
| 17 #if !UCONFIG_NO_COLLATION |
| 18 |
| 19 #include "unicode/uniset.h" |
| 20 #include "unicode/unistr.h" |
| 21 #include "unicode/uversion.h" |
| 22 #include "collation.h" |
| 23 #include "collationdata.h" |
| 24 #include "collationsettings.h" |
| 25 #include "normalizer2impl.h" |
| 26 #include "utrie2.h" |
| 27 #include "uvectr32.h" |
| 28 #include "uvectr64.h" |
| 29 #include "uvector.h" |
| 30 |
| 31 U_NAMESPACE_BEGIN |
| 32 |
| 33 struct ConditionalCE32; |
| 34 |
| 35 class CollationFastLatinBuilder; |
| 36 class CopyHelper; |
| 37 class DataBuilderCollationIterator; |
| 38 class UCharsTrieBuilder; |
| 39 |
| 40 /** |
| 41 * Low-level CollationData builder. |
| 42 * Takes (character, CE) pairs and builds them into runtime data structures. |
| 43 * Supports characters with context prefixes and contraction suffixes. |
| 44 */ |
| 45 class U_I18N_API CollationDataBuilder : public UObject { |
| 46 public: |
| 47 /** |
| 48 * Collation element modifier. Interface class for a modifier |
| 49 * that changes a tailoring builder's temporary CEs to final CEs. |
| 50 * Called for every non-special CE32 and every expansion CE. |
| 51 */ |
| 52 class CEModifier : public UObject { |
| 53 public: |
| 54 virtual ~CEModifier(); |
| 55 /** Returns a new CE to replace the non-special input CE32, or else Coll
ation::NO_CE. */ |
| 56 virtual int64_t modifyCE32(uint32_t ce32) const = 0; |
| 57 /** Returns a new CE to replace the input CE, or else Collation::NO_CE.
*/ |
| 58 virtual int64_t modifyCE(int64_t ce) const = 0; |
| 59 }; |
| 60 |
| 61 CollationDataBuilder(UErrorCode &errorCode); |
| 62 |
| 63 virtual ~CollationDataBuilder(); |
| 64 |
| 65 void initForTailoring(const CollationData *b, UErrorCode &errorCode); |
| 66 |
| 67 virtual UBool isCompressibleLeadByte(uint32_t b) const; |
| 68 |
| 69 inline UBool isCompressiblePrimary(uint32_t p) const { |
| 70 return isCompressibleLeadByte(p >> 24); |
| 71 } |
| 72 |
| 73 /** |
| 74 * @return TRUE if this builder has mappings (e.g., add() has been called) |
| 75 */ |
| 76 UBool hasMappings() const { return modified; } |
| 77 |
| 78 /** |
| 79 * @return TRUE if c has CEs in this builder |
| 80 */ |
| 81 UBool isAssigned(UChar32 c) const; |
| 82 |
| 83 /** |
| 84 * @return the three-byte primary if c maps to a single such CE and has no c
ontext data, |
| 85 * otherwise returns 0. |
| 86 */ |
| 87 uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; |
| 88 |
| 89 /** |
| 90 * @return the single CE for c. |
| 91 * Sets an error code if c does not have a single CE. |
| 92 */ |
| 93 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; |
| 94 |
| 95 void add(const UnicodeString &prefix, const UnicodeString &s, |
| 96 const int64_t ces[], int32_t cesLength, |
| 97 UErrorCode &errorCode); |
| 98 |
| 99 /** |
| 100 * Encodes the ces as either the returned ce32 by itself, |
| 101 * or by storing an expansion, with the returned ce32 referring to that. |
| 102 * |
| 103 * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) |
| 104 */ |
| 105 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCod
e &errorCode); |
| 106 void addCE32(const UnicodeString &prefix, const UnicodeString &s, |
| 107 uint32_t ce32, UErrorCode &errorCode); |
| 108 |
| 109 /** |
| 110 * Sets three-byte-primary CEs for a range of code points in code point orde
r, |
| 111 * if it is worth doing; otherwise no change is made. |
| 112 * None of the code points in the range should have complex mappings so far |
| 113 * (expansions/contractions/prefixes). |
| 114 * @param start first code point |
| 115 * @param end last code point (inclusive) |
| 116 * @param primary primary weight for 'start' |
| 117 * @param step per-code point primary-weight increment |
| 118 * @param errorCode ICU in/out error code |
| 119 * @return TRUE if an OFFSET_TAG range was used for start..end |
| 120 */ |
| 121 UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, |
| 122 uint32_t primary, int32_t step, |
| 123 UErrorCode &errorCode); |
| 124 |
| 125 /** |
| 126 * Sets three-byte-primary CEs for a range of code points in code point orde
r. |
| 127 * Sets range values if that is worth doing, or else individual values. |
| 128 * None of the code points in the range should have complex mappings so far |
| 129 * (expansions/contractions/prefixes). |
| 130 * @param start first code point |
| 131 * @param end last code point (inclusive) |
| 132 * @param primary primary weight for 'start' |
| 133 * @param step per-code point primary-weight increment |
| 134 * @param errorCode ICU in/out error code |
| 135 * @return the next primary after 'end': start primary incremented by ((end-
start)+1)*step |
| 136 */ |
| 137 uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, |
| 138 uint32_t primary, int32_t step, |
| 139 UErrorCode &errorCode); |
| 140 |
| 141 /** |
| 142 * Copies all mappings from the src builder, with modifications. |
| 143 * This builder here must not be built yet, and should be empty. |
| 144 */ |
| 145 void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, |
| 146 UErrorCode &errorCode); |
| 147 |
| 148 void optimize(const UnicodeSet &set, UErrorCode &errorCode); |
| 149 void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); |
| 150 |
| 151 void enableFastLatin() { fastLatinEnabled = TRUE; } |
| 152 virtual void build(CollationData &data, UErrorCode &errorCode); |
| 153 |
| 154 /** |
| 155 * Looks up CEs for s and appends them to the ces array. |
| 156 * Does not handle normalization: s should be in FCD form. |
| 157 * |
| 158 * Does not write completely ignorable CEs. |
| 159 * Does not write beyond Collation::MAX_EXPANSION_LENGTH. |
| 160 * |
| 161 * @return incremented cesLength |
| 162 */ |
| 163 int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); |
| 164 int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, |
| 165 int64_t ces[], int32_t cesLength); |
| 166 |
| 167 protected: |
| 168 friend class CopyHelper; |
| 169 friend class DataBuilderCollationIterator; |
| 170 |
| 171 uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) con
st; |
| 172 |
| 173 int32_t addCE(int64_t ce, UErrorCode &errorCode); |
| 174 int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); |
| 175 int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErr
orCode &errorCode); |
| 176 |
| 177 inline ConditionalCE32 *getConditionalCE32(int32_t index) const { |
| 178 return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); |
| 179 } |
| 180 inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { |
| 181 return getConditionalCE32(Collation::indexFromCE32(ce32)); |
| 182 } |
| 183 |
| 184 static uint32_t makeBuilderContextCE32(int32_t index) { |
| 185 return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, i
ndex); |
| 186 } |
| 187 static inline UBool isBuilderContextCE32(uint32_t ce32) { |
| 188 return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); |
| 189 } |
| 190 |
| 191 static uint32_t encodeOneCEAsCE32(int64_t ce); |
| 192 uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); |
| 193 uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &er
rorCode); |
| 194 uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorC
ode &errorCode); |
| 195 |
| 196 uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErro
rCode &errorCode); |
| 197 /** |
| 198 * Copies base contractions to a list of ConditionalCE32. |
| 199 * Sets cond->next to the index of the first new item |
| 200 * and returns the index of the last new item. |
| 201 */ |
| 202 int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint
32_t ce32, |
| 203 ConditionalCE32 *cond, UErrorCode &erro
rCode); |
| 204 |
| 205 UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); |
| 206 void setDigitTags(UErrorCode &errorCode); |
| 207 void setLeadSurrogates(UErrorCode &errorCode); |
| 208 |
| 209 void buildMappings(CollationData &data, UErrorCode &errorCode); |
| 210 |
| 211 void clearContexts(); |
| 212 void buildContexts(UErrorCode &errorCode); |
| 213 uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); |
| 214 int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, |
| 215 UErrorCode &errorCode); |
| 216 |
| 217 void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); |
| 218 |
| 219 int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t
cesLength); |
| 220 |
| 221 static UChar32 jamoCpFromIndex(int32_t i) { |
| 222 // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 |
| 223 if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } |
| 224 i -= Hangul::JAMO_L_COUNT; |
| 225 if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } |
| 226 i -= Hangul::JAMO_V_COUNT; |
| 227 // i < 27 |
| 228 return Hangul::JAMO_T_BASE + 1 + i; |
| 229 } |
| 230 |
| 231 /** @see Collation::BUILDER_DATA_TAG */ |
| 232 static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; |
| 233 |
| 234 const Normalizer2Impl &nfcImpl; |
| 235 const CollationData *base; |
| 236 const CollationSettings *baseSettings; |
| 237 UTrie2 *trie; |
| 238 UVector32 ce32s; |
| 239 UVector64 ce64s; |
| 240 UVector conditionalCE32s; // vector of ConditionalCE32 |
| 241 // Characters that have context (prefixes or contraction suffixes). |
| 242 UnicodeSet contextChars; |
| 243 // Serialized UCharsTrie structures for finalized contexts. |
| 244 UnicodeString contexts; |
| 245 UnicodeSet unsafeBackwardSet; |
| 246 UBool modified; |
| 247 |
| 248 UBool fastLatinEnabled; |
| 249 CollationFastLatinBuilder *fastLatinBuilder; |
| 250 |
| 251 DataBuilderCollationIterator *collIter; |
| 252 }; |
| 253 |
| 254 U_NAMESPACE_END |
| 255 |
| 256 #endif // !UCONFIG_NO_COLLATION |
| 257 #endif // __COLLATIONDATABUILDER_H__ |
OLD | NEW |