OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2013-2014, International Business Machines | 3 * Copyright (C) 2013-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationfastlatinbuilder.cpp | 6 * collationfastlatinbuilder.cpp |
7 * | 7 * |
8 * created on: 2013aug09 | 8 * created on: 2013aug09 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 | 12 #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 |
13 #if DEBUG_COLLATION_FAST_LATIN_BUILDER | 13 #if DEBUG_COLLATION_FAST_LATIN_BUILDER |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
129 UBool ok = !shortPrimaryOverflow && | 129 UBool ok = !shortPrimaryOverflow && |
130 encodeCharCEs(errorCode) && encodeContractions(errorCode); | 130 encodeCharCEs(errorCode) && encodeContractions(errorCode); |
131 contractionCEs.removeAllElements(); // might reduce heap memory usage | 131 contractionCEs.removeAllElements(); // might reduce heap memory usage |
132 uniqueCEs.removeAllElements(); | 132 uniqueCEs.removeAllElements(); |
133 return ok; | 133 return ok; |
134 } | 134 } |
135 | 135 |
// Side-by-side diff of loadGroups(): text left of the first '|' is the OLD
// revision, text right of it is the NEW revision.
//
// Purpose (both revisions): start the fast-Latin table header in `result`
// and record the primary-weight boundaries (digit group, Latin script, and
// the groups below digits) used by later encoding steps.
// Returns FALSE when errorCode already indicates failure or when the needed
// group data is unavailable; TRUE otherwise.
//
// OLD: appends a placeholder 0 at result[0], then walks data.scripts as
//   variable-length records (head at [i], group code at [i + 2], next record
//   at i + 2 + data.scripts[i + 1]). Each group seen before the digit group
//   appends its last primary byte (must be <= 0x7f, else FALSE without an
//   error code). At the digit group it sets firstDigitPrimary, fixes
//   headerLength to the current result length, and overwrites result[0] with
//   (VERSION << 8) | headerLength. At the Latin group it sets
//   firstLatinPrimary/lastLatinPrimary and stops. Running off the end of
//   data.scripts, or reaching Latin before digits, sets
//   U_INTERNAL_PROGRAM_ERROR.
//
// NEW: headerLength is fixed at 1 + NUM_SPECIAL_GROUPS and result[0] is
//   written immediately. For each special group it stores
//   data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) in
//   lastSpecialPrimaries[i] and appends a 0 placeholder slot to result.
//   A 0 primary from the data lookups is treated as "missing data" and
//   returns FALSE without setting errorCode.
136 UBool | 136 UBool |
137 CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &err
orCode) { | 137 CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &err
orCode) { |
138 if(U_FAILURE(errorCode)) { return FALSE; } | 138 if(U_FAILURE(errorCode)) { return FALSE; } |
139 result.append(0); // reserved for version & headerLength | 139 headerLength = 1 + NUM_SPECIAL_GROUPS; |
| 140 uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength; |
| 141 result.append((UChar)r0); |
140 // The first few reordering groups should be special groups | 142 // The first few reordering groups should be special groups |
141 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. | 143 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. |
142 for(int32_t i = 0;;) { | 144 for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) { |
143 if(i >= data.scriptsLength) { | 145 lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_
FIRST + i); |
144 // no Latn script | 146 if(lastSpecialPrimaries[i] == 0) { |
145 errorCode = U_INTERNAL_PROGRAM_ERROR; | 147 // missing data |
146 return FALSE; | 148 return FALSE; |
147 } | 149 } |
148 uint32_t head = data.scripts[i]; | 150 result.append(0); // reserve a slot for this group |
149 uint32_t lastByte = head & 0xff; // last primary byte in the group | 151 } |
150 int32_t group = data.scripts[i + 2]; | 152 |
151 if(group == UCOL_REORDER_CODE_DIGIT) { | 153 firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT); |
152 firstDigitPrimary = (head & 0xff00) << 16; | 154 firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN); |
153 headerLength = result.length(); | 155 lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN); |
// NOTE(review): the NEW column tests only firstDigitPrimary and
// firstLatinPrimary for 0; lastLatinPrimary is not checked here.
154 uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength; | 156 if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { |
155 result.setCharAt(0, (UChar)r0); | 157 // missing data |
156 } else if(group == USCRIPT_LATIN) { | 158 return FALSE; |
157 if(firstDigitPrimary == 0) { | |
158 // no digit group | |
159 errorCode = U_INTERNAL_PROGRAM_ERROR; | |
160 return FALSE; | |
161 } | |
162 firstLatinPrimary = (head & 0xff00) << 16; | |
163 lastLatinPrimary = (lastByte << 24) | 0xffffff; | |
164 break; | |
165 } else if(firstDigitPrimary == 0) { | |
166 // a group below digits | |
167 if(lastByte > 0x7f) { | |
168 // We only use 7 bits for the last byte of a below-digits group. | |
169 // This does not warrant an errorCode, but we do not build a fas
t Latin table. | |
170 return FALSE; | |
171 } | |
172 result.append((UChar)lastByte); | |
173 } | |
174 i = i + 2 + data.scripts[i + 1]; | |
175 } | 159 } |
176 return TRUE; | 160 return TRUE; |
177 } | 161 } |
178 | 162 |
// Side-by-side diff of inSameGroup(): text left of the first '|' is the OLD
// revision, text right of it is the NEW revision.
//
// Purpose (both revisions): report whether primaries p and q fall on the
// same side of every boundary the builder distinguishes, in three stages:
//   1. both or neither are >= firstShortPrimary;
//   2. both or neither are above the potentially-variable range
//      (OLD: >= firstDigitPrimary; NEW: > the last entry of
//      lastSpecialPrimaries);
//   3. otherwise, both lie at or below the same group boundary.
//
// OLD stage 3: reduces p and q to their first primary byte (>> 24) and
//   compares against the per-group last bytes stored in result[1..]; an
//   assertion against result[headerLength - 1] documents loop termination.
// NEW stage 3: compares the full 32-bit primaries against
//   lastSpecialPrimaries[0..]. The loop has no explicit bound; it terminates
//   because stage 2 only falls through when p <= lastVariablePrimary, which
//   is the final element of lastSpecialPrimaries.
179 UBool | 163 UBool |
180 CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const { | 164 CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const { |
181 // Both or neither need to be encoded as short primaries, | 165 // Both or neither need to be encoded as short primaries, |
182 // so that we can test only one and use the same bit mask. | 166 // so that we can test only one and use the same bit mask. |
183 if(p >= firstShortPrimary) { | 167 if(p >= firstShortPrimary) { |
184 return q >= firstShortPrimary; | 168 return q >= firstShortPrimary; |
185 } else if(q >= firstShortPrimary) { | 169 } else if(q >= firstShortPrimary) { |
186 return FALSE; | 170 return FALSE; |
187 } | 171 } |
188 // Both or neither must be potentially-variable, | 172 // Both or neither must be potentially-variable, |
189 // so that we can test only one and determine if both are variable. | 173 // so that we can test only one and determine if both are variable. |
190 if(p >= firstDigitPrimary) { | 174 uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1]; |
191 return q >= firstDigitPrimary; | 175 if(p > lastVariablePrimary) { |
192 } else if(q >= firstDigitPrimary) { | 176 return q > lastVariablePrimary; |
| 177 } else if(q > lastVariablePrimary) { |
193 return FALSE; | 178 return FALSE; |
194 } | 179 } |
195 // Both will be encoded with long mini primaries. | 180 // Both will be encoded with long mini primaries. |
196 // They must be in the same special reordering group, | 181 // They must be in the same special reordering group, |
197 // so that we can test only one and determine if both are variable. | 182 // so that we can test only one and determine if both are variable. |
198 p >>= 24; // first primary byte | |
199 q >>= 24; | |
200 U_ASSERT(p != 0 && q != 0); | 183 U_ASSERT(p != 0 && q != 0); |
201 U_ASSERT(p <= result[headerLength - 1]); // the loop will terminate | 184 for(int32_t i = 0;; ++i) { // will terminate |
202 for(int32_t i = 1;; ++i) { | 185 uint32_t lastPrimary = lastSpecialPrimaries[i]; |
203 uint32_t lastByte = result[i]; | 186 if(p <= lastPrimary) { |
204 if(p <= lastByte) { | 187 return q <= lastPrimary; |
205 return q <= lastByte; | 188 } else if(q <= lastPrimary) { |
206 } else if(q <= lastByte) { | |
207 return FALSE; | 189 return FALSE; |
208 } | 190 } |
209 } | 191 } |
210 } | 192 } |
211 | 193 |
212 void | 194 void |
213 CollationFastLatinBuilder::resetCEs() { | 195 CollationFastLatinBuilder::resetCEs() { |
214 contractionCEs.removeAllElements(); | 196 contractionCEs.removeAllElements(); |
215 uniqueCEs.removeAllElements(); | 197 uniqueCEs.removeAllElements(); |
216 shortPrimaryOverflow = FALSE; | 198 shortPrimaryOverflow = FALSE; |
(...skipping 227 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
444 | 426 |
445 UBool | 427 UBool |
446 CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) { | 428 CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) { |
447 if(U_FAILURE(errorCode)) { return FALSE; } | 429 if(U_FAILURE(errorCode)) { return FALSE; } |
448 uprv_free(miniCEs); | 430 uprv_free(miniCEs); |
449 miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2); | 431 miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2); |
450 if(miniCEs == NULL) { | 432 if(miniCEs == NULL) { |
451 errorCode = U_MEMORY_ALLOCATION_ERROR; | 433 errorCode = U_MEMORY_ALLOCATION_ERROR; |
452 return FALSE; | 434 return FALSE; |
453 } | 435 } |
454 int32_t group = 1; | 436 int32_t group = 0; |
455 uint32_t lastGroupByte = result[group]; | 437 uint32_t lastGroupPrimary = lastSpecialPrimaries[group]; |
456 // The lowest unique CE must be at least a secondary CE. | 438 // The lowest unique CE must be at least a secondary CE. |
457 U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0); | 439 U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0); |
458 uint32_t prevPrimary = 0; | 440 uint32_t prevPrimary = 0; |
459 uint32_t prevSecondary = 0; | 441 uint32_t prevSecondary = 0; |
460 uint32_t pri = 0; | 442 uint32_t pri = 0; |
461 uint32_t sec = 0; | 443 uint32_t sec = 0; |
462 uint32_t ter = CollationFastLatin::COMMON_TER; | 444 uint32_t ter = CollationFastLatin::COMMON_TER; |
463 for(int32_t i = 0; i < uniqueCEs.size(); ++i) { | 445 for(int32_t i = 0; i < uniqueCEs.size(); ++i) { |
464 int64_t ce = uniqueCEs.elementAti(i); | 446 int64_t ce = uniqueCEs.elementAti(i); |
465 // Note: At least one of the p/s/t weights changes from one unique CE to
the next. | 447 // Note: At least one of the p/s/t weights changes from one unique CE to
the next. |
466 // (uniqueCEs does not store case bits.) | 448 // (uniqueCEs does not store case bits.) |
467 uint32_t p = (uint32_t)(ce >> 32); | 449 uint32_t p = (uint32_t)(ce >> 32); |
468 if(p != prevPrimary) { | 450 if(p != prevPrimary) { |
469 uint32_t p1 = p >> 24; | 451 while(p > lastGroupPrimary) { |
470 while(p1 > lastGroupByte) { | |
471 U_ASSERT(pri <= CollationFastLatin::MAX_LONG); | 452 U_ASSERT(pri <= CollationFastLatin::MAX_LONG); |
472 // Add the last "long primary" in or before the group | 453 // Set the group's header entry to the |
473 // into the upper 9 bits of the group entry. | 454 // last "long primary" in or before the group. |
474 result.setCharAt(group, (UChar)((pri << 4) | lastGroupByte)); | 455 result.setCharAt(1 + group, (UChar)pri); |
475 if(++group < headerLength) { // group is 1-based | 456 if(++group < NUM_SPECIAL_GROUPS) { |
476 lastGroupByte = result[group]; | 457 lastGroupPrimary = lastSpecialPrimaries[group]; |
477 } else { | 458 } else { |
478 lastGroupByte = 0xff; | 459 lastGroupPrimary = 0xffffffff; |
479 break; | 460 break; |
480 } | 461 } |
481 } | 462 } |
482 if(p < firstShortPrimary) { | 463 if(p < firstShortPrimary) { |
483 if(pri == 0) { | 464 if(pri == 0) { |
484 pri = CollationFastLatin::MIN_LONG; | 465 pri = CollationFastLatin::MIN_LONG; |
485 } else if(pri < CollationFastLatin::MAX_LONG) { | 466 } else if(pri < CollationFastLatin::MAX_LONG) { |
486 pri += CollationFastLatin::LONG_INC; | 467 pri += CollationFastLatin::LONG_INC; |
487 } else { | 468 } else { |
488 #if DEBUG_COLLATION_FAST_LATIN_BUILDER | 469 #if DEBUG_COLLATION_FAST_LATIN_BUILDER |
(...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
725 // Secondary CE, or a CE with a short primary, copy the case bits. | 706 // Secondary CE, or a CE with a short primary, copy the case bits. |
726 case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE; | 707 case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE; |
727 miniCE1 |= case1; | 708 miniCE1 |= case1; |
728 } | 709 } |
729 return (miniCE << 16) | miniCE1; | 710 return (miniCE << 16) | miniCE1; |
730 } | 711 } |
731 | 712 |
732 U_NAMESPACE_END | 713 U_NAMESPACE_END |
733 | 714 |
734 #endif // !UCONFIG_NO_COLLATION | 715 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |