OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * Copyright (C) 2012-2014, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ******************************************************************************* | |
6 * collationbasedatabuilder.cpp | |
7 * | |
8 * created on: 2012aug11 | |
9 * created by: Markus W. Scherer | |
10 */ | |
11 | |
12 #include "unicode/utypes.h" | |
13 | |
14 #if !UCONFIG_NO_COLLATION | |
15 | |
16 #include "unicode/localpointer.h" | |
17 #include "unicode/ucharstriebuilder.h" | |
18 #include "unicode/uniset.h" | |
19 #include "unicode/unistr.h" | |
20 #include "unicode/utf16.h" | |
21 #include "collation.h" | |
22 #include "collationbasedatabuilder.h" | |
23 #include "collationdata.h" | |
24 #include "collationdatabuilder.h" | |
25 #include "collationrootelements.h" | |
26 #include "normalizer2impl.h" | |
27 #include "uassert.h" | |
28 #include "utrie2.h" | |
29 #include "uvectr32.h" | |
30 #include "uvectr64.h" | |
31 #include "uvector.h" | |
32 | |
33 U_NAMESPACE_BEGIN | |
34 | |
35 namespace { | |
36 | |
37 /** | |
38 * Compare two signed int64_t values as if they were unsigned. | |
39 */ | |
40 int32_t | |
41 compareInt64AsUnsigned(int64_t a, int64_t b) { | |
42 if((uint64_t)a < (uint64_t)b) { | |
43 return -1; | |
44 } else if((uint64_t)a > (uint64_t)b) { | |
45 return 1; | |
46 } else { | |
47 return 0; | |
48 } | |
49 } | |
50 | |
51 // TODO: Try to merge this with the binarySearch in alphaindex.cpp. | |
52 /** | |
53 * Like Java Collections.binarySearch(List, String, Comparator). | |
54 * | |
55 * @return the index>=0 where the item was found, | |
56 * or the index<0 for inserting the string at ~index in sorted order | |
57 */ | |
58 int32_t | |
59 binarySearch(const UVector64 &list, int64_t ce) { | |
60 if (list.size() == 0) { return ~0; } | |
61 int32_t start = 0; | |
62 int32_t limit = list.size(); | |
63 for (;;) { | |
64 int32_t i = (start + limit) / 2; | |
65 int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i)); | |
66 if (cmp == 0) { | |
67 return i; | |
68 } else if (cmp < 0) { | |
69 if (i == start) { | |
70 return ~start; // insert ce before i | |
71 } | |
72 limit = i; | |
73 } else { | |
74 if (i == start) { | |
75 return ~(start + 1); // insert ce after i | |
76 } | |
77 start = i; | |
78 } | |
79 } | |
80 } | |
81 | |
82 } // namespace | |
83 | |
84 CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode) | |
85 : CollationDataBuilder(errorCode), | |
86 numericPrimary(0x12000000), | |
87 firstHanPrimary(0), lastHanPrimary(0), hanStep(2), | |
88 rootElements(errorCode) { | |
89 } | |
90 | |
91 CollationBaseDataBuilder::~CollationBaseDataBuilder() { | |
92 } | |
93 | |
94 void | |
95 CollationBaseDataBuilder::init(UErrorCode &errorCode) { | |
96 if(U_FAILURE(errorCode)) { return; } | |
97 if(trie != NULL) { | |
98 errorCode = U_INVALID_STATE_ERROR; | |
99 return; | |
100 } | |
101 | |
102 // Not compressible: | |
103 // - digits | |
104 // - Latin | |
105 // - Hani | |
106 // - trail weights | |
107 // Some scripts are compressible, some are not. | |
108 uprv_memset(compressibleBytes, FALSE, 256); | |
109 compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE; | |
110 | |
111 // For a base, the default is to compute an unassigned-character implicit CE
. | |
112 // This includes surrogate code points; see the last option in | |
113 // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences. | |
114 trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorC
ode); | |
115 | |
116 // Preallocate trie blocks for Latin in the hope that proximity helps with C
PU caches. | |
117 for(UChar32 c = 0; c < 0x180; ++c) { | |
118 utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode); | |
119 } | |
120 | |
121 utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode); | |
122 // No root element for the merge separator which has 02 weights. | |
123 // Some code assumes that the root first primary CE is the "space first prim
ary" | |
124 // from FractionalUCA.txt. | |
125 | |
126 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_T
AG, 0); | |
127 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32,
TRUE, &errorCode); | |
128 | |
129 // Add a mapping for the first-unassigned boundary, | |
130 // which is the AlphabeticIndex overflow boundary. | |
131 UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with
U+FDD1. | |
132 s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0. | |
133 int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY); | |
134 add(UnicodeString(), s, &ce, 1, errorCode); | |
135 | |
136 // Add a tailoring boundary, but not a mapping, for [first trailing]. | |
137 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); | |
138 rootElements.addElement(ce, errorCode); | |
139 | |
140 // U+FFFD maps to a CE with the third-highest primary weight, | |
141 // for predictable handling of ill-formed UTF-8. | |
142 uint32_t ce32 = Collation::FFFD_CE32; | |
143 utrie2_set32(trie, 0xfffd, ce32, &errorCode); | |
144 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); | |
145 | |
146 // U+FFFF maps to a CE with the highest primary weight. | |
147 ce32 = Collation::MAX_REGULAR_CE32; | |
148 utrie2_set32(trie, 0xffff, ce32, &errorCode); | |
149 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); | |
150 } | |
151 | |
152 void | |
153 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length, | |
154 UErrorCode &errorCode) { | |
155 if(U_FAILURE(errorCode) || length == 0) { return; } | |
156 if((length & 1) != 0) { // incomplete start/end pairs | |
157 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
158 return; | |
159 } | |
160 if(isAssigned(0x4e00)) { // already set | |
161 errorCode = U_INVALID_STATE_ERROR; | |
162 return; | |
163 } | |
164 int32_t numHanCodePoints = 0; | |
165 for(int32_t i = 0; i < length; i += 2) { | |
166 UChar32 start = ranges[i]; | |
167 UChar32 end = ranges[i + 1]; | |
168 numHanCodePoints += end - start + 1; | |
169 } | |
170 // Multiply the number of code points by (gap+1). | |
171 // Add hanStep+2 for tailoring after the last Han character. | |
172 int32_t gap = 1; | |
173 hanStep = gap + 1; | |
174 int32_t numHan = numHanCodePoints * hanStep + hanStep + 2; | |
175 // Numbers of Han primaries per lead byte determined by | |
176 // numbers of 2nd (not compressible) times 3rd primary byte values. | |
177 int32_t numHanPerLeadByte = 254 * 254; | |
178 int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadBy
te; | |
179 uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHa
nLeadBytes) << 24; | |
180 hanPrimary |= 0x20200; | |
181 firstHanPrimary = hanPrimary; | |
182 for(int32_t i = 0; i < length; i += 2) { | |
183 UChar32 start = ranges[i]; | |
184 UChar32 end = ranges[i + 1]; | |
185 hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanSte
p, errorCode); | |
186 } | |
187 // One past the actual last one, but that is harmless for tailoring. | |
188 // It saves us from subtracting "hanStep" and handling underflows. | |
189 lastHanPrimary = hanPrimary; | |
190 } | |
191 | |
192 UBool | |
193 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const { | |
194 return compressibleBytes[b]; | |
195 } | |
196 | |
197 void | |
198 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) { | |
199 compressibleBytes[b] = TRUE; | |
200 } | |
201 | |
202 int32_t | |
203 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool i
sCompressible) { | |
204 if((p1 & 0xff000000) == (p2 & 0xff000000)) { | |
205 // Same lead bytes. | |
206 return (int32_t)(p2 - p1) >> 16; | |
207 } else { | |
208 int32_t linear1; | |
209 int32_t linear2; | |
210 int32_t factor; | |
211 if(isCompressible) { | |
212 // Second byte for compressible lead byte: 251 bytes 04..FE | |
213 linear1 = (int32_t)((p1 >> 16) & 0xff) - 4; | |
214 linear2 = (int32_t)((p2 >> 16) & 0xff) - 4; | |
215 factor = 251; | |
216 } else { | |
217 // Second byte for incompressible lead byte: 254 bytes 02..FF | |
218 linear1 = (int32_t)((p1 >> 16) & 0xff) - 2; | |
219 linear2 = (int32_t)((p2 >> 16) & 0xff) - 2; | |
220 factor = 254; | |
221 } | |
222 linear1 += factor * (int32_t)((p1 >> 24) & 0xff); | |
223 linear2 += factor * (int32_t)((p2 >> 24) & 0xff); | |
224 return linear2 - linear1; | |
225 } | |
226 } | |
227 | |
228 int32_t | |
229 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool
isCompressible) { | |
230 if((p1 & 0xffff0000) == (p2 & 0xffff0000)) { | |
231 // Same first two bytes. | |
232 return (int32_t)(p2 - p1) >> 8; | |
233 } else { | |
234 // Third byte: 254 bytes 02..FF | |
235 int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2; | |
236 int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2; | |
237 int32_t factor; | |
238 if(isCompressible) { | |
239 // Second byte for compressible lead byte: 251 bytes 04..FE | |
240 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4); | |
241 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4); | |
242 factor = 251 * 254; | |
243 } else { | |
244 // Second byte for incompressible lead byte: 254 bytes 02..FF | |
245 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2); | |
246 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2); | |
247 factor = 254 * 254; | |
248 } | |
249 linear1 += factor * (int32_t)((p1 >> 24) & 0xff); | |
250 linear2 += factor * (int32_t)((p2 >> 24) & 0xff); | |
251 return linear2 - linear1; | |
252 } | |
253 } | |
254 | |
255 uint32_t | |
256 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErr
orCode &errorCode) { | |
257 addRootElements(ces, cesLength, errorCode); | |
258 return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode); | |
259 } | |
260 | |
261 void | |
262 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength
, | |
263 UErrorCode &errorCode) { | |
264 if(U_FAILURE(errorCode)) { return; } | |
265 for(int32_t i = 0; i < cesLength; ++i) { | |
266 addRootElement(ces[i], errorCode); | |
267 } | |
268 } | |
269 | |
270 void | |
271 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) { | |
272 if(U_FAILURE(errorCode) || ce == 0) { return; } | |
273 // Remove case bits. | |
274 ce &= INT64_C(0xffffffffffff3fff); | |
275 U_ASSERT((ce & 0xc0) == 0); // quaternary==0 | |
276 // Ignore the CE if it has a Han primary weight and common secondary/tertiar
y weights. | |
277 // We will add it later, as part of the Han ranges. | |
278 uint32_t p = (uint32_t)(ce >> 32); | |
279 uint32_t secTer = (uint32_t)ce; | |
280 if(secTer == Collation::COMMON_SEC_AND_TER_CE) { | |
281 if(firstHanPrimary <= p && p <= lastHanPrimary) { | |
282 return; | |
283 } | |
284 } else { | |
285 // Check that secondary and tertiary weights are >= "common". | |
286 uint32_t s = secTer >> 16; | |
287 uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK; | |
288 if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collatio
n::COMMON_WEIGHT16)) { | |
289 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
290 return; | |
291 } | |
292 } | |
293 // Check that primaries have at most 3 bytes. | |
294 if((p & 0xff) != 0) { | |
295 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
296 return; | |
297 } | |
298 int32_t i = binarySearch(rootElements, ce); | |
299 if(i < 0) { | |
300 rootElements.insertElementAt(ce, ~i, errorCode); | |
301 } | |
302 } | |
303 | |
304 void | |
305 CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastBy
te, | |
306 const UnicodeString &groupScripts, | |
307 UErrorCode &errorCode) { | |
308 if(U_FAILURE(errorCode)) { return; } | |
309 if(groupScripts.isEmpty()) { | |
310 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
311 return; | |
312 } | |
313 if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) { | |
314 // Zzzz must not occur. | |
315 // It is the code used in the API to separate low and high scripts. | |
316 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
317 return; | |
318 } | |
319 // Note: We are mostly trusting the input data, | |
320 // rather than verifying that reordering groups do not intersect | |
321 // with their lead byte ranges nor their sets of scripts, | |
322 // and that all script codes are valid. | |
323 scripts.append((UChar)((firstByte << 8) | lastByte)); | |
324 scripts.append((UChar)groupScripts.length()); | |
325 scripts.append(groupScripts); | |
326 } | |
327 | |
328 void | |
329 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) { | |
330 buildMappings(data, errorCode); | |
331 data.numericPrimary = numericPrimary; | |
332 data.compressibleBytes = compressibleBytes; | |
333 data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer()); | |
334 data.scriptsLength = scripts.length(); | |
335 buildFastLatinTable(data, errorCode); | |
336 } | |
337 | |
338 void | |
339 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
rrorCode) { | |
340 if(U_FAILURE(errorCode)) { return; } | |
341 uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the l
ast Han range. | |
342 uint32_t prevPrimary = 0; // Start with primary ignorable CEs. | |
343 UBool tryRange = FALSE; | |
344 for(int32_t i = 0; i < rootElements.size(); ++i) { | |
345 int64_t ce = rootElements.elementAti(i); | |
346 uint32_t p = (uint32_t)(ce >> 32); | |
347 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; | |
348 if(p != prevPrimary) { | |
349 U_ASSERT((p & 0xff) == 0); | |
350 int32_t end; | |
351 if(p >= nextHanPrimary) { | |
352 // Add a Han primary weight or range. | |
353 // We omitted them initially, and omitted all CEs with Han prima
ries | |
354 // and common secondary/tertiary weights. | |
355 U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_A
ND_TER_CE); | |
356 if(p == nextHanPrimary) { | |
357 // One single Han primary with non-common secondary/tertiary
weights. | |
358 table.addElement((int32_t)p, errorCode); | |
359 if(p < lastHanPrimary) { | |
360 // Prepare for the next Han range. | |
361 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(
p, FALSE, hanStep); | |
362 } else { | |
363 // p is the last Han primary. | |
364 nextHanPrimary = 0xffffffff; | |
365 } | |
366 } else { | |
367 // p > nextHanPrimary: Add a Han primary range, starting wit
h nextHanPrimary. | |
368 table.addElement((int32_t)nextHanPrimary, errorCode); | |
369 if(nextHanPrimary == lastHanPrimary) { | |
370 // nextHanPrimary == lastHanPrimary < p | |
371 // We just wrote the single last Han primary. | |
372 nextHanPrimary = 0xffffffff; | |
373 } else if(p < lastHanPrimary) { | |
374 // nextHanPrimary < p < lastHanPrimary | |
375 // End the Han range on p, prepare for the next range. | |
376 table.addElement((int32_t)p | hanStep, errorCode); | |
377 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(
p, FALSE, hanStep); | |
378 } else if(p == lastHanPrimary) { | |
379 // nextHanPrimary < p == lastHanPrimary | |
380 // End the last Han range on p. | |
381 table.addElement((int32_t)p | hanStep, errorCode); | |
382 nextHanPrimary = 0xffffffff; | |
383 } else { | |
384 // nextHanPrimary < lastHanPrimary < p | |
385 // End the last Han range, then write p. | |
386 table.addElement((int32_t)lastHanPrimary | hanStep, erro
rCode); | |
387 nextHanPrimary = 0xffffffff; | |
388 table.addElement((int32_t)p, errorCode); | |
389 } | |
390 } | |
391 } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE && | |
392 (end = writeRootElementsRange(prevPrimary, p, i + 1, table,
errorCode)) != 0) { | |
393 // Multiple CEs with only common secondary/tertiary weights were | |
394 // combined into a primary range. | |
395 // The range end was written, ending with the primary of rootEle
ments[end]. | |
396 ce = rootElements.elementAti(end); | |
397 p = (uint32_t)(ce >> 32); | |
398 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; | |
399 i = end; | |
400 } else { | |
401 // Write the primary weight of a normal CE. | |
402 table.addElement((int32_t)p, errorCode); | |
403 } | |
404 prevPrimary = p; | |
405 } | |
406 if(secTer == Collation::COMMON_SEC_AND_TER_CE) { | |
407 // The common secondar/tertiary weights are implied in the primary u
nit. | |
408 // If there is no intervening delta unit, then we will try to combin
e | |
409 // the next several primaries into a range. | |
410 tryRange = TRUE; | |
411 } else { | |
412 // For each new set of secondary/tertiary weights we write a delta u
nit. | |
413 table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DE
LTA_FLAG, errorCode); | |
414 tryRange = FALSE; | |
415 } | |
416 } | |
417 | |
418 // Limit sentinel for root elements. | |
419 // This allows us to reduce range checks at runtime. | |
420 table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode); | |
421 } | |
422 | |
423 int32_t | |
424 CollationBaseDataBuilder::writeRootElementsRange( | |
425 uint32_t prevPrimary, uint32_t p, int32_t i, | |
426 UVector32 &table, UErrorCode &errorCode) { | |
427 if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; } | |
428 U_ASSERT(prevPrimary < p); | |
429 // No ranges of single-byte primaries. | |
430 if((p & prevPrimary & 0xff0000) == 0) { return 0; } | |
431 // Lead bytes of compressible primaries must match. | |
432 UBool isCompressible = isCompressiblePrimary(p); | |
433 if((isCompressible || isCompressiblePrimary(prevPrimary)) && | |
434 (p & 0xff000000) != (prevPrimary & 0xff000000)) { | |
435 return 0; | |
436 } | |
437 // Number of bytes in the primaries. | |
438 UBool twoBytes; | |
439 // Number of primaries from prevPrimary to p. | |
440 int32_t step; | |
441 if((p & 0xff00) == 0) { | |
442 // 2-byte primary | |
443 if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch | |
444 twoBytes = TRUE; | |
445 step = diffTwoBytePrimaries(prevPrimary, p, isCompressible); | |
446 } else { | |
447 // 3-byte primary | |
448 if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch | |
449 twoBytes = FALSE; | |
450 step = diffThreeBytePrimaries(prevPrimary, p, isCompressible); | |
451 } | |
452 if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; } | |
453 // See if there are more than two CEs with primaries increasing by "step" | |
454 // and with only common secondary/tertiary weights on all but the last one. | |
455 int32_t end = 0; // Initially 0: No range for just two primaries. | |
456 for(;;) { | |
457 prevPrimary = p; | |
458 // Calculate which primary we expect next. | |
459 uint32_t nextPrimary; // = p + step | |
460 if(twoBytes) { | |
461 nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible
, step); | |
462 } else { | |
463 nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressib
le, step); | |
464 } | |
465 // Fetch the actual next CE. | |
466 int64_t ce = rootElements.elementAti(i); | |
467 p = (uint32_t)(ce >> 32); | |
468 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; | |
469 // Does this primary increase by "step" from the last one? | |
470 if(p != nextPrimary || | |
471 // Do not cross into a new lead byte if either is compressible. | |
472 ((p & 0xff000000) != (prevPrimary & 0xff000000) && | |
473 (isCompressible || isCompressiblePrimary(p)))) { | |
474 // The range ends with the previous CE. | |
475 p = prevPrimary; | |
476 break; | |
477 } | |
478 // Extend the range to include this primary. | |
479 end = i++; | |
480 // This primary is the last in the range if it has non-common weights | |
481 // or if we are at the end of the list. | |
482 if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size(
)) { break; } | |
483 } | |
484 if(end != 0) { | |
485 table.addElement((int32_t)p | step, errorCode); | |
486 } | |
487 return end; | |
488 } | |
489 | |
490 U_NAMESPACE_END | |
491 | |
492 #endif // !UCONFIG_NO_COLLATION | |
OLD | NEW |