Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(210)

Side by Side Diff: source/i18n/collationbasedatabuilder.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/collationbasedatabuilder.h ('k') | source/i18n/collationbuilder.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationbasedatabuilder.cpp
7 *
8 * created on: 2012aug11
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/localpointer.h"
17 #include "unicode/ucharstriebuilder.h"
18 #include "unicode/uniset.h"
19 #include "unicode/unistr.h"
20 #include "unicode/utf16.h"
21 #include "collation.h"
22 #include "collationbasedatabuilder.h"
23 #include "collationdata.h"
24 #include "collationdatabuilder.h"
25 #include "collationrootelements.h"
26 #include "normalizer2impl.h"
27 #include "uassert.h"
28 #include "utrie2.h"
29 #include "uvectr32.h"
30 #include "uvectr64.h"
31 #include "uvector.h"
32
33 U_NAMESPACE_BEGIN
34
35 namespace {
36
37 /**
38 * Compare two signed int64_t values as if they were unsigned.
39 */
40 int32_t
41 compareInt64AsUnsigned(int64_t a, int64_t b) {
42 if((uint64_t)a < (uint64_t)b) {
43 return -1;
44 } else if((uint64_t)a > (uint64_t)b) {
45 return 1;
46 } else {
47 return 0;
48 }
49 }
50
51 // TODO: Try to merge this with the binarySearch in alphaindex.cpp.
52 /**
53 * Like Java Collections.binarySearch(List, String, Comparator).
54 *
55 * @return the index>=0 where the item was found,
56 * or the index<0 for inserting the string at ~index in sorted order
57 */
58 int32_t
59 binarySearch(const UVector64 &list, int64_t ce) {
60 if (list.size() == 0) { return ~0; }
61 int32_t start = 0;
62 int32_t limit = list.size();
63 for (;;) {
64 int32_t i = (start + limit) / 2;
65 int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
66 if (cmp == 0) {
67 return i;
68 } else if (cmp < 0) {
69 if (i == start) {
70 return ~start; // insert ce before i
71 }
72 limit = i;
73 } else {
74 if (i == start) {
75 return ~(start + 1); // insert ce after i
76 }
77 start = i;
78 }
79 }
80 }
81
82 } // namespace
83
84 CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
85 : CollationDataBuilder(errorCode),
86 numericPrimary(0x12000000),
87 firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
88 rootElements(errorCode) {
89 }
90
91 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
92 }
93
94 void
95 CollationBaseDataBuilder::init(UErrorCode &errorCode) {
96 if(U_FAILURE(errorCode)) { return; }
97 if(trie != NULL) {
98 errorCode = U_INVALID_STATE_ERROR;
99 return;
100 }
101
102 // Not compressible:
103 // - digits
104 // - Latin
105 // - Hani
106 // - trail weights
107 // Some scripts are compressible, some are not.
108 uprv_memset(compressibleBytes, FALSE, 256);
109 compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE;
110
111 // For a base, the default is to compute an unassigned-character implicit CE .
112 // This includes surrogate code points; see the last option in
113 // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
114 trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorC ode);
115
116 // Preallocate trie blocks for Latin in the hope that proximity helps with C PU caches.
117 for(UChar32 c = 0; c < 0x180; ++c) {
118 utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
119 }
120
121 utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
122 // No root element for the merge separator which has 02 weights.
123 // Some code assumes that the root first primary CE is the "space first prim ary"
124 // from FractionalUCA.txt.
125
126 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_T AG, 0);
127 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
128
129 // Add a mapping for the first-unassigned boundary,
130 // which is the AlphabeticIndex overflow boundary.
131 UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with U+FDD1.
132 s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0.
133 int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
134 add(UnicodeString(), s, &ce, 1, errorCode);
135
136 // Add a tailoring boundary, but not a mapping, for [first trailing].
137 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
138 rootElements.addElement(ce, errorCode);
139
140 // U+FFFD maps to a CE with the third-highest primary weight,
141 // for predictable handling of ill-formed UTF-8.
142 uint32_t ce32 = Collation::FFFD_CE32;
143 utrie2_set32(trie, 0xfffd, ce32, &errorCode);
144 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
145
146 // U+FFFF maps to a CE with the highest primary weight.
147 ce32 = Collation::MAX_REGULAR_CE32;
148 utrie2_set32(trie, 0xffff, ce32, &errorCode);
149 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
150 }
151
152 void
153 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
154 UErrorCode &errorCode) {
155 if(U_FAILURE(errorCode) || length == 0) { return; }
156 if((length & 1) != 0) { // incomplete start/end pairs
157 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
158 return;
159 }
160 if(isAssigned(0x4e00)) { // already set
161 errorCode = U_INVALID_STATE_ERROR;
162 return;
163 }
164 int32_t numHanCodePoints = 0;
165 for(int32_t i = 0; i < length; i += 2) {
166 UChar32 start = ranges[i];
167 UChar32 end = ranges[i + 1];
168 numHanCodePoints += end - start + 1;
169 }
170 // Multiply the number of code points by (gap+1).
171 // Add hanStep+2 for tailoring after the last Han character.
172 int32_t gap = 1;
173 hanStep = gap + 1;
174 int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
175 // Numbers of Han primaries per lead byte determined by
176 // numbers of 2nd (not compressible) times 3rd primary byte values.
177 int32_t numHanPerLeadByte = 254 * 254;
178 int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadBy te;
179 uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHa nLeadBytes) << 24;
180 hanPrimary |= 0x20200;
181 firstHanPrimary = hanPrimary;
182 for(int32_t i = 0; i < length; i += 2) {
183 UChar32 start = ranges[i];
184 UChar32 end = ranges[i + 1];
185 hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanSte p, errorCode);
186 }
187 // One past the actual last one, but that is harmless for tailoring.
188 // It saves us from subtracting "hanStep" and handling underflows.
189 lastHanPrimary = hanPrimary;
190 }
191
192 UBool
193 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
194 return compressibleBytes[b];
195 }
196
197 void
198 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
199 compressibleBytes[b] = TRUE;
200 }
201
202 int32_t
203 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool i sCompressible) {
204 if((p1 & 0xff000000) == (p2 & 0xff000000)) {
205 // Same lead bytes.
206 return (int32_t)(p2 - p1) >> 16;
207 } else {
208 int32_t linear1;
209 int32_t linear2;
210 int32_t factor;
211 if(isCompressible) {
212 // Second byte for compressible lead byte: 251 bytes 04..FE
213 linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
214 linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
215 factor = 251;
216 } else {
217 // Second byte for incompressible lead byte: 254 bytes 02..FF
218 linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
219 linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
220 factor = 254;
221 }
222 linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
223 linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
224 return linear2 - linear1;
225 }
226 }
227
228 int32_t
229 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
230 if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
231 // Same first two bytes.
232 return (int32_t)(p2 - p1) >> 8;
233 } else {
234 // Third byte: 254 bytes 02..FF
235 int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
236 int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
237 int32_t factor;
238 if(isCompressible) {
239 // Second byte for compressible lead byte: 251 bytes 04..FE
240 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
241 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
242 factor = 251 * 254;
243 } else {
244 // Second byte for incompressible lead byte: 254 bytes 02..FF
245 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
246 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
247 factor = 254 * 254;
248 }
249 linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
250 linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
251 return linear2 - linear1;
252 }
253 }
254
255 uint32_t
256 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErr orCode &errorCode) {
257 addRootElements(ces, cesLength, errorCode);
258 return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
259 }
260
261 void
262 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength ,
263 UErrorCode &errorCode) {
264 if(U_FAILURE(errorCode)) { return; }
265 for(int32_t i = 0; i < cesLength; ++i) {
266 addRootElement(ces[i], errorCode);
267 }
268 }
269
270 void
271 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
272 if(U_FAILURE(errorCode) || ce == 0) { return; }
273 // Remove case bits.
274 ce &= INT64_C(0xffffffffffff3fff);
275 U_ASSERT((ce & 0xc0) == 0); // quaternary==0
276 // Ignore the CE if it has a Han primary weight and common secondary/tertiar y weights.
277 // We will add it later, as part of the Han ranges.
278 uint32_t p = (uint32_t)(ce >> 32);
279 uint32_t secTer = (uint32_t)ce;
280 if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
281 if(firstHanPrimary <= p && p <= lastHanPrimary) {
282 return;
283 }
284 } else {
285 // Check that secondary and tertiary weights are >= "common".
286 uint32_t s = secTer >> 16;
287 uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
288 if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collatio n::COMMON_WEIGHT16)) {
289 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
290 return;
291 }
292 }
293 // Check that primaries have at most 3 bytes.
294 if((p & 0xff) != 0) {
295 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
296 return;
297 }
298 int32_t i = binarySearch(rootElements, ce);
299 if(i < 0) {
300 rootElements.insertElementAt(ce, ~i, errorCode);
301 }
302 }
303
304 void
305 CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastBy te,
306 const UnicodeString &groupScripts,
307 UErrorCode &errorCode) {
308 if(U_FAILURE(errorCode)) { return; }
309 if(groupScripts.isEmpty()) {
310 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
311 return;
312 }
313 if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
314 // Zzzz must not occur.
315 // It is the code used in the API to separate low and high scripts.
316 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
317 return;
318 }
319 // Note: We are mostly trusting the input data,
320 // rather than verifying that reordering groups do not intersect
321 // with their lead byte ranges nor their sets of scripts,
322 // and that all script codes are valid.
323 scripts.append((UChar)((firstByte << 8) | lastByte));
324 scripts.append((UChar)groupScripts.length());
325 scripts.append(groupScripts);
326 }
327
328 void
329 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
330 buildMappings(data, errorCode);
331 data.numericPrimary = numericPrimary;
332 data.compressibleBytes = compressibleBytes;
333 data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
334 data.scriptsLength = scripts.length();
335 buildFastLatinTable(data, errorCode);
336 }
337
338 void
339 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e rrorCode) {
340 if(U_FAILURE(errorCode)) { return; }
341 uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the l ast Han range.
342 uint32_t prevPrimary = 0; // Start with primary ignorable CEs.
343 UBool tryRange = FALSE;
344 for(int32_t i = 0; i < rootElements.size(); ++i) {
345 int64_t ce = rootElements.elementAti(i);
346 uint32_t p = (uint32_t)(ce >> 32);
347 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
348 if(p != prevPrimary) {
349 U_ASSERT((p & 0xff) == 0);
350 int32_t end;
351 if(p >= nextHanPrimary) {
352 // Add a Han primary weight or range.
353 // We omitted them initially, and omitted all CEs with Han prima ries
354 // and common secondary/tertiary weights.
355 U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_A ND_TER_CE);
356 if(p == nextHanPrimary) {
357 // One single Han primary with non-common secondary/tertiary weights.
358 table.addElement((int32_t)p, errorCode);
359 if(p < lastHanPrimary) {
360 // Prepare for the next Han range.
361 nextHanPrimary = Collation::incThreeBytePrimaryByOffset( p, FALSE, hanStep);
362 } else {
363 // p is the last Han primary.
364 nextHanPrimary = 0xffffffff;
365 }
366 } else {
367 // p > nextHanPrimary: Add a Han primary range, starting wit h nextHanPrimary.
368 table.addElement((int32_t)nextHanPrimary, errorCode);
369 if(nextHanPrimary == lastHanPrimary) {
370 // nextHanPrimary == lastHanPrimary < p
371 // We just wrote the single last Han primary.
372 nextHanPrimary = 0xffffffff;
373 } else if(p < lastHanPrimary) {
374 // nextHanPrimary < p < lastHanPrimary
375 // End the Han range on p, prepare for the next range.
376 table.addElement((int32_t)p | hanStep, errorCode);
377 nextHanPrimary = Collation::incThreeBytePrimaryByOffset( p, FALSE, hanStep);
378 } else if(p == lastHanPrimary) {
379 // nextHanPrimary < p == lastHanPrimary
380 // End the last Han range on p.
381 table.addElement((int32_t)p | hanStep, errorCode);
382 nextHanPrimary = 0xffffffff;
383 } else {
384 // nextHanPrimary < lastHanPrimary < p
385 // End the last Han range, then write p.
386 table.addElement((int32_t)lastHanPrimary | hanStep, erro rCode);
387 nextHanPrimary = 0xffffffff;
388 table.addElement((int32_t)p, errorCode);
389 }
390 }
391 } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
392 (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
393 // Multiple CEs with only common secondary/tertiary weights were
394 // combined into a primary range.
395 // The range end was written, ending with the primary of rootEle ments[end].
396 ce = rootElements.elementAti(end);
397 p = (uint32_t)(ce >> 32);
398 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
399 i = end;
400 } else {
401 // Write the primary weight of a normal CE.
402 table.addElement((int32_t)p, errorCode);
403 }
404 prevPrimary = p;
405 }
406 if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
407 // The common secondar/tertiary weights are implied in the primary u nit.
408 // If there is no intervening delta unit, then we will try to combin e
409 // the next several primaries into a range.
410 tryRange = TRUE;
411 } else {
412 // For each new set of secondary/tertiary weights we write a delta u nit.
413 table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DE LTA_FLAG, errorCode);
414 tryRange = FALSE;
415 }
416 }
417
418 // Limit sentinel for root elements.
419 // This allows us to reduce range checks at runtime.
420 table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
421 }
422
423 int32_t
424 CollationBaseDataBuilder::writeRootElementsRange(
425 uint32_t prevPrimary, uint32_t p, int32_t i,
426 UVector32 &table, UErrorCode &errorCode) {
427 if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
428 U_ASSERT(prevPrimary < p);
429 // No ranges of single-byte primaries.
430 if((p & prevPrimary & 0xff0000) == 0) { return 0; }
431 // Lead bytes of compressible primaries must match.
432 UBool isCompressible = isCompressiblePrimary(p);
433 if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
434 (p & 0xff000000) != (prevPrimary & 0xff000000)) {
435 return 0;
436 }
437 // Number of bytes in the primaries.
438 UBool twoBytes;
439 // Number of primaries from prevPrimary to p.
440 int32_t step;
441 if((p & 0xff00) == 0) {
442 // 2-byte primary
443 if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch
444 twoBytes = TRUE;
445 step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
446 } else {
447 // 3-byte primary
448 if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch
449 twoBytes = FALSE;
450 step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
451 }
452 if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
453 // See if there are more than two CEs with primaries increasing by "step"
454 // and with only common secondary/tertiary weights on all but the last one.
455 int32_t end = 0; // Initially 0: No range for just two primaries.
456 for(;;) {
457 prevPrimary = p;
458 // Calculate which primary we expect next.
459 uint32_t nextPrimary; // = p + step
460 if(twoBytes) {
461 nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible , step);
462 } else {
463 nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressib le, step);
464 }
465 // Fetch the actual next CE.
466 int64_t ce = rootElements.elementAti(i);
467 p = (uint32_t)(ce >> 32);
468 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
469 // Does this primary increase by "step" from the last one?
470 if(p != nextPrimary ||
471 // Do not cross into a new lead byte if either is compressible.
472 ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
473 (isCompressible || isCompressiblePrimary(p)))) {
474 // The range ends with the previous CE.
475 p = prevPrimary;
476 break;
477 }
478 // Extend the range to include this primary.
479 end = i++;
480 // This primary is the last in the range if it has non-common weights
481 // or if we are at the end of the list.
482 if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size( )) { break; }
483 }
484 if(end != 0) {
485 table.addElement((int32_t)p | step, errorCode);
486 }
487 return end;
488 }
489
490 U_NAMESPACE_END
491
492 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/collationbasedatabuilder.h ('k') | source/i18n/collationbuilder.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698