OLD | NEW |
| (Empty) |
1 /* | |
2 ****************************************************************************** | |
3 * Copyright (C) 1996-2014, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ****************************************************************************** | |
6 */ | |
7 | |
8 #include "unicode/utypes.h" | |
9 | |
10 #if !UCONFIG_NO_COLLATION | |
11 | |
12 #include "unicode/unistr.h" | |
13 #include "unicode/usearch.h" | |
14 | |
15 #include "cmemory.h" | |
16 #include "unicode/coll.h" | |
17 #include "unicode/tblcoll.h" | |
18 #include "unicode/coleitr.h" | |
19 #include "unicode/ucoleitr.h" | |
20 | |
21 #include "unicode/regex.h" // TODO: make conditional on regexp being buil
t. | |
22 | |
23 #include "unicode/uniset.h" | |
24 #include "unicode/uset.h" | |
25 #include "unicode/usetiter.h" | |
26 #include "unicode/ustring.h" | |
27 #include "hash.h" | |
28 #include "normalizer2impl.h" | |
29 #include "uhash.h" | |
30 #include "usrchimp.h" | |
31 #include "uassert.h" | |
32 | |
33 #include "colldata.h" | |
34 | |
35 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) | |
36 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) | |
37 #define DELETE_ARRAY(array) uprv_free((void *) (array)) | |
38 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src),
(count) * sizeof (src)[0]) | |
39 | |
40 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) | |
41 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) | |
42 { | |
43 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); | |
44 UCollationStrength strength = ucol_getStrength(coll); | |
45 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==
UCOL_SHIFTED; | |
46 uint32_t variableTop = ucol_getVariableTop(coll, &status); | |
47 uint32_t strengthMask = 0; | |
48 int32_t order; | |
49 | |
50 if (U_FAILURE(status)) { | |
51 return; | |
52 } | |
53 | |
54 // **** only set flag if string has Han(gul) **** | |
55 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476 | |
56 | |
57 switch (strength) | |
58 { | |
59 default: | |
60 strengthMask |= UCOL_TERTIARYORDERMASK; | |
61 /* fall through */ | |
62 | |
63 case UCOL_SECONDARY: | |
64 strengthMask |= UCOL_SECONDARYORDERMASK; | |
65 /* fall through */ | |
66 | |
67 case UCOL_PRIMARY: | |
68 strengthMask |= UCOL_PRIMARYORDERMASK; | |
69 } | |
70 | |
71 ces = ceBuffer; | |
72 | |
73 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { | |
74 UBool cont = isContinuation(order); | |
75 | |
76 order &= strengthMask; | |
77 | |
78 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORD
ERMASK) != 0) { | |
79 if (strength >= UCOL_QUATERNARY) { | |
80 order &= UCOL_PRIMARYORDERMASK; | |
81 } else { | |
82 order = UCOL_IGNORABLE; | |
83 } | |
84 } | |
85 | |
86 if (order == UCOL_IGNORABLE) { | |
87 continue; | |
88 } | |
89 | |
90 if (cont) { | |
91 order |= UCOL_CONTINUATION_MARKER; | |
92 } | |
93 | |
94 add(order, status); | |
95 } | |
96 | |
97 ucol_closeElements(elems); | |
98 } | |
99 | |
100 CEList::~CEList() | |
101 { | |
102 if (ces != ceBuffer) { | |
103 DELETE_ARRAY(ces); | |
104 } | |
105 } | |
106 | |
107 void CEList::add(uint32_t ce, UErrorCode &status) | |
108 { | |
109 if (U_FAILURE(status)) { | |
110 return; | |
111 } | |
112 | |
113 if (listSize >= listMax) { | |
114 int32_t newMax = listMax + CELIST_BUFFER_SIZE; | |
115 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); | |
116 | |
117 if (newCEs == NULL) { | |
118 status = U_MEMORY_ALLOCATION_ERROR; | |
119 return; | |
120 } | |
121 | |
122 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); | |
123 | |
124 if (ces != ceBuffer) { | |
125 DELETE_ARRAY(ces); | |
126 } | |
127 | |
128 ces = newCEs; | |
129 listMax = newMax; | |
130 } | |
131 | |
132 ces[listSize++] = ce; | |
133 } | |
134 | |
135 uint32_t CEList::get(int32_t index) const | |
136 { | |
137 if (index >= 0 && index < listSize) { | |
138 return ces[index]; | |
139 } | |
140 | |
141 return (uint32_t)UCOL_NULLORDER; | |
142 } | |
143 | |
144 uint32_t &CEList::operator[](int32_t index) const | |
145 { | |
146 return ces[index]; | |
147 } | |
148 | |
149 UBool CEList::matchesAt(int32_t offset, const CEList *other) const | |
150 { | |
151 if (other == NULL || listSize - offset < other->size()) { | |
152 return FALSE; | |
153 } | |
154 | |
155 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { | |
156 if (ces[i] != (*other)[j]) { | |
157 return FALSE; | |
158 } | |
159 } | |
160 | |
161 return TRUE; | |
162 } | |
163 | |
164 int32_t CEList::size() const | |
165 { | |
166 return listSize; | |
167 } | |
168 | |
169 StringList::StringList(UErrorCode &status) | |
170 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) | |
171 { | |
172 if (U_FAILURE(status)) { | |
173 return; | |
174 } | |
175 | |
176 strings = new UnicodeString [listMax]; | |
177 | |
178 if (strings == NULL) { | |
179 status = U_MEMORY_ALLOCATION_ERROR; | |
180 return; | |
181 } | |
182 } | |
183 | |
184 StringList::~StringList() | |
185 { | |
186 delete[] strings; | |
187 } | |
188 | |
189 void StringList::add(const UnicodeString *string, UErrorCode &status) | |
190 { | |
191 if (U_FAILURE(status)) { | |
192 return; | |
193 } | |
194 if (listSize >= listMax) { | |
195 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; | |
196 UnicodeString *newStrings = new UnicodeString[newMax]; | |
197 if (newStrings == NULL) { | |
198 status = U_MEMORY_ALLOCATION_ERROR; | |
199 return; | |
200 } | |
201 for (int32_t i=0; i<listSize; ++i) { | |
202 newStrings[i] = strings[i]; | |
203 } | |
204 delete[] strings; | |
205 strings = newStrings; | |
206 listMax = newMax; | |
207 } | |
208 | |
209 // The ctor initialized all the strings in | |
210 // the array to empty strings, so this | |
211 // is the same as copying the source string. | |
212 strings[listSize++].append(*string); | |
213 } | |
214 | |
215 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) | |
216 { | |
217 const UnicodeString string(chars, count); | |
218 | |
219 add(&string, status); | |
220 } | |
221 | |
222 const UnicodeString *StringList::get(int32_t index) const | |
223 { | |
224 if (index >= 0 && index < listSize) { | |
225 return &strings[index]; | |
226 } | |
227 | |
228 return NULL; | |
229 } | |
230 | |
231 int32_t StringList::size() const | |
232 { | |
233 return listSize; | |
234 } | |
235 | |
236 | |
237 U_CDECL_BEGIN | |
238 static void U_CALLCONV | |
239 deleteStringList(void *obj) | |
240 { | |
241 StringList *strings = (StringList *) obj; | |
242 | |
243 delete strings; | |
244 } | |
245 U_CDECL_END | |
246 | |
247 class CEToStringsMap | |
248 { | |
249 public: | |
250 CEToStringsMap(UErrorCode &status); | |
251 ~CEToStringsMap(); | |
252 | |
253 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); | |
254 StringList *getStringList(uint32_t ce) const; | |
255 | |
256 private: | |
257 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); | |
258 UHashtable *map; | |
259 }; | |
260 | |
261 CEToStringsMap::CEToStringsMap(UErrorCode &status) | |
262 : map(NULL) | |
263 { | |
264 if (U_FAILURE(status)) { | |
265 return; | |
266 } | |
267 | |
268 map = uhash_open(uhash_hashLong, uhash_compareLong, | |
269 uhash_compareCaselessUnicodeString, | |
270 &status); | |
271 | |
272 if (U_FAILURE(status)) { | |
273 return; | |
274 } | |
275 | |
276 uhash_setValueDeleter(map, deleteStringList); | |
277 } | |
278 | |
279 CEToStringsMap::~CEToStringsMap() | |
280 { | |
281 uhash_close(map); | |
282 } | |
283 | |
284 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) | |
285 { | |
286 StringList *strings = getStringList(ce); | |
287 | |
288 if (strings == NULL) { | |
289 strings = new StringList(status); | |
290 | |
291 if (strings == NULL || U_FAILURE(status)) { | |
292 status = U_MEMORY_ALLOCATION_ERROR; | |
293 return; | |
294 } | |
295 | |
296 putStringList(ce, strings, status); | |
297 } | |
298 | |
299 strings->add(string, status); | |
300 } | |
301 | |
302 StringList *CEToStringsMap::getStringList(uint32_t ce) const | |
303 { | |
304 return (StringList *) uhash_iget(map, ce); | |
305 } | |
306 | |
307 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCo
de &status) | |
308 { | |
309 uhash_iput(map, ce, (void *) stringList, &status); | |
310 } | |
311 | |
312 #define CLONE_COLLATOR | |
313 | |
314 CollData::CollData(UCollator *collator, UErrorCode &status) | |
315 : coll(NULL), ceToCharsStartingWith(NULL) | |
316 { | |
317 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] | |
318 // i.e. other, control, private use, format, surrogate | |
319 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
320 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); | |
321 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); | |
322 | |
323 // Han ext. A, Han, Jamo, Hangul, Han Ext. B | |
324 // i.e. all the characers we handle implicitly | |
325 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\
\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
326 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\
\uD7AF][\\U00020000-\\U0002A6DF]]", 70); | |
327 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); | |
328 | |
329 if (U_FAILURE(status)) { | |
330 return; | |
331 } | |
332 | |
333 USet *expansions = uset_openEmpty(); | |
334 USet *contractions = uset_openEmpty(); | |
335 int32_t itemCount; | |
336 | |
337 ceToCharsStartingWith = new CEToStringsMap(status); | |
338 | |
339 if (U_FAILURE(status)) { | |
340 goto bail; | |
341 } | |
342 | |
343 #ifdef CLONE_COLLATOR | |
344 coll = ucol_safeClone(collator, NULL, NULL, &status); | |
345 | |
346 if (U_FAILURE(status)) { | |
347 goto bail; | |
348 } | |
349 #else | |
350 coll = collator; | |
351 #endif | |
352 | |
353 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); | |
354 | |
355 uset_addAll(charsToTest, contractions); | |
356 uset_addAll(charsToTest, expansions); | |
357 uset_removeAll(charsToTest, charsToRemove); | |
358 | |
359 itemCount = uset_getItemCount(charsToTest); | |
360 for(int32_t item = 0; item < itemCount; item += 1) { | |
361 UChar32 start = 0, end = 0; | |
362 UChar buffer[16]; | |
363 int32_t len = uset_getItem(charsToTest, item, &start, &end, | |
364 buffer, 16, &status); | |
365 | |
366 if (len == 0) { | |
367 for (UChar32 ch = start; ch <= end; ch += 1) { | |
368 UnicodeString *st = new UnicodeString(ch); | |
369 | |
370 if (st == NULL) { | |
371 status = U_MEMORY_ALLOCATION_ERROR; | |
372 break; | |
373 } | |
374 | |
375 CEList *ceList = new CEList(coll, *st, status); | |
376 | |
377 ceToCharsStartingWith->put(ceList->get(0), st, status); | |
378 | |
379 delete ceList; | |
380 delete st; | |
381 } | |
382 } else if (len > 0) { | |
383 UnicodeString *st = new UnicodeString(buffer, len); | |
384 | |
385 if (st == NULL) { | |
386 status = U_MEMORY_ALLOCATION_ERROR; | |
387 break; | |
388 } | |
389 | |
390 CEList *ceList = new CEList(coll, *st, status); | |
391 | |
392 ceToCharsStartingWith->put(ceList->get(0), st, status); | |
393 | |
394 delete ceList; | |
395 delete st; | |
396 } else { | |
397 // shouldn't happen... | |
398 } | |
399 | |
400 if (U_FAILURE(status)) { | |
401 break; | |
402 } | |
403 } | |
404 | |
405 bail: | |
406 uset_close(contractions); | |
407 uset_close(expansions); | |
408 uset_close(charsToRemove); | |
409 uset_close(charsToTest); | |
410 | |
411 if (U_FAILURE(status)) { | |
412 return; | |
413 } | |
414 | |
415 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status)
; | |
416 if (U_FAILURE(status)) { | |
417 return; | |
418 } | |
419 UnicodeSetIterator hanIter(hanRanges); | |
420 UnicodeString hanString; | |
421 while(hanIter.nextRange()) { | |
422 hanString.append(hanIter.getCodepoint()); | |
423 hanString.append(hanIter.getCodepointEnd()); | |
424 } | |
425 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9, | |
426 // but as of Unicode 6.3 the 11xx block is filled, | |
427 // and there are also more Jamo T at U+D7CB..U+D7FB. | |
428 // Maybe use [:HST=T:] and look for the end of the last range? | |
429 // Maybe use script boundary mappings instead of this code?? | |
430 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAM
O_T_BASE + 1, 0x11FF}; | |
431 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); | |
432 CEList hanList(coll, hanString, status); | |
433 CEList jamoList(coll, jamoString, status); | |
434 int32_t j = 0; | |
435 | |
436 if (U_FAILURE(status)) { | |
437 return; | |
438 } | |
439 | |
440 for (int32_t c = 0; c < jamoList.size(); c += 1) { | |
441 uint32_t jce = jamoList[c]; | |
442 | |
443 if (! isContinuation(jce)) { | |
444 jamoLimits[j++] = jce; | |
445 } | |
446 } | |
447 | |
448 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); | |
449 | |
450 minHan = 0xFFFFFFFF; | |
451 maxHan = 0; | |
452 | |
453 for(int32_t h = 0; h < hanList.size(); h += 2) { | |
454 uint32_t han = (uint32_t) hanList[h]; | |
455 | |
456 if (han < minHan) { | |
457 minHan = han; | |
458 } | |
459 | |
460 if (han > maxHan) { | |
461 maxHan = han; | |
462 } | |
463 } | |
464 | |
465 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); | |
466 } | |
467 | |
468 CollData::~CollData() | |
469 { | |
470 #ifdef CLONE_COLLATOR | |
471 ucol_close(coll); | |
472 #endif | |
473 | |
474 delete ceToCharsStartingWith; | |
475 } | |
476 | |
477 UCollator *CollData::getCollator() const | |
478 { | |
479 return coll; | |
480 } | |
481 | |
482 const StringList *CollData::getStringList(int32_t ce) const | |
483 { | |
484 return ceToCharsStartingWith->getStringList(ce); | |
485 } | |
486 | |
487 const CEList *CollData::getCEList(const UnicodeString *string) const | |
488 { | |
489 UErrorCode status = U_ZERO_ERROR; | |
490 const CEList *list = new CEList(coll, *string, status); | |
491 | |
492 if (U_FAILURE(status)) { | |
493 delete list; | |
494 list = NULL; | |
495 } | |
496 | |
497 return list; | |
498 } | |
499 | |
500 void CollData::freeCEList(const CEList *list) | |
501 { | |
502 delete list; | |
503 } | |
504 | |
505 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
*history) const | |
506 { | |
507 // find out shortest string for the longest sequence of ces. | |
508 // this can probably be folded with the minLengthCache... | |
509 | |
510 if (history[offset] >= 0) { | |
511 return history[offset]; | |
512 } | |
513 | |
514 uint32_t ce = ceList->get(offset); | |
515 int32_t maxOffset = ceList->size(); | |
516 int32_t shortestLength = INT32_MAX; | |
517 const StringList *strings = ceToCharsStartingWith->getStringList(ce); | |
518 | |
519 if (strings != NULL) { | |
520 int32_t stringCount = strings->size(); | |
521 | |
522 for (int32_t s = 0; s < stringCount; s += 1) { | |
523 const UnicodeString *string = strings->get(s); | |
524 UErrorCode status = U_ZERO_ERROR; | |
525 const CEList *ceList2 = new CEList(coll, *string, status); | |
526 | |
527 if (U_FAILURE(status)) { | |
528 delete ceList2; | |
529 ceList2 = NULL; | |
530 } | |
531 | |
532 if (ceList->matchesAt(offset, ceList2)) { | |
533 U_ASSERT(ceList2 != NULL); | |
534 int32_t clength = ceList2->size(); | |
535 int32_t slength = string->length(); | |
536 int32_t roffset = offset + clength; | |
537 int32_t rlength = 0; | |
538 | |
539 if (roffset < maxOffset) { | |
540 rlength = minLengthInChars(ceList, roffset, history); | |
541 | |
542 if (rlength <= 0) { | |
543 // delete before continue to avoid memory leak. | |
544 delete ceList2; | |
545 | |
546 // ignore any dead ends | |
547 continue; | |
548 } | |
549 } | |
550 | |
551 if (shortestLength > slength + rlength) { | |
552 shortestLength = slength + rlength; | |
553 } | |
554 } | |
555 | |
556 delete ceList2; | |
557 } | |
558 } | |
559 | |
560 if (shortestLength == INT32_MAX) { | |
561 // No matching strings at this offset. See if | |
562 // the CE is in a range we can handle manually. | |
563 if (ce >= minHan && ce < maxHan) { | |
564 // all han have implicit orders which | |
565 // generate two CEs. | |
566 int32_t roffset = offset + 2; | |
567 int32_t rlength = 0; | |
568 | |
569 //history[roffset++] = -1; | |
570 //history[roffset++] = 1; | |
571 | |
572 if (roffset < maxOffset) { | |
573 rlength = minLengthInChars(ceList, roffset, history); | |
574 } | |
575 | |
576 if (rlength < 0) { | |
577 return -1; | |
578 } | |
579 | |
580 shortestLength = 1 + rlength; | |
581 goto have_shortest; | |
582 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { | |
583 int32_t roffset = offset; | |
584 int32_t rlength = 0; | |
585 | |
586 // **** this loop may not handle archaic Hangul correctly **** | |
587 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset +=
1) { | |
588 uint32_t jce = ceList->get(roffset); | |
589 | |
590 // Some Jamo have 24-bit primary order; skip the | |
591 // 2nd CE. This should always be OK because if | |
592 // we're still in the loop all we've seen are | |
593 // a series of Jamo in LVT order. | |
594 if (isContinuation(jce)) { | |
595 continue; | |
596 } | |
597 | |
598 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { | |
599 break; | |
600 } | |
601 } | |
602 | |
603 if (roffset == offset) { | |
604 // we started with a non-L Jamo... | |
605 // just say it comes from a single character | |
606 roffset += 1; | |
607 | |
608 // See if the single Jamo has a 24-bit order. | |
609 if (roffset < maxOffset && isContinuation(ceList->get(roffset)))
{ | |
610 roffset += 1; | |
611 } | |
612 } | |
613 | |
614 if (roffset < maxOffset) { | |
615 rlength = minLengthInChars(ceList, roffset, history); | |
616 } | |
617 | |
618 if (rlength < 0) { | |
619 return -1; | |
620 } | |
621 | |
622 shortestLength = 1 + rlength; | |
623 goto have_shortest; | |
624 } | |
625 | |
626 // Can't handle it manually either. Just move on. | |
627 return -1; | |
628 } | |
629 | |
630 have_shortest: | |
631 history[offset] = shortestLength; | |
632 | |
633 return shortestLength; | |
634 } | |
635 | |
636 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const | |
637 { | |
638 int32_t clength = ceList->size(); | |
639 int32_t *history = NEW_ARRAY(int32_t, clength); | |
640 | |
641 for (int32_t i = 0; i < clength; i += 1) { | |
642 history[i] = -1; | |
643 } | |
644 | |
645 int32_t minLength = minLengthInChars(ceList, offset, history); | |
646 | |
647 DELETE_ARRAY(history); | |
648 | |
649 return minLength; | |
650 } | |
651 | |
652 #endif // #if !UCONFIG_NO_COLLATION | |
OLD | NEW |