OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * Copyright (C) 1996-2009, International Business Machines * |
| 4 * Corporation and others. All Rights Reserved. * |
| 5 ****************************************************************************** |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #if !UCONFIG_NO_COLLATION |
| 11 |
| 12 #include "unicode/unistr.h" |
| 13 #include "unicode/putil.h" |
| 14 #include "unicode/usearch.h" |
| 15 |
| 16 #include "cmemory.h" |
| 17 #include "unicode/coll.h" |
| 18 #include "unicode/tblcoll.h" |
| 19 #include "unicode/coleitr.h" |
| 20 #include "unicode/ucoleitr.h" |
| 21 |
| 22 #include "unicode/regex.h" // TODO: make conditional on regexp being buil
t. |
| 23 |
| 24 #include "unicode/uniset.h" |
| 25 #include "unicode/uset.h" |
| 26 #include "unicode/ustring.h" |
| 27 #include "hash.h" |
| 28 #include "uhash.h" |
| 29 #include "ucln_in.h" |
| 30 #include "ucol_imp.h" |
| 31 #include "umutex.h" |
| 32 |
| 33 #include "unicode/colldata.h" |
| 34 |
| 35 U_NAMESPACE_BEGIN |
| 36 |
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions such as *ptrToArray expand correctly.
#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0]))

// Allocates raw storage for `count` objects of `type` via uprv_malloc().
// No constructors run and the result is not checked here. The whole expansion
// is parenthesized so it can be used safely inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Hands an array back to uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))

// Bitwise copy of `count` elements from src to dst via uprv_memcpy().
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof((src)[0]))
| 41 |
| 42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) |
| 43 |
| 44 #ifdef INSTRUMENT_CELIST |
| 45 int32_t CEList::_active = 0; |
| 46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 47 #endif |
| 48 |
| 49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) |
| 50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) |
| 51 { |
| 52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); |
| 53 UCollationStrength strength = ucol_getStrength(coll); |
| 54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==
UCOL_SHIFTED; |
| 55 uint32_t variableTop = ucol_getVariableTop(coll, &status); |
| 56 uint32_t strengthMask = 0; |
| 57 int32_t order; |
| 58 |
| 59 if (U_FAILURE(status)) { |
| 60 return; |
| 61 } |
| 62 |
| 63 // **** only set flag if string has Han(gul) **** |
| 64 ucol_forceHanImplicit(elems, &status); |
| 65 |
| 66 switch (strength) |
| 67 { |
| 68 default: |
| 69 strengthMask |= UCOL_TERTIARYORDERMASK; |
| 70 /* fall through */ |
| 71 |
| 72 case UCOL_SECONDARY: |
| 73 strengthMask |= UCOL_SECONDARYORDERMASK; |
| 74 /* fall through */ |
| 75 |
| 76 case UCOL_PRIMARY: |
| 77 strengthMask |= UCOL_PRIMARYORDERMASK; |
| 78 } |
| 79 |
| 80 #ifdef INSTRUMENT_CELIST |
| 81 _active += 1; |
| 82 _histogram[0] += 1; |
| 83 #endif |
| 84 |
| 85 ces = ceBuffer; |
| 86 |
| 87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { |
| 88 UBool cont = isContinuation(order); |
| 89 |
| 90 order &= strengthMask; |
| 91 |
| 92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORD
ERMASK) != 0) { |
| 93 if (strength >= UCOL_QUATERNARY) { |
| 94 order &= UCOL_PRIMARYORDERMASK; |
| 95 } else { |
| 96 order = UCOL_IGNORABLE; |
| 97 } |
| 98 } |
| 99 |
| 100 if (order == UCOL_IGNORABLE) { |
| 101 continue; |
| 102 } |
| 103 |
| 104 if (cont) { |
| 105 order |= UCOL_CONTINUATION_MARKER; |
| 106 } |
| 107 |
| 108 add(order, status); |
| 109 } |
| 110 |
| 111 ucol_closeElements(elems); |
| 112 } |
| 113 |
| 114 CEList::~CEList() |
| 115 { |
| 116 #ifdef INSTRUMENT_CELIST |
| 117 _active -= 1; |
| 118 #endif |
| 119 |
| 120 if (ces != ceBuffer) { |
| 121 DELETE_ARRAY(ces); |
| 122 } |
| 123 } |
| 124 |
| 125 void CEList::add(uint32_t ce, UErrorCode &status) |
| 126 { |
| 127 if (U_FAILURE(status)) { |
| 128 return; |
| 129 } |
| 130 |
| 131 if (listSize >= listMax) { |
| 132 int32_t newMax = listMax + CELIST_BUFFER_SIZE; |
| 133 |
| 134 #ifdef INSTRUMENT_CELIST |
| 135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1; |
| 136 #endif |
| 137 |
| 138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); |
| 139 |
| 140 if (newCEs == NULL) { |
| 141 status = U_MEMORY_ALLOCATION_ERROR; |
| 142 return; |
| 143 } |
| 144 |
| 145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); |
| 146 |
| 147 if (ces != ceBuffer) { |
| 148 DELETE_ARRAY(ces); |
| 149 } |
| 150 |
| 151 ces = newCEs; |
| 152 listMax = newMax; |
| 153 } |
| 154 |
| 155 ces[listSize++] = ce; |
| 156 } |
| 157 |
| 158 uint32_t CEList::get(int32_t index) const |
| 159 { |
| 160 if (index >= 0 && index < listSize) { |
| 161 return ces[index]; |
| 162 } |
| 163 |
| 164 return UCOL_NULLORDER; |
| 165 } |
| 166 |
| 167 uint32_t &CEList::operator[](int32_t index) const |
| 168 { |
| 169 return ces[index]; |
| 170 } |
| 171 |
| 172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const |
| 173 { |
| 174 if (other == NULL || listSize - offset < other->size()) { |
| 175 return FALSE; |
| 176 } |
| 177 |
| 178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { |
| 179 if (ces[i] != (*other)[j]) { |
| 180 return FALSE; |
| 181 } |
| 182 } |
| 183 |
| 184 return TRUE; |
| 185 } |
| 186 |
| 187 int32_t CEList::size() const |
| 188 { |
| 189 return listSize; |
| 190 } |
| 191 |
| 192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) |
| 193 |
| 194 #ifdef INSTRUMENT_STRING_LIST |
| 195 int32_t StringList::_lists = 0; |
| 196 int32_t StringList::_strings = 0; |
| 197 int32_t StringList::_histogram[101] = {0}; |
| 198 #endif |
| 199 |
| 200 StringList::StringList(UErrorCode &status) |
| 201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) |
| 202 { |
| 203 if (U_FAILURE(status)) { |
| 204 return; |
| 205 } |
| 206 |
| 207 strings = new UnicodeString [listMax]; |
| 208 |
| 209 if (strings == NULL) { |
| 210 status = U_MEMORY_ALLOCATION_ERROR; |
| 211 return; |
| 212 } |
| 213 |
| 214 #ifdef INSTRUMENT_STRING_LIST |
| 215 _lists += 1; |
| 216 _histogram[0] += 1; |
| 217 #endif |
| 218 } |
| 219 |
| 220 StringList::~StringList() |
| 221 { |
| 222 delete[] strings; |
| 223 } |
| 224 |
| 225 void StringList::add(const UnicodeString *string, UErrorCode &status) |
| 226 { |
| 227 if (U_FAILURE(status)) { |
| 228 return; |
| 229 } |
| 230 |
| 231 #ifdef INSTRUMENT_STRING_LIST |
| 232 _strings += 1; |
| 233 #endif |
| 234 |
| 235 if (listSize >= listMax) { |
| 236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; |
| 237 |
| 238 UnicodeString *newStrings = new UnicodeString[newMax]; |
| 239 |
| 240 uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString)); |
| 241 |
| 242 #ifdef INSTRUMENT_STRING_LIST |
| 243 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; |
| 244 |
| 245 if (_h > 100) { |
| 246 _h = 100; |
| 247 } |
| 248 |
| 249 _histogram[_h] += 1; |
| 250 #endif |
| 251 |
| 252 delete[] strings; |
| 253 strings = newStrings; |
| 254 listMax = newMax; |
| 255 } |
| 256 |
| 257 // The ctor initialized all the strings in |
| 258 // the array to empty strings, so this |
| 259 // is the same as copying the source string. |
| 260 strings[listSize++].append(*string); |
| 261 } |
| 262 |
| 263 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) |
| 264 { |
| 265 const UnicodeString string(chars, count); |
| 266 |
| 267 add(&string, status); |
| 268 } |
| 269 |
| 270 const UnicodeString *StringList::get(int32_t index) const |
| 271 { |
| 272 if (index >= 0 && index < listSize) { |
| 273 return &strings[index]; |
| 274 } |
| 275 |
| 276 return NULL; |
| 277 } |
| 278 |
| 279 int32_t StringList::size() const |
| 280 { |
| 281 return listSize; |
| 282 } |
| 283 |
| 284 |
| 285 U_CFUNC void deleteStringList(void *obj); |
| 286 |
| 287 class CEToStringsMap : public UMemory |
| 288 { |
| 289 public: |
| 290 |
| 291 CEToStringsMap(UErrorCode &status); |
| 292 ~CEToStringsMap(); |
| 293 |
| 294 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); |
| 295 StringList *getStringList(uint32_t ce) const; |
| 296 |
| 297 private: |
| 298 |
| 299 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); |
| 300 UHashtable *map; |
| 301 }; |
| 302 |
| 303 CEToStringsMap::CEToStringsMap(UErrorCode &status) |
| 304 : map(NULL) |
| 305 { |
| 306 if (U_FAILURE(status)) { |
| 307 return; |
| 308 } |
| 309 |
| 310 map = uhash_open(uhash_hashLong, uhash_compareLong, |
| 311 uhash_compareCaselessUnicodeString, |
| 312 &status); |
| 313 |
| 314 if (U_FAILURE(status)) { |
| 315 return; |
| 316 } |
| 317 |
| 318 uhash_setValueDeleter(map, deleteStringList); |
| 319 } |
| 320 |
| 321 CEToStringsMap::~CEToStringsMap() |
| 322 { |
| 323 uhash_close(map); |
| 324 } |
| 325 |
| 326 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) |
| 327 { |
| 328 StringList *strings = getStringList(ce); |
| 329 |
| 330 if (strings == NULL) { |
| 331 strings = new StringList(status); |
| 332 |
| 333 if (strings == NULL || U_FAILURE(status)) { |
| 334 status = U_MEMORY_ALLOCATION_ERROR; |
| 335 return; |
| 336 } |
| 337 |
| 338 putStringList(ce, strings, status); |
| 339 } |
| 340 |
| 341 strings->add(string, status); |
| 342 } |
| 343 |
| 344 StringList *CEToStringsMap::getStringList(uint32_t ce) const |
| 345 { |
| 346 return (StringList *) uhash_iget(map, ce); |
| 347 } |
| 348 |
| 349 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCo
de &status) |
| 350 { |
| 351 uhash_iput(map, ce, (void *) stringList, &status); |
| 352 } |
| 353 |
| 354 U_CFUNC void deleteStringList(void *obj) |
| 355 { |
| 356 StringList *strings = (StringList *) obj; |
| 357 |
| 358 delete strings; |
| 359 } |
| 360 |
| 361 U_CFUNC void deleteCEList(void *obj); |
| 362 U_CFUNC void deleteUnicodeStringKey(void *obj); |
| 363 |
| 364 class StringToCEsMap : public UMemory |
| 365 { |
| 366 public: |
| 367 StringToCEsMap(UErrorCode &status); |
| 368 ~StringToCEsMap(); |
| 369 |
| 370 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
; |
| 371 const CEList *get(const UnicodeString *string); |
| 372 void free(const CEList *list); |
| 373 |
| 374 private: |
| 375 |
| 376 |
| 377 UHashtable *map; |
| 378 }; |
| 379 |
| 380 StringToCEsMap::StringToCEsMap(UErrorCode &status) |
| 381 : map(NULL) |
| 382 { |
| 383 if (U_FAILURE(status)) { |
| 384 return; |
| 385 } |
| 386 |
| 387 map = uhash_open(uhash_hashUnicodeString, |
| 388 uhash_compareUnicodeString, |
| 389 uhash_compareLong, |
| 390 &status); |
| 391 |
| 392 if (U_FAILURE(status)) { |
| 393 return; |
| 394 } |
| 395 |
| 396 uhash_setValueDeleter(map, deleteCEList); |
| 397 uhash_setKeyDeleter(map, deleteUnicodeStringKey); |
| 398 } |
| 399 |
| 400 StringToCEsMap::~StringToCEsMap() |
| 401 { |
| 402 uhash_close(map); |
| 403 } |
| 404 |
| 405 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorC
ode &status) |
| 406 { |
| 407 uhash_put(map, (void *) string, (void *) ces, &status); |
| 408 } |
| 409 |
| 410 const CEList *StringToCEsMap::get(const UnicodeString *string) |
| 411 { |
| 412 return (const CEList *) uhash_get(map, string); |
| 413 } |
| 414 |
| 415 U_CFUNC void deleteCEList(void *obj) |
| 416 { |
| 417 CEList *list = (CEList *) obj; |
| 418 |
| 419 delete list; |
| 420 } |
| 421 |
| 422 U_CFUNC void deleteUnicodeStringKey(void *obj) |
| 423 { |
| 424 UnicodeString *key = (UnicodeString *) obj; |
| 425 |
| 426 delete key; |
| 427 } |
| 428 |
| 429 class CollDataCacheEntry : public UMemory |
| 430 { |
| 431 public: |
| 432 CollDataCacheEntry(CollData *theData); |
| 433 ~CollDataCacheEntry(); |
| 434 |
| 435 CollData *data; |
| 436 int32_t refCount; |
| 437 }; |
| 438 |
| 439 CollDataCacheEntry::CollDataCacheEntry(CollData *theData) |
| 440 : data(theData), refCount(1) |
| 441 { |
| 442 // nothing else to do |
| 443 } |
| 444 |
| 445 CollDataCacheEntry::~CollDataCacheEntry() |
| 446 { |
| 447 // check refCount? |
| 448 delete data; |
| 449 } |
| 450 |
| 451 class CollDataCache : public UMemory |
| 452 { |
| 453 public: |
| 454 CollDataCache(UErrorCode &status); |
| 455 ~CollDataCache(); |
| 456 |
| 457 CollData *get(UCollator *collator, UErrorCode &status); |
| 458 void unref(CollData *collData); |
| 459 |
| 460 void flush(); |
| 461 |
| 462 private: |
| 463 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBuffe
rLength); |
| 464 static void deleteKey(char *key); |
| 465 |
| 466 UMTX lock; |
| 467 UHashtable *cache; |
| 468 }; |
| 469 |
| 470 U_CFUNC void deleteChars(void * /*obj*/) |
| 471 { |
| 472 // char *chars = (char *) obj; |
| 473 // All the key strings are owned by the |
| 474 // CollData objects and don't need to |
| 475 // be freed here. |
| 476 //DELETE_ARRAY(chars); |
| 477 } |
| 478 |
| 479 U_CFUNC void deleteCollDataCacheEntry(void *obj) |
| 480 { |
| 481 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; |
| 482 |
| 483 delete entry; |
| 484 } |
| 485 |
| 486 CollDataCache::CollDataCache(UErrorCode &status) |
| 487 : lock(0), cache(NULL) |
| 488 { |
| 489 if (U_FAILURE(status)) { |
| 490 return; |
| 491 } |
| 492 |
| 493 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &
status); |
| 494 |
| 495 if (U_FAILURE(status)) { |
| 496 return; |
| 497 } |
| 498 |
| 499 uhash_setValueDeleter(cache, deleteCollDataCacheEntry); |
| 500 uhash_setKeyDeleter(cache, deleteChars); |
| 501 } |
| 502 |
| 503 CollDataCache::~CollDataCache() |
| 504 { |
| 505 umtx_lock(&lock); |
| 506 uhash_close(cache); |
| 507 cache = NULL; |
| 508 umtx_unlock(&lock); |
| 509 |
| 510 umtx_destroy(&lock); |
| 511 } |
| 512 |
| 513 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) |
| 514 { |
| 515 char keyBuffer[KEY_BUFFER_SIZE]; |
| 516 int32_t keyLength = KEY_BUFFER_SIZE; |
| 517 char *key = getKey(collator, keyBuffer, &keyLength); |
| 518 CollData *result = NULL, *newData = NULL; |
| 519 CollDataCacheEntry *entry = NULL, *newEntry = NULL; |
| 520 |
| 521 umtx_lock(&lock); |
| 522 entry = (CollDataCacheEntry *) uhash_get(cache, key); |
| 523 |
| 524 if (entry == NULL) { |
| 525 umtx_unlock(&lock); |
| 526 |
| 527 newData = new CollData(collator, key, keyLength, status); |
| 528 newEntry = new CollDataCacheEntry(newData); |
| 529 |
| 530 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { |
| 531 status = U_MEMORY_ALLOCATION_ERROR; |
| 532 return NULL; |
| 533 } |
| 534 |
| 535 umtx_lock(&lock); |
| 536 entry = (CollDataCacheEntry *) uhash_get(cache, key); |
| 537 |
| 538 if (entry == NULL) { |
| 539 uhash_put(cache, newData->key, newEntry, &status); |
| 540 umtx_unlock(&lock); |
| 541 |
| 542 if (U_FAILURE(status)) { |
| 543 delete newEntry; |
| 544 delete newData; |
| 545 |
| 546 return NULL; |
| 547 } |
| 548 |
| 549 return newData; |
| 550 } |
| 551 } |
| 552 |
| 553 result = entry->data; |
| 554 entry->refCount += 1; |
| 555 umtx_unlock(&lock); |
| 556 |
| 557 if (key != keyBuffer) { |
| 558 deleteKey(key); |
| 559 } |
| 560 |
| 561 if (newEntry != NULL) { |
| 562 delete newEntry; |
| 563 delete newData; |
| 564 } |
| 565 |
| 566 return result; |
| 567 } |
| 568 |
| 569 void CollDataCache::unref(CollData *collData) |
| 570 { |
| 571 CollDataCacheEntry *entry = NULL; |
| 572 |
| 573 umtx_lock(&lock); |
| 574 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); |
| 575 |
| 576 if (entry != NULL) { |
| 577 entry->refCount -= 1; |
| 578 } |
| 579 umtx_unlock(&lock); |
| 580 } |
| 581 |
| 582 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBu
fferLength) |
| 583 { |
| 584 UErrorCode status = U_ZERO_ERROR; |
| 585 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyB
ufferLength, &status); |
| 586 |
| 587 if (len >= *keyBufferLength) { |
| 588 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving roo
m for terminating null |
| 589 keyBuffer = NEW_ARRAY(char, *keyBufferLength); |
| 590 status = U_ZERO_ERROR; |
| 591 |
| 592 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBuffe
rLength, &status); |
| 593 } |
| 594 |
| 595 keyBuffer[len] = '\0'; |
| 596 |
| 597 return keyBuffer; |
| 598 } |
| 599 |
| 600 void CollDataCache::flush() |
| 601 { |
| 602 const UHashElement *element; |
| 603 int32_t pos = -1; |
| 604 |
| 605 umtx_lock(&lock); |
| 606 while ((element = uhash_nextElement(cache, &pos)) != NULL) { |
| 607 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointe
r; |
| 608 |
| 609 if (entry->refCount <= 0) { |
| 610 uhash_removeElement(cache, element); |
| 611 } |
| 612 } |
| 613 umtx_unlock(&lock); |
| 614 } |
| 615 |
| 616 void CollDataCache::deleteKey(char *key) |
| 617 { |
| 618 DELETE_ARRAY(key); |
| 619 } |
| 620 |
| 621 U_CDECL_BEGIN |
| 622 static UBool coll_data_cleanup(void) { |
| 623 CollData::freeCollDataCache(); |
| 624 return TRUE; |
| 625 } |
| 626 U_CDECL_END |
| 627 |
| 628 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) |
| 629 |
| 630 CollData::CollData() |
| 631 { |
| 632 // nothing |
| 633 } |
| 634 |
| 635 #define CLONE_COLLATOR |
| 636 |
| 637 //#define CACHE_CELISTS |
| 638 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength,
UErrorCode &status) |
| 639 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) |
| 640 { |
| 641 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] |
| 642 // i.e. other, control, private use, format, surrogate |
| 643 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); |
| 644 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); |
| 645 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); |
| 646 |
| 647 // Han ext. A, Han, Jamo, Hangul, Han Ext. B |
| 648 // i.e. all the characers we handle implicitly |
| 649 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\
\uD7AF][\\U00020000-\\U0002A6DF]]", 70); |
| 650 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\
\uD7AF][\\U00020000-\\U0002A6DF]]", 70); |
| 651 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); |
| 652 |
| 653 if (U_FAILURE(status)) { |
| 654 return; |
| 655 } |
| 656 |
| 657 USet *expansions = uset_openEmpty(); |
| 658 USet *contractions = uset_openEmpty(); |
| 659 int32_t itemCount; |
| 660 |
| 661 #ifdef CACHE_CELISTS |
| 662 charsToCEList = new StringToCEsMap(status); |
| 663 |
| 664 if (U_FAILURE(status)) { |
| 665 goto bail; |
| 666 } |
| 667 #else |
| 668 charsToCEList = NULL; |
| 669 #endif |
| 670 |
| 671 ceToCharsStartingWith = new CEToStringsMap(status); |
| 672 |
| 673 if (U_FAILURE(status)) { |
| 674 goto bail; |
| 675 } |
| 676 |
| 677 if (cacheKeyLength > KEY_BUFFER_SIZE) { |
| 678 key = NEW_ARRAY(char, cacheKeyLength); |
| 679 |
| 680 if (key == NULL) { |
| 681 status = U_MEMORY_ALLOCATION_ERROR; |
| 682 goto bail; |
| 683 } |
| 684 } else { |
| 685 key = keyBuffer; |
| 686 } |
| 687 |
| 688 ARRAY_COPY(key, cacheKey, cacheKeyLength); |
| 689 |
| 690 #ifdef CLONE_COLLATOR |
| 691 coll = ucol_safeClone(collator, NULL, NULL, &status); |
| 692 |
| 693 if (U_FAILURE(status)) { |
| 694 goto bail; |
| 695 } |
| 696 #else |
| 697 coll = collator; |
| 698 #endif |
| 699 |
| 700 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); |
| 701 |
| 702 uset_addAll(charsToTest, contractions); |
| 703 uset_addAll(charsToTest, expansions); |
| 704 uset_removeAll(charsToTest, charsToRemove); |
| 705 |
| 706 itemCount = uset_getItemCount(charsToTest); |
| 707 for(int32_t item = 0; item < itemCount; item += 1) { |
| 708 UChar32 start = 0, end = 0; |
| 709 UChar buffer[16]; |
| 710 int32_t len = uset_getItem(charsToTest, item, &start, &end, |
| 711 buffer, 16, &status); |
| 712 |
| 713 if (len == 0) { |
| 714 for (UChar32 ch = start; ch <= end; ch += 1) { |
| 715 UnicodeString *st = new UnicodeString(ch); |
| 716 |
| 717 if (st == NULL) { |
| 718 status = U_MEMORY_ALLOCATION_ERROR; |
| 719 break; |
| 720 } |
| 721 |
| 722 CEList *ceList = new CEList(coll, *st, status); |
| 723 |
| 724 ceToCharsStartingWith->put(ceList->get(0), st, status); |
| 725 |
| 726 #ifdef CACHE_CELISTS |
| 727 charsToCEList->put(st, ceList, status); |
| 728 #else |
| 729 delete ceList; |
| 730 delete st; |
| 731 #endif |
| 732 } |
| 733 } else if (len > 0) { |
| 734 UnicodeString *st = new UnicodeString(buffer, len); |
| 735 |
| 736 if (st == NULL) { |
| 737 status = U_MEMORY_ALLOCATION_ERROR; |
| 738 break; |
| 739 } |
| 740 |
| 741 CEList *ceList = new CEList(coll, *st, status); |
| 742 |
| 743 ceToCharsStartingWith->put(ceList->get(0), st, status); |
| 744 |
| 745 #ifdef CACHE_CELISTS |
| 746 charsToCEList->put(st, ceList, status); |
| 747 #else |
| 748 delete ceList; |
| 749 delete st; |
| 750 #endif |
| 751 } else { |
| 752 // shouldn't happen... |
| 753 } |
| 754 |
| 755 if (U_FAILURE(status)) { |
| 756 break; |
| 757 } |
| 758 } |
| 759 |
| 760 bail: |
| 761 uset_close(contractions); |
| 762 uset_close(expansions); |
| 763 uset_close(charsToRemove); |
| 764 uset_close(charsToTest); |
| 765 |
| 766 if (U_FAILURE(status)) { |
| 767 return; |
| 768 } |
| 769 |
| 770 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT
, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, |
| 771 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; |
| 772 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_J
AMO, UCOL_LAST_T_JAMO}; |
| 773 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(ha
nRanges)); |
| 774 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); |
| 775 CEList hanList(coll, hanString, status); |
| 776 CEList jamoList(coll, jamoString, status); |
| 777 int32_t j = 0; |
| 778 |
| 779 if (U_FAILURE(status)) { |
| 780 return; |
| 781 } |
| 782 |
| 783 for (int32_t c = 0; c < jamoList.size(); c += 1) { |
| 784 uint32_t jce = jamoList[c]; |
| 785 |
| 786 if (! isContinuation(jce)) { |
| 787 jamoLimits[j++] = jce; |
| 788 } |
| 789 } |
| 790 |
| 791 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); |
| 792 |
| 793 minHan = 0xFFFFFFFF; |
| 794 maxHan = 0; |
| 795 |
| 796 for(int32_t h = 0; h < hanList.size(); h += 2) { |
| 797 uint32_t han = (uint32_t) hanList[h]; |
| 798 |
| 799 if (han < minHan) { |
| 800 minHan = han; |
| 801 } |
| 802 |
| 803 if (han > maxHan) { |
| 804 maxHan = han; |
| 805 } |
| 806 } |
| 807 |
| 808 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); |
| 809 } |
| 810 |
| 811 CollData::~CollData() |
| 812 { |
| 813 #ifdef CLONE_COLLATOR |
| 814 ucol_close(coll); |
| 815 #endif |
| 816 |
| 817 if (key != keyBuffer) { |
| 818 DELETE_ARRAY(key); |
| 819 } |
| 820 |
| 821 delete ceToCharsStartingWith; |
| 822 |
| 823 #ifdef CACHE_CELISTS |
| 824 delete charsToCEList; |
| 825 #endif |
| 826 } |
| 827 |
| 828 UCollator *CollData::getCollator() const |
| 829 { |
| 830 return coll; |
| 831 } |
| 832 |
| 833 const StringList *CollData::getStringList(int32_t ce) const |
| 834 { |
| 835 return ceToCharsStartingWith->getStringList(ce); |
| 836 } |
| 837 |
| 838 const CEList *CollData::getCEList(const UnicodeString *string) const |
| 839 { |
| 840 #ifdef CACHE_CELISTS |
| 841 return charsToCEList->get(string); |
| 842 #else |
| 843 UErrorCode status = U_ZERO_ERROR; |
| 844 const CEList *list = new CEList(coll, *string, status); |
| 845 |
| 846 if (U_FAILURE(status)) { |
| 847 delete list; |
| 848 list = NULL; |
| 849 } |
| 850 |
| 851 return list; |
| 852 #endif |
| 853 } |
| 854 |
| 855 void CollData::freeCEList(const CEList *list) |
| 856 { |
| 857 #ifndef CACHE_CELISTS |
| 858 delete list; |
| 859 #endif |
| 860 } |
| 861 |
| 862 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t
*history) const |
| 863 { |
| 864 // find out shortest string for the longest sequence of ces. |
| 865 // this can probably be folded with the minLengthCache... |
| 866 |
| 867 if (history[offset] >= 0) { |
| 868 return history[offset]; |
| 869 } |
| 870 |
| 871 uint32_t ce = ceList->get(offset); |
| 872 int32_t maxOffset = ceList->size(); |
| 873 int32_t shortestLength = INT32_MAX; |
| 874 const StringList *strings = ceToCharsStartingWith->getStringList(ce); |
| 875 |
| 876 if (strings != NULL) { |
| 877 int32_t stringCount = strings->size(); |
| 878 |
| 879 for (int32_t s = 0; s < stringCount; s += 1) { |
| 880 const UnicodeString *string = strings->get(s); |
| 881 #ifdef CACHE_CELISTS |
| 882 const CEList *ceList2 = charsToCEList->get(string); |
| 883 #else |
| 884 UErrorCode status = U_ZERO_ERROR; |
| 885 const CEList *ceList2 = new CEList(coll, *string, status); |
| 886 |
| 887 if (U_FAILURE(status)) { |
| 888 delete ceList2; |
| 889 ceList2 = NULL; |
| 890 } |
| 891 #endif |
| 892 |
| 893 if (ceList->matchesAt(offset, ceList2)) { |
| 894 int32_t clength = ceList2->size(); |
| 895 int32_t slength = string->length(); |
| 896 int32_t roffset = offset + clength; |
| 897 int32_t rlength = 0; |
| 898 |
| 899 if (roffset < maxOffset) { |
| 900 rlength = minLengthInChars(ceList, roffset, history); |
| 901 |
| 902 if (rlength <= 0) { |
| 903 // delete before continue to avoid memory leak. |
| 904 #ifndef CACHE_CELISTS |
| 905 delete ceList2; |
| 906 #endif |
| 907 // ignore any dead ends |
| 908 continue; |
| 909 } |
| 910 } |
| 911 |
| 912 if (shortestLength > slength + rlength) { |
| 913 shortestLength = slength + rlength; |
| 914 } |
| 915 } |
| 916 |
| 917 #ifndef CACHE_CELISTS |
| 918 delete ceList2; |
| 919 #endif |
| 920 } |
| 921 } |
| 922 |
| 923 if (shortestLength == INT32_MAX) { |
| 924 // No matching strings at this offset. See if |
| 925 // the CE is in a range we can handle manually. |
| 926 if (ce >= minHan && ce < maxHan) { |
| 927 // all han have implicit orders which |
| 928 // generate two CEs. |
| 929 int32_t roffset = offset + 2; |
| 930 int32_t rlength = 0; |
| 931 |
| 932 //history[roffset++] = -1; |
| 933 //history[roffset++] = 1; |
| 934 |
| 935 if (roffset < maxOffset) { |
| 936 rlength = minLengthInChars(ceList, roffset, history); |
| 937 } |
| 938 |
| 939 if (rlength < 0) { |
| 940 return -1; |
| 941 } |
| 942 |
| 943 shortestLength = 1 + rlength; |
| 944 goto have_shortest; |
| 945 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { |
| 946 int32_t roffset = offset; |
| 947 int32_t rlength = 0; |
| 948 |
| 949 // **** this loop may not handle archaic Hangul correctly **** |
| 950 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset +=
1) { |
| 951 uint32_t jce = ceList->get(roffset); |
| 952 |
| 953 // Some Jamo have 24-bit primary order; skip the |
| 954 // 2nd CE. This should always be OK because if |
| 955 // we're still in the loop all we've seen are |
| 956 // a series of Jamo in LVT order. |
| 957 if (isContinuation(jce)) { |
| 958 continue; |
| 959 } |
| 960 |
| 961 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { |
| 962 break; |
| 963 } |
| 964 } |
| 965 |
| 966 if (roffset == offset) { |
| 967 // we started with a non-L Jamo... |
| 968 // just say it comes from a single character |
| 969 roffset += 1; |
| 970 |
| 971 // See if the single Jamo has a 24-bit order. |
| 972 if (roffset < maxOffset && isContinuation(ceList->get(roffset)))
{ |
| 973 roffset += 1; |
| 974 } |
| 975 } |
| 976 |
| 977 if (roffset < maxOffset) { |
| 978 rlength = minLengthInChars(ceList, roffset, history); |
| 979 } |
| 980 |
| 981 if (rlength < 0) { |
| 982 return -1; |
| 983 } |
| 984 |
| 985 shortestLength = 1 + rlength; |
| 986 goto have_shortest; |
| 987 } |
| 988 |
| 989 // Can't handle it manually either. Just move on. |
| 990 return -1; |
| 991 } |
| 992 |
| 993 have_shortest: |
| 994 history[offset] = shortestLength; |
| 995 |
| 996 return shortestLength; |
| 997 } |
| 998 |
| 999 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const |
| 1000 { |
| 1001 int32_t clength = ceList->size(); |
| 1002 int32_t *history = NEW_ARRAY(int32_t, clength); |
| 1003 |
| 1004 for (int32_t i = 0; i < clength; i += 1) { |
| 1005 history[i] = -1; |
| 1006 } |
| 1007 |
| 1008 int32_t minLength = minLengthInChars(ceList, offset, history); |
| 1009 |
| 1010 DELETE_ARRAY(history); |
| 1011 |
| 1012 return minLength; |
| 1013 } |
| 1014 |
| 1015 CollData *CollData::open(UCollator *collator, UErrorCode &status) |
| 1016 { |
| 1017 if (U_FAILURE(status)) { |
| 1018 return NULL; |
| 1019 } |
| 1020 |
| 1021 CollDataCache *cache = getCollDataCache(); |
| 1022 |
| 1023 return cache->get(collator, status); |
| 1024 } |
| 1025 |
| 1026 void CollData::close(CollData *collData) |
| 1027 { |
| 1028 CollDataCache *cache = getCollDataCache(); |
| 1029 |
| 1030 cache->unref(collData); |
| 1031 } |
| 1032 |
| 1033 CollDataCache *CollData::collDataCache = NULL; |
| 1034 |
| 1035 CollDataCache *CollData::getCollDataCache() |
| 1036 { |
| 1037 UErrorCode status = U_ZERO_ERROR; |
| 1038 CollDataCache *cache = NULL; |
| 1039 |
| 1040 UMTX_CHECK(NULL, collDataCache, cache); |
| 1041 |
| 1042 if (cache == NULL) { |
| 1043 cache = new CollDataCache(status); |
| 1044 |
| 1045 if (U_FAILURE(status)) { |
| 1046 delete cache; |
| 1047 return NULL; |
| 1048 } |
| 1049 |
| 1050 umtx_lock(NULL); |
| 1051 if (collDataCache == NULL) { |
| 1052 collDataCache = cache; |
| 1053 |
| 1054 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); |
| 1055 } |
| 1056 umtx_unlock(NULL); |
| 1057 |
| 1058 if (collDataCache != cache) { |
| 1059 delete cache; |
| 1060 } |
| 1061 } |
| 1062 |
| 1063 return collDataCache; |
| 1064 } |
| 1065 |
| 1066 void CollData::freeCollDataCache() |
| 1067 { |
| 1068 CollDataCache *cache = NULL; |
| 1069 |
| 1070 UMTX_CHECK(NULL, collDataCache, cache); |
| 1071 |
| 1072 if (cache != NULL) { |
| 1073 umtx_lock(NULL); |
| 1074 if (collDataCache != NULL) { |
| 1075 collDataCache = NULL; |
| 1076 } else { |
| 1077 cache = NULL; |
| 1078 } |
| 1079 umtx_unlock(NULL); |
| 1080 |
| 1081 delete cache; |
| 1082 } |
| 1083 } |
| 1084 |
| 1085 void CollData::flushCollDataCache() |
| 1086 { |
| 1087 CollDataCache *cache = NULL; |
| 1088 |
| 1089 UMTX_CHECK(NULL, collDataCache, cache); |
| 1090 |
| 1091 // **** this will fail if the another **** |
| 1092 // **** thread deletes the cache here **** |
| 1093 if (cache != NULL) { |
| 1094 cache->flush(); |
| 1095 } |
| 1096 } |
| 1097 |
| 1098 U_NAMESPACE_END |
| 1099 |
| 1100 #endif // #if !UCONFIG_NO_COLLATION |
OLD | NEW |