OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2009-2013, International Business Machines Corporation and | 3 * Copyright (C) 2009-2014, International Business Machines Corporation and |
4 * others. All Rights Reserved. | 4 * others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION | 10 #if !UCONFIG_NO_COLLATION |
11 | 11 |
12 #include "unicode/alphaindex.h" | 12 #include "unicode/alphaindex.h" |
13 #include "unicode/coleitr.h" | |
14 #include "unicode/coll.h" | 13 #include "unicode/coll.h" |
15 #include "unicode/localpointer.h" | 14 #include "unicode/localpointer.h" |
16 #include "unicode/normalizer2.h" | 15 #include "unicode/normalizer2.h" |
17 #include "unicode/tblcoll.h" | 16 #include "unicode/tblcoll.h" |
| 17 #include "unicode/uchar.h" |
18 #include "unicode/ulocdata.h" | 18 #include "unicode/ulocdata.h" |
19 #include "unicode/uniset.h" | 19 #include "unicode/uniset.h" |
20 #include "unicode/uobject.h" | 20 #include "unicode/uobject.h" |
21 #include "unicode/usetiter.h" | 21 #include "unicode/usetiter.h" |
22 #include "unicode/utf16.h" | 22 #include "unicode/utf16.h" |
23 | 23 |
24 #include "cmemory.h" | 24 #include "cmemory.h" |
25 #include "cstring.h" | 25 #include "cstring.h" |
26 #include "uassert.h" | 26 #include "uassert.h" |
27 #include "uvector.h" | 27 #include "uvector.h" |
| 28 #include "uvectr64.h" |
28 | 29 |
29 //#include <string> | 30 //#include <string> |
30 //#include <iostream> | 31 //#include <iostream> |
31 | 32 |
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
33 | |
34 U_NAMESPACE_BEGIN | 33 U_NAMESPACE_BEGIN |
35 | 34 |
36 namespace { | 35 namespace { |
37 | 36 |
38 /** | 37 /** |
39 * Prefix string for Chinese index buckets. | 38 * Prefix string for Chinese index buckets. |
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collat
ion_Indexes | 39 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collat
ion_Indexes |
41 */ | 40 */ |
42 const UChar BASE[1] = { 0xFDD0 }; | 41 const UChar BASE[1] = { 0xFDD0 }; |
43 const int32_t BASE_LENGTH = 1; | 42 const int32_t BASE_LENGTH = 1; |
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
322 errorCode = U_MEMORY_ALLOCATION_ERROR; | 321 errorCode = U_MEMORY_ALLOCATION_ERROR; |
323 return; | 322 return; |
324 } | 323 } |
325 checkDistinct = FALSE; | 324 checkDistinct = FALSE; |
326 } else { | 325 } else { |
327 checkDistinct = TRUE; | 326 checkDistinct = TRUE; |
328 } | 327 } |
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode)
< 0) { | 328 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode)
< 0) { |
330 // Ignore a primary-ignorable or non-alphabetic index character. | 329 // Ignore a primary-ignorable or non-alphabetic index character. |
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC
ode) >= 0) { | 330 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC
ode) >= 0) { |
332 // Ignore an index characters that will land in the overflow bucket. | 331 // Ignore an index character that will land in the overflow bucket. |
333 } else if (checkDistinct && | 332 } else if (checkDistinct && |
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode
) == 0) { | 333 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode
) == 0) { |
335 // Ignore a multi-code point index character that does not sort dist
inctly | 334 // Ignore a multi-code point index character that does not sort dist
inctly |
336 // from the sequence of its separate characters. | 335 // from the sequence of its separate characters. |
337 } else { | 336 } else { |
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *colla
torPrimaryOnly_); | 337 int32_t insertionPoint = binarySearch(indexCharacters, *item, *colla
torPrimaryOnly_); |
339 if (insertionPoint < 0) { | 338 if (insertionPoint < 0) { |
340 indexCharacters.insertElementAt( | 339 indexCharacters.insertElementAt( |
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e
rrorCode); | 340 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e
rrorCode); |
342 } else { | 341 } else { |
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters,
insertionPoint); | 342 const UnicodeString &itemAlreadyIn = *getString(indexCharacters,
insertionPoint); |
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlread
yIn)) { | 343 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlread
yIn)) { |
345 indexCharacters.setElementAt( | 344 indexCharacters.setElementAt( |
346 ownedString(*item, ownedItem, errorCode), insertionPoint
); | 345 ownedString(*item, ownedItem, errorCode), insertionPoint
); |
347 } | 346 } |
348 } | 347 } |
349 } | 348 } |
350 } | 349 } |
351 if (U_FAILURE(errorCode)) { return; } | 350 if (U_FAILURE(errorCode)) { return; } |
352 | 351 |
353 // if the result is still too large, cut down to maxCount elements, by remov
ing every nth element | 352 // if the result is still too large, cut down to maxLabelCount_ elements, by
removing every nth element |
354 | 353 |
355 int32_t size = indexCharacters.size() - 1; | 354 int32_t size = indexCharacters.size() - 1; |
356 if (size > maxLabelCount_) { | 355 if (size > maxLabelCount_) { |
357 int32_t count = 0; | 356 int32_t count = 0; |
358 int32_t old = -1; | 357 int32_t old = -1; |
359 for (int32_t i = 0; i < indexCharacters.size();) { | 358 for (int32_t i = 0; i < indexCharacters.size();) { |
360 ++count; | 359 ++count; |
361 int32_t bump = count * maxLabelCount_ / size; | 360 int32_t bump = count * maxLabelCount_ / size; |
362 if (bump == old) { | 361 if (bump == old) { |
363 indexCharacters.removeElementAt(i); | 362 indexCharacters.removeElementAt(i); |
(...skipping 22 matching lines...) Expand all Loading... |
386 count /= 10; | 385 count /= 10; |
387 temp.insert(0, (UChar)(0x30 + count)); | 386 temp.insert(0, (UChar)(0x30 + count)); |
388 } | 387 } |
389 } | 388 } |
390 return temp.append((UChar)0x5283); | 389 return temp.append((UChar)0x5283); |
391 } | 390 } |
392 return temp.setTo(current, BASE_LENGTH); | 391 return temp.setTo(current, BASE_LENGTH); |
393 } | 392 } |
394 | 393 |
395 UBool hasMultiplePrimaryWeights( | 394 UBool hasMultiplePrimaryWeights( |
396 CollationElementIterator &cei, int32_t variableTop, | 395 const RuleBasedCollator &coll, uint32_t variableTop, |
397 const UnicodeString &s, UErrorCode &errorCode) { | 396 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { |
398 cei.setText(s, errorCode); | 397 ces.removeAllElements(); |
| 398 coll.internalGetCEs(s, ces, errorCode); |
| 399 if (U_FAILURE(errorCode)) { return FALSE; } |
399 UBool seenPrimary = FALSE; | 400 UBool seenPrimary = FALSE; |
400 for (;;) { | 401 for (int32_t i = 0; i < ces.size(); ++i) { |
401 int32_t ce32 = cei.next(errorCode); | 402 int64_t ce = ces.elementAti(i); |
402 if (ce32 == CollationElementIterator::NULLORDER) { | 403 uint32_t p = (uint32_t)(ce >> 32); |
403 break; | 404 if (p > variableTop) { |
404 } | 405 // not primary ignorable |
405 int32_t p = CollationElementIterator::primaryOrder(ce32); | |
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) { | |
407 // not primary ignorable, and not a continuation CE | |
408 if (seenPrimary) { | 406 if (seenPrimary) { |
409 return TRUE; | 407 return TRUE; |
410 } | 408 } |
411 seenPrimary = TRUE; | 409 seenPrimary = TRUE; |
412 } | 410 } |
413 } | 411 } |
414 return FALSE; | 412 return FALSE; |
415 } | 413 } |
416 | 414 |
417 } // namespace | 415 } // namespace |
418 | 416 |
419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { | 417 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
420 // Initialize indexCharacters. | 418 // Initialize indexCharacters. |
421 UVector indexCharacters(errorCode); | 419 UVector indexCharacters(errorCode); |
422 indexCharacters.setDeleter(uprv_deleteUObject); | 420 indexCharacters.setDeleter(uprv_deleteUObject); |
423 initLabels(indexCharacters, errorCode); | 421 initLabels(indexCharacters, errorCode); |
424 if (U_FAILURE(errorCode)) { return NULL; } | 422 if (U_FAILURE(errorCode)) { return NULL; } |
425 | 423 |
426 // Variables for hasMultiplePrimaryWeights(). | 424 // Variables for hasMultiplePrimaryWeights(). |
427 LocalPointer<CollationElementIterator> cei( | 425 UVector64 ces(errorCode); |
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); | 426 uint32_t variableTop; |
429 if (cei.isNull()) { | |
430 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
431 return NULL; | |
432 } | |
433 int32_t variableTop; | |
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) =
= UCOL_SHIFTED) { | 427 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) =
= UCOL_SHIFTED) { |
435 variableTop = CollationElementIterator::primaryOrder( | 428 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); |
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); | |
437 } else { | 429 } else { |
438 variableTop = 0; | 430 variableTop = 0; |
439 } | 431 } |
440 UBool hasInvisibleBuckets = FALSE; | 432 UBool hasInvisibleBuckets = FALSE; |
441 | 433 |
442 // Helper arrays for Chinese Pinyin collation. | 434 // Helper arrays for Chinese Pinyin collation. |
443 Bucket *asciiBuckets[26] = { | 435 Bucket *asciiBuckets[26] = { |
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, | 436 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, |
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL | 437 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL |
446 }; | 438 }; |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
507 UChar c; | 499 UChar c; |
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5
A) { // A-Z | 500 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5
A) { // A-Z |
509 asciiBuckets[c - 0x41] = bucket; | 501 asciiBuckets[c - 0x41] = bucket; |
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS
E, BASE_LENGTH) && | 502 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS
E, BASE_LENGTH) && |
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { | 503 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { |
512 pinyinBuckets[c - 0x41] = bucket; | 504 pinyinBuckets[c - 0x41] = bucket; |
513 hasPinyin = TRUE; | 505 hasPinyin = TRUE; |
514 } | 506 } |
515 // Check for multiple primary weights. | 507 // Check for multiple primary weights. |
516 if (!current.startsWith(BASE, BASE_LENGTH) && | 508 if (!current.startsWith(BASE, BASE_LENGTH) && |
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode)
&& | 509 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, cu
rrent, |
| 510 ces, errorCode) && |
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW
ith("\uffff") */) { | 511 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW
ith("\uffff") */) { |
519 // "AE-ligature" or "Sch" etc. | 512 // "AE-ligature" or "Sch" etc. |
520 for (int32_t i = bucketList->size() - 2;; --i) { | 513 for (int32_t i = bucketList->size() - 2;; --i) { |
521 Bucket *singleBucket = getBucket(*bucketList, i); | 514 Bucket *singleBucket = getBucket(*bucketList, i); |
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { | 515 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { |
523 // There is no single-character bucket since the last | 516 // There is no single-character bucket since the last |
524 // underflow or inflow label. | 517 // underflow or inflow label. |
525 break; | 518 break; |
526 } | 519 } |
527 if (singleBucket->displayBucket_ == NULL && | 520 if (singleBucket->displayBucket_ == NULL && |
528 !hasMultiplePrimaryWeights( | 521 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variab
leTop, |
529 *cei, variableTop, singleBucket->lowerBoundary_, err
orCode)) { | 522 singleBucket->lowerBoundary_, |
| 523 ces, errorCode)) { |
530 // Add an invisible bucket that redirects strings greater th
an the expansion | 524 // Add an invisible bucket that redirects strings greater th
an the expansion |
531 // to the previous single-character bucket. | 525 // to the previous single-character bucket. |
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S | 526 // For example, after ... Q R S Sch we add Sch\uFFFF->S |
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. | 527 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. |
534 bucket = new Bucket(emptyString_, | 528 bucket = new Bucket(emptyString_, |
535 UnicodeString(current).append((UChar)0xFFFF), | 529 UnicodeString(current).append((UChar)0xFFFF), |
536 U_ALPHAINDEX_NORMAL); | 530 U_ALPHAINDEX_NORMAL); |
537 if (bucket == NULL) { | 531 if (bucket == NULL) { |
538 errorCode = U_MEMORY_ALLOCATION_ERROR; | 532 errorCode = U_MEMORY_ALLOCATION_ERROR; |
539 return NULL; | 533 return NULL; |
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
703 } | 697 } |
704 } | 698 } |
705 | 699 |
706 void AlphabeticIndex::internalResetBucketIterator() { | 700 void AlphabeticIndex::internalResetBucketIterator() { |
707 labelsIterIndex_ = -1; | 701 labelsIterIndex_ = -1; |
708 currentBucket_ = NULL; | 702 currentBucket_ = NULL; |
709 } | 703 } |
710 | 704 |
711 | 705 |
712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
) { | 706 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
) { |
713 if (U_FAILURE(status)) { return; } | |
714 // Chinese index characters, which are specific to each of the several Chine
se tailorings, | |
715 // take precedence over the single locale data exemplar set per language. | |
716 const char *language = locale.getLanguage(); | |
717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || | |
718 uprv_strcmp(language, "ko") == 0) { | |
719 // TODO: This should be done regardless of the language, but it's expens
ive. | |
720 // We should add a Collator function (can be @internal) | |
721 // to enumerate just the contractions that start with a given code point
or string. | |
722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) { | |
723 return; | |
724 } | |
725 } | |
726 | |
727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); | 707 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
728 if (U_FAILURE(status)) { | 708 if (U_FAILURE(status)) { |
729 return; | 709 return; |
730 } | 710 } |
731 | 711 |
732 UnicodeSet exemplars; | 712 UnicodeSet exemplars; |
733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I
NDEX, &status); | 713 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I
NDEX, &status); |
734 if (U_SUCCESS(status)) { | 714 if (U_SUCCESS(status)) { |
735 initialLabels_->addAll(exemplars); | 715 initialLabels_->addAll(exemplars); |
736 return; | 716 return; |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
777 while (it.next()) { | 757 while (it.next()) { |
778 const UnicodeString &exemplarC = it.getString(); | 758 const UnicodeString &exemplarC = it.getString(); |
779 upperC = exemplarC; | 759 upperC = exemplarC; |
780 upperC.toUpper(locale); | 760 upperC.toUpper(locale); |
781 initialLabels_->add(upperC); | 761 initialLabels_->add(upperC); |
782 } | 762 } |
783 } | 763 } |
784 | 764 |
785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { | 765 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { |
786 UnicodeSet contractions; | 766 UnicodeSet contractions; |
787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), | 767 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCo
de); |
788 contractions.toUSet(), NULL, FALSE, &error
Code); | 768 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } |
789 if (U_FAILURE(errorCode)) { return FALSE; } | 769 initialLabels_->addAll(contractions); |
790 UnicodeString firstHanBoundary; | |
791 UBool hasPinyin = FALSE; | |
792 UnicodeSetIterator iter(contractions); | 770 UnicodeSetIterator iter(contractions); |
793 while (iter.next()) { | 771 while (iter.next()) { |
794 const UnicodeString &s = iter.getString(); | 772 const UnicodeString &s = iter.getString(); |
795 if (s.startsWith(BASE, BASE_LENGTH)) { | 773 U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); |
796 initialLabels_->add(s); | 774 UChar c = s.charAt(s.length() - 1); |
797 if (firstHanBoundary.isEmpty() || | 775 if (0x41 <= c && c <= 0x5A) { // A-Z |
798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode
) < 0) { | 776 // There are Pinyin labels, add ASCII A-Z labels as well. |
799 firstHanBoundary = s; | 777 initialLabels_->add(0x41, 0x5A); // A-Z |
800 } | 778 break; |
801 UChar c = s.charAt(s.length() - 1); | |
802 if (0x41 <= c && c <= 0x5A) { // A-Z | |
803 hasPinyin = TRUE; | |
804 } | |
805 } | 779 } |
806 } | 780 } |
807 if (hasPinyin) { | 781 return TRUE; |
808 initialLabels_->add(0x41, 0x5A); // A-Z | |
809 } | |
810 if (!firstHanBoundary.isEmpty()) { | |
811 // The hardcoded list of script boundaries includes U+4E00 | |
812 // which is tailored to not be the first primary | |
813 // in all Chinese tailorings except "unihan". | |
814 // Replace U+4E00 with the first boundary string from the tailoring. | |
815 // TODO: This becomes obsolete when the root collator gets | |
816 // reliable script-first-primary mappings. | |
817 int32_t hanIndex = binarySearch( | |
818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPr
imaryOnly_); | |
819 if (hanIndex >= 0) { | |
820 UnicodeString *fh = new UnicodeString(firstHanBoundary); | |
821 if (fh == NULL) { | |
822 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
823 return FALSE; | |
824 } | |
825 firstCharsInScripts_->setElementAt(fh, hanIndex); | |
826 } | |
827 return TRUE; | |
828 } else { | |
829 return FALSE; | |
830 } | |
831 } | 782 } |
832 | 783 |
833 | 784 |
834 /* | 785 /* |
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoi
nts. | 786 * Return the string with interspersed CGJs. Input must have more than 2 codepoi
nts. |
836 */ | 787 */ |
837 static const UChar CGJ = 0x034F; | 788 static const UChar CGJ = 0x034F; |
838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { | 789 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { |
839 UnicodeString result; | 790 UnicodeString result; |
840 if (item.length() == 0) { | 791 if (item.length() == 0) { |
(...skipping 17 matching lines...) Expand all Loading... |
858 return FALSE; | 809 return FALSE; |
859 } | 810 } |
860 | 811 |
861 | 812 |
862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { | 813 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
863 return FALSE; | 814 return FALSE; |
864 } | 815 } |
865 | 816 |
866 | 817 |
867 const RuleBasedCollator &AlphabeticIndex::getCollator() const { | 818 const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
868 // There are no known non-RuleBasedCollator collators, and none ever expecte
d. | 819 return *collator_; |
869 // But, in case that changes, better a null pointer than a wrong type. | |
870 return *dynamic_cast<RuleBasedCollator *>(collator_); | |
871 } | 820 } |
872 | 821 |
873 | 822 |
874 const UnicodeString &AlphabeticIndex::getInflowLabel() const { | 823 const UnicodeString &AlphabeticIndex::getInflowLabel() const { |
875 return inflowLabel_; | 824 return inflowLabel_; |
876 } | 825 } |
877 | 826 |
878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { | 827 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { |
879 return overflowLabel_; | 828 return overflowLabel_; |
880 } | 829 } |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
940 if (initialLabels_ == NULL) { | 889 if (initialLabels_ == NULL) { |
941 status = U_MEMORY_ALLOCATION_ERROR; | 890 status = U_MEMORY_ALLOCATION_ERROR; |
942 return; | 891 return; |
943 } | 892 } |
944 | 893 |
945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis | 894 inflowLabel_.setTo((UChar)0x2026); // Ellipsis |
946 overflowLabel_ = inflowLabel_; | 895 overflowLabel_ = inflowLabel_; |
947 underflowLabel_ = inflowLabel_; | 896 underflowLabel_ = inflowLabel_; |
948 | 897 |
949 if (collator_ == NULL) { | 898 if (collator_ == NULL) { |
950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*l
ocale, status)); | 899 Collator *coll = Collator::createInstance(*locale, status); |
951 if (U_FAILURE(status)) { return; } | 900 if (U_FAILURE(status)) { |
| 901 delete coll; |
| 902 return; |
| 903 } |
| 904 if (coll == NULL) { |
| 905 status = U_MEMORY_ALLOCATION_ERROR; |
| 906 return; |
| 907 } |
| 908 collator_ = dynamic_cast<RuleBasedCollator *>(coll); |
952 if (collator_ == NULL) { | 909 if (collator_ == NULL) { |
953 status = U_MEMORY_ALLOCATION_ERROR; | 910 delete coll; |
| 911 status = U_UNSUPPORTED_ERROR; |
954 return; | 912 return; |
955 } | 913 } |
956 } | 914 } |
957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); | 915 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); |
958 if (collatorPrimaryOnly_ == NULL) { | 916 if (collatorPrimaryOnly_ == NULL) { |
959 status = U_MEMORY_ALLOCATION_ERROR; | 917 status = U_MEMORY_ALLOCATION_ERROR; |
960 return; | 918 return; |
961 } | 919 } |
962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); | 920 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); |
963 firstCharsInScripts_ = firstStringsInScript(status); | 921 firstCharsInScripts_ = firstStringsInScript(status); |
964 if (U_FAILURE(status)) { return; } | 922 if (U_FAILURE(status)) { return; } |
965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar
yOnly_, status); | 923 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar
yOnly_, status); |
966 UnicodeString _4E00((UChar)0x4E00); | |
967 UnicodeString _1100((UChar)0x1100); | |
968 UnicodeString _1112((UChar)0x1112); | |
969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && | |
970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { | |
971 // The standard Korean tailoring sorts Hanja (Han characters) | |
972 // as secondary differences from Hangul syllables. | |
973 // This makes U+4E00 not useful as a Han-script boundary. | |
974 // TODO: This becomes obsolete when the root collator gets | |
975 // reliable script-first-primary mappings. | |
976 int32_t hanIndex = binarySearch( | |
977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); | |
978 if (hanIndex >= 0) { | |
979 firstCharsInScripts_->removeElementAt(hanIndex); | |
980 } | |
981 } | |
982 // Guard against a degenerate collator where | 924 // Guard against a degenerate collator where |
983 // some script boundary strings are primary ignorable. | 925 // some script boundary strings are primary ignorable. |
984 for (;;) { | 926 for (;;) { |
985 if (U_FAILURE(status)) { return; } | 927 if (U_FAILURE(status)) { return; } |
986 if (firstCharsInScripts_->isEmpty()) { | 928 if (firstCharsInScripts_->isEmpty()) { |
987 // AlphabeticIndex requires some non-ignorable script boundary strin
gs. | 929 // AlphabeticIndex requires some non-ignorable script boundary strin
gs. |
988 status = U_ILLEGAL_ARGUMENT_ERROR; | 930 status = U_ILLEGAL_ARGUMENT_ERROR; |
989 return; | 931 return; |
990 } | 932 } |
991 if (collatorPrimaryOnly_->compare( | 933 if (collatorPrimaryOnly_->compare( |
992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)
), | 934 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)
), |
993 emptyString_, status) == UCOL_EQUAL) { | 935 emptyString_, status) == UCOL_EQUAL) { |
994 firstCharsInScripts_->removeElementAt(0); | 936 firstCharsInScripts_->removeElementAt(0); |
995 } else { | 937 } else { |
996 break; | 938 break; |
997 } | 939 } |
998 } | 940 } |
999 | 941 |
1000 if (locale != NULL) { | 942 // Chinese index characters, which are specific to each of the several Chine
se tailorings, |
| 943 // take precedence over the single locale data exemplar set per language. |
| 944 if (!addChineseIndexCharacters(status) && locale != NULL) { |
1001 addIndexExemplars(*locale, status); | 945 addIndexExemplars(*locale, status); |
1002 } | 946 } |
1003 } | 947 } |
1004 | 948 |
1005 | 949 |
1006 // | 950 // |
1007 // Comparison function for UVector<UnicodeString *> sorting with a collator. | 951 // Comparison function for UVector<UnicodeString *> sorting with a collator. |
1008 // | 952 // |
1009 static int32_t U_CALLCONV | 953 static int32_t U_CALLCONV |
1010 collatorComparator(const void *context, const void *left, const void *right) { | 954 collatorComparator(const void *context, const void *left, const void *right) { |
(...skipping 24 matching lines...) Expand all Loading... |
1035 recordCompareFn(const void *context, const void *left, const void *right) { | 979 recordCompareFn(const void *context, const void *left, const void *right) { |
1036 const UElement *leftElement = static_cast<const UElement *>(left); | 980 const UElement *leftElement = static_cast<const UElement *>(left); |
1037 const UElement *rightElement = static_cast<const UElement *>(right); | 981 const UElement *rightElement = static_cast<const UElement *>(right); |
1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex:
:Record *>(leftElement->pointer); | 982 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex:
:Record *>(leftElement->pointer); |
1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex:
:Record *>(rightElement->pointer); | 983 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex:
:Record *>(rightElement->pointer); |
1040 const Collator *col = static_cast<const Collator *>(context); | 984 const Collator *col = static_cast<const Collator *>(context); |
1041 UErrorCode errorCode = U_ZERO_ERROR; | 985 UErrorCode errorCode = U_ZERO_ERROR; |
1042 return col->compare(leftRec->name_, rightRec->name_, errorCode); | 986 return col->compare(leftRec->name_, rightRec->name_, errorCode); |
1043 } | 987 } |
1044 | 988 |
1045 | |
1046 /** | |
1047 * This list contains one character per script that has the | |
1048 * lowest primary weight for that script in the root collator. | |
1049 * This list will be copied and sorted to account for script reordering. | |
1050 * | |
1051 * <p>TODO: This is fragile. If the first character of a script is tailored | |
1052 * so that it does not map to the script's lowest primary weight any more, | |
1053 * then the buckets will be off. | |
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00. | |
1055 * | |
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-befor
e a. | |
1057 * | |
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in | |
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java | |
1060 */ | |
1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { | |
1062 0x41, 0, 0x03B1, 0, | |
1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0x
DD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, | |
1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0,
0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, | |
1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, | |
1066 0xAAF2, 0, // Meetei Mayek | |
1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, | |
1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada | |
1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri | |
1070 0x1B83, 0, | |
1071 0xD802, 0xDE00, 0, 0x0E01, 0, | |
1072 0x0EDE, 0, // Lao | |
1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, | |
1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, | |
1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma | |
1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, | |
1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0,
0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, | |
1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0,
0xA4F8, 0, | |
1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao | |
1080 0xD800, 0xDE80, 0, | |
1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0,
0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, | |
1082 0xD801, 0xDC80, 0, | |
1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng | |
1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0,
0xD802, 0xDC40, 0, | |
1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0,
0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, | |
1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive | |
1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs | |
1088 0x4E00, 0, | |
1089 // TODO: The overflow bucket's lowerBoundary string should be the | |
1090 // first item after the last reordering group in the collator's script order
. | |
1091 // This should normally be the first Unicode code point | |
1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored. | |
1093 // However, at least up to ICU 51 the Hani reordering group includes | |
1094 // unassigned code points, | |
1095 // and there is no stable string for the start of the trailing-weights range
. | |
1096 // The only known string that sorts "high" is U+FFFF. | |
1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix
this, | |
1098 // and fix relevant test code. | |
1099 // Ideally, FractionalUCA.txt will have a "script first primary" | |
1100 // for unassigned code points. | |
1101 0xFFFF, 0 | |
1102 }; | |
1103 | |
1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { | 989 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
1105 if (U_FAILURE(status)) { | 990 if (U_FAILURE(status)) { |
1106 return NULL; | 991 return NULL; |
1107 } | 992 } |
1108 UVector *dest = new UVector(status); | 993 LocalPointer<UVector> dest(new UVector(status)); |
1109 if (dest == NULL) { | 994 if (dest.isNull()) { |
1110 status = U_MEMORY_ALLOCATION_ERROR; | 995 status = U_MEMORY_ALLOCATION_ERROR; |
1111 return NULL; | 996 return NULL; |
1112 } | 997 } |
1113 dest->setDeleter(uprv_deleteUObject); | 998 dest->setDeleter(uprv_deleteUObject); |
1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; | 999 // Fetch the script-first-primary contractions which are defined in the root
collator. |
1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); | 1000 // They all start with U+FDD1. |
1116 do { | 1001 UnicodeSet set; |
1117 if (U_FAILURE(status)) { | 1002 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); |
1118 return dest; | 1003 if (U_FAILURE(status)) { |
| 1004 return NULL; |
| 1005 } |
| 1006 if (set.isEmpty()) { |
| 1007 status = U_UNSUPPORTED_ERROR; |
| 1008 return NULL; |
| 1009 } |
| 1010 UnicodeSetIterator iter(set); |
| 1011 while (iter.next()) { |
| 1012 const UnicodeString &boundary = iter.getString(); |
| 1013 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); |
| 1014 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { |
| 1015 // Ignore boundaries for the special reordering groups. |
| 1016 // Take only those for "real scripts" (where the sample character is
a Letter, |
| 1017 // and the one for unassigned implicit weights (Cn). |
| 1018 continue; |
1119 } | 1019 } |
1120 UnicodeString *str = new UnicodeString(src, -1); | 1020 UnicodeString *s = new UnicodeString(boundary); |
1121 if (str == NULL) { | 1021 if (s == NULL) { |
1122 status = U_MEMORY_ALLOCATION_ERROR; | 1022 status = U_MEMORY_ALLOCATION_ERROR; |
1123 return dest; | 1023 return NULL; |
1124 } | 1024 } |
1125 dest->addElement(str, status); | 1025 dest->addElement(s, status); |
1126 src += str->length() + 1; | 1026 } |
1127 } while (src < limit); | 1027 return dest.orphan(); |
1128 return dest; | |
1129 } | 1028 } |
1130 | 1029 |
1131 | 1030 |
1132 namespace { | 1031 namespace { |
1133 | 1032 |
1134 /** | 1033 /** |
1135 * Returns true if one index character string is "better" than the other. | 1034 * Returns true if one index character string is "better" than the other. |
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is | 1035 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is |
1137 * better, and otherwise binary-less-than is better. | 1036 * better, and otherwise binary-less-than is better. |
1138 */ | 1037 */ |
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1340 records_(NULL) { | 1239 records_(NULL) { |
1341 } | 1240 } |
1342 | 1241 |
1343 | 1242 |
1344 AlphabeticIndex::Bucket::~Bucket() { | 1243 AlphabeticIndex::Bucket::~Bucket() { |
1345 delete records_; | 1244 delete records_; |
1346 } | 1245 } |
1347 | 1246 |
1348 U_NAMESPACE_END | 1247 U_NAMESPACE_END |
1349 | 1248 |
1350 #endif | 1249 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |