Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(82)

Side by Side Diff: source/i18n/alphaindex.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/Makefile.in ('k') | source/i18n/anytrans.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2009-2013, International Business Machines Corporation and 3 * Copyright (C) 2009-2014, International Business Machines Corporation and
4 * others. All Rights Reserved. 4 * others. All Rights Reserved.
5 ******************************************************************************* 5 *******************************************************************************
6 */ 6 */
7 7
8 #include "unicode/utypes.h" 8 #include "unicode/utypes.h"
9 9
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 10 #if !UCONFIG_NO_COLLATION
11 11
12 #include "unicode/alphaindex.h" 12 #include "unicode/alphaindex.h"
13 #include "unicode/coleitr.h"
14 #include "unicode/coll.h" 13 #include "unicode/coll.h"
15 #include "unicode/localpointer.h" 14 #include "unicode/localpointer.h"
16 #include "unicode/normalizer2.h" 15 #include "unicode/normalizer2.h"
17 #include "unicode/tblcoll.h" 16 #include "unicode/tblcoll.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ulocdata.h" 18 #include "unicode/ulocdata.h"
19 #include "unicode/uniset.h" 19 #include "unicode/uniset.h"
20 #include "unicode/uobject.h" 20 #include "unicode/uobject.h"
21 #include "unicode/usetiter.h" 21 #include "unicode/usetiter.h"
22 #include "unicode/utf16.h" 22 #include "unicode/utf16.h"
23 23
24 #include "cmemory.h" 24 #include "cmemory.h"
25 #include "cstring.h" 25 #include "cstring.h"
26 #include "uassert.h" 26 #include "uassert.h"
27 #include "uvector.h" 27 #include "uvector.h"
28 #include "uvectr64.h"
28 29
29 //#include <string> 30 //#include <string>
30 //#include <iostream> 31 //#include <iostream>
31 32
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
33
34 U_NAMESPACE_BEGIN 33 U_NAMESPACE_BEGIN
35 34
36 namespace { 35 namespace {
37 36
38 /** 37 /**
39 * Prefix string for Chinese index buckets. 38 * Prefix string for Chinese index buckets.
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 39 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
41 */ 40 */
42 const UChar BASE[1] = { 0xFDD0 }; 41 const UChar BASE[1] = { 0xFDD0 };
43 const int32_t BASE_LENGTH = 1; 42 const int32_t BASE_LENGTH = 1;
(...skipping 278 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 errorCode = U_MEMORY_ALLOCATION_ERROR; 321 errorCode = U_MEMORY_ALLOCATION_ERROR;
323 return; 322 return;
324 } 323 }
325 checkDistinct = FALSE; 324 checkDistinct = FALSE;
326 } else { 325 } else {
327 checkDistinct = TRUE; 326 checkDistinct = TRUE;
328 } 327 }
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 328 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character. 329 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC ode) >= 0) { 330 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC ode) >= 0) {
332 // Ignore an index characters that will land in the overflow bucket. 331 // Ignore an index character that will land in the overflow bucket.
333 } else if (checkDistinct && 332 } else if (checkDistinct &&
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode ) == 0) { 333 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode ) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly 334 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters. 335 // from the sequence of its separate characters.
337 } else { 336 } else {
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *colla torPrimaryOnly_); 337 int32_t insertionPoint = binarySearch(indexCharacters, *item, *colla torPrimaryOnly_);
339 if (insertionPoint < 0) { 338 if (insertionPoint < 0) {
340 indexCharacters.insertElementAt( 339 indexCharacters.insertElementAt(
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e rrorCode); 340 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e rrorCode);
342 } else { 341 } else {
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 342 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlread yIn)) { 343 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlread yIn)) {
345 indexCharacters.setElementAt( 344 indexCharacters.setElementAt(
346 ownedString(*item, ownedItem, errorCode), insertionPoint ); 345 ownedString(*item, ownedItem, errorCode), insertionPoint );
347 } 346 }
348 } 347 }
349 } 348 }
350 } 349 }
351 if (U_FAILURE(errorCode)) { return; } 350 if (U_FAILURE(errorCode)) { return; }
352 351
353 // if the result is still too large, cut down to maxCount elements, by removing every nth element 352 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
354 353
355 int32_t size = indexCharacters.size() - 1; 354 int32_t size = indexCharacters.size() - 1;
356 if (size > maxLabelCount_) { 355 if (size > maxLabelCount_) {
357 int32_t count = 0; 356 int32_t count = 0;
358 int32_t old = -1; 357 int32_t old = -1;
359 for (int32_t i = 0; i < indexCharacters.size();) { 358 for (int32_t i = 0; i < indexCharacters.size();) {
360 ++count; 359 ++count;
361 int32_t bump = count * maxLabelCount_ / size; 360 int32_t bump = count * maxLabelCount_ / size;
362 if (bump == old) { 361 if (bump == old) {
363 indexCharacters.removeElementAt(i); 362 indexCharacters.removeElementAt(i);
(...skipping 22 matching lines...) Expand all
386 count /= 10; 385 count /= 10;
387 temp.insert(0, (UChar)(0x30 + count)); 386 temp.insert(0, (UChar)(0x30 + count));
388 } 387 }
389 } 388 }
390 return temp.append((UChar)0x5283); 389 return temp.append((UChar)0x5283);
391 } 390 }
392 return temp.setTo(current, BASE_LENGTH); 391 return temp.setTo(current, BASE_LENGTH);
393 } 392 }
394 393
395 UBool hasMultiplePrimaryWeights( 394 UBool hasMultiplePrimaryWeights(
396 CollationElementIterator &cei, int32_t variableTop, 395 const RuleBasedCollator &coll, uint32_t variableTop,
397 const UnicodeString &s, UErrorCode &errorCode) { 396 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
398 cei.setText(s, errorCode); 397 ces.removeAllElements();
398 coll.internalGetCEs(s, ces, errorCode);
399 if (U_FAILURE(errorCode)) { return FALSE; }
399 UBool seenPrimary = FALSE; 400 UBool seenPrimary = FALSE;
400 for (;;) { 401 for (int32_t i = 0; i < ces.size(); ++i) {
401 int32_t ce32 = cei.next(errorCode); 402 int64_t ce = ces.elementAti(i);
402 if (ce32 == CollationElementIterator::NULLORDER) { 403 uint32_t p = (uint32_t)(ce >> 32);
403 break; 404 if (p > variableTop) {
404 } 405 // not primary ignorable
405 int32_t p = CollationElementIterator::primaryOrder(ce32);
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407 // not primary ignorable, and not a continuation CE
408 if (seenPrimary) { 406 if (seenPrimary) {
409 return TRUE; 407 return TRUE;
410 } 408 }
411 seenPrimary = TRUE; 409 seenPrimary = TRUE;
412 } 410 }
413 } 411 }
414 return FALSE; 412 return FALSE;
415 } 413 }
416 414
417 } // namespace 415 } // namespace
418 416
419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 417 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420 // Initialize indexCharacters. 418 // Initialize indexCharacters.
421 UVector indexCharacters(errorCode); 419 UVector indexCharacters(errorCode);
422 indexCharacters.setDeleter(uprv_deleteUObject); 420 indexCharacters.setDeleter(uprv_deleteUObject);
423 initLabels(indexCharacters, errorCode); 421 initLabels(indexCharacters, errorCode);
424 if (U_FAILURE(errorCode)) { return NULL; } 422 if (U_FAILURE(errorCode)) { return NULL; }
425 423
426 // Variables for hasMultiplePrimaryWeights(). 424 // Variables for hasMultiplePrimaryWeights().
427 LocalPointer<CollationElementIterator> cei( 425 UVector64 ces(errorCode);
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); 426 uint32_t variableTop;
429 if (cei.isNull()) {
430 errorCode = U_MEMORY_ALLOCATION_ERROR;
431 return NULL;
432 }
433 int32_t variableTop;
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) = = UCOL_SHIFTED) { 427 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) = = UCOL_SHIFTED) {
435 variableTop = CollationElementIterator::primaryOrder( 428 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
437 } else { 429 } else {
438 variableTop = 0; 430 variableTop = 0;
439 } 431 }
440 UBool hasInvisibleBuckets = FALSE; 432 UBool hasInvisibleBuckets = FALSE;
441 433
442 // Helper arrays for Chinese Pinyin collation. 434 // Helper arrays for Chinese Pinyin collation.
443 Bucket *asciiBuckets[26] = { 435 Bucket *asciiBuckets[26] = {
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 436 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 437 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
446 }; 438 };
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
507 UChar c; 499 UChar c;
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5 A) { // A-Z 500 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5 A) { // A-Z
509 asciiBuckets[c - 0x41] = bucket; 501 asciiBuckets[c - 0x41] = bucket;
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS E, BASE_LENGTH) && 502 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS E, BASE_LENGTH) &&
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 503 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512 pinyinBuckets[c - 0x41] = bucket; 504 pinyinBuckets[c - 0x41] = bucket;
513 hasPinyin = TRUE; 505 hasPinyin = TRUE;
514 } 506 }
515 // Check for multiple primary weights. 507 // Check for multiple primary weights.
516 if (!current.startsWith(BASE, BASE_LENGTH) && 508 if (!current.startsWith(BASE, BASE_LENGTH) &&
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && 509 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, cu rrent,
510 ces, errorCode) &&
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW ith("\uffff") */) { 511 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW ith("\uffff") */) {
519 // "AE-ligature" or "Sch" etc. 512 // "AE-ligature" or "Sch" etc.
520 for (int32_t i = bucketList->size() - 2;; --i) { 513 for (int32_t i = bucketList->size() - 2;; --i) {
521 Bucket *singleBucket = getBucket(*bucketList, i); 514 Bucket *singleBucket = getBucket(*bucketList, i);
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 515 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523 // There is no single-character bucket since the last 516 // There is no single-character bucket since the last
524 // underflow or inflow label. 517 // underflow or inflow label.
525 break; 518 break;
526 } 519 }
527 if (singleBucket->displayBucket_ == NULL && 520 if (singleBucket->displayBucket_ == NULL &&
528 !hasMultiplePrimaryWeights( 521 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variab leTop,
529 *cei, variableTop, singleBucket->lowerBoundary_, err orCode)) { 522 singleBucket->lowerBoundary_,
523 ces, errorCode)) {
530 // Add an invisible bucket that redirects strings greater than the expansion 524 // Add an invisible bucket that redirects strings greater than the expansion
531 // to the previous single-character bucket. 525 // to the previous single-character bucket.
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S 526 // For example, after ... Q R S Sch we add Sch\uFFFF->S
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 527 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534 bucket = new Bucket(emptyString_, 528 bucket = new Bucket(emptyString_,
535 UnicodeString(current).append((UChar)0xFFFF), 529 UnicodeString(current).append((UChar)0xFFFF),
536 U_ALPHAINDEX_NORMAL); 530 U_ALPHAINDEX_NORMAL);
537 if (bucket == NULL) { 531 if (bucket == NULL) {
538 errorCode = U_MEMORY_ALLOCATION_ERROR; 532 errorCode = U_MEMORY_ALLOCATION_ERROR;
539 return NULL; 533 return NULL;
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after
703 } 697 }
704 } 698 }
705 699
706 void AlphabeticIndex::internalResetBucketIterator() { 700 void AlphabeticIndex::internalResetBucketIterator() {
707 labelsIterIndex_ = -1; 701 labelsIterIndex_ = -1;
708 currentBucket_ = NULL; 702 currentBucket_ = NULL;
709 } 703 }
710 704
711 705
712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status ) { 706 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status ) {
713 if (U_FAILURE(status)) { return; }
714 // Chinese index characters, which are specific to each of the several Chinese tailorings,
715 // take precedence over the single locale data exemplar set per language.
716 const char *language = locale.getLanguage();
717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
718 uprv_strcmp(language, "ko") == 0) {
719 // TODO: This should be done regardless of the language, but it's expensive.
720 // We should add a Collator function (can be @internal)
721 // to enumerate just the contractions that start with a given code point or string.
722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
723 return;
724 }
725 }
726
727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 707 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
728 if (U_FAILURE(status)) { 708 if (U_FAILURE(status)) {
729 return; 709 return;
730 } 710 }
731 711
732 UnicodeSet exemplars; 712 UnicodeSet exemplars;
733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I NDEX, &status); 713 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I NDEX, &status);
734 if (U_SUCCESS(status)) { 714 if (U_SUCCESS(status)) {
735 initialLabels_->addAll(exemplars); 715 initialLabels_->addAll(exemplars);
736 return; 716 return;
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
777 while (it.next()) { 757 while (it.next()) {
778 const UnicodeString &exemplarC = it.getString(); 758 const UnicodeString &exemplarC = it.getString();
779 upperC = exemplarC; 759 upperC = exemplarC;
780 upperC.toUpper(locale); 760 upperC.toUpper(locale);
781 initialLabels_->add(upperC); 761 initialLabels_->add(upperC);
782 } 762 }
783 } 763 }
784 764
785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 765 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
786 UnicodeSet contractions; 766 UnicodeSet contractions;
787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), 767 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCo de);
788 contractions.toUSet(), NULL, FALSE, &error Code); 768 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
789 if (U_FAILURE(errorCode)) { return FALSE; } 769 initialLabels_->addAll(contractions);
790 UnicodeString firstHanBoundary;
791 UBool hasPinyin = FALSE;
792 UnicodeSetIterator iter(contractions); 770 UnicodeSetIterator iter(contractions);
793 while (iter.next()) { 771 while (iter.next()) {
794 const UnicodeString &s = iter.getString(); 772 const UnicodeString &s = iter.getString();
795 if (s.startsWith(BASE, BASE_LENGTH)) { 773 U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
796 initialLabels_->add(s); 774 UChar c = s.charAt(s.length() - 1);
797 if (firstHanBoundary.isEmpty() || 775 if (0x41 <= c && c <= 0x5A) { // A-Z
798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode ) < 0) { 776 // There are Pinyin labels, add ASCII A-Z labels as well.
799 firstHanBoundary = s; 777 initialLabels_->add(0x41, 0x5A); // A-Z
800 } 778 break;
801 UChar c = s.charAt(s.length() - 1);
802 if (0x41 <= c && c <= 0x5A) { // A-Z
803 hasPinyin = TRUE;
804 }
805 } 779 }
806 } 780 }
807 if (hasPinyin) { 781 return TRUE;
808 initialLabels_->add(0x41, 0x5A); // A-Z
809 }
810 if (!firstHanBoundary.isEmpty()) {
811 // The hardcoded list of script boundaries includes U+4E00
812 // which is tailored to not be the first primary
813 // in all Chinese tailorings except "unihan".
814 // Replace U+4E00 with the first boundary string from the tailoring.
815 // TODO: This becomes obsolete when the root collator gets
816 // reliable script-first-primary mappings.
817 int32_t hanIndex = binarySearch(
818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPr imaryOnly_);
819 if (hanIndex >= 0) {
820 UnicodeString *fh = new UnicodeString(firstHanBoundary);
821 if (fh == NULL) {
822 errorCode = U_MEMORY_ALLOCATION_ERROR;
823 return FALSE;
824 }
825 firstCharsInScripts_->setElementAt(fh, hanIndex);
826 }
827 return TRUE;
828 } else {
829 return FALSE;
830 }
831 } 782 }
832 783
833 784
834 /* 785 /*
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 786 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
836 */ 787 */
837 static const UChar CGJ = 0x034F; 788 static const UChar CGJ = 0x034F;
838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 789 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
839 UnicodeString result; 790 UnicodeString result;
840 if (item.length() == 0) { 791 if (item.length() == 0) {
(...skipping 17 matching lines...) Expand all
858 return FALSE; 809 return FALSE;
859 } 810 }
860 811
861 812
862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 813 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
863 return FALSE; 814 return FALSE;
864 } 815 }
865 816
866 817
867 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 818 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
868 // There are no known non-RuleBasedCollator collators, and none ever expecte d. 819 return *collator_;
869 // But, in case that changes, better a null pointer than a wrong type.
870 return *dynamic_cast<RuleBasedCollator *>(collator_);
871 } 820 }
872 821
873 822
874 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 823 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
875 return inflowLabel_; 824 return inflowLabel_;
876 } 825 }
877 826
878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 827 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
879 return overflowLabel_; 828 return overflowLabel_;
880 } 829 }
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
940 if (initialLabels_ == NULL) { 889 if (initialLabels_ == NULL) {
941 status = U_MEMORY_ALLOCATION_ERROR; 890 status = U_MEMORY_ALLOCATION_ERROR;
942 return; 891 return;
943 } 892 }
944 893
945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 894 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
946 overflowLabel_ = inflowLabel_; 895 overflowLabel_ = inflowLabel_;
947 underflowLabel_ = inflowLabel_; 896 underflowLabel_ = inflowLabel_;
948 897
949 if (collator_ == NULL) { 898 if (collator_ == NULL) {
950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*l ocale, status)); 899 Collator *coll = Collator::createInstance(*locale, status);
951 if (U_FAILURE(status)) { return; } 900 if (U_FAILURE(status)) {
901 delete coll;
902 return;
903 }
904 if (coll == NULL) {
905 status = U_MEMORY_ALLOCATION_ERROR;
906 return;
907 }
908 collator_ = dynamic_cast<RuleBasedCollator *>(coll);
952 if (collator_ == NULL) { 909 if (collator_ == NULL) {
953 status = U_MEMORY_ALLOCATION_ERROR; 910 delete coll;
911 status = U_UNSUPPORTED_ERROR;
954 return; 912 return;
955 } 913 }
956 } 914 }
957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 915 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
958 if (collatorPrimaryOnly_ == NULL) { 916 if (collatorPrimaryOnly_ == NULL) {
959 status = U_MEMORY_ALLOCATION_ERROR; 917 status = U_MEMORY_ALLOCATION_ERROR;
960 return; 918 return;
961 } 919 }
962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 920 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
963 firstCharsInScripts_ = firstStringsInScript(status); 921 firstCharsInScripts_ = firstStringsInScript(status);
964 if (U_FAILURE(status)) { return; } 922 if (U_FAILURE(status)) { return; }
965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar yOnly_, status); 923 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar yOnly_, status);
966 UnicodeString _4E00((UChar)0x4E00);
967 UnicodeString _1100((UChar)0x1100);
968 UnicodeString _1112((UChar)0x1112);
969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
971 // The standard Korean tailoring sorts Hanja (Han characters)
972 // as secondary differences from Hangul syllables.
973 // This makes U+4E00 not useful as a Han-script boundary.
974 // TODO: This becomes obsolete when the root collator gets
975 // reliable script-first-primary mappings.
976 int32_t hanIndex = binarySearch(
977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
978 if (hanIndex >= 0) {
979 firstCharsInScripts_->removeElementAt(hanIndex);
980 }
981 }
982 // Guard against a degenerate collator where 924 // Guard against a degenerate collator where
983 // some script boundary strings are primary ignorable. 925 // some script boundary strings are primary ignorable.
984 for (;;) { 926 for (;;) {
985 if (U_FAILURE(status)) { return; } 927 if (U_FAILURE(status)) { return; }
986 if (firstCharsInScripts_->isEmpty()) { 928 if (firstCharsInScripts_->isEmpty()) {
987 // AlphabeticIndex requires some non-ignorable script boundary strings. 929 // AlphabeticIndex requires some non-ignorable script boundary strings.
988 status = U_ILLEGAL_ARGUMENT_ERROR; 930 status = U_ILLEGAL_ARGUMENT_ERROR;
989 return; 931 return;
990 } 932 }
991 if (collatorPrimaryOnly_->compare( 933 if (collatorPrimaryOnly_->compare(
992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0) ), 934 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0) ),
993 emptyString_, status) == UCOL_EQUAL) { 935 emptyString_, status) == UCOL_EQUAL) {
994 firstCharsInScripts_->removeElementAt(0); 936 firstCharsInScripts_->removeElementAt(0);
995 } else { 937 } else {
996 break; 938 break;
997 } 939 }
998 } 940 }
999 941
1000 if (locale != NULL) { 942 // Chinese index characters, which are specific to each of the several Chinese tailorings,
943 // take precedence over the single locale data exemplar set per language.
944 if (!addChineseIndexCharacters(status) && locale != NULL) {
1001 addIndexExemplars(*locale, status); 945 addIndexExemplars(*locale, status);
1002 } 946 }
1003 } 947 }
1004 948
1005 949
1006 // 950 //
1007 // Comparison function for UVector<UnicodeString *> sorting with a collator. 951 // Comparison function for UVector<UnicodeString *> sorting with a collator.
1008 // 952 //
1009 static int32_t U_CALLCONV 953 static int32_t U_CALLCONV
1010 collatorComparator(const void *context, const void *left, const void *right) { 954 collatorComparator(const void *context, const void *left, const void *right) {
(...skipping 24 matching lines...) Expand all
1035 recordCompareFn(const void *context, const void *left, const void *right) { 979 recordCompareFn(const void *context, const void *left, const void *right) {
1036 const UElement *leftElement = static_cast<const UElement *>(left); 980 const UElement *leftElement = static_cast<const UElement *>(left);
1037 const UElement *rightElement = static_cast<const UElement *>(right); 981 const UElement *rightElement = static_cast<const UElement *>(right);
1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex: :Record *>(leftElement->pointer); 982 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex: :Record *>(leftElement->pointer);
1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex: :Record *>(rightElement->pointer); 983 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex: :Record *>(rightElement->pointer);
1040 const Collator *col = static_cast<const Collator *>(context); 984 const Collator *col = static_cast<const Collator *>(context);
1041 UErrorCode errorCode = U_ZERO_ERROR; 985 UErrorCode errorCode = U_ZERO_ERROR;
1042 return col->compare(leftRec->name_, rightRec->name_, errorCode); 986 return col->compare(leftRec->name_, rightRec->name_, errorCode);
1043 } 987 }
1044 988
1045
1046 /**
1047 * This list contains one character per script that has the
1048 * lowest primary weight for that script in the root collator.
1049 * This list will be copied and sorted to account for script reordering.
1050 *
1051 * <p>TODO: This is fragile. If the first character of a script is tailored
1052 * so that it does not map to the script's lowest primary weight any more,
1053 * then the buckets will be off.
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1055 *
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1057 *
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1060 */
1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
1062 0x41, 0, 0x03B1, 0,
1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0x DD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1066 0xAAF2, 0, // Meetei Mayek
1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
1070 0x1B83, 0,
1071 0xD802, 0xDE00, 0, 0x0E01, 0,
1072 0x0EDE, 0, // Lao
1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
1080 0xD800, 0xDE80, 0,
1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1082 0xD801, 0xDC80, 0,
1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
1088 0x4E00, 0,
1089 // TODO: The overflow bucket's lowerBoundary string should be the
1090 // first item after the last reordering group in the collator's script order .
1091 // This should normally be the first Unicode code point
1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1093 // However, at least up to ICU 51 the Hani reordering group includes
1094 // unassigned code points,
1095 // and there is no stable string for the start of the trailing-weights range .
1096 // The only known string that sorts "high" is U+FFFF.
1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1098 // and fix relevant test code.
1099 // Ideally, FractionalUCA.txt will have a "script first primary"
1100 // for unassigned code points.
1101 0xFFFF, 0
1102 };
1103
1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 989 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1105 if (U_FAILURE(status)) { 990 if (U_FAILURE(status)) {
1106 return NULL; 991 return NULL;
1107 } 992 }
1108 UVector *dest = new UVector(status); 993 LocalPointer<UVector> dest(new UVector(status));
1109 if (dest == NULL) { 994 if (dest.isNull()) {
1110 status = U_MEMORY_ALLOCATION_ERROR; 995 status = U_MEMORY_ALLOCATION_ERROR;
1111 return NULL; 996 return NULL;
1112 } 997 }
1113 dest->setDeleter(uprv_deleteUObject); 998 dest->setDeleter(uprv_deleteUObject);
1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; 999 // Fetch the script-first-primary contractions which are defined in the root collator.
1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); 1000 // They all start with U+FDD1.
1116 do { 1001 UnicodeSet set;
1117 if (U_FAILURE(status)) { 1002 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
1118 return dest; 1003 if (U_FAILURE(status)) {
1004 return NULL;
1005 }
1006 if (set.isEmpty()) {
1007 status = U_UNSUPPORTED_ERROR;
1008 return NULL;
1009 }
1010 UnicodeSetIterator iter(set);
1011 while (iter.next()) {
1012 const UnicodeString &boundary = iter.getString();
1013 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
1014 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
1015 // Ignore boundaries for the special reordering groups.
1016 // Take only those for "real scripts" (where the sample character is a Letter,
1017 // and the one for unassigned implicit weights (Cn).
1018 continue;
1119 } 1019 }
1120 UnicodeString *str = new UnicodeString(src, -1); 1020 UnicodeString *s = new UnicodeString(boundary);
1121 if (str == NULL) { 1021 if (s == NULL) {
1122 status = U_MEMORY_ALLOCATION_ERROR; 1022 status = U_MEMORY_ALLOCATION_ERROR;
1123 return dest; 1023 return NULL;
1124 } 1024 }
1125 dest->addElement(str, status); 1025 dest->addElement(s, status);
1126 src += str->length() + 1; 1026 }
1127 } while (src < limit); 1027 return dest.orphan();
1128 return dest;
1129 } 1028 }
1130 1029
1131 1030
1132 namespace { 1031 namespace {
1133 1032
1134 /** 1033 /**
1135 * Returns true if one index character string is "better" than the other. 1034 * Returns true if one index character string is "better" than the other.
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1035 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1137 * better, and otherwise binary-less-than is better. 1036 * better, and otherwise binary-less-than is better.
1138 */ 1037 */
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after
1340 records_(NULL) { 1239 records_(NULL) {
1341 } 1240 }
1342 1241
1343 1242
1344 AlphabeticIndex::Bucket::~Bucket() { 1243 AlphabeticIndex::Bucket::~Bucket() {
1345 delete records_; 1244 delete records_;
1346 } 1245 }
1347 1246
1348 U_NAMESPACE_END 1247 U_NAMESPACE_END
1349 1248
1350 #endif 1249 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/Makefile.in ('k') | source/i18n/anytrans.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698