source/i18n/alphaindex.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/alphaindex.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 *******************************************************************************	2 *******************************************************************************

3 * Copyright (C) 2009-2013, International Business Machines Corporation and	3 * Copyright (C) 2009-2014, International Business Machines Corporation and

4 * others. All Rights Reserved.	4 * others. All Rights Reserved.

5 *******************************************************************************	5 *******************************************************************************

6 */	6 */

7	7

8 #include "unicode/utypes.h"	8 #include "unicode/utypes.h"

9	9

10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION	10 #if !UCONFIG_NO_COLLATION

11	11

12 #include "unicode/alphaindex.h"	12 #include "unicode/alphaindex.h"

13 #include "unicode/coleitr.h"

14 #include "unicode/coll.h"	13 #include "unicode/coll.h"

15 #include "unicode/localpointer.h"	14 #include "unicode/localpointer.h"

16 #include "unicode/normalizer2.h"	15 #include "unicode/normalizer2.h"

17 #include "unicode/tblcoll.h"	16 #include "unicode/tblcoll.h"

	17 #include "unicode/uchar.h"

18 #include "unicode/ulocdata.h"	18 #include "unicode/ulocdata.h"

19 #include "unicode/uniset.h"	19 #include "unicode/uniset.h"

20 #include "unicode/uobject.h"	20 #include "unicode/uobject.h"

21 #include "unicode/usetiter.h"	21 #include "unicode/usetiter.h"

22 #include "unicode/utf16.h"	22 #include "unicode/utf16.h"

23	23

24 #include "cmemory.h"	24 #include "cmemory.h"

25 #include "cstring.h"	25 #include "cstring.h"

26 #include "uassert.h"	26 #include "uassert.h"

27 #include "uvector.h"	27 #include "uvector.h"

	28 #include "uvectr64.h"

28	29

29 //#include <string>	30 //#include <string>

30 //#include <iostream>	31 //#include <iostream>

31	32

32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

33

34 U_NAMESPACE_BEGIN	33 U_NAMESPACE_BEGIN

35	34

36 namespace {	35 namespace {

37	36

38 /**	37 /**

39 * Prefix string for Chinese index buckets.	38 * Prefix string for Chinese index buckets.

40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes	39 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes

41 */	40 */

42 const UChar BASE[1] = { 0xFDD0 };	41 const UChar BASE[1] = { 0xFDD0 };

43 const int32_t BASE_LENGTH = 1;	42 const int32_t BASE_LENGTH = 1;

(...skipping 278 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
322 errorCode = U_MEMORY_ALLOCATION_ERROR;	321 errorCode = U_MEMORY_ALLOCATION_ERROR;

323 return;	322 return;

324 }	323 }

325 checkDistinct = FALSE;	324 checkDistinct = FALSE;

326 } else {	325 } else {

327 checkDistinct = TRUE;	326 checkDistinct = TRUE;

328 }	327 }

329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {	328 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {

330 // Ignore a primary-ignorable or non-alphabetic index character.	329 // Ignore a primary-ignorable or non-alphabetic index character.

331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC ode) >= 0) {	330 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorC ode) >= 0) {

332 // Ignore an index characters that will land in the overflow bucket.	331 // Ignore an index character that will land in the overflow bucket.

333 } else if (checkDistinct &&	332 } else if (checkDistinct &&

334 collatorPrimaryOnly_->compare(item, separated(item), errorCode ) == 0) {	333 collatorPrimaryOnly_->compare(item, separated(item), errorCode ) == 0) {

335 // Ignore a multi-code point index character that does not sort distinctly	334 // Ignore a multi-code point index character that does not sort distinctly

336 // from the sequence of its separate characters.	335 // from the sequence of its separate characters.

337 } else {	336 } else {

338 int32_t insertionPoint = binarySearch(indexCharacters, item, colla torPrimaryOnly_);	337 int32_t insertionPoint = binarySearch(indexCharacters, item, colla torPrimaryOnly_);

339 if (insertionPoint < 0) {	338 if (insertionPoint < 0) {

340 indexCharacters.insertElementAt(	339 indexCharacters.insertElementAt(

341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e rrorCode);	340 ownedString(*item, ownedItem, errorCode), ~insertionPoint, e rrorCode);

342 } else {	341 } else {

343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);	342 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);

344 if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlread yIn)) {	343 if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlread yIn)) {

345 indexCharacters.setElementAt(	344 indexCharacters.setElementAt(

346 ownedString(*item, ownedItem, errorCode), insertionPoint );	345 ownedString(*item, ownedItem, errorCode), insertionPoint );

347 }	346 }

348 }	347 }

349 }	348 }

350 }	349 }

351 if (U_FAILURE(errorCode)) { return; }	350 if (U_FAILURE(errorCode)) { return; }

352	351

353 // if the result is still too large, cut down to maxCount elements, by removing every nth element	352 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element

354	353

355 int32_t size = indexCharacters.size() - 1;	354 int32_t size = indexCharacters.size() - 1;

356 if (size > maxLabelCount_) {	355 if (size > maxLabelCount_) {

357 int32_t count = 0;	356 int32_t count = 0;

358 int32_t old = -1;	357 int32_t old = -1;

359 for (int32_t i = 0; i < indexCharacters.size();) {	358 for (int32_t i = 0; i < indexCharacters.size();) {

360 ++count;	359 ++count;

361 int32_t bump = count * maxLabelCount_ / size;	360 int32_t bump = count * maxLabelCount_ / size;

362 if (bump == old) {	361 if (bump == old) {

363 indexCharacters.removeElementAt(i);	362 indexCharacters.removeElementAt(i);

(...skipping 22 matching lines...) Expand all Loading...
386 count /= 10;	385 count /= 10;

387 temp.insert(0, (UChar)(0x30 + count));	386 temp.insert(0, (UChar)(0x30 + count));

388 }	387 }

389 }	388 }

390 return temp.append((UChar)0x5283);	389 return temp.append((UChar)0x5283);

391 }	390 }

392 return temp.setTo(current, BASE_LENGTH);	391 return temp.setTo(current, BASE_LENGTH);

393 }	392 }

394	393

395 UBool hasMultiplePrimaryWeights(	394 UBool hasMultiplePrimaryWeights(

396 CollationElementIterator &cei, int32_t variableTop,	395 const RuleBasedCollator &coll, uint32_t variableTop,

397 const UnicodeString &s, UErrorCode &errorCode) {	396 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {

398 cei.setText(s, errorCode);	397 ces.removeAllElements();

	398 coll.internalGetCEs(s, ces, errorCode);

	399 if (U_FAILURE(errorCode)) { return FALSE; }

399 UBool seenPrimary = FALSE;	400 UBool seenPrimary = FALSE;

400 for (;;) {	401 for (int32_t i = 0; i < ces.size(); ++i) {

401 int32_t ce32 = cei.next(errorCode);	402 int64_t ce = ces.elementAti(i);

402 if (ce32 == CollationElementIterator::NULLORDER) {	403 uint32_t p = (uint32_t)(ce >> 32);

403 break;	404 if (p > variableTop) {

404 }	405 // not primary ignorable

405 int32_t p = CollationElementIterator::primaryOrder(ce32);

406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {

407 // not primary ignorable, and not a continuation CE

408 if (seenPrimary) {	406 if (seenPrimary) {

409 return TRUE;	407 return TRUE;

410 }	408 }

411 seenPrimary = TRUE;	409 seenPrimary = TRUE;

412 }	410 }

413 }	411 }

414 return FALSE;	412 return FALSE;

415 }	413 }

416	414

417 } // namespace	415 } // namespace

418	416

419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {	417 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {

420 // Initialize indexCharacters.	418 // Initialize indexCharacters.

421 UVector indexCharacters(errorCode);	419 UVector indexCharacters(errorCode);

422 indexCharacters.setDeleter(uprv_deleteUObject);	420 indexCharacters.setDeleter(uprv_deleteUObject);

423 initLabels(indexCharacters, errorCode);	421 initLabels(indexCharacters, errorCode);

424 if (U_FAILURE(errorCode)) { return NULL; }	422 if (U_FAILURE(errorCode)) { return NULL; }

425	423

426 // Variables for hasMultiplePrimaryWeights().	424 // Variables for hasMultiplePrimaryWeights().

427 LocalPointer<CollationElementIterator> cei(	425 UVector64 ces(errorCode);

428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_));	426 uint32_t variableTop;

429 if (cei.isNull()) {

430 errorCode = U_MEMORY_ALLOCATION_ERROR;

431 return NULL;

432 }

433 int32_t variableTop;

434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) = = UCOL_SHIFTED) {	427 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) = = UCOL_SHIFTED) {

435 variableTop = CollationElementIterator::primaryOrder(	428 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);

436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));

437 } else {	429 } else {

438 variableTop = 0;	430 variableTop = 0;

439 }	431 }

440 UBool hasInvisibleBuckets = FALSE;	432 UBool hasInvisibleBuckets = FALSE;

441	433

442 // Helper arrays for Chinese Pinyin collation.	434 // Helper arrays for Chinese Pinyin collation.

443 Bucket *asciiBuckets[26] = {	435 Bucket *asciiBuckets[26] = {

444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,	436 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,

445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL	437 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL

446 };	438 };

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
507 UChar c;	499 UChar c;

508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5 A) { // A-Z	500 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5 A) { // A-Z

509 asciiBuckets[c - 0x41] = bucket;	501 asciiBuckets[c - 0x41] = bucket;

510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS E, BASE_LENGTH) &&	502 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BAS E, BASE_LENGTH) &&

511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {	503 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {

512 pinyinBuckets[c - 0x41] = bucket;	504 pinyinBuckets[c - 0x41] = bucket;

513 hasPinyin = TRUE;	505 hasPinyin = TRUE;

514 }	506 }

515 // Check for multiple primary weights.	507 // Check for multiple primary weights.

516 if (!current.startsWith(BASE, BASE_LENGTH) &&	508 if (!current.startsWith(BASE, BASE_LENGTH) &&

517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&	509 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, cu rrent,

	510 ces, errorCode) &&

518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW ith("\uffff") */) {	511 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsW ith("\uffff") */) {

519 // "AE-ligature" or "Sch" etc.	512 // "AE-ligature" or "Sch" etc.

520 for (int32_t i = bucketList->size() - 2;; --i) {	513 for (int32_t i = bucketList->size() - 2;; --i) {

521 Bucket singleBucket = getBucket(bucketList, i);	514 Bucket singleBucket = getBucket(bucketList, i);

522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {	515 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {

523 // There is no single-character bucket since the last	516 // There is no single-character bucket since the last

524 // underflow or inflow label.	517 // underflow or inflow label.

525 break;	518 break;

526 }	519 }

527 if (singleBucket->displayBucket_ == NULL &&	520 if (singleBucket->displayBucket_ == NULL &&

528 !hasMultiplePrimaryWeights(	521 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variab leTop,

529 *cei, variableTop, singleBucket->lowerBoundary_, err orCode)) {	522 singleBucket->lowerBoundary_,

	523 ces, errorCode)) {

530 // Add an invisible bucket that redirects strings greater than the expansion	524 // Add an invisible bucket that redirects strings greater than the expansion

531 // to the previous single-character bucket.	525 // to the previous single-character bucket.

532 // For example, after ... Q R S Sch we add Sch\uFFFF->S	526 // For example, after ... Q R S Sch we add Sch\uFFFF->S

533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.	527 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.

534 bucket = new Bucket(emptyString_,	528 bucket = new Bucket(emptyString_,

535 UnicodeString(current).append((UChar)0xFFFF),	529 UnicodeString(current).append((UChar)0xFFFF),

536 U_ALPHAINDEX_NORMAL);	530 U_ALPHAINDEX_NORMAL);

537 if (bucket == NULL) {	531 if (bucket == NULL) {

538 errorCode = U_MEMORY_ALLOCATION_ERROR;	532 errorCode = U_MEMORY_ALLOCATION_ERROR;

539 return NULL;	533 return NULL;

(...skipping 163 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
703 }	697 }

704 }	698 }

705	699

706 void AlphabeticIndex::internalResetBucketIterator() {	700 void AlphabeticIndex::internalResetBucketIterator() {

707 labelsIterIndex_ = -1;	701 labelsIterIndex_ = -1;

708 currentBucket_ = NULL;	702 currentBucket_ = NULL;

709 }	703 }

710	704

711	705

712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status ) {	706 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status ) {

713 if (U_FAILURE(status)) { return; }

714 // Chinese index characters, which are specific to each of the several Chinese tailorings,

715 // take precedence over the single locale data exemplar set per language.

716 const char *language = locale.getLanguage();

717 if (uprv_strcmp(language, "zh") == 0 \|\| uprv_strcmp(language, "ja") == 0 \|\|

718 uprv_strcmp(language, "ko") == 0) {

719 // TODO: This should be done regardless of the language, but it's expensive.

720 // We should add a Collator function (can be @internal)

721 // to enumerate just the contractions that start with a given code point or string.

722 if (addChineseIndexCharacters(status) \|\| U_FAILURE(status)) {

723 return;

724 }

725 }

726

727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));	707 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));

728 if (U_FAILURE(status)) {	708 if (U_FAILURE(status)) {

729 return;	709 return;

730 }	710 }

731	711

732 UnicodeSet exemplars;	712 UnicodeSet exemplars;

733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I NDEX, &status);	713 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_I NDEX, &status);

734 if (U_SUCCESS(status)) {	714 if (U_SUCCESS(status)) {

735 initialLabels_->addAll(exemplars);	715 initialLabels_->addAll(exemplars);

736 return;	716 return;

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
777 while (it.next()) {	757 while (it.next()) {

778 const UnicodeString &exemplarC = it.getString();	758 const UnicodeString &exemplarC = it.getString();

779 upperC = exemplarC;	759 upperC = exemplarC;

780 upperC.toUpper(locale);	760 upperC.toUpper(locale);

781 initialLabels_->add(upperC);	761 initialLabels_->add(upperC);

782 }	762 }

783 }	763 }

784	764

785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {	765 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {

786 UnicodeSet contractions;	766 UnicodeSet contractions;

787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),	767 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCo de);

788 contractions.toUSet(), NULL, FALSE, &error Code);	768 if (U_FAILURE(errorCode) \|\| contractions.isEmpty()) { return FALSE; }

789 if (U_FAILURE(errorCode)) { return FALSE; }	769 initialLabels_->addAll(contractions);

790 UnicodeString firstHanBoundary;

791 UBool hasPinyin = FALSE;

792 UnicodeSetIterator iter(contractions);	770 UnicodeSetIterator iter(contractions);

793 while (iter.next()) {	771 while (iter.next()) {

794 const UnicodeString &s = iter.getString();	772 const UnicodeString &s = iter.getString();

795 if (s.startsWith(BASE, BASE_LENGTH)) {	773 U_ASSERT (s.startsWith(BASE, BASE_LENGTH));

796 initialLabels_->add(s);	774 UChar c = s.charAt(s.length() - 1);

797 if (firstHanBoundary.isEmpty() \|\|	775 if (0x41 <= c && c <= 0x5A) { // A-Z

798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode ) < 0) {	776 // There are Pinyin labels, add ASCII A-Z labels as well.

799 firstHanBoundary = s;	777 initialLabels_->add(0x41, 0x5A); // A-Z

800 }	778 break;

801 UChar c = s.charAt(s.length() - 1);

802 if (0x41 <= c && c <= 0x5A) { // A-Z

803 hasPinyin = TRUE;

804 }

805 }	779 }

806 }	780 }

807 if (hasPinyin) {	781 return TRUE;

808 initialLabels_->add(0x41, 0x5A); // A-Z

809 }

810 if (!firstHanBoundary.isEmpty()) {

811 // The hardcoded list of script boundaries includes U+4E00

812 // which is tailored to not be the first primary

813 // in all Chinese tailorings except "unihan".

814 // Replace U+4E00 with the first boundary string from the tailoring.

815 // TODO: This becomes obsolete when the root collator gets

816 // reliable script-first-primary mappings.

817 int32_t hanIndex = binarySearch(

818 firstCharsInScripts_, UnicodeString((UChar)0x4E00), collatorPr imaryOnly_);

819 if (hanIndex >= 0) {

820 UnicodeString *fh = new UnicodeString(firstHanBoundary);

821 if (fh == NULL) {

822 errorCode = U_MEMORY_ALLOCATION_ERROR;

823 return FALSE;

824 }

825 firstCharsInScripts_->setElementAt(fh, hanIndex);

826 }

827 return TRUE;

828 } else {

829 return FALSE;

830 }

831 }	782 }

832	783

833	784

834 /*	785 /*

835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.	786 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.

836 */	787 */

837 static const UChar CGJ = 0x034F;	788 static const UChar CGJ = 0x034F;

838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {	789 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {

839 UnicodeString result;	790 UnicodeString result;

840 if (item.length() == 0) {	791 if (item.length() == 0) {

(...skipping 17 matching lines...) Expand all Loading...
858 return FALSE;	809 return FALSE;

859 }	810 }

860	811

861	812

862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {	813 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {

863 return FALSE;	814 return FALSE;

864 }	815 }

865	816

866	817

867 const RuleBasedCollator &AlphabeticIndex::getCollator() const {	818 const RuleBasedCollator &AlphabeticIndex::getCollator() const {

868 // There are no known non-RuleBasedCollator collators, and none ever expecte d.	819 return *collator_;

869 // But, in case that changes, better a null pointer than a wrong type.

870 return dynamic_cast<RuleBasedCollator >(collator_);

871 }	820 }

872	821

873	822

874 const UnicodeString &AlphabeticIndex::getInflowLabel() const {	823 const UnicodeString &AlphabeticIndex::getInflowLabel() const {

875 return inflowLabel_;	824 return inflowLabel_;

876 }	825 }

877	826

878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {	827 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {

879 return overflowLabel_;	828 return overflowLabel_;

880 }	829 }

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
940 if (initialLabels_ == NULL) {	889 if (initialLabels_ == NULL) {

941 status = U_MEMORY_ALLOCATION_ERROR;	890 status = U_MEMORY_ALLOCATION_ERROR;

942 return;	891 return;

943 }	892 }

944	893

945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis	894 inflowLabel_.setTo((UChar)0x2026); // Ellipsis

946 overflowLabel_ = inflowLabel_;	895 overflowLabel_ = inflowLabel_;

947 underflowLabel_ = inflowLabel_;	896 underflowLabel_ = inflowLabel_;

948	897

949 if (collator_ == NULL) {	898 if (collator_ == NULL) {

950 collator_ = static_cast<RuleBasedCollator >(Collator::createInstance(l ocale, status));	899 Collator coll = Collator::createInstance(locale, status);

951 if (U_FAILURE(status)) { return; }	900 if (U_FAILURE(status)) {

	901 delete coll;

	902 return;

	903 }

	904 if (coll == NULL) {

	905 status = U_MEMORY_ALLOCATION_ERROR;

	906 return;

	907 }

	908 collator_ = dynamic_cast<RuleBasedCollator *>(coll);

952 if (collator_ == NULL) {	909 if (collator_ == NULL) {

953 status = U_MEMORY_ALLOCATION_ERROR;	910 delete coll;

	911 status = U_UNSUPPORTED_ERROR;

954 return;	912 return;

955 }	913 }

956 }	914 }

957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());	915 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());

958 if (collatorPrimaryOnly_ == NULL) {	916 if (collatorPrimaryOnly_ == NULL) {

959 status = U_MEMORY_ALLOCATION_ERROR;	917 status = U_MEMORY_ALLOCATION_ERROR;

960 return;	918 return;

961 }	919 }

962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);	920 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);

963 firstCharsInScripts_ = firstStringsInScript(status);	921 firstCharsInScripts_ = firstStringsInScript(status);

964 if (U_FAILURE(status)) { return; }	922 if (U_FAILURE(status)) { return; }

965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar yOnly_, status);	923 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimar yOnly_, status);

966 UnicodeString _4E00((UChar)0x4E00);

967 UnicodeString _1100((UChar)0x1100);

968 UnicodeString _1112((UChar)0x1112);

969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&

970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {

971 // The standard Korean tailoring sorts Hanja (Han characters)

972 // as secondary differences from Hangul syllables.

973 // This makes U+4E00 not useful as a Han-script boundary.

974 // TODO: This becomes obsolete when the root collator gets

975 // reliable script-first-primary mappings.

976 int32_t hanIndex = binarySearch(

977 firstCharsInScripts_, _4E00, collatorPrimaryOnly_);

978 if (hanIndex >= 0) {

979 firstCharsInScripts_->removeElementAt(hanIndex);

980 }

981 }

982 // Guard against a degenerate collator where	924 // Guard against a degenerate collator where

983 // some script boundary strings are primary ignorable.	925 // some script boundary strings are primary ignorable.

984 for (;;) {	926 for (;;) {

985 if (U_FAILURE(status)) { return; }	927 if (U_FAILURE(status)) { return; }

986 if (firstCharsInScripts_->isEmpty()) {	928 if (firstCharsInScripts_->isEmpty()) {

987 // AlphabeticIndex requires some non-ignorable script boundary strings.	929 // AlphabeticIndex requires some non-ignorable script boundary strings.

988 status = U_ILLEGAL_ARGUMENT_ERROR;	930 status = U_ILLEGAL_ARGUMENT_ERROR;

989 return;	931 return;

990 }	932 }

991 if (collatorPrimaryOnly_->compare(	933 if (collatorPrimaryOnly_->compare(

992 static_cast<UnicodeString >(firstCharsInScripts_->elementAt(0) ),	934 static_cast<UnicodeString >(firstCharsInScripts_->elementAt(0) ),

993 emptyString_, status) == UCOL_EQUAL) {	935 emptyString_, status) == UCOL_EQUAL) {

994 firstCharsInScripts_->removeElementAt(0);	936 firstCharsInScripts_->removeElementAt(0);

995 } else {	937 } else {

996 break;	938 break;

997 }	939 }

998 }	940 }

999	941

1000 if (locale != NULL) {	942 // Chinese index characters, which are specific to each of the several Chine se tailorings,

	943 // take precedence over the single locale data exemplar set per language.

	944 if (!addChineseIndexCharacters(status) && locale != NULL) {

1001 addIndexExemplars(*locale, status);	945 addIndexExemplars(*locale, status);

1002 }	946 }

1003 }	947 }

1004	948

1005	949

1006 //	950 //

1007 // Comparison function for UVector<UnicodeString *> sorting with a collator.	951 // Comparison function for UVector<UnicodeString *> sorting with a collator.

1008 //	952 //

1009 static int32_t U_CALLCONV	953 static int32_t U_CALLCONV

1010 collatorComparator(const void context, const void left, const void *right) {	954 collatorComparator(const void context, const void left, const void *right) {

(...skipping 24 matching lines...) Expand all Loading...
1035 recordCompareFn(const void context, const void left, const void *right) {	979 recordCompareFn(const void context, const void left, const void *right) {

1036 const UElement leftElement = static_cast<const UElement >(left);	980 const UElement leftElement = static_cast<const UElement >(left);

1037 const UElement rightElement = static_cast<const UElement >(right);	981 const UElement rightElement = static_cast<const UElement >(right);

1038 const AlphabeticIndex::Record leftRec = static_cast<const AlphabeticIndex: :Record >(leftElement->pointer);	982 const AlphabeticIndex::Record leftRec = static_cast<const AlphabeticIndex: :Record >(leftElement->pointer);

1039 const AlphabeticIndex::Record rightRec = static_cast<const AlphabeticIndex: :Record >(rightElement->pointer);	983 const AlphabeticIndex::Record rightRec = static_cast<const AlphabeticIndex: :Record >(rightElement->pointer);

1040 const Collator col = static_cast<const Collator >(context);	984 const Collator col = static_cast<const Collator >(context);

1041 UErrorCode errorCode = U_ZERO_ERROR;	985 UErrorCode errorCode = U_ZERO_ERROR;

1042 return col->compare(leftRec->name_, rightRec->name_, errorCode);	986 return col->compare(leftRec->name_, rightRec->name_, errorCode);

1043 }	987 }

1044	988

1045

1046 /**

1047 * This list contains one character per script that has the

1048 * lowest primary weight for that script in the root collator.

1049 * This list will be copied and sorted to account for script reordering.

1050 *

1051 * <p>TODO: This is fragile. If the first character of a script is tailored

1052 * so that it does not map to the script's lowest primary weight any more,

1053 * then the buckets will be off.

1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.

1055 *

1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.

1057 *

1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in

1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java

1060 */

1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {

1062 0x41, 0, 0x03B1, 0,

1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0x DD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,

1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,

1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,

1066 0xAAF2, 0, // Meetei Mayek

1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,

1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada

1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri

1070 0x1B83, 0,

1071 0xD802, 0xDE00, 0, 0x0E01, 0,

1072 0x0EDE, 0, // Lao

1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,

1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,

1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma

1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,

1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,

1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,

1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao

1080 0xD800, 0xDE80, 0,

1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,

1082 0xD801, 0xDC80, 0,

1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng

1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,

1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,

1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive

1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs

1088 0x4E00, 0,

1089 // TODO: The overflow bucket's lowerBoundary string should be the

1090 // first item after the last reordering group in the collator's script order.

1091 // This should normally be the first Unicode code point

1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored.

1093 // However, at least up to ICU 51 the Hani reordering group includes

1094 // unassigned code points,

1095 // and there is no stable string for the start of the trailing-weights range.

1096 // The only known string that sorts "high" is U+FFFF.

1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,

1098 // and fix relevant test code.

1099 // Ideally, FractionalUCA.txt will have a "script first primary"

1100 // for unassigned code points.

1101 0xFFFF, 0

1102 };

1103

1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {	989 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {

1105 if (U_FAILURE(status)) {	990 if (U_FAILURE(status)) {

1106 return NULL;	991 return NULL;

1107 }	992 }

1108 UVector *dest = new UVector(status);	993 LocalPointer<UVector> dest(new UVector(status));

1109 if (dest == NULL) {	994 if (dest.isNull()) {

1110 status = U_MEMORY_ALLOCATION_ERROR;	995 status = U_MEMORY_ALLOCATION_ERROR;

1111 return NULL;	996 return NULL;

1112 }	997 }

1113 dest->setDeleter(uprv_deleteUObject);	998 dest->setDeleter(uprv_deleteUObject);

1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;	999 // Fetch the script-first-primary contractions which are defined in the root collator.

1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);	1000 // They all start with U+FDD1.

1116 do {	1001 UnicodeSet set;

1117 if (U_FAILURE(status)) {	1002 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);

1118 return dest;	1003 if (U_FAILURE(status)) {

	1004 return NULL;

	1005 }

	1006 if (set.isEmpty()) {

	1007 status = U_UNSUPPORTED_ERROR;

	1008 return NULL;

	1009 }

	1010 UnicodeSetIterator iter(set);

	1011 while (iter.next()) {

	1012 const UnicodeString &boundary = iter.getString();

	1013 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));

	1014 if ((gcMask & (U_GC_L_MASK \| U_GC_CN_MASK)) == 0) {

	1015 // Ignore boundaries for the special reordering groups.

	1016 // Take only those for "real scripts" (where the sample character is a Letter,

	1017 // and the one for unassigned implicit weights (Cn).

	1018 continue;

1119 }	1019 }

1120 UnicodeString *str = new UnicodeString(src, -1);	1020 UnicodeString *s = new UnicodeString(boundary);

1121 if (str == NULL) {	1021 if (s == NULL) {

1122 status = U_MEMORY_ALLOCATION_ERROR;	1022 status = U_MEMORY_ALLOCATION_ERROR;

1123 return dest;	1023 return NULL;

1124 }	1024 }

1125 dest->addElement(str, status);	1025 dest->addElement(s, status);

1126 src += str->length() + 1;	1026 }

1127 } while (src < limit);	1027 return dest.orphan();

1128 return dest;

1129 }	1028 }

1130	1029

1131	1030

1132 namespace {	1031 namespace {

1133	1032

1134 /**	1033 /**

1135 * Returns true if one index character string is "better" than the other.	1034 * Returns true if one index character string is "better" than the other.

1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is	1035 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is

1137 * better, and otherwise binary-less-than is better.	1036 * better, and otherwise binary-less-than is better.

1138 */	1037 */

(...skipping 201 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1340 records_(NULL) {	1239 records_(NULL) {

1341 }	1240 }

1342	1241

1343	1242

1344 AlphabeticIndex::Bucket::~Bucket() {	1243 AlphabeticIndex::Bucket::~Bucket() {

1345 delete records_;	1244 delete records_;

1346 }	1245 }

1347	1246

1348 U_NAMESPACE_END	1247 U_NAMESPACE_END

1349	1248

1350 #endif	1249 #endif // !UCONFIG_NO_COLLATION

OLD	NEW

« no previous file with comments | « source/i18n/Makefile.in ('k') | source/i18n/anytrans.cpp » ('j') | no next file with comments »