Index: source/i18n/alphaindex.cpp |
diff --git a/source/i18n/alphaindex.cpp b/source/i18n/alphaindex.cpp |
index 88dcaabec1b9eed8ca0c8b6d48dbc4eeb9ebb41f..0bea1dc22fbf8cd6fa9d8620bc985e14e3e31b80 100644 |
--- a/source/i18n/alphaindex.cpp |
+++ b/source/i18n/alphaindex.cpp |
@@ -1,20 +1,20 @@ |
/* |
******************************************************************************* |
-* Copyright (C) 2009-2013, International Business Machines Corporation and |
+* Copyright (C) 2009-2014, International Business Machines Corporation and |
* others. All Rights Reserved. |
******************************************************************************* |
*/ |
#include "unicode/utypes.h" |
-#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION |
+#if !UCONFIG_NO_COLLATION |
#include "unicode/alphaindex.h" |
-#include "unicode/coleitr.h" |
#include "unicode/coll.h" |
#include "unicode/localpointer.h" |
#include "unicode/normalizer2.h" |
#include "unicode/tblcoll.h" |
+#include "unicode/uchar.h" |
#include "unicode/ulocdata.h" |
#include "unicode/uniset.h" |
#include "unicode/uobject.h" |
@@ -25,12 +25,11 @@ |
#include "cstring.h" |
#include "uassert.h" |
#include "uvector.h" |
+#include "uvectr64.h" |
//#include <string> |
//#include <iostream> |
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
- |
U_NAMESPACE_BEGIN |
namespace { |
@@ -329,7 +328,7 @@ void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode |
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { |
// Ignore a primary-ignorable or non-alphabetic index character. |
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { |
- // Ignore an index characters that will land in the overflow bucket. |
+ // Ignore an index character that will land in the overflow bucket. |
} else if (checkDistinct && |
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { |
// Ignore a multi-code point index character that does not sort distinctly |
@@ -350,7 +349,7 @@ void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode |
} |
if (U_FAILURE(errorCode)) { return; } |
- // if the result is still too large, cut down to maxCount elements, by removing every nth element |
+ // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element |
int32_t size = indexCharacters.size() - 1; |
if (size > maxLabelCount_) { |
@@ -393,18 +392,17 @@ const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) |
} |
UBool hasMultiplePrimaryWeights( |
- CollationElementIterator &cei, int32_t variableTop, |
- const UnicodeString &s, UErrorCode &errorCode) { |
- cei.setText(s, errorCode); |
+ const RuleBasedCollator &coll, uint32_t variableTop, |
+ const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { |
+ ces.removeAllElements(); |
+ coll.internalGetCEs(s, ces, errorCode); |
+ if (U_FAILURE(errorCode)) { return FALSE; } |
UBool seenPrimary = FALSE; |
- for (;;) { |
- int32_t ce32 = cei.next(errorCode); |
- if (ce32 == CollationElementIterator::NULLORDER) { |
- break; |
- } |
- int32_t p = CollationElementIterator::primaryOrder(ce32); |
- if (p > variableTop && (ce32 & 0xc0) != 0xc0) { |
- // not primary ignorable, and not a continuation CE |
+ for (int32_t i = 0; i < ces.size(); ++i) { |
+ int64_t ce = ces.elementAti(i); |
+ uint32_t p = (uint32_t)(ce >> 32); |
+ if (p > variableTop) { |
+ // not primary ignorable |
if (seenPrimary) { |
return TRUE; |
} |
@@ -424,16 +422,10 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
if (U_FAILURE(errorCode)) { return NULL; } |
// Variables for hasMultiplePrimaryWeights(). |
- LocalPointer<CollationElementIterator> cei( |
- collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); |
- if (cei.isNull()) { |
- errorCode = U_MEMORY_ALLOCATION_ERROR; |
- return NULL; |
- } |
- int32_t variableTop; |
+ UVector64 ces(errorCode); |
+ uint32_t variableTop; |
if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { |
- variableTop = CollationElementIterator::primaryOrder( |
- (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); |
+ variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); |
} else { |
variableTop = 0; |
} |
@@ -514,7 +506,8 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
} |
// Check for multiple primary weights. |
if (!current.startsWith(BASE, BASE_LENGTH) && |
- hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && |
+ hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, |
+ ces, errorCode) && |
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { |
// "AE-ligature" or "Sch" etc. |
for (int32_t i = bucketList->size() - 2;; --i) { |
@@ -525,8 +518,9 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { |
break; |
} |
if (singleBucket->displayBucket_ == NULL && |
- !hasMultiplePrimaryWeights( |
- *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { |
+ !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, |
+ singleBucket->lowerBoundary_, |
+ ces, errorCode)) { |
// Add an invisible bucket that redirects strings greater than the expansion |
// to the previous single-character bucket. |
// For example, after ... Q R S Sch we add Sch\uFFFF->S |
@@ -710,20 +704,6 @@ void AlphabeticIndex::internalResetBucketIterator() { |
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { |
- if (U_FAILURE(status)) { return; } |
- // Chinese index characters, which are specific to each of the several Chinese tailorings, |
- // take precedence over the single locale data exemplar set per language. |
- const char *language = locale.getLanguage(); |
- if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || |
- uprv_strcmp(language, "ko") == 0) { |
- // TODO: This should be done regardless of the language, but it's expensive. |
- // We should add a Collator function (can be @internal) |
- // to enumerate just the contractions that start with a given code point or string. |
- if (addChineseIndexCharacters(status) || U_FAILURE(status)) { |
- return; |
- } |
- } |
- |
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
if (U_FAILURE(status)) { |
return; |
@@ -784,50 +764,21 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status |
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { |
UnicodeSet contractions; |
- ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), |
- contractions.toUSet(), NULL, FALSE, &errorCode); |
- if (U_FAILURE(errorCode)) { return FALSE; } |
- UnicodeString firstHanBoundary; |
- UBool hasPinyin = FALSE; |
+ collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); |
+ if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } |
+ initialLabels_->addAll(contractions); |
UnicodeSetIterator iter(contractions); |
while (iter.next()) { |
const UnicodeString &s = iter.getString(); |
- if (s.startsWith(BASE, BASE_LENGTH)) { |
- initialLabels_->add(s); |
- if (firstHanBoundary.isEmpty() || |
- collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { |
- firstHanBoundary = s; |
- } |
- UChar c = s.charAt(s.length() - 1); |
- if (0x41 <= c && c <= 0x5A) { // A-Z |
- hasPinyin = TRUE; |
- } |
- } |
- } |
- if (hasPinyin) { |
- initialLabels_->add(0x41, 0x5A); // A-Z |
- } |
- if (!firstHanBoundary.isEmpty()) { |
- // The hardcoded list of script boundaries includes U+4E00 |
- // which is tailored to not be the first primary |
- // in all Chinese tailorings except "unihan". |
- // Replace U+4E00 with the first boundary string from the tailoring. |
- // TODO: This becomes obsolete when the root collator gets |
- // reliable script-first-primary mappings. |
- int32_t hanIndex = binarySearch( |
- *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); |
- if (hanIndex >= 0) { |
- UnicodeString *fh = new UnicodeString(firstHanBoundary); |
- if (fh == NULL) { |
- errorCode = U_MEMORY_ALLOCATION_ERROR; |
- return FALSE; |
- } |
- firstCharsInScripts_->setElementAt(fh, hanIndex); |
+ U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); |
+ UChar c = s.charAt(s.length() - 1); |
+ if (0x41 <= c && c <= 0x5A) { // A-Z |
+ // There are Pinyin labels; add ASCII A-Z labels as well. |
+ initialLabels_->add(0x41, 0x5A); // A-Z |
+ break; |
} |
- return TRUE; |
- } else { |
- return FALSE; |
} |
+ return TRUE; |
} |
@@ -865,9 +816,7 @@ UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
- // There are no known non-RuleBasedCollator collators, and none ever expected. |
- // But, in case that changes, better a null pointer than a wrong type. |
- return *dynamic_cast<RuleBasedCollator *>(collator_); |
+ return *collator_; |
} |
@@ -947,12 +896,21 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
underflowLabel_ = inflowLabel_; |
if (collator_ == NULL) { |
- collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); |
- if (U_FAILURE(status)) { return; } |
- if (collator_ == NULL) { |
+ Collator *coll = Collator::createInstance(*locale, status); |
+ if (U_FAILURE(status)) { |
+ delete coll; |
+ return; |
+ } |
+ if (coll == NULL) { |
status = U_MEMORY_ALLOCATION_ERROR; |
return; |
} |
+ collator_ = dynamic_cast<RuleBasedCollator *>(coll); |
+ if (collator_ == NULL) { |
+ delete coll; |
+ status = U_UNSUPPORTED_ERROR; |
+ return; |
+ } |
} |
collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); |
if (collatorPrimaryOnly_ == NULL) { |
@@ -963,22 +921,6 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
firstCharsInScripts_ = firstStringsInScript(status); |
if (U_FAILURE(status)) { return; } |
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); |
- UnicodeString _4E00((UChar)0x4E00); |
- UnicodeString _1100((UChar)0x1100); |
- UnicodeString _1112((UChar)0x1112); |
- if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && |
- collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { |
- // The standard Korean tailoring sorts Hanja (Han characters) |
- // as secondary differences from Hangul syllables. |
- // This makes U+4E00 not useful as a Han-script boundary. |
- // TODO: This becomes obsolete when the root collator gets |
- // reliable script-first-primary mappings. |
- int32_t hanIndex = binarySearch( |
- *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); |
- if (hanIndex >= 0) { |
- firstCharsInScripts_->removeElementAt(hanIndex); |
- } |
- } |
// Guard against a degenerate collator where |
// some script boundary strings are primary ignorable. |
for (;;) { |
@@ -997,7 +939,9 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { |
} |
} |
- if (locale != NULL) { |
+ // Chinese index characters, which are specific to each of the several Chinese tailorings, |
+ // take precedence over the single locale data exemplar set per language. |
+ if (!addChineseIndexCharacters(status) && locale != NULL) { |
addIndexExemplars(*locale, status); |
} |
} |
@@ -1042,90 +986,45 @@ recordCompareFn(const void *context, const void *left, const void *right) { |
return col->compare(leftRec->name_, rightRec->name_, errorCode); |
} |
- |
-/** |
- * This list contains one character per script that has the |
- * lowest primary weight for that script in the root collator. |
- * This list will be copied and sorted to account for script reordering. |
- * |
- * <p>TODO: This is fragile. If the first character of a script is tailored |
- * so that it does not map to the script's lowest primary weight any more, |
- * then the buckets will be off. |
- * There are hacks in the code to handle the known CJK tailorings of U+4E00. |
- * |
- * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. |
- * |
- * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in |
- * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java |
- */ |
-static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { |
- 0x41, 0, 0x03B1, 0, |
- 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, |
- 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, |
- 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, |
- 0xAAF2, 0, // Meetei Mayek |
- 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, |
- U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada |
- U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri |
- 0x1B83, 0, |
- 0xD802, 0xDE00, 0, 0x0E01, 0, |
- 0x0EDE, 0, // Lao |
- 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, |
- 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, |
- U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma |
- 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, |
- 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, |
- 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, |
- U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao |
- 0xD800, 0xDE80, 0, |
- 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, |
- 0xD801, 0xDC80, 0, |
- U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng |
- 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, |
- 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, |
- U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive |
- U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs |
- 0x4E00, 0, |
- // TODO: The overflow bucket's lowerBoundary string should be the |
- // first item after the last reordering group in the collator's script order. |
- // This should normally be the first Unicode code point |
- // that is unassigned (U+0378 in Unicode 6.3) and untailored. |
- // However, at least up to ICU 51 the Hani reordering group includes |
- // unassigned code points, |
- // and there is no stable string for the start of the trailing-weights range. |
- // The only known string that sorts "high" is U+FFFF. |
- // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, |
- // and fix relevant test code. |
- // Ideally, FractionalUCA.txt will have a "script first primary" |
- // for unassigned code points. |
- 0xFFFF, 0 |
-}; |
- |
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
if (U_FAILURE(status)) { |
return NULL; |
} |
- UVector *dest = new UVector(status); |
- if (dest == NULL) { |
+ LocalPointer<UVector> dest(new UVector(status)); |
+ if (dest.isNull()) { |
status = U_MEMORY_ALLOCATION_ERROR; |
return NULL; |
} |
dest->setDeleter(uprv_deleteUObject); |
- const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; |
- const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); |
- do { |
- if (U_FAILURE(status)) { |
- return dest; |
+ // Fetch the script-first-primary contractions which are defined in the root collator. |
+ // They all start with U+FDD1. |
+ UnicodeSet set; |
+ collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); |
+ if (U_FAILURE(status)) { |
+ return NULL; |
+ } |
+ if (set.isEmpty()) { |
+ status = U_UNSUPPORTED_ERROR; |
+ return NULL; |
+ } |
+ UnicodeSetIterator iter(set); |
+ while (iter.next()) { |
+ const UnicodeString &boundary = iter.getString(); |
+ uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); |
+ if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { |
+ // Ignore boundaries for the special reordering groups. |
+ // Take only those for "real scripts" (where the sample character is a Letter), |
+ // and the one for unassigned implicit weights (Cn). |
+ continue; |
} |
- UnicodeString *str = new UnicodeString(src, -1); |
- if (str == NULL) { |
+ UnicodeString *s = new UnicodeString(boundary); |
+ if (s == NULL) { |
status = U_MEMORY_ALLOCATION_ERROR; |
- return dest; |
+ return NULL; |
} |
- dest->addElement(str, status); |
- src += str->length() + 1; |
- } while (src < limit); |
- return dest; |
+ dest->addElement(s, status); |
+ } |
+ return dest.orphan(); |
} |
@@ -1347,4 +1246,4 @@ AlphabeticIndex::Bucket::~Bucket() { |
U_NAMESPACE_END |
-#endif |
+#endif // !UCONFIG_NO_COLLATION |