Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Unified Diff: source/i18n/alphaindex.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/i18n/Makefile.in ('k') | source/i18n/anytrans.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/i18n/alphaindex.cpp
diff --git a/source/i18n/alphaindex.cpp b/source/i18n/alphaindex.cpp
index 88dcaabec1b9eed8ca0c8b6d48dbc4eeb9ebb41f..0bea1dc22fbf8cd6fa9d8620bc985e14e3e31b80 100644
--- a/source/i18n/alphaindex.cpp
+++ b/source/i18n/alphaindex.cpp
@@ -1,20 +1,20 @@
/*
*******************************************************************************
-* Copyright (C) 2009-2013, International Business Machines Corporation and
+* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#include "unicode/utypes.h"
-#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
+#if !UCONFIG_NO_COLLATION
#include "unicode/alphaindex.h"
-#include "unicode/coleitr.h"
#include "unicode/coll.h"
#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
#include "unicode/tblcoll.h"
+#include "unicode/uchar.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
@@ -25,12 +25,11 @@
#include "cstring.h"
#include "uassert.h"
#include "uvector.h"
+#include "uvectr64.h"
//#include <string>
//#include <iostream>
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
U_NAMESPACE_BEGIN
namespace {
@@ -329,7 +328,7 @@ void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
// Ignore a primary-ignorable or non-alphabetic index character.
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
- // Ignore an index characters that will land in the overflow bucket.
+ // Ignore an index character that will land in the overflow bucket.
} else if (checkDistinct &&
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
// Ignore a multi-code point index character that does not sort distinctly
@@ -350,7 +349,7 @@ void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode
}
if (U_FAILURE(errorCode)) { return; }
- // if the result is still too large, cut down to maxCount elements, by removing every nth element
+ // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
int32_t size = indexCharacters.size() - 1;
if (size > maxLabelCount_) {
@@ -393,18 +392,17 @@ const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp)
}
UBool hasMultiplePrimaryWeights(
- CollationElementIterator &cei, int32_t variableTop,
- const UnicodeString &s, UErrorCode &errorCode) {
- cei.setText(s, errorCode);
+ const RuleBasedCollator &coll, uint32_t variableTop,
+ const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
+ ces.removeAllElements();
+ coll.internalGetCEs(s, ces, errorCode);
+ if (U_FAILURE(errorCode)) { return FALSE; }
UBool seenPrimary = FALSE;
- for (;;) {
- int32_t ce32 = cei.next(errorCode);
- if (ce32 == CollationElementIterator::NULLORDER) {
- break;
- }
- int32_t p = CollationElementIterator::primaryOrder(ce32);
- if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
- // not primary ignorable, and not a continuation CE
+ for (int32_t i = 0; i < ces.size(); ++i) {
+ int64_t ce = ces.elementAti(i);
+ uint32_t p = (uint32_t)(ce >> 32);
+ if (p > variableTop) {
+ // not primary ignorable
if (seenPrimary) {
return TRUE;
}
@@ -424,16 +422,10 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) { return NULL; }
// Variables for hasMultiplePrimaryWeights().
- LocalPointer<CollationElementIterator> cei(
- collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
- if (cei.isNull()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
- int32_t variableTop;
+ UVector64 ces(errorCode);
+ uint32_t variableTop;
if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
- variableTop = CollationElementIterator::primaryOrder(
- (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
+ variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
} else {
variableTop = 0;
}
@@ -514,7 +506,8 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
}
// Check for multiple primary weights.
if (!current.startsWith(BASE, BASE_LENGTH) &&
- hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
+ hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
+ ces, errorCode) &&
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
// "AE-ligature" or "Sch" etc.
for (int32_t i = bucketList->size() - 2;; --i) {
@@ -525,8 +518,9 @@ BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
break;
}
if (singleBucket->displayBucket_ == NULL &&
- !hasMultiplePrimaryWeights(
- *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
+ !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
+ singleBucket->lowerBoundary_,
+ ces, errorCode)) {
// Add an invisible bucket that redirects strings greater than the expansion
// to the previous single-character bucket.
// For example, after ... Q R S Sch we add Sch\uFFFF->S
@@ -710,20 +704,6 @@ void AlphabeticIndex::internalResetBucketIterator() {
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
- if (U_FAILURE(status)) { return; }
- // Chinese index characters, which are specific to each of the several Chinese tailorings,
- // take precedence over the single locale data exemplar set per language.
- const char *language = locale.getLanguage();
- if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
- uprv_strcmp(language, "ko") == 0) {
- // TODO: This should be done regardless of the language, but it's expensive.
- // We should add a Collator function (can be @internal)
- // to enumerate just the contractions that start with a given code point or string.
- if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
- return;
- }
- }
-
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
if (U_FAILURE(status)) {
return;
@@ -784,50 +764,21 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
UnicodeSet contractions;
- ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
- contractions.toUSet(), NULL, FALSE, &errorCode);
- if (U_FAILURE(errorCode)) { return FALSE; }
- UnicodeString firstHanBoundary;
- UBool hasPinyin = FALSE;
+ collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
+ if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
+ initialLabels_->addAll(contractions);
UnicodeSetIterator iter(contractions);
while (iter.next()) {
const UnicodeString &s = iter.getString();
- if (s.startsWith(BASE, BASE_LENGTH)) {
- initialLabels_->add(s);
- if (firstHanBoundary.isEmpty() ||
- collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
- firstHanBoundary = s;
- }
- UChar c = s.charAt(s.length() - 1);
- if (0x41 <= c && c <= 0x5A) { // A-Z
- hasPinyin = TRUE;
- }
- }
- }
- if (hasPinyin) {
- initialLabels_->add(0x41, 0x5A); // A-Z
- }
- if (!firstHanBoundary.isEmpty()) {
- // The hardcoded list of script boundaries includes U+4E00
- // which is tailored to not be the first primary
- // in all Chinese tailorings except "unihan".
- // Replace U+4E00 with the first boundary string from the tailoring.
- // TODO: This becomes obsolete when the root collator gets
- // reliable script-first-primary mappings.
- int32_t hanIndex = binarySearch(
- *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
- if (hanIndex >= 0) {
- UnicodeString *fh = new UnicodeString(firstHanBoundary);
- if (fh == NULL) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return FALSE;
- }
- firstCharsInScripts_->setElementAt(fh, hanIndex);
+ U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
+ UChar c = s.charAt(s.length() - 1);
+ if (0x41 <= c && c <= 0x5A) { // A-Z
+ // There are Pinyin labels, add ASCII A-Z labels as well.
+ initialLabels_->add(0x41, 0x5A); // A-Z
+ break;
}
- return TRUE;
- } else {
- return FALSE;
}
+ return TRUE;
}
@@ -865,9 +816,7 @@ UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
- // There are no known non-RuleBasedCollator collators, and none ever expected.
- // But, in case that changes, better a null pointer than a wrong type.
- return *dynamic_cast<RuleBasedCollator *>(collator_);
+ return *collator_;
}
@@ -947,12 +896,21 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
underflowLabel_ = inflowLabel_;
if (collator_ == NULL) {
- collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
- if (U_FAILURE(status)) { return; }
- if (collator_ == NULL) {
+ Collator *coll = Collator::createInstance(*locale, status);
+ if (U_FAILURE(status)) {
+ delete coll;
+ return;
+ }
+ if (coll == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
+ collator_ = dynamic_cast<RuleBasedCollator *>(coll);
+ if (collator_ == NULL) {
+ delete coll;
+ status = U_UNSUPPORTED_ERROR;
+ return;
+ }
}
collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
if (collatorPrimaryOnly_ == NULL) {
@@ -963,22 +921,6 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
firstCharsInScripts_ = firstStringsInScript(status);
if (U_FAILURE(status)) { return; }
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
- UnicodeString _4E00((UChar)0x4E00);
- UnicodeString _1100((UChar)0x1100);
- UnicodeString _1112((UChar)0x1112);
- if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
- collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
- // The standard Korean tailoring sorts Hanja (Han characters)
- // as secondary differences from Hangul syllables.
- // This makes U+4E00 not useful as a Han-script boundary.
- // TODO: This becomes obsolete when the root collator gets
- // reliable script-first-primary mappings.
- int32_t hanIndex = binarySearch(
- *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
- if (hanIndex >= 0) {
- firstCharsInScripts_->removeElementAt(hanIndex);
- }
- }
// Guard against a degenerate collator where
// some script boundary strings are primary ignorable.
for (;;) {
@@ -997,7 +939,9 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
}
}
- if (locale != NULL) {
+ // Chinese index characters, which are specific to each of the several Chinese tailorings,
+ // take precedence over the single locale data exemplar set per language.
+ if (!addChineseIndexCharacters(status) && locale != NULL) {
addIndexExemplars(*locale, status);
}
}
@@ -1042,90 +986,45 @@ recordCompareFn(const void *context, const void *left, const void *right) {
return col->compare(leftRec->name_, rightRec->name_, errorCode);
}
-
-/**
- * This list contains one character per script that has the
- * lowest primary weight for that script in the root collator.
- * This list will be copied and sorted to account for script reordering.
- *
- * <p>TODO: This is fragile. If the first character of a script is tailored
- * so that it does not map to the script's lowest primary weight any more,
- * then the buckets will be off.
- * There are hacks in the code to handle the known CJK tailorings of U+4E00.
- *
- * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
- *
- * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
- * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
- */
-static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
- 0x41, 0, 0x03B1, 0,
- 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
- 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
- 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
- 0xAAF2, 0, // Meetei Mayek
- 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
- U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
- U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
- 0x1B83, 0,
- 0xD802, 0xDE00, 0, 0x0E01, 0,
- 0x0EDE, 0, // Lao
- 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
- 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
- U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
- 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
- 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
- 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
- U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
- 0xD800, 0xDE80, 0,
- 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
- 0xD801, 0xDC80, 0,
- U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
- 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
- 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
- U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
- U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
- 0x4E00, 0,
- // TODO: The overflow bucket's lowerBoundary string should be the
- // first item after the last reordering group in the collator's script order.
- // This should normally be the first Unicode code point
- // that is unassigned (U+0378 in Unicode 6.3) and untailored.
- // However, at least up to ICU 51 the Hani reordering group includes
- // unassigned code points,
- // and there is no stable string for the start of the trailing-weights range.
- // The only known string that sorts "high" is U+FFFF.
- // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
- // and fix relevant test code.
- // Ideally, FractionalUCA.txt will have a "script first primary"
- // for unassigned code points.
- 0xFFFF, 0
-};
-
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
if (U_FAILURE(status)) {
return NULL;
}
- UVector *dest = new UVector(status);
- if (dest == NULL) {
+ LocalPointer<UVector> dest(new UVector(status));
+ if (dest.isNull()) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
dest->setDeleter(uprv_deleteUObject);
- const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
- const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
- do {
- if (U_FAILURE(status)) {
- return dest;
+ // Fetch the script-first-primary contractions which are defined in the root collator.
+ // They all start with U+FDD1.
+ UnicodeSet set;
+ collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
+ if (U_FAILURE(status)) {
+ return NULL;
+ }
+ if (set.isEmpty()) {
+ status = U_UNSUPPORTED_ERROR;
+ return NULL;
+ }
+ UnicodeSetIterator iter(set);
+ while (iter.next()) {
+ const UnicodeString &boundary = iter.getString();
+ uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
+ if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
+ // Ignore boundaries for the special reordering groups.
+ // Take only those for "real scripts" (where the sample character is a Letter),
+ // and the one for unassigned implicit weights (Cn).
+ continue;
}
- UnicodeString *str = new UnicodeString(src, -1);
- if (str == NULL) {
+ UnicodeString *s = new UnicodeString(boundary);
+ if (s == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
- return dest;
+ return NULL;
}
- dest->addElement(str, status);
- src += str->length() + 1;
- } while (src < limit);
- return dest;
+ dest->addElement(s, status);
+ }
+ return dest.orphan();
}
@@ -1347,4 +1246,4 @@ AlphabeticIndex::Bucket::~Bucket() {
U_NAMESPACE_END
-#endif
+#endif // !UCONFIG_NO_COLLATION
« no previous file with comments | « source/i18n/Makefile.in ('k') | source/i18n/anytrans.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698