Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(512)

Side by Side Diff: source/i18n/rulebasedcollator.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unusued directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/repattrn.cpp ('k') | source/i18n/scientificformathelper.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * rulebasedcollator.cpp
7 *
8 * (replaced the former tblcoll.cpp)
9 *
10 * created on: 2012feb14 with new and old collation code
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/coll.h"
19 #include "unicode/coleitr.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/locid.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/tblcoll.h"
24 #include "unicode/ucol.h"
25 #include "unicode/uiter.h"
26 #include "unicode/uloc.h"
27 #include "unicode/uniset.h"
28 #include "unicode/unistr.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/utf8.h"
31 #include "unicode/uversion.h"
32 #include "bocsu.h"
33 #include "charstr.h"
34 #include "cmemory.h"
35 #include "collation.h"
36 #include "collationcompare.h"
37 #include "collationdata.h"
38 #include "collationdatareader.h"
39 #include "collationfastlatin.h"
40 #include "collationiterator.h"
41 #include "collationkeys.h"
42 #include "collationroot.h"
43 #include "collationsets.h"
44 #include "collationsettings.h"
45 #include "collationtailoring.h"
46 #include "cstring.h"
47 #include "uassert.h"
48 #include "ucol_imp.h"
49 #include "uhash.h"
50 #include "uitercollationiterator.h"
51 #include "ustr_imp.h"
52 #include "utf16collationiterator.h"
53 #include "utf8collationiterator.h"
54 #include "uvectr64.h"
55
56 U_NAMESPACE_BEGIN
57
58 namespace {
59
60 class FixedSortKeyByteSink : public SortKeyByteSink {
61 public:
62 FixedSortKeyByteSink(char *dest, int32_t destCapacity)
63 : SortKeyByteSink(dest, destCapacity) {}
64 virtual ~FixedSortKeyByteSink();
65
66 private:
67 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);
68 virtual UBool Resize(int32_t appendCapacity, int32_t length);
69 };
70
71 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
72
73 void
74 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int 32_t length) {
75 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
76 // Fill the buffer completely.
77 int32_t available = capacity_ - length;
78 if (available > 0) {
79 uprv_memcpy(buffer_ + length, bytes, available);
80 }
81 }
82
83 UBool
84 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
85 return FALSE;
86 }
87
88 } // namespace
89
90 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
91 class CollationKeyByteSink : public SortKeyByteSink {
92 public:
93 CollationKeyByteSink(CollationKey &key)
94 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getC apacity()),
95 key_(key) {}
96 virtual ~CollationKeyByteSink();
97
98 private:
99 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t leng th);
100 virtual UBool Resize(int32_t appendCapacity, int32_t length);
101
102 CollationKey &key_;
103 };
104
105 CollationKeyByteSink::~CollationKeyByteSink() {}
106
107 void
108 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
109 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
110 if (Resize(n, length)) {
111 uprv_memcpy(buffer_ + length, bytes, n);
112 }
113 }
114
115 UBool
116 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
117 if (buffer_ == NULL) {
118 return FALSE; // allocation failed before already
119 }
120 int32_t newCapacity = 2 * capacity_;
121 int32_t altCapacity = length + 2 * appendCapacity;
122 if (newCapacity < altCapacity) {
123 newCapacity = altCapacity;
124 }
125 if (newCapacity < 200) {
126 newCapacity = 200;
127 }
128 uint8_t *newBuffer = key_.reallocate(newCapacity, length);
129 if (newBuffer == NULL) {
130 SetNotOk();
131 return FALSE;
132 }
133 buffer_ = reinterpret_cast<char *>(newBuffer);
134 capacity_ = newCapacity;
135 return TRUE;
136 }
137
138 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
139 : Collator(other),
140 data(other.data),
141 settings(other.settings),
142 tailoring(other.tailoring),
143 cacheEntry(other.cacheEntry),
144 validLocale(other.validLocale),
145 explicitlySetAttributes(other.explicitlySetAttributes),
146 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
147 settings->addRef();
148 cacheEntry->addRef();
149 }
150
151 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
152 const RuleBasedCollator *base, UErrorCode & errorCode)
153 : data(NULL),
154 settings(NULL),
155 tailoring(NULL),
156 cacheEntry(NULL),
157 validLocale(""),
158 explicitlySetAttributes(0),
159 actualLocaleIsSameAsValid(FALSE) {
160 if(U_FAILURE(errorCode)) { return; }
161 if(bin == NULL || length == 0 || base == NULL) {
162 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
163 return;
164 }
165 const CollationTailoring *root = CollationRoot::getRoot(errorCode);
166 if(U_FAILURE(errorCode)) { return; }
167 if(base->tailoring != root) {
168 errorCode = U_UNSUPPORTED_ERROR;
169 return;
170 }
171 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->s ettings));
172 if(t.isNull() || t->isBogus()) {
173 errorCode = U_MEMORY_ALLOCATION_ERROR;
174 return;
175 }
176 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
177 if(U_FAILURE(errorCode)) { return; }
178 t->actualLocale.setToBogus();
179 adoptTailoring(t.orphan(), errorCode);
180 }
181
182 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
183 : data(entry->tailoring->data),
184 settings(entry->tailoring->settings),
185 tailoring(entry->tailoring),
186 cacheEntry(entry),
187 validLocale(entry->validLocale),
188 explicitlySetAttributes(0),
189 actualLocaleIsSameAsValid(FALSE) {
190 settings->addRef();
191 cacheEntry->addRef();
192 }
193
194 RuleBasedCollator::~RuleBasedCollator() {
195 SharedObject::clearPtr(settings);
196 SharedObject::clearPtr(cacheEntry);
197 }
198
199 void
200 RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
201 if(U_FAILURE(errorCode)) {
202 t->deleteIfZeroRefCount();
203 return;
204 }
205 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
206 cacheEntry = new CollationCacheEntry(t->actualLocale, t);
207 if(cacheEntry == NULL) {
208 errorCode = U_MEMORY_ALLOCATION_ERROR;
209 t->deleteIfZeroRefCount();
210 return;
211 }
212 data = t->data;
213 settings = t->settings;
214 settings->addRef();
215 tailoring = t;
216 cacheEntry->addRef();
217 validLocale = t->actualLocale;
218 actualLocaleIsSameAsValid = FALSE;
219 }
220
221 Collator *
222 RuleBasedCollator::clone() const {
223 return new RuleBasedCollator(*this);
224 }
225
226 RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
227 if(this == &other) { return *this; }
228 SharedObject::copyPtr(other.settings, settings);
229 tailoring = other.tailoring;
230 SharedObject::copyPtr(other.cacheEntry, cacheEntry);
231 data = tailoring->data;
232 validLocale = other.validLocale;
233 explicitlySetAttributes = other.explicitlySetAttributes;
234 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
235 return *this;
236 }
237
238 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
239
240 UBool
241 RuleBasedCollator::operator==(const Collator& other) const {
242 if(this == &other) { return TRUE; }
243 if(!Collator::operator==(other)) { return FALSE; }
244 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
245 if(*settings != *o.settings) { return FALSE; }
246 if(data == o.data) { return TRUE; }
247 UBool thisIsRoot = data->base == NULL;
248 UBool otherIsRoot = o.data->base == NULL;
249 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers sho uld be ==
250 if(thisIsRoot != otherIsRoot) { return FALSE; }
251 if((thisIsRoot || !tailoring->rules.isEmpty()) &&
252 (otherIsRoot || !o.tailoring->rules.isEmpty())) {
253 // Shortcut: If both collators have valid rule strings, then compare tho se.
254 if(tailoring->rules == o.tailoring->rules) { return TRUE; }
255 }
256 // Different rule strings can result in the same or equivalent tailoring.
257 // The rule strings are optional in ICU resource bundles, although included by default.
258 // cloneBinary() drops the rule string.
259 UErrorCode errorCode = U_ZERO_ERROR;
260 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
261 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
262 if(U_FAILURE(errorCode)) { return FALSE; }
263 if(*thisTailored != *otherTailored) { return FALSE; }
264 // For completeness, we should compare all of the mappings;
265 // or we should create a list of strings, sort it with one collator,
266 // and check if both collators compare adjacent strings the same
267 // (order & strength, down to quaternary); or similar.
268 // Testing equality of collators seems unusual.
269 return TRUE;
270 }
271
272 int32_t
273 RuleBasedCollator::hashCode() const {
274 int32_t h = settings->hashCode();
275 if(data->base == NULL) { return h; } // root collator
276 // Do not rely on the rule string, see comments in operator==().
277 UErrorCode errorCode = U_ZERO_ERROR;
278 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
279 if(U_FAILURE(errorCode)) { return 0; }
280 UnicodeSetIterator iter(*set);
281 while(iter.next() && !iter.isString()) {
282 h ^= data->getCE32(iter.getCodepoint());
283 }
284 return h;
285 }
286
287 void
288 RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
289 const Locale &actual) {
290 if(actual == tailoring->actualLocale) {
291 actualLocaleIsSameAsValid = FALSE;
292 } else {
293 U_ASSERT(actual == valid);
294 actualLocaleIsSameAsValid = TRUE;
295 }
296 // Do not modify tailoring.actualLocale:
297 // We cannot be sure that that would be thread-safe.
298 validLocale = valid;
299 (void)requested; // Ignore, see also ticket #10477.
300 }
301
302 Locale
303 RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) con st {
304 if(U_FAILURE(errorCode)) {
305 return Locale::getRoot();
306 }
307 switch(type) {
308 case ULOC_ACTUAL_LOCALE:
309 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale ;
310 case ULOC_VALID_LOCALE:
311 return validLocale;
312 case ULOC_REQUESTED_LOCALE:
313 default:
314 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
315 return Locale::getRoot();
316 }
317 }
318
319 const char *
320 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &erro rCode) const {
321 if(U_FAILURE(errorCode)) {
322 return NULL;
323 }
324 const Locale *result;
325 switch(type) {
326 case ULOC_ACTUAL_LOCALE:
327 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLo cale;
328 break;
329 case ULOC_VALID_LOCALE:
330 result = &validLocale;
331 break;
332 case ULOC_REQUESTED_LOCALE:
333 default:
334 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
335 return NULL;
336 }
337 if(result->isBogus()) { return NULL; }
338 const char *id = result->getName();
339 return id[0] == 0 ? "root" : id;
340 }
341
342 const UnicodeString&
343 RuleBasedCollator::getRules() const {
344 return tailoring->rules;
345 }
346
347 void
348 RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
349 if(delta == UCOL_TAILORING_ONLY) {
350 buffer = tailoring->rules;
351 return;
352 }
353 // UCOL_FULL_RULES
354 buffer.remove();
355 CollationLoader::appendRootRules(buffer);
356 buffer.append(tailoring->rules).getTerminatedBuffer();
357 }
358
359 void
360 RuleBasedCollator::getVersion(UVersionInfo version) const {
361 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
362 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
363 }
364
365 UnicodeSet *
366 RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
367 if(U_FAILURE(errorCode)) { return NULL; }
368 UnicodeSet *tailored = new UnicodeSet();
369 if(tailored == NULL) {
370 errorCode = U_MEMORY_ALLOCATION_ERROR;
371 return NULL;
372 }
373 if(data->base != NULL) {
374 TailoredSet(tailored).forData(data, errorCode);
375 if(U_FAILURE(errorCode)) {
376 delete tailored;
377 return NULL;
378 }
379 }
380 return tailored;
381 }
382
383 void
384 RuleBasedCollator::internalGetContractionsAndExpansions(
385 UnicodeSet *contractions, UnicodeSet *expansions,
386 UBool addPrefixes, UErrorCode &errorCode) const {
387 if(U_FAILURE(errorCode)) { return; }
388 if(contractions != NULL) {
389 contractions->clear();
390 }
391 if(expansions != NULL) {
392 expansions->clear();
393 }
394 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forDa ta(data, errorCode);
395 }
396
397 void
398 RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCod e &errorCode) const {
399 if(U_FAILURE(errorCode)) { return; }
400 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, err orCode);
401 }
402
403 const CollationSettings &
404 RuleBasedCollator::getDefaultSettings() const {
405 return *tailoring->settings;
406 }
407
408 UColAttributeValue
409 RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
410 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
411 int32_t option;
412 switch(attr) {
413 case UCOL_FRENCH_COLLATION:
414 option = CollationSettings::BACKWARD_SECONDARY;
415 break;
416 case UCOL_ALTERNATE_HANDLING:
417 return settings->getAlternateHandling();
418 case UCOL_CASE_FIRST:
419 return settings->getCaseFirst();
420 case UCOL_CASE_LEVEL:
421 option = CollationSettings::CASE_LEVEL;
422 break;
423 case UCOL_NORMALIZATION_MODE:
424 option = CollationSettings::CHECK_FCD;
425 break;
426 case UCOL_STRENGTH:
427 return (UColAttributeValue)settings->getStrength();
428 case UCOL_HIRAGANA_QUATERNARY_MODE:
429 // Deprecated attribute, unsettable.
430 return UCOL_OFF;
431 case UCOL_NUMERIC_COLLATION:
432 option = CollationSettings::NUMERIC;
433 break;
434 default:
435 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
436 return UCOL_DEFAULT;
437 }
438 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
439 }
440
441 void
442 RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
443 UErrorCode &errorCode) {
444 UColAttributeValue oldValue = getAttribute(attr, errorCode);
445 if(U_FAILURE(errorCode)) { return; }
446 if(value == oldValue) {
447 setAttributeExplicitly(attr);
448 return;
449 }
450 const CollationSettings &defaultSettings = getDefaultSettings();
451 if(settings == &defaultSettings) {
452 if(value == UCOL_DEFAULT) {
453 setAttributeDefault(attr);
454 return;
455 }
456 }
457 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
458 if(ownedSettings == NULL) {
459 errorCode = U_MEMORY_ALLOCATION_ERROR;
460 return;
461 }
462
463 switch(attr) {
464 case UCOL_FRENCH_COLLATION:
465 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
466 defaultSettings.options, errorCode);
467 break;
468 case UCOL_ALTERNATE_HANDLING:
469 ownedSettings->setAlternateHandling(value, defaultSettings.options, erro rCode);
470 break;
471 case UCOL_CASE_FIRST:
472 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
473 break;
474 case UCOL_CASE_LEVEL:
475 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
476 defaultSettings.options, errorCode);
477 break;
478 case UCOL_NORMALIZATION_MODE:
479 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
480 defaultSettings.options, errorCode);
481 break;
482 case UCOL_STRENGTH:
483 ownedSettings->setStrength(value, defaultSettings.options, errorCode);
484 break;
485 case UCOL_HIRAGANA_QUATERNARY_MODE:
486 // Deprecated attribute. Check for valid values but do not change anythi ng.
487 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
488 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
489 }
490 break;
491 case UCOL_NUMERIC_COLLATION:
492 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSetting s.options, errorCode);
493 break;
494 default:
495 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
496 break;
497 }
498 if(U_FAILURE(errorCode)) { return; }
499 setFastLatinOptions(*ownedSettings);
500 if(value == UCOL_DEFAULT) {
501 setAttributeDefault(attr);
502 } else {
503 setAttributeExplicitly(attr);
504 }
505 }
506
507 Collator &
508 RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
509 if(U_FAILURE(errorCode)) { return *this; }
510 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
511 int32_t value;
512 if(group == UCOL_REORDER_CODE_DEFAULT) {
513 value = UCOL_DEFAULT;
514 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CUR RENCY) {
515 value = group - UCOL_REORDER_CODE_FIRST;
516 } else {
517 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
518 return *this;
519 }
520 CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
521 if(value == oldValue) {
522 setAttributeExplicitly(ATTR_VARIABLE_TOP);
523 return *this;
524 }
525 const CollationSettings &defaultSettings = getDefaultSettings();
526 if(settings == &defaultSettings) {
527 if(value == UCOL_DEFAULT) {
528 setAttributeDefault(ATTR_VARIABLE_TOP);
529 return *this;
530 }
531 }
532 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
533 if(ownedSettings == NULL) {
534 errorCode = U_MEMORY_ALLOCATION_ERROR;
535 return *this;
536 }
537
538 if(group == UCOL_REORDER_CODE_DEFAULT) {
539 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getM axVariable());
540 }
541 uint32_t varTop = data->getLastPrimaryForGroup(group);
542 U_ASSERT(varTop != 0);
543 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
544 if(U_FAILURE(errorCode)) { return *this; }
545 ownedSettings->variableTop = varTop;
546 setFastLatinOptions(*ownedSettings);
547 if(value == UCOL_DEFAULT) {
548 setAttributeDefault(ATTR_VARIABLE_TOP);
549 } else {
550 setAttributeExplicitly(ATTR_VARIABLE_TOP);
551 }
552 return *this;
553 }
554
555 UColReorderCode
556 RuleBasedCollator::getMaxVariable() const {
557 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable( ));
558 }
559
560 uint32_t
561 RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
562 return settings->variableTop;
563 }
564
565 uint32_t
566 RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode & errorCode) {
567 if(U_FAILURE(errorCode)) { return 0; }
568 if(varTop == NULL && len !=0) {
569 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
570 return 0;
571 }
572 if(len < 0) { len = u_strlen(varTop); }
573 if(len == 0) {
574 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
575 return 0;
576 }
577 UBool numeric = settings->isNumeric();
578 int64_t ce1, ce2;
579 if(settings->dontCheckFCD()) {
580 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
581 ce1 = ci.nextCE(errorCode);
582 ce2 = ci.nextCE(errorCode);
583 } else {
584 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len );
585 ce1 = ci.nextCE(errorCode);
586 ce2 = ci.nextCE(errorCode);
587 }
588 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
589 errorCode = U_CE_NOT_FOUND_ERROR;
590 return 0;
591 }
592 setVariableTop((uint32_t)(ce1 >> 32), errorCode);
593 return settings->variableTop;
594 }
595
596 uint32_t
597 RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &error Code) {
598 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
599 }
600
601 void
602 RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
603 if(U_FAILURE(errorCode)) { return; }
604 if(varTop != settings->variableTop) {
605 // Pin the variable top to the end of the reordering group which contain s it.
606 // Only a few special groups are supported.
607 int32_t group = data->getGroupForPrimary(varTop);
608 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group ) {
609 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
610 return;
611 }
612 uint32_t v = data->getLastPrimaryForGroup(group);
613 U_ASSERT(v != 0 && v >= varTop);
614 varTop = v;
615 if(varTop != settings->variableTop) {
616 CollationSettings *ownedSettings = SharedObject::copyOnWrite(setting s);
617 if(ownedSettings == NULL) {
618 errorCode = U_MEMORY_ALLOCATION_ERROR;
619 return;
620 }
621 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
622 getDefaultSettings().options, errorCod e);
623 if(U_FAILURE(errorCode)) { return; }
624 ownedSettings->variableTop = varTop;
625 setFastLatinOptions(*ownedSettings);
626 }
627 }
628 if(varTop == getDefaultSettings().variableTop) {
629 setAttributeDefault(ATTR_VARIABLE_TOP);
630 } else {
631 setAttributeExplicitly(ATTR_VARIABLE_TOP);
632 }
633 }
634
635 int32_t
636 RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
637 UErrorCode &errorCode) const {
638 if(U_FAILURE(errorCode)) { return 0; }
639 if(capacity < 0 || (dest == NULL && capacity > 0)) {
640 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
641 return 0;
642 }
643 int32_t length = settings->reorderCodesLength;
644 if(length == 0) { return 0; }
645 if(length > capacity) {
646 errorCode = U_BUFFER_OVERFLOW_ERROR;
647 return length;
648 }
649 uprv_memcpy(dest, settings->reorderCodes, length * 4);
650 return length;
651 }
652
653 void
654 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
655 UErrorCode &errorCode) {
656 if(U_FAILURE(errorCode)) { return; }
657 if(length < 0 || (reorderCodes == NULL && length > 0)) {
658 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
659 return;
660 }
661 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
662 length = 0;
663 }
664 if(length == settings->reorderCodesLength &&
665 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
666 return;
667 }
668 const CollationSettings &defaultSettings = getDefaultSettings();
669 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
670 if(settings != &defaultSettings) {
671 CollationSettings *ownedSettings = SharedObject::copyOnWrite(setting s);
672 if(ownedSettings == NULL) {
673 errorCode = U_MEMORY_ALLOCATION_ERROR;
674 return;
675 }
676 ownedSettings->aliasReordering(defaultSettings.reorderCodes,
677 defaultSettings.reorderCodesLength,
678 defaultSettings.reorderTable);
679 setFastLatinOptions(*ownedSettings);
680 }
681 return;
682 }
683 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
684 if(ownedSettings == NULL) {
685 errorCode = U_MEMORY_ALLOCATION_ERROR;
686 return;
687 }
688 if(length == 0) {
689 ownedSettings->resetReordering();
690 } else {
691 uint8_t reorderTable[256];
692 data->makeReorderTable(reorderCodes, length, reorderTable, errorCode);
693 if(U_FAILURE(errorCode)) { return; }
694 if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) {
695 errorCode = U_MEMORY_ALLOCATION_ERROR;
696 return;
697 }
698 }
699 setFastLatinOptions(*ownedSettings);
700 }
701
702 void
703 RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
704 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
705 data, ownedSettings,
706 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLa tinPrimaries));
707 }
708
709 UCollationResult
710 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right ,
711 UErrorCode &errorCode) const {
712 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
713 return doCompare(left.getBuffer(), left.length(),
714 right.getBuffer(), right.length(), errorCode);
715 }
716
717 UCollationResult
718 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right ,
719 int32_t length, UErrorCode &errorCode) const {
720 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
721 if(length < 0) {
722 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
723 return UCOL_EQUAL;
724 }
725 int32_t leftLength = left.length();
726 int32_t rightLength = right.length();
727 if(leftLength > length) { leftLength = length; }
728 if(rightLength > length) { rightLength = length; }
729 return doCompare(left.getBuffer(), leftLength,
730 right.getBuffer(), rightLength, errorCode);
731 }
732
733 UCollationResult
734 RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
735 const UChar *right, int32_t rightLength,
736 UErrorCode &errorCode) const {
737 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
738 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
739 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
740 return UCOL_EQUAL;
741 }
742 // Make sure both or neither strings have a known length.
743 // We do not optimize for mixed length/termination.
744 if(leftLength >= 0) {
745 if(rightLength < 0) { rightLength = u_strlen(right); }
746 } else {
747 if(rightLength >= 0) { leftLength = u_strlen(left); }
748 }
749 return doCompare(left, leftLength, right, rightLength, errorCode);
750 }
751
752 UCollationResult
753 RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right ,
754 UErrorCode &errorCode) const {
755 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
756 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
757 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
758 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.emp ty())) {
759 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
760 return UCOL_EQUAL;
761 }
762 return doCompare(leftBytes, left.length(), rightBytes, right.length(), error Code);
763 }
764
765 UCollationResult
766 RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
767 const char *right, int32_t rightLength,
768 UErrorCode &errorCode) const {
769 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
770 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
771 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
772 return UCOL_EQUAL;
773 }
774 // Make sure both or neither strings have a known length.
775 // We do not optimize for mixed length/termination.
776 if(leftLength >= 0) {
777 if(rightLength < 0) { rightLength = uprv_strlen(right); }
778 } else {
779 if(rightLength >= 0) { leftLength = uprv_strlen(left); }
780 }
781 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
782 reinterpret_cast<const uint8_t *>(right), rightLength, erro rCode);
783 }
784
785 namespace {
786
787 /**
788 * Abstract iterator for identical-level string comparisons.
789 * Returns FCD code points and handles temporary switching to NFD.
790 */
791 class NFDIterator : public UObject {
792 public:
793 NFDIterator() : index(-1), length(0) {}
794 virtual ~NFDIterator() {}
795 /**
796 * Returns the next code point from the internal normalization buffer,
797 * or else the next text code point.
798 * Returns -1 at the end of the text.
799 */
800 UChar32 nextCodePoint() {
801 if(index >= 0) {
802 if(index == length) {
803 index = -1;
804 } else {
805 UChar32 c;
806 U16_NEXT_UNSAFE(decomp, index, c);
807 return c;
808 }
809 }
810 return nextRawCodePoint();
811 }
812 /**
813 * @param nfcImpl
814 * @param c the last code point returned by nextCodePoint() or nextDecompose dCodePoint()
815 * @return the first code point in c's decomposition,
816 * or c itself if it was decomposed already or if it does not decomp ose
817 */
818 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
819 if(index >= 0) { return c; }
820 decomp = nfcImpl.getDecomposition(c, buffer, length);
821 if(decomp == NULL) { return c; }
822 index = 0;
823 U16_NEXT_UNSAFE(decomp, index, c);
824 return c;
825 }
826 protected:
827 /**
828 * Returns the next text code point in FCD order.
829 * Returns -1 at the end of the text.
830 */
831 virtual UChar32 nextRawCodePoint() = 0;
832 private:
833 const UChar *decomp;
834 UChar buffer[4];
835 int32_t index;
836 int32_t length;
837 };
838
839 class UTF16NFDIterator : public NFDIterator {
840 public:
841 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit (textLimit) {}
842 protected:
843 virtual UChar32 nextRawCodePoint() {
844 if(s == limit) { return U_SENTINEL; }
845 UChar32 c = *s++;
846 if(limit == NULL && c == 0) {
847 s = NULL;
848 return U_SENTINEL;
849 }
850 UChar trail;
851 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
852 ++s;
853 c = U16_GET_SUPPLEMENTARY(c, trail);
854 }
855 return c;
856 }
857
858 const UChar *s;
859 const UChar *limit;
860 };
861
862 class FCDUTF16NFDIterator : public UTF16NFDIterator {
863 public:
864 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
865 : UTF16NFDIterator(NULL, NULL) {
866 UErrorCode errorCode = U_ZERO_ERROR;
867 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCod e);
868 if(U_FAILURE(errorCode)) { return; }
869 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
870 s = text;
871 limit = spanLimit;
872 } else {
873 str.setTo(text, (int32_t)(spanLimit - text));
874 {
875 ReorderingBuffer buffer(nfcImpl, str);
876 if(buffer.init(str.length(), errorCode)) {
877 nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
878 }
879 }
880 if(U_SUCCESS(errorCode)) {
881 s = str.getBuffer();
882 limit = s + str.length();
883 }
884 }
885 }
886 private:
887 UnicodeString str;
888 };
889
890 class UTF8NFDIterator : public NFDIterator {
891 public:
892 UTF8NFDIterator(const uint8_t *text, int32_t textLength)
893 : s(text), pos(0), length(textLength) {}
894 protected:
895 virtual UChar32 nextRawCodePoint() {
896 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
897 UChar32 c;
898 U8_NEXT_OR_FFFD(s, pos, length, c);
899 return c;
900 }
901
902 const uint8_t *s;
903 int32_t pos;
904 int32_t length;
905 };
906
907 class FCDUTF8NFDIterator : public NFDIterator {
908 public:
909 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t t extLength)
910 : u8ci(data, FALSE, text, 0, textLength) {}
911 protected:
912 virtual UChar32 nextRawCodePoint() {
913 UErrorCode errorCode = U_ZERO_ERROR;
914 return u8ci.nextCodePoint(errorCode);
915 }
916 private:
917 FCDUTF8CollationIterator u8ci;
918 };
919
920 class UIterNFDIterator : public NFDIterator {
921 public:
922 UIterNFDIterator(UCharIterator &it) : iter(it) {}
923 protected:
924 virtual UChar32 nextRawCodePoint() {
925 return uiter_next32(&iter);
926 }
927 private:
928 UCharIterator &iter;
929 };
930
931 class FCDUIterNFDIterator : public NFDIterator {
932 public:
933 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t st artIndex)
934 : uici(data, FALSE, it, startIndex) {}
935 protected:
936 virtual UChar32 nextRawCodePoint() {
937 UErrorCode errorCode = U_ZERO_ERROR;
938 return uici.nextCodePoint(errorCode);
939 }
940 private:
941 FCDUIterCollationIterator uici;
942 };
943
944 UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
945 NFDIterator &left, NFDIterator &right) {
946 for(;;) {
947 // Fetch the next FCD code point from each string.
948 UChar32 leftCp = left.nextCodePoint();
949 UChar32 rightCp = right.nextCodePoint();
950 if(leftCp == rightCp) {
951 if(leftCp < 0) { break; }
952 continue;
953 }
954 // If they are different, then decompose each and compare again.
955 if(leftCp < 0) {
956 leftCp = -2; // end of string
957 } else if(leftCp == 0xfffe) {
958 leftCp = -1; // U+FFFE: merge separator
959 } else {
960 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
961 }
962 if(rightCp < 0) {
963 rightCp = -2; // end of string
964 } else if(rightCp == 0xfffe) {
965 rightCp = -1; // U+FFFE: merge separator
966 } else {
967 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
968 }
969 if(leftCp < rightCp) { return UCOL_LESS; }
970 if(leftCp > rightCp) { return UCOL_GREATER; }
971 }
972 return UCOL_EQUAL;
973 }
974
975 } // namespace
976
977 UCollationResult
978 RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
979 const UChar *right, int32_t rightLength,
980 UErrorCode &errorCode) const {
981 // U_FAILURE(errorCode) checked by caller.
982 if(left == right && leftLength == rightLength) {
983 return UCOL_EQUAL;
984 }
985
986 // Identical-prefix test.
987 const UChar *leftLimit;
988 const UChar *rightLimit;
989 int32_t equalPrefixLength = 0;
990 if(leftLength < 0) {
991 leftLimit = NULL;
992 rightLimit = NULL;
993 UChar c;
994 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
995 if(c == 0) { return UCOL_EQUAL; }
996 ++equalPrefixLength;
997 }
998 } else {
999 leftLimit = left + leftLength;
1000 rightLimit = right + rightLength;
1001 for(;;) {
1002 if(equalPrefixLength == leftLength) {
1003 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1004 break;
1005 } else if(equalPrefixLength == rightLength ||
1006 left[equalPrefixLength] != right[equalPrefixLength]) {
1007 break;
1008 }
1009 ++equalPrefixLength;
1010 }
1011 }
1012
1013 UBool numeric = settings->isNumeric();
1014 if(equalPrefixLength > 0) {
1015 if((equalPrefixLength != leftLength &&
1016 data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
1017 (equalPrefixLength != rightLength &&
1018 data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
1019 // Identical prefix: Back up to the start of a contraction or reorde ring sequence.
1020 while(--equalPrefixLength > 0 &&
1021 data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
1022 }
1023 // Notes:
1024 // - A longer string can compare equal to a prefix of it if only ignorab les follow.
1025 // - With a backward level, a longer string can compare less-than a pref ix of it.
1026
1027 // Pass the actual start of each string into the CollationIterators,
1028 // plus the equalPrefixLength position,
1029 // so that prefix matches back into the equal prefix work.
1030 }
1031
1032 int32_t result;
1033 int32_t fastLatinOptions = settings->fastLatinOptions;
1034 if(fastLatinOptions >= 0 &&
1035 (equalPrefixLength == leftLength ||
1036 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
1037 (equalPrefixLength == rightLength ||
1038 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
1039 if(leftLength >= 0) {
1040 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1041 settings->fastLatinPrimari es,
1042 fastLatinOptions,
1043 left + equalPrefixLength,
1044 leftLength - equalPrefixLe ngth,
1045 right + equalPrefixLength,
1046 rightLength - equalPrefixL ength);
1047 } else {
1048 result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1049 settings->fastLatinPrimari es,
1050 fastLatinOptions,
1051 left + equalPrefixLength, -1,
1052 right + equalPrefixLength, -1);
1053 }
1054 } else {
1055 result = CollationFastLatin::BAIL_OUT_RESULT;
1056 }
1057
1058 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1059 if(settings->dontCheckFCD()) {
1060 UTF16CollationIterator leftIter(data, numeric,
1061 left, left + equalPrefixLength, left Limit);
1062 UTF16CollationIterator rightIter(data, numeric,
1063 right, right + equalPrefixLength, ri ghtLimit);
1064 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter , *settings, errorCode);
1065 } else {
1066 FCDUTF16CollationIterator leftIter(data, numeric,
1067 left, left + equalPrefixLength, le ftLimit);
1068 FCDUTF16CollationIterator rightIter(data, numeric,
1069 right, right + equalPrefixLength , rightLimit);
1070 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter , *settings, errorCode);
1071 }
1072 }
1073 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAI LURE(errorCode)) {
1074 return (UCollationResult)result;
1075 }
1076
1077 // Note: If NUL-terminated, we could get the actual limits from the iterator s now.
1078 // That would complicate the iterators a bit, NUL-terminated strings are onl y a C convenience,
1079 // and the benefit seems unlikely to be measurable.
1080
1081 // Compare identical level.
1082 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1083 left += equalPrefixLength;
1084 right += equalPrefixLength;
1085 if(settings->dontCheckFCD()) {
1086 UTF16NFDIterator leftIter(left, leftLimit);
1087 UTF16NFDIterator rightIter(right, rightLimit);
1088 return compareNFDIter(nfcImpl, leftIter, rightIter);
1089 } else {
1090 FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
1091 FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
1092 return compareNFDIter(nfcImpl, leftIter, rightIter);
1093 }
1094 }
1095
1096 UCollationResult
1097 RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
1098 const uint8_t *right, int32_t rightLength,
1099 UErrorCode &errorCode) const {
1100 // U_FAILURE(errorCode) checked by caller.
1101 if(left == right && leftLength == rightLength) {
1102 return UCOL_EQUAL;
1103 }
1104
1105 // Identical-prefix test.
1106 int32_t equalPrefixLength = 0;
1107 if(leftLength < 0) {
1108 uint8_t c;
1109 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
1110 if(c == 0) { return UCOL_EQUAL; }
1111 ++equalPrefixLength;
1112 }
1113 } else {
1114 for(;;) {
1115 if(equalPrefixLength == leftLength) {
1116 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1117 break;
1118 } else if(equalPrefixLength == rightLength ||
1119 left[equalPrefixLength] != right[equalPrefixLength]) {
1120 break;
1121 }
1122 ++equalPrefixLength;
1123 }
1124 }
1125 // Back up to the start of a partially-equal code point.
1126 if(equalPrefixLength > 0 &&
1127 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLen gth])) ||
1128 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLe ngth])))) {
1129 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) { }
1130 }
1131
1132 UBool numeric = settings->isNumeric();
1133 if(equalPrefixLength > 0) {
1134 UBool unsafe = FALSE;
1135 if(equalPrefixLength != leftLength) {
1136 int32_t i = equalPrefixLength;
1137 UChar32 c;
1138 U8_NEXT_OR_FFFD(left, i, leftLength, c);
1139 unsafe = data->isUnsafeBackward(c, numeric);
1140 }
1141 if(!unsafe && equalPrefixLength != rightLength) {
1142 int32_t i = equalPrefixLength;
1143 UChar32 c;
1144 U8_NEXT_OR_FFFD(right, i, rightLength, c);
1145 unsafe = data->isUnsafeBackward(c, numeric);
1146 }
1147 if(unsafe) {
1148 // Identical prefix: Back up to the start of a contraction or reorde ring sequence.
1149 UChar32 c;
1150 do {
1151 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
1152 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric)) ;
1153 }
1154 // See the notes in the UTF-16 version.
1155
1156 // Pass the actual start of each string into the CollationIterators,
1157 // plus the equalPrefixLength position,
1158 // so that prefix matches back into the equal prefix work.
1159 }
1160
1161 int32_t result;
1162 int32_t fastLatinOptions = settings->fastLatinOptions;
1163 if(fastLatinOptions >= 0 &&
1164 (equalPrefixLength == leftLength ||
1165 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LE AD) &&
1166 (equalPrefixLength == rightLength ||
1167 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_L EAD)) {
1168 if(leftLength >= 0) {
1169 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1170 settings->fastLatinPrimarie s,
1171 fastLatinOptions,
1172 left + equalPrefixLength,
1173 leftLength - equalPrefixLen gth,
1174 right + equalPrefixLength,
1175 rightLength - equalPrefixLe ngth);
1176 } else {
1177 result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1178 settings->fastLatinPrimarie s,
1179 fastLatinOptions,
1180 left + equalPrefixLength, - 1,
1181 right + equalPrefixLength, -1);
1182 }
1183 } else {
1184 result = CollationFastLatin::BAIL_OUT_RESULT;
1185 }
1186
1187 if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1188 if(settings->dontCheckFCD()) {
1189 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLengt h, leftLength);
1190 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLen gth, rightLength);
1191 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter , *settings, errorCode);
1192 } else {
1193 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLe ngth, leftLength);
1194 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefix Length, rightLength);
1195 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter , *settings, errorCode);
1196 }
1197 }
1198 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAI LURE(errorCode)) {
1199 return (UCollationResult)result;
1200 }
1201
1202 // Note: If NUL-terminated, we could get the actual limits from the iterator s now.
1203 // That would complicate the iterators a bit, NUL-terminated strings are onl y a C convenience,
1204 // and the benefit seems unlikely to be measurable.
1205
1206 // Compare identical level.
1207 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1208 left += equalPrefixLength;
1209 right += equalPrefixLength;
1210 if(leftLength > 0) {
1211 leftLength -= equalPrefixLength;
1212 rightLength -= equalPrefixLength;
1213 }
1214 if(settings->dontCheckFCD()) {
1215 UTF8NFDIterator leftIter(left, leftLength);
1216 UTF8NFDIterator rightIter(right, rightLength);
1217 return compareNFDIter(nfcImpl, leftIter, rightIter);
1218 } else {
1219 FCDUTF8NFDIterator leftIter(data, left, leftLength);
1220 FCDUTF8NFDIterator rightIter(data, right, rightLength);
1221 return compareNFDIter(nfcImpl, leftIter, rightIter);
1222 }
1223 }
1224
1225 UCollationResult
1226 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
1227 UErrorCode &errorCode) const {
1228 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
1229 UBool numeric = settings->isNumeric();
1230
1231 // Identical-prefix test.
1232 int32_t equalPrefixLength = 0;
1233 {
1234 UChar32 leftUnit;
1235 UChar32 rightUnit;
1236 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
1237 if(leftUnit < 0) { return UCOL_EQUAL; }
1238 ++equalPrefixLength;
1239 }
1240
1241 // Back out the code units that differed, for the real collation compari son.
1242 if(leftUnit >= 0) { left.previous(&left); }
1243 if(rightUnit >= 0) { right.previous(&right); }
1244
1245 if(equalPrefixLength > 0) {
1246 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
1247 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric ))) {
1248 // Identical prefix: Back up to the start of a contraction or re ordering sequence.
1249 do {
1250 --equalPrefixLength;
1251 leftUnit = left.previous(&left);
1252 right.previous(&right);
1253 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit , numeric));
1254 }
1255 // See the notes in the UTF-16 version.
1256 }
1257 }
1258
1259 UCollationResult result;
1260 if(settings->dontCheckFCD()) {
1261 UIterCollationIterator leftIter(data, numeric, left);
1262 UIterCollationIterator rightIter(data, numeric, right);
1263 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *s ettings, errorCode);
1264 } else {
1265 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLengt h);
1266 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLen gth);
1267 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *s ettings, errorCode);
1268 }
1269 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAI LURE(errorCode)) {
1270 return result;
1271 }
1272
1273 // Compare identical level.
1274 left.move(&left, equalPrefixLength, UITER_ZERO);
1275 right.move(&right, equalPrefixLength, UITER_ZERO);
1276 const Normalizer2Impl &nfcImpl = data->nfcImpl;
1277 if(settings->dontCheckFCD()) {
1278 UIterNFDIterator leftIter(left);
1279 UIterNFDIterator rightIter(right);
1280 return compareNFDIter(nfcImpl, leftIter, rightIter);
1281 } else {
1282 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
1283 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
1284 return compareNFDIter(nfcImpl, leftIter, rightIter);
1285 }
1286 }
1287
1288 CollationKey &
1289 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1290 UErrorCode &errorCode) const {
1291 return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1292 }
1293
1294 CollationKey &
1295 RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
1296 UErrorCode &errorCode) const {
1297 if(U_FAILURE(errorCode)) {
1298 return key.setToBogus();
1299 }
1300 if(s == NULL && length != 0) {
1301 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1302 return key.setToBogus();
1303 }
1304 key.reset(); // resets the "bogus" state
1305 CollationKeyByteSink sink(key);
1306 writeSortKey(s, length, sink, errorCode);
1307 if(U_FAILURE(errorCode)) {
1308 key.setToBogus();
1309 } else if(key.isBogus()) {
1310 errorCode = U_MEMORY_ALLOCATION_ERROR;
1311 } else {
1312 key.setLength(sink.NumberOfBytesAppended());
1313 }
1314 return key;
1315 }
1316
1317 int32_t
1318 RuleBasedCollator::getSortKey(const UnicodeString &s,
1319 uint8_t *dest, int32_t capacity) const {
1320 return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1321 }
1322
1323 int32_t
1324 RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
1325 uint8_t *dest, int32_t capacity) const {
1326 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
1327 return 0;
1328 }
1329 uint8_t noDest[1] = { 0 };
1330 if(dest == NULL) {
1331 // Distinguish pure preflighting from an allocation error.
1332 dest = noDest;
1333 capacity = 0;
1334 }
1335 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1336 UErrorCode errorCode = U_ZERO_ERROR;
1337 writeSortKey(s, length, sink, errorCode);
1338 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1339 }
1340
1341 void
1342 RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
1343 SortKeyByteSink &sink, UErrorCode &errorCode) co nst {
1344 if(U_FAILURE(errorCode)) { return; }
1345 const UChar *limit = (length >= 0) ? s + length : NULL;
1346 UBool numeric = settings->isNumeric();
1347 CollationKeys::LevelCallback callback;
1348 if(settings->dontCheckFCD()) {
1349 UTF16CollationIterator iter(data, numeric, s, s, limit);
1350 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1351 sink, Collation::PRIMARY_LEVEL ,
1352 callback, TRUE, errorCode);
1353 } else {
1354 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1355 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1356 sink, Collation::PRIMARY_LEVEL ,
1357 callback, TRUE, errorCode);
1358 }
1359 if(settings->getStrength() == UCOL_IDENTICAL) {
1360 writeIdenticalLevel(s, limit, sink, errorCode);
1361 }
1362 static const char terminator = 0; // TERMINATOR_BYTE
1363 sink.Append(&terminator, 1);
1364 }
1365
1366 void
1367 RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
1368 SortKeyByteSink &sink, UErrorCode &errorC ode) const {
1369 // NFD quick check
1370 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCo de);
1371 if(U_FAILURE(errorCode)) { return; }
1372 sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1373 UChar32 prev = 0;
1374 if(nfdQCYesLimit != s) {
1375 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), s ink);
1376 }
1377 // Is there non-NFD text?
1378 int32_t destLengthEstimate;
1379 if(limit != NULL) {
1380 if(nfdQCYesLimit == limit) { return; }
1381 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1382 } else {
1383 // s is NUL-terminated
1384 if(*nfdQCYesLimit == 0) { return; }
1385 destLengthEstimate = -1;
1386 }
1387 UnicodeString nfd;
1388 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, error Code);
1389 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1390 }
1391
1392 namespace {
1393
1394 /**
1395 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1396 * with an instance of this callback class.
1397 * When another level is about to be written, the callback
1398 * records the level and the number of bytes that will be written until
1399 * the sink (which is actually a FixedSortKeyByteSink) fills up.
1400 *
1401 * When internalNextSortKeyPart() is called again, it restarts with the last lev el
1402 * and ignores as many bytes as were written previously for that level.
1403 */
1404 class PartLevelCallback : public CollationKeys::LevelCallback {
1405 public:
1406 PartLevelCallback(const SortKeyByteSink &s)
1407 : sink(s), level(Collation::PRIMARY_LEVEL) {
1408 levelCapacity = sink.GetRemainingCapacity();
1409 }
1410 virtual ~PartLevelCallback() {}
1411 virtual UBool needToWrite(Collation::Level l) {
1412 if(!sink.Overflowed()) {
1413 // Remember a level that will be at least partially written.
1414 level = l;
1415 levelCapacity = sink.GetRemainingCapacity();
1416 return TRUE;
1417 } else {
1418 return FALSE;
1419 }
1420 }
1421 Collation::Level getLevel() const { return level; }
1422 int32_t getLevelCapacity() const { return levelCapacity; }
1423
1424 private:
1425 const SortKeyByteSink &sink;
1426 Collation::Level level;
1427 int32_t levelCapacity;
1428 };
1429
1430 } // namespace
1431
1432 int32_t
1433 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2 ],
1434 uint8_t *dest, int32_t count, UErrorC ode &errorCode) const {
1435 if(U_FAILURE(errorCode)) { return 0; }
1436 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL) ) {
1437 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1438 return 0;
1439 }
1440 if(count == 0) { return 0; }
1441
1442 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
1443 sink.IgnoreBytes((int32_t)state[1]);
1444 iter->move(iter, 0, UITER_START);
1445
1446 Collation::Level level = (Collation::Level)state[0];
1447 if(level <= Collation::QUATERNARY_LEVEL) {
1448 UBool numeric = settings->isNumeric();
1449 PartLevelCallback callback(sink);
1450 if(settings->dontCheckFCD()) {
1451 UIterCollationIterator ci(data, numeric, *iter);
1452 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleByte s, *settings,
1453 sink, level, callback, FAL SE, errorCode);
1454 } else {
1455 FCDUIterCollationIterator ci(data, numeric, *iter, 0);
1456 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleByte s, *settings,
1457 sink, level, callback, FAL SE, errorCode);
1458 }
1459 if(U_FAILURE(errorCode)) { return 0; }
1460 if(sink.NumberOfBytesAppended() > count) {
1461 state[0] = (uint32_t)callback.getLevel();
1462 state[1] = (uint32_t)callback.getLevelCapacity();
1463 return count;
1464 }
1465 // All of the normal levels are done.
1466 if(settings->getStrength() == UCOL_IDENTICAL) {
1467 level = Collation::IDENTICAL_LEVEL;
1468 iter->move(iter, 0, UITER_START);
1469 }
1470 // else fall through to setting ZERO_LEVEL
1471 }
1472
1473 if(level == Collation::IDENTICAL_LEVEL) {
1474 int32_t levelCapacity = sink.GetRemainingCapacity();
1475 UnicodeString s;
1476 for(;;) {
1477 UChar32 c = iter->next(iter);
1478 if(c < 0) { break; }
1479 s.append((UChar)c);
1480 }
1481 const UChar *sArray = s.getBuffer();
1482 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
1483 if(U_FAILURE(errorCode)) { return 0; }
1484 if(sink.NumberOfBytesAppended() > count) {
1485 state[0] = (uint32_t)level;
1486 state[1] = (uint32_t)levelCapacity;
1487 return count;
1488 }
1489 }
1490
1491 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1492 state[0] = (uint32_t)Collation::ZERO_LEVEL;
1493 state[1] = 0;
1494 int32_t length = sink.NumberOfBytesAppended();
1495 int32_t i = length;
1496 while(i < count) { dest[i++] = 0; }
1497 return length;
1498 }
1499
1500 void
1501 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1502 UErrorCode &errorCode) const {
1503 if(U_FAILURE(errorCode)) { return; }
1504 const UChar *s = str.getBuffer();
1505 const UChar *limit = s + str.length();
1506 UBool numeric = settings->isNumeric();
1507 if(settings->dontCheckFCD()) {
1508 UTF16CollationIterator iter(data, numeric, s, s, limit);
1509 int64_t ce;
1510 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1511 ces.addElement(ce, errorCode);
1512 }
1513 } else {
1514 FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1515 int64_t ce;
1516 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1517 ces.addElement(ce, errorCode);
1518 }
1519 }
1520 }
1521
1522 namespace {
1523
1524 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length ,
1525 UErrorCode &errorCode) {
1526 if(U_FAILURE(errorCode) || length == 0) { return; }
1527 if(!s.isEmpty()) {
1528 s.append('_', errorCode);
1529 }
1530 s.append(letter, errorCode);
1531 for(int32_t i = 0; i < length; ++i) {
1532 s.append(uprv_toupper(subtag[i]), errorCode);
1533 }
1534 }
1535
1536 void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1537 UErrorCode &errorCode) {
1538 if(U_FAILURE(errorCode)) { return; }
1539 if(!s.isEmpty()) {
1540 s.append('_', errorCode);
1541 }
1542 static const char *valueChars = "1234...........IXO..SN..LU......";
1543 s.append(letter, errorCode);
1544 s.append(valueChars[value], errorCode);
1545 }
1546
1547 } // namespace
1548
1549 int32_t
1550 RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1551 char *buffer, int32_t capaci ty,
1552 UErrorCode &errorCode) const {
1553 if(U_FAILURE(errorCode)) { return 0; }
1554 if(buffer == NULL ? capacity != 0 : capacity < 0) {
1555 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1556 return 0;
1557 }
1558 if(locale == NULL) {
1559 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1560 }
1561
1562 char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1563 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CA PACITY,
1564 "collation", locale,
1565 NULL, &errorCode);
1566 if(U_FAILURE(errorCode)) { return 0; }
1567 if(length == 0) {
1568 uprv_strcpy(resultLocale, "root");
1569 } else {
1570 resultLocale[length] = 0;
1571 }
1572
1573 // Append items in alphabetic order of their short definition letters.
1574 CharString result;
1575 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1576
1577 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1578 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, error Code), errorCode);
1579 }
1580 // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1581 // See ICU tickets #10372 and #10386.
1582 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1583 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), e rrorCode);
1584 }
1585 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1586 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorC ode), errorCode);
1587 }
1588 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1589 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), e rrorCode);
1590 }
1591 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1592 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCo de), errorCode);
1593 }
1594 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1595 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTH OF(subtag), &errorCode);
1596 appendSubtag(result, 'K', subtag, length, errorCode);
1597 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &erro rCode);
1598 appendSubtag(result, 'L', subtag, length, errorCode);
1599 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1600 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, error Code), errorCode);
1601 }
1602 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &error Code);
1603 appendSubtag(result, 'R', subtag, length, errorCode);
1604 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1605 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), err orCode);
1606 }
1607 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &error Code);
1608 appendSubtag(result, 'V', subtag, length, errorCode);
1609 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorC ode);
1610 appendSubtag(result, 'Z', subtag, length, errorCode);
1611
1612 if(U_FAILURE(errorCode)) { return 0; }
1613 if(result.length() <= capacity) {
1614 uprv_memcpy(buffer, result.data(), result.length());
1615 }
1616 return u_terminateChars(buffer, capacity, result.length(), &errorCode);
1617 }
1618
1619 UBool
1620 RuleBasedCollator::isUnsafe(UChar32 c) const {
1621 return data->isUnsafeBackward(c, settings->isNumeric());
1622 }
1623
1624 void
1625 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1626 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, e rrorCode);
1627 }
1628
1629 UBool
1630 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1631 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailor ing, errorCode);
1632 return U_SUCCESS(errorCode);
1633 }
1634
1635 CollationElementIterator *
1636 RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) c onst {
1637 UErrorCode errorCode = U_ZERO_ERROR;
1638 if(!initMaxExpansions(errorCode)) { return NULL; }
1639 CollationElementIterator *cei = new CollationElementIterator(source, this, e rrorCode);
1640 if(U_FAILURE(errorCode)) {
1641 delete cei;
1642 return NULL;
1643 }
1644 return cei;
1645 }
1646
1647 CollationElementIterator *
1648 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& sourc e) const {
1649 UErrorCode errorCode = U_ZERO_ERROR;
1650 if(!initMaxExpansions(errorCode)) { return NULL; }
1651 CollationElementIterator *cei = new CollationElementIterator(source, this, e rrorCode);
1652 if(U_FAILURE(errorCode)) {
1653 delete cei;
1654 return NULL;
1655 }
1656 return cei;
1657 }
1658
1659 int32_t
1660 RuleBasedCollator::getMaxExpansion(int32_t order) const {
1661 UErrorCode errorCode = U_ZERO_ERROR;
1662 (void)initMaxExpansions(errorCode);
1663 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, o rder);
1664 }
1665
1666 U_NAMESPACE_END
1667
1668 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/repattrn.cpp ('k') | source/i18n/scientificformathelper.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698