source/i18n/collationiterator.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/collationiterator.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unusued directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 * Copyright (C) 2010-2014, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 *******************************************************************************

	6 * collationiterator.cpp

	7 *

	8 * created on: 2010oct27

	9 * created by: Markus W. Scherer

	10 */

	11

	12 #include "utypeinfo.h" // for 'typeid' to work

	13

	14 #include "unicode/utypes.h"

	15

	16 #if !UCONFIG_NO_COLLATION

	17

	18 #include "unicode/ucharstrie.h"

	19 #include "unicode/ustringtrie.h"

	20 #include "charstr.h"

	21 #include "cmemory.h"

	22 #include "collation.h"

	23 #include "collationdata.h"

	24 #include "collationfcd.h"

	25 #include "collationiterator.h"

	26 #include "normalizer2impl.h"

	27 #include "uassert.h"

	28 #include "uvectr32.h"

	29

	30 U_NAMESPACE_BEGIN

	31

	32 CollationIterator::CEBuffer::~CEBuffer() {}

	33

	34 UBool

	35 CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &er rorCode) {

	36 int32_t capacity = buffer.getCapacity();

	37 if((length + appCap) <= capacity) { return TRUE; }

	38 if(U_FAILURE(errorCode)) { return FALSE; }

	39 do {

	40 if(capacity < 1000) {

	41 capacity *= 4;

	42 } else {

	43 capacity *= 2;

	44 }

	45 } while(capacity < (length + appCap));

	46 int64_t *p = buffer.resize(capacity, length);

	47 if(p == NULL) {

	48 errorCode = U_MEMORY_ALLOCATION_ERROR;

	49 return FALSE;

	50 }

	51 return TRUE;

	52 }

	53

	54 // State of combining marks skipped in discontiguous contraction.

	55 // We create a state object on first use and keep it around deactivated between uses.

	56 class SkippedState : public UMemory {

	57 public:

	58 // Born active but empty.

	59 SkippedState() : pos(0), skipLengthAtMatch(0) {}

	60 void clear() {

	61 oldBuffer.remove();

	62 pos = 0;

	63 // The newBuffer is reset by setFirstSkipped().

	64 }

	65

	66 UBool isEmpty() const { return oldBuffer.isEmpty(); }

	67

	68 UBool hasNext() const { return pos < oldBuffer.length(); }

	69

	70 // Requires hasNext().

	71 UChar32 next() {

	72 UChar32 c = oldBuffer.char32At(pos);

	73 pos += U16_LENGTH(c);

	74 return c;

	75 }

	76

	77 // Accounts for one more input code point read beyond the end of the marks b uffer.

	78 void incBeyond() {

	79 U_ASSERT(!hasNext());

	80 ++pos;

	81 }

	82

	83 // Goes backward through the skipped-marks buffer.

	84 // Returns the number of code points read beyond the skipped marks

	85 // that need to be backtracked through normal input.

	86 int32_t backwardNumCodePoints(int32_t n) {

	87 int32_t length = oldBuffer.length();

	88 int32_t beyond = pos - length;

	89 if(beyond > 0) {

	90 if(beyond >= n) {

	91 // Not back far enough to re-enter the oldBuffer.

	92 pos -= n;

	93 return n;

	94 } else {

	95 // Back out all beyond-oldBuffer code points and re-enter the bu ffer.

	96 pos = oldBuffer.moveIndex32(length, beyond - n);

	97 return beyond;

	98 }

	99 } else {

	100 // Go backwards from inside the oldBuffer.

	101 pos = oldBuffer.moveIndex32(pos, -n);

	102 return 0;

	103 }

	104 }

	105

	106 void setFirstSkipped(UChar32 c) {

	107 skipLengthAtMatch = 0;

	108 newBuffer.setTo(c);

	109 }

	110

	111 void skip(UChar32 c) {

	112 newBuffer.append(c);

	113 }

	114

	115 void recordMatch() { skipLengthAtMatch = newBuffer.length(); }

	116

	117 // Replaces the characters we consumed with the newly skipped ones.

	118 void replaceMatch() {

	119 // Note: UnicodeString.replace() pins pos to at most length().

	120 oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);

	121 pos = 0;

	122 }

	123

	124 void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }

	125 void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }

	126

	127 private:

	128 // Combining marks skipped in previous discontiguous-contraction matching.

	129 // After that discontiguous contraction was completed, we start reading them from here.

	130 UnicodeString oldBuffer;

	131 // Combining marks newly skipped in current discontiguous-contraction matchi ng.

	132 // These might have been read from the normal text or from the oldBuffer.

	133 UnicodeString newBuffer;

	134 // Reading index in oldBuffer,

	135 // or counter for how many code points have been read beyond oldBuffer (pos- oldBuffer.length()).

	136 int32_t pos;

	137 // newBuffer.length() at the time of the last matching character.

	138 // When a partial match fails, we back out skipped and partial-matching inpu t characters.

	139 int32_t skipLengthAtMatch;

	140 // We save the trie state before we attempt to match a character,

	141 // so that we can skip it and try the next one.

	142 UCharsTrie::State state;

	143 };

	144

	145 CollationIterator::CollationIterator(const CollationIterator &other)

	146 : UObject(other),

	147 trie(other.trie),

	148 data(other.data),

	149 cesIndex(other.cesIndex),

	150 skipped(NULL),

	151 numCpFwd(other.numCpFwd),

	152 isNumeric(other.isNumeric) {

	153 UErrorCode errorCode = U_ZERO_ERROR;

	154 int32_t length = other.ceBuffer.length;

	155 if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {

	156 for(int32_t i = 0; i < length; ++i) {

	157 ceBuffer.set(i, other.ceBuffer.get(i));

	158 }

	159 ceBuffer.length = length;

	160 } else {

	161 cesIndex = 0;

	162 }

	163 }

	164

	165 CollationIterator::~CollationIterator() {

	166 delete skipped;

	167 }

	168

	169 UBool

	170 CollationIterator::operator==(const CollationIterator &other) const {

	171 // Subclasses: Call this method and then add more specific checks.

	172 // Compare the iterator state but not the collation data (trie & data fields ):

	173 // Assume that the caller compares the data.

	174 // Ignore skipped since that should be unused between calls to nextCE().

	175 // (It only stays around to avoid another memory allocation.)

	176 if(!(typeid(*this) == typeid(other) &&

	177 ceBuffer.length == other.ceBuffer.length &&

	178 cesIndex == other.cesIndex &&

	179 numCpFwd == other.numCpFwd &&

	180 isNumeric == other.isNumeric)) {

	181 return FALSE;

	182 }

	183 for(int32_t i = 0; i < ceBuffer.length; ++i) {

	184 if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return FALSE; }

	185 }

	186 return TRUE;

	187 }

	188

	189 void

	190 CollationIterator::reset() {

	191 cesIndex = ceBuffer.length = 0;

	192 if(skipped != NULL) { skipped->clear(); }

	193 }

	194

	195 int32_t

	196 CollationIterator::fetchCEs(UErrorCode &errorCode) {

	197 while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {

	198 // No need to loop for each expansion CE.

	199 cesIndex = ceBuffer.length;

	200 }

	201 return ceBuffer.length;

	202 }

	203

	204 uint32_t

	205 CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {

	206 c = nextCodePoint(errorCode);

	207 return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);

	208 }

	209

	210 UChar

	211 CollationIterator::handleGetTrailSurrogate() {

	212 return 0;

	213 }

	214

	215 UBool

	216 CollationIterator::foundNULTerminator() {

	217 return FALSE;

	218 }

	219

	220 UBool

	221 CollationIterator::forbidSurrogateCodePoints() const {

	222 return FALSE;

	223 }

	224

	225 uint32_t

	226 CollationIterator::getDataCE32(UChar32 c) const {

	227 return data->getCE32(c);

	228 }

	229

	230 uint32_t

	231 CollationIterator::getCE32FromBuilderData(uint32_t /ce32/, UErrorCode &errorCo de) {

	232 if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }

	233 return 0;

	234 }

	235

	236 int64_t

	237 CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce 32,

	238 UErrorCode &errorCode) {

	239 --ceBuffer.length; // Undo ceBuffer.incLength().

	240 appendCEsFromCE32(d, c, ce32, TRUE, errorCode);

	241 if(U_SUCCESS(errorCode)) {

	242 return ceBuffer.get(cesIndex++);

	243 } else {

	244 return Collation::NO_CE_PRIMARY;

	245 }

	246 }

	247

	248 void

	249 CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,

	250 UBool forward, UErrorCode &errorCode) {

	251 while(Collation::isSpecialCE32(ce32)) {

	252 switch(Collation::tagFromCE32(ce32)) {

	253 case Collation::FALLBACK_TAG:

	254 case Collation::RESERVED_TAG_3:

	255 if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }

	256 return;

	257 case Collation::LONG_PRIMARY_TAG:

	258 ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);

	259 return;

	260 case Collation::LONG_SECONDARY_TAG:

	261 ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode) ;

	262 return;

	263 case Collation::LATIN_EXPANSION_TAG:

	264 if(ceBuffer.ensureAppendCapacity(2, errorCode)) {

	265 ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32)) ;

	266 ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce 32));

	267 ceBuffer.length += 2;

	268 }

	269 return;

	270 case Collation::EXPANSION32_TAG: {

	271 const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);

	272 int32_t length = Collation::lengthFromCE32(ce32);

	273 if(ceBuffer.ensureAppendCapacity(length, errorCode)) {

	274 do {

	275 ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));

	276 } while(--length > 0);

	277 }

	278 return;

	279 }

	280 case Collation::EXPANSION_TAG: {

	281 const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);

	282 int32_t length = Collation::lengthFromCE32(ce32);

	283 if(ceBuffer.ensureAppendCapacity(length, errorCode)) {

	284 do {

	285 ceBuffer.appendUnsafe(*ces++);

	286 } while(--length > 0);

	287 }

	288 return;

	289 }

	290 case Collation::BUILDER_DATA_TAG:

	291 ce32 = getCE32FromBuilderData(ce32, errorCode);

	292 if(U_FAILURE(errorCode)) { return; }

	293 if(ce32 == Collation::FALLBACK_CE32) {

	294 d = data->base;

	295 ce32 = d->getCE32(c);

	296 }

	297 break;

	298 case Collation::PREFIX_TAG:

	299 if(forward) { backwardNumCodePoints(1, errorCode); }

	300 ce32 = getCE32FromPrefix(d, ce32, errorCode);

	301 if(forward) { forwardNumCodePoints(1, errorCode); }

	302 break;

	303 case Collation::CONTRACTION_TAG: {

	304 const UChar *p = d->contexts + Collation::indexFromCE32(ce32);

	305 uint32_t defaultCE32 = CollationData::readCE32(p); // Default if no suffix match.

	306 if(!forward) {

	307 // Backward contractions are handled by previousCEUnsafe().

	308 // c has contractions but they were not found.

	309 ce32 = defaultCE32;

	310 break;

	311 }

	312 UChar32 nextCp;

	313 if(skipped == NULL && numCpFwd < 0) {

	314 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,

	315 // avoiding the function call and the nextSkippedCodePoint() ove rhead.

	316 nextCp = nextCodePoint(errorCode);

	317 if(nextCp < 0) {

	318 // No more text.

	319 ce32 = defaultCE32;

	320 break;

	321 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&

	322 !CollationFCD::mayHaveLccc(nextCp)) {

	323 // All contraction suffixes start with characters with lccc! =0

	324 // but the next code point has lccc==0.

	325 backwardNumCodePoints(1, errorCode);

	326 ce32 = defaultCE32;

	327 break;

	328 }

	329 } else {

	330 nextCp = nextSkippedCodePoint(errorCode);

	331 if(nextCp < 0) {

	332 // No more text.

	333 ce32 = defaultCE32;

	334 break;

	335 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&

	336 !CollationFCD::mayHaveLccc(nextCp)) {

	337 // All contraction suffixes start with characters with lccc! =0

	338 // but the next code point has lccc==0.

	339 backwardNumSkipped(1, errorCode);

	340 ce32 = defaultCE32;

	341 break;

	342 }

	343 }

	344 ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);

	345 if(ce32 == Collation::NO_CE32) {

	346 // CEs from a discontiguous contraction plus the skipped combini ng marks

	347 // have been appended already.

	348 return;

	349 }

	350 break;

	351 }

	352 case Collation::DIGIT_TAG:

	353 if(isNumeric) {

	354 appendNumericCEs(ce32, forward, errorCode);

	355 return;

	356 } else {

	357 // Fetch the non-numeric-collation CE32 and continue.

	358 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];

	359 break;

	360 }

	361 case Collation::U0000_TAG:

	362 U_ASSERT(c == 0);

	363 if(forward && foundNULTerminator()) {

	364 // Handle NUL-termination. (Not needed in Java.)

	365 ceBuffer.append(Collation::NO_CE, errorCode);

	366 return;

	367 } else {

	368 // Fetch the normal ce32 for U+0000 and continue.

	369 ce32 = d->ce32s[0];

	370 break;

	371 }

	372 case Collation::HANGUL_TAG: {

	373 const uint32_t *jamoCE32s = d->jamoCE32s;

	374 c -= Hangul::HANGUL_BASE;

	375 UChar32 t = c % Hangul::JAMO_T_COUNT;

	376 c /= Hangul::JAMO_T_COUNT;

	377 UChar32 v = c % Hangul::JAMO_V_COUNT;

	378 c /= Hangul::JAMO_V_COUNT;

	379 if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {

	380 // None of the Jamo CE32s are isSpecialCE32().

	381 // Avoid recursive function calls and per-Jamo tests.

	382 if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {

	383 ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32 s[c]));

	384 ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamo CE32s[19 + v]));

	385 ceBuffer.length += 2;

	386 if(t != 0) {

	387 ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));

	388 }

	389 }

	390 return;

	391 } else {

	392 // We should not need to compute each Jamo code point.

	393 // In particular, there should be no offset or implicit ce32.

	394 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCod e);

	395 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, err orCode);

	396 if(t == 0) { return; }

	397 // offset 39 = 19 + 21 - 1:

	398 // 19 = JAMO_L_COUNT

	399 // 21 = JAMO_T_COUNT

	400 // -1 = omit t==0

	401 ce32 = jamoCE32s[39 + t];

	402 c = U_SENTINEL;

	403 break;

	404 }

	405 }

	406 case Collation::LEAD_SURROGATE_TAG: {

	407 U_ASSERT(forward); // Backward iteration should never see lead surr ogate code _unit_ data.

	408 U_ASSERT(U16_IS_LEAD(c));

	409 UChar trail;

	410 if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {

	411 c = U16_GET_SUPPLEMENTARY(c, trail);

	412 ce32 &= Collation::LEAD_TYPE_MASK;

	413 if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {

	414 ce32 = Collation::UNASSIGNED_CE32; // unassigned-implicit

	415 } else if(ce32 == Collation::LEAD_ALL_FALLBACK \|\|

	416 (ce32 = d->getCE32FromSupplementary(c)) == Collation::FA LLBACK_CE32) {

	417 // fall back to the base data

	418 d = d->base;

	419 ce32 = d->getCE32FromSupplementary(c);

	420 }

	421 } else {

	422 // c is an unpaired surrogate.

	423 ce32 = Collation::UNASSIGNED_CE32;

	424 }

	425 break;

	426 }

	427 case Collation::OFFSET_TAG:

	428 U_ASSERT(c >= 0);

	429 ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);

	430 return;

	431 case Collation::IMPLICIT_TAG:

	432 U_ASSERT(c >= 0);

	433 if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {

	434 ce32 = Collation::FFFD_CE32;

	435 break;

	436 } else {

	437 ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCo de);

	438 return;

	439 }

	440 }

	441 }

	442 ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);

	443 }

	444

	445 uint32_t

	446 CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,

	447 UErrorCode &errorCode) {

	448 const UChar *p = d->contexts + Collation::indexFromCE32(ce32);

	449 ce32 = CollationData::readCE32(p); // Default if no prefix match.

	450 p += 2;

	451 // Number of code points read before the original code point.

	452 int32_t lookBehind = 0;

	453 UCharsTrie prefixes(p);

	454 for(;;) {

	455 UChar32 c = previousCodePoint(errorCode);

	456 if(c < 0) { break; }

	457 ++lookBehind;

	458 UStringTrieResult match = prefixes.nextForCodePoint(c);

	459 if(USTRINGTRIE_HAS_VALUE(match)) {

	460 ce32 = (uint32_t)prefixes.getValue();

	461 }

	462 if(!USTRINGTRIE_HAS_NEXT(match)) { break; }

	463 }

	464 forwardNumCodePoints(lookBehind, errorCode);

	465 return ce32;

	466 }

	467

	468 UChar32

	469 CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {

	470 if(skipped != NULL && skipped->hasNext()) { return skipped->next(); }

	471 if(numCpFwd == 0) { return U_SENTINEL; }

	472 UChar32 c = nextCodePoint(errorCode);

	473 if(skipped != NULL && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }

	474 if(numCpFwd > 0 && c >= 0) { --numCpFwd; }

	475 return c;

	476 }

	477

	478 void

	479 CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {

	480 if(skipped != NULL && !skipped->isEmpty()) {

	481 n = skipped->backwardNumCodePoints(n);

	482 }

	483 backwardNumCodePoints(n, errorCode);

	484 if(numCpFwd >= 0) { numCpFwd += n; }

	485 }

	486

	487 uint32_t

	488 CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t cont ractionCE32,

	489 const UChar *p, uint32_t ce32, UChar3 2 c,

	490 UErrorCode &errorCode) {

	491 // c: next code point after the original one

	492

	493 // Number of code points read beyond the original code point.

	494 // Needed for discontiguous contraction matching.

	495 int32_t lookAhead = 1;

	496 // Number of code points read since the last match (initially only c).

	497 int32_t sinceMatch = 1;

	498 // Normally we only need a contiguous match,

	499 // and therefore need not remember the suffixes state from before a mismatch for retrying.

	500 // If we are already processing skipped combining marks, then we do track th e state.

	501 UCharsTrie suffixes(p);

	502 if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes ); }

	503 UStringTrieResult match = suffixes.firstForCodePoint(c);

	504 for(;;) {

	505 UChar32 nextCp;

	506 if(USTRINGTRIE_HAS_VALUE(match)) {

	507 ce32 = (uint32_t)suffixes.getValue();

	508 if(!USTRINGTRIE_HAS_NEXT(match) \|\| (c = nextSkippedCodePoint(errorCo de)) < 0) {

	509 return ce32;

	510 }

	511 if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState( suffixes); }

	512 sinceMatch = 1;

	513 } else if(match == USTRINGTRIE_NO_MATCH \|\| (nextCp = nextSkippedCodePoin t(errorCode)) < 0) {

	514 // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no fu rther text.

	515 // Back up if necessary, and try a discontiguous contraction.

	516 if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&

	517 // Discontiguous contraction matching extends an existing ma tch.

	518 // If there is no match yet, then there is nothing to do.

	519 ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 \|\|

	520 sinceMatch < lookAhead)) {

	521 // The last character of at least one suffix has lccc!=0,

	522 // allowing for discontiguous contractions.

	523 // UCA S2.1.1 only processes non-starters immediately following

	524 // "a match in the table" (sinceMatch=1).

	525 if(sinceMatch > 1) {

	526 // Return to the state after the last match.

	527 // (Return to sinceMatch=0 and re-fetch the first partially- matched character.)

	528 backwardNumSkipped(sinceMatch, errorCode);

	529 c = nextSkippedCodePoint(errorCode);

	530 lookAhead -= sinceMatch - 1;

	531 sinceMatch = 1;

	532 }

	533 if(d->getFCD16(c) > 0xff) {

	534 return nextCE32FromDiscontiguousContraction(

	535 d, suffixes, ce32, lookAhead, c, errorCode);

	536 }

	537 }

	538 break;

	539 } else {

	540 // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.

	541 // It does not have a result value, therefore it is not itself "a ma tch in the table".

	542 // If a partially-matched c has ccc!=0 then

	543 // it might be skipped in discontiguous contraction.

	544 c = nextCp;

	545 ++sinceMatch;

	546 }

	547 ++lookAhead;

	548 match = suffixes.nextForCodePoint(c);

	549 }

	550 backwardNumSkipped(sinceMatch, errorCode);

	551 return ce32;

	552 }

	553

	554 uint32_t

	555 CollationIterator::nextCE32FromDiscontiguousContraction(

	556 const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,

	557 int32_t lookAhead, UChar32 c,

	558 UErrorCode &errorCode) {

	559 if(U_FAILURE(errorCode)) { return 0; }

	560

	561 // UCA section 3.3.2 Contractions:

	562 // Contractions that end with non-starter characters

	563 // are known as discontiguous contractions.

	564 // ... discontiguous contractions must be detected in input text

	565 // whenever the final sequence of non-starter characters could be rearranged

	566 // so as to make a contiguous matching sequence that is canonically equivale nt.

	567

	568 // UCA: http://www.unicode.org/reports/tr10/#S2.1

	569 // S2.1 Find the longest initial substring S at each point that has a match in the table.

	570 // S2.1.1 If there are any non-starters following S, process each non-starte r C.

	571 // S2.1.2 If C is not blocked from S, find if S + C has a match in the table .

	572 // Note: A non-starter in a string is called blocked

	573 // if there is another non-starter of the same canonical combining class or zero

	574 // between it and the last character of canonical combining class 0.

	575 // S2.1.3 If there is a match, replace S by S + C, and remove C.

	576

	577 // First: Is a discontiguous contraction even possible?

	578 uint16_t fcd16 = d->getFCD16(c);

	579 U_ASSERT(fcd16 > 0xff); // The caller checked this already, as a shortcut.

	580 UChar32 nextCp = nextSkippedCodePoint(errorCode);

	581 if(nextCp < 0) {

	582 // No further text.

	583 backwardNumSkipped(1, errorCode);

	584 return ce32;

	585 }

	586 ++lookAhead;

	587 uint8_t prevCC = (uint8_t)fcd16;

	588 fcd16 = d->getFCD16(nextCp);

	589 if(fcd16 <= 0xff) {

	590 // The next code point after c is a starter (S2.1.1 "process each non-st arter").

	591 backwardNumSkipped(2, errorCode);

	592 return ce32;

	593 }

	594

	595 // We have read and matched (lookAhead-2) code points,

	596 // read non-matching c and peeked ahead at nextCp.

	597 // Return to the state before the mismatch and continue matching with nextCp .

	598 if(skipped == NULL \|\| skipped->isEmpty()) {

	599 if(skipped == NULL) {

	600 skipped = new SkippedState();

	601 if(skipped == NULL) {

	602 errorCode = U_MEMORY_ALLOCATION_ERROR;

	603 return 0;

	604 }

	605 }

	606 suffixes.reset();

	607 if(lookAhead > 2) {

	608 // Replay the partial match so far.

	609 backwardNumCodePoints(lookAhead, errorCode);

	610 suffixes.firstForCodePoint(nextCodePoint(errorCode));

	611 for(int32_t i = 3; i < lookAhead; ++i) {

	612 suffixes.nextForCodePoint(nextCodePoint(errorCode));

	613 }

	614 // Skip c (which did not match) and nextCp (which we will try now).

	615 forwardNumCodePoints(2, errorCode);

	616 }

	617 skipped->saveTrieState(suffixes);

	618 } else {

	619 // Reset to the trie state before the failed match of c.

	620 skipped->resetToTrieState(suffixes);

	621 }

	622

	623 skipped->setFirstSkipped(c);

	624 // Number of code points read since the last match (at this point: c and nex tCp).

	625 int32_t sinceMatch = 2;

	626 c = nextCp;

	627 for(;;) {

	628 UStringTrieResult match;

	629 // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)

	630 if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextF orCodePoint(c))) {

	631 // "If there is a match, replace S by S + C, and remove C." (S2.1.3)

	632 // Keep prevCC unchanged.

	633 ce32 = (uint32_t)suffixes.getValue();

	634 sinceMatch = 0;

	635 skipped->recordMatch();

	636 if(!USTRINGTRIE_HAS_NEXT(match)) { break; }

	637 skipped->saveTrieState(suffixes);

	638 } else {

	639 // No match for "S + C", skip C.

	640 skipped->skip(c);

	641 skipped->resetToTrieState(suffixes);

	642 prevCC = (uint8_t)fcd16;

	643 }

	644 if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }

	645 ++sinceMatch;

	646 fcd16 = d->getFCD16(c);

	647 if(fcd16 <= 0xff) {

	648 // The next code point after c is a starter (S2.1.1 "process each no n-starter").

	649 break;

	650 }

	651 }

	652 backwardNumSkipped(sinceMatch, errorCode);

	653 UBool isTopDiscontiguous = skipped->isEmpty();

	654 skipped->replaceMatch();

	655 if(isTopDiscontiguous && !skipped->isEmpty()) {

	656 // We did get a match after skipping one or more combining marks,

	657 // and we are not in a recursive discontiguous contraction.

	658 // Append CEs from the contraction ce32

	659 // and then from the combining marks that we skipped before the match.

	660 c = U_SENTINEL;

	661 for(;;) {

	662 appendCEsFromCE32(d, c, ce32, TRUE, errorCode);

	663 // Fetch CE32s for skipped combining marks from the normal data, wit h fallback,

	664 // rather than from the CollationData where we found the contraction .

	665 if(!skipped->hasNext()) { break; }

	666 c = skipped->next();

	667 ce32 = getDataCE32(c);

	668 if(ce32 == Collation::FALLBACK_CE32) {

	669 d = data->base;

	670 ce32 = d->getCE32(c);

	671 } else {

	672 d = data;

	673 }

	674 // Note: A nested discontiguous-contraction match

	675 // replaces consumed combining marks with newly skipped ones

	676 // and resets the reading position to the beginning.

	677 }

	678 skipped->clear();

	679 ce32 = Collation::NO_CE32; // Signal to the caller that the result is i n the ceBuffer.

	680 }

	681 return ce32;

	682 }

	683

	684 void

	685 CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &er rorCode) {

	686 // Collect digits.

	687 CharString digits;

	688 if(forward) {

	689 for(;;) {

	690 char digit = Collation::digitFromCE32(ce32);

	691 digits.append(digit, errorCode);

	692 if(numCpFwd == 0) { break; }

	693 UChar32 c = nextCodePoint(errorCode);

	694 if(c < 0) { break; }

	695 ce32 = data->getCE32(c);

	696 if(ce32 == Collation::FALLBACK_CE32) {

	697 ce32 = data->base->getCE32(c);

	698 }

	699 if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {

	700 backwardNumCodePoints(1, errorCode);

	701 break;

	702 }

	703 if(numCpFwd > 0) { --numCpFwd; }

	704 }

	705 } else {

	706 for(;;) {

	707 char digit = Collation::digitFromCE32(ce32);

	708 digits.append(digit, errorCode);

	709 UChar32 c = previousCodePoint(errorCode);

	710 if(c < 0) { break; }

	711 ce32 = data->getCE32(c);

	712 if(ce32 == Collation::FALLBACK_CE32) {

	713 ce32 = data->base->getCE32(c);

	714 }

	715 if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {

	716 forwardNumCodePoints(1, errorCode);

	717 break;

	718 }

	719 }

	720 // Reverse the digit string.

	721 char *p = digits.data();

	722 char *q = p + digits.length() - 1;

	723 while(p < q) {

	724 char digit = *p;

	725 p++ = q;

	726 *q-- = digit;

	727 }

	728 }

	729 if(U_FAILURE(errorCode)) { return; }

	730 int32_t pos = 0;

	731 do {

	732 // Skip leading zeros.

	733 while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }

	734 // Write a sequence of CEs for at most 254 digits at a time.

	735 int32_t segmentLength = digits.length() - pos;

	736 if(segmentLength > 254) { segmentLength = 254; }

	737 appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);

	738 pos += segmentLength;

	739 } while(U_SUCCESS(errorCode) && pos < digits.length());

	740 }

	741

	742 void

	743 CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, U ErrorCode &errorCode) {

	744 U_ASSERT(1 <= length && length <= 254);

	745 U_ASSERT(length == 1 \|\| digits[0] != 0);

	746 uint32_t numericPrimary = data->numericPrimary;

	747 // Note: We use primary byte values 2..255: digits are not compressible.

	748 if(length <= 7) {

	749 // Very dense encoding for small numbers.

	750 int32_t value = digits[0];

	751 for(int32_t i = 1; i < length; ++i) {

	752 value = value * 10 + digits[i];

	753 }

	754 // Primary weight second byte values:

	755 // 74 byte values 2.. 75 for small numbers in two-byte primary wei ghts.

	756 // 40 byte values 76..115 for medium numbers in three-byte primary weights.

	757 // 16 byte values 116..131 for large numbers in four-byte primary we ights.

	758 // 124 byte values 132..255 for very large numbers with 4..127 digit pairs.

	759 int32_t firstByte = 2;

	760 int32_t numBytes = 74;

	761 if(value < numBytes) {

	762 // Two-byte primary for 0..73, good for day & month numbers etc.

	763 uint32_t primary = numericPrimary \| ((firstByte + value) << 16);

	764 ceBuffer.append(Collation::makeCE(primary), errorCode);

	765 return;

	766 }

	767 value -= numBytes;

	768 firstByte += numBytes;

	769 numBytes = 40;

	770 if(value < numBytes * 254) {

	771 // Three-byte primary for 74..10233=74+40*254-1, good for year numbe rs and more.

	772 uint32_t primary = numericPrimary \|

	773 ((firstByte + value / 254) << 16) \| ((2 + value % 254) << 8);

	774 ceBuffer.append(Collation::makeCE(primary), errorCode);

	775 return;

	776 }

	777 value -= numBytes * 254;

	778 firstByte += numBytes;

	779 numBytes = 16;

	780 if(value < numBytes * 254 * 254) {

	781 // Four-byte primary for 10234..1042489=10234+16254254-1.

	782 uint32_t primary = numericPrimary \| (2 + value % 254);

	783 value /= 254;

	784 primary \|= (2 + value % 254) << 8;

	785 value /= 254;

	786 primary \|= (firstByte + value % 254) << 16;

	787 ceBuffer.append(Collation::makeCE(primary), errorCode);

	788 return;

	789 }

	790 // original value > 1042489

	791 }

	792 U_ASSERT(length >= 7);

	793

	794 // The second primary byte value 132..255 indicates the number of digit pair s (4..127),

	795 // then we generate primary bytes with those pairs.

	796 // Omit trailing 00 pairs.

	797 // Decrement the value for the last pair.

	798

	799 // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.

	800 int32_t numPairs = (length + 1) / 2;

	801 uint32_t primary = numericPrimary \| ((132 - 4 + numPairs) << 16);

	802 // Find the length without trailing 00 pairs.

	803 while(digits[length - 1] == 0 && digits[length - 2] == 0) {

	804 length -= 2;

	805 }

	806 // Read the first pair.

	807 uint32_t pair;

	808 int32_t pos;

	809 if(length & 1) {

	810 // Only "half a pair" if we have an odd number of digits.

	811 pair = digits[0];

	812 pos = 1;

	813 } else {

	814 pair = digits[0] * 10 + digits[1];

	815 pos = 2;

	816 }

	817 pair = 11 + 2 * pair;

	818 // Add the pairs of digits between pos and length.

	819 int32_t shift = 8;

	820 while(pos < length) {

	821 if(shift == 0) {

	822 // Every three pairs/bytes we need to store a 4-byte-primary CE

	823 // and start with a new CE with the '0' primary lead byte.

	824 primary \|= pair;

	825 ceBuffer.append(Collation::makeCE(primary), errorCode);

	826 primary = numericPrimary;

	827 shift = 16;

	828 } else {

	829 primary \|= pair << shift;

	830 shift -= 8;

	831 }

	832 pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);

	833 pos += 2;

	834 }

	835 primary \|= (pair - 1) << shift;

	836 ceBuffer.append(Collation::makeCE(primary), errorCode);

	837 }

	838

	839 int64_t

	840 CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {

	841 if(ceBuffer.length > 0) {

	842 // Return the previous buffered CE.

	843 return ceBuffer.get(--ceBuffer.length);

	844 }

	845 offsets.removeAllElements();

	846 int32_t limitOffset = getOffset();

	847 UChar32 c = previousCodePoint(errorCode);

	848 if(c < 0) { return Collation::NO_CE; }

	849 if(data->isUnsafeBackward(c, isNumeric)) {

	850 return previousCEUnsafe(c, offsets, errorCode);

	851 }

	852 // Simple, safe-backwards iteration:

	853 // Get a CE going backwards, handle prefixes but no contractions.

	854 uint32_t ce32 = data->getCE32(c);

	855 const CollationData *d;

	856 if(ce32 == Collation::FALLBACK_CE32) {

	857 d = data->base;

	858 ce32 = d->getCE32(c);

	859 } else {

	860 d = data;

	861 }

	862 if(Collation::isSimpleOrLongCE32(ce32)) {

	863 return Collation::ceFromCE32(ce32);

	864 }

	865 appendCEsFromCE32(d, c, ce32, FALSE, errorCode);

	866 if(U_SUCCESS(errorCode)) {

	867 if(ceBuffer.length > 1) {

	868 offsets.addElement(getOffset(), errorCode);

	869 // For an expansion, the offset of each non-initial CE is the limit offset,

	870 // consistent with forward iteration.

	871 while(offsets.size() <= ceBuffer.length) {

	872 offsets.addElement(limitOffset, errorCode);

	873 };

	874 }

	875 return ceBuffer.get(--ceBuffer.length);

	876 } else {

	877 return Collation::NO_CE_PRIMARY;

	878 }

	879 }

	880

	881 int64_t

	882 CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &e rrorCode) {

	883 // We just move through the input counting safe and unsafe code points

	884 // without collecting the unsafe-backward substring into a buffer and

	885 // switching to it.

	886 // This is to keep the logic simple. Otherwise we would have to handle

	887 // prefix matching going before the backward buffer, switching

	888 // to iteration and back, etc.

	889 // In the most important case of iterating over a normal string,

	890 // reading from the string itself is already maximally fast.

	891 // The only drawback there is that after getting the CEs we always

	892 // skip backward to the safe character rather than switching out

	893 // of a backwardBuffer.

	894 // But this should not be the common case for previousCE(),

	895 // and correctness and maintainability are more important than

	896 // complex optimizations.

	897 // Find the first safe character before c.

	898 int32_t numBackward = 1;

	899 while((c = previousCodePoint(errorCode)) >= 0) {

	900 ++numBackward;

	901 if(!data->isUnsafeBackward(c, isNumeric)) {

	902 break;

	903 }

	904 }

	905 // Set the forward iteration limit.

	906 // Note: This counts code points.

	907 // We cannot enforce a limit in the middle of a surrogate pair or similar.

	908 numCpFwd = numBackward;

	909 // Reset the forward iterator.

	910 cesIndex = 0;

	911 U_ASSERT(ceBuffer.length == 0);

	912 // Go forward and collect the CEs.

	913 int32_t offset = getOffset();

	914 while(numCpFwd > 0) {

	915 // nextCE() normally reads one code point.

	916 // Contraction matching and digit specials read more and check numCpFwd.

	917 --numCpFwd;

	918 // Append one or more CEs to the ceBuffer.

	919 (void)nextCE(errorCode);

	920 U_ASSERT(U_FAILURE(errorCode) \|\| ceBuffer.get(ceBuffer.length - 1) != Co llation::NO_CE);

	921 // No need to loop for getting each expansion CE from nextCE().

	922 cesIndex = ceBuffer.length;

	923 // However, we need to write an offset for each CE.

	924 // This is for CollationElementIterator::getOffset() to return

	925 // intermediate offsets from the unsafe-backwards segment.

	926 U_ASSERT(offsets.size() < ceBuffer.length);

	927 offsets.addElement(offset, errorCode);

	928 // For an expansion, the offset of each non-initial CE is the limit offs et,

	929 // consistent with forward iteration.

	930 offset = getOffset();

	931 while(offsets.size() < ceBuffer.length) {

	932 offsets.addElement(offset, errorCode);

	933 };

	934 }

	935 U_ASSERT(offsets.size() == ceBuffer.length);

	936 // End offset corresponding to just after the unsafe-backwards segment.

	937 offsets.addElement(offset, errorCode);

	938 // Reset the forward iteration limit

	939 // and move backward to before the segment for which we fetched CEs.

	940 numCpFwd = -1;

	941 backwardNumCodePoints(numBackward, errorCode);

	942 // Use the collected CEs and return the last one.

	943 cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremente d.

	944 if(U_SUCCESS(errorCode)) {

	945 return ceBuffer.get(--ceBuffer.length);

	946 } else {

	947 return Collation::NO_CE_PRIMARY;

	948 }

	949 }

	950

	951 U_NAMESPACE_END

	952

	953 #endif // !UCONFIG_NO_COLLATION

OLD	NEW

« no previous file with comments | « source/i18n/collationiterator.h ('k') | source/i18n/collationkeys.h » ('j') | no next file with comments »