icu46/source/common/unisetspan.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/unisetspan.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2007, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: unisetspan.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2007mar01

	14 * created by: Markus W. Scherer

	15 */

	16

	17 #include "unicode/utypes.h"

	18 #include "unicode/uniset.h"

	19 #include "unicode/ustring.h"

	20 #include "cmemory.h"

	21 #include "uvector.h"

	22 #include "unisetspan.h"

	23

	24 U_NAMESPACE_BEGIN

	25

	26 /*

	27 * List of offsets from the current position from where to try matching

	28 * a code point or a string.

	29 * Store offsets rather than indexes to simplify the code and use the same list

	30 * for both increments (in span()) and decrements (in spanBack()).

	31 *

	32 * Assumption: The maximum offset is limited, and the offsets that are stored

	33 * at any one time are relatively dense, that is, there are normally no gaps of

	34 * hundreds or thousands of offset values.

	35 *

	36 * The implementation uses a circular buffer of byte flags,

	37 * each indicating whether the corresponding offset is in the list.

	38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and

	39 * physically moving part of the list.

	40 *

	41 * Note: In principle, the caller should setMaxLength() to the maximum of the

	42 * max string length and U16_LENGTH/U8_LENGTH to account for

	43 * "long" single code points.

	44 * However, this implementation uses at least a staticList with more than

	45 * U8_LENGTH entries anyway.

	46 *

	47 * Note: If maxLength were guaranteed to be no more than 32 or 64,

	48 * the list could be stored as bit flags in a single integer.

	49 * Rather than handling a circular buffer with a start list index,

	50 * the integer would simply be shifted when lower offsets are removed.

	51 * UnicodeSet does not have a limit on the lengths of strings.

	52 */

	53 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemo ry.

	54 public:

	55 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}

	56

	57 ~OffsetList() {

	58 if(list!=staticList) {

	59 uprv_free(list);

	60 }

	61 }

	62

	63 // Call exactly once if the list is to be used.

	64 void setMaxLength(int32_t maxLength) {

	65 if(maxLength<=(int32_t)sizeof(staticList)) {

	66 capacity=(int32_t)sizeof(staticList);

	67 } else {

	68 UBool l=(UBool )uprv_malloc(maxLength);

	69 if(l!=NULL) {

	70 list=l;

	71 capacity=maxLength;

	72 }

	73 }

	74 uprv_memset(list, 0, capacity);

	75 }

	76

	77 void clear() {

	78 uprv_memset(list, 0, capacity);

	79 start=length=0;

	80 }

	81

	82 UBool isEmpty() const {

	83 return (UBool)(length==0);

	84 }

	85

	86 // Reduce all stored offsets by delta, used when the current position

	87 // moves by delta.

	88 // There must not be any offsets lower than delta.

	89 // If there is an offset equal to delta, it is removed.

	90 // delta=[1..maxLength]

	91 void shift(int32_t delta) {

	92 int32_t i=start+delta;

	93 if(i>=capacity) {

	94 i-=capacity;

	95 }

	96 if(list[i]) {

	97 list[i]=FALSE;

	98 --length;

	99 }

	100 start=i;

	101 }

	102

	103 // Add an offset. The list must not contain it yet.

	104 // offset=[1..maxLength]

	105 void addOffset(int32_t offset) {

	106 int32_t i=start+offset;

	107 if(i>=capacity) {

	108 i-=capacity;

	109 }

	110 list[i]=TRUE;

	111 ++length;

	112 }

	113

	114 // offset=[1..maxLength]

	115 UBool containsOffset(int32_t offset) const {

	116 int32_t i=start+offset;

	117 if(i>=capacity) {

	118 i-=capacity;

	119 }

	120 return list[i];

	121 }

	122

	123 // Find the lowest stored offset from a non-empty list, remove it,

	124 // and reduce all other offsets by this minimum.

	125 // Returns [1..maxLength].

	126 int32_t popMinimum() {

	127 // Look for the next offset in list[start+1..capacity-1].

	128 int32_t i=start, result;

	129 while(++i<capacity) {

	130 if(list[i]) {

	131 list[i]=FALSE;

	132 --length;

	133 result=i-start;

	134 start=i;

	135 return result;

	136 }

	137 }

	138 // i==capacity

	139

	140 // Wrap around and look for the next offset in list[0..start].

	141 // Since the list is not empty, there will be one.

	142 result=capacity-start;

	143 i=0;

	144 while(!list[i]) {

	145 ++i;

	146 }

	147 list[i]=FALSE;

	148 --length;

	149 start=i;

	150 return result+=i;

	151 }

	152

	153 private:

	154 UBool *list;

	155 int32_t capacity;

	156 int32_t length;

	157 int32_t start;

	158

	159 UBool staticList[16];

	160 };

	161

	162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.

	163 static int32_t

	164 getUTF8Length(const UChar *s, int32_t length) {

	165 UErrorCode errorCode=U_ZERO_ERROR;

	166 int32_t length8=0;

	167 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);

	168 if(U_SUCCESS(errorCode) \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {

	169 return length8;

	170 } else {

	171 // The string contains an unpaired surrogate.

	172 // Ignore this string.

	173 return 0;

	174 }

	175 }

	176

	177 // Append the UTF-8 version of the string to t and return the appended UTF-8 len gth.

	178 static int32_t

	179 appendUTF8(const UChar s, int32_t length, uint8_t t, int32_t capacity) {

	180 UErrorCode errorCode=U_ZERO_ERROR;

	181 int32_t length8=0;

	182 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);

	183 if(U_SUCCESS(errorCode)) {

	184 return length8;

	185 } else {

	186 // The string contains an unpaired surrogate.

	187 // Ignore this string.

	188 return 0;

	189 }

	190 }

	191

	192 static inline uint8_t

	193 makeSpanLengthByte(int32_t spanLength) {

	194 // 0xfe==UnicodeSetStringSpan::LONG_SPAN

	195 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;

	196 }

	197

	198 // Construct for all variants of span(), or only for any one variant.

	199 // Initialize as little as possible, for single use.

	200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,

	201 const UVector &setStrings,

	202 uint32_t which)

	203 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),

	204 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

	205 utf8Length(0),

	206 maxLength16(0), maxLength8(0),

	207 all((UBool)(which==ALL)) {

	208 spanSet.retainAll(set);

	209 if(which&NOT_CONTAINED) {

	210 // Default to the same sets.

	211 // addToSpanNotSet() will create a separate set if necessary.

	212 pSpanNotSet=&spanSet;

	213 }

	214

	215 // Determine if the strings even need to be taken into account at all for sp an() etc.

	216 // If any string is relevant, then all strings need to be used for

	217 // span(longest match) but only the relevant ones for span(while contained).

	218 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH

	219 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.

	220 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are r elevant after all.)

	221 // Also count the lengths of the UTF-8 versions of the strings for memory al location.

	222 int32_t stringsLength=strings.size();

	223

	224 int32_t i, spanLength;

	225 UBool someRelevant=FALSE;

	226 for(i=0; i<stringsLength; ++i) {

	227 const UnicodeString &string=(const UnicodeString )strings.elementAt(i) ;

	228 const UChar *s16=string.getBuffer();

	229 int32_t length16=string.length();

	230 UBool thisRelevant;

	231 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

	232 if(spanLength<length16) { // Relevant string.

	233 someRelevant=thisRelevant=TRUE;

	234 } else {

	235 thisRelevant=FALSE;

	236 }

	237 if((which&UTF16) && length16>maxLength16) {

	238 maxLength16=length16;

	239 }

	240 if((which&UTF8) && (thisRelevant \|\| (which&CONTAINED))) {

	241 int32_t length8=getUTF8Length(s16, length16);

	242 utf8Length+=length8;

	243 if(length8>maxLength8) {

	244 maxLength8=length8;

	245 }

	246 }

	247 }

	248 if(!someRelevant) {

	249 maxLength16=maxLength8=0;

	250 return;

	251 }

	252

	253 // Freeze after checking for the need to use strings at all because freezing

	254 // a set takes some time and memory which are wasted if there are no relevan t strings.

	255 if(all) {

	256 spanSet.freeze();

	257 }

	258

	259 uint8_t *spanBackLengths;

	260 uint8_t *spanUTF8Lengths;

	261 uint8_t *spanBackUTF8Lengths;

	262

	263 // Allocate a block of meta data.

	264 int32_t allocSize;

	265 if(all) {

	266 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

	267 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

	268 } else {

	269 allocSize=stringsLength; // One set of span lengths.

	270 if(which&UTF8) {

	271 // UTF-8 lengths and UTF-8 strings.

	272 allocSize+=stringsLength*4+utf8Length;

	273 }

	274 }

	275 if(allocSize<=(int32_t)sizeof(staticLengths)) {

	276 utf8Lengths=staticLengths;

	277 } else {

	278 utf8Lengths=(int32_t *)uprv_malloc(allocSize);

	279 if(utf8Lengths==NULL) {

	280 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpa nUTF16/8() return FALSE.

	281 return; // Out of memory.

	282 }

	283 }

	284

	285 if(all) {

	286 // Store span lengths for all span() variants.

	287 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

	288 spanBackLengths=spanLengths+stringsLength;

	289 spanUTF8Lengths=spanBackLengths+stringsLength;

	290 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;

	291 utf8=spanBackUTF8Lengths+stringsLength;

	292 } else {

	293 // Store span lengths for only one span() variant.

	294 if(which&UTF8) {

	295 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

	296 utf8=spanLengths+stringsLength;

	297 } else {

	298 spanLengths=(uint8_t *)utf8Lengths;

	299 }

	300 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;

	301 }

	302

	303 // Set the meta data and pSpanNotSet and write the UTF-8 strings.

	304 int32_t utf8Count=0; // Count UTF-8 bytes written so far.

	305

	306 for(i=0; i<stringsLength; ++i) {

	307 const UnicodeString &string=(const UnicodeString )strings.elementAt(i) ;

	308 const UChar *s16=string.getBuffer();

	309 int32_t length16=string.length();

	310 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

	311 if(spanLength<length16) { // Relevant string.

	312 if(which&UTF16) {

	313 if(which&CONTAINED) {

	314 if(which&FWD) {

	315 spanLengths[i]=makeSpanLengthByte(spanLength);

	316 }

	317 if(which&BACK) {

	318 spanLength=length16-spanSet.spanBack(s16, length16, USET _SPAN_CONTAINED);

	319 spanBackLengths[i]=makeSpanLengthByte(spanLength);

	320 }

	321 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

	322 spanLengths[i]=spanBackLengths[i]=0; // Only store a releva nt/irrelevant flag.

	323 }

	324 }

	325 if(which&UTF8) {

	326 uint8_t *s8=utf8+utf8Count;

	327 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Cou nt);

	328 utf8Count+=utf8Lengths[i]=length8;

	329 if(length8==0) { // Irrelevant for UTF-8 because not representa ble in UTF-8.

	330 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CO NTAINED;

	331 } else { // Relevant for UTF-8.

	332 if(which&CONTAINED) {

	333 if(which&FWD) {

	334 spanLength=spanSet.spanUTF8((const char *)s8, length 8, USET_SPAN_CONTAINED);

	335 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);

	336 }

	337 if(which&BACK) {

	338 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);

	339 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength );

	340 }

	341 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

	342 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only st ore a relevant/irrelevant flag.

	343 }

	344 }

	345 }

	346 if(which&NOT_CONTAINED) {

	347 // Add string start and end code points to the spanNotSet so tha t

	348 // a span(while not contained) stops before any string.

	349 UChar32 c;

	350 if(which&FWD) {

	351 int32_t len=0;

	352 U16_NEXT(s16, len, length16, c);

	353 addToSpanNotSet(c);

	354 }

	355 if(which&BACK) {

	356 int32_t len=length16;

	357 U16_PREV(s16, 0, len, c);

	358 addToSpanNotSet(c);

	359 }

	360 }

	361 } else { // Irrelevant string.

	362 if(which&UTF8) {

	363 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.

	364 uint8_t *s8=utf8+utf8Count;

	365 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf 8Count);

	366 utf8Count+=utf8Lengths[i]=length8;

	367 } else {

	368 utf8Lengths[i]=0;

	369 }

	370 }

	371 if(all) {

	372 spanLengths[i]=spanBackLengths[i]=

	373 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=

	374 (uint8_t)ALL_CP_CONTAINED;

	375 } else {

	376 // All spanXYZLengths pointers contain the same address.

	377 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;

	378 }

	379 }

	380 }

	381

	382 // Finish.

	383 if(all) {

	384 pSpanNotSet->freeze();

	385 }

	386 }

	387

	388 // Copy constructor. Assumes which==ALL for a frozen set.

	389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStri ngSpan,

	390 const UVector &newParentSetStrings)

	391 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParent SetStrings),

	392 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

	393 utf8Length(otherStringSpan.utf8Length),

	394 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.m axLength8),

	395 all(TRUE) {

	396 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {

	397 pSpanNotSet=&spanSet;

	398 } else {

	399 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();

	400 }

	401

	402 // Allocate a block of meta data.

	403 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

	404 int32_t stringsLength=strings.size();

	405 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

	406 if(allocSize<=(int32_t)sizeof(staticLengths)) {

	407 utf8Lengths=staticLengths;

	408 } else {

	409 utf8Lengths=(int32_t *)uprv_malloc(allocSize);

	410 if(utf8Lengths==NULL) {

	411 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpa nUTF16/8() return FALSE.

	412 return; // Out of memory.

	413 }

	414 }

	415

	416 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

	417 utf8=spanLengths+stringsLength*4;

	418 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);

	419 }

	420

	421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {

	422 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {

	423 delete pSpanNotSet;

	424 }

	425 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {

	426 uprv_free(utf8Lengths);

	427 }

	428 }

	429

	430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {

	431 if(pSpanNotSet==NULL \|\| pSpanNotSet==&spanSet) {

	432 if(spanSet.contains(c)) {

	433 return; // Nothing to do.

	434 }

	435 UnicodeSet newSet=(UnicodeSet )spanSet.cloneAsThawed();

	436 if(newSet==NULL) {

	437 return; // Out of memory.

	438 } else {

	439 pSpanNotSet=newSet;

	440 }

	441 }

	442 pSpanNotSet->add(c);

	443 }

	444

	445 // Compare strings without any argument checks. Requires length>0.

	446 static inline UBool

	447 matches16(const UChar s, const UChar t, int32_t length) {

	448 do {

	449 if(s++!=t++) {

	450 return FALSE;

	451 }

	452 } while(--length>0);

	453 return TRUE;

	454 }

	455

	456 static inline UBool

	457 matches8(const uint8_t s, const uint8_t t, int32_t length) {

	458 do {

	459 if(s++!=t++) {

	460 return FALSE;

	461 }

	462 } while(--length>0);

	463 return TRUE;

	464 }

	465

	466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)

	467 // at code point boundaries.

	468 // That is, each edge of a match must not be in the middle of a surrogate pair.

	469 static inline UBool

	470 matches16CPB(const UChar s, int32_t start, int32_t limit, const UChar t, int32 _t length) {

	471 s+=start;

	472 limit-=start;

	473 return matches16(s, t, length) &&

	474 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&

	475 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]) );

	476 }

	477

	478 // Does the set contain the next code point?

	479 // If so, return its length; otherwise return its negative length.

	480 static inline int32_t

	481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {

	482 UChar c=*s, c2;

	483 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {

	484 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;

	485 }

	486 return set.contains(c) ? 1 : -1;

	487 }

	488

	489 static inline int32_t

	490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {

	491 UChar c=s[length-1], c2;

	492 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {

	493 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;

	494 }

	495 return set.contains(c) ? 1 : -1;

	496 }

	497

	498 static inline int32_t

	499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

	500 UChar32 c=*s;

	501 if((int8_t)c>=0) {

	502 return set.contains(c) ? 1 : -1;

	503 }

	504 // Take advantage of non-ASCII fastpaths in U8_NEXT().

	505 int32_t i=0;

	506 U8_NEXT(s, i, length, c);

	507 return set.contains(c) ? i : -i;

	508 }

	509

	510 static inline int32_t

	511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

	512 UChar32 c=s[length-1];

	513 if((int8_t)c>=0) {

	514 return set.contains(c) ? 1 : -1;

	515 }

	516 int32_t i=length-1;

	517 c=utf8_prevCharSafeBody(s, 0, &i, c, -1);

	518 length-=i;

	519 return set.contains(c) ? length : -length;

	520 }

	521

	522 /*

	523 * Note: In span() when spanLength==0 (after a string match, or at the beginning

	524 * after an empty code point span) and in spanNot() and spanNotUTF8(),

	525 * string matching could use a binary search

	526 * because all string matches are done from the same start index.

	527 *

	528 * For UTF-8, this would require a comparison function that returns UTF-16 order .

	529 *

	530 * This optimization should not be necessary for normal UnicodeSets because

	531 * most sets have no strings, and most sets with strings have

	532 * very few very short strings.

	533 * For cases with many strings, it might be better to use a different API

	534 * and implementation with a DFA (state machine).

	535 */

	536

	537 /*

	538 * Algorithm for span(USET_SPAN_CONTAINED)

	539 *

	540 * Theoretical algorithm:

	541 * - Iterate through the string, and at each code point boundary:

	542 * + If the code point there is in the set, then remember to continue after it .

	543 * + If a set string matches at the current position, then remember to continu e after it.

	544 * + Either recursively span for each code point or string match,

	545 * or recursively span for all but the shortest one and

	546 * iteratively continue the span with the shortest local match.

	547 * + Remember the longest recursive span (the farthest end point).

	548 * + If there is no match at the current position, neither for the code point there

	549 * nor for any set string, then stop and return the longest recursive span l ength.

	550 *

	551 * Optimized implementation:

	552 *

	553 * (We assume that most sets will have very few very short strings.

	554 * A span using a string-less set is extremely fast.)

	555 *

	556 * Create and cache a spanSet which contains all of the single code points

	557 * of the original set but none of its strings.

	558 *

	559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

	560 * - Loop:

	561 * + Try to match each set string at the end of the spanLength.

	562 * ~ Set strings that start with set-contained code points must be matched

	563 * with a partial overlap because the recursive algorithm would have tried

	564 * to match them at every position.

	565 * ~ Set strings that entirely consist of set-contained code points

	566 * are irrelevant for span(USET_SPAN_CONTAINED) because the

	567 * recursive algorithm would continue after them anyway

	568 * and find the longest recursive match from their end.

	569 * ~ Rather than recursing, note each end point of a set string match.

	570 * + If no set string matched after spanSet.span(), then return

	571 * with where the spanSet.span() ended.

	572 * + If at least one set string matched after spanSet.span(), then

	573 * pop the shortest string match end point and continue

	574 * the loop, trying to match all set strings from there.

	575 * + If at least one more set string matched after a previous string match,

	576 * then test if the code point after the previous string match is also

	577 * contained in the set.

	578 * Continue the loop with the shortest end point of either this code point

	579 * or a matching set string.

	580 * + If no more set string matched after a previous string match,

	581 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

	582 * Stop if spanLength==0, otherwise continue the loop.

	583 *

	584 * By noting each end point of a set string match,

	585 * the function visits each string position at most once and finishes

	586 * in linear time.

	587 *

	588 * The recursive algorithm may visit the same string position many times

	589 * if multiple paths lead to it and finishes in exponential time.

	590 */

	591

	592 /*

	593 * Algorithm for span(USET_SPAN_SIMPLE)

	594 *

	595 * Theoretical algorithm:

	596 * - Iterate through the string, and at each code point boundary:

	597 * + If the code point there is in the set, then remember to continue after it .

	598 * + If a set string matches at the current position, then remember to continu e after it.

	599 * + Continue from the farthest match position and ignore all others.

	600 * + If there is no match at the current position,

	601 * then stop and return the current position.

	602 *

	603 * Optimized implementation:

	604 *

	605 * (Same assumption and spanSet as above.)

	606 *

	607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

	608 * - Loop:

	609 * + Try to match each set string at the end of the spanLength.

	610 * ~ Set strings that start with set-contained code points must be matched

	611 * with a partial overlap because the standard algorithm would have tried

	612 * to match them earlier.

	613 * ~ Set strings that entirely consist of set-contained code points

	614 * must be matched with a full overlap because the longest-match algorithm

	615 * would hide set string matches that end earlier.

	616 * Such set strings need not be matched earlier inside the code point span

	617 * because the standard algorithm would then have continued after

	618 * the set string match anyway.

	619 * ~ Remember the longest set string match (farthest end point) from the ear liest

	620 * starting point.

	621 * + If no set string matched after spanSet.span(), then return

	622 * with where the spanSet.span() ended.

	623 * + If at least one set string matched, then continue the loop after the

	624 * longest match from the earliest position.

	625 * + If no more set string matched after a previous string match,

	626 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

	627 * Stop if spanLength==0, otherwise continue the loop.

	628 */

	629

	630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi tion spanCondition) const {

	631 if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	632 return spanNot(s, length);

	633 }

	634 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);

	635 if(spanLength==length) {

	636 return length;

	637 }

	638

	639 // Consider strings; they may overlap with the span.

	640 OffsetList offsets;

	641 if(spanCondition==USET_SPAN_CONTAINED) {

	642 // Use offset list to try all possibilities.

	643 offsets.setMaxLength(maxLength16);

	644 }

	645 int32_t pos=spanLength, rest=length-pos;

	646 int32_t i, stringsLength=strings.size();

	647 for(;;) {

	648 if(spanCondition==USET_SPAN_CONTAINED) {

	649 for(i=0; i<stringsLength; ++i) {

	650 int32_t overlap=spanLengths[i];

	651 if(overlap==ALL_CP_CONTAINED) {

	652 continue; // Irrelevant string.

	653 }

	654 const UnicodeString &string=(const UnicodeString )strings.elem entAt(i);

	655 const UChar *s16=string.getBuffer();

	656 int32_t length16=string.length();

	657

	658 // Try to match this string at pos-overlap..pos.

	659 if(overlap>=LONG_SPAN) {

	660 overlap=length16;

	661 // While contained: No point matching fully inside the code point span.

	662 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.

	663 }

	664 if(overlap>spanLength) {

	665 overlap=spanLength;

	666 }

	667 int32_t inc=length16-overlap; // Keep overlap+inc==length16.

	668 for(;;) {

	669 if(inc>rest) {

	670 break;

	671 }

	672 // Try to match if the increment is not listed already.

	673 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overl ap, length, s16, length16)) {

	674 if(inc==rest) {

	675 return length; // Reached the end of the string.

	676 }

	677 offsets.addOffset(inc);

	678 }

	679 if(overlap==0) {

	680 break;

	681 }

	682 --overlap;

	683 ++inc;

	684 }

	685 }

	686 } else /* USET_SPAN_SIMPLE */ {

	687 int32_t maxInc=0, maxOverlap=0;

	688 for(i=0; i<stringsLength; ++i) {

	689 int32_t overlap=spanLengths[i];

	690 // For longest match, we do need to try to match even an all-con tained string

	691 // to find the match from the earliest start.

	692

	693 const UnicodeString &string=(const UnicodeString )strings.elem entAt(i);

	694 const UChar *s16=string.getBuffer();

	695 int32_t length16=string.length();

	696

	697 // Try to match this string at pos-overlap..pos.

	698 if(overlap>=LONG_SPAN) {

	699 overlap=length16;

	700 // Longest match: Need to match fully inside the code point span

	701 // to find the match from the earliest start.

	702 }

	703 if(overlap>spanLength) {

	704 overlap=spanLength;

	705 }

	706 int32_t inc=length16-overlap; // Keep overlap+inc==length16.

	707 for(;;) {

	708 if(inc>rest \|\| overlap<maxOverlap) {

	709 break;

	710 }

	711 // Try to match if the string is longer or starts earlier.

	712 if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ inc>maxInc) &&

	713 matches16CPB(s, pos-overlap, length, s16, length16)

	714 ) {

	715 maxInc=inc; // Longest match from earliest start.

	716 maxOverlap=overlap;

	717 break;

	718 }

	719 --overlap;

	720 ++inc;

	721 }

	722 }

	723

	724 if(maxInc!=0 \|\| maxOverlap!=0) {

	725 // Longest-match algorithm, and there was a string match.

	726 // Simply continue after it.

	727 pos+=maxInc;

	728 rest-=maxInc;

	729 if(rest==0) {

	730 return length; // Reached the end of the string.

	731 }

	732 spanLength=0; // Match strings from after a string match.

	733 continue;

	734 }

	735 }

	736 // Finished trying to match all strings at pos.

	737

	738 if(spanLength!=0 \|\| pos==0) {

	739 // The position is after an unlimited code point span (spanLength!=0 ),

	740 // not after a string match.

	741 // The only position where spanLength==0 after a span is pos==0.

	742 // Otherwise, an unlimited code point span is only tried again when no

	743 // strings match, and if such a non-initial span fails we stop.

	744 if(offsets.isEmpty()) {

	745 return pos; // No strings matched after a span.

	746 }

	747 // Match strings from after the next string match.

	748 } else {

	749 // The position is after a string match (or a single code point).

	750 if(offsets.isEmpty()) {

	751 // No more strings matched after a previous string match.

	752 // Try another code point span from after the last string match.

	753 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);

	754 if( spanLength==rest \|\| // Reached the end of the string, or

	755 spanLength==0 // neither strings nor span progressed.

	756 ) {

	757 return pos+spanLength;

	758 }

	759 pos+=spanLength;

	760 rest-=spanLength;

	761 continue; // spanLength>0: Match strings from after a span.

	762 } else {

	763 // Try to match only one code point from after a string match if some

	764 // string matched beyond it, so that we try all possible positio ns

	765 // and don't overshoot.

	766 spanLength=spanOne(spanSet, s+pos, rest);

	767 if(spanLength>0) {

	768 if(spanLength==rest) {

	769 return length; // Reached the end of the string.

	770 }

	771 // Match strings after this code point.

	772 // There cannot be any increments below it because UnicodeSe t strings

	773 // contain multiple code points.

	774 pos+=spanLength;

	775 rest-=spanLength;

	776 offsets.shift(spanLength);

	777 spanLength=0;

	778 continue; // Match strings from after a single code point.

	779 }

	780 // Match strings from after the next string match.

	781 }

	782 }

	783 int32_t minOffset=offsets.popMinimum();

	784 pos+=minOffset;

	785 rest-=minOffset;

	786 spanLength=0; // Match strings from after a string match.

	787 }

	788 }

	789

	790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC ondition spanCondition) const {

	791 if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	792 return spanNotBack(s, length);

	793 }

	794 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);

	795 if(pos==0) {

	796 return 0;

	797 }

	798 int32_t spanLength=length-pos;

	799

	800 // Consider strings; they may overlap with the span.

	801 OffsetList offsets;

	802 if(spanCondition==USET_SPAN_CONTAINED) {

	803 // Use offset list to try all possibilities.

	804 offsets.setMaxLength(maxLength16);

	805 }

	806 int32_t i, stringsLength=strings.size();

	807 uint8_t *spanBackLengths=spanLengths;

	808 if(all) {

	809 spanBackLengths+=stringsLength;

	810 }

	811 for(;;) {

	812 if(spanCondition==USET_SPAN_CONTAINED) {

	813 for(i=0; i<stringsLength; ++i) {

	814 int32_t overlap=spanBackLengths[i];

	815 if(overlap==ALL_CP_CONTAINED) {

	816 continue; // Irrelevant string.

	817 }

	818 const UnicodeString &string=(const UnicodeString )strings.elem entAt(i);

	819 const UChar *s16=string.getBuffer();

	820 int32_t length16=string.length();

	821

	822 // Try to match this string at pos-(length16-overlap)..pos-lengt h16.

	823 if(overlap>=LONG_SPAN) {

	824 overlap=length16;

	825 // While contained: No point matching fully inside the code point span.

	826 int32_t len1=0;

	827 U16_FWD_1(s16, len1, overlap);

	828 overlap-=len1; // Length of the string minus the first code point.

	829 }

	830 if(overlap>spanLength) {

	831 overlap=spanLength;

	832 }

	833 int32_t dec=length16-overlap; // Keep dec+overlap==length16.

	834 for(;;) {

	835 if(dec>pos) {

	836 break;

	837 }

	838 // Try to match if the decrement is not listed already.

	839 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {

	840 if(dec==pos) {

	841 return 0; // Reached the start of the string.

	842 }

	843 offsets.addOffset(dec);

	844 }

	845 if(overlap==0) {

	846 break;

	847 }

	848 --overlap;

	849 ++dec;

	850 }

	851 }

	852 } else /* USET_SPAN_SIMPLE */ {

	853 int32_t maxDec=0, maxOverlap=0;

	854 for(i=0; i<stringsLength; ++i) {

	855 int32_t overlap=spanBackLengths[i];

	856 // For longest match, we do need to try to match even an all-con tained string

	857 // to find the match from the latest end.

	858

	859 const UnicodeString &string=(const UnicodeString )strings.elem entAt(i);

	860 const UChar *s16=string.getBuffer();

	861 int32_t length16=string.length();

	862

	863 // Try to match this string at pos-(length16-overlap)..pos-lengt h16.

	864 if(overlap>=LONG_SPAN) {

	865 overlap=length16;

	866 // Longest match: Need to match fully inside the code point span

	867 // to find the match from the latest end.

	868 }

	869 if(overlap>spanLength) {

	870 overlap=spanLength;

	871 }

	872 int32_t dec=length16-overlap; // Keep dec+overlap==length16.

	873 for(;;) {

	874 if(dec>pos \|\| overlap<maxOverlap) {

	875 break;

	876 }

	877 // Try to match if the string is longer or ends later.

	878 if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&

	879 matches16CPB(s, pos-dec, length, s16, length16)

	880 ) {

	881 maxDec=dec; // Longest match from latest end.

	882 maxOverlap=overlap;

	883 break;

	884 }

	885 --overlap;

	886 ++dec;

	887 }

	888 }

	889

	890 if(maxDec!=0 \|\| maxOverlap!=0) {

	891 // Longest-match algorithm, and there was a string match.

	892 // Simply continue before it.

	893 pos-=maxDec;

	894 if(pos==0) {

	895 return 0; // Reached the start of the string.

	896 }

	897 spanLength=0; // Match strings from before a string match.

	898 continue;

	899 }

	900 }

	901 // Finished trying to match all strings at pos.

	902

	903 if(spanLength!=0 \|\| pos==length) {

	904 // The position is before an unlimited code point span (spanLength!= 0),

	905 // not before a string match.

	906 // The only position where spanLength==0 before a span is pos==lengt h.

	907 // Otherwise, an unlimited code point span is only tried again when no

	908 // strings match, and if such a non-initial span fails we stop.

	909 if(offsets.isEmpty()) {

	910 return pos; // No strings matched before a span.

	911 }

	912 // Match strings from before the next string match.

	913 } else {

	914 // The position is before a string match (or a single code point).

	915 if(offsets.isEmpty()) {

	916 // No more strings matched before a previous string match.

	917 // Try another code point span from before the last string match .

	918 int32_t oldPos=pos;

	919 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);

	920 spanLength=oldPos-pos;

	921 if( pos==0 \|\| // Reached the start of the string, or

	922 spanLength==0 // neither strings nor span progressed.

	923 ) {

	924 return pos;

	925 }

	926 continue; // spanLength>0: Match strings from before a span.

	927 } else {

	928 // Try to match only one code point from before a string match i f some

	929 // string matched beyond it, so that we try all possible positio ns

	930 // and don't overshoot.

	931 spanLength=spanOneBack(spanSet, s, pos);

	932 if(spanLength>0) {

	933 if(spanLength==pos) {

	934 return 0; // Reached the start of the string.

	935 }

	936 // Match strings before this code point.

	937 // There cannot be any decrements below it because UnicodeSe t strings

	938 // contain multiple code points.

	939 pos-=spanLength;

	940 offsets.shift(spanLength);

	941 spanLength=0;

	942 continue; // Match strings from before a single code point.

	943 }

	944 // Match strings from before the next string match.

	945 }

	946 }

	947 pos-=offsets.popMinimum();

	948 spanLength=0; // Match strings from before a string match.

	949 }

	950 }

	951

	952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa nCondition spanCondition) const {

	953 if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	954 return spanNotUTF8(s, length);

	955 }

	956 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTA INED);

	957 if(spanLength==length) {

	958 return length;

	959 }

	960

	961 // Consider strings; they may overlap with the span.

	962 OffsetList offsets;

	963 if(spanCondition==USET_SPAN_CONTAINED) {

	964 // Use offset list to try all possibilities.

	965 offsets.setMaxLength(maxLength8);

	966 }

	967 int32_t pos=spanLength, rest=length-pos;

	968 int32_t i, stringsLength=strings.size();

	969 uint8_t *spanUTF8Lengths=spanLengths;

	970 if(all) {

	971 spanUTF8Lengths+=2*stringsLength;

	972 }

	973 for(;;) {

	974 const uint8_t *s8=utf8;

	975 int32_t length8;

	976 if(spanCondition==USET_SPAN_CONTAINED) {

	977 for(i=0; i<stringsLength; ++i) {

	978 length8=utf8Lengths[i];

	979 if(length8==0) {

	980 continue; // String not representable in UTF-8.

	981 }

	982 int32_t overlap=spanUTF8Lengths[i];

	983 if(overlap==ALL_CP_CONTAINED) {

	984 s8+=length8;

	985 continue; // Irrelevant string.

	986 }

	987

	988 // Try to match this string at pos-overlap..pos.

	989 if(overlap>=LONG_SPAN) {

	990 overlap=length8;

	991 // While contained: No point matching fully inside the code point span.

	992 U8_BACK_1(s8, 0, overlap); // Length of the string minus th e last code point.

	993 }

	994 if(overlap>spanLength) {

	995 overlap=spanLength;

	996 }

	997 int32_t inc=length8-overlap; // Keep overlap+inc==length8.

	998 for(;;) {

	999 if(inc>rest) {

	1000 break;

	1001 }

	1002 // Try to match if the increment is not listed already.

	1003 // Match at code point boundaries. (The UTF-8 strings were c onverted

	1004 // from UTF-16 and are guaranteed to be well-formed.)

	1005 if( !U8_IS_TRAIL(s[pos-overlap]) &&

	1006 !offsets.containsOffset(inc) &&

	1007 matches8(s+pos-overlap, s8, length8)

	1008

	1009 ) {

	1010 if(inc==rest) {

	1011 return length; // Reached the end of the string.

	1012 }

	1013 offsets.addOffset(inc);

	1014 }

	1015 if(overlap==0) {

	1016 break;

	1017 }

	1018 --overlap;

	1019 ++inc;

	1020 }

	1021 s8+=length8;

	1022 }

	1023 } else /* USET_SPAN_SIMPLE */ {

	1024 int32_t maxInc=0, maxOverlap=0;

	1025 for(i=0; i<stringsLength; ++i) {

	1026 length8=utf8Lengths[i];

	1027 if(length8==0) {

	1028 continue; // String not representable in UTF-8.

	1029 }

	1030 int32_t overlap=spanUTF8Lengths[i];

	1031 // For longest match, we do need to try to match even an all-con tained string

	1032 // to find the match from the earliest start.

	1033

	1034 // Try to match this string at pos-overlap..pos.

	1035 if(overlap>=LONG_SPAN) {

	1036 overlap=length8;

	1037 // Longest match: Need to match fully inside the code point span

	1038 // to find the match from the earliest start.

	1039 }

	1040 if(overlap>spanLength) {

	1041 overlap=spanLength;

	1042 }

	1043 int32_t inc=length8-overlap; // Keep overlap+inc==length8.

	1044 for(;;) {

	1045 if(inc>rest \|\| overlap<maxOverlap) {

	1046 break;

	1047 }

	1048 // Try to match if the string is longer or starts earlier.

	1049 // Match at code point boundaries. (The UTF-8 strings were c onverted

	1050 // from UTF-16 and are guaranteed to be well-formed.)

	1051 if( !U8_IS_TRAIL(s[pos-overlap]) &&

	1052 (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ inc>maxInc) &&

	1053 matches8(s+pos-overlap, s8, length8)

	1054

	1055 ) {

	1056 maxInc=inc; // Longest match from earliest start.

	1057 maxOverlap=overlap;

	1058 break;

	1059 }

	1060 --overlap;

	1061 ++inc;

	1062 }

	1063 s8+=length8;

	1064 }

	1065

	1066 if(maxInc!=0 \|\| maxOverlap!=0) {

	1067 // Longest-match algorithm, and there was a string match.

	1068 // Simply continue after it.

	1069 pos+=maxInc;

	1070 rest-=maxInc;

	1071 if(rest==0) {

	1072 return length; // Reached the end of the string.

	1073 }

	1074 spanLength=0; // Match strings from after a string match.

	1075 continue;

	1076 }

	1077 }

	1078 // Finished trying to match all strings at pos.

	1079

	1080 if(spanLength!=0 \|\| pos==0) {

	1081 // The position is after an unlimited code point span (spanLength!=0 ),

	1082 // not after a string match.

	1083 // The only position where spanLength==0 after a span is pos==0.

	1084 // Otherwise, an unlimited code point span is only tried again when no

	1085 // strings match, and if such a non-initial span fails we stop.

	1086 if(offsets.isEmpty()) {

	1087 return pos; // No strings matched after a span.

	1088 }

	1089 // Match strings from after the next string match.

	1090 } else {

	1091 // The position is after a string match (or a single code point).

	1092 if(offsets.isEmpty()) {

	1093 // No more strings matched after a previous string match.

	1094 // Try another code point span from after the last string match.

	1095 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN _CONTAINED);

	1096 if( spanLength==rest \|\| // Reached the end of the string, or

	1097 spanLength==0 // neither strings nor span progressed.

	1098 ) {

	1099 return pos+spanLength;

	1100 }

	1101 pos+=spanLength;

	1102 rest-=spanLength;

	1103 continue; // spanLength>0: Match strings from after a span.

	1104 } else {

	1105 // Try to match only one code point from after a string match if some

	1106 // string matched beyond it, so that we try all possible positio ns

	1107 // and don't overshoot.

	1108 spanLength=spanOneUTF8(spanSet, s+pos, rest);

	1109 if(spanLength>0) {

	1110 if(spanLength==rest) {

	1111 return length; // Reached the end of the string.

	1112 }

	1113 // Match strings after this code point.

	1114 // There cannot be any increments below it because UnicodeSe t strings

	1115 // contain multiple code points.

	1116 pos+=spanLength;

	1117 rest-=spanLength;

	1118 offsets.shift(spanLength);

	1119 spanLength=0;

	1120 continue; // Match strings from after a single code point.

	1121 }

	1122 // Match strings from after the next string match.

	1123 }

	1124 }

	1125 int32_t minOffset=offsets.popMinimum();

	1126 pos+=minOffset;

	1127 rest-=minOffset;

	1128 spanLength=0; // Match strings from after a string match.

	1129 }

	1130 }

	1131

	1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USe tSpanCondition spanCondition) const {

	1133 if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	1134 return spanNotBackUTF8(s, length);

	1135 }

	1136 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINE D);

	1137 if(pos==0) {

	1138 return 0;

	1139 }

	1140 int32_t spanLength=length-pos;

	1141

	1142 // Consider strings; they may overlap with the span.

	1143 OffsetList offsets;

	1144 if(spanCondition==USET_SPAN_CONTAINED) {

	1145 // Use offset list to try all possibilities.

	1146 offsets.setMaxLength(maxLength8);

	1147 }

	1148 int32_t i, stringsLength=strings.size();

	1149 uint8_t *spanBackUTF8Lengths=spanLengths;

	1150 if(all) {

	1151 spanBackUTF8Lengths+=3*stringsLength;

	1152 }

	1153 for(;;) {

	1154 const uint8_t *s8=utf8;

	1155 int32_t length8;

	1156 if(spanCondition==USET_SPAN_CONTAINED) {

	1157 for(i=0; i<stringsLength; ++i) {

	1158 length8=utf8Lengths[i];

	1159 if(length8==0) {

	1160 continue; // String not representable in UTF-8.

	1161 }

	1162 int32_t overlap=spanBackUTF8Lengths[i];

	1163 if(overlap==ALL_CP_CONTAINED) {

	1164 s8+=length8;

	1165 continue; // Irrelevant string.

	1166 }

	1167

	1168 // Try to match this string at pos-(length8-overlap)..pos-length 8.

	1169 if(overlap>=LONG_SPAN) {

	1170 overlap=length8;

	1171 // While contained: No point matching fully inside the code point span.

	1172 int32_t len1=0;

	1173 U8_FWD_1(s8, len1, overlap);

	1174 overlap-=len1; // Length of the string minus the first code point.

	1175 }

	1176 if(overlap>spanLength) {

	1177 overlap=spanLength;

	1178 }

	1179 int32_t dec=length8-overlap; // Keep dec+overlap==length8.

	1180 for(;;) {

	1181 if(dec>pos) {

	1182 break;

	1183 }

	1184 // Try to match if the decrement is not listed already.

	1185 // Match at code point boundaries. (The UTF-8 strings were c onverted

	1186 // from UTF-16 and are guaranteed to be well-formed.)

	1187 if( !U8_IS_TRAIL(s[pos-dec]) &&

	1188 !offsets.containsOffset(dec) &&

	1189 matches8(s+pos-dec, s8, length8)

	1190 ) {

	1191 if(dec==pos) {

	1192 return 0; // Reached the start of the string.

	1193 }

	1194 offsets.addOffset(dec);

	1195 }

	1196 if(overlap==0) {

	1197 break;

	1198 }

	1199 --overlap;

	1200 ++dec;

	1201 }

	1202 s8+=length8;

	1203 }

	1204 } else /* USET_SPAN_SIMPLE */ {

	1205 int32_t maxDec=0, maxOverlap=0;

	1206 for(i=0; i<stringsLength; ++i) {

	1207 length8=utf8Lengths[i];

	1208 if(length8==0) {

	1209 continue; // String not representable in UTF-8.

	1210 }

	1211 int32_t overlap=spanBackUTF8Lengths[i];

	1212 // For longest match, we do need to try to match even an all-con tained string

	1213 // to find the match from the latest end.

	1214

	1215 // Try to match this string at pos-(length8-overlap)..pos-length 8.

	1216 if(overlap>=LONG_SPAN) {

	1217 overlap=length8;

	1218 // Longest match: Need to match fully inside the code point span

	1219 // to find the match from the latest end.

	1220 }

	1221 if(overlap>spanLength) {

	1222 overlap=spanLength;

	1223 }

	1224 int32_t dec=length8-overlap; // Keep dec+overlap==length8.

	1225 for(;;) {

	1226 if(dec>pos \|\| overlap<maxOverlap) {

	1227 break;

	1228 }

	1229 // Try to match if the string is longer or ends later.

	1230 // Match at code point boundaries. (The UTF-8 strings were c onverted

	1231 // from UTF-16 and are guaranteed to be well-formed.)

	1232 if( !U8_IS_TRAIL(s[pos-dec]) &&

	1233 (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&

	1234 matches8(s+pos-dec, s8, length8)

	1235 ) {

	1236 maxDec=dec; // Longest match from latest end.

	1237 maxOverlap=overlap;

	1238 break;

	1239 }

	1240 --overlap;

	1241 ++dec;

	1242 }

	1243 s8+=length8;

	1244 }

	1245

	1246 if(maxDec!=0 \|\| maxOverlap!=0) {

	1247 // Longest-match algorithm, and there was a string match.

	1248 // Simply continue before it.

	1249 pos-=maxDec;

	1250 if(pos==0) {

	1251 return 0; // Reached the start of the string.

	1252 }

	1253 spanLength=0; // Match strings from before a string match.

	1254 continue;

	1255 }

	1256 }

	1257 // Finished trying to match all strings at pos.

	1258

	1259 if(spanLength!=0 \|\| pos==length) {

	1260 // The position is before an unlimited code point span (spanLength!= 0),

	1261 // not before a string match.

	1262 // The only position where spanLength==0 before a span is pos==lengt h.

	1263 // Otherwise, an unlimited code point span is only tried again when no

	1264 // strings match, and if such a non-initial span fails we stop.

	1265 if(offsets.isEmpty()) {

	1266 return pos; // No strings matched before a span.

	1267 }

	1268 // Match strings from before the next string match.

	1269 } else {

	1270 // The position is before a string match (or a single code point).

	1271 if(offsets.isEmpty()) {

	1272 // No more strings matched before a previous string match.

	1273 // Try another code point span from before the last string match .

	1274 int32_t oldPos=pos;

	1275 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONT AINED);

	1276 spanLength=oldPos-pos;

	1277 if( pos==0 \|\| // Reached the start of the string, or

	1278 spanLength==0 // neither strings nor span progressed.

	1279 ) {

	1280 return pos;

	1281 }

	1282 continue; // spanLength>0: Match strings from before a span.

	1283 } else {

	1284 // Try to match only one code point from before a string match i f some

	1285 // string matched beyond it, so that we try all possible positio ns

	1286 // and don't overshoot.

	1287 spanLength=spanOneBackUTF8(spanSet, s, pos);

	1288 if(spanLength>0) {

	1289 if(spanLength==pos) {

	1290 return 0; // Reached the start of the string.

	1291 }

	1292 // Match strings before this code point.

	1293 // There cannot be any decrements below it because UnicodeSe t strings

	1294 // contain multiple code points.

	1295 pos-=spanLength;

	1296 offsets.shift(spanLength);

	1297 spanLength=0;

	1298 continue; // Match strings from before a single code point.

	1299 }

	1300 // Match strings from before the next string match.

	1301 }

	1302 }

	1303 pos-=offsets.popMinimum();

	1304 spanLength=0; // Match strings from before a string match.

	1305 }

	1306 }

	1307

	1308 /*

	1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)

	1310 *

	1311 * Theoretical algorithm:

	1312 * - Iterate through the string, and at each code point boundary:

	1313 * + If the code point there is in the set, then return with the current posit ion.

	1314 * + If a set string matches at the current position, then return with the cur rent position.

	1315 *

	1316 * Optimized implementation:

	1317 *

	1318 * (Same assumption as for span() above.)

	1319 *

	1320 * Create and cache a spanNotSet which contains all of the single code points

	1321 * of the original set but none of its strings.

	1322 * For each set string add its initial code point to the spanNotSet.

	1323 * (Also add its final code point for spanNotBack().)

	1324 *

	1325 * - Loop:

	1326 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).

	1327 * + If the current code point is in the original set, then

	1328 * return the current position.

	1329 * + If any set string matches at the current position, then

	1330 * return the current position.

	1331 * + If there is no match at the current position, neither for the code point there

	1332 * nor for any set string, then skip this code point and continue the loop.

	1333 * This happens for set-string-initial code points that were added to spanNo tSet

	1334 * when there is not actually a match for such a set string.

	1335 */

	1336

	1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {

	1338 int32_t pos=0, rest=length;

	1339 int32_t i, stringsLength=strings.size();

	1340 do {

	1341 // Span until we find a code point from the set,

	1342 // or a code point that starts or ends some string.

	1343 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);

	1344 if(i==rest) {

	1345 return length; // Reached the end of the string.

	1346 }

	1347 pos+=i;

	1348 rest-=i;

	1349

	1350 // Check whether the current code point is in the original set,

	1351 // without the string starts and ends.

	1352 int32_t cpLength=spanOne(spanSet, s+pos, rest);

	1353 if(cpLength>0) {

	1354 return pos; // There is a set element at pos.

	1355 }

	1356

	1357 // Try to match the strings at pos.

	1358 for(i=0; i<stringsLength; ++i) {

	1359 if(spanLengths[i]==ALL_CP_CONTAINED) {

	1360 continue; // Irrelevant string.

	1361 }

	1362 const UnicodeString &string=(const UnicodeString )strings.elementA t(i);

	1363 const UChar *s16=string.getBuffer();

	1364 int32_t length16=string.length();

	1365 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {

	1366 return pos; // There is a set element at pos.

	1367 }

	1368 }

	1369

	1370 // The span(while not contained) ended on a string start/end which is

	1371 // not in the original set. Skip this code point and continue.

	1372 // cpLength<0

	1373 pos-=cpLength;

	1374 rest+=cpLength;

	1375 } while(rest!=0);

	1376 return length; // Reached the end of the string.

	1377 }

	1378

	1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {

	1380 int32_t pos=length;

	1381 int32_t i, stringsLength=strings.size();

	1382 do {

	1383 // Span until we find a code point from the set,

	1384 // or a code point that starts or ends some string.

	1385 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);

	1386 if(pos==0) {

	1387 return 0; // Reached the start of the string.

	1388 }

	1389

	1390 // Check whether the current code point is in the original set,

	1391 // without the string starts and ends.

	1392 int32_t cpLength=spanOneBack(spanSet, s, pos);

	1393 if(cpLength>0) {

	1394 return pos; // There is a set element at pos.

	1395 }

	1396

	1397 // Try to match the strings at pos.

	1398 for(i=0; i<stringsLength; ++i) {

	1399 // Use spanLengths rather than a spanBackLengths pointer because

	1400 // it is easier and we only need to know whether the string is irrel evant

	1401 // which is the same in either array.

	1402 if(spanLengths[i]==ALL_CP_CONTAINED) {

	1403 continue; // Irrelevant string.

	1404 }

	1405 const UnicodeString &string=(const UnicodeString )strings.elementA t(i);

	1406 const UChar *s16=string.getBuffer();

	1407 int32_t length16=string.length();

	1408 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, lengt h16)) {

	1409 return pos; // There is a set element at pos.

	1410 }

	1411 }

	1412

	1413 // The span(while not contained) ended on a string start/end which is

	1414 // not in the original set. Skip this code point and continue.

	1415 // cpLength<0

	1416 pos+=cpLength;

	1417 } while(pos!=0);

	1418 return 0; // Reached the start of the string.

	1419 }

	1420

	1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) cons t {

	1422 int32_t pos=0, rest=length;

	1423 int32_t i, stringsLength=strings.size();

	1424 uint8_t *spanUTF8Lengths=spanLengths;

	1425 if(all) {

	1426 spanUTF8Lengths+=2*stringsLength;

	1427 }

	1428 do {

	1429 // Span until we find a code point from the set,

	1430 // or a code point that starts or ends some string.

	1431 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAIN ED);

	1432 if(i==rest) {

	1433 return length; // Reached the end of the string.

	1434 }

	1435 pos+=i;

	1436 rest-=i;

	1437

	1438 // Check whether the current code point is in the original set,

	1439 // without the string starts and ends.

	1440 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);

	1441 if(cpLength>0) {

	1442 return pos; // There is a set element at pos.

	1443 }

	1444

	1445 // Try to match the strings at pos.

	1446 const uint8_t *s8=utf8;

	1447 int32_t length8;

	1448 for(i=0; i<stringsLength; ++i) {

	1449 length8=utf8Lengths[i];

	1450 // ALL_CP_CONTAINED: Irrelevant string.

	1451 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=re st && matches8(s+pos, s8, length8)) {

	1452 return pos; // There is a set element at pos.

	1453 }

	1454 s8+=length8;

	1455 }

	1456

	1457 // The span(while not contained) ended on a string start/end which is

	1458 // not in the original set. Skip this code point and continue.

	1459 // cpLength<0

	1460 pos-=cpLength;

	1461 rest+=cpLength;

	1462 } while(rest!=0);

	1463 return length; // Reached the end of the string.

	1464 }

	1465

	1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {

	1467 int32_t pos=length;

	1468 int32_t i, stringsLength=strings.size();

	1469 uint8_t *spanBackUTF8Lengths=spanLengths;

	1470 if(all) {

	1471 spanBackUTF8Lengths+=3*stringsLength;

	1472 }

	1473 do {

	1474 // Span until we find a code point from the set,

	1475 // or a code point that starts or ends some string.

	1476 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAI NED);

	1477 if(pos==0) {

	1478 return 0; // Reached the start of the string.

	1479 }

	1480

	1481 // Check whether the current code point is in the original set,

	1482 // without the string starts and ends.

	1483 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);

	1484 if(cpLength>0) {

	1485 return pos; // There is a set element at pos.

	1486 }

	1487

	1488 // Try to match the strings at pos.

	1489 const uint8_t *s8=utf8;

	1490 int32_t length8;

	1491 for(i=0; i<stringsLength; ++i) {

	1492 length8=utf8Lengths[i];

	1493 // ALL_CP_CONTAINED: Irrelevant string.

	1494 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8 <=pos && matches8(s+pos-length8, s8, length8)) {

	1495 return pos; // There is a set element at pos.

	1496 }

	1497 s8+=length8;

	1498 }

	1499

	1500 // The span(while not contained) ended on a string start/end which is

	1501 // not in the original set. Skip this code point and continue.

	1502 // cpLength<0

	1503 pos+=cpLength;

	1504 } while(pos!=0);

	1505 return 0; // Reached the start of the string.

	1506 }

	1507

	1508 U_NAMESPACE_END

OLD	NEW

« no previous file with comments | « icu46/source/common/unisetspan.h ('k') | icu46/source/common/unistr.cpp » ('j') | no next file with comments »