OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2007, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: unisetspan.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2007mar01 |
| 14 * created by: Markus W. Scherer |
| 15 */ |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 #include "unicode/uniset.h" |
| 19 #include "unicode/ustring.h" |
| 20 #include "cmemory.h" |
| 21 #include "uvector.h" |
| 22 #include "unisetspan.h" |
| 23 |
| 24 U_NAMESPACE_BEGIN |
| 25 |
| 26 /* |
| 27 * List of offsets from the current position from where to try matching |
| 28 * a code point or a string. |
| 29 * Store offsets rather than indexes to simplify the code and use the same list |
| 30 * for both increments (in span()) and decrements (in spanBack()). |
| 31 * |
| 32 * Assumption: The maximum offset is limited, and the offsets that are stored |
| 33 * at any one time are relatively dense, that is, there are normally no gaps of |
| 34 * hundreds or thousands of offset values. |
| 35 * |
| 36 * The implementation uses a circular buffer of byte flags, |
| 37 * each indicating whether the corresponding offset is in the list. |
| 38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and |
| 39 * physically moving part of the list. |
| 40 * |
| 41 * Note: In principle, the caller should setMaxLength() to the maximum of the |
| 42 * max string length and U16_LENGTH/U8_LENGTH to account for |
| 43 * "long" single code points. |
| 44 * However, this implementation uses at least a staticList with more than |
| 45 * U8_LENGTH entries anyway. |
| 46 * |
| 47 * Note: If maxLength were guaranteed to be no more than 32 or 64, |
| 48 * the list could be stored as bit flags in a single integer. |
| 49 * Rather than handling a circular buffer with a start list index, |
| 50 * the integer would simply be shifted when lower offsets are removed. |
| 51 * UnicodeSet does not have a limit on the lengths of strings. |
| 52 */ |
| 53 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemo
ry. |
| 54 public: |
| 55 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} |
| 56 |
| 57 ~OffsetList() { |
| 58 if(list!=staticList) { |
| 59 uprv_free(list); |
| 60 } |
| 61 } |
| 62 |
| 63 // Call exactly once if the list is to be used. |
| 64 void setMaxLength(int32_t maxLength) { |
| 65 if(maxLength<=(int32_t)sizeof(staticList)) { |
| 66 capacity=(int32_t)sizeof(staticList); |
| 67 } else { |
| 68 UBool *l=(UBool *)uprv_malloc(maxLength); |
| 69 if(l!=NULL) { |
| 70 list=l; |
| 71 capacity=maxLength; |
| 72 } |
| 73 } |
| 74 uprv_memset(list, 0, capacity); |
| 75 } |
| 76 |
| 77 void clear() { |
| 78 uprv_memset(list, 0, capacity); |
| 79 start=length=0; |
| 80 } |
| 81 |
| 82 UBool isEmpty() const { |
| 83 return (UBool)(length==0); |
| 84 } |
| 85 |
| 86 // Reduce all stored offsets by delta, used when the current position |
| 87 // moves by delta. |
| 88 // There must not be any offsets lower than delta. |
| 89 // If there is an offset equal to delta, it is removed. |
| 90 // delta=[1..maxLength] |
| 91 void shift(int32_t delta) { |
| 92 int32_t i=start+delta; |
| 93 if(i>=capacity) { |
| 94 i-=capacity; |
| 95 } |
| 96 if(list[i]) { |
| 97 list[i]=FALSE; |
| 98 --length; |
| 99 } |
| 100 start=i; |
| 101 } |
| 102 |
| 103 // Add an offset. The list must not contain it yet. |
| 104 // offset=[1..maxLength] |
| 105 void addOffset(int32_t offset) { |
| 106 int32_t i=start+offset; |
| 107 if(i>=capacity) { |
| 108 i-=capacity; |
| 109 } |
| 110 list[i]=TRUE; |
| 111 ++length; |
| 112 } |
| 113 |
| 114 // offset=[1..maxLength] |
| 115 UBool containsOffset(int32_t offset) const { |
| 116 int32_t i=start+offset; |
| 117 if(i>=capacity) { |
| 118 i-=capacity; |
| 119 } |
| 120 return list[i]; |
| 121 } |
| 122 |
| 123 // Find the lowest stored offset from a non-empty list, remove it, |
| 124 // and reduce all other offsets by this minimum. |
| 125 // Returns [1..maxLength]. |
| 126 int32_t popMinimum() { |
| 127 // Look for the next offset in list[start+1..capacity-1]. |
| 128 int32_t i=start, result; |
| 129 while(++i<capacity) { |
| 130 if(list[i]) { |
| 131 list[i]=FALSE; |
| 132 --length; |
| 133 result=i-start; |
| 134 start=i; |
| 135 return result; |
| 136 } |
| 137 } |
| 138 // i==capacity |
| 139 |
| 140 // Wrap around and look for the next offset in list[0..start]. |
| 141 // Since the list is not empty, there will be one. |
| 142 result=capacity-start; |
| 143 i=0; |
| 144 while(!list[i]) { |
| 145 ++i; |
| 146 } |
| 147 list[i]=FALSE; |
| 148 --length; |
| 149 start=i; |
| 150 return result+=i; |
| 151 } |
| 152 |
| 153 private: |
| 154 UBool *list; |
| 155 int32_t capacity; |
| 156 int32_t length; |
| 157 int32_t start; |
| 158 |
| 159 UBool staticList[16]; |
| 160 }; |
| 161 |
| 162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. |
| 163 static int32_t |
| 164 getUTF8Length(const UChar *s, int32_t length) { |
| 165 UErrorCode errorCode=U_ZERO_ERROR; |
| 166 int32_t length8=0; |
| 167 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); |
| 168 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 169 return length8; |
| 170 } else { |
| 171 // The string contains an unpaired surrogate. |
| 172 // Ignore this string. |
| 173 return 0; |
| 174 } |
| 175 } |
| 176 |
| 177 // Append the UTF-8 version of the string to t and return the appended UTF-8 len
gth. |
| 178 static int32_t |
| 179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { |
| 180 UErrorCode errorCode=U_ZERO_ERROR; |
| 181 int32_t length8=0; |
| 182 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); |
| 183 if(U_SUCCESS(errorCode)) { |
| 184 return length8; |
| 185 } else { |
| 186 // The string contains an unpaired surrogate. |
| 187 // Ignore this string. |
| 188 return 0; |
| 189 } |
| 190 } |
| 191 |
| 192 static inline uint8_t |
| 193 makeSpanLengthByte(int32_t spanLength) { |
| 194 // 0xfe==UnicodeSetStringSpan::LONG_SPAN |
| 195 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; |
| 196 } |
| 197 |
| 198 // Construct for all variants of span(), or only for any one variant. |
| 199 // Initialize as little as possible, for single use. |
| 200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, |
| 201 const UVector &setStrings, |
| 202 uint32_t which) |
| 203 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), |
| 204 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), |
| 205 utf8Length(0), |
| 206 maxLength16(0), maxLength8(0), |
| 207 all((UBool)(which==ALL)) { |
| 208 spanSet.retainAll(set); |
| 209 if(which&NOT_CONTAINED) { |
| 210 // Default to the same sets. |
| 211 // addToSpanNotSet() will create a separate set if necessary. |
| 212 pSpanNotSet=&spanSet; |
| 213 } |
| 214 |
| 215 // Determine if the strings even need to be taken into account at all for sp
an() etc. |
| 216 // If any string is relevant, then all strings need to be used for |
| 217 // span(longest match) but only the relevant ones for span(while contained). |
| 218 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH |
| 219 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. |
| 220 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are r
elevant after all.) |
| 221 // Also count the lengths of the UTF-8 versions of the strings for memory al
location. |
| 222 int32_t stringsLength=strings.size(); |
| 223 |
| 224 int32_t i, spanLength; |
| 225 UBool someRelevant=FALSE; |
| 226 for(i=0; i<stringsLength; ++i) { |
| 227 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i)
; |
| 228 const UChar *s16=string.getBuffer(); |
| 229 int32_t length16=string.length(); |
| 230 UBool thisRelevant; |
| 231 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); |
| 232 if(spanLength<length16) { // Relevant string. |
| 233 someRelevant=thisRelevant=TRUE; |
| 234 } else { |
| 235 thisRelevant=FALSE; |
| 236 } |
| 237 if((which&UTF16) && length16>maxLength16) { |
| 238 maxLength16=length16; |
| 239 } |
| 240 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { |
| 241 int32_t length8=getUTF8Length(s16, length16); |
| 242 utf8Length+=length8; |
| 243 if(length8>maxLength8) { |
| 244 maxLength8=length8; |
| 245 } |
| 246 } |
| 247 } |
| 248 if(!someRelevant) { |
| 249 maxLength16=maxLength8=0; |
| 250 return; |
| 251 } |
| 252 |
| 253 // Freeze after checking for the need to use strings at all because freezing |
| 254 // a set takes some time and memory which are wasted if there are no relevan
t strings. |
| 255 if(all) { |
| 256 spanSet.freeze(); |
| 257 } |
| 258 |
| 259 uint8_t *spanBackLengths; |
| 260 uint8_t *spanUTF8Lengths; |
| 261 uint8_t *spanBackUTF8Lengths; |
| 262 |
| 263 // Allocate a block of meta data. |
| 264 int32_t allocSize; |
| 265 if(all) { |
| 266 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. |
| 267 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; |
| 268 } else { |
| 269 allocSize=stringsLength; // One set of span lengths. |
| 270 if(which&UTF8) { |
| 271 // UTF-8 lengths and UTF-8 strings. |
| 272 allocSize+=stringsLength*4+utf8Length; |
| 273 } |
| 274 } |
| 275 if(allocSize<=(int32_t)sizeof(staticLengths)) { |
| 276 utf8Lengths=staticLengths; |
| 277 } else { |
| 278 utf8Lengths=(int32_t *)uprv_malloc(allocSize); |
| 279 if(utf8Lengths==NULL) { |
| 280 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpa
nUTF16/8() return FALSE. |
| 281 return; // Out of memory. |
| 282 } |
| 283 } |
| 284 |
| 285 if(all) { |
| 286 // Store span lengths for all span() variants. |
| 287 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| 288 spanBackLengths=spanLengths+stringsLength; |
| 289 spanUTF8Lengths=spanBackLengths+stringsLength; |
| 290 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; |
| 291 utf8=spanBackUTF8Lengths+stringsLength; |
| 292 } else { |
| 293 // Store span lengths for only one span() variant. |
| 294 if(which&UTF8) { |
| 295 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| 296 utf8=spanLengths+stringsLength; |
| 297 } else { |
| 298 spanLengths=(uint8_t *)utf8Lengths; |
| 299 } |
| 300 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; |
| 301 } |
| 302 |
| 303 // Set the meta data and pSpanNotSet and write the UTF-8 strings. |
| 304 int32_t utf8Count=0; // Count UTF-8 bytes written so far. |
| 305 |
| 306 for(i=0; i<stringsLength; ++i) { |
| 307 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i)
; |
| 308 const UChar *s16=string.getBuffer(); |
| 309 int32_t length16=string.length(); |
| 310 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); |
| 311 if(spanLength<length16) { // Relevant string. |
| 312 if(which&UTF16) { |
| 313 if(which&CONTAINED) { |
| 314 if(which&FWD) { |
| 315 spanLengths[i]=makeSpanLengthByte(spanLength); |
| 316 } |
| 317 if(which&BACK) { |
| 318 spanLength=length16-spanSet.spanBack(s16, length16, USET
_SPAN_CONTAINED); |
| 319 spanBackLengths[i]=makeSpanLengthByte(spanLength); |
| 320 } |
| 321 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { |
| 322 spanLengths[i]=spanBackLengths[i]=0; // Only store a releva
nt/irrelevant flag. |
| 323 } |
| 324 } |
| 325 if(which&UTF8) { |
| 326 uint8_t *s8=utf8+utf8Count; |
| 327 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Cou
nt); |
| 328 utf8Count+=utf8Lengths[i]=length8; |
| 329 if(length8==0) { // Irrelevant for UTF-8 because not representa
ble in UTF-8. |
| 330 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CO
NTAINED; |
| 331 } else { // Relevant for UTF-8. |
| 332 if(which&CONTAINED) { |
| 333 if(which&FWD) { |
| 334 spanLength=spanSet.spanUTF8((const char *)s8, length
8, USET_SPAN_CONTAINED); |
| 335 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); |
| 336 } |
| 337 if(which&BACK) { |
| 338 spanLength=length8-spanSet.spanBackUTF8((const char
*)s8, length8, USET_SPAN_CONTAINED); |
| 339 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength
); |
| 340 } |
| 341 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { |
| 342 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only st
ore a relevant/irrelevant flag. |
| 343 } |
| 344 } |
| 345 } |
| 346 if(which&NOT_CONTAINED) { |
| 347 // Add string start and end code points to the spanNotSet so tha
t |
| 348 // a span(while not contained) stops before any string. |
| 349 UChar32 c; |
| 350 if(which&FWD) { |
| 351 int32_t len=0; |
| 352 U16_NEXT(s16, len, length16, c); |
| 353 addToSpanNotSet(c); |
| 354 } |
| 355 if(which&BACK) { |
| 356 int32_t len=length16; |
| 357 U16_PREV(s16, 0, len, c); |
| 358 addToSpanNotSet(c); |
| 359 } |
| 360 } |
| 361 } else { // Irrelevant string. |
| 362 if(which&UTF8) { |
| 363 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. |
| 364 uint8_t *s8=utf8+utf8Count; |
| 365 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf
8Count); |
| 366 utf8Count+=utf8Lengths[i]=length8; |
| 367 } else { |
| 368 utf8Lengths[i]=0; |
| 369 } |
| 370 } |
| 371 if(all) { |
| 372 spanLengths[i]=spanBackLengths[i]= |
| 373 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= |
| 374 (uint8_t)ALL_CP_CONTAINED; |
| 375 } else { |
| 376 // All spanXYZLengths pointers contain the same address. |
| 377 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; |
| 378 } |
| 379 } |
| 380 } |
| 381 |
| 382 // Finish. |
| 383 if(all) { |
| 384 pSpanNotSet->freeze(); |
| 385 } |
| 386 } |
| 387 |
| 388 // Copy constructor. Assumes which==ALL for a frozen set. |
| 389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStri
ngSpan, |
| 390 const UVector &newParentSetStrings) |
| 391 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParent
SetStrings), |
| 392 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), |
| 393 utf8Length(otherStringSpan.utf8Length), |
| 394 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.m
axLength8), |
| 395 all(TRUE) { |
| 396 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { |
| 397 pSpanNotSet=&spanSet; |
| 398 } else { |
| 399 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); |
| 400 } |
| 401 |
| 402 // Allocate a block of meta data. |
| 403 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. |
| 404 int32_t stringsLength=strings.size(); |
| 405 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; |
| 406 if(allocSize<=(int32_t)sizeof(staticLengths)) { |
| 407 utf8Lengths=staticLengths; |
| 408 } else { |
| 409 utf8Lengths=(int32_t *)uprv_malloc(allocSize); |
| 410 if(utf8Lengths==NULL) { |
| 411 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpa
nUTF16/8() return FALSE. |
| 412 return; // Out of memory. |
| 413 } |
| 414 } |
| 415 |
| 416 spanLengths=(uint8_t *)(utf8Lengths+stringsLength); |
| 417 utf8=spanLengths+stringsLength*4; |
| 418 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); |
| 419 } |
| 420 |
| 421 UnicodeSetStringSpan::~UnicodeSetStringSpan() { |
| 422 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { |
| 423 delete pSpanNotSet; |
| 424 } |
| 425 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { |
| 426 uprv_free(utf8Lengths); |
| 427 } |
| 428 } |
| 429 |
| 430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { |
| 431 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { |
| 432 if(spanSet.contains(c)) { |
| 433 return; // Nothing to do. |
| 434 } |
| 435 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); |
| 436 if(newSet==NULL) { |
| 437 return; // Out of memory. |
| 438 } else { |
| 439 pSpanNotSet=newSet; |
| 440 } |
| 441 } |
| 442 pSpanNotSet->add(c); |
| 443 } |
| 444 |
| 445 // Compare strings without any argument checks. Requires length>0. |
| 446 static inline UBool |
| 447 matches16(const UChar *s, const UChar *t, int32_t length) { |
| 448 do { |
| 449 if(*s++!=*t++) { |
| 450 return FALSE; |
| 451 } |
| 452 } while(--length>0); |
| 453 return TRUE; |
| 454 } |
| 455 |
| 456 static inline UBool |
| 457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { |
| 458 do { |
| 459 if(*s++!=*t++) { |
| 460 return FALSE; |
| 461 } |
| 462 } while(--length>0); |
| 463 return TRUE; |
| 464 } |
| 465 |
| 466 // Compare 16-bit Unicode strings (which may be malformed UTF-16) |
| 467 // at code point boundaries. |
| 468 // That is, each edge of a match must not be in the middle of a surrogate pair. |
| 469 static inline UBool |
| 470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32
_t length) { |
| 471 s+=start; |
| 472 limit-=start; |
| 473 return matches16(s, t, length) && |
| 474 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && |
| 475 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])
); |
| 476 } |
| 477 |
| 478 // Does the set contain the next code point? |
| 479 // If so, return its length; otherwise return its negative length. |
| 480 static inline int32_t |
| 481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { |
| 482 UChar c=*s, c2; |
| 483 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { |
| 484 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; |
| 485 } |
| 486 return set.contains(c) ? 1 : -1; |
| 487 } |
| 488 |
| 489 static inline int32_t |
| 490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { |
| 491 UChar c=s[length-1], c2; |
| 492 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { |
| 493 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; |
| 494 } |
| 495 return set.contains(c) ? 1 : -1; |
| 496 } |
| 497 |
| 498 static inline int32_t |
| 499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { |
| 500 UChar32 c=*s; |
| 501 if((int8_t)c>=0) { |
| 502 return set.contains(c) ? 1 : -1; |
| 503 } |
| 504 // Take advantage of non-ASCII fastpaths in U8_NEXT(). |
| 505 int32_t i=0; |
| 506 U8_NEXT(s, i, length, c); |
| 507 return set.contains(c) ? i : -i; |
| 508 } |
| 509 |
| 510 static inline int32_t |
| 511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { |
| 512 UChar32 c=s[length-1]; |
| 513 if((int8_t)c>=0) { |
| 514 return set.contains(c) ? 1 : -1; |
| 515 } |
| 516 int32_t i=length-1; |
| 517 c=utf8_prevCharSafeBody(s, 0, &i, c, -1); |
| 518 length-=i; |
| 519 return set.contains(c) ? length : -length; |
| 520 } |
| 521 |
| 522 /* |
| 523 * Note: In span() when spanLength==0 (after a string match, or at the beginning |
| 524 * after an empty code point span) and in spanNot() and spanNotUTF8(), |
| 525 * string matching could use a binary search |
| 526 * because all string matches are done from the same start index. |
| 527 * |
| 528 * For UTF-8, this would require a comparison function that returns UTF-16 order
. |
| 529 * |
| 530 * This optimization should not be necessary for normal UnicodeSets because |
| 531 * most sets have no strings, and most sets with strings have |
| 532 * very few very short strings. |
| 533 * For cases with many strings, it might be better to use a different API |
| 534 * and implementation with a DFA (state machine). |
| 535 */ |
| 536 |
| 537 /* |
| 538 * Algorithm for span(USET_SPAN_CONTAINED) |
| 539 * |
| 540 * Theoretical algorithm: |
| 541 * - Iterate through the string, and at each code point boundary: |
| 542 * + If the code point there is in the set, then remember to continue after it
. |
| 543 * + If a set string matches at the current position, then remember to continu
e after it. |
| 544 * + Either recursively span for each code point or string match, |
| 545 * or recursively span for all but the shortest one and |
| 546 * iteratively continue the span with the shortest local match. |
| 547 * + Remember the longest recursive span (the farthest end point). |
| 548 * + If there is no match at the current position, neither for the code point
there |
| 549 * nor for any set string, then stop and return the longest recursive span l
ength. |
| 550 * |
| 551 * Optimized implementation: |
| 552 * |
| 553 * (We assume that most sets will have very few very short strings. |
| 554 * A span using a string-less set is extremely fast.) |
| 555 * |
| 556 * Create and cache a spanSet which contains all of the single code points |
| 557 * of the original set but none of its strings. |
| 558 * |
| 559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| 560 * - Loop: |
| 561 * + Try to match each set string at the end of the spanLength. |
| 562 * ~ Set strings that start with set-contained code points must be matched |
| 563 * with a partial overlap because the recursive algorithm would have tried |
| 564 * to match them at every position. |
| 565 * ~ Set strings that entirely consist of set-contained code points |
| 566 * are irrelevant for span(USET_SPAN_CONTAINED) because the |
| 567 * recursive algorithm would continue after them anyway |
| 568 * and find the longest recursive match from their end. |
| 569 * ~ Rather than recursing, note each end point of a set string match. |
| 570 * + If no set string matched after spanSet.span(), then return |
| 571 * with where the spanSet.span() ended. |
| 572 * + If at least one set string matched after spanSet.span(), then |
| 573 * pop the shortest string match end point and continue |
| 574 * the loop, trying to match all set strings from there. |
| 575 * + If at least one more set string matched after a previous string match, |
| 576 * then test if the code point after the previous string match is also |
| 577 * contained in the set. |
| 578 * Continue the loop with the shortest end point of either this code point |
| 579 * or a matching set string. |
| 580 * + If no more set string matched after a previous string match, |
| 581 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| 582 * Stop if spanLength==0, otherwise continue the loop. |
| 583 * |
| 584 * By noting each end point of a set string match, |
| 585 * the function visits each string position at most once and finishes |
| 586 * in linear time. |
| 587 * |
| 588 * The recursive algorithm may visit the same string position many times |
| 589 * if multiple paths lead to it and finishes in exponential time. |
| 590 */ |
| 591 |
| 592 /* |
| 593 * Algorithm for span(USET_SPAN_SIMPLE) |
| 594 * |
| 595 * Theoretical algorithm: |
| 596 * - Iterate through the string, and at each code point boundary: |
| 597 * + If the code point there is in the set, then remember to continue after it
. |
| 598 * + If a set string matches at the current position, then remember to continu
e after it. |
| 599 * + Continue from the farthest match position and ignore all others. |
| 600 * + If there is no match at the current position, |
| 601 * then stop and return the current position. |
| 602 * |
| 603 * Optimized implementation: |
| 604 * |
| 605 * (Same assumption and spanSet as above.) |
| 606 * |
| 607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| 608 * - Loop: |
| 609 * + Try to match each set string at the end of the spanLength. |
| 610 * ~ Set strings that start with set-contained code points must be matched |
| 611 * with a partial overlap because the standard algorithm would have tried |
| 612 * to match them earlier. |
| 613 * ~ Set strings that entirely consist of set-contained code points |
| 614 * must be matched with a full overlap because the longest-match algorithm |
| 615 * would hide set string matches that end earlier. |
| 616 * Such set strings need not be matched earlier inside the code point span |
| 617 * because the standard algorithm would then have continued after |
| 618 * the set string match anyway. |
| 619 * ~ Remember the longest set string match (farthest end point) from the ear
liest |
| 620 * starting point. |
| 621 * + If no set string matched after spanSet.span(), then return |
| 622 * with where the spanSet.span() ended. |
| 623 * + If at least one set string matched, then continue the loop after the |
| 624 * longest match from the earliest position. |
| 625 * + If no more set string matched after a previous string match, |
| 626 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). |
| 627 * Stop if spanLength==0, otherwise continue the loop. |
| 628 */ |
| 629 |
| 630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi
tion spanCondition) const { |
| 631 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 632 return spanNot(s, length); |
| 633 } |
| 634 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); |
| 635 if(spanLength==length) { |
| 636 return length; |
| 637 } |
| 638 |
| 639 // Consider strings; they may overlap with the span. |
| 640 OffsetList offsets; |
| 641 if(spanCondition==USET_SPAN_CONTAINED) { |
| 642 // Use offset list to try all possibilities. |
| 643 offsets.setMaxLength(maxLength16); |
| 644 } |
| 645 int32_t pos=spanLength, rest=length-pos; |
| 646 int32_t i, stringsLength=strings.size(); |
| 647 for(;;) { |
| 648 if(spanCondition==USET_SPAN_CONTAINED) { |
| 649 for(i=0; i<stringsLength; ++i) { |
| 650 int32_t overlap=spanLengths[i]; |
| 651 if(overlap==ALL_CP_CONTAINED) { |
| 652 continue; // Irrelevant string. |
| 653 } |
| 654 const UnicodeString &string=*(const UnicodeString *)strings.elem
entAt(i); |
| 655 const UChar *s16=string.getBuffer(); |
| 656 int32_t length16=string.length(); |
| 657 |
| 658 // Try to match this string at pos-overlap..pos. |
| 659 if(overlap>=LONG_SPAN) { |
| 660 overlap=length16; |
| 661 // While contained: No point matching fully inside the code
point span. |
| 662 U16_BACK_1(s16, 0, overlap); // Length of the string minus
the last code point. |
| 663 } |
| 664 if(overlap>spanLength) { |
| 665 overlap=spanLength; |
| 666 } |
| 667 int32_t inc=length16-overlap; // Keep overlap+inc==length16. |
| 668 for(;;) { |
| 669 if(inc>rest) { |
| 670 break; |
| 671 } |
| 672 // Try to match if the increment is not listed already. |
| 673 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overl
ap, length, s16, length16)) { |
| 674 if(inc==rest) { |
| 675 return length; // Reached the end of the string. |
| 676 } |
| 677 offsets.addOffset(inc); |
| 678 } |
| 679 if(overlap==0) { |
| 680 break; |
| 681 } |
| 682 --overlap; |
| 683 ++inc; |
| 684 } |
| 685 } |
| 686 } else /* USET_SPAN_SIMPLE */ { |
| 687 int32_t maxInc=0, maxOverlap=0; |
| 688 for(i=0; i<stringsLength; ++i) { |
| 689 int32_t overlap=spanLengths[i]; |
| 690 // For longest match, we do need to try to match even an all-con
tained string |
| 691 // to find the match from the earliest start. |
| 692 |
| 693 const UnicodeString &string=*(const UnicodeString *)strings.elem
entAt(i); |
| 694 const UChar *s16=string.getBuffer(); |
| 695 int32_t length16=string.length(); |
| 696 |
| 697 // Try to match this string at pos-overlap..pos. |
| 698 if(overlap>=LONG_SPAN) { |
| 699 overlap=length16; |
| 700 // Longest match: Need to match fully inside the code point
span |
| 701 // to find the match from the earliest start. |
| 702 } |
| 703 if(overlap>spanLength) { |
| 704 overlap=spanLength; |
| 705 } |
| 706 int32_t inc=length16-overlap; // Keep overlap+inc==length16. |
| 707 for(;;) { |
| 708 if(inc>rest || overlap<maxOverlap) { |
| 709 break; |
| 710 } |
| 711 // Try to match if the string is longer or starts earlier. |
| 712 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap
&& */ inc>maxInc) && |
| 713 matches16CPB(s, pos-overlap, length, s16, length16) |
| 714 ) { |
| 715 maxInc=inc; // Longest match from earliest start. |
| 716 maxOverlap=overlap; |
| 717 break; |
| 718 } |
| 719 --overlap; |
| 720 ++inc; |
| 721 } |
| 722 } |
| 723 |
| 724 if(maxInc!=0 || maxOverlap!=0) { |
| 725 // Longest-match algorithm, and there was a string match. |
| 726 // Simply continue after it. |
| 727 pos+=maxInc; |
| 728 rest-=maxInc; |
| 729 if(rest==0) { |
| 730 return length; // Reached the end of the string. |
| 731 } |
| 732 spanLength=0; // Match strings from after a string match. |
| 733 continue; |
| 734 } |
| 735 } |
| 736 // Finished trying to match all strings at pos. |
| 737 |
| 738 if(spanLength!=0 || pos==0) { |
| 739 // The position is after an unlimited code point span (spanLength!=0
), |
| 740 // not after a string match. |
| 741 // The only position where spanLength==0 after a span is pos==0. |
| 742 // Otherwise, an unlimited code point span is only tried again when
no |
| 743 // strings match, and if such a non-initial span fails we stop. |
| 744 if(offsets.isEmpty()) { |
| 745 return pos; // No strings matched after a span. |
| 746 } |
| 747 // Match strings from after the next string match. |
| 748 } else { |
| 749 // The position is after a string match (or a single code point). |
| 750 if(offsets.isEmpty()) { |
| 751 // No more strings matched after a previous string match. |
| 752 // Try another code point span from after the last string match. |
| 753 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); |
| 754 if( spanLength==rest || // Reached the end of the string, or |
| 755 spanLength==0 // neither strings nor span progressed. |
| 756 ) { |
| 757 return pos+spanLength; |
| 758 } |
| 759 pos+=spanLength; |
| 760 rest-=spanLength; |
| 761 continue; // spanLength>0: Match strings from after a span. |
| 762 } else { |
| 763 // Try to match only one code point from after a string match if
some |
| 764 // string matched beyond it, so that we try all possible positio
ns |
| 765 // and don't overshoot. |
| 766 spanLength=spanOne(spanSet, s+pos, rest); |
| 767 if(spanLength>0) { |
| 768 if(spanLength==rest) { |
| 769 return length; // Reached the end of the string. |
| 770 } |
| 771 // Match strings after this code point. |
| 772 // There cannot be any increments below it because UnicodeSe
t strings |
| 773 // contain multiple code points. |
| 774 pos+=spanLength; |
| 775 rest-=spanLength; |
| 776 offsets.shift(spanLength); |
| 777 spanLength=0; |
| 778 continue; // Match strings from after a single code point. |
| 779 } |
| 780 // Match strings from after the next string match. |
| 781 } |
| 782 } |
| 783 int32_t minOffset=offsets.popMinimum(); |
| 784 pos+=minOffset; |
| 785 rest-=minOffset; |
| 786 spanLength=0; // Match strings from after a string match. |
| 787 } |
| 788 } |
| 789 |
| 790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC
ondition spanCondition) const { |
| 791 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 792 return spanNotBack(s, length); |
| 793 } |
| 794 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); |
| 795 if(pos==0) { |
| 796 return 0; |
| 797 } |
| 798 int32_t spanLength=length-pos; |
| 799 |
| 800 // Consider strings; they may overlap with the span. |
| 801 OffsetList offsets; |
| 802 if(spanCondition==USET_SPAN_CONTAINED) { |
| 803 // Use offset list to try all possibilities. |
| 804 offsets.setMaxLength(maxLength16); |
| 805 } |
| 806 int32_t i, stringsLength=strings.size(); |
| 807 uint8_t *spanBackLengths=spanLengths; |
| 808 if(all) { |
| 809 spanBackLengths+=stringsLength; |
| 810 } |
| 811 for(;;) { |
| 812 if(spanCondition==USET_SPAN_CONTAINED) { |
| 813 for(i=0; i<stringsLength; ++i) { |
| 814 int32_t overlap=spanBackLengths[i]; |
| 815 if(overlap==ALL_CP_CONTAINED) { |
| 816 continue; // Irrelevant string. |
| 817 } |
| 818 const UnicodeString &string=*(const UnicodeString *)strings.elem
entAt(i); |
| 819 const UChar *s16=string.getBuffer(); |
| 820 int32_t length16=string.length(); |
| 821 |
| 822 // Try to match this string at pos-(length16-overlap)..pos-lengt
h16. |
| 823 if(overlap>=LONG_SPAN) { |
| 824 overlap=length16; |
| 825 // While contained: No point matching fully inside the code
point span. |
| 826 int32_t len1=0; |
| 827 U16_FWD_1(s16, len1, overlap); |
| 828 overlap-=len1; // Length of the string minus the first code
point. |
| 829 } |
| 830 if(overlap>spanLength) { |
| 831 overlap=spanLength; |
| 832 } |
| 833 int32_t dec=length16-overlap; // Keep dec+overlap==length16. |
| 834 for(;;) { |
| 835 if(dec>pos) { |
| 836 break; |
| 837 } |
| 838 // Try to match if the decrement is not listed already. |
| 839 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec,
length, s16, length16)) { |
| 840 if(dec==pos) { |
| 841 return 0; // Reached the start of the string. |
| 842 } |
| 843 offsets.addOffset(dec); |
| 844 } |
| 845 if(overlap==0) { |
| 846 break; |
| 847 } |
| 848 --overlap; |
| 849 ++dec; |
| 850 } |
| 851 } |
| 852 } else /* USET_SPAN_SIMPLE */ { |
| 853 int32_t maxDec=0, maxOverlap=0; |
| 854 for(i=0; i<stringsLength; ++i) { |
| 855 int32_t overlap=spanBackLengths[i]; |
| 856 // For longest match, we do need to try to match even an all-con
tained string |
| 857 // to find the match from the latest end. |
| 858 |
| 859 const UnicodeString &string=*(const UnicodeString *)strings.elem
entAt(i); |
| 860 const UChar *s16=string.getBuffer(); |
| 861 int32_t length16=string.length(); |
| 862 |
| 863 // Try to match this string at pos-(length16-overlap)..pos-lengt
h16. |
| 864 if(overlap>=LONG_SPAN) { |
| 865 overlap=length16; |
| 866 // Longest match: Need to match fully inside the code point
span |
| 867 // to find the match from the latest end. |
| 868 } |
| 869 if(overlap>spanLength) { |
| 870 overlap=spanLength; |
| 871 } |
| 872 int32_t dec=length16-overlap; // Keep dec+overlap==length16. |
| 873 for(;;) { |
| 874 if(dec>pos || overlap<maxOverlap) { |
| 875 break; |
| 876 } |
| 877 // Try to match if the string is longer or ends later. |
| 878 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap
&& */ dec>maxDec) && |
| 879 matches16CPB(s, pos-dec, length, s16, length16) |
| 880 ) { |
| 881 maxDec=dec; // Longest match from latest end. |
| 882 maxOverlap=overlap; |
| 883 break; |
| 884 } |
| 885 --overlap; |
| 886 ++dec; |
| 887 } |
| 888 } |
| 889 |
| 890 if(maxDec!=0 || maxOverlap!=0) { |
| 891 // Longest-match algorithm, and there was a string match. |
| 892 // Simply continue before it. |
| 893 pos-=maxDec; |
| 894 if(pos==0) { |
| 895 return 0; // Reached the start of the string. |
| 896 } |
| 897 spanLength=0; // Match strings from before a string match. |
| 898 continue; |
| 899 } |
| 900 } |
| 901 // Finished trying to match all strings at pos. |
| 902 |
| 903 if(spanLength!=0 || pos==length) { |
| 904 // The position is before an unlimited code point span (spanLength!=
0), |
| 905 // not before a string match. |
| 906 // The only position where spanLength==0 before a span is pos==lengt
h. |
| 907 // Otherwise, an unlimited code point span is only tried again when
no |
| 908 // strings match, and if such a non-initial span fails we stop. |
| 909 if(offsets.isEmpty()) { |
| 910 return pos; // No strings matched before a span. |
| 911 } |
| 912 // Match strings from before the next string match. |
| 913 } else { |
| 914 // The position is before a string match (or a single code point). |
| 915 if(offsets.isEmpty()) { |
| 916 // No more strings matched before a previous string match. |
| 917 // Try another code point span from before the last string match
. |
| 918 int32_t oldPos=pos; |
| 919 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); |
| 920 spanLength=oldPos-pos; |
| 921 if( pos==0 || // Reached the start of the string, or |
| 922 spanLength==0 // neither strings nor span progressed. |
| 923 ) { |
| 924 return pos; |
| 925 } |
| 926 continue; // spanLength>0: Match strings from before a span. |
| 927 } else { |
| 928 // Try to match only one code point from before a string match i
f some |
| 929 // string matched beyond it, so that we try all possible positio
ns |
| 930 // and don't overshoot. |
| 931 spanLength=spanOneBack(spanSet, s, pos); |
| 932 if(spanLength>0) { |
| 933 if(spanLength==pos) { |
| 934 return 0; // Reached the start of the string. |
| 935 } |
| 936 // Match strings before this code point. |
| 937 // There cannot be any decrements below it because UnicodeSe
t strings |
| 938 // contain multiple code points. |
| 939 pos-=spanLength; |
| 940 offsets.shift(spanLength); |
| 941 spanLength=0; |
| 942 continue; // Match strings from before a single code point. |
| 943 } |
| 944 // Match strings from before the next string match. |
| 945 } |
| 946 } |
| 947 pos-=offsets.popMinimum(); |
| 948 spanLength=0; // Match strings from before a string match. |
| 949 } |
| 950 } |
| 951 |
| 952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
nCondition spanCondition) const { |
| 953 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 954 return spanNotUTF8(s, length); |
| 955 } |
| 956 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTA
INED); |
| 957 if(spanLength==length) { |
| 958 return length; |
| 959 } |
| 960 |
| 961 // Consider strings; they may overlap with the span. |
| 962 OffsetList offsets; |
| 963 if(spanCondition==USET_SPAN_CONTAINED) { |
| 964 // Use offset list to try all possibilities. |
| 965 offsets.setMaxLength(maxLength8); |
| 966 } |
| 967 int32_t pos=spanLength, rest=length-pos; |
| 968 int32_t i, stringsLength=strings.size(); |
| 969 uint8_t *spanUTF8Lengths=spanLengths; |
| 970 if(all) { |
| 971 spanUTF8Lengths+=2*stringsLength; |
| 972 } |
| 973 for(;;) { |
| 974 const uint8_t *s8=utf8; |
| 975 int32_t length8; |
| 976 if(spanCondition==USET_SPAN_CONTAINED) { |
| 977 for(i=0; i<stringsLength; ++i) { |
| 978 length8=utf8Lengths[i]; |
| 979 if(length8==0) { |
| 980 continue; // String not representable in UTF-8. |
| 981 } |
| 982 int32_t overlap=spanUTF8Lengths[i]; |
| 983 if(overlap==ALL_CP_CONTAINED) { |
| 984 s8+=length8; |
| 985 continue; // Irrelevant string. |
| 986 } |
| 987 |
| 988 // Try to match this string at pos-overlap..pos. |
| 989 if(overlap>=LONG_SPAN) { |
| 990 overlap=length8; |
| 991 // While contained: No point matching fully inside the code
point span. |
| 992 U8_BACK_1(s8, 0, overlap); // Length of the string minus th
e last code point. |
| 993 } |
| 994 if(overlap>spanLength) { |
| 995 overlap=spanLength; |
| 996 } |
| 997 int32_t inc=length8-overlap; // Keep overlap+inc==length8. |
| 998 for(;;) { |
| 999 if(inc>rest) { |
| 1000 break; |
| 1001 } |
| 1002 // Try to match if the increment is not listed already. |
| 1003 // Match at code point boundaries. (The UTF-8 strings were c
onverted |
| 1004 // from UTF-16 and are guaranteed to be well-formed.) |
| 1005 if( !U8_IS_TRAIL(s[pos-overlap]) && |
| 1006 !offsets.containsOffset(inc) && |
| 1007 matches8(s+pos-overlap, s8, length8) |
| 1008 |
| 1009 ) { |
| 1010 if(inc==rest) { |
| 1011 return length; // Reached the end of the string. |
| 1012 } |
| 1013 offsets.addOffset(inc); |
| 1014 } |
| 1015 if(overlap==0) { |
| 1016 break; |
| 1017 } |
| 1018 --overlap; |
| 1019 ++inc; |
| 1020 } |
| 1021 s8+=length8; |
| 1022 } |
| 1023 } else /* USET_SPAN_SIMPLE */ { |
| 1024 int32_t maxInc=0, maxOverlap=0; |
| 1025 for(i=0; i<stringsLength; ++i) { |
| 1026 length8=utf8Lengths[i]; |
| 1027 if(length8==0) { |
| 1028 continue; // String not representable in UTF-8. |
| 1029 } |
| 1030 int32_t overlap=spanUTF8Lengths[i]; |
| 1031 // For longest match, we do need to try to match even an all-con
tained string |
| 1032 // to find the match from the earliest start. |
| 1033 |
| 1034 // Try to match this string at pos-overlap..pos. |
| 1035 if(overlap>=LONG_SPAN) { |
| 1036 overlap=length8; |
| 1037 // Longest match: Need to match fully inside the code point
span |
| 1038 // to find the match from the earliest start. |
| 1039 } |
| 1040 if(overlap>spanLength) { |
| 1041 overlap=spanLength; |
| 1042 } |
| 1043 int32_t inc=length8-overlap; // Keep overlap+inc==length8. |
| 1044 for(;;) { |
| 1045 if(inc>rest || overlap<maxOverlap) { |
| 1046 break; |
| 1047 } |
| 1048 // Try to match if the string is longer or starts earlier. |
| 1049 // Match at code point boundaries. (The UTF-8 strings were c
onverted |
| 1050 // from UTF-16 and are guaranteed to be well-formed.) |
| 1051 if( !U8_IS_TRAIL(s[pos-overlap]) && |
| 1052 (overlap>maxOverlap || /* redundant overlap==maxOverlap
&& */ inc>maxInc) && |
| 1053 matches8(s+pos-overlap, s8, length8) |
| 1054 |
| 1055 ) { |
| 1056 maxInc=inc; // Longest match from earliest start. |
| 1057 maxOverlap=overlap; |
| 1058 break; |
| 1059 } |
| 1060 --overlap; |
| 1061 ++inc; |
| 1062 } |
| 1063 s8+=length8; |
| 1064 } |
| 1065 |
| 1066 if(maxInc!=0 || maxOverlap!=0) { |
| 1067 // Longest-match algorithm, and there was a string match. |
| 1068 // Simply continue after it. |
| 1069 pos+=maxInc; |
| 1070 rest-=maxInc; |
| 1071 if(rest==0) { |
| 1072 return length; // Reached the end of the string. |
| 1073 } |
| 1074 spanLength=0; // Match strings from after a string match. |
| 1075 continue; |
| 1076 } |
| 1077 } |
| 1078 // Finished trying to match all strings at pos. |
| 1079 |
| 1080 if(spanLength!=0 || pos==0) { |
| 1081 // The position is after an unlimited code point span (spanLength!=0
), |
| 1082 // not after a string match. |
| 1083 // The only position where spanLength==0 after a span is pos==0. |
| 1084 // Otherwise, an unlimited code point span is only tried again when
no |
| 1085 // strings match, and if such a non-initial span fails we stop. |
| 1086 if(offsets.isEmpty()) { |
| 1087 return pos; // No strings matched after a span. |
| 1088 } |
| 1089 // Match strings from after the next string match. |
| 1090 } else { |
| 1091 // The position is after a string match (or a single code point). |
| 1092 if(offsets.isEmpty()) { |
| 1093 // No more strings matched after a previous string match. |
| 1094 // Try another code point span from after the last string match. |
| 1095 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN
_CONTAINED); |
| 1096 if( spanLength==rest || // Reached the end of the string, or |
| 1097 spanLength==0 // neither strings nor span progressed. |
| 1098 ) { |
| 1099 return pos+spanLength; |
| 1100 } |
| 1101 pos+=spanLength; |
| 1102 rest-=spanLength; |
| 1103 continue; // spanLength>0: Match strings from after a span. |
| 1104 } else { |
| 1105 // Try to match only one code point from after a string match if
some |
| 1106 // string matched beyond it, so that we try all possible positio
ns |
| 1107 // and don't overshoot. |
| 1108 spanLength=spanOneUTF8(spanSet, s+pos, rest); |
| 1109 if(spanLength>0) { |
| 1110 if(spanLength==rest) { |
| 1111 return length; // Reached the end of the string. |
| 1112 } |
| 1113 // Match strings after this code point. |
| 1114 // There cannot be any increments below it because UnicodeSe
t strings |
| 1115 // contain multiple code points. |
| 1116 pos+=spanLength; |
| 1117 rest-=spanLength; |
| 1118 offsets.shift(spanLength); |
| 1119 spanLength=0; |
| 1120 continue; // Match strings from after a single code point. |
| 1121 } |
| 1122 // Match strings from after the next string match. |
| 1123 } |
| 1124 } |
| 1125 int32_t minOffset=offsets.popMinimum(); |
| 1126 pos+=minOffset; |
| 1127 rest-=minOffset; |
| 1128 spanLength=0; // Match strings from after a string match. |
| 1129 } |
| 1130 } |
| 1131 |
| 1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USe
tSpanCondition spanCondition) const { |
| 1133 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 1134 return spanNotBackUTF8(s, length); |
| 1135 } |
| 1136 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINE
D); |
| 1137 if(pos==0) { |
| 1138 return 0; |
| 1139 } |
| 1140 int32_t spanLength=length-pos; |
| 1141 |
| 1142 // Consider strings; they may overlap with the span. |
| 1143 OffsetList offsets; |
| 1144 if(spanCondition==USET_SPAN_CONTAINED) { |
| 1145 // Use offset list to try all possibilities. |
| 1146 offsets.setMaxLength(maxLength8); |
| 1147 } |
| 1148 int32_t i, stringsLength=strings.size(); |
| 1149 uint8_t *spanBackUTF8Lengths=spanLengths; |
| 1150 if(all) { |
| 1151 spanBackUTF8Lengths+=3*stringsLength; |
| 1152 } |
| 1153 for(;;) { |
| 1154 const uint8_t *s8=utf8; |
| 1155 int32_t length8; |
| 1156 if(spanCondition==USET_SPAN_CONTAINED) { |
| 1157 for(i=0; i<stringsLength; ++i) { |
| 1158 length8=utf8Lengths[i]; |
| 1159 if(length8==0) { |
| 1160 continue; // String not representable in UTF-8. |
| 1161 } |
| 1162 int32_t overlap=spanBackUTF8Lengths[i]; |
| 1163 if(overlap==ALL_CP_CONTAINED) { |
| 1164 s8+=length8; |
| 1165 continue; // Irrelevant string. |
| 1166 } |
| 1167 |
| 1168 // Try to match this string at pos-(length8-overlap)..pos-length
8. |
| 1169 if(overlap>=LONG_SPAN) { |
| 1170 overlap=length8; |
| 1171 // While contained: No point matching fully inside the code
point span. |
| 1172 int32_t len1=0; |
| 1173 U8_FWD_1(s8, len1, overlap); |
| 1174 overlap-=len1; // Length of the string minus the first code
point. |
| 1175 } |
| 1176 if(overlap>spanLength) { |
| 1177 overlap=spanLength; |
| 1178 } |
| 1179 int32_t dec=length8-overlap; // Keep dec+overlap==length8. |
| 1180 for(;;) { |
| 1181 if(dec>pos) { |
| 1182 break; |
| 1183 } |
| 1184 // Try to match if the decrement is not listed already. |
| 1185 // Match at code point boundaries. (The UTF-8 strings were c
onverted |
| 1186 // from UTF-16 and are guaranteed to be well-formed.) |
| 1187 if( !U8_IS_TRAIL(s[pos-dec]) && |
| 1188 !offsets.containsOffset(dec) && |
| 1189 matches8(s+pos-dec, s8, length8) |
| 1190 ) { |
| 1191 if(dec==pos) { |
| 1192 return 0; // Reached the start of the string. |
| 1193 } |
| 1194 offsets.addOffset(dec); |
| 1195 } |
| 1196 if(overlap==0) { |
| 1197 break; |
| 1198 } |
| 1199 --overlap; |
| 1200 ++dec; |
| 1201 } |
| 1202 s8+=length8; |
| 1203 } |
| 1204 } else /* USET_SPAN_SIMPLE */ { |
| 1205 int32_t maxDec=0, maxOverlap=0; |
| 1206 for(i=0; i<stringsLength; ++i) { |
| 1207 length8=utf8Lengths[i]; |
| 1208 if(length8==0) { |
| 1209 continue; // String not representable in UTF-8. |
| 1210 } |
| 1211 int32_t overlap=spanBackUTF8Lengths[i]; |
| 1212 // For longest match, we do need to try to match even an all-con
tained string |
| 1213 // to find the match from the latest end. |
| 1214 |
| 1215 // Try to match this string at pos-(length8-overlap)..pos-length
8. |
| 1216 if(overlap>=LONG_SPAN) { |
| 1217 overlap=length8; |
| 1218 // Longest match: Need to match fully inside the code point
span |
| 1219 // to find the match from the latest end. |
| 1220 } |
| 1221 if(overlap>spanLength) { |
| 1222 overlap=spanLength; |
| 1223 } |
| 1224 int32_t dec=length8-overlap; // Keep dec+overlap==length8. |
| 1225 for(;;) { |
| 1226 if(dec>pos || overlap<maxOverlap) { |
| 1227 break; |
| 1228 } |
| 1229 // Try to match if the string is longer or ends later. |
| 1230 // Match at code point boundaries. (The UTF-8 strings were c
onverted |
| 1231 // from UTF-16 and are guaranteed to be well-formed.) |
| 1232 if( !U8_IS_TRAIL(s[pos-dec]) && |
| 1233 (overlap>maxOverlap || /* redundant overlap==maxOverlap
&& */ dec>maxDec) && |
| 1234 matches8(s+pos-dec, s8, length8) |
| 1235 ) { |
| 1236 maxDec=dec; // Longest match from latest end. |
| 1237 maxOverlap=overlap; |
| 1238 break; |
| 1239 } |
| 1240 --overlap; |
| 1241 ++dec; |
| 1242 } |
| 1243 s8+=length8; |
| 1244 } |
| 1245 |
| 1246 if(maxDec!=0 || maxOverlap!=0) { |
| 1247 // Longest-match algorithm, and there was a string match. |
| 1248 // Simply continue before it. |
| 1249 pos-=maxDec; |
| 1250 if(pos==0) { |
| 1251 return 0; // Reached the start of the string. |
| 1252 } |
| 1253 spanLength=0; // Match strings from before a string match. |
| 1254 continue; |
| 1255 } |
| 1256 } |
| 1257 // Finished trying to match all strings at pos. |
| 1258 |
| 1259 if(spanLength!=0 || pos==length) { |
| 1260 // The position is before an unlimited code point span (spanLength!=
0), |
| 1261 // not before a string match. |
| 1262 // The only position where spanLength==0 before a span is pos==lengt
h. |
| 1263 // Otherwise, an unlimited code point span is only tried again when
no |
| 1264 // strings match, and if such a non-initial span fails we stop. |
| 1265 if(offsets.isEmpty()) { |
| 1266 return pos; // No strings matched before a span. |
| 1267 } |
| 1268 // Match strings from before the next string match. |
| 1269 } else { |
| 1270 // The position is before a string match (or a single code point). |
| 1271 if(offsets.isEmpty()) { |
| 1272 // No more strings matched before a previous string match. |
| 1273 // Try another code point span from before the last string match
. |
| 1274 int32_t oldPos=pos; |
| 1275 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONT
AINED); |
| 1276 spanLength=oldPos-pos; |
| 1277 if( pos==0 || // Reached the start of the string, or |
| 1278 spanLength==0 // neither strings nor span progressed. |
| 1279 ) { |
| 1280 return pos; |
| 1281 } |
| 1282 continue; // spanLength>0: Match strings from before a span. |
| 1283 } else { |
| 1284 // Try to match only one code point from before a string match i
f some |
| 1285 // string matched beyond it, so that we try all possible positio
ns |
| 1286 // and don't overshoot. |
| 1287 spanLength=spanOneBackUTF8(spanSet, s, pos); |
| 1288 if(spanLength>0) { |
| 1289 if(spanLength==pos) { |
| 1290 return 0; // Reached the start of the string. |
| 1291 } |
| 1292 // Match strings before this code point. |
| 1293 // There cannot be any decrements below it because UnicodeSe
t strings |
| 1294 // contain multiple code points. |
| 1295 pos-=spanLength; |
| 1296 offsets.shift(spanLength); |
| 1297 spanLength=0; |
| 1298 continue; // Match strings from before a single code point. |
| 1299 } |
| 1300 // Match strings from before the next string match. |
| 1301 } |
| 1302 } |
| 1303 pos-=offsets.popMinimum(); |
| 1304 spanLength=0; // Match strings from before a string match. |
| 1305 } |
| 1306 } |
| 1307 |
| 1308 /* |
| 1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) |
| 1310 * |
| 1311 * Theoretical algorithm: |
| 1312 * - Iterate through the string, and at each code point boundary: |
| 1313 * + If the code point there is in the set, then return with the current posit
ion. |
| 1314 * + If a set string matches at the current position, then return with the cur
rent position. |
| 1315 * |
| 1316 * Optimized implementation: |
| 1317 * |
| 1318 * (Same assumption as for span() above.) |
| 1319 * |
| 1320 * Create and cache a spanNotSet which contains all of the single code points |
| 1321 * of the original set but none of its strings. |
| 1322 * For each set string add its initial code point to the spanNotSet. |
| 1323 * (Also add its final code point for spanNotBack().) |
| 1324 * |
| 1325 * - Loop: |
| 1326 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). |
| 1327 * + If the current code point is in the original set, then |
| 1328 * return the current position. |
| 1329 * + If any set string matches at the current position, then |
| 1330 * return the current position. |
| 1331 * + If there is no match at the current position, neither for the code point
there |
| 1332 * nor for any set string, then skip this code point and continue the loop. |
| 1333 * This happens for set-string-initial code points that were added to spanNo
tSet |
| 1334 * when there is not actually a match for such a set string. |
| 1335 */ |
| 1336 |
| 1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { |
| 1338 int32_t pos=0, rest=length; |
| 1339 int32_t i, stringsLength=strings.size(); |
| 1340 do { |
| 1341 // Span until we find a code point from the set, |
| 1342 // or a code point that starts or ends some string. |
| 1343 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); |
| 1344 if(i==rest) { |
| 1345 return length; // Reached the end of the string. |
| 1346 } |
| 1347 pos+=i; |
| 1348 rest-=i; |
| 1349 |
| 1350 // Check whether the current code point is in the original set, |
| 1351 // without the string starts and ends. |
| 1352 int32_t cpLength=spanOne(spanSet, s+pos, rest); |
| 1353 if(cpLength>0) { |
| 1354 return pos; // There is a set element at pos. |
| 1355 } |
| 1356 |
| 1357 // Try to match the strings at pos. |
| 1358 for(i=0; i<stringsLength; ++i) { |
| 1359 if(spanLengths[i]==ALL_CP_CONTAINED) { |
| 1360 continue; // Irrelevant string. |
| 1361 } |
| 1362 const UnicodeString &string=*(const UnicodeString *)strings.elementA
t(i); |
| 1363 const UChar *s16=string.getBuffer(); |
| 1364 int32_t length16=string.length(); |
| 1365 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { |
| 1366 return pos; // There is a set element at pos. |
| 1367 } |
| 1368 } |
| 1369 |
| 1370 // The span(while not contained) ended on a string start/end which is |
| 1371 // not in the original set. Skip this code point and continue. |
| 1372 // cpLength<0 |
| 1373 pos-=cpLength; |
| 1374 rest+=cpLength; |
| 1375 } while(rest!=0); |
| 1376 return length; // Reached the end of the string. |
| 1377 } |
| 1378 |
| 1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const
{ |
| 1380 int32_t pos=length; |
| 1381 int32_t i, stringsLength=strings.size(); |
| 1382 do { |
| 1383 // Span until we find a code point from the set, |
| 1384 // or a code point that starts or ends some string. |
| 1385 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); |
| 1386 if(pos==0) { |
| 1387 return 0; // Reached the start of the string. |
| 1388 } |
| 1389 |
| 1390 // Check whether the current code point is in the original set, |
| 1391 // without the string starts and ends. |
| 1392 int32_t cpLength=spanOneBack(spanSet, s, pos); |
| 1393 if(cpLength>0) { |
| 1394 return pos; // There is a set element at pos. |
| 1395 } |
| 1396 |
| 1397 // Try to match the strings at pos. |
| 1398 for(i=0; i<stringsLength; ++i) { |
| 1399 // Use spanLengths rather than a spanBackLengths pointer because |
| 1400 // it is easier and we only need to know whether the string is irrel
evant |
| 1401 // which is the same in either array. |
| 1402 if(spanLengths[i]==ALL_CP_CONTAINED) { |
| 1403 continue; // Irrelevant string. |
| 1404 } |
| 1405 const UnicodeString &string=*(const UnicodeString *)strings.elementA
t(i); |
| 1406 const UChar *s16=string.getBuffer(); |
| 1407 int32_t length16=string.length(); |
| 1408 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, lengt
h16)) { |
| 1409 return pos; // There is a set element at pos. |
| 1410 } |
| 1411 } |
| 1412 |
| 1413 // The span(while not contained) ended on a string start/end which is |
| 1414 // not in the original set. Skip this code point and continue. |
| 1415 // cpLength<0 |
| 1416 pos+=cpLength; |
| 1417 } while(pos!=0); |
| 1418 return 0; // Reached the start of the string. |
| 1419 } |
| 1420 |
| 1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) cons
t { |
| 1422 int32_t pos=0, rest=length; |
| 1423 int32_t i, stringsLength=strings.size(); |
| 1424 uint8_t *spanUTF8Lengths=spanLengths; |
| 1425 if(all) { |
| 1426 spanUTF8Lengths+=2*stringsLength; |
| 1427 } |
| 1428 do { |
| 1429 // Span until we find a code point from the set, |
| 1430 // or a code point that starts or ends some string. |
| 1431 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAIN
ED); |
| 1432 if(i==rest) { |
| 1433 return length; // Reached the end of the string. |
| 1434 } |
| 1435 pos+=i; |
| 1436 rest-=i; |
| 1437 |
| 1438 // Check whether the current code point is in the original set, |
| 1439 // without the string starts and ends. |
| 1440 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); |
| 1441 if(cpLength>0) { |
| 1442 return pos; // There is a set element at pos. |
| 1443 } |
| 1444 |
| 1445 // Try to match the strings at pos. |
| 1446 const uint8_t *s8=utf8; |
| 1447 int32_t length8; |
| 1448 for(i=0; i<stringsLength; ++i) { |
| 1449 length8=utf8Lengths[i]; |
| 1450 // ALL_CP_CONTAINED: Irrelevant string. |
| 1451 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=re
st && matches8(s+pos, s8, length8)) { |
| 1452 return pos; // There is a set element at pos. |
| 1453 } |
| 1454 s8+=length8; |
| 1455 } |
| 1456 |
| 1457 // The span(while not contained) ended on a string start/end which is |
| 1458 // not in the original set. Skip this code point and continue. |
| 1459 // cpLength<0 |
| 1460 pos-=cpLength; |
| 1461 rest+=cpLength; |
| 1462 } while(rest!=0); |
| 1463 return length; // Reached the end of the string. |
| 1464 } |
| 1465 |
| 1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length)
const { |
| 1467 int32_t pos=length; |
| 1468 int32_t i, stringsLength=strings.size(); |
| 1469 uint8_t *spanBackUTF8Lengths=spanLengths; |
| 1470 if(all) { |
| 1471 spanBackUTF8Lengths+=3*stringsLength; |
| 1472 } |
| 1473 do { |
| 1474 // Span until we find a code point from the set, |
| 1475 // or a code point that starts or ends some string. |
| 1476 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAI
NED); |
| 1477 if(pos==0) { |
| 1478 return 0; // Reached the start of the string. |
| 1479 } |
| 1480 |
| 1481 // Check whether the current code point is in the original set, |
| 1482 // without the string starts and ends. |
| 1483 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); |
| 1484 if(cpLength>0) { |
| 1485 return pos; // There is a set element at pos. |
| 1486 } |
| 1487 |
| 1488 // Try to match the strings at pos. |
| 1489 const uint8_t *s8=utf8; |
| 1490 int32_t length8; |
| 1491 for(i=0; i<stringsLength; ++i) { |
| 1492 length8=utf8Lengths[i]; |
| 1493 // ALL_CP_CONTAINED: Irrelevant string. |
| 1494 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8
<=pos && matches8(s+pos-length8, s8, length8)) { |
| 1495 return pos; // There is a set element at pos. |
| 1496 } |
| 1497 s8+=length8; |
| 1498 } |
| 1499 |
| 1500 // The span(while not contained) ended on a string start/end which is |
| 1501 // not in the original set. Skip this code point and continue. |
| 1502 // cpLength<0 |
| 1503 pos+=cpLength; |
| 1504 } while(pos!=0); |
| 1505 return 0; // Reached the start of the string. |
| 1506 } |
| 1507 |
| 1508 U_NAMESPACE_END |
OLD | NEW |