OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ************************************************************************** |
| 3 * Copyright (C) 2002-2010 International Business Machines Corporation * |
| 4 * and others. All rights reserved. * |
| 5 ************************************************************************** |
| 6 */ |
| 7 // |
| 8 // file: rematch.cpp |
| 9 // |
| 10 // Contains the implementation of class RegexMatcher, |
| 11 // which is one of the main API classes for the ICU regular expression p
ackage. |
| 12 // |
| 13 |
| 14 #include "unicode/utypes.h" |
| 15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 16 |
| 17 #include "unicode/regex.h" |
| 18 #include "unicode/uniset.h" |
| 19 #include "unicode/uchar.h" |
| 20 #include "unicode/ustring.h" |
| 21 #include "unicode/rbbi.h" |
| 22 #include "uassert.h" |
| 23 #include "cmemory.h" |
| 24 #include "uvector.h" |
| 25 #include "uvectr32.h" |
| 26 #include "uvectr64.h" |
| 27 #include "regeximp.h" |
| 28 #include "regexst.h" |
| 29 #include "regextxt.h" |
| 30 #include "ucase.h" |
| 31 |
| 32 // #include <malloc.h> // Needed for heapcheck testing |
| 33 |
| 34 |
| 35 // Find progress callback |
| 36 // ---------------------- |
| 37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary
function call. |
| 38 // |
| 39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ |
| 40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FAL
SE) |
| 41 |
| 42 |
| 43 // Smart Backtracking |
| 44 // ------------------ |
| 45 // When a failure would go back to a LOOP_C instruction, |
| 46 // strings, characters, and setrefs scan backwards for a valid start |
| 47 // character themselves, pop the stack, and save state, emulating the |
| 48 // LOOP_C's effect but assured that the next character of input is a |
| 49 // possible matching character. |
| 50 // |
| 51 // Good idea in theory; unfortunately it only helps out a few specific |
| 52 // cases and slows the engine down a little in the rest. |
| 53 |
| 54 //#define REGEX_SMART_BACKTRACKING 1 |
| 55 |
| 56 U_NAMESPACE_BEGIN |
| 57 |
| 58 // Default limit for the size of the back track stack, to avoid system |
| 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. |
| 60 // This value puts ICU's limits higher than most other regexp implementations, |
| 61 // which use recursion rather than the heap, and take more storage per |
| 62 // backtrack point. |
| 63 // |
| 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; |
| 65 |
| 66 // Time limit counter constant. |
| 67 // Time limits for expression evaluation are in terms of quanta of work by |
| 68 // the engine, each of which is 10,000 state saves. |
| 69 // This constant determines that state saves per tick number. |
| 70 static const int32_t TIMER_INITIAL_VALUE = 10000; |
| 71 |
| 72 //----------------------------------------------------------------------------- |
| 73 // |
| 74 // Constructor and Destructor |
| 75 // |
| 76 //----------------------------------------------------------------------------- |
| 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
| 78 fDeferredStatus = U_ZERO_ERROR; |
| 79 init(fDeferredStatus); |
| 80 if (U_FAILURE(fDeferredStatus)) { |
| 81 return; |
| 82 } |
| 83 if (pat==NULL) { |
| 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
| 85 return; |
| 86 } |
| 87 fPattern = pat; |
| 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); |
| 89 } |
| 90 |
| 91 |
| 92 |
| 93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp
ut, |
| 94 uint32_t flags, UErrorCode &status) { |
| 95 init(status); |
| 96 if (U_FAILURE(status)) { |
| 97 return; |
| 98 } |
| 99 UParseError pe; |
| 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 101 fPattern = fPatternOwned; |
| 102 |
| 103 UText inputText = UTEXT_INITIALIZER; |
| 104 utext_openConstUnicodeString(&inputText, &input, &status); |
| 105 init2(&inputText, status); |
| 106 utext_close(&inputText); |
| 107 |
| 108 fInputUniStrMaybeMutable = TRUE; |
| 109 } |
| 110 |
| 111 |
| 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, |
| 113 uint32_t flags, UErrorCode &status) { |
| 114 init(status); |
| 115 if (U_FAILURE(status)) { |
| 116 return; |
| 117 } |
| 118 UParseError pe; |
| 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 120 if (U_FAILURE(status)) { |
| 121 return; |
| 122 } |
| 123 |
| 124 fPattern = fPatternOwned; |
| 125 init2(input, status); |
| 126 } |
| 127 |
| 128 |
| 129 RegexMatcher::RegexMatcher(const UnicodeString ®exp, |
| 130 uint32_t flags, UErrorCode &status) { |
| 131 init(status); |
| 132 if (U_FAILURE(status)) { |
| 133 return; |
| 134 } |
| 135 UParseError pe; |
| 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 137 if (U_FAILURE(status)) { |
| 138 return; |
| 139 } |
| 140 fPattern = fPatternOwned; |
| 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| 142 } |
| 143 |
| 144 RegexMatcher::RegexMatcher(UText *regexp, |
| 145 uint32_t flags, UErrorCode &status) { |
| 146 init(status); |
| 147 if (U_FAILURE(status)) { |
| 148 return; |
| 149 } |
| 150 UParseError pe; |
| 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 152 if (U_FAILURE(status)) { |
| 153 return; |
| 154 } |
| 155 |
| 156 fPattern = fPatternOwned; |
| 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| 158 } |
| 159 |
| 160 |
| 161 |
| 162 |
| 163 RegexMatcher::~RegexMatcher() { |
| 164 delete fStack; |
| 165 if (fData != fSmallData) { |
| 166 uprv_free(fData); |
| 167 fData = NULL; |
| 168 } |
| 169 if (fPatternOwned) { |
| 170 delete fPatternOwned; |
| 171 fPatternOwned = NULL; |
| 172 fPattern = NULL; |
| 173 } |
| 174 |
| 175 if (fInput) { |
| 176 delete fInput; |
| 177 } |
| 178 if (fInputText) { |
| 179 utext_close(fInputText); |
| 180 } |
| 181 if (fAltInputText) { |
| 182 utext_close(fAltInputText); |
| 183 } |
| 184 |
| 185 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 186 delete fWordBreakItr; |
| 187 #endif |
| 188 } |
| 189 |
| 190 // |
| 191 // init() common initialization for use by all constructors. |
| 192 // Initialize all fields, get the object into a consistent state. |
| 193 // This must be done even when the initial status shows an error, |
| 194 // so that the object is initialized sufficiently well for the destru
ctor |
| 195 // to run safely. |
| 196 // |
| 197 void RegexMatcher::init(UErrorCode &status) { |
| 198 fPattern = NULL; |
| 199 fPatternOwned = NULL; |
| 200 fFrameSize = 0; |
| 201 fRegionStart = 0; |
| 202 fRegionLimit = 0; |
| 203 fAnchorStart = 0; |
| 204 fAnchorLimit = 0; |
| 205 fLookStart = 0; |
| 206 fLookLimit = 0; |
| 207 fActiveStart = 0; |
| 208 fActiveLimit = 0; |
| 209 fTransparentBounds = FALSE; |
| 210 fAnchoringBounds = TRUE; |
| 211 fMatch = FALSE; |
| 212 fMatchStart = 0; |
| 213 fMatchEnd = 0; |
| 214 fLastMatchEnd = -1; |
| 215 fAppendPosition = 0; |
| 216 fHitEnd = FALSE; |
| 217 fRequireEnd = FALSE; |
| 218 fStack = NULL; |
| 219 fFrame = NULL; |
| 220 fTimeLimit = 0; |
| 221 fTime = 0; |
| 222 fTickCounter = 0; |
| 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; |
| 224 fCallbackFn = NULL; |
| 225 fCallbackContext = NULL; |
| 226 fFindProgressCallbackFn = NULL; |
| 227 fFindProgressCallbackContext = NULL; |
| 228 fTraceDebug = FALSE; |
| 229 fDeferredStatus = status; |
| 230 fData = fSmallData; |
| 231 fWordBreakItr = NULL; |
| 232 |
| 233 fStack = new UVector64(status); |
| 234 fInputText = NULL; |
| 235 fAltInputText = NULL; |
| 236 fInput = NULL; |
| 237 fInputLength = 0; |
| 238 fInputUniStrMaybeMutable = FALSE; |
| 239 |
| 240 if (U_FAILURE(status)) { |
| 241 fDeferredStatus = status; |
| 242 } |
| 243 } |
| 244 |
| 245 // |
| 246 // init2() Common initialization for use by RegexMatcher constructors, part 2
. |
| 247 // This handles the common setup to be done after the Pattern is avai
lable. |
| 248 // |
| 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { |
| 250 if (U_FAILURE(status)) { |
| 251 fDeferredStatus = status; |
| 252 return; |
| 253 } |
| 254 |
| 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0])
)) { |
| 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
| 257 if (fData == NULL) { |
| 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 259 return; |
| 260 } |
| 261 } |
| 262 |
| 263 reset(input); |
| 264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); |
| 265 if (U_FAILURE(status)) { |
| 266 fDeferredStatus = status; |
| 267 return; |
| 268 } |
| 269 } |
| 270 |
| 271 |
| 272 static const UChar BACKSLASH = 0x5c; |
| 273 static const UChar DOLLARSIGN = 0x24; |
| 274 //------------------------------------------------------------------------------
-- |
| 275 // |
| 276 // appendReplacement |
| 277 // |
| 278 //------------------------------------------------------------------------------
-- |
| 279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, |
| 280 const UnicodeString &replacement, |
| 281 UErrorCode &status) { |
| 282 UText replacementText = UTEXT_INITIALIZER; |
| 283 |
| 284 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 285 if (U_SUCCESS(status)) { |
| 286 UText resultText = UTEXT_INITIALIZER; |
| 287 utext_openUnicodeString(&resultText, &dest, &status); |
| 288 |
| 289 if (U_SUCCESS(status)) { |
| 290 appendReplacement(&resultText, &replacementText, status); |
| 291 utext_close(&resultText); |
| 292 } |
| 293 utext_close(&replacementText); |
| 294 } |
| 295 |
| 296 return *this; |
| 297 } |
| 298 |
| 299 // |
| 300 // appendReplacement, UText mode |
| 301 // |
| 302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
| 303 UText *replacement, |
| 304 UErrorCode &status) { |
| 305 if (U_FAILURE(status)) { |
| 306 return *this; |
| 307 } |
| 308 if (U_FAILURE(fDeferredStatus)) { |
| 309 status = fDeferredStatus; |
| 310 return *this; |
| 311 } |
| 312 if (fMatch == FALSE) { |
| 313 status = U_REGEX_INVALID_STATE; |
| 314 return *this; |
| 315 } |
| 316 |
| 317 // Copy input string from the end of previous match to start of current matc
h |
| 318 int64_t destLen = utext_nativeLength(dest); |
| 319 if (fMatchStart > fAppendPosition) { |
| 320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo
ntents+fAppendPosition, |
| 322 (int32_t)(fMatchStart-fAppendPosition), &st
atus); |
| 323 } else { |
| 324 int32_t len16; |
| 325 if (UTEXT_USES_U16(fInputText)) { |
| 326 len16 = (int32_t)(fMatchStart-fAppendPosition); |
| 327 } else { |
| 328 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart,
NULL, 0, &lengthStatus); |
| 330 } |
| 331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| 332 if (inputChars == NULL) { |
| 333 status = U_MEMORY_ALLOCATION_ERROR; |
| 334 return *this; |
| 335 } |
| 336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars,
len16+1, &status); |
| 337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16,
&status); |
| 338 uprv_free(inputChars); |
| 339 } |
| 340 } |
| 341 fAppendPosition = fMatchEnd; |
| 342 |
| 343 |
| 344 // scan the replacement text, looking for substitutions ($n) and \escapes. |
| 345 // TODO: optimize this loop by efficiently scanning for '$' or '\', |
| 346 // move entire ranges not containing substitutions. |
| 347 UTEXT_SETNATIVEINDEX(replacement, 0); |
| 348 UChar32 c = UTEXT_NEXT32(replacement); |
| 349 while (c != U_SENTINEL) { |
| 350 if (c == BACKSLASH) { |
| 351 // Backslash Escape. Copy the following char out without further ch
ecks. |
| 352 // Note: Surrogate pairs don't need any special
handling |
| 353 // The second half wont be a '$' or a '\',
and |
| 354 // will move to the dest normally on the n
ext |
| 355 // loop iteration. |
| 356 c = UTEXT_CURRENT32(replacement); |
| 357 if (c == U_SENTINEL) { |
| 358 break; |
| 359 } |
| 360 |
| 361 if (c==0x55/*U*/ || c==0x75/*u*/) { |
| 362 // We have a \udddd or \Udddddddd escape sequence. |
| 363 int32_t offset = 0; |
| 364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN
ESCAPE_CONTEXT(replacement); |
| 365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt,
&offset, INT32_MAX, &context); |
| 366 if (escapedChar != (UChar32)0xFFFFFFFF) { |
| 367 if (U_IS_BMP(escapedChar)) { |
| 368 UChar c16 = (UChar)escapedChar; |
| 369 destLen += utext_replace(dest, destLen, destLen, &c16, 1
, &status); |
| 370 } else { |
| 371 UChar surrogate[2]; |
| 372 surrogate[0] = U16_LEAD(escapedChar); |
| 373 surrogate[1] = U16_TRAIL(escapedChar); |
| 374 if (U_SUCCESS(status)) { |
| 375 destLen += utext_replace(dest, destLen, destLen, sur
rogate, 2, &status); |
| 376 } |
| 377 } |
| 378 // TODO: Report errors for mal-formed \u escapes? |
| 379 // As this is, the original sequence is output, which
may be OK. |
| 380 if (context.lastOffset == offset) { |
| 381 UTEXT_PREVIOUS32(replacement); |
| 382 } else if (context.lastOffset != offset-1) { |
| 383 utext_moveIndex32(replacement, offset - context.lastOffs
et - 1); |
| 384 } |
| 385 } |
| 386 } else { |
| 387 UTEXT_NEXT32(replacement); |
| 388 // Plain backslash escape. Just put out the escaped character. |
| 389 if (U_IS_BMP(c)) { |
| 390 UChar c16 = (UChar)c; |
| 391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &s
tatus); |
| 392 } else { |
| 393 UChar surrogate[2]; |
| 394 surrogate[0] = U16_LEAD(c); |
| 395 surrogate[1] = U16_TRAIL(c); |
| 396 if (U_SUCCESS(status)) { |
| 397 destLen += utext_replace(dest, destLen, destLen, surroga
te, 2, &status); |
| 398 } |
| 399 } |
| 400 } |
| 401 } else if (c != DOLLARSIGN) { |
| 402 // Normal char, not a $. Copy it out without further checks. |
| 403 if (U_IS_BMP(c)) { |
| 404 UChar c16 = (UChar)c; |
| 405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); |
| 406 } else { |
| 407 UChar surrogate[2]; |
| 408 surrogate[0] = U16_LEAD(c); |
| 409 surrogate[1] = U16_TRAIL(c); |
| 410 if (U_SUCCESS(status)) { |
| 411 destLen += utext_replace(dest, destLen, destLen, surrogate,
2, &status); |
| 412 } |
| 413 } |
| 414 } else { |
| 415 // We've got a $. Pick up a capture group number if one follows. |
| 416 // Consume at most the number of digits necessary for the largest ca
pture |
| 417 // number that is valid for this pattern. |
| 418 |
| 419 int32_t numDigits = 0; |
| 420 int32_t groupNum = 0; |
| 421 UChar32 digitC; |
| 422 for (;;) { |
| 423 digitC = UTEXT_CURRENT32(replacement); |
| 424 if (digitC == U_SENTINEL) { |
| 425 break; |
| 426 } |
| 427 if (u_isdigit(digitC) == FALSE) { |
| 428 break; |
| 429 } |
| 430 UTEXT_NEXT32(replacement); |
| 431 groupNum=groupNum*10 + u_charDigitValue(digitC); |
| 432 numDigits++; |
| 433 if (numDigits >= fPattern->fMaxCaptureDigits) { |
| 434 break; |
| 435 } |
| 436 } |
| 437 |
| 438 |
| 439 if (numDigits == 0) { |
| 440 // The $ didn't introduce a group number at all. |
| 441 // Treat it as just part of the substitution text. |
| 442 UChar c16 = DOLLARSIGN; |
| 443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); |
| 444 } else { |
| 445 // Finally, append the capture group data to the destination. |
| 446 destLen += appendGroup(groupNum, dest, status); |
| 447 if (U_FAILURE(status)) { |
| 448 // Can fail if group number is out of range. |
| 449 break; |
| 450 } |
| 451 } |
| 452 } |
| 453 |
| 454 if (U_FAILURE(status)) { |
| 455 break; |
| 456 } else { |
| 457 c = UTEXT_NEXT32(replacement); |
| 458 } |
| 459 } |
| 460 |
| 461 return *this; |
| 462 } |
| 463 |
| 464 |
| 465 |
| 466 //------------------------------------------------------------------------------
-- |
| 467 // |
| 468 // appendTail Intended to be used in conjunction with appendReplacement() |
| 469 // To the destination string, append everything following |
| 470 // the last match position from the input string. |
| 471 // |
| 472 // Note: Match ranges do not affect appendTail or appendRepla
cement |
| 473 // |
| 474 //------------------------------------------------------------------------------
-- |
| 475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { |
| 476 UErrorCode status = U_ZERO_ERROR; |
| 477 UText resultText = UTEXT_INITIALIZER; |
| 478 utext_openUnicodeString(&resultText, &dest, &status); |
| 479 |
| 480 if (U_SUCCESS(status)) { |
| 481 appendTail(&resultText, status); |
| 482 utext_close(&resultText); |
| 483 } |
| 484 |
| 485 return dest; |
| 486 } |
| 487 |
| 488 // |
| 489 // appendTail, UText mode |
| 490 // |
| 491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { |
| 492 UBool bailOut = FALSE; |
| 493 if (U_FAILURE(status)) { |
| 494 bailOut = TRUE; |
| 495 } |
| 496 if (U_FAILURE(fDeferredStatus)) { |
| 497 status = fDeferredStatus; |
| 498 bailOut = TRUE; |
| 499 } |
| 500 |
| 501 if (bailOut) { |
| 502 // dest must not be NULL |
| 503 if (dest) { |
| 504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(des
t), NULL, 0, &status); |
| 505 return dest; |
| 506 } |
| 507 } |
| 508 |
| 509 if (fInputLength > fAppendPosition) { |
| 510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 511 int64_t destLen = utext_nativeLength(dest); |
| 512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp
endPosition, |
| 513 (int32_t)(fInputLength-fAppendPosition), &status); |
| 514 } else { |
| 515 int32_t len16; |
| 516 if (UTEXT_USES_U16(fInputText)) { |
| 517 len16 = (int32_t)(fInputLength-fAppendPosition); |
| 518 } else { |
| 519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength,
NULL, 0, &status); |
| 520 status = U_ZERO_ERROR; // buffer overflow |
| 521 } |
| 522 |
| 523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); |
| 524 if (inputChars == NULL) { |
| 525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 526 } else { |
| 527 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh
ars, len16, &status); // unterminated |
| 528 int64_t destLen = utext_nativeLength(dest); |
| 529 utext_replace(dest, destLen, destLen, inputChars, len16, &status
); |
| 530 uprv_free(inputChars); |
| 531 } |
| 532 } |
| 533 } |
| 534 return dest; |
| 535 } |
| 536 |
| 537 |
| 538 |
| 539 //------------------------------------------------------------------------------
-- |
| 540 // |
| 541 // end |
| 542 // |
| 543 //------------------------------------------------------------------------------
-- |
| 544 int32_t RegexMatcher::end(UErrorCode &err) const { |
| 545 return end(0, err); |
| 546 } |
| 547 |
| 548 int64_t RegexMatcher::end64(UErrorCode &err) const { |
| 549 return end64(0, err); |
| 550 } |
| 551 |
| 552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { |
| 553 if (U_FAILURE(err)) { |
| 554 return -1; |
| 555 } |
| 556 if (fMatch == FALSE) { |
| 557 err = U_REGEX_INVALID_STATE; |
| 558 return -1; |
| 559 } |
| 560 if (group < 0 || group > fPattern->fGroupMap->size()) { |
| 561 err = U_INDEX_OUTOFBOUNDS_ERROR; |
| 562 return -1; |
| 563 } |
| 564 int64_t e = -1; |
| 565 if (group == 0) { |
| 566 e = fMatchEnd; |
| 567 } else { |
| 568 // Get the position within the stack frame of the variables for |
| 569 // this capture group. |
| 570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| 571 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 572 U_ASSERT(groupOffset >= 0); |
| 573 e = fFrame->fExtra[groupOffset + 1]; |
| 574 } |
| 575 |
| 576 return e; |
| 577 } |
| 578 |
| 579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { |
| 580 return (int32_t)end64(group, err); |
| 581 } |
| 582 |
| 583 |
| 584 //------------------------------------------------------------------------------
-- |
| 585 // |
| 586 // find() |
| 587 // |
| 588 //------------------------------------------------------------------------------
-- |
| 589 UBool RegexMatcher::find() { |
| 590 // Start at the position of the last match end. (Will be zero if the |
| 591 // matcher has been reset.) |
| 592 // |
| 593 if (U_FAILURE(fDeferredStatus)) { |
| 594 return FALSE; |
| 595 } |
| 596 |
| 597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 598 return findUsingChunk(); |
| 599 } |
| 600 |
| 601 int64_t startPos = fMatchEnd; |
| 602 if (startPos==0) { |
| 603 startPos = fActiveStart; |
| 604 } |
| 605 |
| 606 if (fMatch) { |
| 607 // Save the position of any previous successful match. |
| 608 fLastMatchEnd = fMatchEnd; |
| 609 |
| 610 if (fMatchStart == fMatchEnd) { |
| 611 // Previous match had zero length. Move start position up one posit
ion |
| 612 // to avoid sending find() into a loop on zero-length matches. |
| 613 if (startPos >= fActiveLimit) { |
| 614 fMatch = FALSE; |
| 615 fHitEnd = TRUE; |
| 616 return FALSE; |
| 617 } |
| 618 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 619 UTEXT_NEXT32(fInputText); |
| 620 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 621 } |
| 622 } else { |
| 623 if (fLastMatchEnd >= 0) { |
| 624 // A previous find() failed to match. Don't try again. |
| 625 // (without this test, a pattern with a zero-length match |
| 626 // could match again at the end of an input string.) |
| 627 fHitEnd = TRUE; |
| 628 return FALSE; |
| 629 } |
| 630 } |
| 631 |
| 632 |
| 633 // Compute the position in the input string beyond which a match can not beg
in, because |
| 634 // the minimum length match would extend past the end of the input. |
| 635 // Note: some patterns that cannot match anything will have fMinMatchLeng
th==Max Int. |
| 636 // Be aware of possible overflows if making changes here. |
| 637 int64_t testStartLimit; |
| 638 if (UTEXT_USES_U16(fInputText)) { |
| 639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; |
| 640 if (startPos > testStartLimit) { |
| 641 fMatch = FALSE; |
| 642 fHitEnd = TRUE; |
| 643 return FALSE; |
| 644 } |
| 645 } else { |
| 646 // For now, let the matcher discover that it can't match on its own |
| 647 // We don't know how long the match len is in native characters |
| 648 testStartLimit = fActiveLimit; |
| 649 } |
| 650 |
| 651 UChar32 c; |
| 652 U_ASSERT(startPos >= 0); |
| 653 |
| 654 switch (fPattern->fStartType) { |
| 655 case START_NO_INFO: |
| 656 // No optimization was found. |
| 657 // Try a match at each input position. |
| 658 for (;;) { |
| 659 MatchAt(startPos, FALSE, fDeferredStatus); |
| 660 if (U_FAILURE(fDeferredStatus)) { |
| 661 return FALSE; |
| 662 } |
| 663 if (fMatch) { |
| 664 return TRUE; |
| 665 } |
| 666 if (startPos >= testStartLimit) { |
| 667 fHitEnd = TRUE; |
| 668 return FALSE; |
| 669 } |
| 670 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 671 UTEXT_NEXT32(fInputText); |
| 672 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 673 // Note that it's perfectly OK for a pattern to have a zero-length |
| 674 // match at the end of a string, so we must make sure that the loo
p |
| 675 // runs with startPos == testStartLimit the last time through. |
| 676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 677 return FALSE; |
| 678 } |
| 679 U_ASSERT(FALSE); |
| 680 |
| 681 case START_START: |
| 682 // Matches are only possible at the start of the input string |
| 683 // (pattern begins with ^ or \A) |
| 684 if (startPos > fActiveStart) { |
| 685 fMatch = FALSE; |
| 686 return FALSE; |
| 687 } |
| 688 MatchAt(startPos, FALSE, fDeferredStatus); |
| 689 if (U_FAILURE(fDeferredStatus)) { |
| 690 return FALSE; |
| 691 } |
| 692 return fMatch; |
| 693 |
| 694 |
| 695 case START_SET: |
| 696 { |
| 697 // Match may start on any char from a pre-computed set. |
| 698 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 699 int64_t pos; |
| 700 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 701 for (;;) { |
| 702 c = UTEXT_NEXT32(fInputText); |
| 703 pos = UTEXT_GETNATIVEINDEX(fInputText); |
| 704 // c will be -1 (U_SENTINEL) at end of text, in which case we |
| 705 // skip this next block (so we don't have a negative array index
) |
| 706 // and handle end of text in the following block. |
| 707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c))
|| |
| 708 (c>=256 && fPattern->fInitialChars->contains(c))))
{ |
| 709 MatchAt(startPos, FALSE, fDeferredStatus); |
| 710 if (U_FAILURE(fDeferredStatus)) { |
| 711 return FALSE; |
| 712 } |
| 713 if (fMatch) { |
| 714 return TRUE; |
| 715 } |
| 716 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 717 } |
| 718 if (startPos >= testStartLimit) { |
| 719 fMatch = FALSE; |
| 720 fHitEnd = TRUE; |
| 721 return FALSE; |
| 722 } |
| 723 startPos = pos; |
| 724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 725 return FALSE; |
| 726 } |
| 727 } |
| 728 U_ASSERT(FALSE); |
| 729 |
| 730 case START_STRING: |
| 731 case START_CHAR: |
| 732 { |
| 733 // Match starts on exactly one char. |
| 734 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 735 UChar32 theChar = fPattern->fInitialChar; |
| 736 int64_t pos; |
| 737 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 738 for (;;) { |
| 739 c = UTEXT_NEXT32(fInputText); |
| 740 pos = UTEXT_GETNATIVEINDEX(fInputText); |
| 741 if (c == theChar) { |
| 742 MatchAt(startPos, FALSE, fDeferredStatus); |
| 743 if (U_FAILURE(fDeferredStatus)) { |
| 744 return FALSE; |
| 745 } |
| 746 if (fMatch) { |
| 747 return TRUE; |
| 748 } |
| 749 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 750 } |
| 751 if (startPos >= testStartLimit) { |
| 752 fMatch = FALSE; |
| 753 fHitEnd = TRUE; |
| 754 return FALSE; |
| 755 } |
| 756 startPos = pos; |
| 757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 758 return FALSE; |
| 759 } |
| 760 } |
| 761 U_ASSERT(FALSE); |
| 762 |
| 763 case START_LINE: |
| 764 { |
| 765 UChar32 c; |
| 766 if (startPos == fAnchorStart) { |
| 767 MatchAt(startPos, FALSE, fDeferredStatus); |
| 768 if (U_FAILURE(fDeferredStatus)) { |
| 769 return FALSE; |
| 770 } |
| 771 if (fMatch) { |
| 772 return TRUE; |
| 773 } |
| 774 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 775 c = UTEXT_NEXT32(fInputText); |
| 776 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 777 } else { |
| 778 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 779 c = UTEXT_PREVIOUS32(fInputText); |
| 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 781 } |
| 782 |
| 783 if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| 784 for (;;) { |
| 785 if (c == 0x0a) { |
| 786 MatchAt(startPos, FALSE, fDeferredStatus); |
| 787 if (U_FAILURE(fDeferredStatus)) { |
| 788 return FALSE; |
| 789 } |
| 790 if (fMatch) { |
| 791 return TRUE; |
| 792 } |
| 793 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 794 } |
| 795 if (startPos >= testStartLimit) { |
| 796 fMatch = FALSE; |
| 797 fHitEnd = TRUE; |
| 798 return FALSE; |
| 799 } |
| 800 c = UTEXT_NEXT32(fInputText); |
| 801 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 802 // Note that it's perfectly OK for a pattern to have a zero-
length |
| 803 // match at the end of a string, so we must make sure that
the loop |
| 804 // runs with startPos == testStartLimit the last time thro
ugh. |
| 805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) |
| 806 return FALSE; |
| 807 } |
| 808 } else { |
| 809 for (;;) { |
| 810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m
any chars as possible |
| 811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202
9 )) { |
| 812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU
RRENT32(fInputText) == 0x0a) { |
| 813 UTEXT_NEXT32(fInputText); |
| 814 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 815 } |
| 816 MatchAt(startPos, FALSE, fDeferredStatus); |
| 817 if (U_FAILURE(fDeferredStatus)) { |
| 818 return FALSE; |
| 819 } |
| 820 if (fMatch) { |
| 821 return TRUE; |
| 822 } |
| 823 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 824 } |
| 825 if (startPos >= testStartLimit) { |
| 826 fMatch = FALSE; |
| 827 fHitEnd = TRUE; |
| 828 return FALSE; |
| 829 } |
| 830 c = UTEXT_NEXT32(fInputText); |
| 831 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 832 // Note that it's perfectly OK for a pattern to have a zero-
length |
| 833 // match at the end of a string, so we must make sure that
the loop |
| 834 // runs with startPos == testStartLimit the last time thro
ugh. |
| 835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) |
| 836 return FALSE; |
| 837 } |
| 838 } |
| 839 } |
| 840 |
| 841 default: |
| 842 U_ASSERT(FALSE); |
| 843 } |
| 844 |
| 845 U_ASSERT(FALSE); |
| 846 return FALSE; |
| 847 } |
| 848 |
| 849 |
| 850 |
| 851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { |
| 852 if (U_FAILURE(status)) { |
| 853 return FALSE; |
| 854 } |
| 855 if (U_FAILURE(fDeferredStatus)) { |
| 856 status = fDeferredStatus; |
| 857 return FALSE; |
| 858 } |
| 859 this->reset(); // Note: Reset() is specified by Java
Matcher documentation. |
| 860 // This will reset the region t
o be the full input length. |
| 861 if (start < 0) { |
| 862 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 863 return FALSE; |
| 864 } |
| 865 |
| 866 int64_t nativeStart = start; |
| 867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| 868 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 869 return FALSE; |
| 870 } |
| 871 fMatchEnd = nativeStart; |
| 872 return find(); |
| 873 } |
| 874 |
| 875 |
| 876 //------------------------------------------------------------------------------
-- |
| 877 // |
| 878 // findUsingChunk() -- like find(), but with the advance knowledge that the |
| 879 // entire string is available in the UText's chunk buffer. |
| 880 // |
| 881 //------------------------------------------------------------------------------
-- |
| 882 UBool RegexMatcher::findUsingChunk() { |
| 883 // Start at the position of the last match end. (Will be zero if the |
| 884 // matcher has been reset. |
| 885 // |
| 886 |
| 887 int32_t startPos = (int32_t)fMatchEnd; |
| 888 if (startPos==0) { |
| 889 startPos = (int32_t)fActiveStart; |
| 890 } |
| 891 |
| 892 const UChar *inputBuf = fInputText->chunkContents; |
| 893 |
| 894 if (fMatch) { |
| 895 // Save the position of any previous successful match. |
| 896 fLastMatchEnd = fMatchEnd; |
| 897 |
| 898 if (fMatchStart == fMatchEnd) { |
| 899 // Previous match had zero length. Move start position up one posit
ion |
| 900 // to avoid sending find() into a loop on zero-length matches. |
| 901 if (startPos >= fActiveLimit) { |
| 902 fMatch = FALSE; |
| 903 fHitEnd = TRUE; |
| 904 return FALSE; |
| 905 } |
| 906 U16_FWD_1(inputBuf, startPos, fInputLength); |
| 907 } |
| 908 } else { |
| 909 if (fLastMatchEnd >= 0) { |
| 910 // A previous find() failed to match. Don't try again. |
| 911 // (without this test, a pattern with a zero-length match |
| 912 // could match again at the end of an input string.) |
| 913 fHitEnd = TRUE; |
| 914 return FALSE; |
| 915 } |
| 916 } |
| 917 |
| 918 |
| 919 // Compute the position in the input string beyond which a match can not beg
in, because |
| 920 // the minimum length match would extend past the end of the input. |
| 921 // Note: some patterns that cannot match anything will have fMinMatchLeng
th==Max Int. |
| 922 // Be aware of possible overflows if making changes here. |
| 923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); |
| 924 if (startPos > testLen) { |
| 925 fMatch = FALSE; |
| 926 fHitEnd = TRUE; |
| 927 return FALSE; |
| 928 } |
| 929 |
| 930 UChar32 c; |
| 931 U_ASSERT(startPos >= 0); |
| 932 |
| 933 switch (fPattern->fStartType) { |
| 934 case START_NO_INFO: |
| 935 // No optimization was found. |
| 936 // Try a match at each input position. |
| 937 for (;;) { |
| 938 MatchChunkAt(startPos, FALSE, fDeferredStatus); |
| 939 if (U_FAILURE(fDeferredStatus)) { |
| 940 return FALSE; |
| 941 } |
| 942 if (fMatch) { |
| 943 return TRUE; |
| 944 } |
| 945 if (startPos >= testLen) { |
| 946 fHitEnd = TRUE; |
| 947 return FALSE; |
| 948 } |
| 949 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 950 // Note that it's perfectly OK for a pattern to have a zero-length |
| 951 // match at the end of a string, so we must make sure that the loo
p |
| 952 // runs with startPos == testLen the last time through. |
| 953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 954 return FALSE; |
| 955 } |
| 956 U_ASSERT(FALSE); |
| 957 |
| 958 case START_START: |
| 959 // Matches are only possible at the start of the input string |
| 960 // (pattern begins with ^ or \A) |
| 961 if (startPos > fActiveStart) { |
| 962 fMatch = FALSE; |
| 963 return FALSE; |
| 964 } |
| 965 MatchChunkAt(startPos, FALSE, fDeferredStatus); |
| 966 if (U_FAILURE(fDeferredStatus)) { |
| 967 return FALSE; |
| 968 } |
| 969 return fMatch; |
| 970 |
| 971 |
| 972 case START_SET: |
| 973 { |
| 974 // Match may start on any char from a pre-computed set. |
| 975 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 976 for (;;) { |
| 977 int32_t pos = startPos; |
| 978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; |
| 979 if ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| 980 (c>=256 && fPattern->fInitialChars->contains(c))) { |
| 981 MatchChunkAt(pos, FALSE, fDeferredStatus); |
| 982 if (U_FAILURE(fDeferredStatus)) { |
| 983 return FALSE; |
| 984 } |
| 985 if (fMatch) { |
| 986 return TRUE; |
| 987 } |
| 988 } |
| 989 if (pos >= testLen) { |
| 990 fMatch = FALSE; |
| 991 fHitEnd = TRUE; |
| 992 return FALSE; |
| 993 } |
| 994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 995 return FALSE; |
| 996 } |
| 997 } |
| 998 U_ASSERT(FALSE); |
| 999 |
| 1000 case START_STRING: |
| 1001 case START_CHAR: |
| 1002 { |
| 1003 // Match starts on exactly one char. |
| 1004 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 1005 UChar32 theChar = fPattern->fInitialChar; |
| 1006 for (;;) { |
| 1007 int32_t pos = startPos; |
| 1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; |
| 1009 if (c == theChar) { |
| 1010 MatchChunkAt(pos, FALSE, fDeferredStatus); |
| 1011 if (U_FAILURE(fDeferredStatus)) { |
| 1012 return FALSE; |
| 1013 } |
| 1014 if (fMatch) { |
| 1015 return TRUE; |
| 1016 } |
| 1017 } |
| 1018 if (pos >= testLen) { |
| 1019 fMatch = FALSE; |
| 1020 fHitEnd = TRUE; |
| 1021 return FALSE; |
| 1022 } |
| 1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 1024 return FALSE; |
| 1025 } |
| 1026 } |
| 1027 U_ASSERT(FALSE); |
| 1028 |
| 1029 case START_LINE: |
| 1030 { |
| 1031 UChar32 c; |
| 1032 if (startPos == fAnchorStart) { |
| 1033 MatchChunkAt(startPos, FALSE, fDeferredStatus); |
| 1034 if (U_FAILURE(fDeferredStatus)) { |
| 1035 return FALSE; |
| 1036 } |
| 1037 if (fMatch) { |
| 1038 return TRUE; |
| 1039 } |
| 1040 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1041 } |
| 1042 |
| 1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| 1044 for (;;) { |
| 1045 c = inputBuf[startPos-1]; |
| 1046 if (c == 0x0a) { |
| 1047 MatchChunkAt(startPos, FALSE, fDeferredStatus); |
| 1048 if (U_FAILURE(fDeferredStatus)) { |
| 1049 return FALSE; |
| 1050 } |
| 1051 if (fMatch) { |
| 1052 return TRUE; |
| 1053 } |
| 1054 } |
| 1055 if (startPos >= testLen) { |
| 1056 fMatch = FALSE; |
| 1057 fHitEnd = TRUE; |
| 1058 return FALSE; |
| 1059 } |
| 1060 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1061 // Note that it's perfectly OK for a pattern to have a zero-leng
th |
| 1062 // match at the end of a string, so we must make sure that the
loop |
| 1063 // runs with startPos == testLen the last time through. |
| 1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 1065 return FALSE; |
| 1066 } |
| 1067 } else { |
| 1068 for (;;) { |
| 1069 c = inputBuf[startPos-1]; |
| 1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 ))
{ |
| 1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo
s] == 0x0a) { |
| 1073 startPos++; |
| 1074 } |
| 1075 MatchChunkAt(startPos, FALSE, fDeferredStatus); |
| 1076 if (U_FAILURE(fDeferredStatus)) { |
| 1077 return FALSE; |
| 1078 } |
| 1079 if (fMatch) { |
| 1080 return TRUE; |
| 1081 } |
| 1082 } |
| 1083 if (startPos >= testLen) { |
| 1084 fMatch = FALSE; |
| 1085 fHitEnd = TRUE; |
| 1086 return FALSE; |
| 1087 } |
| 1088 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1089 // Note that it's perfectly OK for a pattern to have a zero-leng
th |
| 1090 // match at the end of a string, so we must make sure that the
loop |
| 1091 // runs with startPos == testLen the last time through. |
| 1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) |
| 1093 return FALSE; |
| 1094 } |
| 1095 } |
| 1096 } |
| 1097 |
| 1098 default: |
| 1099 U_ASSERT(FALSE); |
| 1100 } |
| 1101 |
| 1102 U_ASSERT(FALSE); |
| 1103 return FALSE; |
| 1104 } |
| 1105 |
| 1106 |
| 1107 |
| 1108 //------------------------------------------------------------------------------
-- |
| 1109 // |
| 1110 // group() |
| 1111 // |
| 1112 //------------------------------------------------------------------------------
-- |
| 1113 UnicodeString RegexMatcher::group(UErrorCode &status) const { |
| 1114 return group(0, status); |
| 1115 } |
| 1116 |
| 1117 // Return immutable shallow clone |
| 1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status)
const { |
| 1119 return group(0, dest, group_len, status); |
| 1120 } |
| 1121 |
| 1122 // Return immutable shallow clone |
| 1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
rrorCode &status) const { |
| 1124 group_len = 0; |
| 1125 UBool bailOut = FALSE; |
| 1126 if (U_FAILURE(status)) { |
| 1127 return dest; |
| 1128 } |
| 1129 if (U_FAILURE(fDeferredStatus)) { |
| 1130 status = fDeferredStatus; |
| 1131 bailOut = TRUE; |
| 1132 } |
| 1133 if (fMatch == FALSE) { |
| 1134 status = U_REGEX_INVALID_STATE; |
| 1135 bailOut = TRUE; |
| 1136 } |
| 1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1138 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1139 bailOut = TRUE; |
| 1140 } |
| 1141 |
| 1142 if (bailOut) { |
| 1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); |
| 1144 } |
| 1145 |
| 1146 int64_t s, e; |
| 1147 if (groupNum == 0) { |
| 1148 s = fMatchStart; |
| 1149 e = fMatchEnd; |
| 1150 } else { |
| 1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1152 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1153 U_ASSERT(groupOffset >= 0); |
| 1154 s = fFrame->fExtra[groupOffset]; |
| 1155 e = fFrame->fExtra[groupOffset+1]; |
| 1156 } |
| 1157 |
| 1158 if (s < 0) { |
| 1159 // A capture group wasn't part of the match |
| 1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| 1161 } |
| 1162 U_ASSERT(s <= e); |
| 1163 group_len = e - s; |
| 1164 |
| 1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| 1166 if (dest) |
| 1167 UTEXT_SETNATIVEINDEX(dest, s); |
| 1168 return dest; |
| 1169 } |
| 1170 |
| 1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
| 1172 UnicodeString result; |
| 1173 if (U_FAILURE(status)) { |
| 1174 return result; |
| 1175 } |
| 1176 UText resultText = UTEXT_INITIALIZER; |
| 1177 utext_openUnicodeString(&resultText, &result, &status); |
| 1178 group(groupNum, &resultText, status); |
| 1179 utext_close(&resultText); |
| 1180 return result; |
| 1181 } |
| 1182 |
| 1183 |
| 1184 // Return deep (mutable) clone |
| 1185 // Technology Preview (as an API), but note that the UnicodeString
API is implemented |
| 1186 // using this function. |
| 1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co
nst { |
| 1188 UBool bailOut = FALSE; |
| 1189 if (U_FAILURE(status)) { |
| 1190 return dest; |
| 1191 } |
| 1192 if (U_FAILURE(fDeferredStatus)) { |
| 1193 status = fDeferredStatus; |
| 1194 bailOut = TRUE; |
| 1195 } |
| 1196 |
| 1197 if (fMatch == FALSE) { |
| 1198 status = U_REGEX_INVALID_STATE; |
| 1199 bailOut = TRUE; |
| 1200 } |
| 1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1203 bailOut = TRUE; |
| 1204 } |
| 1205 |
| 1206 if (bailOut) { |
| 1207 if (dest) { |
| 1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
| 1209 return dest; |
| 1210 } else { |
| 1211 return utext_openUChars(NULL, NULL, 0, &status); |
| 1212 } |
| 1213 } |
| 1214 |
| 1215 int64_t s, e; |
| 1216 if (groupNum == 0) { |
| 1217 s = fMatchStart; |
| 1218 e = fMatchEnd; |
| 1219 } else { |
| 1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1221 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1222 U_ASSERT(groupOffset >= 0); |
| 1223 s = fFrame->fExtra[groupOffset]; |
| 1224 e = fFrame->fExtra[groupOffset+1]; |
| 1225 } |
| 1226 |
| 1227 if (s < 0) { |
| 1228 // A capture group wasn't part of the match |
| 1229 if (dest) { |
| 1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
| 1231 return dest; |
| 1232 } else { |
| 1233 return utext_openUChars(NULL, NULL, 0, &status); |
| 1234 } |
| 1235 } |
| 1236 U_ASSERT(s <= e); |
| 1237 |
| 1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1239 U_ASSERT(e <= fInputLength); |
| 1240 if (dest) { |
| 1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents+s, (int32_t)(e-s), &status); |
| 1242 } else { |
| 1243 UText groupText = UTEXT_INITIALIZER; |
| 1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat
us); |
| 1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
| 1246 utext_close(&groupText); |
| 1247 } |
| 1248 } else { |
| 1249 int32_t len16; |
| 1250 if (UTEXT_USES_U16(fInputText)) { |
| 1251 len16 = (int32_t)(e-s); |
| 1252 } else { |
| 1253 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
| 1255 } |
| 1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| 1257 if (groupChars == NULL) { |
| 1258 status = U_MEMORY_ALLOCATION_ERROR; |
| 1259 return dest; |
| 1260 } |
| 1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| 1262 |
| 1263 if (dest) { |
| 1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16,
&status); |
| 1265 } else { |
| 1266 UText groupText = UTEXT_INITIALIZER; |
| 1267 utext_openUChars(&groupText, groupChars, len16, &status); |
| 1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
| 1269 utext_close(&groupText); |
| 1270 } |
| 1271 |
| 1272 uprv_free(groupChars); |
| 1273 } |
| 1274 return dest; |
| 1275 } |
| 1276 |
| 1277 //------------------------------------------------------------------------------
-- |
| 1278 // |
| 1279 // appendGroup() -- currently internal only, appends a group to a UText rather |
| 1280 // than replacing its contents |
| 1281 // |
| 1282 //------------------------------------------------------------------------------
-- |
| 1283 |
| 1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta
tus) const { |
| 1285 if (U_FAILURE(status)) { |
| 1286 return 0; |
| 1287 } |
| 1288 if (U_FAILURE(fDeferredStatus)) { |
| 1289 status = fDeferredStatus; |
| 1290 return 0; |
| 1291 } |
| 1292 int64_t destLen = utext_nativeLength(dest); |
| 1293 |
| 1294 if (fMatch == FALSE) { |
| 1295 status = U_REGEX_INVALID_STATE; |
| 1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1297 } |
| 1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1299 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1301 } |
| 1302 |
| 1303 int64_t s, e; |
| 1304 if (groupNum == 0) { |
| 1305 s = fMatchStart; |
| 1306 e = fMatchEnd; |
| 1307 } else { |
| 1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1309 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1310 U_ASSERT(groupOffset >= 0); |
| 1311 s = fFrame->fExtra[groupOffset]; |
| 1312 e = fFrame->fExtra[groupOffset+1]; |
| 1313 } |
| 1314 |
| 1315 if (s < 0) { |
| 1316 // A capture group wasn't part of the match |
| 1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1318 } |
| 1319 U_ASSERT(s <= e); |
| 1320 |
| 1321 int64_t deltaLen; |
| 1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1323 U_ASSERT(e <= fInputLength); |
| 1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten
ts+s, (int32_t)(e-s), &status); |
| 1325 } else { |
| 1326 int32_t len16; |
| 1327 if (UTEXT_USES_U16(fInputText)) { |
| 1328 len16 = (int32_t)(e-s); |
| 1329 } else { |
| 1330 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
| 1332 } |
| 1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| 1334 if (groupChars == NULL) { |
| 1335 status = U_MEMORY_ALLOCATION_ERROR; |
| 1336 return 0; |
| 1337 } |
| 1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| 1339 |
| 1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta
tus); |
| 1341 uprv_free(groupChars); |
| 1342 } |
| 1343 return deltaLen; |
| 1344 } |
| 1345 |
| 1346 |
| 1347 |
| 1348 //------------------------------------------------------------------------------
-- |
| 1349 // |
| 1350 // groupCount() |
| 1351 // |
| 1352 //------------------------------------------------------------------------------
-- |
| 1353 int32_t RegexMatcher::groupCount() const { |
| 1354 return fPattern->fGroupMap->size(); |
| 1355 } |
| 1356 |
| 1357 |
| 1358 |
| 1359 //------------------------------------------------------------------------------
-- |
| 1360 // |
| 1361 // hasAnchoringBounds() |
| 1362 // |
| 1363 //------------------------------------------------------------------------------
-- |
| 1364 UBool RegexMatcher::hasAnchoringBounds() const { |
| 1365 return fAnchoringBounds; |
| 1366 } |
| 1367 |
| 1368 |
| 1369 //------------------------------------------------------------------------------
-- |
| 1370 // |
| 1371 // hasTransparentBounds() |
| 1372 // |
| 1373 //------------------------------------------------------------------------------
-- |
| 1374 UBool RegexMatcher::hasTransparentBounds() const { |
| 1375 return fTransparentBounds; |
| 1376 } |
| 1377 |
| 1378 |
| 1379 |
| 1380 //------------------------------------------------------------------------------
-- |
| 1381 // |
| 1382 // hitEnd() |
| 1383 // |
| 1384 //------------------------------------------------------------------------------
-- |
| 1385 UBool RegexMatcher::hitEnd() const { |
| 1386 return fHitEnd; |
| 1387 } |
| 1388 |
| 1389 |
| 1390 //------------------------------------------------------------------------------
-- |
| 1391 // |
| 1392 // input() |
| 1393 // |
| 1394 //------------------------------------------------------------------------------
-- |
| 1395 const UnicodeString &RegexMatcher::input() const { |
| 1396 if (!fInput) { |
| 1397 UErrorCode status = U_ZERO_ERROR; |
| 1398 int32_t len16; |
| 1399 if (UTEXT_USES_U16(fInputText)) { |
| 1400 len16 = (int32_t)fInputLength; |
| 1401 } else { |
| 1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status)
; |
| 1403 status = U_ZERO_ERROR; // overflow, length status |
| 1404 } |
| 1405 UnicodeString *result = new UnicodeString(len16, 0, 0); |
| 1406 |
| 1407 UChar *inputChars = result->getBuffer(len16); |
| 1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status);
// unterminated warning |
| 1409 result->releaseBuffer(len16); |
| 1410 |
| 1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath
er than operator= |
| 1412 } |
| 1413 |
| 1414 return *fInput; |
| 1415 } |
| 1416 |
| 1417 //------------------------------------------------------------------------------
-- |
| 1418 // |
| 1419 // inputText() |
| 1420 // |
| 1421 //------------------------------------------------------------------------------
-- |
| 1422 UText *RegexMatcher::inputText() const { |
| 1423 return fInputText; |
| 1424 } |
| 1425 |
| 1426 |
| 1427 //------------------------------------------------------------------------------
-- |
| 1428 // |
| 1429 // getInput() -- like inputText(), but makes a clone or copies into another UTe
xt |
| 1430 // |
| 1431 //------------------------------------------------------------------------------
-- |
| 1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { |
| 1433 UBool bailOut = FALSE; |
| 1434 if (U_FAILURE(status)) { |
| 1435 return dest; |
| 1436 } |
| 1437 if (U_FAILURE(fDeferredStatus)) { |
| 1438 status = fDeferredStatus; |
| 1439 bailOut = TRUE; |
| 1440 } |
| 1441 |
| 1442 if (bailOut) { |
| 1443 if (dest) { |
| 1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
| 1445 return dest; |
| 1446 } else { |
| 1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
| 1448 } |
| 1449 } |
| 1450 |
| 1451 if (dest) { |
| 1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents, (int32_t)fInputLength, &status); |
| 1454 } else { |
| 1455 int32_t input16Len; |
| 1456 if (UTEXT_USES_U16(fInputText)) { |
| 1457 input16Len = (int32_t)fInputLength; |
| 1458 } else { |
| 1459 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0,
&lengthStatus); // buffer overflow error |
| 1461 } |
| 1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len))
; |
| 1463 if (inputChars == NULL) { |
| 1464 return dest; |
| 1465 } |
| 1466 |
| 1467 status = U_ZERO_ERROR; |
| 1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &
status); // not terminated warning |
| 1469 status = U_ZERO_ERROR; |
| 1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16
Len, &status); |
| 1471 |
| 1472 uprv_free(inputChars); |
| 1473 } |
| 1474 return dest; |
| 1475 } else { |
| 1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
| 1477 } |
| 1478 } |
| 1479 |
| 1480 |
| 1481 static UBool compat_SyncMutableUTextContents(UText *ut); |
| 1482 static UBool compat_SyncMutableUTextContents(UText *ut) { |
| 1483 UBool retVal = FALSE; |
| 1484 |
| 1485 // In the following test, we're really only interested in whether the UText
should switch |
| 1486 // between heap and stack allocation. If length hasn't changed, we won't,
so the chunkContents |
| 1487 // will still point to the correct data. |
| 1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { |
| 1489 UnicodeString *us=(UnicodeString *)ut->context; |
| 1490 |
| 1491 // Update to the latest length. |
| 1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). |
| 1493 int32_t newLength = us->length(); |
| 1494 |
| 1495 // Update the chunk description. |
| 1496 // The buffer may have switched between stack- and heap-based. |
| 1497 ut->chunkContents = us->getBuffer(); |
| 1498 ut->chunkLength = newLength; |
| 1499 ut->chunkNativeLimit = newLength; |
| 1500 ut->nativeIndexingLimit = newLength; |
| 1501 retVal = TRUE; |
| 1502 } |
| 1503 |
| 1504 return retVal; |
| 1505 } |
| 1506 |
| 1507 //------------------------------------------------------------------------------
-- |
| 1508 // |
| 1509 // lookingAt() |
| 1510 // |
| 1511 //------------------------------------------------------------------------------
-- |
| 1512 UBool RegexMatcher::lookingAt(UErrorCode &status) { |
| 1513 if (U_FAILURE(status)) { |
| 1514 return FALSE; |
| 1515 } |
| 1516 if (U_FAILURE(fDeferredStatus)) { |
| 1517 status = fDeferredStatus; |
| 1518 return FALSE; |
| 1519 } |
| 1520 |
| 1521 if (fInputUniStrMaybeMutable) { |
| 1522 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1523 fInputLength = utext_nativeLength(fInputText); |
| 1524 reset(); |
| 1525 } |
| 1526 } |
| 1527 else { |
| 1528 resetPreserveRegion(); |
| 1529 } |
| 1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status); |
| 1532 } else { |
| 1533 MatchAt(fActiveStart, FALSE, status); |
| 1534 } |
| 1535 return fMatch; |
| 1536 } |
| 1537 |
| 1538 |
| 1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { |
| 1540 if (U_FAILURE(status)) { |
| 1541 return FALSE; |
| 1542 } |
| 1543 if (U_FAILURE(fDeferredStatus)) { |
| 1544 status = fDeferredStatus; |
| 1545 return FALSE; |
| 1546 } |
| 1547 reset(); |
| 1548 |
| 1549 if (start < 0) { |
| 1550 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1551 return FALSE; |
| 1552 } |
| 1553 |
| 1554 if (fInputUniStrMaybeMutable) { |
| 1555 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1556 fInputLength = utext_nativeLength(fInputText); |
| 1557 reset(); |
| 1558 } |
| 1559 } |
| 1560 |
| 1561 int64_t nativeStart; |
| 1562 nativeStart = start; |
| 1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| 1564 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1565 return FALSE; |
| 1566 } |
| 1567 |
| 1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1569 MatchChunkAt((int32_t)nativeStart, FALSE, status); |
| 1570 } else { |
| 1571 MatchAt(nativeStart, FALSE, status); |
| 1572 } |
| 1573 return fMatch; |
| 1574 } |
| 1575 |
| 1576 |
| 1577 |
| 1578 //------------------------------------------------------------------------------
-- |
| 1579 // |
| 1580 // matches() |
| 1581 // |
| 1582 //------------------------------------------------------------------------------
-- |
| 1583 UBool RegexMatcher::matches(UErrorCode &status) { |
| 1584 if (U_FAILURE(status)) { |
| 1585 return FALSE; |
| 1586 } |
| 1587 if (U_FAILURE(fDeferredStatus)) { |
| 1588 status = fDeferredStatus; |
| 1589 return FALSE; |
| 1590 } |
| 1591 |
| 1592 if (fInputUniStrMaybeMutable) { |
| 1593 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1594 fInputLength = utext_nativeLength(fInputText); |
| 1595 reset(); |
| 1596 } |
| 1597 } |
| 1598 else { |
| 1599 resetPreserveRegion(); |
| 1600 } |
| 1601 |
| 1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status); |
| 1604 } else { |
| 1605 MatchAt(fActiveStart, TRUE, status); |
| 1606 } |
| 1607 return fMatch; |
| 1608 } |
| 1609 |
| 1610 |
| 1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { |
| 1612 if (U_FAILURE(status)) { |
| 1613 return FALSE; |
| 1614 } |
| 1615 if (U_FAILURE(fDeferredStatus)) { |
| 1616 status = fDeferredStatus; |
| 1617 return FALSE; |
| 1618 } |
| 1619 reset(); |
| 1620 |
| 1621 if (start < 0) { |
| 1622 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1623 return FALSE; |
| 1624 } |
| 1625 |
| 1626 if (fInputUniStrMaybeMutable) { |
| 1627 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1628 fInputLength = utext_nativeLength(fInputText); |
| 1629 reset(); |
| 1630 } |
| 1631 } |
| 1632 |
| 1633 int64_t nativeStart; |
| 1634 nativeStart = start; |
| 1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| 1636 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1637 return FALSE; |
| 1638 } |
| 1639 |
| 1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1641 MatchChunkAt((int32_t)nativeStart, TRUE, status); |
| 1642 } else { |
| 1643 MatchAt(nativeStart, TRUE, status); |
| 1644 } |
| 1645 return fMatch; |
| 1646 } |
| 1647 |
| 1648 |
| 1649 |
| 1650 //------------------------------------------------------------------------------
-- |
| 1651 // |
| 1652 // pattern |
| 1653 // |
| 1654 //------------------------------------------------------------------------------
-- |
| 1655 const RegexPattern &RegexMatcher::pattern() const { |
| 1656 return *fPattern; |
| 1657 } |
| 1658 |
| 1659 |
| 1660 |
| 1661 //------------------------------------------------------------------------------
-- |
| 1662 // |
| 1663 // region |
| 1664 // |
| 1665 //------------------------------------------------------------------------------
-- |
| 1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int
64_t startIndex, UErrorCode &status) { |
| 1667 if (U_FAILURE(status)) { |
| 1668 return *this; |
| 1669 } |
| 1670 |
| 1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { |
| 1672 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1673 } |
| 1674 |
| 1675 int64_t nativeStart = regionStart; |
| 1676 int64_t nativeLimit = regionLimit; |
| 1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) { |
| 1678 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1679 } |
| 1680 |
| 1681 if (startIndex == -1) |
| 1682 this->reset(); |
| 1683 else |
| 1684 resetPreserveRegion(); |
| 1685 |
| 1686 fRegionStart = nativeStart; |
| 1687 fRegionLimit = nativeLimit; |
| 1688 fActiveStart = nativeStart; |
| 1689 fActiveLimit = nativeLimit; |
| 1690 |
| 1691 if (startIndex != -1) { |
| 1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) { |
| 1693 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1694 } |
| 1695 fMatchEnd = startIndex; |
| 1696 } |
| 1697 |
| 1698 if (!fTransparentBounds) { |
| 1699 fLookStart = nativeStart; |
| 1700 fLookLimit = nativeLimit; |
| 1701 } |
| 1702 if (fAnchoringBounds) { |
| 1703 fAnchorStart = nativeStart; |
| 1704 fAnchorLimit = nativeLimit; |
| 1705 } |
| 1706 return *this; |
| 1707 } |
| 1708 |
| 1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &sta
tus) { |
| 1710 return region(start, limit, -1, status); |
| 1711 } |
| 1712 |
| 1713 //------------------------------------------------------------------------------
-- |
| 1714 // |
| 1715 // regionEnd |
| 1716 // |
| 1717 //------------------------------------------------------------------------------
-- |
| 1718 int32_t RegexMatcher::regionEnd() const { |
| 1719 return (int32_t)fRegionLimit; |
| 1720 } |
| 1721 |
| 1722 int64_t RegexMatcher::regionEnd64() const { |
| 1723 return fRegionLimit; |
| 1724 } |
| 1725 |
| 1726 //------------------------------------------------------------------------------
-- |
| 1727 // |
| 1728 // regionStart |
| 1729 // |
| 1730 //------------------------------------------------------------------------------
-- |
| 1731 int32_t RegexMatcher::regionStart() const { |
| 1732 return (int32_t)fRegionStart; |
| 1733 } |
| 1734 |
| 1735 int64_t RegexMatcher::regionStart64() const { |
| 1736 return fRegionStart; |
| 1737 } |
| 1738 |
| 1739 |
| 1740 //------------------------------------------------------------------------------
-- |
| 1741 // |
| 1742 // replaceAll |
| 1743 // |
| 1744 //------------------------------------------------------------------------------
-- |
| 1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC
ode &status) { |
| 1746 UText replacementText = UTEXT_INITIALIZER; |
| 1747 UText resultText = UTEXT_INITIALIZER; |
| 1748 UnicodeString resultString; |
| 1749 if (U_FAILURE(status)) { |
| 1750 return resultString; |
| 1751 } |
| 1752 |
| 1753 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 1754 utext_openUnicodeString(&resultText, &resultString, &status); |
| 1755 |
| 1756 replaceAll(&replacementText, &resultText, status); |
| 1757 |
| 1758 utext_close(&resultText); |
| 1759 utext_close(&replacementText); |
| 1760 |
| 1761 return resultString; |
| 1762 } |
| 1763 |
| 1764 |
| 1765 // |
| 1766 // replaceAll, UText mode |
| 1767 // |
| 1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
tus) { |
| 1769 if (U_FAILURE(status)) { |
| 1770 return dest; |
| 1771 } |
| 1772 if (U_FAILURE(fDeferredStatus)) { |
| 1773 status = fDeferredStatus; |
| 1774 return dest; |
| 1775 } |
| 1776 |
| 1777 if (dest == NULL) { |
| 1778 UnicodeString emptyString; |
| 1779 UText empty = UTEXT_INITIALIZER; |
| 1780 |
| 1781 utext_openUnicodeString(&empty, &emptyString, &status); |
| 1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| 1783 utext_close(&empty); |
| 1784 } |
| 1785 |
| 1786 if (U_SUCCESS(status)) { |
| 1787 reset(); |
| 1788 while (find()) { |
| 1789 appendReplacement(dest, replacement, status); |
| 1790 if (U_FAILURE(status)) { |
| 1791 break; |
| 1792 } |
| 1793 } |
| 1794 appendTail(dest, status); |
| 1795 } |
| 1796 |
| 1797 return dest; |
| 1798 } |
| 1799 |
| 1800 |
| 1801 //------------------------------------------------------------------------------
-- |
| 1802 // |
| 1803 // replaceFirst |
| 1804 // |
| 1805 //------------------------------------------------------------------------------
-- |
| 1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
rCode &status) { |
| 1807 UText replacementText = UTEXT_INITIALIZER; |
| 1808 UText resultText = UTEXT_INITIALIZER; |
| 1809 UnicodeString resultString; |
| 1810 |
| 1811 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 1812 utext_openUnicodeString(&resultText, &resultString, &status); |
| 1813 |
| 1814 replaceFirst(&replacementText, &resultText, status); |
| 1815 |
| 1816 utext_close(&resultText); |
| 1817 utext_close(&replacementText); |
| 1818 |
| 1819 return resultString; |
| 1820 } |
| 1821 |
| 1822 // |
| 1823 // replaceFirst, UText mode |
| 1824 // |
| 1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
tatus) { |
| 1826 if (U_FAILURE(status)) { |
| 1827 return dest; |
| 1828 } |
| 1829 if (U_FAILURE(fDeferredStatus)) { |
| 1830 status = fDeferredStatus; |
| 1831 return dest; |
| 1832 } |
| 1833 |
| 1834 reset(); |
| 1835 if (!find()) { |
| 1836 return getInput(dest, status); |
| 1837 } |
| 1838 |
| 1839 if (dest == NULL) { |
| 1840 UnicodeString emptyString; |
| 1841 UText empty = UTEXT_INITIALIZER; |
| 1842 |
| 1843 utext_openUnicodeString(&empty, &emptyString, &status); |
| 1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| 1845 utext_close(&empty); |
| 1846 } |
| 1847 |
| 1848 appendReplacement(dest, replacement, status); |
| 1849 appendTail(dest, status); |
| 1850 |
| 1851 return dest; |
| 1852 } |
| 1853 |
| 1854 |
| 1855 //------------------------------------------------------------------------------
-- |
| 1856 // |
| 1857 // requireEnd |
| 1858 // |
| 1859 //------------------------------------------------------------------------------
-- |
| 1860 UBool RegexMatcher::requireEnd() const { |
| 1861 return fRequireEnd; |
| 1862 } |
| 1863 |
| 1864 |
| 1865 //------------------------------------------------------------------------------
-- |
| 1866 // |
| 1867 // reset |
| 1868 // |
| 1869 //------------------------------------------------------------------------------
-- |
| 1870 RegexMatcher &RegexMatcher::reset() { |
| 1871 fRegionStart = 0; |
| 1872 fRegionLimit = fInputLength; |
| 1873 fActiveStart = 0; |
| 1874 fActiveLimit = fInputLength; |
| 1875 fAnchorStart = 0; |
| 1876 fAnchorLimit = fInputLength; |
| 1877 fLookStart = 0; |
| 1878 fLookLimit = fInputLength; |
| 1879 resetPreserveRegion(); |
| 1880 return *this; |
| 1881 } |
| 1882 |
| 1883 |
| 1884 |
| 1885 void RegexMatcher::resetPreserveRegion() { |
| 1886 fMatchStart = 0; |
| 1887 fMatchEnd = 0; |
| 1888 fLastMatchEnd = -1; |
| 1889 fAppendPosition = 0; |
| 1890 fMatch = FALSE; |
| 1891 fHitEnd = FALSE; |
| 1892 fRequireEnd = FALSE; |
| 1893 fTime = 0; |
| 1894 fTickCounter = TIMER_INITIAL_VALUE; |
| 1895 //resetStack(); // more expensive than it looks... |
| 1896 } |
| 1897 |
| 1898 |
| 1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
| 1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat
us); |
| 1901 if (fPattern->fNeedsAltInput) { |
| 1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe
ferredStatus); |
| 1903 } |
| 1904 fInputLength = utext_nativeLength(fInputText); |
| 1905 |
| 1906 reset(); |
| 1907 delete fInput; |
| 1908 fInput = NULL; |
| 1909 |
| 1910 // Do the following for any UnicodeString. |
| 1911 // This is for compatibility for those clients who modify the input string
"live" during regex operations. |
| 1912 fInputUniStrMaybeMutable = TRUE; |
| 1913 |
| 1914 if (fWordBreakItr != NULL) { |
| 1915 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 1916 UErrorCode status = U_ZERO_ERROR; |
| 1917 fWordBreakItr->setText(fInputText, status); |
| 1918 #endif |
| 1919 } |
| 1920 return *this; |
| 1921 } |
| 1922 |
| 1923 |
| 1924 RegexMatcher &RegexMatcher::reset(UText *input) { |
| 1925 if (fInputText != input) { |
| 1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu
s); |
| 1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText,
fInputText, FALSE, TRUE, &fDeferredStatus); |
| 1928 fInputLength = utext_nativeLength(fInputText); |
| 1929 |
| 1930 delete fInput; |
| 1931 fInput = NULL; |
| 1932 |
| 1933 if (fWordBreakItr != NULL) { |
| 1934 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 1935 UErrorCode status = U_ZERO_ERROR; |
| 1936 fWordBreakItr->setText(input, status); |
| 1937 #endif |
| 1938 } |
| 1939 } |
| 1940 reset(); |
| 1941 fInputUniStrMaybeMutable = FALSE; |
| 1942 |
| 1943 return *this; |
| 1944 } |
| 1945 |
| 1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) { |
| 1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; |
| 1948 return *this; |
| 1949 }*/ |
| 1950 |
| 1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { |
| 1952 if (U_FAILURE(status)) { |
| 1953 return *this; |
| 1954 } |
| 1955 reset(); // Reset also resets the region to be the entire string. |
| 1956 |
| 1957 if (position < 0 || position > fActiveLimit) { |
| 1958 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1959 return *this; |
| 1960 } |
| 1961 fMatchEnd = position; |
| 1962 return *this; |
| 1963 } |
| 1964 |
| 1965 |
| 1966 |
| 1967 |
| 1968 |
| 1969 //------------------------------------------------------------------------------
-- |
| 1970 // |
| 1971 // setTrace |
| 1972 // |
| 1973 //------------------------------------------------------------------------------
-- |
| 1974 void RegexMatcher::setTrace(UBool state) { |
| 1975 fTraceDebug = state; |
| 1976 } |
| 1977 |
| 1978 |
| 1979 |
| 1980 //--------------------------------------------------------------------- |
| 1981 // |
| 1982 // split |
| 1983 // |
| 1984 //--------------------------------------------------------------------- |
| 1985 int32_t RegexMatcher::split(const UnicodeString &input, |
| 1986 UnicodeString dest[], |
| 1987 int32_t destCapacity, |
| 1988 UErrorCode &status) |
| 1989 { |
| 1990 UText inputText = UTEXT_INITIALIZER; |
| 1991 utext_openConstUnicodeString(&inputText, &input, &status); |
| 1992 if (U_FAILURE(status)) { |
| 1993 return 0; |
| 1994 } |
| 1995 |
| 1996 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); |
| 1997 if (destText == NULL) { |
| 1998 status = U_MEMORY_ALLOCATION_ERROR; |
| 1999 return 0; |
| 2000 } |
| 2001 int32_t i; |
| 2002 for (i = 0; i < destCapacity; i++) { |
| 2003 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); |
| 2004 } |
| 2005 |
| 2006 int32_t fieldCount = split(&inputText, destText, destCapacity, status); |
| 2007 |
| 2008 for (i = 0; i < destCapacity; i++) { |
| 2009 utext_close(destText[i]); |
| 2010 } |
| 2011 |
| 2012 uprv_free(destText); |
| 2013 utext_close(&inputText); |
| 2014 return fieldCount; |
| 2015 } |
| 2016 |
| 2017 // |
| 2018 // split, UText mode |
| 2019 // |
| 2020 int32_t RegexMatcher::split(UText *input, |
| 2021 UText *dest[], |
| 2022 int32_t destCapacity, |
| 2023 UErrorCode &status) |
| 2024 { |
| 2025 // |
| 2026 // Check arguements for validity |
| 2027 // |
| 2028 if (U_FAILURE(status)) { |
| 2029 return 0; |
| 2030 }; |
| 2031 |
| 2032 if (destCapacity < 1) { |
| 2033 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2034 return 0; |
| 2035 } |
| 2036 |
| 2037 // |
| 2038 // Reset for the input text |
| 2039 // |
| 2040 reset(input); |
| 2041 int64_t nextOutputStringStart = 0; |
| 2042 if (fActiveLimit == 0) { |
| 2043 return 0; |
| 2044 } |
| 2045 |
| 2046 // |
| 2047 // Loop through the input text, searching for the delimiter pattern |
| 2048 // |
| 2049 int32_t i; |
| 2050 int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
| 2051 for (i=0; ; i++) { |
| 2052 if (i>=destCapacity-1) { |
| 2053 // There is one or zero output string left. |
| 2054 // Fill the last output string with whatever is left from the input,
then exit the loop. |
| 2055 // ( i will be == destCapacity if we filled the output array while
processing |
| 2056 // capture groups of the delimiter expression, in which case we w
ill discard the |
| 2057 // last capture group saved in favor of the unprocessed remainder
of the |
| 2058 // input string.) |
| 2059 i = destCapacity-1; |
| 2060 if (fActiveLimit > nextOutputStringStart) { |
| 2061 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2062 if (dest[i]) { |
| 2063 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2064 input->chunkContents+nextOutputStringStart
, |
| 2065 (int32_t)(fActiveLimit-nextOutputStringSta
rt), &status); |
| 2066 } else { |
| 2067 UText remainingText = UTEXT_INITIALIZER; |
| 2068 utext_openUChars(&remainingText, input->chunkContents+ne
xtOutputStringStart, |
| 2069 fActiveLimit-nextOutputStringStart, &st
atus); |
| 2070 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
| 2071 utext_close(&remainingText); |
| 2072 } |
| 2073 } else { |
| 2074 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2075 int32_t remaining16Length = |
| 2076 utext_extract(input, nextOutputStringStart, fActiveLimit
, NULL, 0, &lengthStatus); |
| 2077 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(
remaining16Length+1)); |
| 2078 if (remainingChars == NULL) { |
| 2079 status = U_MEMORY_ALLOCATION_ERROR; |
| 2080 break; |
| 2081 } |
| 2082 |
| 2083 utext_extract(input, nextOutputStringStart, fActiveLimit, re
mainingChars, remaining16Length+1, &status); |
| 2084 if (dest[i]) { |
| 2085 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r
emainingChars, remaining16Length, &status); |
| 2086 } else { |
| 2087 UText remainingText = UTEXT_INITIALIZER; |
| 2088 utext_openUChars(&remainingText, remainingChars, remaini
ng16Length, &status); |
| 2089 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
| 2090 utext_close(&remainingText); |
| 2091 } |
| 2092 |
| 2093 uprv_free(remainingChars); |
| 2094 } |
| 2095 } |
| 2096 break; |
| 2097 } |
| 2098 if (find()) { |
| 2099 // We found another delimiter. Move everything from where we starte
d looking |
| 2100 // up until the start of the delimiter into the next output string. |
| 2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2102 if (dest[i]) { |
| 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2104 input->chunkContents+nextOutputStringStart, |
| 2105 (int32_t)(fMatchStart-nextOutputStringStart),
&status); |
| 2106 } else { |
| 2107 UText remainingText = UTEXT_INITIALIZER; |
| 2108 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
| 2109 fMatchStart-nextOutputStringStart, &status
); |
| 2110 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2111 utext_close(&remainingText); |
| 2112 } |
| 2113 } else { |
| 2114 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2115 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fMatchStart, NULL, 0, &lengthStatus); |
| 2116 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
| 2117 if (remainingChars == NULL) { |
| 2118 status = U_MEMORY_ALLOCATION_ERROR; |
| 2119 break; |
| 2120 } |
| 2121 utext_extract(input, nextOutputStringStart, fMatchStart, remaini
ngChars, remaining16Length+1, &status); |
| 2122 if (dest[i]) { |
| 2123 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
| 2124 } else { |
| 2125 UText remainingText = UTEXT_INITIALIZER; |
| 2126 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
| 2127 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2128 utext_close(&remainingText); |
| 2129 } |
| 2130 |
| 2131 uprv_free(remainingChars); |
| 2132 } |
| 2133 nextOutputStringStart = fMatchEnd; |
| 2134 |
| 2135 // If the delimiter pattern has capturing parentheses, the captured |
| 2136 // text goes out into the next n destination strings. |
| 2137 int32_t groupNum; |
| 2138 UBool lastGroupWasNullUText = FALSE; |
| 2139 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
| 2140 if (i==destCapacity-1) { |
| 2141 break; |
| 2142 } |
| 2143 i++; |
| 2144 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); |
| 2145 dest[i] = group(groupNum, dest[i], status); |
| 2146 } |
| 2147 |
| 2148 if (nextOutputStringStart == fActiveLimit) { |
| 2149 // The delimiter was at the end of the string. We're done. |
| 2150 break; |
| 2151 } else if (i == destCapacity-1) { |
| 2152 // We're out of capture groups, and the rest of the string is mo
re important |
| 2153 if (lastGroupWasNullUText) { |
| 2154 utext_close(dest[i]); |
| 2155 dest[i] = NULL; |
| 2156 } |
| 2157 } |
| 2158 |
| 2159 } |
| 2160 else |
| 2161 { |
| 2162 // We ran off the end of the input while looking for the next delimi
ter. |
| 2163 // All the remaining text goes into the current output string. |
| 2164 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2165 if (dest[i]) { |
| 2166 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2167 input->chunkContents+nextOutputStringStart, |
| 2168 (int32_t)(fActiveLimit-nextOutputStringStart),
&status); |
| 2169 } else { |
| 2170 UText remainingText = UTEXT_INITIALIZER; |
| 2171 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
| 2172 fActiveLimit-nextOutputStringStart, &status
); |
| 2173 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2174 utext_close(&remainingText); |
| 2175 } |
| 2176 } else { |
| 2177 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2178 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fActiveLimit, NULL, 0, &lengthStatus); |
| 2179 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
| 2180 if (remainingChars == NULL) { |
| 2181 status = U_MEMORY_ALLOCATION_ERROR; |
| 2182 break; |
| 2183 } |
| 2184 |
| 2185 utext_extract(input, nextOutputStringStart, fActiveLimit, remain
ingChars, remaining16Length+1, &status); |
| 2186 if (dest[i]) { |
| 2187 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
| 2188 } else { |
| 2189 UText remainingText = UTEXT_INITIALIZER; |
| 2190 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
| 2191 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2192 utext_close(&remainingText); |
| 2193 } |
| 2194 |
| 2195 uprv_free(remainingChars); |
| 2196 } |
| 2197 break; |
| 2198 } |
| 2199 if (U_FAILURE(status)) { |
| 2200 break; |
| 2201 } |
| 2202 } // end of for loop |
| 2203 return i+1; |
| 2204 } |
| 2205 |
| 2206 |
| 2207 //------------------------------------------------------------------------------
-- |
| 2208 // |
| 2209 // start |
| 2210 // |
| 2211 //------------------------------------------------------------------------------
-- |
| 2212 int32_t RegexMatcher::start(UErrorCode &status) const { |
| 2213 return start(0, status); |
| 2214 } |
| 2215 |
| 2216 int64_t RegexMatcher::start64(UErrorCode &status) const { |
| 2217 return start64(0, status); |
| 2218 } |
| 2219 |
| 2220 //------------------------------------------------------------------------------
-- |
| 2221 // |
| 2222 // start(int32_t group, UErrorCode &status) |
| 2223 // |
| 2224 //------------------------------------------------------------------------------
-- |
| 2225 |
| 2226 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { |
| 2227 if (U_FAILURE(status)) { |
| 2228 return -1; |
| 2229 } |
| 2230 if (U_FAILURE(fDeferredStatus)) { |
| 2231 status = fDeferredStatus; |
| 2232 return -1; |
| 2233 } |
| 2234 if (fMatch == FALSE) { |
| 2235 status = U_REGEX_INVALID_STATE; |
| 2236 return -1; |
| 2237 } |
| 2238 if (group < 0 || group > fPattern->fGroupMap->size()) { |
| 2239 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 2240 return -1; |
| 2241 } |
| 2242 int64_t s; |
| 2243 if (group == 0) { |
| 2244 s = fMatchStart; |
| 2245 } else { |
| 2246 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| 2247 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 2248 U_ASSERT(groupOffset >= 0); |
| 2249 s = fFrame->fExtra[groupOffset]; |
| 2250 } |
| 2251 |
| 2252 return s; |
| 2253 } |
| 2254 |
| 2255 |
| 2256 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { |
| 2257 return (int32_t)start64(group, status); |
| 2258 } |
| 2259 |
| 2260 //------------------------------------------------------------------------------
-- |
| 2261 // |
| 2262 // useAnchoringBounds |
| 2263 // |
| 2264 //------------------------------------------------------------------------------
-- |
| 2265 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { |
| 2266 fAnchoringBounds = b; |
| 2267 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); |
| 2268 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); |
| 2269 return *this; |
| 2270 } |
| 2271 |
| 2272 |
| 2273 //------------------------------------------------------------------------------
-- |
| 2274 // |
| 2275 // useTransparentBounds |
| 2276 // |
| 2277 //------------------------------------------------------------------------------
-- |
| 2278 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { |
| 2279 fTransparentBounds = b; |
| 2280 fLookStart = (fTransparentBounds ? 0 : fRegionStart); |
| 2281 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); |
| 2282 return *this; |
| 2283 } |
| 2284 |
| 2285 //------------------------------------------------------------------------------
-- |
| 2286 // |
| 2287 // setTimeLimit |
| 2288 // |
| 2289 //------------------------------------------------------------------------------
-- |
| 2290 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { |
| 2291 if (U_FAILURE(status)) { |
| 2292 return; |
| 2293 } |
| 2294 if (U_FAILURE(fDeferredStatus)) { |
| 2295 status = fDeferredStatus; |
| 2296 return; |
| 2297 } |
| 2298 if (limit < 0) { |
| 2299 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2300 return; |
| 2301 } |
| 2302 fTimeLimit = limit; |
| 2303 } |
| 2304 |
| 2305 |
| 2306 //------------------------------------------------------------------------------
-- |
| 2307 // |
| 2308 // getTimeLimit |
| 2309 // |
| 2310 //------------------------------------------------------------------------------
-- |
| 2311 int32_t RegexMatcher::getTimeLimit() const { |
| 2312 return fTimeLimit; |
| 2313 } |
| 2314 |
| 2315 |
| 2316 //------------------------------------------------------------------------------
-- |
| 2317 // |
| 2318 // setStackLimit |
| 2319 // |
| 2320 //------------------------------------------------------------------------------
-- |
| 2321 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { |
| 2322 if (U_FAILURE(status)) { |
| 2323 return; |
| 2324 } |
| 2325 if (U_FAILURE(fDeferredStatus)) { |
| 2326 status = fDeferredStatus; |
| 2327 return; |
| 2328 } |
| 2329 if (limit < 0) { |
| 2330 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2331 return; |
| 2332 } |
| 2333 |
| 2334 // Reset the matcher. This is needed here in case there is a current match |
| 2335 // whose final stack frame (containing the match results, pointed to by f
Frame) |
| 2336 // would be lost by resizing to a smaller stack size. |
| 2337 reset(); |
| 2338 |
| 2339 if (limit == 0) { |
| 2340 // Unlimited stack expansion |
| 2341 fStack->setMaxCapacity(0); |
| 2342 } else { |
| 2343 // Change the units of the limit from bytes to ints, and bump the size
up |
| 2344 // to be big enough to hold at least one stack frame for the pattern, |
| 2345 // if it isn't there already. |
| 2346 int32_t adjustedLimit = limit / sizeof(int32_t); |
| 2347 if (adjustedLimit < fPattern->fFrameSize) { |
| 2348 adjustedLimit = fPattern->fFrameSize; |
| 2349 } |
| 2350 fStack->setMaxCapacity(adjustedLimit); |
| 2351 } |
| 2352 fStackLimit = limit; |
| 2353 } |
| 2354 |
| 2355 |
| 2356 //------------------------------------------------------------------------------
-- |
| 2357 // |
| 2358 // getStackLimit |
| 2359 // |
| 2360 //------------------------------------------------------------------------------
-- |
| 2361 int32_t RegexMatcher::getStackLimit() const { |
| 2362 return fStackLimit; |
| 2363 } |
| 2364 |
| 2365 |
| 2366 //------------------------------------------------------------------------------
-- |
| 2367 // |
| 2368 // setMatchCallback |
| 2369 // |
| 2370 //------------------------------------------------------------------------------
-- |
| 2371 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, |
| 2372 const void *context, |
| 2373 UErrorCode &status) { |
| 2374 if (U_FAILURE(status)) { |
| 2375 return; |
| 2376 } |
| 2377 fCallbackFn = callback; |
| 2378 fCallbackContext = context; |
| 2379 } |
| 2380 |
| 2381 |
| 2382 //------------------------------------------------------------------------------
-- |
| 2383 // |
| 2384 // getMatchCallback |
| 2385 // |
| 2386 //------------------------------------------------------------------------------
-- |
| 2387 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, |
| 2388 const void *&context, |
| 2389 UErrorCode &status) { |
| 2390 if (U_FAILURE(status)) { |
| 2391 return; |
| 2392 } |
| 2393 callback = fCallbackFn; |
| 2394 context = fCallbackContext; |
| 2395 } |
| 2396 |
| 2397 |
| 2398 //------------------------------------------------------------------------------
-- |
| 2399 // |
| 2400 // setMatchCallback |
| 2401 // |
| 2402 //------------------------------------------------------------------------------
-- |
| 2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *call
back, |
| 2404 const void
*context, |
| 2405 UErrorCode
&status) { |
| 2406 if (U_FAILURE(status)) { |
| 2407 return; |
| 2408 } |
| 2409 fFindProgressCallbackFn = callback; |
| 2410 fFindProgressCallbackContext = context; |
| 2411 } |
| 2412 |
| 2413 |
| 2414 //------------------------------------------------------------------------------
-- |
| 2415 // |
| 2416 // getMatchCallback |
| 2417 // |
| 2418 //------------------------------------------------------------------------------
-- |
| 2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callb
ack, |
| 2420 const void *&
context, |
| 2421 UErrorCode &s
tatus) { |
| 2422 if (U_FAILURE(status)) { |
| 2423 return; |
| 2424 } |
| 2425 callback = fFindProgressCallbackFn; |
| 2426 context = fFindProgressCallbackContext; |
| 2427 } |
| 2428 |
| 2429 |
| 2430 //==============================================================================
== |
| 2431 // |
| 2432 // Code following this point in this file is the internal |
| 2433 // Match Engine Implementation. |
| 2434 // |
| 2435 //==============================================================================
== |
| 2436 |
| 2437 |
| 2438 //------------------------------------------------------------------------------
-- |
| 2439 // |
| 2440 // resetStack |
| 2441 // Discard any previous contents of the state save stack, and initiali
ze a |
| 2442 // new stack frame to all -1. The -1s are needed for capture group li
mits, |
| 2443 // where they indicate that a group has not yet matched anything. |
| 2444 //------------------------------------------------------------------------------
-- |
| 2445 REStackFrame *RegexMatcher::resetStack() { |
| 2446 // Discard any previous contents of the state save stack, and initialize a |
| 2447 // new stack frame with all -1 data. The -1s are needed for capture group
limits, |
| 2448 // where they indicate that a group has not yet matched anything. |
| 2449 fStack->removeAllElements(); |
| 2450 |
| 2451 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame
Size, fDeferredStatus); |
| 2452 int32_t i; |
| 2453 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
| 2454 iFrame->fExtra[i] = -1; |
| 2455 } |
| 2456 return iFrame; |
| 2457 } |
| 2458 |
| 2459 |
| 2460 |
| 2461 //------------------------------------------------------------------------------
-- |
| 2462 // |
| 2463 // isWordBoundary |
| 2464 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 |
| 2465 // For us, |
| 2466 // If the current char is a combining mark, |
| 2467 // \b is FALSE. |
| 2468 // Else Scan backwards to the first non-combining char. |
| 2469 // We are at a boundary if the this char and the orig
inal chars are |
| 2470 // opposite in membership in \w set |
| 2471 // |
| 2472 // parameters: pos - the current position in the input buffer |
| 2473 // |
| 2474 // TODO: double-check edge cases at region boundaries. |
| 2475 // |
| 2476 //------------------------------------------------------------------------------
-- |
| 2477 UBool RegexMatcher::isWordBoundary(int64_t pos) { |
| 2478 UBool isBoundary = FALSE; |
| 2479 UBool cIsWord = FALSE; |
| 2480 |
| 2481 if (pos >= fLookLimit) { |
| 2482 fHitEnd = TRUE; |
| 2483 } else { |
| 2484 // Determine whether char c at current position is a member of the word
set of chars. |
| 2485 // If we're off the end of the string, behave as though we're not at a w
ord char. |
| 2486 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 2487 UChar32 c = UTEXT_CURRENT32(fInputText); |
| 2488 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
| 2489 // Current char is a combining one. Not a boundary. |
| 2490 return FALSE; |
| 2491 } |
| 2492 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| 2493 } |
| 2494 |
| 2495 // Back up until we come to a non-combining char, determine whether |
| 2496 // that char is a word char. |
| 2497 UBool prevCIsWord = FALSE; |
| 2498 for (;;) { |
| 2499 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { |
| 2500 break; |
| 2501 } |
| 2502 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); |
| 2503 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| 2504 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| 2505 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
| 2506 break; |
| 2507 } |
| 2508 } |
| 2509 isBoundary = cIsWord ^ prevCIsWord; |
| 2510 return isBoundary; |
| 2511 } |
| 2512 |
| 2513 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { |
| 2514 UBool isBoundary = FALSE; |
| 2515 UBool cIsWord = FALSE; |
| 2516 |
| 2517 const UChar *inputBuf = fInputText->chunkContents; |
| 2518 |
| 2519 if (pos >= fLookLimit) { |
| 2520 fHitEnd = TRUE; |
| 2521 } else { |
| 2522 // Determine whether char c at current position is a member of the word
set of chars. |
| 2523 // If we're off the end of the string, behave as though we're not at a w
ord char. |
| 2524 UChar32 c; |
| 2525 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); |
| 2526 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
| 2527 // Current char is a combining one. Not a boundary. |
| 2528 return FALSE; |
| 2529 } |
| 2530 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| 2531 } |
| 2532 |
| 2533 // Back up until we come to a non-combining char, determine whether |
| 2534 // that char is a word char. |
| 2535 UBool prevCIsWord = FALSE; |
| 2536 for (;;) { |
| 2537 if (pos <= fLookStart) { |
| 2538 break; |
| 2539 } |
| 2540 UChar32 prevChar; |
| 2541 U16_PREV(inputBuf, fLookStart, pos, prevChar); |
| 2542 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| 2543 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| 2544 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
| 2545 break; |
| 2546 } |
| 2547 } |
| 2548 isBoundary = cIsWord ^ prevCIsWord; |
| 2549 return isBoundary; |
| 2550 } |
| 2551 |
| 2552 //------------------------------------------------------------------------------
-- |
| 2553 // |
| 2554 // isUWordBoundary |
| 2555 // |
| 2556 // Test for a word boundary using RBBI word break. |
| 2557 // |
| 2558 // parameters: pos - the current position in the input buffer |
| 2559 // |
| 2560 //------------------------------------------------------------------------------
-- |
| 2561 UBool RegexMatcher::isUWordBoundary(int64_t pos) { |
| 2562 UBool returnVal = FALSE; |
| 2563 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 2564 |
| 2565 // If we haven't yet created a break iterator for this matcher, do it now. |
| 2566 if (fWordBreakItr == NULL) { |
| 2567 fWordBreakItr = |
| 2568 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::
getEnglish(), fDeferredStatus); |
| 2569 if (U_FAILURE(fDeferredStatus)) { |
| 2570 return FALSE; |
| 2571 } |
| 2572 fWordBreakItr->setText(fInputText, fDeferredStatus); |
| 2573 } |
| 2574 |
| 2575 if (pos >= fLookLimit) { |
| 2576 fHitEnd = TRUE; |
| 2577 returnVal = TRUE; // With Unicode word rules, only positions within th
e interior of "real" |
| 2578 // words are not boundaries. All non-word chars
stand by themselves, |
| 2579 // with word boundaries on both sides. |
| 2580 } else { |
| 2581 if (!UTEXT_USES_U16(fInputText)) { |
| 2582 // !!!: Would like a better way to do this! |
| 2583 UErrorCode status = U_ZERO_ERROR; |
| 2584 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); |
| 2585 } |
| 2586 returnVal = fWordBreakItr->isBoundary((int32_t)pos); |
| 2587 } |
| 2588 #endif |
| 2589 return returnVal; |
| 2590 } |
| 2591 |
| 2592 //------------------------------------------------------------------------------
-- |
| 2593 // |
| 2594 // IncrementTime This function is called once each TIMER_INITIAL_VALUE sta
te |
| 2595 // saves. Increment the "time" counter, and call the |
| 2596 // user callback function if there is one installed. |
| 2597 // |
| 2598 // If the match operation needs to be aborted, either for a
time-out |
| 2599 // or because the user callback asked for it, just set an er
ror status. |
| 2600 // The engine will pick that up and stop in its outer loop. |
| 2601 // |
| 2602 //------------------------------------------------------------------------------
-- |
| 2603 void RegexMatcher::IncrementTime(UErrorCode &status) { |
| 2604 fTickCounter = TIMER_INITIAL_VALUE; |
| 2605 fTime++; |
| 2606 if (fCallbackFn != NULL) { |
| 2607 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { |
| 2608 status = U_REGEX_STOPPED_BY_CALLER; |
| 2609 return; |
| 2610 } |
| 2611 } |
| 2612 if (fTimeLimit > 0 && fTime >= fTimeLimit) { |
| 2613 status = U_REGEX_TIME_OUT; |
| 2614 } |
| 2615 } |
| 2616 |
| 2617 //------------------------------------------------------------------------------
-- |
| 2618 // |
| 2619 // ReportFindProgress This function is called once for each advance in the
target |
| 2620 // string from the find() function, and calls the user
progress callback |
| 2621 // function if there is one installed. |
| 2622 // |
| 2623 // NOTE: |
| 2624 // |
| 2625 // If the match operation needs to be aborted because t
he user |
| 2626 // callback asked for it, just set an error status. |
| 2627 // The engine will pick that up and stop in its outer l
oop. |
| 2628 // |
| 2629 //------------------------------------------------------------------------------
-- |
| 2630 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { |
| 2631 if (fFindProgressCallbackFn != NULL) { |
| 2632 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex)
== FALSE) { |
| 2633 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; |
| 2634 return FALSE; |
| 2635 } |
| 2636 } |
| 2637 return TRUE; |
| 2638 } |
| 2639 |
| 2640 //------------------------------------------------------------------------------
-- |
| 2641 // |
| 2642 // StateSave |
| 2643 // Make a new stack frame, initialized as a copy of the current stack fram
e. |
| 2644 // Set the pattern index in the original stack frame from the operand valu
e |
| 2645 // in the opcode. Execution of the engine continues with the state in |
| 2646 // the newly created stack frame |
| 2647 // |
| 2648 // Note that reserveBlock() may grow the stack, resulting in the |
| 2649 // whole thing being relocated in memory. |
| 2650 // |
| 2651 // Parameters: |
| 2652 // fp The top frame pointer when called. At return, a new |
| 2653 // fame will be present |
| 2654 // savePatIdx An index into the compiled pattern. Goes into the origina
l |
| 2655 // (not new) frame. If execution ever back-tracks out of the |
| 2656 // new frame, this will be where we continue from in the patt
ern. |
| 2657 // Return |
| 2658 // The new frame pointer. |
| 2659 // |
| 2660 //------------------------------------------------------------------------------
-- |
| 2661 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
x, UErrorCode &status) { |
| 2662 // push storage for a new frame. |
| 2663 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
| 2664 if (newFP == NULL) { |
| 2665 // Failure on attempted stack expansion. |
| 2666 // Stack function set some other error code, change it to a more |
| 2667 // specific one for regular expressions. |
| 2668 status = U_REGEX_STACK_OVERFLOW; |
| 2669 // We need to return a writable stack frame, so just return the |
| 2670 // previous frame. The match operation will stop quickly |
| 2671 // because of the error status, after which the frame will never |
| 2672 // be looked at again. |
| 2673 return fp; |
| 2674 } |
| 2675 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. |
| 2676 |
| 2677 // New stack frame = copy of old top frame. |
| 2678 int64_t *source = (int64_t *)fp; |
| 2679 int64_t *dest = newFP; |
| 2680 for (;;) { |
| 2681 *dest++ = *source++; |
| 2682 if (source == newFP) { |
| 2683 break; |
| 2684 } |
| 2685 } |
| 2686 |
| 2687 fTickCounter--; |
| 2688 if (fTickCounter <= 0) { |
| 2689 IncrementTime(status); // Re-initializes fTickCounter |
| 2690 } |
| 2691 fp->fPatIdx = savePatIdx; |
| 2692 return (REStackFrame *)newFP; |
| 2693 } |
| 2694 |
| 2695 |
| 2696 //------------------------------------------------------------------------------
-- |
| 2697 // |
| 2698 // MatchAt This is the actual matching engine. |
| 2699 // |
| 2700 // startIdx: begin matching a this index. |
| 2701 // toEnd: if true, match must extend to end of the input
region |
| 2702 // |
| 2703 //------------------------------------------------------------------------------
-- |
| 2704 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
| 2705 UBool isMatch = FALSE; // True if the we have a match. |
| 2706 |
| 2707 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara
cter matches for searching backwards |
| 2708 |
| 2709 int32_t op; // Operation from the compiled pattern, s
plit into |
| 2710 int32_t opType; // the opcode |
| 2711 int32_t opValue; // and the operand value. |
| 2712 |
| 2713 #ifdef REGEX_RUN_DEBUG |
| 2714 if (fTraceDebug) |
| 2715 { |
| 2716 printf("MatchAt(startIdx=%ld)\n", startIdx); |
| 2717 printf("Original Pattern: "); |
| 2718 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| 2719 while (c != U_SENTINEL) { |
| 2720 if (c<32 || c>256) { |
| 2721 c = '.'; |
| 2722 } |
| 2723 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); |
| 2724 |
| 2725 c = UTEXT_NEXT32(fPattern->fPattern); |
| 2726 } |
| 2727 printf("\n"); |
| 2728 printf("Input String: "); |
| 2729 c = utext_next32From(fInputText, 0); |
| 2730 while (c != U_SENTINEL) { |
| 2731 if (c<32 || c>256) { |
| 2732 c = '.'; |
| 2733 } |
| 2734 printf("%c", c); |
| 2735 |
| 2736 c = UTEXT_NEXT32(fInputText); |
| 2737 } |
| 2738 printf("\n"); |
| 2739 printf("\n"); |
| 2740 } |
| 2741 #endif |
| 2742 |
| 2743 if (U_FAILURE(status)) { |
| 2744 return; |
| 2745 } |
| 2746 |
| 2747 // Cache frequently referenced items from the compiled pattern |
| 2748 // |
| 2749 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| 2750 |
| 2751 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| 2752 UVector *sets = fPattern->fSets; |
| 2753 |
| 2754 fFrameSize = fPattern->fFrameSize; |
| 2755 REStackFrame *fp = resetStack(); |
| 2756 |
| 2757 fp->fPatIdx = 0; |
| 2758 fp->fInputIdx = startIdx; |
| 2759 |
| 2760 // Zero out the pattern's static data |
| 2761 int32_t i; |
| 2762 for (i = 0; i<fPattern->fDataSize; i++) { |
| 2763 fData[i] = 0; |
| 2764 } |
| 2765 |
| 2766 // |
| 2767 // Main loop for interpreting the compiled pattern. |
| 2768 // One iteration of the loop per pattern operation performed. |
| 2769 // |
| 2770 for (;;) { |
| 2771 #if 0 |
| 2772 if (_heapchk() != _HEAPOK) { |
| 2773 fprintf(stderr, "Heap Trouble\n"); |
| 2774 } |
| 2775 #endif |
| 2776 |
| 2777 op = (int32_t)pat[fp->fPatIdx]; |
| 2778 opType = URX_TYPE(op); |
| 2779 opValue = URX_VAL(op); |
| 2780 #ifdef REGEX_RUN_DEBUG |
| 2781 if (fTraceDebug) { |
| 2782 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2783 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp-
>fInputIdx, |
| 2784 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(),
fActiveLimit); |
| 2785 fPattern->dumpOp(fp->fPatIdx); |
| 2786 } |
| 2787 #endif |
| 2788 fp->fPatIdx++; |
| 2789 |
| 2790 switch (opType) { |
| 2791 |
| 2792 |
| 2793 case URX_NOP: |
| 2794 break; |
| 2795 |
| 2796 |
| 2797 case URX_BACKTRACK: |
| 2798 // Force a backtrack. In some circumstances, the pattern compiler |
| 2799 // will notice that the pattern can't possibly match anything, and
will |
| 2800 // emit one of these at that point. |
| 2801 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2802 break; |
| 2803 |
| 2804 |
| 2805 case URX_ONECHAR: |
| 2806 if (fp->fInputIdx < fActiveLimit) { |
| 2807 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2808 UChar32 c = UTEXT_NEXT32(fInputText); |
| 2809 if (c == opValue) { |
| 2810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 2811 break; |
| 2812 } |
| 2813 } else { |
| 2814 fHitEnd = TRUE; |
| 2815 } |
| 2816 |
| 2817 #ifdef REGEX_SMART_BACKTRACKING |
| 2818 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize)
{ |
| 2819 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFra
meSize); |
| 2820 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInpu
tIdx <= prevFrame->fInputIdx) { |
| 2821 UBool success = FALSE; |
| 2822 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| 2823 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex)
{ |
| 2824 if (c == opValue) { |
| 2825 success = TRUE; |
| 2826 break; |
| 2827 } else if (c == U_SENTINEL) { |
| 2828 break; |
| 2829 } |
| 2830 c = UTEXT_PREVIOUS32(fInputText); |
| 2831 } |
| 2832 if (success) { |
| 2833 fHitEnd = FALSE; |
| 2834 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2835 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 2836 if (fp->fInputIdx > backSearchIndex) { |
| 2837 fp = StateSave(fp, fp->fPatIdx, status); |
| 2838 } |
| 2839 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 2840 break; |
| 2841 } |
| 2842 } |
| 2843 } |
| 2844 #endif |
| 2845 |
| 2846 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2847 break; |
| 2848 |
| 2849 |
| 2850 case URX_STRING: |
| 2851 { |
| 2852 // Test input against a literal string. |
| 2853 // Strings require two slots in the compiled pattern, one for th
e |
| 2854 // offset to the string text, and one for the length. |
| 2855 int32_t stringStartIdx = opValue; |
| 2856 int32_t stringLen; |
| 2857 |
| 2858 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
| 2859 fp->fPatIdx++; |
| 2860 opType = URX_TYPE(op); |
| 2861 stringLen = URX_VAL(op); |
| 2862 U_ASSERT(opType == URX_STRING_LEN); |
| 2863 U_ASSERT(stringLen >= 2); |
| 2864 |
| 2865 const UChar *patternChars = litText+stringStartIdx; |
| 2866 const UChar *patternEnd = patternChars+stringLen; |
| 2867 |
| 2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2869 UChar32 c; |
| 2870 UBool success = TRUE; |
| 2871 |
| 2872 while (patternChars < patternEnd && success) { |
| 2873 c = UTEXT_NEXT32(fInputText); |
| 2874 |
| 2875 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= f
ActiveLimit) { |
| 2876 if (U_IS_BMP(c)) { |
| 2877 success = (*patternChars == c); |
| 2878 patternChars += 1; |
| 2879 } else if (patternChars+1 < patternEnd) { |
| 2880 success = (*patternChars == U16_LEAD(c) && *(pattern
Chars+1) == U16_TRAIL(c)); |
| 2881 patternChars += 2; |
| 2882 } |
| 2883 } else { |
| 2884 success = FALSE; |
| 2885 fHitEnd = TRUE; // TODO: See ticket 6074 |
| 2886 } |
| 2887 } |
| 2888 |
| 2889 if (success) { |
| 2890 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 2891 } else { |
| 2892 #ifdef REGEX_SMART_BACKTRACKING |
| 2893 if (fp->fInputIdx > backSearchIndex && fStack->size()) { |
| 2894 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFr
ame(fFrameSize); |
| 2895 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && f
p->fInputIdx <= prevFrame->fInputIdx) { |
| 2896 // Reset to last start point |
| 2897 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2898 patternChars = litText+stringStartIdx; |
| 2899 |
| 2900 // Search backwards for a possible start |
| 2901 do { |
| 2902 c = UTEXT_PREVIOUS32(fInputText); |
| 2903 if (c == U_SENTINEL) { |
| 2904 break; |
| 2905 } else if ((U_IS_BMP(c) && *patternChars == c) |
| |
| 2906 (*patternChars == U16_LEAD(c) && *(patternCh
ars+1) == U16_TRAIL(c))) { |
| 2907 success = TRUE; |
| 2908 break; |
| 2909 } |
| 2910 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSea
rchIndex); |
| 2911 |
| 2912 // And try again |
| 2913 if (success) { |
| 2914 fp = (REStackFrame *)fStack->popFrame(fFrameSize
); |
| 2915 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)
; |
| 2916 if (fp->fInputIdx > backSearchIndex) { |
| 2917 fp = StateSave(fp, fp->fPatIdx, status); |
| 2918 } |
| 2919 fp->fPatIdx++; // Skip the LOOP_C, we just did t
hat |
| 2920 break; |
| 2921 } |
| 2922 } |
| 2923 } |
| 2924 #endif |
| 2925 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2926 } |
| 2927 } |
| 2928 break; |
| 2929 |
| 2930 |
| 2931 case URX_STATE_SAVE: |
| 2932 fp = StateSave(fp, opValue, status); |
| 2933 break; |
| 2934 |
| 2935 |
| 2936 case URX_END: |
| 2937 // The match loop will exit via this path on a successful match, |
| 2938 // when we reach the end of the pattern. |
| 2939 if (toEnd && fp->fInputIdx != fActiveLimit) { |
| 2940 // The pattern matched, but not to the end of input. Try some m
ore. |
| 2941 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2942 break; |
| 2943 } |
| 2944 isMatch = TRUE; |
| 2945 goto breakFromLoop; |
| 2946 |
| 2947 // Start and End Capture stack frame variables are laid out out like thi
s: |
| 2948 // fp->fExtra[opValue] - The start of a completed capture group |
| 2949 // opValue+1 - The end of a completed capture group |
| 2950 // opValue+2 - the start of a capture group whose end |
| 2951 // has not yet been reached (and might not
ever be). |
| 2952 case URX_START_CAPTURE: |
| 2953 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 2954 fp->fExtra[opValue+2] = fp->fInputIdx; |
| 2955 break; |
| 2956 |
| 2957 |
| 2958 case URX_END_CAPTURE: |
| 2959 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 2960 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. |
| 2961 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. |
| 2962 fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| 2963 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| 2964 break; |
| 2965 |
| 2966 |
| 2967 case URX_DOLLAR: // $, test for End of line |
| 2968 // or for position before new lin
e at end of input |
| 2969 { |
| 2970 if (fp->fInputIdx >= fAnchorLimit) { |
| 2971 // We really are at the end of input. Success. |
| 2972 fHitEnd = TRUE; |
| 2973 fRequireEnd = TRUE; |
| 2974 break; |
| 2975 } |
| 2976 |
| 2977 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2978 |
| 2979 // If we are positioned just before a new-line that is located a
t the |
| 2980 // end of input, succeed. |
| 2981 UChar32 c = UTEXT_NEXT32(fInputText); |
| 2982 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| 2983 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202
9) { |
| 2984 // If not in the middle of a CR/LF sequence |
| 2985 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_P
REVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
| 2986 // At new-line at end of input. Success |
| 2987 fHitEnd = TRUE; |
| 2988 fRequireEnd = TRUE; |
| 2989 |
| 2990 break; |
| 2991 } |
| 2992 } |
| 2993 } else { |
| 2994 UChar32 nextC = UTEXT_NEXT32(fInputText); |
| 2995 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu
tText) >= fAnchorLimit) { |
| 2996 fHitEnd = TRUE; |
| 2997 fRequireEnd = TRUE; |
| 2998 break; // At CR/LF at end of inp
ut. Success |
| 2999 } |
| 3000 } |
| 3001 |
| 3002 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3003 } |
| 3004 break; |
| 3005 |
| 3006 |
| 3007 case URX_DOLLAR_D: // $, test for End of Line, in UN
IX_LINES mode. |
| 3008 if (fp->fInputIdx >= fAnchorLimit) { |
| 3009 // Off the end of input. Success. |
| 3010 fHitEnd = TRUE; |
| 3011 fRequireEnd = TRUE; |
| 3012 break; |
| 3013 } else { |
| 3014 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3015 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3016 // Either at the last character of input, or off the end. |
| 3017 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimi
t) { |
| 3018 fHitEnd = TRUE; |
| 3019 fRequireEnd = TRUE; |
| 3020 break; |
| 3021 } |
| 3022 } |
| 3023 |
| 3024 // Not at end of input. Back-track out. |
| 3025 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3026 break; |
| 3027 |
| 3028 |
| 3029 case URX_DOLLAR_M: // $, test for End of line in multi-
line mode |
| 3030 { |
| 3031 if (fp->fInputIdx >= fAnchorLimit) { |
| 3032 // We really are at the end of input. Success. |
| 3033 fHitEnd = TRUE; |
| 3034 fRequireEnd = TRUE; |
| 3035 break; |
| 3036 } |
| 3037 // If we are positioned just before a new-line, succeed. |
| 3038 // It makes no difference where the new-line is within the inpu
t. |
| 3039 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3040 UChar32 c = UTEXT_CURRENT32(fInputText); |
| 3041 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
| 3042 // At a line end, except for the odd chance of being in th
e middle of a CR/LF sequence |
| 3043 // In multi-line mode, hitting a new-line just before the
end of input does not |
| 3044 // set the hitEnd or requireEnd flags |
| 3045 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVI
OUS32(fInputText)==0x0d)) { |
| 3046 break; |
| 3047 } |
| 3048 } |
| 3049 // not at a new line. Fail. |
| 3050 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3051 } |
| 3052 break; |
| 3053 |
| 3054 |
| 3055 case URX_DOLLAR_MD: // $, test for End of line in multi
-line and UNIX_LINES mode |
| 3056 { |
| 3057 if (fp->fInputIdx >= fAnchorLimit) { |
| 3058 // We really are at the end of input. Success. |
| 3059 fHitEnd = TRUE; |
| 3060 fRequireEnd = TRUE; // Java set requireEnd in this case, e
ven though |
| 3061 break; // adding a new-line would not lose
the match. |
| 3062 } |
| 3063 // If we are not positioned just before a new-line, the test fa
ils; backtrack out. |
| 3064 // It makes no difference where the new-line is within the inpu
t. |
| 3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3066 if (UTEXT_CURRENT32(fInputText) != 0x0a) { |
| 3067 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3068 } |
| 3069 } |
| 3070 break; |
| 3071 |
| 3072 |
| 3073 case URX_CARET: // ^, test for start of line |
| 3074 if (fp->fInputIdx != fAnchorStart) { |
| 3075 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3076 } |
| 3077 break; |
| 3078 |
| 3079 |
| 3080 case URX_CARET_M: // ^, test for start of line in muli
t-line mode |
| 3081 { |
| 3082 if (fp->fInputIdx == fAnchorStart) { |
| 3083 // We are at the start input. Success. |
| 3084 break; |
| 3085 } |
| 3086 // Check whether character just before the current pos is a new-l
ine |
| 3087 // unless we are at the end of input |
| 3088 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3089 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| 3090 if ((fp->fInputIdx < fAnchorLimit) && |
| 3091 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
| 3092 // It's a new-line. ^ is true. Success. |
| 3093 // TODO: what should be done with positions between a CR an
d LF? |
| 3094 break; |
| 3095 } |
| 3096 // Not at the start of a line. Fail. |
| 3097 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3098 } |
| 3099 break; |
| 3100 |
| 3101 |
| 3102 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line
+ Unix-line mode |
| 3103 { |
| 3104 U_ASSERT(fp->fInputIdx >= fAnchorStart); |
| 3105 if (fp->fInputIdx <= fAnchorStart) { |
| 3106 // We are at the start input. Success. |
| 3107 break; |
| 3108 } |
| 3109 // Check whether character just before the current pos is a new-l
ine |
| 3110 U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
| 3111 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3112 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| 3113 if (c != 0x0a) { |
| 3114 // Not at the start of a line. Back-track out. |
| 3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3116 } |
| 3117 } |
| 3118 break; |
| 3119 |
| 3120 case URX_BACKSLASH_B: // Test for word boundaries |
| 3121 { |
| 3122 UBool success = isWordBoundary(fp->fInputIdx); |
| 3123 success ^= (opValue != 0); // flip sense for \B |
| 3124 if (!success) { |
| 3125 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3126 } |
| 3127 } |
| 3128 break; |
| 3129 |
| 3130 |
| 3131 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le |
| 3132 { |
| 3133 UBool success = isUWordBoundary(fp->fInputIdx); |
| 3134 success ^= (opValue != 0); // flip sense for \B |
| 3135 if (!success) { |
| 3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3137 } |
| 3138 } |
| 3139 break; |
| 3140 |
| 3141 |
| 3142 case URX_BACKSLASH_D: // Test for decimal digit |
| 3143 { |
| 3144 if (fp->fInputIdx >= fActiveLimit) { |
| 3145 fHitEnd = TRUE; |
| 3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3147 break; |
| 3148 } |
| 3149 |
| 3150 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3151 |
| 3152 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3153 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. |
| 3154 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
| 3155 success ^= (opValue != 0); // flip sense for \D |
| 3156 if (success) { |
| 3157 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3158 } else { |
| 3159 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3160 } |
| 3161 } |
| 3162 break; |
| 3163 |
| 3164 |
| 3165 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
| 3166 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
| 3167 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3168 } |
| 3169 break; |
| 3170 |
| 3171 |
| 3172 case URX_BACKSLASH_X: |
| 3173 // Match a Grapheme, as defined by Unicode TR 29. |
| 3174 // Differs slightly from Perl, which consumes combining marks indep
endently |
| 3175 // of context. |
| 3176 { |
| 3177 |
| 3178 // Fail if at end of input |
| 3179 if (fp->fInputIdx >= fActiveLimit) { |
| 3180 fHitEnd = TRUE; |
| 3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3182 break; |
| 3183 } |
| 3184 |
| 3185 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3186 |
| 3187 // Examine (and consume) the current char. |
| 3188 // Dispatch into a little state machine, based on the char. |
| 3189 UChar32 c; |
| 3190 c = UTEXT_NEXT32(fInputText); |
| 3191 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3192 UnicodeSet **sets = fPattern->fStaticSets; |
| 3193 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
| 3194 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
| 3195 if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| 3196 if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| 3197 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| 3198 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 3199 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 3200 goto GC_Extend; |
| 3201 |
| 3202 |
| 3203 |
| 3204 GC_L: |
| 3205 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 3206 c = UTEXT_NEXT32(fInputText); |
| 3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3208 if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| 3209 if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| 3210 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| 3211 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 3212 UTEXT_PREVIOUS32(fInputText); |
| 3213 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3214 goto GC_Extend; |
| 3215 |
| 3216 GC_V: |
| 3217 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 3218 c = UTEXT_NEXT32(fInputText); |
| 3219 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3220 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 3221 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 3222 UTEXT_PREVIOUS32(fInputText); |
| 3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3224 goto GC_Extend; |
| 3225 |
| 3226 GC_T: |
| 3227 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 3228 c = UTEXT_NEXT32(fInputText); |
| 3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3230 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 3231 UTEXT_PREVIOUS32(fInputText); |
| 3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3233 goto GC_Extend; |
| 3234 |
| 3235 GC_Extend: |
| 3236 // Combining characters are consumed here |
| 3237 for (;;) { |
| 3238 if (fp->fInputIdx >= fActiveLimit) { |
| 3239 break; |
| 3240 } |
| 3241 c = UTEXT_CURRENT32(fInputText); |
| 3242 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| 3243 break; |
| 3244 } |
| 3245 UTEXT_NEXT32(fInputText); |
| 3246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3247 } |
| 3248 goto GC_Done; |
| 3249 |
| 3250 GC_Control: |
| 3251 // Most control chars stand alone (don't combine with combining
chars), |
| 3252 // except for that CR/LF sequence is a single grapheme cluster
. |
| 3253 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32
(fInputText) == 0x0a) { |
| 3254 c = UTEXT_NEXT32(fInputText); |
| 3255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3256 } |
| 3257 |
| 3258 GC_Done: |
| 3259 if (fp->fInputIdx >= fActiveLimit) { |
| 3260 fHitEnd = TRUE; |
| 3261 } |
| 3262 break; |
| 3263 } |
| 3264 |
| 3265 |
| 3266 |
| 3267 |
| 3268 case URX_BACKSLASH_Z: // Test for end of Input |
| 3269 if (fp->fInputIdx < fAnchorLimit) { |
| 3270 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3271 } else { |
| 3272 fHitEnd = TRUE; |
| 3273 fRequireEnd = TRUE; |
| 3274 } |
| 3275 break; |
| 3276 |
| 3277 |
| 3278 |
| 3279 case URX_STATIC_SETREF: |
| 3280 { |
| 3281 // Test input character against one of the predefined sets |
| 3282 // (Word Characters, for example) |
| 3283 // The high bit of the op value is a flag for the match polarity
. |
| 3284 // 0: success if input char is in set. |
| 3285 // 1: success if input char is not in set. |
| 3286 if (fp->fInputIdx >= fActiveLimit) { |
| 3287 fHitEnd = TRUE; |
| 3288 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3289 break; |
| 3290 } |
| 3291 |
| 3292 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| 3293 opValue &= ~URX_NEG_SET; |
| 3294 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 3295 |
| 3296 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3297 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3298 if (c < 256) { |
| 3299 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 3300 if (s8->contains(c)) { |
| 3301 success = !success; |
| 3302 } |
| 3303 } else { |
| 3304 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 3305 if (s->contains(c)) { |
| 3306 success = !success; |
| 3307 } |
| 3308 } |
| 3309 if (success) { |
| 3310 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3311 } else { |
| 3312 // the character wasn't in the set. |
| 3313 #ifdef REGEX_SMART_BACKTRACKING |
| 3314 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFra
meSize) { |
| 3315 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFr
ame(fFrameSize); |
| 3316 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && f
p->fInputIdx <= prevFrame->fInputIdx) { |
| 3317 // Try to find it, backwards |
| 3318 UTEXT_PREVIOUS32(fInputText); // skip the first char
acter we tried |
| 3319 success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
// reset |
| 3320 do { |
| 3321 c = UTEXT_PREVIOUS32(fInputText); |
| 3322 if (c == U_SENTINEL) { |
| 3323 break; |
| 3324 } else if (c < 256) { |
| 3325 Regex8BitSet *s8 = &fPattern->fStaticSets8[o
pValue]; |
| 3326 if (s8->contains(c)) { |
| 3327 success = !success; |
| 3328 } |
| 3329 } else { |
| 3330 const UnicodeSet *s = fPattern->fStaticSets[
opValue]; |
| 3331 if (s->contains(c)) { |
| 3332 success = !success; |
| 3333 } |
| 3334 } |
| 3335 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSea
rchIndex && !success); |
| 3336 |
| 3337 if (success && c != U_SENTINEL) { |
| 3338 fp = (REStackFrame *)fStack->popFrame(fFrameSize
); |
| 3339 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText)
; |
| 3340 if (fp->fInputIdx > backSearchIndex) { |
| 3341 fp = StateSave(fp, fp->fPatIdx, status); |
| 3342 } |
| 3343 fp->fPatIdx++; // Skip the LOOP_C, we just did t
hat |
| 3344 break; |
| 3345 } |
| 3346 } |
| 3347 } |
| 3348 #endif |
| 3349 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3350 } |
| 3351 } |
| 3352 break; |
| 3353 |
| 3354 |
| 3355 case URX_STAT_SETREF_N: |
| 3356 { |
| 3357 // Test input character for NOT being a member of one of |
| 3358 // the predefined sets (Word Characters, for example) |
| 3359 if (fp->fInputIdx >= fActiveLimit) { |
| 3360 fHitEnd = TRUE; |
| 3361 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3362 break; |
| 3363 } |
| 3364 |
| 3365 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 3366 |
| 3367 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3368 |
| 3369 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3370 if (c < 256) { |
| 3371 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 3372 if (s8->contains(c) == FALSE) { |
| 3373 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3374 break; |
| 3375 } |
| 3376 } else { |
| 3377 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 3378 if (s->contains(c) == FALSE) { |
| 3379 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3380 break; |
| 3381 } |
| 3382 } |
| 3383 // the character wasn't in the set. |
| 3384 #ifdef REGEX_SMART_BACKTRACKING |
| 3385 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSi
ze) { |
| 3386 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(
fFrameSize); |
| 3387 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->f
InputIdx <= prevFrame->fInputIdx) { |
| 3388 // Try to find it, backwards |
| 3389 UTEXT_PREVIOUS32(fInputText); // skip the first characte
r we tried |
| 3390 UBool success = FALSE; |
| 3391 do { |
| 3392 c = UTEXT_PREVIOUS32(fInputText); |
| 3393 if (c == U_SENTINEL) { |
| 3394 break; |
| 3395 } else if (c < 256) { |
| 3396 Regex8BitSet *s8 = &fPattern->fStaticSets8[opVal
ue]; |
| 3397 if (s8->contains(c) == FALSE) { |
| 3398 success = TRUE; |
| 3399 break; |
| 3400 } |
| 3401 } else { |
| 3402 const UnicodeSet *s = fPattern->fStaticSets[opVa
lue]; |
| 3403 if (s->contains(c) == FALSE) { |
| 3404 success = TRUE; |
| 3405 break; |
| 3406 } |
| 3407 } |
| 3408 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchI
ndex); |
| 3409 |
| 3410 if (success) { |
| 3411 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3412 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3413 if (fp->fInputIdx > backSearchIndex) { |
| 3414 fp = StateSave(fp, fp->fPatIdx, status); |
| 3415 } |
| 3416 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 3417 break; |
| 3418 } |
| 3419 } |
| 3420 } |
| 3421 #endif |
| 3422 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3423 } |
| 3424 break; |
| 3425 |
| 3426 |
| 3427 case URX_SETREF: |
| 3428 if (fp->fInputIdx >= fActiveLimit) { |
| 3429 fHitEnd = TRUE; |
| 3430 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3431 break; |
| 3432 } else { |
| 3433 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3434 |
| 3435 // There is input left. Pick up one char and test it for set me
mbership. |
| 3436 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3437 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 3438 if (c<256) { |
| 3439 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 3440 if (s8->contains(c)) { |
| 3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3442 break; |
| 3443 } |
| 3444 } else { |
| 3445 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 3446 if (s->contains(c)) { |
| 3447 // The character is in the set. A Match. |
| 3448 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3449 break; |
| 3450 } |
| 3451 } |
| 3452 |
| 3453 // the character wasn't in the set. |
| 3454 #ifdef REGEX_SMART_BACKTRACKING |
| 3455 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSi
ze) { |
| 3456 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(
fFrameSize); |
| 3457 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->f
InputIdx <= prevFrame->fInputIdx) { |
| 3458 // Try to find it, backwards |
| 3459 UTEXT_PREVIOUS32(fInputText); // skip the first characte
r we tried |
| 3460 UBool success = FALSE; |
| 3461 do { |
| 3462 c = UTEXT_PREVIOUS32(fInputText); |
| 3463 if (c == U_SENTINEL) { |
| 3464 break; |
| 3465 } else if (c < 256) { |
| 3466 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 3467 if (s8->contains(c)) { |
| 3468 success = TRUE; |
| 3469 break; |
| 3470 } |
| 3471 } else { |
| 3472 UnicodeSet *s = (UnicodeSet *)sets->elementAt(op
Value); |
| 3473 if (s->contains(c)) { |
| 3474 success = TRUE; |
| 3475 break; |
| 3476 } |
| 3477 } |
| 3478 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchI
ndex); |
| 3479 |
| 3480 if (success) { |
| 3481 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3483 if (fp->fInputIdx > backSearchIndex) { |
| 3484 fp = StateSave(fp, fp->fPatIdx, status); |
| 3485 } |
| 3486 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 3487 break; |
| 3488 } |
| 3489 } |
| 3490 } |
| 3491 #endif |
| 3492 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3493 } |
| 3494 break; |
| 3495 |
| 3496 |
| 3497 case URX_DOTANY: |
| 3498 { |
| 3499 // . matches anything, but stops at end-of-line. |
| 3500 if (fp->fInputIdx >= fActiveLimit) { |
| 3501 // At end of input. Match failed. Backtrack out. |
| 3502 fHitEnd = TRUE; |
| 3503 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3504 break; |
| 3505 } |
| 3506 |
| 3507 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3508 |
| 3509 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 3510 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3511 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 3512 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 3513 // End of line in normal mode. . does not match. |
| 3514 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3515 break; |
| 3516 } |
| 3517 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3518 } |
| 3519 break; |
| 3520 |
| 3521 |
| 3522 case URX_DOTANY_ALL: |
| 3523 { |
| 3524 // ., in dot-matches-all (including new lines) mode |
| 3525 if (fp->fInputIdx >= fActiveLimit) { |
| 3526 // At end of input. Match failed. Backtrack out. |
| 3527 fHitEnd = TRUE; |
| 3528 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3529 break; |
| 3530 } |
| 3531 |
| 3532 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3533 |
| 3534 // There is input left. Advance over one char, except if we are |
| 3535 // at a cr/lf, advance over both of them. |
| 3536 UChar32 c; |
| 3537 c = UTEXT_NEXT32(fInputText); |
| 3538 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3539 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| 3540 // In the case of a CR/LF, we need to advance over both. |
| 3541 UChar32 nextc = UTEXT_CURRENT32(fInputText); |
| 3542 if (nextc == 0x0a) { |
| 3543 UTEXT_NEXT32(fInputText); |
| 3544 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3545 } |
| 3546 } |
| 3547 } |
| 3548 break; |
| 3549 |
| 3550 |
| 3551 case URX_DOTANY_UNIX: |
| 3552 { |
| 3553 // '.' operator, matches all, but stops at end-of-line. |
| 3554 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
| 3555 if (fp->fInputIdx >= fActiveLimit) { |
| 3556 // At end of input. Match failed. Backtrack out. |
| 3557 fHitEnd = TRUE; |
| 3558 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3559 break; |
| 3560 } |
| 3561 |
| 3562 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3563 |
| 3564 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 3565 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3566 if (c == 0x0a) { |
| 3567 // End of line in normal mode. '.' does not match the \n |
| 3568 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3569 } else { |
| 3570 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3571 } |
| 3572 } |
| 3573 break; |
| 3574 |
| 3575 |
| 3576 case URX_JMP: |
| 3577 fp->fPatIdx = opValue; |
| 3578 break; |
| 3579 |
| 3580 case URX_FAIL: |
| 3581 isMatch = FALSE; |
| 3582 goto breakFromLoop; |
| 3583 |
| 3584 case URX_JMP_SAV: |
| 3585 U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
| 3586 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current |
| 3587 fp->fPatIdx = opValue; // Then JMP. |
| 3588 break; |
| 3589 |
| 3590 case URX_JMP_SAV_X: |
| 3591 // This opcode is used with (x)+, when x can match a zero length str
ing. |
| 3592 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. |
| 3593 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the |
| 3594 // data address of the input position at the start of the loop. |
| 3595 { |
| 3596 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); |
| 3597 int32_t stoOp = (int32_t)pat[opValue-1]; |
| 3598 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| 3599 int32_t frameLoc = URX_VAL(stoOp); |
| 3600 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| 3601 int64_t prevInputIdx = fp->fExtra[frameLoc]; |
| 3602 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| 3603 if (prevInputIdx < fp->fInputIdx) { |
| 3604 // The match did make progress. Repeat the loop. |
| 3605 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
| 3606 fp->fPatIdx = opValue; |
| 3607 fp->fExtra[frameLoc] = fp->fInputIdx; |
| 3608 } |
| 3609 // If the input position did not advance, we do nothing here, |
| 3610 // execution will fall out of the loop. |
| 3611 } |
| 3612 break; |
| 3613 |
| 3614 case URX_CTR_INIT: |
| 3615 { |
| 3616 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 3617 fp->fExtra[opValue] = 0; // Set the loop counter variable
to zero |
| 3618 |
| 3619 // Pick up the three extra operands that CTR_INIT has, and |
| 3620 // skip the pattern location counter past |
| 3621 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 3622 fp->fPatIdx += 3; |
| 3623 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 3624 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 3625 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 3626 U_ASSERT(minCount>=0); |
| 3627 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 3628 U_ASSERT(loopLoc>fp->fPatIdx); |
| 3629 |
| 3630 if (minCount == 0) { |
| 3631 fp = StateSave(fp, loopLoc+1, status); |
| 3632 } |
| 3633 if (maxCount == 0) { |
| 3634 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3635 } |
| 3636 } |
| 3637 break; |
| 3638 |
| 3639 case URX_CTR_LOOP: |
| 3640 { |
| 3641 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 3642 int32_t initOp = (int32_t)pat[opValue]; |
| 3643 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
| 3644 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 3645 int32_t minCount = (int32_t)pat[opValue+2]; |
| 3646 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 3647 // Increment the counter. Note: we DIDN'T worry about counter |
| 3648 // overflow, since the data comes from UnicodeStrings, which |
| 3649 // stores its length in an int32_t. Do we have to think about |
| 3650 // this now that we're using UText? Probably not, since the le
ngth |
| 3651 // in UChar32s is still an int32_t. |
| 3652 (*pCounter)++; |
| 3653 U_ASSERT(*pCounter > 0); |
| 3654 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { |
| 3655 U_ASSERT(*pCounter == maxCount || maxCount == -1); |
| 3656 break; |
| 3657 } |
| 3658 if (*pCounter >= minCount) { |
| 3659 fp = StateSave(fp, fp->fPatIdx, status); |
| 3660 } |
| 3661 fp->fPatIdx = opValue + 4; // Loop back. |
| 3662 } |
| 3663 break; |
| 3664 |
| 3665 case URX_CTR_INIT_NG: |
| 3666 { |
| 3667 // Initialize a non-greedy loop |
| 3668 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 3669 fp->fExtra[opValue] = 0; // Set the loop counter variable
to zero |
| 3670 |
| 3671 // Pick up the three extra operands that CTR_INIT has, and |
| 3672 // skip the pattern location counter past |
| 3673 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 3674 fp->fPatIdx += 3; |
| 3675 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 3676 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 3677 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 3678 U_ASSERT(minCount>=0); |
| 3679 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 3680 U_ASSERT(loopLoc>fp->fPatIdx); |
| 3681 |
| 3682 if (minCount == 0) { |
| 3683 if (maxCount != 0) { |
| 3684 fp = StateSave(fp, fp->fPatIdx, status); |
| 3685 } |
| 3686 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
| 3687 } |
| 3688 } |
| 3689 break; |
| 3690 |
| 3691 case URX_CTR_LOOP_NG: |
| 3692 { |
| 3693 // Non-greedy {min, max} loops |
| 3694 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 3695 int32_t initOp = (int32_t)pat[opValue]; |
| 3696 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| 3697 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 3698 int32_t minCount = (int32_t)pat[opValue+2]; |
| 3699 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 3700 // Increment the counter. Note: we DIDN'T worry about counter |
| 3701 // overflow, since the data comes from UnicodeStrings, which |
| 3702 // stores its length in an int32_t. Do we have to think about |
| 3703 // this now that we're using UText? Probably not, since the le
ngth |
| 3704 // in UChar32s is still an int32_t. |
| 3705 (*pCounter)++; |
| 3706 U_ASSERT(*pCounter > 0); |
| 3707 |
| 3708 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { |
| 3709 // The loop has matched the maximum permitted number of time
s. |
| 3710 // Break out of here with no action. Matching will |
| 3711 // continue with the following pattern. |
| 3712 U_ASSERT(*pCounter == maxCount || maxCount == -1); |
| 3713 break; |
| 3714 } |
| 3715 |
| 3716 if (*pCounter < minCount) { |
| 3717 // We haven't met the minimum number of matches yet. |
| 3718 // Loop back for another one. |
| 3719 fp->fPatIdx = opValue + 4; // Loop back. |
| 3720 } else { |
| 3721 // We do have the minimum number of matches. |
| 3722 // Fall into the following pattern, but first do |
| 3723 // a state save to the top of the loop, so that a failure |
| 3724 // in the following pattern will try another iteration of
the loop. |
| 3725 fp = StateSave(fp, opValue + 4, status); |
| 3726 } |
| 3727 } |
| 3728 break; |
| 3729 |
| 3730 case URX_STO_SP: |
| 3731 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 3732 fData[opValue] = fStack->size(); |
| 3733 break; |
| 3734 |
| 3735 case URX_LD_SP: |
| 3736 { |
| 3737 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 3738 int32_t newStackSize = (int32_t)fData[opValue]; |
| 3739 U_ASSERT(newStackSize <= fStack->size()); |
| 3740 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; |
| 3741 if (newFP == (int64_t *)fp) { |
| 3742 break; |
| 3743 } |
| 3744 int32_t i; |
| 3745 for (i=0; i<fFrameSize; i++) { |
| 3746 newFP[i] = ((int64_t *)fp)[i]; |
| 3747 } |
| 3748 fp = (REStackFrame *)newFP; |
| 3749 fStack->setSize(newStackSize); |
| 3750 } |
| 3751 break; |
| 3752 |
| 3753 case URX_BACKREF: |
| 3754 case URX_BACKREF_I: |
| 3755 { |
| 3756 U_ASSERT(opValue < fFrameSize); |
| 3757 int64_t groupStartIdx = fp->fExtra[opValue]; |
| 3758 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| 3759 U_ASSERT(groupStartIdx <= groupEndIdx); |
| 3760 if (groupStartIdx < 0) { |
| 3761 // This capture group has not participated in the match thus
far, |
| 3762 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 3763 } |
| 3764 |
| 3765 if (groupEndIdx == groupStartIdx) { |
| 3766 // The capture group match was of an empty string. |
| 3767 // Verified by testing: Perl matches succeed in this case
, so |
| 3768 // we do too. |
| 3769 break; |
| 3770 } |
| 3771 |
| 3772 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); |
| 3773 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3774 |
| 3775 UBool haveMatch = (opType == URX_BACKREF ? |
| 3776 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, f
InputText, -1)) : |
| 3777 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndId
x, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); |
| 3778 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3779 |
| 3780 if (fp->fInputIdx > fActiveLimit) { |
| 3781 fHitEnd = TRUE; |
| 3782 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 3783 } else if (!haveMatch) { |
| 3784 if (fp->fInputIdx == fActiveLimit) { |
| 3785 fHitEnd = TRUE; |
| 3786 } |
| 3787 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 3788 } |
| 3789 } |
| 3790 break; |
| 3791 |
| 3792 case URX_STO_INP_LOC: |
| 3793 { |
| 3794 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| 3795 fp->fExtra[opValue] = fp->fInputIdx; |
| 3796 } |
| 3797 break; |
| 3798 |
| 3799 case URX_JMPX: |
| 3800 { |
| 3801 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 3802 fp->fPatIdx += 1; |
| 3803 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
| 3804 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
| 3805 int64_t savedInputIdx = fp->fExtra[dataLoc]; |
| 3806 U_ASSERT(savedInputIdx <= fp->fInputIdx); |
| 3807 if (savedInputIdx < fp->fInputIdx) { |
| 3808 fp->fPatIdx = opValue; // JMP |
| 3809 } else { |
| 3810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAI
L, no progress in loop. |
| 3811 } |
| 3812 } |
| 3813 break; |
| 3814 |
| 3815 case URX_LA_START: |
| 3816 { |
| 3817 // Entering a lookahead block. |
| 3818 // Save Stack Ptr, Input Pos. |
| 3819 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 3820 fData[opValue] = fStack->size(); |
| 3821 fData[opValue+1] = fp->fInputIdx; |
| 3822 fActiveStart = fLookStart; // Set the match region
change for |
| 3823 fActiveLimit = fLookLimit; // transparent bounds. |
| 3824 } |
| 3825 break; |
| 3826 |
| 3827 case URX_LA_END: |
| 3828 { |
| 3829 // Leaving a look-ahead block. |
| 3830 // restore Stack Ptr, Input Pos to positions they had on entry
to block. |
| 3831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 3832 int32_t stackSize = fStack->size(); |
| 3833 int32_t newStackSize =(int32_t)fData[opValue]; |
| 3834 U_ASSERT(stackSize >= newStackSize); |
| 3835 if (stackSize > newStackSize) { |
| 3836 // Copy the current top frame back to the new (cut back) top
frame. |
| 3837 // This makes the capture groups from within the look-ahea
d |
| 3838 // expression available. |
| 3839 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; |
| 3840 int32_t i; |
| 3841 for (i=0; i<fFrameSize; i++) { |
| 3842 newFP[i] = ((int64_t *)fp)[i]; |
| 3843 } |
| 3844 fp = (REStackFrame *)newFP; |
| 3845 fStack->setSize(newStackSize); |
| 3846 } |
| 3847 fp->fInputIdx = fData[opValue+1]; |
| 3848 |
| 3849 // Restore the active region bounds in the input string; they ma
y have |
| 3850 // been changed because of transparent bounds on a Region. |
| 3851 fActiveStart = fRegionStart; |
| 3852 fActiveLimit = fRegionLimit; |
| 3853 } |
| 3854 break; |
| 3855 |
| 3856 case URX_ONECHAR_I: |
| 3857 if (fp->fInputIdx < fActiveLimit) { |
| 3858 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3859 |
| 3860 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3861 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 3862 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3863 break; |
| 3864 } |
| 3865 } else { |
| 3866 fHitEnd = TRUE; |
| 3867 } |
| 3868 |
| 3869 #ifdef REGEX_SMART_BACKTRACKING |
| 3870 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize)
{ |
| 3871 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFra
meSize); |
| 3872 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInpu
tIdx <= prevFrame->fInputIdx) { |
| 3873 UBool success = FALSE; |
| 3874 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| 3875 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex)
{ |
| 3876 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 3877 success = TRUE; |
| 3878 break; |
| 3879 } else if (c == U_SENTINEL) { |
| 3880 break; |
| 3881 } |
| 3882 c = UTEXT_PREVIOUS32(fInputText); |
| 3883 } |
| 3884 if (success) { |
| 3885 fHitEnd = FALSE; |
| 3886 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3887 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3888 if (fp->fInputIdx > backSearchIndex) { |
| 3889 fp = StateSave(fp, fp->fPatIdx, status); |
| 3890 } |
| 3891 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 3892 break; |
| 3893 } |
| 3894 } |
| 3895 } |
| 3896 #endif |
| 3897 |
| 3898 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3899 break; |
| 3900 |
| 3901 case URX_STRING_I: |
| 3902 { |
| 3903 // Test input against a literal string. |
| 3904 // Strings require two slots in the compiled pattern, one for th
e |
| 3905 // offset to the string text, and one for the length. |
| 3906 const UCaseProps *csp = ucase_getSingleton(); |
| 3907 { |
| 3908 int32_t stringStartIdx, stringLen; |
| 3909 stringStartIdx = opValue; |
| 3910 |
| 3911 op = (int32_t)pat[fp->fPatIdx]; |
| 3912 fp->fPatIdx++; |
| 3913 opType = URX_TYPE(op); |
| 3914 opValue = URX_VAL(op); |
| 3915 U_ASSERT(opType == URX_STRING_LEN); |
| 3916 stringLen = opValue; |
| 3917 |
| 3918 const UChar *patternChars = litText+stringStartIdx; |
| 3919 const UChar *patternEnd = patternChars+stringLen; |
| 3920 |
| 3921 const UChar *foldChars = NULL; |
| 3922 int32_t foldOffset, foldLength; |
| 3923 UChar32 c; |
| 3924 |
| 3925 foldOffset = foldLength = 0; |
| 3926 UBool success = TRUE; |
| 3927 |
| 3928 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3929 while (patternChars < patternEnd && success) { |
| 3930 if(foldOffset < foldLength) { |
| 3931 U16_NEXT_UNSAFE(foldChars, foldOffset, c); |
| 3932 } else { |
| 3933 c = UTEXT_NEXT32(fInputText); |
| 3934 if (c != U_SENTINEL) { |
| 3935 foldLength = ucase_toFullFolding(csp, c, &foldCh
ars, U_FOLD_CASE_DEFAULT); |
| 3936 if(foldLength >= 0) { |
| 3937 if(foldLength <= UCASE_MAX_STRING_LENGTH) {
// !!!: Does not correctly handle chars that fold to 0-length strings |
| 3938 foldOffset = 0; |
| 3939 U16_NEXT_UNSAFE(foldChars, foldOffset, c
); |
| 3940 } else { |
| 3941 c = foldLength; |
| 3942 foldLength = foldOffset; // to avoid rea
ding chars from the folding buffer |
| 3943 } |
| 3944 } |
| 3945 } |
| 3946 |
| 3947 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3948 } |
| 3949 |
| 3950 success = FALSE; |
| 3951 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit))
{ |
| 3952 if (U_IS_BMP(c)) { |
| 3953 success = (*patternChars == c); |
| 3954 patternChars += 1; |
| 3955 } else if (patternChars+1 < patternEnd) { |
| 3956 success = (*patternChars == U16_LEAD(c) && *(pat
ternChars+1) == U16_TRAIL(c)); |
| 3957 patternChars += 2; |
| 3958 } |
| 3959 } else { |
| 3960 fHitEnd = TRUE; // TODO: See ticket 6074 |
| 3961 } |
| 3962 } |
| 3963 |
| 3964 if (!success) { |
| 3965 #ifdef REGEX_SMART_BACKTRACKING |
| 3966 if (fp->fInputIdx > backSearchIndex && fStack->size()) { |
| 3967 REStackFrame *prevFrame = (REStackFrame *)fStack->pe
ekFrame(fFrameSize); |
| 3968 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx])
&& fp->fInputIdx <= prevFrame->fInputIdx) { |
| 3969 // Reset to last start point |
| 3970 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3971 patternChars = litText+stringStartIdx; |
| 3972 |
| 3973 // Search backwards for a possible start |
| 3974 do { |
| 3975 c = UTEXT_PREVIOUS32(fInputText); |
| 3976 if (c == U_SENTINEL) { |
| 3977 break; |
| 3978 } else { |
| 3979 foldLength = ucase_toFullFolding(csp, c,
&foldChars, U_FOLD_CASE_DEFAULT); |
| 3980 if(foldLength >= 0) { |
| 3981 if(foldLength <= UCASE_MAX_STRING_LE
NGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings |
| 3982 foldOffset = 0; |
| 3983 U16_NEXT_UNSAFE(foldChars, foldO
ffset, c); |
| 3984 } else { |
| 3985 c = foldLength; |
| 3986 foldLength = foldOffset; // to a
void reading chars from the folding buffer |
| 3987 } |
| 3988 } |
| 3989 |
| 3990 if ((U_IS_BMP(c) && *patternChars == c)
|| |
| 3991 (*patternChars == U16_LEAD(c) &&
*(patternChars+1) == U16_TRAIL(c))) { |
| 3992 success = TRUE; |
| 3993 break; |
| 3994 } |
| 3995 } |
| 3996 } while (UTEXT_GETNATIVEINDEX(fInputText) >= bac
kSearchIndex); |
| 3997 |
| 3998 // And try again |
| 3999 if (success) { |
| 4000 fp = (REStackFrame *)fStack->popFrame(fFrame
Size); |
| 4001 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputT
ext); |
| 4002 if (fp->fInputIdx > backSearchIndex) { |
| 4003 fp = StateSave(fp, fp->fPatIdx, status); |
| 4004 } |
| 4005 fp->fPatIdx++; // Skip the LOOP_C, we just d
id that |
| 4006 break; |
| 4007 } |
| 4008 } |
| 4009 } |
| 4010 #endif |
| 4011 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4012 } |
| 4013 } |
| 4014 } |
| 4015 break; |
| 4016 |
| 4017 case URX_LB_START: |
| 4018 { |
| 4019 // Entering a look-behind block. |
| 4020 // Save Stack Ptr, Input Pos. |
| 4021 // TODO: implement transparent bounds. Ticket #6067 |
| 4022 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4023 fData[opValue] = fStack->size(); |
| 4024 fData[opValue+1] = fp->fInputIdx; |
| 4025 // Init the variable containing the start index for attempted ma
tches. |
| 4026 fData[opValue+2] = -1; |
| 4027 // Save input string length, then reset to pin any matches to en
d at |
| 4028 // the current position. |
| 4029 fData[opValue+3] = fActiveLimit; |
| 4030 fActiveLimit = fp->fInputIdx; |
| 4031 } |
| 4032 break; |
| 4033 |
| 4034 |
| 4035 case URX_LB_CONT: |
| 4036 { |
| 4037 // Positive Look-Behind, at top of loop checking for matches of
LB expression |
| 4038 // at all possible input starting positions. |
| 4039 |
| 4040 // Fetch the min and max possible match lengths. They are the o
perands |
| 4041 // of this op in the pattern. |
| 4042 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 4043 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 4044 U_ASSERT(minML <= maxML); |
| 4045 U_ASSERT(minML >= 0); |
| 4046 |
| 4047 // Fetch (from data) the last input index where a match was atte
mpted. |
| 4048 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4049 int64_t *lbStartIdx = &fData[opValue+2]; |
| 4050 if (*lbStartIdx < 0) { |
| 4051 // First time through loop. |
| 4052 *lbStartIdx = fp->fInputIdx - minML; |
| 4053 } else { |
| 4054 // 2nd through nth time through the loop. |
| 4055 // Back up start position for match by one. |
| 4056 if (*lbStartIdx == 0) { |
| 4057 (*lbStartIdx)--; |
| 4058 } else { |
| 4059 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
| 4060 UTEXT_PREVIOUS32(fInputText); |
| 4061 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4062 } |
| 4063 } |
| 4064 |
| 4065 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 4066 // We have tried all potential match starting points without |
| 4067 // getting a match. Backtrack out, and out of the |
| 4068 // Look Behind altogether. |
| 4069 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4070 int64_t restoreInputLen = fData[opValue+3]; |
| 4071 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 4072 U_ASSERT(restoreInputLen <= fInputLength); |
| 4073 fActiveLimit = restoreInputLen; |
| 4074 break; |
| 4075 } |
| 4076 |
| 4077 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
| 4078 // (successful match will fall off the end of the loop.) |
| 4079 fp = StateSave(fp, fp->fPatIdx-3, status); |
| 4080 fp->fInputIdx = *lbStartIdx; |
| 4081 } |
| 4082 break; |
| 4083 |
| 4084 case URX_LB_END: |
| 4085 // End of a look-behind block, after a successful match. |
| 4086 { |
| 4087 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4088 if (fp->fInputIdx != fActiveLimit) { |
| 4089 // The look-behind expression matched, but the match did no
t |
| 4090 // extend all the way to the point that we are looking be
hind from. |
| 4091 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 4092 // will retry the match starting at another position or
fail |
| 4093 // the look-behind altogether, whichever is appropriate. |
| 4094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4095 break; |
| 4096 } |
| 4097 |
| 4098 // Look-behind match is good. Restore the orignal input string
length, |
| 4099 // which had been truncated to pin the end of the lookbehind m
atch to the |
| 4100 // position being looked-behind. |
| 4101 int64_t originalInputLen = fData[opValue+3]; |
| 4102 U_ASSERT(originalInputLen >= fActiveLimit); |
| 4103 U_ASSERT(originalInputLen <= fInputLength); |
| 4104 fActiveLimit = originalInputLen; |
| 4105 } |
| 4106 break; |
| 4107 |
| 4108 |
| 4109 case URX_LBN_CONT: |
| 4110 { |
| 4111 // Negative Look-Behind, at top of loop checking for matches of
LB expression |
| 4112 // at all possible input starting positions. |
| 4113 |
| 4114 // Fetch the extra parameters of this op. |
| 4115 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 4116 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 4117 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
| 4118 continueLoc = URX_VAL(continueLoc); |
| 4119 U_ASSERT(minML <= maxML); |
| 4120 U_ASSERT(minML >= 0); |
| 4121 U_ASSERT(continueLoc > fp->fPatIdx); |
| 4122 |
| 4123 // Fetch (from data) the last input index where a match was atte
mpted. |
| 4124 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4125 int64_t *lbStartIdx = &fData[opValue+2]; |
| 4126 if (*lbStartIdx < 0) { |
| 4127 // First time through loop. |
| 4128 *lbStartIdx = fp->fInputIdx - minML; |
| 4129 } else { |
| 4130 // 2nd through nth time through the loop. |
| 4131 // Back up start position for match by one. |
| 4132 if (*lbStartIdx == 0) { |
| 4133 (*lbStartIdx)--; |
| 4134 } else { |
| 4135 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
| 4136 UTEXT_PREVIOUS32(fInputText); |
| 4137 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4138 } |
| 4139 } |
| 4140 |
| 4141 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 4142 // We have tried all potential match starting points without |
| 4143 // getting a match, which means that the negative lookbehin
d as |
| 4144 // a whole has succeeded. Jump forward to the continue loc
ation |
| 4145 int64_t restoreInputLen = fData[opValue+3]; |
| 4146 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 4147 U_ASSERT(restoreInputLen <= fInputLength); |
| 4148 fActiveLimit = restoreInputLen; |
| 4149 fp->fPatIdx = continueLoc; |
| 4150 break; |
| 4151 } |
| 4152 |
| 4153 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
| 4154 // (successful match will cause a FAIL out of the loop alto
gether.) |
| 4155 fp = StateSave(fp, fp->fPatIdx-4, status); |
| 4156 fp->fInputIdx = *lbStartIdx; |
| 4157 } |
| 4158 break; |
| 4159 |
| 4160 case URX_LBN_END: |
| 4161 // End of a negative look-behind block, after a successful match. |
| 4162 { |
| 4163 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4164 if (fp->fInputIdx != fActiveLimit) { |
| 4165 // The look-behind expression matched, but the match did no
t |
| 4166 // extend all the way to the point that we are looking be
hind from. |
| 4167 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 4168 // will retry the match starting at another position or
succeed |
| 4169 // the look-behind altogether, whichever is appropriate. |
| 4170 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4171 break; |
| 4172 } |
| 4173 |
| 4174 // Look-behind expression matched, which means look-behind test
as |
| 4175 // a whole Fails |
| 4176 |
| 4177 // Restore the orignal input string length, which had been tru
ncated |
| 4178 // inorder to pin the end of the lookbehind match |
| 4179 // to the position being looked-behind. |
| 4180 int64_t originalInputLen = fData[opValue+3]; |
| 4181 U_ASSERT(originalInputLen >= fActiveLimit); |
| 4182 U_ASSERT(originalInputLen <= fInputLength); |
| 4183 fActiveLimit = originalInputLen; |
| 4184 |
| 4185 // Restore original stack position, discarding any state saved |
| 4186 // by the successful pattern match. |
| 4187 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4188 int32_t newStackSize = (int32_t)fData[opValue]; |
| 4189 U_ASSERT(fStack->size() > newStackSize); |
| 4190 fStack->setSize(newStackSize); |
| 4191 |
| 4192 // FAIL, which will take control back to someplace |
| 4193 // prior to entering the look-behind test. |
| 4194 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4195 } |
| 4196 break; |
| 4197 |
| 4198 |
| 4199 case URX_LOOP_SR_I: |
| 4200 // Loop Initialization for the optimized implementation of |
| 4201 // [some character set]* |
| 4202 // This op scans through all matching input. |
| 4203 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 4204 { |
| 4205 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 4206 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 4207 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 4208 |
| 4209 // Loop through input, until either the input is exhausted or |
| 4210 // we reach a character that is not a member of the set. |
| 4211 int64_t ix = fp->fInputIdx; |
| 4212 UTEXT_SETNATIVEINDEX(fInputText, ix); |
| 4213 for (;;) { |
| 4214 if (ix >= fActiveLimit) { |
| 4215 fHitEnd = TRUE; |
| 4216 break; |
| 4217 } |
| 4218 UChar32 c = UTEXT_NEXT32(fInputText); |
| 4219 if (c<256) { |
| 4220 if (s8->contains(c) == FALSE) { |
| 4221 break; |
| 4222 } |
| 4223 } else { |
| 4224 if (s->contains(c) == FALSE) { |
| 4225 break; |
| 4226 } |
| 4227 } |
| 4228 ix = UTEXT_GETNATIVEINDEX(fInputText); |
| 4229 } |
| 4230 |
| 4231 // If there were no matching characters, skip over the loop alto
gether. |
| 4232 // The loop doesn't run at all, a * op always succeeds. |
| 4233 if (ix == fp->fInputIdx) { |
| 4234 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 4235 break; |
| 4236 } |
| 4237 |
| 4238 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 4239 // must follow. It's operand is the stack location |
| 4240 // that holds the starting input index for the match of this [
set]* |
| 4241 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 4242 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 4243 int32_t stackLoc = URX_VAL(loopcOp); |
| 4244 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 4245 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 4246 #ifdef REGEX_SMART_BACKTRACKING |
| 4247 backSearchIndex = fp->fInputIdx; |
| 4248 #endif |
| 4249 fp->fInputIdx = ix; |
| 4250 |
| 4251 // Save State to the URX_LOOP_C op that follows this one, |
| 4252 // so that match failures in the following code will return to
there. |
| 4253 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 4254 fp = StateSave(fp, fp->fPatIdx, status); |
| 4255 fp->fPatIdx++; |
| 4256 } |
| 4257 break; |
| 4258 |
| 4259 |
| 4260 case URX_LOOP_DOT_I: |
| 4261 // Loop Initialization for the optimized implementation of .* |
| 4262 // This op scans through all remaining input. |
| 4263 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 4264 { |
| 4265 // Loop through input until the input is exhausted (we reach an
end-of-line) |
| 4266 // In DOTALL mode, we can just go straight to the end of the inp
ut. |
| 4267 int64_t ix; |
| 4268 if ((opValue & 1) == 1) { |
| 4269 // Dot-matches-All mode. Jump straight to the end of the st
ring. |
| 4270 ix = fActiveLimit; |
| 4271 fHitEnd = TRUE; |
| 4272 } else { |
| 4273 // NOT DOT ALL mode. Line endings do not match '.' |
| 4274 // Scan forward until a line ending or end of input. |
| 4275 ix = fp->fInputIdx; |
| 4276 UTEXT_SETNATIVEINDEX(fInputText, ix); |
| 4277 for (;;) { |
| 4278 if (ix >= fActiveLimit) { |
| 4279 fHitEnd = TRUE; |
| 4280 break; |
| 4281 } |
| 4282 UChar32 c = UTEXT_NEXT32(fInputText); |
| 4283 if ((c & 0x7f) <= 0x29) { // Fast filter of non
-new-line-s |
| 4284 if ((c == 0x0a) || // 0x0a is newline i
n both modes. |
| 4285 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode |
| 4286 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028
|| c==0x2029) { |
| 4287 // char is a line ending. Exit the scanning lo
op. |
| 4288 break; |
| 4289 } |
| 4290 } |
| 4291 ix = UTEXT_GETNATIVEINDEX(fInputText); |
| 4292 } |
| 4293 } |
| 4294 |
| 4295 // If there were no matching characters, skip over the loop alto
gether. |
| 4296 // The loop doesn't run at all, a * op always succeeds. |
| 4297 if (ix == fp->fInputIdx) { |
| 4298 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 4299 break; |
| 4300 } |
| 4301 |
| 4302 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 4303 // must follow. It's operand is the stack location |
| 4304 // that holds the starting input index for the match of this .
* |
| 4305 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 4306 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 4307 int32_t stackLoc = URX_VAL(loopcOp); |
| 4308 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 4309 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 4310 #ifdef REGEX_SMART_BACKTRACKING |
| 4311 backSearchIndex = fp->fInputIdx; |
| 4312 #endif |
| 4313 fp->fInputIdx = ix; |
| 4314 |
| 4315 // Save State to the URX_LOOP_C op that follows this one, |
| 4316 // so that match failures in the following code will return to
there. |
| 4317 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 4318 fp = StateSave(fp, fp->fPatIdx, status); |
| 4319 fp->fPatIdx++; |
| 4320 } |
| 4321 break; |
| 4322 |
| 4323 |
| 4324 case URX_LOOP_C: |
| 4325 { |
| 4326 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| 4327 backSearchIndex = fp->fExtra[opValue]; |
| 4328 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| 4329 if (backSearchIndex == fp->fInputIdx) { |
| 4330 // We've backed up the input idx to the point that the loop
started. |
| 4331 // The loop is done. Leave here without saving state. |
| 4332 // Subsequent failures won't come back here. |
| 4333 break; |
| 4334 } |
| 4335 // Set up for the next iteration of the loop, with input index |
| 4336 // backed up by one from the last time through, |
| 4337 // and a state save to this instruction in case the following
code fails again. |
| 4338 // (We're going backwards because this loop emulates stack unw
inding, not |
| 4339 // the initial scan forward.) |
| 4340 U_ASSERT(fp->fInputIdx > 0); |
| 4341 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 4342 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); |
| 4343 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4344 |
| 4345 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); |
| 4346 if (prevC == 0x0a && |
| 4347 fp->fInputIdx > backSearchIndex && |
| 4348 twoPrevC == 0x0d) { |
| 4349 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| 4350 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| 4351 // .*, stepping back over CRLF pair. |
| 4352 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4353 } |
| 4354 } |
| 4355 |
| 4356 |
| 4357 fp = StateSave(fp, fp->fPatIdx-1, status); |
| 4358 } |
| 4359 break; |
| 4360 |
| 4361 |
| 4362 |
| 4363 default: |
| 4364 // Trouble. The compiled pattern contains an entry with an |
| 4365 // unrecognized type tag. |
| 4366 U_ASSERT(FALSE); |
| 4367 } |
| 4368 |
| 4369 if (U_FAILURE(status)) { |
| 4370 isMatch = FALSE; |
| 4371 break; |
| 4372 } |
| 4373 } |
| 4374 |
| 4375 breakFromLoop: |
| 4376 fMatch = isMatch; |
| 4377 if (isMatch) { |
| 4378 fLastMatchEnd = fMatchEnd; |
| 4379 fMatchStart = startIdx; |
| 4380 fMatchEnd = fp->fInputIdx; |
| 4381 if (fTraceDebug) { |
| 4382 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd)); |
| 4383 } |
| 4384 } |
| 4385 else |
| 4386 { |
| 4387 if (fTraceDebug) { |
| 4388 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); |
| 4389 } |
| 4390 } |
| 4391 |
| 4392 fFrame = fp; // The active stack frame when the engine stoppe
d. |
| 4393 // Contains the capture group results that we
need to |
| 4394 // access later. |
| 4395 return; |
| 4396 } |
| 4397 |
| 4398 |
| 4399 //------------------------------------------------------------------------------
-- |
| 4400 // |
| 4401 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with t
he |
| 4402 // assumption that the entire string is available in the UText'
s |
| 4403 // chunk buffer. For now, that means we can use int32_t indexes
, |
| 4404 // except for anything that needs to be saved (like group start
s |
| 4405 // and ends). |
| 4406 // |
| 4407 // startIdx: begin matching a this index. |
| 4408 // toEnd: if true, match must extend to end of the input
region |
| 4409 // |
| 4410 //------------------------------------------------------------------------------
-- |
| 4411 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
s) { |
| 4412 UBool isMatch = FALSE; // True if the we have a match. |
| 4413 |
| 4414 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact
er matches for searching backwards |
| 4415 |
| 4416 int32_t op; // Operation from the compiled pattern, s
plit into |
| 4417 int32_t opType; // the opcode |
| 4418 int32_t opValue; // and the operand value. |
| 4419 |
| 4420 #ifdef REGEX_RUN_DEBUG |
| 4421 if (fTraceDebug) |
| 4422 { |
| 4423 printf("MatchAt(startIdx=%ld)\n", startIdx); |
| 4424 printf("Original Pattern: "); |
| 4425 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| 4426 while (c != U_SENTINEL) { |
| 4427 if (c<32 || c>256) { |
| 4428 c = '.'; |
| 4429 } |
| 4430 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); |
| 4431 |
| 4432 c = UTEXT_NEXT32(fPattern->fPattern); |
| 4433 } |
| 4434 printf("\n"); |
| 4435 printf("Input String: "); |
| 4436 c = utext_next32From(fInputText, 0); |
| 4437 while (c != U_SENTINEL) { |
| 4438 if (c<32 || c>256) { |
| 4439 c = '.'; |
| 4440 } |
| 4441 printf("%c", c); |
| 4442 |
| 4443 c = UTEXT_NEXT32(fInputText); |
| 4444 } |
| 4445 printf("\n"); |
| 4446 printf("\n"); |
| 4447 } |
| 4448 #endif |
| 4449 |
| 4450 if (U_FAILURE(status)) { |
| 4451 return; |
| 4452 } |
| 4453 |
| 4454 // Cache frequently referenced items from the compiled pattern |
| 4455 // |
| 4456 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| 4457 |
| 4458 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| 4459 UVector *sets = fPattern->fSets; |
| 4460 |
| 4461 const UChar *inputBuf = fInputText->chunkContents; |
| 4462 |
| 4463 fFrameSize = fPattern->fFrameSize; |
| 4464 REStackFrame *fp = resetStack(); |
| 4465 |
| 4466 fp->fPatIdx = 0; |
| 4467 fp->fInputIdx = startIdx; |
| 4468 |
| 4469 // Zero out the pattern's static data |
| 4470 int32_t i; |
| 4471 for (i = 0; i<fPattern->fDataSize; i++) { |
| 4472 fData[i] = 0; |
| 4473 } |
| 4474 |
| 4475 // |
| 4476 // Main loop for interpreting the compiled pattern. |
| 4477 // One iteration of the loop per pattern operation performed. |
| 4478 // |
| 4479 for (;;) { |
| 4480 #if 0 |
| 4481 if (_heapchk() != _HEAPOK) { |
| 4482 fprintf(stderr, "Heap Trouble\n"); |
| 4483 } |
| 4484 #endif |
| 4485 |
| 4486 op = (int32_t)pat[fp->fPatIdx]; |
| 4487 opType = URX_TYPE(op); |
| 4488 opValue = URX_VAL(op); |
| 4489 #ifdef REGEX_RUN_DEBUG |
| 4490 if (fTraceDebug) { |
| 4491 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 4492 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp-
>fInputIdx, |
| 4493 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(
), fActiveLimit); |
| 4494 fPattern->dumpOp(fp->fPatIdx); |
| 4495 } |
| 4496 #endif |
| 4497 fp->fPatIdx++; |
| 4498 |
| 4499 switch (opType) { |
| 4500 |
| 4501 |
| 4502 case URX_NOP: |
| 4503 break; |
| 4504 |
| 4505 |
| 4506 case URX_BACKTRACK: |
| 4507 // Force a backtrack. In some circumstances, the pattern compiler |
| 4508 // will notice that the pattern can't possibly match anything, and
will |
| 4509 // emit one of these at that point. |
| 4510 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4511 break; |
| 4512 |
| 4513 |
| 4514 case URX_ONECHAR: |
| 4515 if (fp->fInputIdx < fActiveLimit) { |
| 4516 UChar32 c; |
| 4517 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4518 if (c == opValue) { |
| 4519 break; |
| 4520 } |
| 4521 } else { |
| 4522 fHitEnd = TRUE; |
| 4523 } |
| 4524 |
| 4525 #ifdef REGEX_SMART_BACKTRACKING |
| 4526 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize)
{ |
| 4527 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFra
meSize); |
| 4528 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInpu
tIdx <= prevFrame->fInputIdx) { |
| 4529 int64_t reverseIndex = fp->fInputIdx; |
| 4530 UChar32 c; |
| 4531 do { |
| 4532 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); |
| 4533 if (c == opValue) { |
| 4534 break; |
| 4535 } |
| 4536 } while (reverseIndex > backSearchIndex); |
| 4537 if (c == opValue) { |
| 4538 fHitEnd = FALSE; |
| 4539 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4540 fp->fInputIdx = reverseIndex; |
| 4541 if (fp->fInputIdx > backSearchIndex) { |
| 4542 fp = StateSave(fp, fp->fPatIdx, status); |
| 4543 } |
| 4544 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 4545 break; |
| 4546 } |
| 4547 } |
| 4548 } |
| 4549 #endif |
| 4550 |
| 4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4552 break; |
| 4553 |
| 4554 |
| 4555 case URX_STRING: |
| 4556 { |
| 4557 // Test input against a literal string. |
| 4558 // Strings require two slots in the compiled pattern, one for th
e |
| 4559 // offset to the string text, and one for the length. |
| 4560 int32_t stringStartIdx = opValue; |
| 4561 int32_t stringLen; |
| 4562 |
| 4563 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
| 4564 fp->fPatIdx++; |
| 4565 opType = URX_TYPE(op); |
| 4566 stringLen = URX_VAL(op); |
| 4567 U_ASSERT(opType == URX_STRING_LEN); |
| 4568 U_ASSERT(stringLen >= 2); |
| 4569 |
| 4570 if (fp->fInputIdx + stringLen > fActiveLimit) { |
| 4571 // No match. String is longer than the remaining input text
. |
| 4572 fHitEnd = TRUE; // TODO: See ticket 6074 |
| 4573 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4574 break; |
| 4575 } |
| 4576 |
| 4577 const UChar * pInp = inputBuf + fp->fInputIdx; |
| 4578 const UChar * pPat = litText+stringStartIdx; |
| 4579 const UChar * pEnd = pInp + stringLen; |
| 4580 UBool success = FALSE; |
| 4581 for(;;) { |
| 4582 if (*pInp == *pPat) { |
| 4583 pInp++; |
| 4584 pPat++; |
| 4585 if (pInp == pEnd) { |
| 4586 // Successful Match. |
| 4587 success = TRUE; |
| 4588 break; |
| 4589 } |
| 4590 } else { |
| 4591 // Match failed. |
| 4592 break; |
| 4593 } |
| 4594 } |
| 4595 |
| 4596 if (success) { |
| 4597 fp->fInputIdx += stringLen; |
| 4598 } else { |
| 4599 #ifdef REGEX_SMART_BACKTRACKING |
| 4600 if (fp->fInputIdx > backSearchIndex && fStack->size()) { |
| 4601 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFr
ame(fFrameSize); |
| 4602 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && f
p->fInputIdx <= prevFrame->fInputIdx) { |
| 4603 // Reset to last start point |
| 4604 int64_t reverseIndex = fp->fInputIdx; |
| 4605 UChar32 c; |
| 4606 pPat = litText+stringStartIdx; |
| 4607 |
| 4608 // Search backwards for a possible start |
| 4609 do { |
| 4610 U16_PREV(inputBuf, backSearchIndex, reverseIndex
, c); |
| 4611 if ((U_IS_BMP(c) && *pPat == c) || |
| 4612 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TR
AIL(c))) { |
| 4613 success = TRUE; |
| 4614 break; |
| 4615 } |
| 4616 } while (reverseIndex > backSearchIndex); |
| 4617 |
| 4618 // And try again |
| 4619 if (success) { |
| 4620 fp = (REStackFrame *)fStack->popFrame(fFrameSize
); |
| 4621 fp->fInputIdx = reverseIndex; |
| 4622 if (fp->fInputIdx > backSearchIndex) { |
| 4623 fp = StateSave(fp, fp->fPatIdx, status); |
| 4624 } |
| 4625 fp->fPatIdx++; // Skip the LOOP_C, we just did t
hat |
| 4626 break; |
| 4627 } |
| 4628 } |
| 4629 } |
| 4630 #endif |
| 4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4632 } |
| 4633 } |
| 4634 break; |
| 4635 |
| 4636 |
| 4637 case URX_STATE_SAVE: |
| 4638 fp = StateSave(fp, opValue, status); |
| 4639 break; |
| 4640 |
| 4641 |
| 4642 case URX_END: |
| 4643 // The match loop will exit via this path on a successful match, |
| 4644 // when we reach the end of the pattern. |
| 4645 if (toEnd && fp->fInputIdx != fActiveLimit) { |
| 4646 // The pattern matched, but not to the end of input. Try some m
ore. |
| 4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4648 break; |
| 4649 } |
| 4650 isMatch = TRUE; |
| 4651 goto breakFromLoop; |
| 4652 |
| 4653 // Start and End Capture stack frame variables are laid out out like
this: |
| 4654 // fp->fExtra[opValue] - The start of a completed capture group |
| 4655 // opValue+1 - The end of a completed capture group |
| 4656 // opValue+2 - the start of a capture group whose end |
| 4657 // has not yet been reached (and might not
ever be). |
| 4658 case URX_START_CAPTURE: |
| 4659 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 4660 fp->fExtra[opValue+2] = fp->fInputIdx; |
| 4661 break; |
| 4662 |
| 4663 |
| 4664 case URX_END_CAPTURE: |
| 4665 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 4666 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. |
| 4667 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. |
| 4668 fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| 4669 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| 4670 break; |
| 4671 |
| 4672 |
| 4673 case URX_DOLLAR: // $, test for End of line |
| 4674 // or for position before new line at end of input |
| 4675 if (fp->fInputIdx < fAnchorLimit-2) { |
| 4676 // We are no where near the end of input. Fail. |
| 4677 // This is the common case. Keep it first. |
| 4678 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4679 break; |
| 4680 } |
| 4681 if (fp->fInputIdx >= fAnchorLimit) { |
| 4682 // We really are at the end of input. Success. |
| 4683 fHitEnd = TRUE; |
| 4684 fRequireEnd = TRUE; |
| 4685 break; |
| 4686 } |
| 4687 |
| 4688 // If we are positioned just before a new-line that is located at th
e |
| 4689 // end of input, succeed. |
| 4690 if (fp->fInputIdx == fAnchorLimit-1) { |
| 4691 UChar32 c; |
| 4692 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); |
| 4693 |
| 4694 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { |
| 4695 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
| 4696 // At new-line at end of input. Success |
| 4697 fHitEnd = TRUE; |
| 4698 fRequireEnd = TRUE; |
| 4699 break; |
| 4700 } |
| 4701 } |
| 4702 } else if (fp->fInputIdx == fAnchorLimit-2 && |
| 4703 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a
) { |
| 4704 fHitEnd = TRUE; |
| 4705 fRequireEnd = TRUE; |
| 4706 break; // At CR/LF at end of input.
Success |
| 4707 } |
| 4708 |
| 4709 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4710 |
| 4711 break; |
| 4712 |
| 4713 |
| 4714 case URX_DOLLAR_D: // $, test for End of Line, in UNI
X_LINES mode. |
| 4715 if (fp->fInputIdx >= fAnchorLimit-1) { |
| 4716 // Either at the last character of input, or off the end. |
| 4717 if (fp->fInputIdx == fAnchorLimit-1) { |
| 4718 // At last char of input. Success if it's a new line. |
| 4719 if (inputBuf[fp->fInputIdx] == 0x0a) { |
| 4720 fHitEnd = TRUE; |
| 4721 fRequireEnd = TRUE; |
| 4722 break; |
| 4723 } |
| 4724 } else { |
| 4725 // Off the end of input. Success. |
| 4726 fHitEnd = TRUE; |
| 4727 fRequireEnd = TRUE; |
| 4728 break; |
| 4729 } |
| 4730 } |
| 4731 |
| 4732 // Not at end of input. Back-track out. |
| 4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4734 break; |
| 4735 |
| 4736 |
| 4737 case URX_DOLLAR_M: // $, test for End of line in multi-l
ine mode |
| 4738 { |
| 4739 if (fp->fInputIdx >= fAnchorLimit) { |
| 4740 // We really are at the end of input. Success. |
| 4741 fHitEnd = TRUE; |
| 4742 fRequireEnd = TRUE; |
| 4743 break; |
| 4744 } |
| 4745 // If we are positioned just before a new-line, succeed. |
| 4746 // It makes no difference where the new-line is within the input
. |
| 4747 UChar32 c = inputBuf[fp->fInputIdx]; |
| 4748 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
| 4749 // At a line end, except for the odd chance of being in the
middle of a CR/LF sequence |
| 4750 // In multi-line mode, hitting a new-line just before the e
nd of input does not |
| 4751 // set the hitEnd or requireEnd flags |
| 4752 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
| 4753 break; |
| 4754 } |
| 4755 } |
| 4756 // not at a new line. Fail. |
| 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4758 } |
| 4759 break; |
| 4760 |
| 4761 |
| 4762 case URX_DOLLAR_MD: // $, test for End of line in multi-
line and UNIX_LINES mode |
| 4763 { |
| 4764 if (fp->fInputIdx >= fAnchorLimit) { |
| 4765 // We really are at the end of input. Success. |
| 4766 fHitEnd = TRUE; |
| 4767 fRequireEnd = TRUE; // Java set requireEnd in this case, ev
en though |
| 4768 break; // adding a new-line would not lose t
he match. |
| 4769 } |
| 4770 // If we are not positioned just before a new-line, the test fai
ls; backtrack out. |
| 4771 // It makes no difference where the new-line is within the input
. |
| 4772 if (inputBuf[fp->fInputIdx] != 0x0a) { |
| 4773 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4774 } |
| 4775 } |
| 4776 break; |
| 4777 |
| 4778 |
| 4779 case URX_CARET: // ^, test for start of line |
| 4780 if (fp->fInputIdx != fAnchorStart) { |
| 4781 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4782 } |
| 4783 break; |
| 4784 |
| 4785 |
| 4786 case URX_CARET_M: // ^, test for start of line in mul
it-line mode |
| 4787 { |
| 4788 if (fp->fInputIdx == fAnchorStart) { |
| 4789 // We are at the start input. Success. |
| 4790 break; |
| 4791 } |
| 4792 // Check whether character just before the current pos is a new-
line |
| 4793 // unless we are at the end of input |
| 4794 UChar c = inputBuf[fp->fInputIdx - 1]; |
| 4795 if ((fp->fInputIdx < fAnchorLimit) && |
| 4796 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 4797 // It's a new-line. ^ is true. Success. |
| 4798 // TODO: what should be done with positions between a CR a
nd LF? |
| 4799 break; |
| 4800 } |
| 4801 // Not at the start of a line. Fail. |
| 4802 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4803 } |
| 4804 break; |
| 4805 |
| 4806 |
| 4807 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line
+ Unix-line mode |
| 4808 { |
| 4809 U_ASSERT(fp->fInputIdx >= fAnchorStart); |
| 4810 if (fp->fInputIdx <= fAnchorStart) { |
| 4811 // We are at the start input. Success. |
| 4812 break; |
| 4813 } |
| 4814 // Check whether character just before the current pos is a new-
line |
| 4815 U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
| 4816 UChar c = inputBuf[fp->fInputIdx - 1]; |
| 4817 if (c != 0x0a) { |
| 4818 // Not at the start of a line. Back-track out. |
| 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4820 } |
| 4821 } |
| 4822 break; |
| 4823 |
| 4824 case URX_BACKSLASH_B: // Test for word boundaries |
| 4825 { |
| 4826 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); |
| 4827 success ^= (opValue != 0); // flip sense for \B |
| 4828 if (!success) { |
| 4829 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4830 } |
| 4831 } |
| 4832 break; |
| 4833 |
| 4834 |
| 4835 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le |
| 4836 { |
| 4837 UBool success = isUWordBoundary(fp->fInputIdx); |
| 4838 success ^= (opValue != 0); // flip sense for \B |
| 4839 if (!success) { |
| 4840 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4841 } |
| 4842 } |
| 4843 break; |
| 4844 |
| 4845 |
| 4846 case URX_BACKSLASH_D: // Test for decimal digit |
| 4847 { |
| 4848 if (fp->fInputIdx >= fActiveLimit) { |
| 4849 fHitEnd = TRUE; |
| 4850 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4851 break; |
| 4852 } |
| 4853 |
| 4854 UChar32 c; |
| 4855 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4856 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. |
| 4857 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
| 4858 success ^= (opValue != 0); // flip sense for \D |
| 4859 if (!success) { |
| 4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4861 } |
| 4862 } |
| 4863 break; |
| 4864 |
| 4865 |
| 4866 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
| 4867 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
| 4868 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4869 } |
| 4870 break; |
| 4871 |
| 4872 |
| 4873 case URX_BACKSLASH_X: |
| 4874 // Match a Grapheme, as defined by Unicode TR 29. |
| 4875 // Differs slightly from Perl, which consumes combining marks independe
ntly |
| 4876 // of context. |
| 4877 { |
| 4878 |
| 4879 // Fail if at end of input |
| 4880 if (fp->fInputIdx >= fActiveLimit) { |
| 4881 fHitEnd = TRUE; |
| 4882 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4883 break; |
| 4884 } |
| 4885 |
| 4886 // Examine (and consume) the current char. |
| 4887 // Dispatch into a little state machine, based on the char. |
| 4888 UChar32 c; |
| 4889 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4890 UnicodeSet **sets = fPattern->fStaticSets; |
| 4891 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
| 4892 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
| 4893 if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| 4894 if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| 4895 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| 4896 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 4897 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 4898 goto GC_Extend; |
| 4899 |
| 4900 |
| 4901 |
| 4902 GC_L: |
| 4903 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 4904 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4905 if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| 4906 if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| 4907 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| 4908 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 4909 U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| 4910 goto GC_Extend; |
| 4911 |
| 4912 GC_V: |
| 4913 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 4914 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4915 if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| 4916 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 4917 U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| 4918 goto GC_Extend; |
| 4919 |
| 4920 GC_T: |
| 4921 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| 4922 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4923 if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| 4924 U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| 4925 goto GC_Extend; |
| 4926 |
| 4927 GC_Extend: |
| 4928 // Combining characters are consumed here |
| 4929 for (;;) { |
| 4930 if (fp->fInputIdx >= fActiveLimit) { |
| 4931 break; |
| 4932 } |
| 4933 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4934 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| 4935 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| 4936 break; |
| 4937 } |
| 4938 } |
| 4939 goto GC_Done; |
| 4940 |
| 4941 GC_Control: |
| 4942 // Most control chars stand alone (don't combine with combining char
s), |
| 4943 // except for that CR/LF sequence is a single grapheme cluster. |
| 4944 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput
Idx] == 0x0a) { |
| 4945 fp->fInputIdx++; |
| 4946 } |
| 4947 |
| 4948 GC_Done: |
| 4949 if (fp->fInputIdx >= fActiveLimit) { |
| 4950 fHitEnd = TRUE; |
| 4951 } |
| 4952 break; |
| 4953 } |
| 4954 |
| 4955 |
| 4956 |
| 4957 |
| 4958 case URX_BACKSLASH_Z: // Test for end of Input |
| 4959 if (fp->fInputIdx < fAnchorLimit) { |
| 4960 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4961 } else { |
| 4962 fHitEnd = TRUE; |
| 4963 fRequireEnd = TRUE; |
| 4964 } |
| 4965 break; |
| 4966 |
| 4967 |
| 4968 |
| 4969 case URX_STATIC_SETREF: |
| 4970 { |
| 4971 // Test input character against one of the predefined sets |
| 4972 // (Word Characters, for example) |
| 4973 // The high bit of the op value is a flag for the match polarity
. |
| 4974 // 0: success if input char is in set. |
| 4975 // 1: success if input char is not in set. |
| 4976 if (fp->fInputIdx >= fActiveLimit) { |
| 4977 fHitEnd = TRUE; |
| 4978 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4979 break; |
| 4980 } |
| 4981 |
| 4982 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| 4983 opValue &= ~URX_NEG_SET; |
| 4984 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 4985 |
| 4986 UChar32 c; |
| 4987 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4988 if (c < 256) { |
| 4989 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 4990 if (s8->contains(c)) { |
| 4991 success = !success; |
| 4992 } |
| 4993 } else { |
| 4994 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 4995 if (s->contains(c)) { |
| 4996 success = !success; |
| 4997 } |
| 4998 } |
| 4999 if (!success) { |
| 5000 #ifdef REGEX_SMART_BACKTRACKING |
| 5001 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFra
meSize) { |
| 5002 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFr
ame(fFrameSize); |
| 5003 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && f
p->fInputIdx <= prevFrame->fInputIdx) { |
| 5004 // Try to find it, backwards |
| 5005 int64_t reverseIndex = fp->fInputIdx; |
| 5006 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex);
// skip the first character we tried |
| 5007 success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
// reset |
| 5008 do { |
| 5009 U16_PREV(inputBuf, backSearchIndex, reverseIndex
, c); |
| 5010 if (c < 256) { |
| 5011 Regex8BitSet *s8 = &fPattern->fStaticSets8[o
pValue]; |
| 5012 if (s8->contains(c)) { |
| 5013 success = !success; |
| 5014 } |
| 5015 } else { |
| 5016 const UnicodeSet *s = fPattern->fStaticSets[
opValue]; |
| 5017 if (s->contains(c)) { |
| 5018 success = !success; |
| 5019 } |
| 5020 } |
| 5021 } while (reverseIndex > backSearchIndex && !success)
; |
| 5022 |
| 5023 if (success) { |
| 5024 fp = (REStackFrame *)fStack->popFrame(fFrameSize
); |
| 5025 fp->fInputIdx = reverseIndex; |
| 5026 if (fp->fInputIdx > backSearchIndex) { |
| 5027 fp = StateSave(fp, fp->fPatIdx, status); |
| 5028 } |
| 5029 fp->fPatIdx++; // Skip the LOOP_C, we just did t
hat |
| 5030 break; |
| 5031 } |
| 5032 } |
| 5033 } |
| 5034 #endif |
| 5035 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5036 } |
| 5037 } |
| 5038 break; |
| 5039 |
| 5040 |
| 5041 case URX_STAT_SETREF_N: |
| 5042 { |
| 5043 // Test input character for NOT being a member of one of |
| 5044 // the predefined sets (Word Characters, for example) |
| 5045 if (fp->fInputIdx >= fActiveLimit) { |
| 5046 fHitEnd = TRUE; |
| 5047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5048 break; |
| 5049 } |
| 5050 |
| 5051 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 5052 |
| 5053 UChar32 c; |
| 5054 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5055 if (c < 256) { |
| 5056 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 5057 if (s8->contains(c) == FALSE) { |
| 5058 break; |
| 5059 } |
| 5060 } else { |
| 5061 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 5062 if (s->contains(c) == FALSE) { |
| 5063 break; |
| 5064 } |
| 5065 } |
| 5066 |
| 5067 #ifdef REGEX_SMART_BACKTRACKING |
| 5068 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSi
ze) { |
| 5069 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(
fFrameSize); |
| 5070 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->f
InputIdx <= prevFrame->fInputIdx) { |
| 5071 // Try to find it, backwards |
| 5072 int64_t reverseIndex = fp->fInputIdx; |
| 5073 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); //
skip the first character we tried |
| 5074 UBool success = FALSE; |
| 5075 do { |
| 5076 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c)
; |
| 5077 if (c < 256) { |
| 5078 Regex8BitSet *s8 = &fPattern->fStaticSets8[opVal
ue]; |
| 5079 if (s8->contains(c) == FALSE) { |
| 5080 success = TRUE; |
| 5081 break; |
| 5082 } |
| 5083 } else { |
| 5084 const UnicodeSet *s = fPattern->fStaticSets[opVa
lue]; |
| 5085 if (s->contains(c) == FALSE) { |
| 5086 success = TRUE; |
| 5087 break; |
| 5088 } |
| 5089 } |
| 5090 } while (reverseIndex > backSearchIndex); |
| 5091 |
| 5092 if (success) { |
| 5093 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5094 fp->fInputIdx = reverseIndex; |
| 5095 if (fp->fInputIdx > backSearchIndex) { |
| 5096 fp = StateSave(fp, fp->fPatIdx, status); |
| 5097 } |
| 5098 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 5099 break; |
| 5100 } |
| 5101 } |
| 5102 } |
| 5103 #endif |
| 5104 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5105 } |
| 5106 break; |
| 5107 |
| 5108 |
| 5109 case URX_SETREF: |
| 5110 { |
| 5111 if (fp->fInputIdx >= fActiveLimit) { |
| 5112 fHitEnd = TRUE; |
| 5113 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5114 break; |
| 5115 } |
| 5116 |
| 5117 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 5118 |
| 5119 // There is input left. Pick up one char and test it for set me
mbership. |
| 5120 UChar32 c; |
| 5121 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5122 if (c<256) { |
| 5123 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 5124 if (s8->contains(c)) { |
| 5125 // The character is in the set. A Match. |
| 5126 break; |
| 5127 } |
| 5128 } else { |
| 5129 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 5130 if (s->contains(c)) { |
| 5131 // The character is in the set. A Match. |
| 5132 break; |
| 5133 } |
| 5134 } |
| 5135 |
| 5136 // the character wasn't in the set. |
| 5137 #ifdef REGEX_SMART_BACKTRACKING |
| 5138 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSi
ze) { |
| 5139 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(
fFrameSize); |
| 5140 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->f
InputIdx <= prevFrame->fInputIdx) { |
| 5141 // Try to find it, backwards |
| 5142 int64_t reverseIndex = fp->fInputIdx; |
| 5143 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); //
skip the first character we tried |
| 5144 UBool success = FALSE; |
| 5145 do { |
| 5146 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c)
; |
| 5147 if (c < 256) { |
| 5148 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 5149 if (s8->contains(c)) { |
| 5150 success = TRUE; |
| 5151 break; |
| 5152 } |
| 5153 } else { |
| 5154 UnicodeSet *s = (UnicodeSet *)sets->elementAt(op
Value); |
| 5155 if (s->contains(c)) { |
| 5156 success = TRUE; |
| 5157 break; |
| 5158 } |
| 5159 } |
| 5160 } while (reverseIndex > backSearchIndex); |
| 5161 |
| 5162 if (success) { |
| 5163 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5164 fp->fInputIdx = reverseIndex; |
| 5165 if (fp->fInputIdx > reverseIndex) { |
| 5166 fp = StateSave(fp, fp->fPatIdx, status); |
| 5167 } |
| 5168 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 5169 break; |
| 5170 } |
| 5171 } |
| 5172 } |
| 5173 #endif |
| 5174 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5175 } |
| 5176 break; |
| 5177 |
| 5178 |
| 5179 case URX_DOTANY: |
| 5180 { |
| 5181 // . matches anything, but stops at end-of-line. |
| 5182 if (fp->fInputIdx >= fActiveLimit) { |
| 5183 // At end of input. Match failed. Backtrack out. |
| 5184 fHitEnd = TRUE; |
| 5185 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5186 break; |
| 5187 } |
| 5188 |
| 5189 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 5190 UChar32 c; |
| 5191 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5192 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 5193 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 5194 // End of line in normal mode. . does not match. |
| 5195 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5196 break; |
| 5197 } |
| 5198 } |
| 5199 break; |
| 5200 |
| 5201 |
| 5202 case URX_DOTANY_ALL: |
| 5203 { |
| 5204 // . in dot-matches-all (including new lines) mode |
| 5205 if (fp->fInputIdx >= fActiveLimit) { |
| 5206 // At end of input. Match failed. Backtrack out. |
| 5207 fHitEnd = TRUE; |
| 5208 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5209 break; |
| 5210 } |
| 5211 |
| 5212 // There is input left. Advance over one char, except if we are |
| 5213 // at a cr/lf, advance over both of them. |
| 5214 UChar32 c; |
| 5215 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5216 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| 5217 // In the case of a CR/LF, we need to advance over both. |
| 5218 if (inputBuf[fp->fInputIdx] == 0x0a) { |
| 5219 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); |
| 5220 } |
| 5221 } |
| 5222 } |
| 5223 break; |
| 5224 |
| 5225 |
| 5226 case URX_DOTANY_UNIX: |
| 5227 { |
| 5228 // '.' operator, matches all, but stops at end-of-line. |
| 5229 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
| 5230 if (fp->fInputIdx >= fActiveLimit) { |
| 5231 // At end of input. Match failed. Backtrack out. |
| 5232 fHitEnd = TRUE; |
| 5233 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5234 break; |
| 5235 } |
| 5236 |
| 5237 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 5238 UChar32 c; |
| 5239 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5240 if (c == 0x0a) { |
| 5241 // End of line in normal mode. '.' does not match the \n |
| 5242 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5243 } |
| 5244 } |
| 5245 break; |
| 5246 |
| 5247 |
| 5248 case URX_JMP: |
| 5249 fp->fPatIdx = opValue; |
| 5250 break; |
| 5251 |
| 5252 case URX_FAIL: |
| 5253 isMatch = FALSE; |
| 5254 goto breakFromLoop; |
| 5255 |
| 5256 case URX_JMP_SAV: |
| 5257 U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
| 5258 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current |
| 5259 fp->fPatIdx = opValue; // Then JMP. |
| 5260 break; |
| 5261 |
| 5262 case URX_JMP_SAV_X: |
| 5263 // This opcode is used with (x)+, when x can match a zero length str
ing. |
| 5264 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. |
| 5265 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the |
| 5266 // data address of the input position at the start of the loop. |
| 5267 { |
| 5268 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); |
| 5269 int32_t stoOp = (int32_t)pat[opValue-1]; |
| 5270 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| 5271 int32_t frameLoc = URX_VAL(stoOp); |
| 5272 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| 5273 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; |
| 5274 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| 5275 if (prevInputIdx < fp->fInputIdx) { |
| 5276 // The match did make progress. Repeat the loop. |
| 5277 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
| 5278 fp->fPatIdx = opValue; |
| 5279 fp->fExtra[frameLoc] = fp->fInputIdx; |
| 5280 } |
| 5281 // If the input position did not advance, we do nothing here, |
| 5282 // execution will fall out of the loop. |
| 5283 } |
| 5284 break; |
| 5285 |
| 5286 case URX_CTR_INIT: |
| 5287 { |
| 5288 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 5289 fp->fExtra[opValue] = 0; // Set the loop counter variable
to zero |
| 5290 |
| 5291 // Pick up the three extra operands that CTR_INIT has, and |
| 5292 // skip the pattern location counter past |
| 5293 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 5294 fp->fPatIdx += 3; |
| 5295 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 5296 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 5297 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 5298 U_ASSERT(minCount>=0); |
| 5299 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 5300 U_ASSERT(loopLoc>fp->fPatIdx); |
| 5301 |
| 5302 if (minCount == 0) { |
| 5303 fp = StateSave(fp, loopLoc+1, status); |
| 5304 } |
| 5305 if (maxCount == 0) { |
| 5306 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5307 } |
| 5308 } |
| 5309 break; |
| 5310 |
| 5311 case URX_CTR_LOOP: |
| 5312 { |
| 5313 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 5314 int32_t initOp = (int32_t)pat[opValue]; |
| 5315 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
| 5316 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 5317 int32_t minCount = (int32_t)pat[opValue+2]; |
| 5318 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 5319 // Increment the counter. Note: we DIDN'T worry about counter |
| 5320 // overflow, since the data comes from UnicodeStrings, which |
| 5321 // stores its length in an int32_t. Do we have to think about |
| 5322 // this now that we're using UText? Probably not, since the le
ngth |
| 5323 // in UChar32s is still an int32_t. |
| 5324 (*pCounter)++; |
| 5325 U_ASSERT(*pCounter > 0); |
| 5326 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { |
| 5327 U_ASSERT(*pCounter == maxCount || maxCount == -1); |
| 5328 break; |
| 5329 } |
| 5330 if (*pCounter >= minCount) { |
| 5331 fp = StateSave(fp, fp->fPatIdx, status); |
| 5332 } |
| 5333 fp->fPatIdx = opValue + 4; // Loop back. |
| 5334 } |
| 5335 break; |
| 5336 |
| 5337 case URX_CTR_INIT_NG: |
| 5338 { |
| 5339 // Initialize a non-greedy loop |
| 5340 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 5341 fp->fExtra[opValue] = 0; // Set the loop counter variable
to zero |
| 5342 |
| 5343 // Pick up the three extra operands that CTR_INIT has, and |
| 5344 // skip the pattern location counter past |
| 5345 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 5346 fp->fPatIdx += 3; |
| 5347 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 5348 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 5349 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 5350 U_ASSERT(minCount>=0); |
| 5351 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 5352 U_ASSERT(loopLoc>fp->fPatIdx); |
| 5353 |
| 5354 if (minCount == 0) { |
| 5355 if (maxCount != 0) { |
| 5356 fp = StateSave(fp, fp->fPatIdx, status); |
| 5357 } |
| 5358 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
| 5359 } |
| 5360 } |
| 5361 break; |
| 5362 |
| 5363 case URX_CTR_LOOP_NG: |
| 5364 { |
| 5365 // Non-greedy {min, max} loops |
| 5366 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 5367 int32_t initOp = (int32_t)pat[opValue]; |
| 5368 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| 5369 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 5370 int32_t minCount = (int32_t)pat[opValue+2]; |
| 5371 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 5372 // Increment the counter. Note: we DIDN'T worry about counter |
| 5373 // overflow, since the data comes from UnicodeStrings, which |
| 5374 // stores its length in an int32_t. Do we have to think about |
| 5375 // this now that we're using UText? Probably not, since the le
ngth |
| 5376 // in UChar32s is still an int32_t. |
| 5377 (*pCounter)++; |
| 5378 U_ASSERT(*pCounter > 0); |
| 5379 |
| 5380 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { |
| 5381 // The loop has matched the maximum permitted number of time
s. |
| 5382 // Break out of here with no action. Matching will |
| 5383 // continue with the following pattern. |
| 5384 U_ASSERT(*pCounter == maxCount || maxCount == -1); |
| 5385 break; |
| 5386 } |
| 5387 |
| 5388 if (*pCounter < minCount) { |
| 5389 // We haven't met the minimum number of matches yet. |
| 5390 // Loop back for another one. |
| 5391 fp->fPatIdx = opValue + 4; // Loop back. |
| 5392 } else { |
| 5393 // We do have the minimum number of matches. |
| 5394 // Fall into the following pattern, but first do |
| 5395 // a state save to the top of the loop, so that a failure |
| 5396 // in the following pattern will try another iteration of
the loop. |
| 5397 fp = StateSave(fp, opValue + 4, status); |
| 5398 } |
| 5399 } |
| 5400 break; |
| 5401 |
| 5402 case URX_STO_SP: |
| 5403 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 5404 fData[opValue] = fStack->size(); |
| 5405 break; |
| 5406 |
| 5407 case URX_LD_SP: |
| 5408 { |
| 5409 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 5410 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5411 U_ASSERT(newStackSize <= fStack->size()); |
| 5412 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; |
| 5413 if (newFP == (int64_t *)fp) { |
| 5414 break; |
| 5415 } |
| 5416 int32_t i; |
| 5417 for (i=0; i<fFrameSize; i++) { |
| 5418 newFP[i] = ((int64_t *)fp)[i]; |
| 5419 } |
| 5420 fp = (REStackFrame *)newFP; |
| 5421 fStack->setSize(newStackSize); |
| 5422 } |
| 5423 break; |
| 5424 |
| 5425 case URX_BACKREF: |
| 5426 case URX_BACKREF_I: |
| 5427 { |
| 5428 U_ASSERT(opValue < fFrameSize); |
| 5429 int64_t groupStartIdx = fp->fExtra[opValue]; |
| 5430 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| 5431 U_ASSERT(groupStartIdx <= groupEndIdx); |
| 5432 int64_t len = groupEndIdx-groupStartIdx; |
| 5433 if (groupStartIdx < 0) { |
| 5434 // This capture group has not participated in the match thus
far, |
| 5435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 5436 } |
| 5437 |
| 5438 if (len == 0) { |
| 5439 // The capture group match was of an empty string. |
| 5440 // Verified by testing: Perl matches succeed in this
case, so |
| 5441 // we do too. |
| 5442 break; |
| 5443 } |
| 5444 |
| 5445 UBool haveMatch = FALSE; |
| 5446 if (fp->fInputIdx + len <= fActiveLimit) { |
| 5447 if (opType == URX_BACKREF) { |
| 5448 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInpu
tIdx, (int32_t)len) == 0) { |
| 5449 haveMatch = TRUE; |
| 5450 } |
| 5451 } else { |
| 5452 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->f
InputIdx, |
| 5453 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { |
| 5454 haveMatch = TRUE; |
| 5455 } |
| 5456 } |
| 5457 } else { |
| 5458 // TODO: probably need to do a partial string comparison, an
d only |
| 5459 // set HitEnd if the available input matched. Ticket
#6074 |
| 5460 fHitEnd = TRUE; |
| 5461 } |
| 5462 if (haveMatch) { |
| 5463 fp->fInputIdx += len; // Match. Advance current input p
osition. |
| 5464 } else { |
| 5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 5466 } |
| 5467 } |
| 5468 break; |
| 5469 |
| 5470 case URX_STO_INP_LOC: |
| 5471 { |
| 5472 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| 5473 fp->fExtra[opValue] = fp->fInputIdx; |
| 5474 } |
| 5475 break; |
| 5476 |
| 5477 case URX_JMPX: |
| 5478 { |
| 5479 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 5480 fp->fPatIdx += 1; |
| 5481 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
| 5482 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
| 5483 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; |
| 5484 U_ASSERT(savedInputIdx <= fp->fInputIdx); |
| 5485 if (savedInputIdx < fp->fInputIdx) { |
| 5486 fp->fPatIdx = opValue; // JMP |
| 5487 } else { |
| 5488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no progress in loop. |
| 5489 } |
| 5490 } |
| 5491 break; |
| 5492 |
| 5493 case URX_LA_START: |
| 5494 { |
| 5495 // Entering a lookahead block. |
| 5496 // Save Stack Ptr, Input Pos. |
| 5497 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5498 fData[opValue] = fStack->size(); |
| 5499 fData[opValue+1] = fp->fInputIdx; |
| 5500 fActiveStart = fLookStart; // Set the match region
change for |
| 5501 fActiveLimit = fLookLimit; // transparent bounds. |
| 5502 } |
| 5503 break; |
| 5504 |
| 5505 case URX_LA_END: |
| 5506 { |
| 5507 // Leaving a look-ahead block. |
| 5508 // restore Stack Ptr, Input Pos to positions they had on entry
to block. |
| 5509 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5510 int32_t stackSize = fStack->size(); |
| 5511 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5512 U_ASSERT(stackSize >= newStackSize); |
| 5513 if (stackSize > newStackSize) { |
| 5514 // Copy the current top frame back to the new (cut back) top
frame. |
| 5515 // This makes the capture groups from within the look-ahea
d |
| 5516 // expression available. |
| 5517 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; |
| 5518 int32_t i; |
| 5519 for (i=0; i<fFrameSize; i++) { |
| 5520 newFP[i] = ((int64_t *)fp)[i]; |
| 5521 } |
| 5522 fp = (REStackFrame *)newFP; |
| 5523 fStack->setSize(newStackSize); |
| 5524 } |
| 5525 fp->fInputIdx = fData[opValue+1]; |
| 5526 |
| 5527 // Restore the active region bounds in the input string; they ma
y have |
| 5528 // been changed because of transparent bounds on a Region. |
| 5529 fActiveStart = fRegionStart; |
| 5530 fActiveLimit = fRegionLimit; |
| 5531 } |
| 5532 break; |
| 5533 |
| 5534 case URX_ONECHAR_I: |
| 5535 if (fp->fInputIdx < fActiveLimit) { |
| 5536 UChar32 c; |
| 5537 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5538 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 5539 break; |
| 5540 } |
| 5541 } else { |
| 5542 fHitEnd = TRUE; |
| 5543 } |
| 5544 |
| 5545 #ifdef REGEX_SMART_BACKTRACKING |
| 5546 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize)
{ |
| 5547 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFra
meSize); |
| 5548 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInpu
tIdx <= prevFrame->fInputIdx) { |
| 5549 UBool success = FALSE; |
| 5550 int64_t reverseIndex = fp->fInputIdx; |
| 5551 UChar32 c; |
| 5552 while (reverseIndex > backSearchIndex) { |
| 5553 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); |
| 5554 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 5555 success = TRUE; |
| 5556 break; |
| 5557 } else if (c == U_SENTINEL) { |
| 5558 break; |
| 5559 } |
| 5560 } |
| 5561 if (success) { |
| 5562 fHitEnd = FALSE; |
| 5563 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5564 fp->fInputIdx = reverseIndex; |
| 5565 if (fp->fInputIdx > backSearchIndex) { |
| 5566 fp = StateSave(fp, fp->fPatIdx, status); |
| 5567 } |
| 5568 fp->fPatIdx++; // Skip the LOOP_C, we just did that |
| 5569 break; |
| 5570 } |
| 5571 } |
| 5572 } |
| 5573 #endif |
| 5574 |
| 5575 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5576 break; |
| 5577 |
| 5578 case URX_STRING_I: |
| 5579 { |
| 5580 // Test input against a literal string. |
| 5581 // Strings require two slots in the compiled pattern, one for th
e |
| 5582 // offset to the string text, and one for the length. |
| 5583 const UCaseProps *csp = ucase_getSingleton(); |
| 5584 { |
| 5585 int32_t stringStartIdx, stringLen; |
| 5586 stringStartIdx = opValue; |
| 5587 |
| 5588 op = (int32_t)pat[fp->fPatIdx]; |
| 5589 fp->fPatIdx++; |
| 5590 opType = URX_TYPE(op); |
| 5591 opValue = URX_VAL(op); |
| 5592 U_ASSERT(opType == URX_STRING_LEN); |
| 5593 stringLen = opValue; |
| 5594 |
| 5595 const UChar *patternChars = litText+stringStartIdx; |
| 5596 const UChar *patternEnd = patternChars+stringLen; |
| 5597 |
| 5598 const UChar *foldChars = NULL; |
| 5599 int32_t foldOffset, foldLength; |
| 5600 UChar32 c; |
| 5601 |
| 5602 #ifdef REGEX_SMART_BACKTRACKING |
| 5603 int32_t originalInputIdx = fp->fInputIdx; |
| 5604 #endif |
| 5605 UBool success = TRUE; |
| 5606 |
| 5607 foldOffset = foldLength = 0; |
| 5608 |
| 5609 while (patternChars < patternEnd && success) { |
| 5610 if(foldOffset < foldLength) { |
| 5611 U16_NEXT_UNSAFE(foldChars, foldOffset, c); |
| 5612 } else { |
| 5613 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5614 foldLength = ucase_toFullFolding(csp, c, &foldChars,
U_FOLD_CASE_DEFAULT); |
| 5615 if(foldLength >= 0) { |
| 5616 if(foldLength <= UCASE_MAX_STRING_LENGTH) { //
!!!: Does not correctly handle chars that fold to 0-length strings |
| 5617 foldOffset = 0; |
| 5618 U16_NEXT_UNSAFE(foldChars, foldOffset, c); |
| 5619 } else { |
| 5620 c = foldLength; |
| 5621 foldLength = foldOffset; // to avoid reading
chars from the folding buffer |
| 5622 } |
| 5623 } |
| 5624 } |
| 5625 |
| 5626 if (fp->fInputIdx <= fActiveLimit) { |
| 5627 if (U_IS_BMP(c)) { |
| 5628 success = (*patternChars == c); |
| 5629 patternChars += 1; |
| 5630 } else if (patternChars+1 < patternEnd) { |
| 5631 success = (*patternChars == U16_LEAD(c) && *(pat
ternChars+1) == U16_TRAIL(c)); |
| 5632 patternChars += 2; |
| 5633 } |
| 5634 } else { |
| 5635 success = FALSE; |
| 5636 fHitEnd = TRUE; // TODO: See ticket 6074 |
| 5637 } |
| 5638 } |
| 5639 |
| 5640 if (!success) { |
| 5641 #ifdef REGEX_SMART_BACKTRACKING |
| 5642 if (fp->fInputIdx > backSearchIndex && fStack->size()) { |
| 5643 REStackFrame *prevFrame = (REStackFrame *)fStack->pe
ekFrame(fFrameSize); |
| 5644 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx])
&& fp->fInputIdx <= prevFrame->fInputIdx) { |
| 5645 // Reset to last start point |
| 5646 int64_t reverseIndex = originalInputIdx; |
| 5647 patternChars = litText+stringStartIdx; |
| 5648 |
| 5649 // Search backwards for a possible start |
| 5650 do { |
| 5651 U16_PREV(inputBuf, backSearchIndex, reverseI
ndex, c); |
| 5652 foldLength = ucase_toFullFolding(csp, c, &fo
ldChars, U_FOLD_CASE_DEFAULT); |
| 5653 if(foldLength >= 0) { |
| 5654 if(foldLength <= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle chars that fold to 0-length strings |
| 5655 foldOffset = 0; |
| 5656 U16_NEXT_UNSAFE(foldChars, foldOffse
t, c); |
| 5657 } else { |
| 5658 c = foldLength; |
| 5659 foldLength = foldOffset; // to avoid
reading chars from the folding buffer |
| 5660 } |
| 5661 } |
| 5662 |
| 5663 if ((U_IS_BMP(c) && *patternChars == c) || |
| 5664 (*patternChars == U16_LEAD(c) && *(pa
tternChars+1) == U16_TRAIL(c))) { |
| 5665 success = TRUE; |
| 5666 break; |
| 5667 } |
| 5668 } while (reverseIndex > backSearchIndex); |
| 5669 |
| 5670 // And try again |
| 5671 if (success) { |
| 5672 fp = (REStackFrame *)fStack->popFrame(fFrame
Size); |
| 5673 fp->fInputIdx = reverseIndex; |
| 5674 if (fp->fInputIdx > backSearchIndex) { |
| 5675 fp = StateSave(fp, fp->fPatIdx, status); |
| 5676 } |
| 5677 fp->fPatIdx++; // Skip the LOOP_C, we just d
id that |
| 5678 break; |
| 5679 } |
| 5680 } |
| 5681 } |
| 5682 #endif |
| 5683 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5684 } |
| 5685 } |
| 5686 } |
| 5687 break; |
| 5688 |
| 5689 case URX_LB_START: |
| 5690 { |
| 5691 // Entering a look-behind block. |
| 5692 // Save Stack Ptr, Input Pos. |
| 5693 // TODO: implement transparent bounds. Ticket #6067 |
| 5694 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5695 fData[opValue] = fStack->size(); |
| 5696 fData[opValue+1] = fp->fInputIdx; |
| 5697 // Init the variable containing the start index for attempted ma
tches. |
| 5698 fData[opValue+2] = -1; |
| 5699 // Save input string length, then reset to pin any matches to en
d at |
| 5700 // the current position. |
| 5701 fData[opValue+3] = fActiveLimit; |
| 5702 fActiveLimit = fp->fInputIdx; |
| 5703 } |
| 5704 break; |
| 5705 |
| 5706 |
| 5707 case URX_LB_CONT: |
| 5708 { |
| 5709 // Positive Look-Behind, at top of loop checking for matches of
LB expression |
| 5710 // at all possible input starting positions. |
| 5711 |
| 5712 // Fetch the min and max possible match lengths. They are the o
perands |
| 5713 // of this op in the pattern. |
| 5714 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 5715 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 5716 U_ASSERT(minML <= maxML); |
| 5717 U_ASSERT(minML >= 0); |
| 5718 |
| 5719 // Fetch (from data) the last input index where a match was atte
mpted. |
| 5720 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5721 int64_t *lbStartIdx = &fData[opValue+2]; |
| 5722 if (*lbStartIdx < 0) { |
| 5723 // First time through loop. |
| 5724 *lbStartIdx = fp->fInputIdx - minML; |
| 5725 } else { |
| 5726 // 2nd through nth time through the loop. |
| 5727 // Back up start position for match by one. |
| 5728 if (*lbStartIdx == 0) { |
| 5729 (*lbStartIdx)--; |
| 5730 } else { |
| 5731 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| 5732 } |
| 5733 } |
| 5734 |
| 5735 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 5736 // We have tried all potential match starting points without |
| 5737 // getting a match. Backtrack out, and out of the |
| 5738 // Look Behind altogether. |
| 5739 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5740 int64_t restoreInputLen = fData[opValue+3]; |
| 5741 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 5742 U_ASSERT(restoreInputLen <= fInputLength); |
| 5743 fActiveLimit = restoreInputLen; |
| 5744 break; |
| 5745 } |
| 5746 |
| 5747 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
| 5748 // (successful match will fall off the end of the loop.) |
| 5749 fp = StateSave(fp, fp->fPatIdx-3, status); |
| 5750 fp->fInputIdx = *lbStartIdx; |
| 5751 } |
| 5752 break; |
| 5753 |
| 5754 case URX_LB_END: |
| 5755 // End of a look-behind block, after a successful match. |
| 5756 { |
| 5757 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5758 if (fp->fInputIdx != fActiveLimit) { |
| 5759 // The look-behind expression matched, but the match did no
t |
| 5760 // extend all the way to the point that we are looking be
hind from. |
| 5761 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 5762 // will retry the match starting at another position or
fail |
| 5763 // the look-behind altogether, whichever is appropriate. |
| 5764 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5765 break; |
| 5766 } |
| 5767 |
| 5768 // Look-behind match is good. Restore the orignal input string
length, |
| 5769 // which had been truncated to pin the end of the lookbehind m
atch to the |
| 5770 // position being looked-behind. |
| 5771 int64_t originalInputLen = fData[opValue+3]; |
| 5772 U_ASSERT(originalInputLen >= fActiveLimit); |
| 5773 U_ASSERT(originalInputLen <= fInputLength); |
| 5774 fActiveLimit = originalInputLen; |
| 5775 } |
| 5776 break; |
| 5777 |
| 5778 |
| 5779 case URX_LBN_CONT: |
| 5780 { |
| 5781 // Negative Look-Behind, at top of loop checking for matches of
LB expression |
| 5782 // at all possible input starting positions. |
| 5783 |
| 5784 // Fetch the extra parameters of this op. |
| 5785 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 5786 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 5787 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
| 5788 continueLoc = URX_VAL(continueLoc); |
| 5789 U_ASSERT(minML <= maxML); |
| 5790 U_ASSERT(minML >= 0); |
| 5791 U_ASSERT(continueLoc > fp->fPatIdx); |
| 5792 |
| 5793 // Fetch (from data) the last input index where a match was atte
mpted. |
| 5794 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5795 int64_t *lbStartIdx = &fData[opValue+2]; |
| 5796 if (*lbStartIdx < 0) { |
| 5797 // First time through loop. |
| 5798 *lbStartIdx = fp->fInputIdx - minML; |
| 5799 } else { |
| 5800 // 2nd through nth time through the loop. |
| 5801 // Back up start position for match by one. |
| 5802 if (*lbStartIdx == 0) { |
| 5803 (*lbStartIdx)--; // Because U16_BACK is unsafe startin
g at 0. |
| 5804 } else { |
| 5805 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| 5806 } |
| 5807 } |
| 5808 |
| 5809 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 5810 // We have tried all potential match starting points without |
| 5811 // getting a match, which means that the negative lookbehin
d as |
| 5812 // a whole has succeeded. Jump forward to the continue loc
ation |
| 5813 int64_t restoreInputLen = fData[opValue+3]; |
| 5814 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 5815 U_ASSERT(restoreInputLen <= fInputLength); |
| 5816 fActiveLimit = restoreInputLen; |
| 5817 fp->fPatIdx = continueLoc; |
| 5818 break; |
| 5819 } |
| 5820 |
| 5821 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
| 5822 // (successful match will cause a FAIL out of the loop alto
gether.) |
| 5823 fp = StateSave(fp, fp->fPatIdx-4, status); |
| 5824 fp->fInputIdx = *lbStartIdx; |
| 5825 } |
| 5826 break; |
| 5827 |
| 5828 case URX_LBN_END: |
| 5829 // End of a negative look-behind block, after a successful match. |
| 5830 { |
| 5831 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5832 if (fp->fInputIdx != fActiveLimit) { |
| 5833 // The look-behind expression matched, but the match did no
t |
| 5834 // extend all the way to the point that we are looking be
hind from. |
| 5835 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 5836 // will retry the match starting at another position or
succeed |
| 5837 // the look-behind altogether, whichever is appropriate. |
| 5838 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5839 break; |
| 5840 } |
| 5841 |
| 5842 // Look-behind expression matched, which means look-behind test
as |
| 5843 // a whole Fails |
| 5844 |
| 5845 // Restore the orignal input string length, which had been tru
ncated |
| 5846 // inorder to pin the end of the lookbehind match |
| 5847 // to the position being looked-behind. |
| 5848 int64_t originalInputLen = fData[opValue+3]; |
| 5849 U_ASSERT(originalInputLen >= fActiveLimit); |
| 5850 U_ASSERT(originalInputLen <= fInputLength); |
| 5851 fActiveLimit = originalInputLen; |
| 5852 |
| 5853 // Restore original stack position, discarding any state saved |
| 5854 // by the successful pattern match. |
| 5855 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5856 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5857 U_ASSERT(fStack->size() > newStackSize); |
| 5858 fStack->setSize(newStackSize); |
| 5859 |
| 5860 // FAIL, which will take control back to someplace |
| 5861 // prior to entering the look-behind test. |
| 5862 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5863 } |
| 5864 break; |
| 5865 |
| 5866 |
| 5867 case URX_LOOP_SR_I: |
| 5868 // Loop Initialization for the optimized implementation of |
| 5869 // [some character set]* |
| 5870 // This op scans through all matching input. |
| 5871 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 5872 { |
| 5873 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 5874 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 5875 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 5876 |
| 5877 // Loop through input, until either the input is exhausted or |
| 5878 // we reach a character that is not a member of the set. |
| 5879 int32_t ix = (int32_t)fp->fInputIdx; |
| 5880 for (;;) { |
| 5881 if (ix >= fActiveLimit) { |
| 5882 fHitEnd = TRUE; |
| 5883 break; |
| 5884 } |
| 5885 UChar32 c; |
| 5886 U16_NEXT(inputBuf, ix, fActiveLimit, c); |
| 5887 if (c<256) { |
| 5888 if (s8->contains(c) == FALSE) { |
| 5889 U16_BACK_1(inputBuf, 0, ix); |
| 5890 break; |
| 5891 } |
| 5892 } else { |
| 5893 if (s->contains(c) == FALSE) { |
| 5894 U16_BACK_1(inputBuf, 0, ix); |
| 5895 break; |
| 5896 } |
| 5897 } |
| 5898 } |
| 5899 |
| 5900 // If there were no matching characters, skip over the loop alto
gether. |
| 5901 // The loop doesn't run at all, a * op always succeeds. |
| 5902 if (ix == fp->fInputIdx) { |
| 5903 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 5904 break; |
| 5905 } |
| 5906 |
| 5907 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 5908 // must follow. It's operand is the stack location |
| 5909 // that holds the starting input index for the match of this [
set]* |
| 5910 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 5911 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 5912 int32_t stackLoc = URX_VAL(loopcOp); |
| 5913 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 5914 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 5915 #ifdef REGEX_SMART_BACKTRACKING |
| 5916 backSearchIndex = fp->fInputIdx; |
| 5917 #endif |
| 5918 fp->fInputIdx = ix; |
| 5919 |
| 5920 // Save State to the URX_LOOP_C op that follows this one, |
| 5921 // so that match failures in the following code will return to
there. |
| 5922 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 5923 fp = StateSave(fp, fp->fPatIdx, status); |
| 5924 fp->fPatIdx++; |
| 5925 } |
| 5926 break; |
| 5927 |
| 5928 |
| 5929 case URX_LOOP_DOT_I: |
| 5930 // Loop Initialization for the optimized implementation of .* |
| 5931 // This op scans through all remaining input. |
| 5932 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 5933 { |
| 5934 // Loop through input until the input is exhausted (we reach an
end-of-line) |
| 5935 // In DOTALL mode, we can just go straight to the end of the inp
ut. |
| 5936 int32_t ix; |
| 5937 if ((opValue & 1) == 1) { |
| 5938 // Dot-matches-All mode. Jump straight to the end of the st
ring. |
| 5939 ix = (int32_t)fActiveLimit; |
| 5940 fHitEnd = TRUE; |
| 5941 } else { |
| 5942 // NOT DOT ALL mode. Line endings do not match '.' |
| 5943 // Scan forward until a line ending or end of input. |
| 5944 ix = (int32_t)fp->fInputIdx; |
| 5945 for (;;) { |
| 5946 if (ix >= fActiveLimit) { |
| 5947 fHitEnd = TRUE; |
| 5948 break; |
| 5949 } |
| 5950 UChar32 c; |
| 5951 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputB
uf[ix++] |
| 5952 if ((c & 0x7f) <= 0x29) { // Fast filter of non
-new-line-s |
| 5953 if ((c == 0x0a) || // 0x0a is newline i
n both modes. |
| 5954 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode |
| 5955 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028
|| c==0x2029))) { |
| 5956 // char is a line ending. Put the input pos ba
ck to the |
| 5957 // line ending char, and exit the scanning lo
op. |
| 5958 U16_BACK_1(inputBuf, 0, ix); |
| 5959 break; |
| 5960 } |
| 5961 } |
| 5962 } |
| 5963 } |
| 5964 |
| 5965 // If there were no matching characters, skip over the loop alto
gether. |
| 5966 // The loop doesn't run at all, a * op always succeeds. |
| 5967 if (ix == fp->fInputIdx) { |
| 5968 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 5969 break; |
| 5970 } |
| 5971 |
| 5972 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 5973 // must follow. It's operand is the stack location |
| 5974 // that holds the starting input index for the match of this .
* |
| 5975 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 5976 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 5977 int32_t stackLoc = URX_VAL(loopcOp); |
| 5978 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 5979 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 5980 #ifdef REGEX_SMART_BACKTRACKING |
| 5981 backSearchIndex = fp->fInputIdx; |
| 5982 #endif |
| 5983 fp->fInputIdx = ix; |
| 5984 |
| 5985 // Save State to the URX_LOOP_C op that follows this one, |
| 5986 // so that match failures in the following code will return to
there. |
| 5987 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 5988 fp = StateSave(fp, fp->fPatIdx, status); |
| 5989 fp->fPatIdx++; |
| 5990 } |
| 5991 break; |
| 5992 |
| 5993 |
| 5994 case URX_LOOP_C: |
| 5995 { |
| 5996 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| 5997 backSearchIndex = (int32_t)fp->fExtra[opValue]; |
| 5998 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| 5999 if (backSearchIndex == fp->fInputIdx) { |
| 6000 // We've backed up the input idx to the point that the loop
started. |
| 6001 // The loop is done. Leave here without saving state. |
| 6002 // Subsequent failures won't come back here. |
| 6003 break; |
| 6004 } |
| 6005 // Set up for the next iteration of the loop, with input index |
| 6006 // backed up by one from the last time through, |
| 6007 // and a state save to this instruction in case the following
code fails again. |
| 6008 // (We're going backwards because this loop emulates stack unw
inding, not |
| 6009 // the initial scan forward.) |
| 6010 U_ASSERT(fp->fInputIdx > 0); |
| 6011 UChar32 prevC; |
| 6012 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this
0 be one of f*Limit? |
| 6013 |
| 6014 if (prevC == 0x0a && |
| 6015 fp->fInputIdx > backSearchIndex && |
| 6016 inputBuf[fp->fInputIdx-1] == 0x0d) { |
| 6017 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| 6018 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| 6019 // .*, stepping back over CRLF pair. |
| 6020 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| 6021 } |
| 6022 } |
| 6023 |
| 6024 |
| 6025 fp = StateSave(fp, fp->fPatIdx-1, status); |
| 6026 } |
| 6027 break; |
| 6028 |
| 6029 |
| 6030 |
| 6031 default: |
| 6032 // Trouble. The compiled pattern contains an entry with an |
| 6033 // unrecognized type tag. |
| 6034 U_ASSERT(FALSE); |
| 6035 } |
| 6036 |
| 6037 if (U_FAILURE(status)) { |
| 6038 isMatch = FALSE; |
| 6039 break; |
| 6040 } |
| 6041 } |
| 6042 |
| 6043 breakFromLoop: |
| 6044 fMatch = isMatch; |
| 6045 if (isMatch) { |
| 6046 fLastMatchEnd = fMatchEnd; |
| 6047 fMatchStart = startIdx; |
| 6048 fMatchEnd = fp->fInputIdx; |
| 6049 if (fTraceDebug) { |
| 6050 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart
, fMatchEnd)); |
| 6051 } |
| 6052 } |
| 6053 else |
| 6054 { |
| 6055 if (fTraceDebug) { |
| 6056 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); |
| 6057 } |
| 6058 } |
| 6059 |
| 6060 fFrame = fp; // The active stack frame when the engine stoppe
d. |
| 6061 // Contains the capture group results that we need to |
| 6062 // access later. |
| 6063 |
| 6064 return; |
| 6065 } |
| 6066 |
| 6067 |
| 6068 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) |
| 6069 |
| 6070 U_NAMESPACE_END |
| 6071 |
| 6072 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 6073 |
OLD | NEW |