| OLD | NEW |
| 1 /* | 1 /* |
| 2 ************************************************************************** | 2 ************************************************************************** |
| 3 * Copyright (C) 2002-2013 International Business Machines Corporation * | 3 * Copyright (C) 2002-2014 International Business Machines Corporation * |
| 4 * and others. All rights reserved. * | 4 * and others. All rights reserved. * |
| 5 ************************************************************************** | 5 ************************************************************************** |
| 6 */ | 6 */ |
| 7 // | 7 // |
| 8 // file: rematch.cpp | 8 // file: rematch.cpp |
| 9 // | 9 // |
| 10 // Contains the implementation of class RegexMatcher, | 10 // Contains the implementation of class RegexMatcher, |
| 11 // which is one of the main API classes for the ICU regular expression p
ackage. | 11 // which is one of the main API classes for the ICU regular expression p
ackage. |
| 12 // | 12 // |
| 13 | 13 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 26 #include "uvector.h" | 26 #include "uvector.h" |
| 27 #include "uvectr32.h" | 27 #include "uvectr32.h" |
| 28 #include "uvectr64.h" | 28 #include "uvectr64.h" |
| 29 #include "regeximp.h" | 29 #include "regeximp.h" |
| 30 #include "regexst.h" | 30 #include "regexst.h" |
| 31 #include "regextxt.h" | 31 #include "regextxt.h" |
| 32 #include "ucase.h" | 32 #include "ucase.h" |
| 33 | 33 |
| 34 // #include <malloc.h> // Needed for heapcheck testing | 34 // #include <malloc.h> // Needed for heapcheck testing |
| 35 | 35 |
| 36 | |
| 37 // Find progress callback | |
| 38 // ---------------------- | |
| 39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary
function call. | |
| 40 // | |
| 41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ | |
| 42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FAL
SE) | |
| 43 | |
| 44 | |
| 45 // Smart Backtracking | |
| 46 // ------------------ | |
| 47 // When a failure would go back to a LOOP_C instruction, | |
| 48 // strings, characters, and setrefs scan backwards for a valid start | |
| 49 // character themselves, pop the stack, and save state, emulating the | |
| 50 // LOOP_C's effect but assured that the next character of input is a | |
| 51 // possible matching character. | |
| 52 // | |
| 53 // Good idea in theory; unfortunately it only helps out a few specific | |
| 54 // cases and slows the engine down a little in the rest. | |
| 55 | |
| 56 U_NAMESPACE_BEGIN | 36 U_NAMESPACE_BEGIN |
| 57 | 37 |
| 58 // Default limit for the size of the back track stack, to avoid system | 38 // Default limit for the size of the back track stack, to avoid system |
| 59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. | 39 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. |
| 60 // This value puts ICU's limits higher than most other regexp implementations, | 40 // This value puts ICU's limits higher than most other regexp implementations, |
| 61 // which use recursion rather than the heap, and take more storage per | 41 // which use recursion rather than the heap, and take more storage per |
| 62 // backtrack point. | 42 // backtrack point. |
| 63 // | 43 // |
| 64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; | 44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; |
| 65 | 45 |
| 66 // Time limit counter constant. | 46 // Time limit counter constant. |
| 67 // Time limits for expression evaluation are in terms of quanta of work by | 47 // Time limits for expression evaluation are in terms of quanta of work by |
| 68 // the engine, each of which is 10,000 state saves. | 48 // the engine, each of which is 10,000 state saves. |
| 69 // This constant determines that state saves per tick number. | 49 // This constant determines that state saves per tick number. |
| 70 static const int32_t TIMER_INITIAL_VALUE = 10000; | 50 static const int32_t TIMER_INITIAL_VALUE = 10000; |
| 71 | 51 |
| 72 //----------------------------------------------------------------------------- | 52 //----------------------------------------------------------------------------- |
| 73 // | 53 // |
| 74 // Constructor and Destructor | 54 // Constructor and Destructor |
| 75 // | 55 // |
| 76 //----------------------------------------------------------------------------- | 56 //----------------------------------------------------------------------------- |
| 77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { | 57 RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
| 78 fDeferredStatus = U_ZERO_ERROR; | 58 fDeferredStatus = U_ZERO_ERROR; |
| 79 init(fDeferredStatus); | 59 init(fDeferredStatus); |
| 80 if (U_FAILURE(fDeferredStatus)) { | 60 if (U_FAILURE(fDeferredStatus)) { |
| 81 return; | 61 return; |
| 82 } | 62 } |
| 83 if (pat==NULL) { | 63 if (pat==NULL) { |
| 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; | 64 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
| 85 return; | 65 return; |
| 86 } | 66 } |
| 87 fPattern = pat; | 67 fPattern = pat; |
| 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); | 68 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); |
| 89 } | 69 } |
| 90 | 70 |
| 91 | 71 |
| 92 | 72 |
| 93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
ut, | 73 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
ut, |
| 94 uint32_t flags, UErrorCode &status) { | 74 uint32_t flags, UErrorCode &status) { |
| 95 init(status); | 75 init(status); |
| 96 if (U_FAILURE(status)) { | 76 if (U_FAILURE(status)) { |
| 97 return; | 77 return; |
| 98 } | 78 } |
| 99 UParseError pe; | 79 UParseError pe; |
| 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 80 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 101 fPattern = fPatternOwned; | 81 fPattern = fPatternOwned; |
| 102 | 82 |
| 103 UText inputText = UTEXT_INITIALIZER; | 83 UText inputText = UTEXT_INITIALIZER; |
| 104 utext_openConstUnicodeString(&inputText, &input, &status); | 84 utext_openConstUnicodeString(&inputText, &input, &status); |
| 105 init2(&inputText, status); | 85 init2(&inputText, status); |
| 106 utext_close(&inputText); | 86 utext_close(&inputText); |
| 107 | 87 |
| 108 fInputUniStrMaybeMutable = TRUE; | 88 fInputUniStrMaybeMutable = TRUE; |
| 109 } | 89 } |
| 110 | 90 |
| 111 | 91 |
| 112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, | 92 RegexMatcher::RegexMatcher(UText *regexp, UText *input, |
| 113 uint32_t flags, UErrorCode &status) { | 93 uint32_t flags, UErrorCode &status) { |
| 114 init(status); | 94 init(status); |
| 115 if (U_FAILURE(status)) { | 95 if (U_FAILURE(status)) { |
| 116 return; | 96 return; |
| 117 } | 97 } |
| 118 UParseError pe; | 98 UParseError pe; |
| 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 99 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 120 if (U_FAILURE(status)) { | 100 if (U_FAILURE(status)) { |
| 121 return; | 101 return; |
| 122 } | 102 } |
| 123 | 103 |
| 124 fPattern = fPatternOwned; | 104 fPattern = fPatternOwned; |
| 125 init2(input, status); | 105 init2(input, status); |
| 126 } | 106 } |
| 127 | 107 |
| 128 | 108 |
| 129 RegexMatcher::RegexMatcher(const UnicodeString &regexp, | 109 RegexMatcher::RegexMatcher(const UnicodeString &regexp, |
| 130 uint32_t flags, UErrorCode &status) { | 110 uint32_t flags, UErrorCode &status) { |
| 131 init(status); | 111 init(status); |
| 132 if (U_FAILURE(status)) { | 112 if (U_FAILURE(status)) { |
| 133 return; | 113 return; |
| 134 } | 114 } |
| 135 UParseError pe; | 115 UParseError pe; |
| 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 116 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 137 if (U_FAILURE(status)) { | 117 if (U_FAILURE(status)) { |
| 138 return; | 118 return; |
| 139 } | 119 } |
| 140 fPattern = fPatternOwned; | 120 fPattern = fPatternOwned; |
| 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); | 121 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| 142 } | 122 } |
| 143 | 123 |
| 144 RegexMatcher::RegexMatcher(UText *regexp, | 124 RegexMatcher::RegexMatcher(UText *regexp, |
| 145 uint32_t flags, UErrorCode &status) { | 125 uint32_t flags, UErrorCode &status) { |
| 146 init(status); | 126 init(status); |
| 147 if (U_FAILURE(status)) { | 127 if (U_FAILURE(status)) { |
| 148 return; | 128 return; |
| 149 } | 129 } |
| 150 UParseError pe; | 130 UParseError pe; |
| 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 131 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| 152 if (U_FAILURE(status)) { | 132 if (U_FAILURE(status)) { |
| 153 return; | 133 return; |
| 154 } | 134 } |
| 155 | 135 |
| 156 fPattern = fPatternOwned; | 136 fPattern = fPatternOwned; |
| 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); | 137 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| 158 } | 138 } |
| 159 | 139 |
| 160 | 140 |
| 161 | 141 |
| 162 | 142 |
| 163 RegexMatcher::~RegexMatcher() { | 143 RegexMatcher::~RegexMatcher() { |
| 164 delete fStack; | 144 delete fStack; |
| 165 if (fData != fSmallData) { | 145 if (fData != fSmallData) { |
| 166 uprv_free(fData); | 146 uprv_free(fData); |
| 167 fData = NULL; | 147 fData = NULL; |
| 168 } | 148 } |
| 169 if (fPatternOwned) { | 149 if (fPatternOwned) { |
| 170 delete fPatternOwned; | 150 delete fPatternOwned; |
| 171 fPatternOwned = NULL; | 151 fPatternOwned = NULL; |
| 172 fPattern = NULL; | 152 fPattern = NULL; |
| 173 } | 153 } |
| 174 | 154 |
| 175 if (fInput) { | 155 if (fInput) { |
| 176 delete fInput; | 156 delete fInput; |
| 177 } | 157 } |
| 178 if (fInputText) { | 158 if (fInputText) { |
| 179 utext_close(fInputText); | 159 utext_close(fInputText); |
| 180 } | 160 } |
| 181 if (fAltInputText) { | 161 if (fAltInputText) { |
| 182 utext_close(fAltInputText); | 162 utext_close(fAltInputText); |
| 183 } | 163 } |
| 184 | 164 |
| 185 #if UCONFIG_NO_BREAK_ITERATION==0 | 165 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 186 delete fWordBreakItr; | 166 delete fWordBreakItr; |
| 187 #endif | 167 #endif |
| 188 } | 168 } |
| 189 | 169 |
| 190 // | 170 // |
| 191 // init() common initialization for use by all constructors. | 171 // init() common initialization for use by all constructors. |
| 192 // Initialize all fields, get the object into a consistent state. | 172 // Initialize all fields, get the object into a consistent state. |
| 193 // This must be done even when the initial status shows an error, | 173 // This must be done even when the initial status shows an error, |
| 194 // so that the object is initialized sufficiently well for the destru
ctor | 174 // so that the object is initialized sufficiently well for the destru
ctor |
| (...skipping 27 matching lines...) Expand all Loading... |
| 222 fTickCounter = 0; | 202 fTickCounter = 0; |
| 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; | 203 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; |
| 224 fCallbackFn = NULL; | 204 fCallbackFn = NULL; |
| 225 fCallbackContext = NULL; | 205 fCallbackContext = NULL; |
| 226 fFindProgressCallbackFn = NULL; | 206 fFindProgressCallbackFn = NULL; |
| 227 fFindProgressCallbackContext = NULL; | 207 fFindProgressCallbackContext = NULL; |
| 228 fTraceDebug = FALSE; | 208 fTraceDebug = FALSE; |
| 229 fDeferredStatus = status; | 209 fDeferredStatus = status; |
| 230 fData = fSmallData; | 210 fData = fSmallData; |
| 231 fWordBreakItr = NULL; | 211 fWordBreakItr = NULL; |
| 232 | 212 |
| 233 fStack = NULL; | 213 fStack = NULL; |
| 234 fInputText = NULL; | 214 fInputText = NULL; |
| 235 fAltInputText = NULL; | 215 fAltInputText = NULL; |
| 236 fInput = NULL; | 216 fInput = NULL; |
| 237 fInputLength = 0; | 217 fInputLength = 0; |
| 238 fInputUniStrMaybeMutable = FALSE; | 218 fInputUniStrMaybeMutable = FALSE; |
| 239 | 219 |
| 240 if (U_FAILURE(status)) { | 220 if (U_FAILURE(status)) { |
| 241 fDeferredStatus = status; | 221 fDeferredStatus = status; |
| 242 } | 222 } |
| 243 } | 223 } |
| 244 | 224 |
| 245 // | 225 // |
| 246 // init2() Common initialization for use by RegexMatcher constructors, part 2
. | 226 // init2() Common initialization for use by RegexMatcher constructors, part 2
. |
| 247 // This handles the common setup to be done after the Pattern is avai
lable. | 227 // This handles the common setup to be done after the Pattern is avai
lable. |
| 248 // | 228 // |
| 249 void RegexMatcher::init2(UText *input, UErrorCode &status) { | 229 void RegexMatcher::init2(UText *input, UErrorCode &status) { |
| 250 if (U_FAILURE(status)) { | 230 if (U_FAILURE(status)) { |
| 251 fDeferredStatus = status; | 231 fDeferredStatus = status; |
| 252 return; | 232 return; |
| 253 } | 233 } |
| 254 | 234 |
| 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0])
)) { | 235 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0])
)) { |
| 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); | 236 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
| 257 if (fData == NULL) { | 237 if (fData == NULL) { |
| 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 238 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 259 return; | 239 return; |
| 260 } | 240 } |
| 261 } | 241 } |
| 262 | 242 |
| 263 fStack = new UVector64(status); | 243 fStack = new UVector64(status); |
| 264 if (fStack == NULL) { | 244 if (fStack == NULL) { |
| 265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 245 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 266 return; | 246 return; |
| (...skipping 12 matching lines...) Expand all Loading... |
| 279 static const UChar DOLLARSIGN = 0x24; | 259 static const UChar DOLLARSIGN = 0x24; |
| 280 //------------------------------------------------------------------------------
-- | 260 //------------------------------------------------------------------------------
-- |
| 281 // | 261 // |
| 282 // appendReplacement | 262 // appendReplacement |
| 283 // | 263 // |
| 284 //------------------------------------------------------------------------------
-- | 264 //------------------------------------------------------------------------------
-- |
| 285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, | 265 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, |
| 286 const UnicodeString &replacement, | 266 const UnicodeString &replacement, |
| 287 UErrorCode &status) { | 267 UErrorCode &status) { |
| 288 UText replacementText = UTEXT_INITIALIZER; | 268 UText replacementText = UTEXT_INITIALIZER; |
| 289 | 269 |
| 290 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 270 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 291 if (U_SUCCESS(status)) { | 271 if (U_SUCCESS(status)) { |
| 292 UText resultText = UTEXT_INITIALIZER; | 272 UText resultText = UTEXT_INITIALIZER; |
| 293 utext_openUnicodeString(&resultText, &dest, &status); | 273 utext_openUnicodeString(&resultText, &dest, &status); |
| 294 | 274 |
| 295 if (U_SUCCESS(status)) { | 275 if (U_SUCCESS(status)) { |
| 296 appendReplacement(&resultText, &replacementText, status); | 276 appendReplacement(&resultText, &replacementText, status); |
| 297 utext_close(&resultText); | 277 utext_close(&resultText); |
| 298 } | 278 } |
| 299 utext_close(&replacementText); | 279 utext_close(&replacementText); |
| 300 } | 280 } |
| 301 | 281 |
| 302 return *this; | 282 return *this; |
| 303 } | 283 } |
| 304 | 284 |
| 305 // | 285 // |
| 306 // appendReplacement, UText mode | 286 // appendReplacement, UText mode |
| 307 // | 287 // |
| 308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, | 288 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
| 309 UText *replacement, | 289 UText *replacement, |
| 310 UErrorCode &status) { | 290 UErrorCode &status) { |
| 311 if (U_FAILURE(status)) { | 291 if (U_FAILURE(status)) { |
| 312 return *this; | 292 return *this; |
| 313 } | 293 } |
| 314 if (U_FAILURE(fDeferredStatus)) { | 294 if (U_FAILURE(fDeferredStatus)) { |
| 315 status = fDeferredStatus; | 295 status = fDeferredStatus; |
| 316 return *this; | 296 return *this; |
| 317 } | 297 } |
| 318 if (fMatch == FALSE) { | 298 if (fMatch == FALSE) { |
| 319 status = U_REGEX_INVALID_STATE; | 299 status = U_REGEX_INVALID_STATE; |
| 320 return *this; | 300 return *this; |
| 321 } | 301 } |
| 322 | 302 |
| 323 // Copy input string from the end of previous match to start of current matc
h | 303 // Copy input string from the end of previous match to start of current matc
h |
| 324 int64_t destLen = utext_nativeLength(dest); | 304 int64_t destLen = utext_nativeLength(dest); |
| 325 if (fMatchStart > fAppendPosition) { | 305 if (fMatchStart > fAppendPosition) { |
| 326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 306 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo
ntents+fAppendPosition, | 307 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo
ntents+fAppendPosition, |
| 328 (int32_t)(fMatchStart-fAppendPosition), &st
atus); | 308 (int32_t)(fMatchStart-fAppendPosition), &st
atus); |
| 329 } else { | 309 } else { |
| 330 int32_t len16; | 310 int32_t len16; |
| 331 if (UTEXT_USES_U16(fInputText)) { | 311 if (UTEXT_USES_U16(fInputText)) { |
| 332 len16 = (int32_t)(fMatchStart-fAppendPosition); | 312 len16 = (int32_t)(fMatchStart-fAppendPosition); |
| 333 } else { | 313 } else { |
| 334 UErrorCode lengthStatus = U_ZERO_ERROR; | 314 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart,
NULL, 0, &lengthStatus); | 315 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart,
NULL, 0, &lengthStatus); |
| 336 } | 316 } |
| 337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); | 317 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| 338 if (inputChars == NULL) { | 318 if (inputChars == NULL) { |
| 339 status = U_MEMORY_ALLOCATION_ERROR; | 319 status = U_MEMORY_ALLOCATION_ERROR; |
| 340 return *this; | 320 return *this; |
| 341 } | 321 } |
| 342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars,
len16+1, &status); | 322 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars,
len16+1, &status); |
| 343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16,
&status); | 323 destLen += utext_replace(dest, destLen, destLen, inputChars, len16,
&status); |
| 344 uprv_free(inputChars); | 324 uprv_free(inputChars); |
| 345 } | 325 } |
| 346 } | 326 } |
| 347 fAppendPosition = fMatchEnd; | 327 fAppendPosition = fMatchEnd; |
| 348 | 328 |
| 349 | 329 |
| 350 // scan the replacement text, looking for substitutions ($n) and \escapes. | 330 // scan the replacement text, looking for substitutions ($n) and \escapes. |
| 351 // TODO: optimize this loop by efficiently scanning for '$' or '\', | 331 // TODO: optimize this loop by efficiently scanning for '$' or '\', |
| 352 // move entire ranges not containing substitutions. | 332 // move entire ranges not containing substitutions. |
| 353 UTEXT_SETNATIVEINDEX(replacement, 0); | 333 UTEXT_SETNATIVEINDEX(replacement, 0); |
| 354 UChar32 c = UTEXT_NEXT32(replacement); | 334 UChar32 c = UTEXT_NEXT32(replacement); |
| 355 while (c != U_SENTINEL) { | 335 while (c != U_SENTINEL) { |
| 356 if (c == BACKSLASH) { | 336 if (c == BACKSLASH) { |
| 357 // Backslash Escape. Copy the following char out without further ch
ecks. | 337 // Backslash Escape. Copy the following char out without further ch
ecks. |
| 358 // Note: Surrogate pairs don't need any special
handling | 338 // Note: Surrogate pairs don't need any special
handling |
| 359 // The second half wont be a '$' or a '\',
and | 339 // The second half wont be a '$' or a '\',
and |
| 360 // will move to the dest normally on the n
ext | 340 // will move to the dest normally on the n
ext |
| 361 // loop iteration. | 341 // loop iteration. |
| 362 c = UTEXT_CURRENT32(replacement); | 342 c = UTEXT_CURRENT32(replacement); |
| 363 if (c == U_SENTINEL) { | 343 if (c == U_SENTINEL) { |
| 364 break; | 344 break; |
| 365 } | 345 } |
| 366 | 346 |
| 367 if (c==0x55/*U*/ || c==0x75/*u*/) { | 347 if (c==0x55/*U*/ || c==0x75/*u*/) { |
| 368 // We have a \udddd or \Udddddddd escape sequence. | 348 // We have a \udddd or \Udddddddd escape sequence. |
| 369 int32_t offset = 0; | 349 int32_t offset = 0; |
| 370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN
ESCAPE_CONTEXT(replacement); | 350 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN
ESCAPE_CONTEXT(replacement); |
| 371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt,
&offset, INT32_MAX, &context); | 351 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt,
&offset, INT32_MAX, &context); |
| 372 if (escapedChar != (UChar32)0xFFFFFFFF) { | 352 if (escapedChar != (UChar32)0xFFFFFFFF) { |
| 373 if (U_IS_BMP(escapedChar)) { | 353 if (U_IS_BMP(escapedChar)) { |
| 374 UChar c16 = (UChar)escapedChar; | 354 UChar c16 = (UChar)escapedChar; |
| 375 destLen += utext_replace(dest, destLen, destLen, &c16, 1
, &status); | 355 destLen += utext_replace(dest, destLen, destLen, &c16, 1
, &status); |
| 376 } else { | 356 } else { |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 414 surrogate[0] = U16_LEAD(c); | 394 surrogate[0] = U16_LEAD(c); |
| 415 surrogate[1] = U16_TRAIL(c); | 395 surrogate[1] = U16_TRAIL(c); |
| 416 if (U_SUCCESS(status)) { | 396 if (U_SUCCESS(status)) { |
| 417 destLen += utext_replace(dest, destLen, destLen, surrogate,
2, &status); | 397 destLen += utext_replace(dest, destLen, destLen, surrogate,
2, &status); |
| 418 } | 398 } |
| 419 } | 399 } |
| 420 } else { | 400 } else { |
| 421 // We've got a $. Pick up a capture group number if one follows. | 401 // We've got a $. Pick up a capture group number if one follows. |
| 422 // Consume at most the number of digits necessary for the largest ca
pture | 402 // Consume at most the number of digits necessary for the largest ca
pture |
| 423 // number that is valid for this pattern. | 403 // number that is valid for this pattern. |
| 424 | 404 |
| 425 int32_t numDigits = 0; | 405 int32_t numDigits = 0; |
| 426 int32_t groupNum = 0; | 406 int32_t groupNum = 0; |
| 427 UChar32 digitC; | 407 UChar32 digitC; |
| 428 for (;;) { | 408 for (;;) { |
| 429 digitC = UTEXT_CURRENT32(replacement); | 409 digitC = UTEXT_CURRENT32(replacement); |
| 430 if (digitC == U_SENTINEL) { | 410 if (digitC == U_SENTINEL) { |
| 431 break; | 411 break; |
| 432 } | 412 } |
| 433 if (u_isdigit(digitC) == FALSE) { | 413 if (u_isdigit(digitC) == FALSE) { |
| 434 break; | 414 break; |
| 435 } | 415 } |
| 436 (void)UTEXT_NEXT32(replacement); | 416 (void)UTEXT_NEXT32(replacement); |
| 437 groupNum=groupNum*10 + u_charDigitValue(digitC); | 417 groupNum=groupNum*10 + u_charDigitValue(digitC); |
| 438 numDigits++; | 418 numDigits++; |
| 439 if (numDigits >= fPattern->fMaxCaptureDigits) { | 419 if (numDigits >= fPattern->fMaxCaptureDigits) { |
| 440 break; | 420 break; |
| 441 } | 421 } |
| 442 } | 422 } |
| 443 | 423 |
| 444 | 424 |
| 445 if (numDigits == 0) { | 425 if (numDigits == 0) { |
| 446 // The $ didn't introduce a group number at all. | 426 // The $ didn't introduce a group number at all. |
| 447 // Treat it as just part of the substitution text. | 427 // Treat it as just part of the substitution text. |
| 448 UChar c16 = DOLLARSIGN; | 428 UChar c16 = DOLLARSIGN; |
| 449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); | 429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); |
| 450 } else { | 430 } else { |
| 451 // Finally, append the capture group data to the destination. | 431 // Finally, append the capture group data to the destination. |
| 452 destLen += appendGroup(groupNum, dest, status); | 432 destLen += appendGroup(groupNum, dest, status); |
| 453 if (U_FAILURE(status)) { | 433 if (U_FAILURE(status)) { |
| 454 // Can fail if group number is out of range. | 434 // Can fail if group number is out of range. |
| 455 break; | 435 break; |
| 456 } | 436 } |
| 457 } | 437 } |
| 458 } | 438 } |
| 459 | 439 |
| 460 if (U_FAILURE(status)) { | 440 if (U_FAILURE(status)) { |
| 461 break; | 441 break; |
| 462 } else { | 442 } else { |
| 463 c = UTEXT_NEXT32(replacement); | 443 c = UTEXT_NEXT32(replacement); |
| 464 } | 444 } |
| 465 } | 445 } |
| 466 | 446 |
| 467 return *this; | 447 return *this; |
| 468 } | 448 } |
| 469 | 449 |
| 470 | 450 |
| 471 | 451 |
| 472 //------------------------------------------------------------------------------
-- | 452 //------------------------------------------------------------------------------
-- |
| 473 // | 453 // |
| 474 // appendTail Intended to be used in conjunction with appendReplacement() | 454 // appendTail Intended to be used in conjunction with appendReplacement() |
| 475 // To the destination string, append everything following | 455 // To the destination string, append everything following |
| 476 // the last match position from the input string. | 456 // the last match position from the input string. |
| 477 // | 457 // |
| 478 // Note: Match ranges do not affect appendTail or appendRepla
cement | 458 // Note: Match ranges do not affect appendTail or appendRepla
cement |
| 479 // | 459 // |
| 480 //------------------------------------------------------------------------------
-- | 460 //------------------------------------------------------------------------------
-- |
| 481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { | 461 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { |
| 482 UErrorCode status = U_ZERO_ERROR; | 462 UErrorCode status = U_ZERO_ERROR; |
| 483 UText resultText = UTEXT_INITIALIZER; | 463 UText resultText = UTEXT_INITIALIZER; |
| 484 utext_openUnicodeString(&resultText, &dest, &status); | 464 utext_openUnicodeString(&resultText, &dest, &status); |
| 485 | 465 |
| 486 if (U_SUCCESS(status)) { | 466 if (U_SUCCESS(status)) { |
| 487 appendTail(&resultText, status); | 467 appendTail(&resultText, status); |
| 488 utext_close(&resultText); | 468 utext_close(&resultText); |
| 489 } | 469 } |
| 490 | 470 |
| 491 return dest; | 471 return dest; |
| 492 } | 472 } |
| 493 | 473 |
| 494 // | 474 // |
| 495 // appendTail, UText mode | 475 // appendTail, UText mode |
| 496 // | 476 // |
| 497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { | 477 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { |
| 498 UBool bailOut = FALSE; | |
| 499 if (U_FAILURE(status)) { | 478 if (U_FAILURE(status)) { |
| 500 bailOut = TRUE; | 479 return dest; |
| 501 } | 480 } |
| 502 if (U_FAILURE(fDeferredStatus)) { | 481 if (U_FAILURE(fDeferredStatus)) { |
| 503 status = fDeferredStatus; | 482 status = fDeferredStatus; |
| 504 bailOut = TRUE; | 483 return dest; |
| 505 } | 484 } |
| 506 | 485 |
| 507 if (bailOut) { | |
| 508 // dest must not be NULL | |
| 509 if (dest) { | |
| 510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(des
t), NULL, 0, &status); | |
| 511 return dest; | |
| 512 } | |
| 513 } | |
| 514 | |
| 515 if (fInputLength > fAppendPosition) { | 486 if (fInputLength > fAppendPosition) { |
| 516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 487 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 517 int64_t destLen = utext_nativeLength(dest); | 488 int64_t destLen = utext_nativeLength(dest); |
| 518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp
endPosition, | 489 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp
endPosition, |
| 519 (int32_t)(fInputLength-fAppendPosition), &status); | 490 (int32_t)(fInputLength-fAppendPosition), &status); |
| 520 } else { | 491 } else { |
| 521 int32_t len16; | 492 int32_t len16; |
| 522 if (UTEXT_USES_U16(fInputText)) { | 493 if (UTEXT_USES_U16(fInputText)) { |
| 523 len16 = (int32_t)(fInputLength-fAppendPosition); | 494 len16 = (int32_t)(fInputLength-fAppendPosition); |
| 524 } else { | 495 } else { |
| 525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength,
NULL, 0, &status); | 496 len16 = utext_extract(fInputText, fAppendPosition, fInputLength,
NULL, 0, &status); |
| 526 status = U_ZERO_ERROR; // buffer overflow | 497 status = U_ZERO_ERROR; // buffer overflow |
| 527 } | 498 } |
| 528 | 499 |
| 529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); | 500 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); |
| 530 if (inputChars == NULL) { | 501 if (inputChars == NULL) { |
| 531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 502 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| 532 } else { | 503 } else { |
| 533 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh
ars, len16, &status); // unterminated | 504 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh
ars, len16, &status); // unterminated |
| 534 int64_t destLen = utext_nativeLength(dest); | 505 int64_t destLen = utext_nativeLength(dest); |
| 535 utext_replace(dest, destLen, destLen, inputChars, len16, &status
); | 506 utext_replace(dest, destLen, destLen, inputChars, len16, &status
); |
| 536 uprv_free(inputChars); | 507 uprv_free(inputChars); |
| 537 } | 508 } |
| 538 } | 509 } |
| 539 } | 510 } |
| 540 return dest; | 511 return dest; |
| 541 } | 512 } |
| 542 | 513 |
| 543 | 514 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 562 if (fMatch == FALSE) { | 533 if (fMatch == FALSE) { |
| 563 err = U_REGEX_INVALID_STATE; | 534 err = U_REGEX_INVALID_STATE; |
| 564 return -1; | 535 return -1; |
| 565 } | 536 } |
| 566 if (group < 0 || group > fPattern->fGroupMap->size()) { | 537 if (group < 0 || group > fPattern->fGroupMap->size()) { |
| 567 err = U_INDEX_OUTOFBOUNDS_ERROR; | 538 err = U_INDEX_OUTOFBOUNDS_ERROR; |
| 568 return -1; | 539 return -1; |
| 569 } | 540 } |
| 570 int64_t e = -1; | 541 int64_t e = -1; |
| 571 if (group == 0) { | 542 if (group == 0) { |
| 572 e = fMatchEnd; | 543 e = fMatchEnd; |
| 573 } else { | 544 } else { |
| 574 // Get the position within the stack frame of the variables for | 545 // Get the position within the stack frame of the variables for |
| 575 // this capture group. | 546 // this capture group. |
| 576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); | 547 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| 577 U_ASSERT(groupOffset < fPattern->fFrameSize); | 548 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 578 U_ASSERT(groupOffset >= 0); | 549 U_ASSERT(groupOffset >= 0); |
| 579 e = fFrame->fExtra[groupOffset + 1]; | 550 e = fFrame->fExtra[groupOffset + 1]; |
| 580 } | 551 } |
| 581 | 552 |
| 582 return e; | 553 return e; |
| 583 } | 554 } |
| 584 | 555 |
| 585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { | 556 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { |
| 586 return (int32_t)end64(group, err); | 557 return (int32_t)end64(group, err); |
| 587 } | 558 } |
| 588 | 559 |
| 560 //------------------------------------------------------------------------------
-- |
| 561 // |
| 562 // findProgressInterrupt This function is called once for each advance in the
target |
| 563 // string from the find() function, and calls the user
progress callback |
| 564 // function if there is one installed. |
| 565 // |
| 566 // Return: TRUE if the find operation is to be terminated. |
| 567 // FALSE if the find operation is to continue running. |
| 568 // |
| 569 //------------------------------------------------------------------------------
-- |
| 570 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { |
| 571 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCall
backContext, pos)) { |
| 572 status = U_REGEX_STOPPED_BY_CALLER; |
| 573 return TRUE; |
| 574 } |
| 575 return FALSE; |
| 576 } |
| 589 | 577 |
| 590 //------------------------------------------------------------------------------
-- | 578 //------------------------------------------------------------------------------
-- |
| 591 // | 579 // |
| 592 // find() | 580 // find() |
| 593 // | 581 // |
| 594 //------------------------------------------------------------------------------
-- | 582 //------------------------------------------------------------------------------
-- |
| 595 UBool RegexMatcher::find() { | 583 UBool RegexMatcher::find() { |
| 584 if (U_FAILURE(fDeferredStatus)) { |
| 585 return FALSE; |
| 586 } |
| 587 UErrorCode status = U_ZERO_ERROR; |
| 588 UBool result = find(status); |
| 589 return result; |
| 590 } |
| 591 |
| 592 //------------------------------------------------------------------------------
-- |
| 593 // |
| 594 // find() |
| 595 // |
| 596 //------------------------------------------------------------------------------
-- |
| 597 UBool RegexMatcher::find(UErrorCode &status) { |
| 596 // Start at the position of the last match end. (Will be zero if the | 598 // Start at the position of the last match end. (Will be zero if the |
| 597 // matcher has been reset.) | 599 // matcher has been reset.) |
| 598 // | 600 // |
| 599 if (U_FAILURE(fDeferredStatus)) { | 601 if (U_FAILURE(status)) { |
| 600 return FALSE; | 602 return FALSE; |
| 601 } | 603 } |
| 602 | 604 if (U_FAILURE(fDeferredStatus)) { |
| 605 status = fDeferredStatus; |
| 606 return FALSE; |
| 607 } |
| 608 |
| 603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 604 return findUsingChunk(); | 610 return findUsingChunk(status); |
| 605 } | 611 } |
| 606 | 612 |
| 607 int64_t startPos = fMatchEnd; | 613 int64_t startPos = fMatchEnd; |
| 608 if (startPos==0) { | 614 if (startPos==0) { |
| 609 startPos = fActiveStart; | 615 startPos = fActiveStart; |
| 610 } | 616 } |
| 611 | 617 |
| 612 if (fMatch) { | 618 if (fMatch) { |
| 613 // Save the position of any previous successful match. | 619 // Save the position of any previous successful match. |
| 614 fLastMatchEnd = fMatchEnd; | 620 fLastMatchEnd = fMatchEnd; |
| (...skipping 27 matching lines...) Expand all Loading... |
| 642 // Be aware of possible overflows if making changes here. | 648 // Be aware of possible overflows if making changes here. |
| 643 int64_t testStartLimit; | 649 int64_t testStartLimit; |
| 644 if (UTEXT_USES_U16(fInputText)) { | 650 if (UTEXT_USES_U16(fInputText)) { |
| 645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; | 651 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; |
| 646 if (startPos > testStartLimit) { | 652 if (startPos > testStartLimit) { |
| 647 fMatch = FALSE; | 653 fMatch = FALSE; |
| 648 fHitEnd = TRUE; | 654 fHitEnd = TRUE; |
| 649 return FALSE; | 655 return FALSE; |
| 650 } | 656 } |
| 651 } else { | 657 } else { |
| 652 // For now, let the matcher discover that it can't match on its own | 658 // We don't know exactly how long the minimum match length is in native
characters. |
| 653 // We don't know how long the match len is in native characters | 659 // Treat anything > 0 as 1. |
| 654 testStartLimit = fActiveLimit; | 660 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); |
| 655 } | 661 } |
| 656 | 662 |
| 657 UChar32 c; | 663 UChar32 c; |
| 658 U_ASSERT(startPos >= 0); | 664 U_ASSERT(startPos >= 0); |
| 659 | 665 |
| 660 switch (fPattern->fStartType) { | 666 switch (fPattern->fStartType) { |
| 661 case START_NO_INFO: | 667 case START_NO_INFO: |
| 662 // No optimization was found. | 668 // No optimization was found. |
| 663 // Try a match at each input position. | 669 // Try a match at each input position. |
| 664 for (;;) { | 670 for (;;) { |
| 665 MatchAt(startPos, FALSE, fDeferredStatus); | 671 MatchAt(startPos, FALSE, status); |
| 666 if (U_FAILURE(fDeferredStatus)) { | 672 if (U_FAILURE(status)) { |
| 667 return FALSE; | 673 return FALSE; |
| 668 } | 674 } |
| 669 if (fMatch) { | 675 if (fMatch) { |
| 670 return TRUE; | 676 return TRUE; |
| 671 } | 677 } |
| 672 if (startPos >= testStartLimit) { | 678 if (startPos >= testStartLimit) { |
| 673 fHitEnd = TRUE; | 679 fHitEnd = TRUE; |
| 674 return FALSE; | 680 return FALSE; |
| 675 } | 681 } |
| 676 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 682 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 677 (void)UTEXT_NEXT32(fInputText); | 683 (void)UTEXT_NEXT32(fInputText); |
| 678 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 684 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 679 // Note that it's perfectly OK for a pattern to have a zero-length | 685 // Note that it's perfectly OK for a pattern to have a zero-length |
| 680 // match at the end of a string, so we must make sure that the loo
p | 686 // match at the end of a string, so we must make sure that the loo
p |
| 681 // runs with startPos == testStartLimit the last time through. | 687 // runs with startPos == testStartLimit the last time through. |
| 682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 688 if (findProgressInterrupt(startPos, status)) |
| 683 return FALSE; | 689 return FALSE; |
| 684 } | 690 } |
| 685 U_ASSERT(FALSE); | 691 U_ASSERT(FALSE); |
| 686 | 692 |
| 687 case START_START: | 693 case START_START: |
| 688 // Matches are only possible at the start of the input string | 694 // Matches are only possible at the start of the input string |
| 689 // (pattern begins with ^ or \A) | 695 // (pattern begins with ^ or \A) |
| 690 if (startPos > fActiveStart) { | 696 if (startPos > fActiveStart) { |
| 691 fMatch = FALSE; | 697 fMatch = FALSE; |
| 692 return FALSE; | 698 return FALSE; |
| 693 } | 699 } |
| 694 MatchAt(startPos, FALSE, fDeferredStatus); | 700 MatchAt(startPos, FALSE, status); |
| 695 if (U_FAILURE(fDeferredStatus)) { | 701 if (U_FAILURE(status)) { |
| 696 return FALSE; | 702 return FALSE; |
| 697 } | 703 } |
| 698 return fMatch; | 704 return fMatch; |
| 699 | 705 |
| 700 | 706 |
| 701 case START_SET: | 707 case START_SET: |
| 702 { | 708 { |
| 703 // Match may start on any char from a pre-computed set. | 709 // Match may start on any char from a pre-computed set. |
| 704 U_ASSERT(fPattern->fMinMatchLen > 0); | 710 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 705 int64_t pos; | |
| 706 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 711 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 707 for (;;) { | 712 for (;;) { |
| 713 int64_t pos = startPos; |
| 708 c = UTEXT_NEXT32(fInputText); | 714 c = UTEXT_NEXT32(fInputText); |
| 709 pos = UTEXT_GETNATIVEINDEX(fInputText); | 715 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 710 // c will be -1 (U_SENTINEL) at end of text, in which case we | 716 // c will be -1 (U_SENTINEL) at end of text, in which case we |
| 711 // skip this next block (so we don't have a negative array index
) | 717 // skip this next block (so we don't have a negative array index
) |
| 712 // and handle end of text in the following block. | 718 // and handle end of text in the following block. |
| 713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c))
|| | 719 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c))
|| |
| 714 (c>=256 && fPattern->fInitialChars->contains(c))))
{ | 720 (c>=256 && fPattern->fInitialChars->contains(c))))
{ |
| 715 MatchAt(startPos, FALSE, fDeferredStatus); | 721 MatchAt(pos, FALSE, status); |
| 716 if (U_FAILURE(fDeferredStatus)) { | 722 if (U_FAILURE(status)) { |
| 717 return FALSE; | 723 return FALSE; |
| 718 } | 724 } |
| 719 if (fMatch) { | 725 if (fMatch) { |
| 720 return TRUE; | 726 return TRUE; |
| 721 } | 727 } |
| 722 UTEXT_SETNATIVEINDEX(fInputText, pos); | 728 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 723 } | 729 } |
| 724 if (startPos >= testStartLimit) { | 730 if (startPos > testStartLimit) { |
| 725 fMatch = FALSE; | 731 fMatch = FALSE; |
| 726 fHitEnd = TRUE; | 732 fHitEnd = TRUE; |
| 727 return FALSE; | 733 return FALSE; |
| 728 } | 734 } |
| 729 startPos = pos; | 735 if (findProgressInterrupt(startPos, status)) |
| 730 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | |
| 731 return FALSE; | 736 return FALSE; |
| 732 } | 737 } |
| 733 } | 738 } |
| 734 U_ASSERT(FALSE); | 739 U_ASSERT(FALSE); |
| 735 | 740 |
| 736 case START_STRING: | 741 case START_STRING: |
| 737 case START_CHAR: | 742 case START_CHAR: |
| 738 { | 743 { |
| 739 // Match starts on exactly one char. | 744 // Match starts on exactly one char. |
| 740 U_ASSERT(fPattern->fMinMatchLen > 0); | 745 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 741 UChar32 theChar = fPattern->fInitialChar; | 746 UChar32 theChar = fPattern->fInitialChar; |
| 742 int64_t pos; | |
| 743 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 747 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 744 for (;;) { | 748 for (;;) { |
| 749 int64_t pos = startPos; |
| 745 c = UTEXT_NEXT32(fInputText); | 750 c = UTEXT_NEXT32(fInputText); |
| 746 pos = UTEXT_GETNATIVEINDEX(fInputText); | 751 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 747 if (c == theChar) { | 752 if (c == theChar) { |
| 748 MatchAt(startPos, FALSE, fDeferredStatus); | 753 MatchAt(pos, FALSE, status); |
| 749 if (U_FAILURE(fDeferredStatus)) { | 754 if (U_FAILURE(status)) { |
| 750 return FALSE; | 755 return FALSE; |
| 751 } | 756 } |
| 752 if (fMatch) { | 757 if (fMatch) { |
| 753 return TRUE; | 758 return TRUE; |
| 754 } | 759 } |
| 755 UTEXT_SETNATIVEINDEX(fInputText, pos); | 760 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 756 } | 761 } |
| 757 if (startPos >= testStartLimit) { | 762 if (startPos > testStartLimit) { |
| 758 fMatch = FALSE; | 763 fMatch = FALSE; |
| 759 fHitEnd = TRUE; | 764 fHitEnd = TRUE; |
| 760 return FALSE; | 765 return FALSE; |
| 761 } | 766 } |
| 762 startPos = pos; | 767 if (findProgressInterrupt(startPos, status)) |
| 763 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | |
| 764 return FALSE; | 768 return FALSE; |
| 765 } | 769 } |
| 766 } | 770 } |
| 767 U_ASSERT(FALSE); | 771 U_ASSERT(FALSE); |
| 768 | 772 |
| 769 case START_LINE: | 773 case START_LINE: |
| 770 { | 774 { |
| 771 UChar32 c; | 775 UChar32 c; |
| 772 if (startPos == fAnchorStart) { | 776 if (startPos == fAnchorStart) { |
| 773 MatchAt(startPos, FALSE, fDeferredStatus); | 777 MatchAt(startPos, FALSE, status); |
| 774 if (U_FAILURE(fDeferredStatus)) { | 778 if (U_FAILURE(status)) { |
| 775 return FALSE; | 779 return FALSE; |
| 776 } | 780 } |
| 777 if (fMatch) { | 781 if (fMatch) { |
| 778 return TRUE; | 782 return TRUE; |
| 779 } | 783 } |
| 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 784 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 781 c = UTEXT_NEXT32(fInputText); | 785 c = UTEXT_NEXT32(fInputText); |
| 782 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 786 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 783 } else { | 787 } else { |
| 784 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 788 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 785 c = UTEXT_PREVIOUS32(fInputText); | 789 c = UTEXT_PREVIOUS32(fInputText); |
| 786 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 790 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 787 } | 791 } |
| 788 | 792 |
| 789 if (fPattern->fFlags & UREGEX_UNIX_LINES) { | 793 if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| 790 for (;;) { | 794 for (;;) { |
| 791 if (c == 0x0a) { | 795 if (c == 0x0a) { |
| 792 MatchAt(startPos, FALSE, fDeferredStatus); | 796 MatchAt(startPos, FALSE, status); |
| 793 if (U_FAILURE(fDeferredStatus)) { | 797 if (U_FAILURE(status)) { |
| 794 return FALSE; | 798 return FALSE; |
| 795 } | 799 } |
| 796 if (fMatch) { | 800 if (fMatch) { |
| 797 return TRUE; | 801 return TRUE; |
| 798 } | 802 } |
| 799 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 803 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 800 } | 804 } |
| 801 if (startPos >= testStartLimit) { | 805 if (startPos >= testStartLimit) { |
| 802 fMatch = FALSE; | 806 fMatch = FALSE; |
| 803 fHitEnd = TRUE; | 807 fHitEnd = TRUE; |
| 804 return FALSE; | 808 return FALSE; |
| 805 } | 809 } |
| 806 c = UTEXT_NEXT32(fInputText); | 810 c = UTEXT_NEXT32(fInputText); |
| 807 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 811 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 808 // Note that it's perfectly OK for a pattern to have a zero-
length | 812 // Note that it's perfectly OK for a pattern to have a zero-
length |
| 809 // match at the end of a string, so we must make sure that
the loop | 813 // match at the end of a string, so we must make sure that
the loop |
| 810 // runs with startPos == testStartLimit the last time thro
ugh. | 814 // runs with startPos == testStartLimit the last time thro
ugh. |
| 811 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) | 815 if (findProgressInterrupt(startPos, status)) |
| 812 return FALSE; | 816 return FALSE; |
| 813 } | 817 } |
| 814 } else { | 818 } else { |
| 815 for (;;) { | 819 for (;;) { |
| 816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m
any chars as possible | 820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m
any chars as possible |
| 817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202
9 )) { | 821 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202
9 )) { |
| 818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU
RRENT32(fInputText) == 0x0a) { | 822 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU
RRENT32(fInputText) == 0x0a) { |
| 819 (void)UTEXT_NEXT32(fInputText); | 823 (void)UTEXT_NEXT32(fInputText); |
| 820 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 824 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 821 } | 825 } |
| 822 MatchAt(startPos, FALSE, fDeferredStatus); | 826 MatchAt(startPos, FALSE, status); |
| 823 if (U_FAILURE(fDeferredStatus)) { | 827 if (U_FAILURE(status)) { |
| 824 return FALSE; | 828 return FALSE; |
| 825 } | 829 } |
| 826 if (fMatch) { | 830 if (fMatch) { |
| 827 return TRUE; | 831 return TRUE; |
| 828 } | 832 } |
| 829 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 833 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| 830 } | 834 } |
| 831 if (startPos >= testStartLimit) { | 835 if (startPos >= testStartLimit) { |
| 832 fMatch = FALSE; | 836 fMatch = FALSE; |
| 833 fHitEnd = TRUE; | 837 fHitEnd = TRUE; |
| 834 return FALSE; | 838 return FALSE; |
| 835 } | 839 } |
| 836 c = UTEXT_NEXT32(fInputText); | 840 c = UTEXT_NEXT32(fInputText); |
| 837 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 841 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| 838 // Note that it's perfectly OK for a pattern to have a zero-
length | 842 // Note that it's perfectly OK for a pattern to have a zero-
length |
| 839 // match at the end of a string, so we must make sure that
the loop | 843 // match at the end of a string, so we must make sure that
the loop |
| 840 // runs with startPos == testStartLimit the last time thro
ugh. | 844 // runs with startPos == testStartLimit the last time thro
ugh. |
| 841 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) | 845 if (findProgressInterrupt(startPos, status)) |
| 842 return FALSE; | 846 return FALSE; |
| 843 } | 847 } |
| 844 } | 848 } |
| 845 } | 849 } |
| 846 | 850 |
| 847 default: | 851 default: |
| 848 U_ASSERT(FALSE); | 852 U_ASSERT(FALSE); |
| 849 } | 853 } |
| 850 | 854 |
| 851 U_ASSERT(FALSE); | 855 U_ASSERT(FALSE); |
| 852 return FALSE; | 856 return FALSE; |
| 853 } | 857 } |
| 854 | 858 |
| 855 | 859 |
| 856 | 860 |
| 857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { | 861 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { |
| 858 if (U_FAILURE(status)) { | 862 if (U_FAILURE(status)) { |
| 859 return FALSE; | 863 return FALSE; |
| 860 } | 864 } |
| 861 if (U_FAILURE(fDeferredStatus)) { | 865 if (U_FAILURE(fDeferredStatus)) { |
| 862 status = fDeferredStatus; | 866 status = fDeferredStatus; |
| 863 return FALSE; | 867 return FALSE; |
| 864 } | 868 } |
| 865 this->reset(); // Note: Reset() is specified by Java
Matcher documentation. | 869 this->reset(); // Note: Reset() is specified by Java
Matcher documentation. |
| 866 // This will reset the region t
o be the full input length. | 870 // This will reset the region t
o be the full input length. |
| 867 if (start < 0) { | 871 if (start < 0) { |
| 868 status = U_INDEX_OUTOFBOUNDS_ERROR; | 872 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 869 return FALSE; | 873 return FALSE; |
| 870 } | 874 } |
| 871 | 875 |
| 872 int64_t nativeStart = start; | 876 int64_t nativeStart = start; |
| 873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { | 877 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| 874 status = U_INDEX_OUTOFBOUNDS_ERROR; | 878 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 875 return FALSE; | 879 return FALSE; |
| 876 } | 880 } |
| 877 fMatchEnd = nativeStart; | 881 fMatchEnd = nativeStart; |
| 878 return find(); | 882 return find(status); |
| 879 } | 883 } |
| 880 | 884 |
| 881 | 885 |
| 882 //------------------------------------------------------------------------------
-- | 886 //------------------------------------------------------------------------------
-- |
| 883 // | 887 // |
| 884 // findUsingChunk() -- like find(), but with the advance knowledge that the | 888 // findUsingChunk() -- like find(), but with the advance knowledge that the |
| 885 // entire string is available in the UText's chunk buffer. | 889 // entire string is available in the UText's chunk buffer. |
| 886 // | 890 // |
| 887 //------------------------------------------------------------------------------
-- | 891 //------------------------------------------------------------------------------
-- |
| 888 UBool RegexMatcher::findUsingChunk() { | 892 UBool RegexMatcher::findUsingChunk(UErrorCode &status) { |
| 889 // Start at the position of the last match end. (Will be zero if the | 893 // Start at the position of the last match end. (Will be zero if the |
| 890 // matcher has been reset. | 894 // matcher has been reset. |
| 891 // | 895 // |
| 892 | 896 |
| 893 int32_t startPos = (int32_t)fMatchEnd; | 897 int32_t startPos = (int32_t)fMatchEnd; |
| 894 if (startPos==0) { | 898 if (startPos==0) { |
| 895 startPos = (int32_t)fActiveStart; | 899 startPos = (int32_t)fActiveStart; |
| 896 } | 900 } |
| 897 | 901 |
| 898 const UChar *inputBuf = fInputText->chunkContents; | 902 const UChar *inputBuf = fInputText->chunkContents; |
| 899 | 903 |
| 900 if (fMatch) { | 904 if (fMatch) { |
| 901 // Save the position of any previous successful match. | 905 // Save the position of any previous successful match. |
| 902 fLastMatchEnd = fMatchEnd; | 906 fLastMatchEnd = fMatchEnd; |
| 903 | 907 |
| 904 if (fMatchStart == fMatchEnd) { | 908 if (fMatchStart == fMatchEnd) { |
| 905 // Previous match had zero length. Move start position up one posit
ion | 909 // Previous match had zero length. Move start position up one posit
ion |
| 906 // to avoid sending find() into a loop on zero-length matches. | 910 // to avoid sending find() into a loop on zero-length matches. |
| 907 if (startPos >= fActiveLimit) { | 911 if (startPos >= fActiveLimit) { |
| 908 fMatch = FALSE; | 912 fMatch = FALSE; |
| 909 fHitEnd = TRUE; | 913 fHitEnd = TRUE; |
| 910 return FALSE; | 914 return FALSE; |
| 911 } | 915 } |
| 912 U16_FWD_1(inputBuf, startPos, fInputLength); | 916 U16_FWD_1(inputBuf, startPos, fInputLength); |
| 913 } | 917 } |
| 914 } else { | 918 } else { |
| 915 if (fLastMatchEnd >= 0) { | 919 if (fLastMatchEnd >= 0) { |
| 916 // A previous find() failed to match. Don't try again. | 920 // A previous find() failed to match. Don't try again. |
| 917 // (without this test, a pattern with a zero-length match | 921 // (without this test, a pattern with a zero-length match |
| 918 // could match again at the end of an input string.) | 922 // could match again at the end of an input string.) |
| 919 fHitEnd = TRUE; | 923 fHitEnd = TRUE; |
| 920 return FALSE; | 924 return FALSE; |
| 921 } | 925 } |
| 922 } | 926 } |
| 923 | 927 |
| 924 | 928 |
| 925 // Compute the position in the input string beyond which a match can not beg
in, because | 929 // Compute the position in the input string beyond which a match can not beg
in, because |
| 926 // the minimum length match would extend past the end of the input. | 930 // the minimum length match would extend past the end of the input. |
| 927 // Note: some patterns that cannot match anything will have fMinMatchLeng
th==Max Int. | 931 // Note: some patterns that cannot match anything will have fMinMatchLeng
th==Max Int. |
| 928 // Be aware of possible overflows if making changes here. | 932 // Be aware of possible overflows if making changes here. |
| 933 // Note: a match can begin at inputBuf + testLen; it is an inclusive limi
t. |
| 929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); | 934 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); |
| 930 if (startPos > testLen) { | 935 if (startPos > testLen) { |
| 931 fMatch = FALSE; | 936 fMatch = FALSE; |
| 932 fHitEnd = TRUE; | 937 fHitEnd = TRUE; |
| 933 return FALSE; | 938 return FALSE; |
| 934 } | 939 } |
| 935 | 940 |
| 936 UChar32 c; | 941 UChar32 c; |
| 937 U_ASSERT(startPos >= 0); | 942 U_ASSERT(startPos >= 0); |
| 938 | 943 |
| 939 switch (fPattern->fStartType) { | 944 switch (fPattern->fStartType) { |
| 940 case START_NO_INFO: | 945 case START_NO_INFO: |
| 941 // No optimization was found. | 946 // No optimization was found. |
| 942 // Try a match at each input position. | 947 // Try a match at each input position. |
| 943 for (;;) { | 948 for (;;) { |
| 944 MatchChunkAt(startPos, FALSE, fDeferredStatus); | 949 MatchChunkAt(startPos, FALSE, status); |
| 945 if (U_FAILURE(fDeferredStatus)) { | 950 if (U_FAILURE(status)) { |
| 946 return FALSE; | 951 return FALSE; |
| 947 } | 952 } |
| 948 if (fMatch) { | 953 if (fMatch) { |
| 949 return TRUE; | 954 return TRUE; |
| 950 } | 955 } |
| 951 if (startPos >= testLen) { | 956 if (startPos >= testLen) { |
| 952 fHitEnd = TRUE; | 957 fHitEnd = TRUE; |
| 953 return FALSE; | 958 return FALSE; |
| 954 } | 959 } |
| 955 U16_FWD_1(inputBuf, startPos, fActiveLimit); | 960 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 956 // Note that it's perfectly OK for a pattern to have a zero-length | 961 // Note that it's perfectly OK for a pattern to have a zero-length |
| 957 // match at the end of a string, so we must make sure that the loo
p | 962 // match at the end of a string, so we must make sure that the loo
p |
| 958 // runs with startPos == testLen the last time through. | 963 // runs with startPos == testLen the last time through. |
| 959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 964 if (findProgressInterrupt(startPos, status)) |
| 960 return FALSE; | 965 return FALSE; |
| 961 } | 966 } |
| 962 U_ASSERT(FALSE); | 967 U_ASSERT(FALSE); |
| 963 | 968 |
| 964 case START_START: | 969 case START_START: |
| 965 // Matches are only possible at the start of the input string | 970 // Matches are only possible at the start of the input string |
| 966 // (pattern begins with ^ or \A) | 971 // (pattern begins with ^ or \A) |
| 967 if (startPos > fActiveStart) { | 972 if (startPos > fActiveStart) { |
| 968 fMatch = FALSE; | 973 fMatch = FALSE; |
| 969 return FALSE; | 974 return FALSE; |
| 970 } | 975 } |
| 971 MatchChunkAt(startPos, FALSE, fDeferredStatus); | 976 MatchChunkAt(startPos, FALSE, status); |
| 972 if (U_FAILURE(fDeferredStatus)) { | 977 if (U_FAILURE(status)) { |
| 973 return FALSE; | 978 return FALSE; |
| 974 } | 979 } |
| 975 return fMatch; | 980 return fMatch; |
| 976 | 981 |
| 977 | 982 |
| 978 case START_SET: | 983 case START_SET: |
| 979 { | 984 { |
| 980 // Match may start on any char from a pre-computed set. | 985 // Match may start on any char from a pre-computed set. |
| 981 U_ASSERT(fPattern->fMinMatchLen > 0); | 986 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 982 for (;;) { | 987 for (;;) { |
| 983 int32_t pos = startPos; | 988 int32_t pos = startPos; |
| 984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; | 989 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; |
| 985 if ((c<256 && fPattern->fInitialChars8->contains(c)) || | 990 if ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| 986 (c>=256 && fPattern->fInitialChars->contains(c))) { | 991 (c>=256 && fPattern->fInitialChars->contains(c))) { |
| 987 MatchChunkAt(pos, FALSE, fDeferredStatus); | 992 MatchChunkAt(pos, FALSE, status); |
| 988 if (U_FAILURE(fDeferredStatus)) { | 993 if (U_FAILURE(status)) { |
| 989 return FALSE; | 994 return FALSE; |
| 990 } | 995 } |
| 991 if (fMatch) { | 996 if (fMatch) { |
| 992 return TRUE; | 997 return TRUE; |
| 993 } | 998 } |
| 994 } | 999 } |
| 995 if (pos >= testLen) { | 1000 if (startPos > testLen) { |
| 996 fMatch = FALSE; | 1001 fMatch = FALSE; |
| 997 fHitEnd = TRUE; | 1002 fHitEnd = TRUE; |
| 998 return FALSE; | 1003 return FALSE; |
| 999 } | 1004 } |
| 1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 1005 if (findProgressInterrupt(startPos, status)) |
| 1001 return FALSE; | 1006 return FALSE; |
| 1002 } | 1007 } |
| 1003 } | 1008 } |
| 1004 U_ASSERT(FALSE); | 1009 U_ASSERT(FALSE); |
| 1005 | 1010 |
| 1006 case START_STRING: | 1011 case START_STRING: |
| 1007 case START_CHAR: | 1012 case START_CHAR: |
| 1008 { | 1013 { |
| 1009 // Match starts on exactly one char. | 1014 // Match starts on exactly one char. |
| 1010 U_ASSERT(fPattern->fMinMatchLen > 0); | 1015 U_ASSERT(fPattern->fMinMatchLen > 0); |
| 1011 UChar32 theChar = fPattern->fInitialChar; | 1016 UChar32 theChar = fPattern->fInitialChar; |
| 1012 for (;;) { | 1017 for (;;) { |
| 1013 int32_t pos = startPos; | 1018 int32_t pos = startPos; |
| 1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; | 1019 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf
[startPos++]; |
| 1015 if (c == theChar) { | 1020 if (c == theChar) { |
| 1016 MatchChunkAt(pos, FALSE, fDeferredStatus); | 1021 MatchChunkAt(pos, FALSE, status); |
| 1017 if (U_FAILURE(fDeferredStatus)) { | 1022 if (U_FAILURE(status)) { |
| 1018 return FALSE; | 1023 return FALSE; |
| 1019 } | 1024 } |
| 1020 if (fMatch) { | 1025 if (fMatch) { |
| 1021 return TRUE; | 1026 return TRUE; |
| 1022 } | 1027 } |
| 1023 } | 1028 } |
| 1024 if (pos >= testLen) { | 1029 if (startPos > testLen) { |
| 1025 fMatch = FALSE; | 1030 fMatch = FALSE; |
| 1026 fHitEnd = TRUE; | 1031 fHitEnd = TRUE; |
| 1027 return FALSE; | 1032 return FALSE; |
| 1028 } | 1033 } |
| 1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 1034 if (findProgressInterrupt(startPos, status)) |
| 1030 return FALSE; | 1035 return FALSE; |
| 1031 } | 1036 } |
| 1032 } | 1037 } |
| 1033 U_ASSERT(FALSE); | 1038 U_ASSERT(FALSE); |
| 1034 | 1039 |
| 1035 case START_LINE: | 1040 case START_LINE: |
| 1036 { | 1041 { |
| 1037 UChar32 c; | 1042 UChar32 c; |
| 1038 if (startPos == fAnchorStart) { | 1043 if (startPos == fAnchorStart) { |
| 1039 MatchChunkAt(startPos, FALSE, fDeferredStatus); | 1044 MatchChunkAt(startPos, FALSE, status); |
| 1040 if (U_FAILURE(fDeferredStatus)) { | 1045 if (U_FAILURE(status)) { |
| 1041 return FALSE; | 1046 return FALSE; |
| 1042 } | 1047 } |
| 1043 if (fMatch) { | 1048 if (fMatch) { |
| 1044 return TRUE; | 1049 return TRUE; |
| 1045 } | 1050 } |
| 1046 U16_FWD_1(inputBuf, startPos, fActiveLimit); | 1051 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1047 } | 1052 } |
| 1048 | 1053 |
| 1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) { | 1054 if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| 1050 for (;;) { | 1055 for (;;) { |
| 1051 c = inputBuf[startPos-1]; | 1056 c = inputBuf[startPos-1]; |
| 1052 if (c == 0x0a) { | 1057 if (c == 0x0a) { |
| 1053 MatchChunkAt(startPos, FALSE, fDeferredStatus); | 1058 MatchChunkAt(startPos, FALSE, status); |
| 1054 if (U_FAILURE(fDeferredStatus)) { | 1059 if (U_FAILURE(status)) { |
| 1055 return FALSE; | 1060 return FALSE; |
| 1056 } | 1061 } |
| 1057 if (fMatch) { | 1062 if (fMatch) { |
| 1058 return TRUE; | 1063 return TRUE; |
| 1059 } | 1064 } |
| 1060 } | 1065 } |
| 1061 if (startPos >= testLen) { | 1066 if (startPos >= testLen) { |
| 1062 fMatch = FALSE; | 1067 fMatch = FALSE; |
| 1063 fHitEnd = TRUE; | 1068 fHitEnd = TRUE; |
| 1064 return FALSE; | 1069 return FALSE; |
| 1065 } | 1070 } |
| 1066 U16_FWD_1(inputBuf, startPos, fActiveLimit); | 1071 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1067 // Note that it's perfectly OK for a pattern to have a zero-leng
th | 1072 // Note that it's perfectly OK for a pattern to have a zero-leng
th |
| 1068 // match at the end of a string, so we must make sure that the
loop | 1073 // match at the end of a string, so we must make sure that the
loop |
| 1069 // runs with startPos == testLen the last time through. | 1074 // runs with startPos == testLen the last time through. |
| 1070 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 1075 if (findProgressInterrupt(startPos, status)) |
| 1071 return FALSE; | 1076 return FALSE; |
| 1072 } | 1077 } |
| 1073 } else { | 1078 } else { |
| 1074 for (;;) { | 1079 for (;;) { |
| 1075 c = inputBuf[startPos-1]; | 1080 c = inputBuf[startPos-1]; |
| 1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible | 1081 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 ))
{ | 1082 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 ))
{ |
| 1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo
s] == 0x0a) { | 1083 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo
s] == 0x0a) { |
| 1079 startPos++; | 1084 startPos++; |
| 1080 } | 1085 } |
| 1081 MatchChunkAt(startPos, FALSE, fDeferredStatus); | 1086 MatchChunkAt(startPos, FALSE, status); |
| 1082 if (U_FAILURE(fDeferredStatus)) { | 1087 if (U_FAILURE(status)) { |
| 1083 return FALSE; | 1088 return FALSE; |
| 1084 } | 1089 } |
| 1085 if (fMatch) { | 1090 if (fMatch) { |
| 1086 return TRUE; | 1091 return TRUE; |
| 1087 } | 1092 } |
| 1088 } | 1093 } |
| 1089 if (startPos >= testLen) { | 1094 if (startPos >= testLen) { |
| 1090 fMatch = FALSE; | 1095 fMatch = FALSE; |
| 1091 fHitEnd = TRUE; | 1096 fHitEnd = TRUE; |
| 1092 return FALSE; | 1097 return FALSE; |
| 1093 } | 1098 } |
| 1094 U16_FWD_1(inputBuf, startPos, fActiveLimit); | 1099 U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| 1095 // Note that it's perfectly OK for a pattern to have a zero-leng
th | 1100 // Note that it's perfectly OK for a pattern to have a zero-leng
th |
| 1096 // match at the end of a string, so we must make sure that the
loop | 1101 // match at the end of a string, so we must make sure that the
loop |
| 1097 // runs with startPos == testLen the last time through. | 1102 // runs with startPos == testLen the last time through. |
| 1098 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 1103 if (findProgressInterrupt(startPos, status)) |
| 1099 return FALSE; | 1104 return FALSE; |
| 1100 } | 1105 } |
| 1101 } | 1106 } |
| 1102 } | 1107 } |
| 1103 | 1108 |
| 1104 default: | 1109 default: |
| 1105 U_ASSERT(FALSE); | 1110 U_ASSERT(FALSE); |
| 1106 } | 1111 } |
| 1107 | 1112 |
| 1108 U_ASSERT(FALSE); | 1113 U_ASSERT(FALSE); |
| 1109 return FALSE; | 1114 return FALSE; |
| 1110 } | 1115 } |
| 1111 | 1116 |
| 1112 | 1117 |
| 1113 | 1118 |
| 1114 //------------------------------------------------------------------------------
-- | 1119 //------------------------------------------------------------------------------
-- |
| 1115 // | 1120 // |
| 1116 // group() | 1121 // group() |
| 1117 // | 1122 // |
| 1118 //------------------------------------------------------------------------------
-- | 1123 //------------------------------------------------------------------------------
-- |
| 1119 UnicodeString RegexMatcher::group(UErrorCode &status) const { | 1124 UnicodeString RegexMatcher::group(UErrorCode &status) const { |
| 1120 return group(0, status); | 1125 return group(0, status); |
| 1121 } | 1126 } |
| 1122 | 1127 |
| 1123 // Return immutable shallow clone | 1128 // Return immutable shallow clone |
| 1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status)
const { | 1129 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status)
const { |
| 1125 return group(0, dest, group_len, status); | 1130 return group(0, dest, group_len, status); |
| 1126 } | 1131 } |
| 1127 | 1132 |
| 1128 // Return immutable shallow clone | 1133 // Return immutable shallow clone |
| 1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
rrorCode &status) const { | 1134 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
rrorCode &status) const { |
| 1130 group_len = 0; | 1135 group_len = 0; |
| 1131 UBool bailOut = FALSE; | |
| 1132 if (U_FAILURE(status)) { | 1136 if (U_FAILURE(status)) { |
| 1133 return dest; | 1137 return dest; |
| 1134 } | 1138 } |
| 1135 if (U_FAILURE(fDeferredStatus)) { | 1139 if (U_FAILURE(fDeferredStatus)) { |
| 1136 status = fDeferredStatus; | 1140 status = fDeferredStatus; |
| 1137 bailOut = TRUE; | 1141 } else if (fMatch == FALSE) { |
| 1142 status = U_REGEX_INVALID_STATE; |
| 1143 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1144 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1138 } | 1145 } |
| 1139 if (fMatch == FALSE) { | 1146 |
| 1140 status = U_REGEX_INVALID_STATE; | 1147 if (U_FAILURE(status)) { |
| 1141 bailOut = TRUE; | 1148 return dest; |
| 1142 } | 1149 } |
| 1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1150 |
| 1144 status = U_INDEX_OUTOFBOUNDS_ERROR; | |
| 1145 bailOut = TRUE; | |
| 1146 } | |
| 1147 | |
| 1148 if (bailOut) { | |
| 1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); | |
| 1150 } | |
| 1151 | |
| 1152 int64_t s, e; | 1151 int64_t s, e; |
| 1153 if (groupNum == 0) { | 1152 if (groupNum == 0) { |
| 1154 s = fMatchStart; | 1153 s = fMatchStart; |
| 1155 e = fMatchEnd; | 1154 e = fMatchEnd; |
| 1156 } else { | 1155 } else { |
| 1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1156 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1158 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1157 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1159 U_ASSERT(groupOffset >= 0); | 1158 U_ASSERT(groupOffset >= 0); |
| 1160 s = fFrame->fExtra[groupOffset]; | 1159 s = fFrame->fExtra[groupOffset]; |
| 1161 e = fFrame->fExtra[groupOffset+1]; | 1160 e = fFrame->fExtra[groupOffset+1]; |
| 1162 } | 1161 } |
| 1163 | 1162 |
| 1164 if (s < 0) { | 1163 if (s < 0) { |
| 1165 // A capture group wasn't part of the match | 1164 // A capture group wasn't part of the match |
| 1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status); | 1165 return utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| 1167 } | 1166 } |
| 1168 U_ASSERT(s <= e); | 1167 U_ASSERT(s <= e); |
| 1169 group_len = e - s; | 1168 group_len = e - s; |
| 1170 | 1169 |
| 1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); | 1170 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| 1172 if (dest) | 1171 if (dest) |
| 1173 UTEXT_SETNATIVEINDEX(dest, s); | 1172 UTEXT_SETNATIVEINDEX(dest, s); |
| 1174 return dest; | 1173 return dest; |
| 1175 } | 1174 } |
| 1176 | 1175 |
| 1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { | 1176 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
| 1178 UnicodeString result; | 1177 UnicodeString result; |
| 1179 if (U_FAILURE(status)) { | 1178 if (U_FAILURE(status)) { |
| 1180 return result; | 1179 return result; |
| 1181 } | 1180 } |
| 1182 UText resultText = UTEXT_INITIALIZER; | 1181 UText resultText = UTEXT_INITIALIZER; |
| 1183 utext_openUnicodeString(&resultText, &result, &status); | 1182 utext_openUnicodeString(&resultText, &result, &status); |
| 1184 group(groupNum, &resultText, status); | 1183 group(groupNum, &resultText, status); |
| 1185 utext_close(&resultText); | 1184 utext_close(&resultText); |
| 1186 return result; | 1185 return result; |
| 1187 } | 1186 } |
| 1188 | 1187 |
| 1189 | 1188 |
| 1190 // Return deep (mutable) clone | 1189 // Return deep (mutable) clone |
| 1191 //» » Technology Preview (as an API), but note that the UnicodeString
API is implemented | 1190 // Technology Preview (as an API), but note that the UnicodeString API is i
mplemented |
| 1192 //» » using this function. | 1191 // using this function. |
| 1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co
nst { | 1192 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co
nst { |
| 1194 UBool bailOut = FALSE; | |
| 1195 if (U_FAILURE(status)) { | 1193 if (U_FAILURE(status)) { |
| 1196 return dest; | 1194 return dest; |
| 1197 } | 1195 } |
| 1196 |
| 1198 if (U_FAILURE(fDeferredStatus)) { | 1197 if (U_FAILURE(fDeferredStatus)) { |
| 1199 status = fDeferredStatus; | 1198 status = fDeferredStatus; |
| 1200 bailOut = TRUE; | 1199 } else if (fMatch == FALSE) { |
| 1200 status = U_REGEX_INVALID_STATE; |
| 1201 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1201 } | 1203 } |
| 1202 | 1204 if (U_FAILURE(status)) { |
| 1203 if (fMatch == FALSE) { | 1205 return dest; |
| 1204 status = U_REGEX_INVALID_STATE; | |
| 1205 bailOut = TRUE; | |
| 1206 } | 1206 } |
| 1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1207 |
| 1208 status = U_INDEX_OUTOFBOUNDS_ERROR; | |
| 1209 bailOut = TRUE; | |
| 1210 } | |
| 1211 | |
| 1212 if (bailOut) { | |
| 1213 if (dest) { | |
| 1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | |
| 1215 return dest; | |
| 1216 } else { | |
| 1217 return utext_openUChars(NULL, NULL, 0, &status); | |
| 1218 } | |
| 1219 } | |
| 1220 | |
| 1221 int64_t s, e; | 1208 int64_t s, e; |
| 1222 if (groupNum == 0) { | 1209 if (groupNum == 0) { |
| 1223 s = fMatchStart; | 1210 s = fMatchStart; |
| 1224 e = fMatchEnd; | 1211 e = fMatchEnd; |
| 1225 } else { | 1212 } else { |
| 1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1213 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1227 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1214 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1228 U_ASSERT(groupOffset >= 0); | 1215 U_ASSERT(groupOffset >= 0); |
| 1229 s = fFrame->fExtra[groupOffset]; | 1216 s = fFrame->fExtra[groupOffset]; |
| 1230 e = fFrame->fExtra[groupOffset+1]; | 1217 e = fFrame->fExtra[groupOffset+1]; |
| 1231 } | 1218 } |
| 1232 | 1219 |
| 1233 if (s < 0) { | 1220 if (s < 0) { |
| 1234 // A capture group wasn't part of the match | 1221 // A capture group wasn't part of the match |
| 1235 if (dest) { | 1222 if (dest) { |
| 1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | 1223 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
| 1237 return dest; | 1224 return dest; |
| 1238 } else { | 1225 } else { |
| 1239 return utext_openUChars(NULL, NULL, 0, &status); | 1226 return utext_openUChars(NULL, NULL, 0, &status); |
| 1240 } | 1227 } |
| 1241 } | 1228 } |
| 1242 U_ASSERT(s <= e); | 1229 U_ASSERT(s <= e); |
| 1243 | 1230 |
| 1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1231 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1245 U_ASSERT(e <= fInputLength); | 1232 U_ASSERT(e <= fInputLength); |
| 1246 if (dest) { | 1233 if (dest) { |
| 1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents+s, (int32_t)(e-s), &status); | 1234 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents+s, (int32_t)(e-s), &status); |
| 1248 } else { | 1235 } else { |
| 1249 UText groupText = UTEXT_INITIALIZER; | 1236 UText groupText = UTEXT_INITIALIZER; |
| 1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat
us); | 1237 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat
us); |
| 1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); | 1238 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
| 1252 utext_close(&groupText); | 1239 utext_close(&groupText); |
| 1253 } | 1240 } |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status); | 1254 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| 1268 | 1255 |
| 1269 if (dest) { | 1256 if (dest) { |
| 1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16,
&status); | 1257 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16,
&status); |
| 1271 } else { | 1258 } else { |
| 1272 UText groupText = UTEXT_INITIALIZER; | 1259 UText groupText = UTEXT_INITIALIZER; |
| 1273 utext_openUChars(&groupText, groupChars, len16, &status); | 1260 utext_openUChars(&groupText, groupChars, len16, &status); |
| 1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); | 1261 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
| 1275 utext_close(&groupText); | 1262 utext_close(&groupText); |
| 1276 } | 1263 } |
| 1277 | 1264 |
| 1278 uprv_free(groupChars); | 1265 uprv_free(groupChars); |
| 1279 } | 1266 } |
| 1280 return dest; | 1267 return dest; |
| 1281 } | 1268 } |
| 1282 | 1269 |
| 1283 //------------------------------------------------------------------------------
-- | 1270 //------------------------------------------------------------------------------
-- |
| 1284 // | 1271 // |
| 1285 // appendGroup() -- currently internal only, appends a group to a UText rather | 1272 // appendGroup() -- currently internal only, appends a group to a UText rather |
| 1286 // than replacing its contents | 1273 // than replacing its contents |
| 1287 // | 1274 // |
| 1288 //------------------------------------------------------------------------------
-- | 1275 //------------------------------------------------------------------------------
-- |
| 1289 | 1276 |
| 1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta
tus) const { | 1277 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta
tus) const { |
| 1291 if (U_FAILURE(status)) { | 1278 if (U_FAILURE(status)) { |
| 1292 return 0; | 1279 return 0; |
| 1293 } | 1280 } |
| 1294 if (U_FAILURE(fDeferredStatus)) { | 1281 if (U_FAILURE(fDeferredStatus)) { |
| 1295 status = fDeferredStatus; | 1282 status = fDeferredStatus; |
| 1296 return 0; | 1283 return 0; |
| 1297 } | 1284 } |
| 1298 int64_t destLen = utext_nativeLength(dest); | 1285 int64_t destLen = utext_nativeLength(dest); |
| 1299 | 1286 |
| 1300 if (fMatch == FALSE) { | 1287 if (fMatch == FALSE) { |
| 1301 status = U_REGEX_INVALID_STATE; | 1288 status = U_REGEX_INVALID_STATE; |
| 1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1289 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1303 } | 1290 } |
| 1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1291 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1305 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1292 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1293 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1307 } | 1294 } |
| 1308 | 1295 |
| 1309 int64_t s, e; | 1296 int64_t s, e; |
| 1310 if (groupNum == 0) { | 1297 if (groupNum == 0) { |
| 1311 s = fMatchStart; | 1298 s = fMatchStart; |
| 1312 e = fMatchEnd; | 1299 e = fMatchEnd; |
| 1313 } else { | 1300 } else { |
| 1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1301 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| 1315 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1302 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 1316 U_ASSERT(groupOffset >= 0); | 1303 U_ASSERT(groupOffset >= 0); |
| 1317 s = fFrame->fExtra[groupOffset]; | 1304 s = fFrame->fExtra[groupOffset]; |
| 1318 e = fFrame->fExtra[groupOffset+1]; | 1305 e = fFrame->fExtra[groupOffset+1]; |
| 1319 } | 1306 } |
| 1320 | 1307 |
| 1321 if (s < 0) { | 1308 if (s < 0) { |
| 1322 // A capture group wasn't part of the match | 1309 // A capture group wasn't part of the match |
| 1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1310 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| 1324 } | 1311 } |
| 1325 U_ASSERT(s <= e); | 1312 U_ASSERT(s <= e); |
| 1326 | 1313 |
| 1327 int64_t deltaLen; | 1314 int64_t deltaLen; |
| 1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1315 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1329 U_ASSERT(e <= fInputLength); | 1316 U_ASSERT(e <= fInputLength); |
| 1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten
ts+s, (int32_t)(e-s), &status); | 1317 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten
ts+s, (int32_t)(e-s), &status); |
| 1331 } else { | 1318 } else { |
| 1332 int32_t len16; | 1319 int32_t len16; |
| 1333 if (UTEXT_USES_U16(fInputText)) { | 1320 if (UTEXT_USES_U16(fInputText)) { |
| 1334 len16 = (int32_t)(e-s); | 1321 len16 = (int32_t)(e-s); |
| 1335 } else { | 1322 } else { |
| 1336 UErrorCode lengthStatus = U_ZERO_ERROR; | 1323 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); | 1324 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
| 1338 } | 1325 } |
| 1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); | 1326 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| 1340 if (groupChars == NULL) { | 1327 if (groupChars == NULL) { |
| 1341 status = U_MEMORY_ALLOCATION_ERROR; | 1328 status = U_MEMORY_ALLOCATION_ERROR; |
| 1342 return 0; | 1329 return 0; |
| 1343 } | 1330 } |
| 1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status); | 1331 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| 1345 | 1332 |
| 1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta
tus); | 1333 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta
tus); |
| 1347 uprv_free(groupChars); | 1334 uprv_free(groupChars); |
| 1348 } | 1335 } |
| 1349 return deltaLen; | 1336 return deltaLen; |
| 1350 } | 1337 } |
| 1351 | 1338 |
| 1352 | 1339 |
| 1353 | 1340 |
| 1354 //------------------------------------------------------------------------------
-- | 1341 //------------------------------------------------------------------------------
-- |
| 1355 // | 1342 // |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1402 if (!fInput) { | 1389 if (!fInput) { |
| 1403 UErrorCode status = U_ZERO_ERROR; | 1390 UErrorCode status = U_ZERO_ERROR; |
| 1404 int32_t len16; | 1391 int32_t len16; |
| 1405 if (UTEXT_USES_U16(fInputText)) { | 1392 if (UTEXT_USES_U16(fInputText)) { |
| 1406 len16 = (int32_t)fInputLength; | 1393 len16 = (int32_t)fInputLength; |
| 1407 } else { | 1394 } else { |
| 1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status)
; | 1395 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status)
; |
| 1409 status = U_ZERO_ERROR; // overflow, length status | 1396 status = U_ZERO_ERROR; // overflow, length status |
| 1410 } | 1397 } |
| 1411 UnicodeString *result = new UnicodeString(len16, 0, 0); | 1398 UnicodeString *result = new UnicodeString(len16, 0, 0); |
| 1412 | 1399 |
| 1413 UChar *inputChars = result->getBuffer(len16); | 1400 UChar *inputChars = result->getBuffer(len16); |
| 1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status);
// unterminated warning | 1401 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status);
// unterminated warning |
| 1415 result->releaseBuffer(len16); | 1402 result->releaseBuffer(len16); |
| 1416 | 1403 |
| 1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath
er than operator= | 1404 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath
er than operator= |
| 1418 } | 1405 } |
| 1419 | 1406 |
| 1420 return *fInput; | 1407 return *fInput; |
| 1421 } | 1408 } |
| 1422 | 1409 |
| 1423 //------------------------------------------------------------------------------
-- | 1410 //------------------------------------------------------------------------------
-- |
| 1424 // | 1411 // |
| 1425 // inputText() | 1412 // inputText() |
| 1426 // | 1413 // |
| 1427 //------------------------------------------------------------------------------
-- | 1414 //------------------------------------------------------------------------------
-- |
| 1428 UText *RegexMatcher::inputText() const { | 1415 UText *RegexMatcher::inputText() const { |
| 1429 return fInputText; | 1416 return fInputText; |
| 1430 } | 1417 } |
| 1431 | 1418 |
| 1432 | 1419 |
| 1433 //------------------------------------------------------------------------------
-- | 1420 //------------------------------------------------------------------------------
-- |
| 1434 // | 1421 // |
| 1435 // getInput() -- like inputText(), but makes a clone or copies into another UTe
xt | 1422 // getInput() -- like inputText(), but makes a clone or copies into another UTe
xt |
| 1436 // | 1423 // |
| 1437 //------------------------------------------------------------------------------
-- | 1424 //------------------------------------------------------------------------------
-- |
| 1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { | 1425 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { |
| 1439 UBool bailOut = FALSE; | |
| 1440 if (U_FAILURE(status)) { | 1426 if (U_FAILURE(status)) { |
| 1441 return dest; | 1427 return dest; |
| 1442 } | 1428 } |
| 1443 if (U_FAILURE(fDeferredStatus)) { | 1429 if (U_FAILURE(fDeferredStatus)) { |
| 1444 status = fDeferredStatus; | 1430 status = fDeferredStatus; |
| 1445 bailOut = TRUE; | 1431 return dest; |
| 1446 } | 1432 } |
| 1447 | 1433 |
| 1448 if (bailOut) { | |
| 1449 if (dest) { | |
| 1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | |
| 1451 return dest; | |
| 1452 } else { | |
| 1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); | |
| 1454 } | |
| 1455 } | |
| 1456 | |
| 1457 if (dest) { | 1434 if (dest) { |
| 1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1435 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents, (int32_t)fInputLength, &status); | 1436 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents, (int32_t)fInputLength, &status); |
| 1460 } else { | 1437 } else { |
| 1461 int32_t input16Len; | 1438 int32_t input16Len; |
| 1462 if (UTEXT_USES_U16(fInputText)) { | 1439 if (UTEXT_USES_U16(fInputText)) { |
| 1463 input16Len = (int32_t)fInputLength; | 1440 input16Len = (int32_t)fInputLength; |
| 1464 } else { | 1441 } else { |
| 1465 UErrorCode lengthStatus = U_ZERO_ERROR; | 1442 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0,
&lengthStatus); // buffer overflow error | 1443 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0,
&lengthStatus); // buffer overflow error |
| 1467 } | 1444 } |
| 1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len))
; | 1445 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len))
; |
| 1469 if (inputChars == NULL) { | 1446 if (inputChars == NULL) { |
| 1470 return dest; | 1447 return dest; |
| 1471 } | 1448 } |
| 1472 | 1449 |
| 1473 status = U_ZERO_ERROR; | 1450 status = U_ZERO_ERROR; |
| 1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &
status); // not terminated warning | 1451 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &
status); // not terminated warning |
| 1475 status = U_ZERO_ERROR; | 1452 status = U_ZERO_ERROR; |
| 1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16
Len, &status); | 1453 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16
Len, &status); |
| 1477 | 1454 |
| 1478 uprv_free(inputChars); | 1455 uprv_free(inputChars); |
| 1479 } | 1456 } |
| 1480 return dest; | 1457 return dest; |
| 1481 } else { | 1458 } else { |
| 1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); | 1459 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
| 1483 } | 1460 } |
| 1484 } | 1461 } |
| 1485 | 1462 |
| 1486 | 1463 |
| 1487 static UBool compat_SyncMutableUTextContents(UText *ut); | 1464 static UBool compat_SyncMutableUTextContents(UText *ut); |
| 1488 static UBool compat_SyncMutableUTextContents(UText *ut) { | 1465 static UBool compat_SyncMutableUTextContents(UText *ut) { |
| 1489 UBool retVal = FALSE; | 1466 UBool retVal = FALSE; |
| 1490 | 1467 |
| 1491 // In the following test, we're really only interested in whether the UText
should switch | 1468 // In the following test, we're really only interested in whether the UText
should switch |
| 1492 // between heap and stack allocation. If length hasn't changed, we won't,
so the chunkContents | 1469 // between heap and stack allocation. If length hasn't changed, we won't,
so the chunkContents |
| 1493 // will still point to the correct data. | 1470 // will still point to the correct data. |
| 1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { | 1471 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { |
| 1495 UnicodeString *us=(UnicodeString *)ut->context; | 1472 UnicodeString *us=(UnicodeString *)ut->context; |
| 1496 | 1473 |
| 1497 // Update to the latest length. | 1474 // Update to the latest length. |
| 1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). | 1475 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). |
| 1499 int32_t newLength = us->length(); | 1476 int32_t newLength = us->length(); |
| 1500 | 1477 |
| 1501 // Update the chunk description. | 1478 // Update the chunk description. |
| 1502 // The buffer may have switched between stack- and heap-based. | 1479 // The buffer may have switched between stack- and heap-based. |
| 1503 ut->chunkContents = us->getBuffer(); | 1480 ut->chunkContents = us->getBuffer(); |
| 1504 ut->chunkLength = newLength; | 1481 ut->chunkLength = newLength; |
| 1505 ut->chunkNativeLimit = newLength; | 1482 ut->chunkNativeLimit = newLength; |
| 1506 ut->nativeIndexingLimit = newLength; | 1483 ut->nativeIndexingLimit = newLength; |
| 1507 retVal = TRUE; | 1484 retVal = TRUE; |
| 1508 } | 1485 } |
| 1509 | 1486 |
| 1510 return retVal; | 1487 return retVal; |
| 1511 } | 1488 } |
| 1512 | 1489 |
| 1513 //------------------------------------------------------------------------------
-- | 1490 //------------------------------------------------------------------------------
-- |
| 1514 // | 1491 // |
| 1515 // lookingAt() | 1492 // lookingAt() |
| 1516 // | 1493 // |
| 1517 //------------------------------------------------------------------------------
-- | 1494 //------------------------------------------------------------------------------
-- |
| 1518 UBool RegexMatcher::lookingAt(UErrorCode &status) { | 1495 UBool RegexMatcher::lookingAt(UErrorCode &status) { |
| 1519 if (U_FAILURE(status)) { | 1496 if (U_FAILURE(status)) { |
| 1520 return FALSE; | 1497 return FALSE; |
| 1521 } | 1498 } |
| 1522 if (U_FAILURE(fDeferredStatus)) { | 1499 if (U_FAILURE(fDeferredStatus)) { |
| 1523 status = fDeferredStatus; | 1500 status = fDeferredStatus; |
| 1524 return FALSE; | 1501 return FALSE; |
| 1525 } | 1502 } |
| 1526 | 1503 |
| 1527 if (fInputUniStrMaybeMutable) { | 1504 if (fInputUniStrMaybeMutable) { |
| 1528 if (compat_SyncMutableUTextContents(fInputText)) { | 1505 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1529 fInputLength = utext_nativeLength(fInputText); | 1506 fInputLength = utext_nativeLength(fInputText); |
| 1530 reset(); | 1507 reset(); |
| 1531 } | 1508 } |
| 1532 } | 1509 } |
| 1533 else { | 1510 else { |
| 1534 resetPreserveRegion(); | 1511 resetPreserveRegion(); |
| 1535 } | 1512 } |
| 1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1513 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status); | 1514 MatchChunkAt((int32_t)fActiveStart, FALSE, status); |
| 1538 } else { | 1515 } else { |
| 1539 MatchAt(fActiveStart, FALSE, status); | 1516 MatchAt(fActiveStart, FALSE, status); |
| 1540 } | 1517 } |
| 1541 return fMatch; | 1518 return fMatch; |
| 1542 } | 1519 } |
| 1543 | 1520 |
| 1544 | 1521 |
| 1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { | 1522 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { |
| 1546 if (U_FAILURE(status)) { | 1523 if (U_FAILURE(status)) { |
| 1547 return FALSE; | 1524 return FALSE; |
| 1548 } | 1525 } |
| 1549 if (U_FAILURE(fDeferredStatus)) { | 1526 if (U_FAILURE(fDeferredStatus)) { |
| 1550 status = fDeferredStatus; | 1527 status = fDeferredStatus; |
| 1551 return FALSE; | 1528 return FALSE; |
| 1552 } | 1529 } |
| 1553 reset(); | 1530 reset(); |
| 1554 | 1531 |
| 1555 if (start < 0) { | 1532 if (start < 0) { |
| 1556 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1533 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1557 return FALSE; | 1534 return FALSE; |
| 1558 } | 1535 } |
| 1559 | 1536 |
| 1560 if (fInputUniStrMaybeMutable) { | 1537 if (fInputUniStrMaybeMutable) { |
| 1561 if (compat_SyncMutableUTextContents(fInputText)) { | 1538 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1562 fInputLength = utext_nativeLength(fInputText); | 1539 fInputLength = utext_nativeLength(fInputText); |
| 1563 reset(); | 1540 reset(); |
| 1564 } | 1541 } |
| 1565 } | 1542 } |
| 1566 | 1543 |
| 1567 int64_t nativeStart; | 1544 int64_t nativeStart; |
| 1568 nativeStart = start; | 1545 nativeStart = start; |
| 1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { | 1546 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| 1570 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1547 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1571 return FALSE; | 1548 return FALSE; |
| 1572 } | 1549 } |
| 1573 | 1550 |
| 1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| 1575 MatchChunkAt((int32_t)nativeStart, FALSE, status); | 1552 MatchChunkAt((int32_t)nativeStart, FALSE, status); |
| 1576 } else { | 1553 } else { |
| 1577 MatchAt(nativeStart, FALSE, status); | 1554 MatchAt(nativeStart, FALSE, status); |
| 1578 } | 1555 } |
| 1579 return fMatch; | 1556 return fMatch; |
| 1580 } | 1557 } |
| 1581 | 1558 |
| 1582 | 1559 |
| 1583 | 1560 |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1616 | 1593 |
| 1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { | 1594 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { |
| 1618 if (U_FAILURE(status)) { | 1595 if (U_FAILURE(status)) { |
| 1619 return FALSE; | 1596 return FALSE; |
| 1620 } | 1597 } |
| 1621 if (U_FAILURE(fDeferredStatus)) { | 1598 if (U_FAILURE(fDeferredStatus)) { |
| 1622 status = fDeferredStatus; | 1599 status = fDeferredStatus; |
| 1623 return FALSE; | 1600 return FALSE; |
| 1624 } | 1601 } |
| 1625 reset(); | 1602 reset(); |
| 1626 | 1603 |
| 1627 if (start < 0) { | 1604 if (start < 0) { |
| 1628 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1605 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1629 return FALSE; | 1606 return FALSE; |
| 1630 } | 1607 } |
| 1631 | 1608 |
| 1632 if (fInputUniStrMaybeMutable) { | 1609 if (fInputUniStrMaybeMutable) { |
| 1633 if (compat_SyncMutableUTextContents(fInputText)) { | 1610 if (compat_SyncMutableUTextContents(fInputText)) { |
| 1634 fInputLength = utext_nativeLength(fInputText); | 1611 fInputLength = utext_nativeLength(fInputText); |
| 1635 reset(); | 1612 reset(); |
| 1636 } | 1613 } |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1666 | 1643 |
| 1667 //------------------------------------------------------------------------------
-- | 1644 //------------------------------------------------------------------------------
-- |
| 1668 // | 1645 // |
| 1669 // region | 1646 // region |
| 1670 // | 1647 // |
| 1671 //------------------------------------------------------------------------------
-- | 1648 //------------------------------------------------------------------------------
-- |
| 1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int
64_t startIndex, UErrorCode &status) { | 1649 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int
64_t startIndex, UErrorCode &status) { |
| 1673 if (U_FAILURE(status)) { | 1650 if (U_FAILURE(status)) { |
| 1674 return *this; | 1651 return *this; |
| 1675 } | 1652 } |
| 1676 | 1653 |
| 1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { | 1654 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { |
| 1678 status = U_ILLEGAL_ARGUMENT_ERROR; | 1655 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1679 } | 1656 } |
| 1680 | 1657 |
| 1681 int64_t nativeStart = regionStart; | 1658 int64_t nativeStart = regionStart; |
| 1682 int64_t nativeLimit = regionLimit; | 1659 int64_t nativeLimit = regionLimit; |
| 1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) { | 1660 if (nativeStart > fInputLength || nativeLimit > fInputLength) { |
| 1684 status = U_ILLEGAL_ARGUMENT_ERROR; | 1661 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1685 } | 1662 } |
| 1686 | 1663 |
| 1687 if (startIndex == -1) | 1664 if (startIndex == -1) |
| 1688 this->reset(); | 1665 this->reset(); |
| 1689 else | 1666 else |
| 1690 resetPreserveRegion(); | 1667 resetPreserveRegion(); |
| 1691 | 1668 |
| 1692 fRegionStart = nativeStart; | 1669 fRegionStart = nativeStart; |
| 1693 fRegionLimit = nativeLimit; | 1670 fRegionLimit = nativeLimit; |
| 1694 fActiveStart = nativeStart; | 1671 fActiveStart = nativeStart; |
| 1695 fActiveLimit = nativeLimit; | 1672 fActiveLimit = nativeLimit; |
| 1696 | 1673 |
| 1697 if (startIndex != -1) { | 1674 if (startIndex != -1) { |
| 1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) { | 1675 if (startIndex < fActiveStart || startIndex > fActiveLimit) { |
| 1699 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1676 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1700 } | 1677 } |
| 1701 fMatchEnd = startIndex; | 1678 fMatchEnd = startIndex; |
| 1702 } | 1679 } |
| 1703 | 1680 |
| 1704 if (!fTransparentBounds) { | 1681 if (!fTransparentBounds) { |
| 1705 fLookStart = nativeStart; | 1682 fLookStart = nativeStart; |
| 1706 fLookLimit = nativeLimit; | 1683 fLookLimit = nativeLimit; |
| 1707 } | 1684 } |
| 1708 if (fAnchoringBounds) { | 1685 if (fAnchoringBounds) { |
| 1709 fAnchorStart = nativeStart; | 1686 fAnchorStart = nativeStart; |
| 1710 fAnchorLimit = nativeLimit; | 1687 fAnchorLimit = nativeLimit; |
| 1711 } | 1688 } |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1748 // replaceAll | 1725 // replaceAll |
| 1749 // | 1726 // |
| 1750 //------------------------------------------------------------------------------
-- | 1727 //------------------------------------------------------------------------------
-- |
| 1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC
ode &status) { | 1728 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC
ode &status) { |
| 1752 UText replacementText = UTEXT_INITIALIZER; | 1729 UText replacementText = UTEXT_INITIALIZER; |
| 1753 UText resultText = UTEXT_INITIALIZER; | 1730 UText resultText = UTEXT_INITIALIZER; |
| 1754 UnicodeString resultString; | 1731 UnicodeString resultString; |
| 1755 if (U_FAILURE(status)) { | 1732 if (U_FAILURE(status)) { |
| 1756 return resultString; | 1733 return resultString; |
| 1757 } | 1734 } |
| 1758 | 1735 |
| 1759 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 1736 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 1760 utext_openUnicodeString(&resultText, &resultString, &status); | 1737 utext_openUnicodeString(&resultText, &resultString, &status); |
| 1761 | 1738 |
| 1762 replaceAll(&replacementText, &resultText, status); | 1739 replaceAll(&replacementText, &resultText, status); |
| 1763 | 1740 |
| 1764 utext_close(&resultText); | 1741 utext_close(&resultText); |
| 1765 utext_close(&replacementText); | 1742 utext_close(&replacementText); |
| 1766 | 1743 |
| 1767 return resultString; | 1744 return resultString; |
| 1768 } | 1745 } |
| 1769 | 1746 |
| 1770 | 1747 |
| 1771 // | 1748 // |
| 1772 // replaceAll, UText mode | 1749 // replaceAll, UText mode |
| 1773 // | 1750 // |
| 1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
tus) { | 1751 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
tus) { |
| 1775 if (U_FAILURE(status)) { | 1752 if (U_FAILURE(status)) { |
| 1776 return dest; | 1753 return dest; |
| 1777 } | 1754 } |
| 1778 if (U_FAILURE(fDeferredStatus)) { | 1755 if (U_FAILURE(fDeferredStatus)) { |
| 1779 status = fDeferredStatus; | 1756 status = fDeferredStatus; |
| 1780 return dest; | 1757 return dest; |
| 1781 } | 1758 } |
| 1782 | 1759 |
| 1783 if (dest == NULL) { | 1760 if (dest == NULL) { |
| 1784 UnicodeString emptyString; | 1761 UnicodeString emptyString; |
| 1785 UText empty = UTEXT_INITIALIZER; | 1762 UText empty = UTEXT_INITIALIZER; |
| 1786 | 1763 |
| 1787 utext_openUnicodeString(&empty, &emptyString, &status); | 1764 utext_openUnicodeString(&empty, &emptyString, &status); |
| 1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); | 1765 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| 1789 utext_close(&empty); | 1766 utext_close(&empty); |
| 1790 } | 1767 } |
| 1791 | 1768 |
| 1792 if (U_SUCCESS(status)) { | 1769 if (U_SUCCESS(status)) { |
| 1793 reset(); | 1770 reset(); |
| 1794 while (find()) { | 1771 while (find()) { |
| 1795 appendReplacement(dest, replacement, status); | 1772 appendReplacement(dest, replacement, status); |
| 1796 if (U_FAILURE(status)) { | 1773 if (U_FAILURE(status)) { |
| 1797 break; | 1774 break; |
| 1798 } | 1775 } |
| 1799 } | 1776 } |
| 1800 appendTail(dest, status); | 1777 appendTail(dest, status); |
| 1801 } | 1778 } |
| 1802 | 1779 |
| 1803 return dest; | 1780 return dest; |
| 1804 } | 1781 } |
| 1805 | 1782 |
| 1806 | 1783 |
| 1807 //------------------------------------------------------------------------------
-- | 1784 //------------------------------------------------------------------------------
-- |
| 1808 // | 1785 // |
| 1809 // replaceFirst | 1786 // replaceFirst |
| 1810 // | 1787 // |
| 1811 //------------------------------------------------------------------------------
-- | 1788 //------------------------------------------------------------------------------
-- |
| 1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
rCode &status) { | 1789 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
rCode &status) { |
| 1813 UText replacementText = UTEXT_INITIALIZER; | 1790 UText replacementText = UTEXT_INITIALIZER; |
| 1814 UText resultText = UTEXT_INITIALIZER; | 1791 UText resultText = UTEXT_INITIALIZER; |
| 1815 UnicodeString resultString; | 1792 UnicodeString resultString; |
| 1816 | 1793 |
| 1817 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 1794 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| 1818 utext_openUnicodeString(&resultText, &resultString, &status); | 1795 utext_openUnicodeString(&resultText, &resultString, &status); |
| 1819 | 1796 |
| 1820 replaceFirst(&replacementText, &resultText, status); | 1797 replaceFirst(&replacementText, &resultText, status); |
| 1821 | 1798 |
| 1822 utext_close(&resultText); | 1799 utext_close(&resultText); |
| 1823 utext_close(&replacementText); | 1800 utext_close(&replacementText); |
| 1824 | 1801 |
| 1825 return resultString; | 1802 return resultString; |
| 1826 } | 1803 } |
| 1827 | 1804 |
| 1828 // | 1805 // |
| 1829 // replaceFirst, UText mode | 1806 // replaceFirst, UText mode |
| 1830 // | 1807 // |
| 1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
tatus) { | 1808 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
tatus) { |
| 1832 if (U_FAILURE(status)) { | 1809 if (U_FAILURE(status)) { |
| 1833 return dest; | 1810 return dest; |
| 1834 } | 1811 } |
| 1835 if (U_FAILURE(fDeferredStatus)) { | 1812 if (U_FAILURE(fDeferredStatus)) { |
| 1836 status = fDeferredStatus; | 1813 status = fDeferredStatus; |
| 1837 return dest; | 1814 return dest; |
| 1838 } | 1815 } |
| 1839 | 1816 |
| 1840 reset(); | 1817 reset(); |
| 1841 if (!find()) { | 1818 if (!find()) { |
| 1842 return getInput(dest, status); | 1819 return getInput(dest, status); |
| 1843 } | 1820 } |
| 1844 | 1821 |
| 1845 if (dest == NULL) { | 1822 if (dest == NULL) { |
| 1846 UnicodeString emptyString; | 1823 UnicodeString emptyString; |
| 1847 UText empty = UTEXT_INITIALIZER; | 1824 UText empty = UTEXT_INITIALIZER; |
| 1848 | 1825 |
| 1849 utext_openUnicodeString(&empty, &emptyString, &status); | 1826 utext_openUnicodeString(&empty, &emptyString, &status); |
| 1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); | 1827 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| 1851 utext_close(&empty); | 1828 utext_close(&empty); |
| 1852 } | 1829 } |
| 1853 | 1830 |
| 1854 appendReplacement(dest, replacement, status); | 1831 appendReplacement(dest, replacement, status); |
| 1855 appendTail(dest, status); | 1832 appendTail(dest, status); |
| 1856 | 1833 |
| 1857 return dest; | 1834 return dest; |
| 1858 } | 1835 } |
| 1859 | 1836 |
| 1860 | 1837 |
| 1861 //------------------------------------------------------------------------------
-- | 1838 //------------------------------------------------------------------------------
-- |
| 1862 // | 1839 // |
| 1863 // requireEnd | 1840 // requireEnd |
| 1864 // | 1841 // |
| 1865 //------------------------------------------------------------------------------
-- | 1842 //------------------------------------------------------------------------------
-- |
| 1866 UBool RegexMatcher::requireEnd() const { | 1843 UBool RegexMatcher::requireEnd() const { |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1901 //resetStack(); // more expensive than it looks... | 1878 //resetStack(); // more expensive than it looks... |
| 1902 } | 1879 } |
| 1903 | 1880 |
| 1904 | 1881 |
| 1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { | 1882 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
| 1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat
us); | 1883 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat
us); |
| 1907 if (fPattern->fNeedsAltInput) { | 1884 if (fPattern->fNeedsAltInput) { |
| 1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe
ferredStatus); | 1885 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe
ferredStatus); |
| 1909 } | 1886 } |
| 1910 fInputLength = utext_nativeLength(fInputText); | 1887 fInputLength = utext_nativeLength(fInputText); |
| 1911 | 1888 |
| 1912 reset(); | 1889 reset(); |
| 1913 delete fInput; | 1890 delete fInput; |
| 1914 fInput = NULL; | 1891 fInput = NULL; |
| 1915 | 1892 |
| 1916 // Do the following for any UnicodeString. | 1893 // Do the following for any UnicodeString. |
| 1917 // This is for compatibility for those clients who modify the input string
"live" during regex operations. | 1894 // This is for compatibility for those clients who modify the input string
"live" during regex operations. |
| 1918 fInputUniStrMaybeMutable = TRUE; | 1895 fInputUniStrMaybeMutable = TRUE; |
| 1919 | 1896 |
| 1920 if (fWordBreakItr != NULL) { | 1897 if (fWordBreakItr != NULL) { |
| 1921 #if UCONFIG_NO_BREAK_ITERATION==0 | 1898 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 1922 UErrorCode status = U_ZERO_ERROR; | 1899 UErrorCode status = U_ZERO_ERROR; |
| 1923 fWordBreakItr->setText(fInputText, status); | 1900 fWordBreakItr->setText(fInputText, status); |
| 1924 #endif | 1901 #endif |
| 1925 } | 1902 } |
| 1926 return *this; | 1903 return *this; |
| 1927 } | 1904 } |
| 1928 | 1905 |
| 1929 | 1906 |
| 1930 RegexMatcher &RegexMatcher::reset(UText *input) { | 1907 RegexMatcher &RegexMatcher::reset(UText *input) { |
| 1931 if (fInputText != input) { | 1908 if (fInputText != input) { |
| 1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu
s); | 1909 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu
s); |
| 1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText,
fInputText, FALSE, TRUE, &fDeferredStatus); | 1910 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText,
fInputText, FALSE, TRUE, &fDeferredStatus); |
| 1934 fInputLength = utext_nativeLength(fInputText); | 1911 fInputLength = utext_nativeLength(fInputText); |
| 1935 | 1912 |
| 1936 delete fInput; | 1913 delete fInput; |
| 1937 fInput = NULL; | 1914 fInput = NULL; |
| 1938 | 1915 |
| 1939 if (fWordBreakItr != NULL) { | 1916 if (fWordBreakItr != NULL) { |
| 1940 #if UCONFIG_NO_BREAK_ITERATION==0 | 1917 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 1941 UErrorCode status = U_ZERO_ERROR; | 1918 UErrorCode status = U_ZERO_ERROR; |
| 1942 fWordBreakItr->setText(input, status); | 1919 fWordBreakItr->setText(input, status); |
| 1943 #endif | 1920 #endif |
| 1944 } | 1921 } |
| 1945 } | 1922 } |
| 1946 reset(); | 1923 reset(); |
| 1947 fInputUniStrMaybeMutable = FALSE; | 1924 fInputUniStrMaybeMutable = FALSE; |
| 1948 | 1925 |
| 1949 return *this; | 1926 return *this; |
| 1950 } | 1927 } |
| 1951 | 1928 |
| 1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) { | 1929 /*RegexMatcher &RegexMatcher::reset(const UChar *) { |
| 1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; | 1930 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; |
| 1954 return *this; | 1931 return *this; |
| 1955 }*/ | 1932 }*/ |
| 1956 | 1933 |
| 1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { | 1934 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { |
| 1958 if (U_FAILURE(status)) { | 1935 if (U_FAILURE(status)) { |
| 1959 return *this; | 1936 return *this; |
| 1960 } | 1937 } |
| 1961 reset(); // Reset also resets the region to be the entire string. | 1938 reset(); // Reset also resets the region to be the entire string. |
| 1962 | 1939 |
| 1963 if (position < 0 || position > fActiveLimit) { | 1940 if (position < 0 || position > fActiveLimit) { |
| 1964 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1941 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 1965 return *this; | 1942 return *this; |
| 1966 } | 1943 } |
| 1967 fMatchEnd = position; | 1944 fMatchEnd = position; |
| 1968 return *this; | 1945 return *this; |
| 1969 } | 1946 } |
| 1970 | 1947 |
| 1971 | 1948 |
| 1972 //------------------------------------------------------------------------------
-- | 1949 //------------------------------------------------------------------------------
-- |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2036 | 2013 |
| 2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); | 2014 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); |
| 2038 if (destText == NULL) { | 2015 if (destText == NULL) { |
| 2039 status = U_MEMORY_ALLOCATION_ERROR; | 2016 status = U_MEMORY_ALLOCATION_ERROR; |
| 2040 return 0; | 2017 return 0; |
| 2041 } | 2018 } |
| 2042 int32_t i; | 2019 int32_t i; |
| 2043 for (i = 0; i < destCapacity; i++) { | 2020 for (i = 0; i < destCapacity; i++) { |
| 2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); | 2021 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); |
| 2045 } | 2022 } |
| 2046 | 2023 |
| 2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status); | 2024 int32_t fieldCount = split(&inputText, destText, destCapacity, status); |
| 2048 | 2025 |
| 2049 for (i = 0; i < destCapacity; i++) { | 2026 for (i = 0; i < destCapacity; i++) { |
| 2050 utext_close(destText[i]); | 2027 utext_close(destText[i]); |
| 2051 } | 2028 } |
| 2052 | 2029 |
| 2053 uprv_free(destText); | 2030 uprv_free(destText); |
| 2054 utext_close(&inputText); | 2031 utext_close(&inputText); |
| 2055 return fieldCount; | 2032 return fieldCount; |
| 2056 } | 2033 } |
| 2057 | 2034 |
| 2058 // | 2035 // |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2094 // There is one or zero output string left. | 2071 // There is one or zero output string left. |
| 2095 // Fill the last output string with whatever is left from the input,
then exit the loop. | 2072 // Fill the last output string with whatever is left from the input,
then exit the loop. |
| 2096 // ( i will be == destCapacity if we filled the output array while
processing | 2073 // ( i will be == destCapacity if we filled the output array while
processing |
| 2097 // capture groups of the delimiter expression, in which case we w
ill discard the | 2074 // capture groups of the delimiter expression, in which case we w
ill discard the |
| 2098 // last capture group saved in favor of the unprocessed remainder
of the | 2075 // last capture group saved in favor of the unprocessed remainder
of the |
| 2099 // input string.) | 2076 // input string.) |
| 2100 i = destCapacity-1; | 2077 i = destCapacity-1; |
| 2101 if (fActiveLimit > nextOutputStringStart) { | 2078 if (fActiveLimit > nextOutputStringStart) { |
| 2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2079 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2103 if (dest[i]) { | 2080 if (dest[i]) { |
| 2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2081 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2105 input->chunkContents+nextOutputStringStart
, | 2082 input->chunkContents+nextOutputStringStart
, |
| 2106 (int32_t)(fActiveLimit-nextOutputStringSta
rt), &status); | 2083 (int32_t)(fActiveLimit-nextOutputStringSta
rt), &status); |
| 2107 } else { | 2084 } else { |
| 2108 UText remainingText = UTEXT_INITIALIZER; | 2085 UText remainingText = UTEXT_INITIALIZER; |
| 2109 utext_openUChars(&remainingText, input->chunkContents+ne
xtOutputStringStart, | 2086 utext_openUChars(&remainingText, input->chunkContents+ne
xtOutputStringStart, |
| 2110 fActiveLimit-nextOutputStringStart, &st
atus); | 2087 fActiveLimit-nextOutputStringStart, &st
atus); |
| 2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); | 2088 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
| 2112 utext_close(&remainingText); | 2089 utext_close(&remainingText); |
| 2113 } | 2090 } |
| 2114 } else { | 2091 } else { |
| 2115 UErrorCode lengthStatus = U_ZERO_ERROR; | 2092 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2116 int32_t remaining16Length = | 2093 int32_t remaining16Length = |
| 2117 utext_extract(input, nextOutputStringStart, fActiveLimit
, NULL, 0, &lengthStatus); | 2094 utext_extract(input, nextOutputStringStart, fActiveLimit
, NULL, 0, &lengthStatus); |
| 2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(
remaining16Length+1)); | 2095 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(
remaining16Length+1)); |
| 2119 if (remainingChars == NULL) { | 2096 if (remainingChars == NULL) { |
| 2120 status = U_MEMORY_ALLOCATION_ERROR; | 2097 status = U_MEMORY_ALLOCATION_ERROR; |
| 2121 break; | 2098 break; |
| 2122 } | 2099 } |
| 2123 | 2100 |
| 2124 utext_extract(input, nextOutputStringStart, fActiveLimit, re
mainingChars, remaining16Length+1, &status); | 2101 utext_extract(input, nextOutputStringStart, fActiveLimit, re
mainingChars, remaining16Length+1, &status); |
| 2125 if (dest[i]) { | 2102 if (dest[i]) { |
| 2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r
emainingChars, remaining16Length, &status); | 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r
emainingChars, remaining16Length, &status); |
| 2127 } else { | 2104 } else { |
| 2128 UText remainingText = UTEXT_INITIALIZER; | 2105 UText remainingText = UTEXT_INITIALIZER; |
| 2129 utext_openUChars(&remainingText, remainingChars, remaini
ng16Length, &status); | 2106 utext_openUChars(&remainingText, remainingChars, remaini
ng16Length, &status); |
| 2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); | 2107 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
| 2131 utext_close(&remainingText); | 2108 utext_close(&remainingText); |
| 2132 } | 2109 } |
| 2133 | 2110 |
| 2134 uprv_free(remainingChars); | 2111 uprv_free(remainingChars); |
| 2135 } | 2112 } |
| 2136 } | 2113 } |
| 2137 break; | 2114 break; |
| 2138 } | 2115 } |
| 2139 if (find()) { | 2116 if (find()) { |
| 2140 // We found another delimiter. Move everything from where we starte
d looking | 2117 // We found another delimiter. Move everything from where we starte
d looking |
| 2141 // up until the start of the delimiter into the next output string. | 2118 // up until the start of the delimiter into the next output string. |
| 2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2119 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2143 if (dest[i]) { | 2120 if (dest[i]) { |
| 2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2121 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2145 input->chunkContents+nextOutputStringStart, | 2122 input->chunkContents+nextOutputStringStart, |
| 2146 (int32_t)(fMatchStart-nextOutputStringStart),
&status); | 2123 (int32_t)(fMatchStart-nextOutputStringStart),
&status); |
| 2147 } else { | 2124 } else { |
| 2148 UText remainingText = UTEXT_INITIALIZER; | 2125 UText remainingText = UTEXT_INITIALIZER; |
| 2149 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, | 2126 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
| 2150 fMatchStart-nextOutputStringStart, &status
); | 2127 fMatchStart-nextOutputStringStart, &status
); |
| 2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2128 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2152 utext_close(&remainingText); | 2129 utext_close(&remainingText); |
| 2153 } | 2130 } |
| 2154 } else { | 2131 } else { |
| 2155 UErrorCode lengthStatus = U_ZERO_ERROR; | 2132 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2156 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fMatchStart, NULL, 0, &lengthStatus); | 2133 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fMatchStart, NULL, 0, &lengthStatus); |
| 2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); | 2134 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
| 2158 if (remainingChars == NULL) { | 2135 if (remainingChars == NULL) { |
| 2159 status = U_MEMORY_ALLOCATION_ERROR; | 2136 status = U_MEMORY_ALLOCATION_ERROR; |
| 2160 break; | 2137 break; |
| 2161 } | 2138 } |
| 2162 utext_extract(input, nextOutputStringStart, fMatchStart, remaini
ngChars, remaining16Length+1, &status); | 2139 utext_extract(input, nextOutputStringStart, fMatchStart, remaini
ngChars, remaining16Length+1, &status); |
| 2163 if (dest[i]) { | 2140 if (dest[i]) { |
| 2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); | 2141 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
| 2165 } else { | 2142 } else { |
| 2166 UText remainingText = UTEXT_INITIALIZER; | 2143 UText remainingText = UTEXT_INITIALIZER; |
| 2167 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); | 2144 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
| 2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2145 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2169 utext_close(&remainingText); | 2146 utext_close(&remainingText); |
| 2170 } | 2147 } |
| 2171 | 2148 |
| 2172 uprv_free(remainingChars); | 2149 uprv_free(remainingChars); |
| 2173 } | 2150 } |
| 2174 nextOutputStringStart = fMatchEnd; | 2151 nextOutputStringStart = fMatchEnd; |
| 2175 | 2152 |
| 2176 // If the delimiter pattern has capturing parentheses, the captured | 2153 // If the delimiter pattern has capturing parentheses, the captured |
| 2177 // text goes out into the next n destination strings. | 2154 // text goes out into the next n destination strings. |
| 2178 int32_t groupNum; | 2155 int32_t groupNum; |
| 2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { | 2156 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
| 2180 if (i >= destCapacity-2) { | 2157 if (i >= destCapacity-2) { |
| 2181 // Never fill the last available output string with capture
group text. | 2158 // Never fill the last available output string with capture
group text. |
| (...skipping 12 matching lines...) Expand all Loading... |
| 2194 if (i+1 < destCapacity) { | 2171 if (i+1 < destCapacity) { |
| 2195 ++i; | 2172 ++i; |
| 2196 if (dest[i] == NULL) { | 2173 if (dest[i] == NULL) { |
| 2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status); | 2174 dest[i] = utext_openUChars(NULL, NULL, 0, &status); |
| 2198 } else { | 2175 } else { |
| 2199 static UChar emptyString[] = {(UChar)0}; | 2176 static UChar emptyString[] = {(UChar)0}; |
| 2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e
mptyString, 0, &status); | 2177 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e
mptyString, 0, &status); |
| 2201 } | 2178 } |
| 2202 } | 2179 } |
| 2203 break; | 2180 break; |
| 2204 | 2181 |
| 2205 } | 2182 } |
| 2206 } | 2183 } |
| 2207 else | 2184 else |
| 2208 { | 2185 { |
| 2209 // We ran off the end of the input while looking for the next delimi
ter. | 2186 // We ran off the end of the input while looking for the next delimi
ter. |
| 2210 // All the remaining text goes into the current output string. | 2187 // All the remaining text goes into the current output string. |
| 2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2188 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| 2212 if (dest[i]) { | 2189 if (dest[i]) { |
| 2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2190 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| 2214 input->chunkContents+nextOutputStringStart, | 2191 input->chunkContents+nextOutputStringStart, |
| 2215 (int32_t)(fActiveLimit-nextOutputStringStart),
&status); | 2192 (int32_t)(fActiveLimit-nextOutputStringStart),
&status); |
| 2216 } else { | 2193 } else { |
| 2217 UText remainingText = UTEXT_INITIALIZER; | 2194 UText remainingText = UTEXT_INITIALIZER; |
| 2218 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, | 2195 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
| 2219 fActiveLimit-nextOutputStringStart, &status
); | 2196 fActiveLimit-nextOutputStringStart, &status
); |
| 2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2197 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2221 utext_close(&remainingText); | 2198 utext_close(&remainingText); |
| 2222 } | 2199 } |
| 2223 } else { | 2200 } else { |
| 2224 UErrorCode lengthStatus = U_ZERO_ERROR; | 2201 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 2225 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fActiveLimit, NULL, 0, &lengthStatus); | 2202 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fActiveLimit, NULL, 0, &lengthStatus); |
| 2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); | 2203 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
| 2227 if (remainingChars == NULL) { | 2204 if (remainingChars == NULL) { |
| 2228 status = U_MEMORY_ALLOCATION_ERROR; | 2205 status = U_MEMORY_ALLOCATION_ERROR; |
| 2229 break; | 2206 break; |
| 2230 } | 2207 } |
| 2231 | 2208 |
| 2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remain
ingChars, remaining16Length+1, &status); | 2209 utext_extract(input, nextOutputStringStart, fActiveLimit, remain
ingChars, remaining16Length+1, &status); |
| 2233 if (dest[i]) { | 2210 if (dest[i]) { |
| 2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); | 2211 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
| 2235 } else { | 2212 } else { |
| 2236 UText remainingText = UTEXT_INITIALIZER; | 2213 UText remainingText = UTEXT_INITIALIZER; |
| 2237 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); | 2214 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
| 2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2215 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
| 2239 utext_close(&remainingText); | 2216 utext_close(&remainingText); |
| 2240 } | 2217 } |
| 2241 | 2218 |
| 2242 uprv_free(remainingChars); | 2219 uprv_free(remainingChars); |
| 2243 } | 2220 } |
| 2244 break; | 2221 break; |
| 2245 } | 2222 } |
| 2246 if (U_FAILURE(status)) { | 2223 if (U_FAILURE(status)) { |
| 2247 break; | 2224 break; |
| 2248 } | 2225 } |
| 2249 } // end of for loop | 2226 } // end of for loop |
| 2250 return i+1; | 2227 return i+1; |
| 2251 } | 2228 } |
| (...skipping 29 matching lines...) Expand all Loading... |
| 2281 if (fMatch == FALSE) { | 2258 if (fMatch == FALSE) { |
| 2282 status = U_REGEX_INVALID_STATE; | 2259 status = U_REGEX_INVALID_STATE; |
| 2283 return -1; | 2260 return -1; |
| 2284 } | 2261 } |
| 2285 if (group < 0 || group > fPattern->fGroupMap->size()) { | 2262 if (group < 0 || group > fPattern->fGroupMap->size()) { |
| 2286 status = U_INDEX_OUTOFBOUNDS_ERROR; | 2263 status = U_INDEX_OUTOFBOUNDS_ERROR; |
| 2287 return -1; | 2264 return -1; |
| 2288 } | 2265 } |
| 2289 int64_t s; | 2266 int64_t s; |
| 2290 if (group == 0) { | 2267 if (group == 0) { |
| 2291 s = fMatchStart; | 2268 s = fMatchStart; |
| 2292 } else { | 2269 } else { |
| 2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); | 2270 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| 2294 U_ASSERT(groupOffset < fPattern->fFrameSize); | 2271 U_ASSERT(groupOffset < fPattern->fFrameSize); |
| 2295 U_ASSERT(groupOffset >= 0); | 2272 U_ASSERT(groupOffset >= 0); |
| 2296 s = fFrame->fExtra[groupOffset]; | 2273 s = fFrame->fExtra[groupOffset]; |
| 2297 } | 2274 } |
| 2298 | 2275 |
| 2299 return s; | 2276 return s; |
| 2300 } | 2277 } |
| 2301 | 2278 |
| 2302 | 2279 |
| 2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { | 2280 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { |
| 2304 return (int32_t)start64(group, status); | 2281 return (int32_t)start64(group, status); |
| 2305 } | 2282 } |
| 2306 | 2283 |
| 2307 //------------------------------------------------------------------------------
-- | 2284 //------------------------------------------------------------------------------
-- |
| 2308 // | 2285 // |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2370 return; | 2347 return; |
| 2371 } | 2348 } |
| 2372 if (U_FAILURE(fDeferredStatus)) { | 2349 if (U_FAILURE(fDeferredStatus)) { |
| 2373 status = fDeferredStatus; | 2350 status = fDeferredStatus; |
| 2374 return; | 2351 return; |
| 2375 } | 2352 } |
| 2376 if (limit < 0) { | 2353 if (limit < 0) { |
| 2377 status = U_ILLEGAL_ARGUMENT_ERROR; | 2354 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 2378 return; | 2355 return; |
| 2379 } | 2356 } |
| 2380 | 2357 |
| 2381 // Reset the matcher. This is needed here in case there is a current match | 2358 // Reset the matcher. This is needed here in case there is a current match |
| 2382 // whose final stack frame (containing the match results, pointed to by f
Frame) | 2359 // whose final stack frame (containing the match results, pointed to by f
Frame) |
| 2383 // would be lost by resizing to a smaller stack size. | 2360 // would be lost by resizing to a smaller stack size. |
| 2384 reset(); | 2361 reset(); |
| 2385 | 2362 |
| 2386 if (limit == 0) { | 2363 if (limit == 0) { |
| 2387 // Unlimited stack expansion | 2364 // Unlimited stack expansion |
| 2388 fStack->setMaxCapacity(0); | 2365 fStack->setMaxCapacity(0); |
| 2389 } else { | 2366 } else { |
| 2390 // Change the units of the limit from bytes to ints, and bump the size
up | 2367 // Change the units of the limit from bytes to ints, and bump the size
up |
| 2391 // to be big enough to hold at least one stack frame for the pattern, | 2368 // to be big enough to hold at least one stack frame for the pattern, |
| 2392 // if it isn't there already. | 2369 // if it isn't there already. |
| 2393 int32_t adjustedLimit = limit / sizeof(int32_t); | 2370 int32_t adjustedLimit = limit / sizeof(int32_t); |
| 2394 if (adjustedLimit < fPattern->fFrameSize) { | 2371 if (adjustedLimit < fPattern->fFrameSize) { |
| 2395 adjustedLimit = fPattern->fFrameSize; | 2372 adjustedLimit = fPattern->fFrameSize; |
| 2396 } | 2373 } |
| 2397 fStack->setMaxCapacity(adjustedLimit); | 2374 fStack->setMaxCapacity(adjustedLimit); |
| 2398 } | 2375 } |
| 2399 fStackLimit = limit; | 2376 fStackLimit = limit; |
| 2400 } | 2377 } |
| 2401 | 2378 |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2479 // Code following this point in this file is the internal | 2456 // Code following this point in this file is the internal |
| 2480 // Match Engine Implementation. | 2457 // Match Engine Implementation. |
| 2481 // | 2458 // |
| 2482 //==============================================================================
== | 2459 //==============================================================================
== |
| 2483 | 2460 |
| 2484 | 2461 |
| 2485 //------------------------------------------------------------------------------
-- | 2462 //------------------------------------------------------------------------------
-- |
| 2486 // | 2463 // |
| 2487 // resetStack | 2464 // resetStack |
| 2488 // Discard any previous contents of the state save stack, and initiali
ze a | 2465 // Discard any previous contents of the state save stack, and initiali
ze a |
| 2489 // new stack frame to all -1. The -1s are needed for capture group li
mits, | 2466 // new stack frame to all -1. The -1s are needed for capture group li
mits, |
| 2490 // where they indicate that a group has not yet matched anything. | 2467 // where they indicate that a group has not yet matched anything. |
| 2491 //------------------------------------------------------------------------------
-- | 2468 //------------------------------------------------------------------------------
-- |
| 2492 REStackFrame *RegexMatcher::resetStack() { | 2469 REStackFrame *RegexMatcher::resetStack() { |
| 2493 // Discard any previous contents of the state save stack, and initialize a | 2470 // Discard any previous contents of the state save stack, and initialize a |
| 2494 // new stack frame with all -1 data. The -1s are needed for capture group
limits, | 2471 // new stack frame with all -1 data. The -1s are needed for capture group
limits, |
| 2495 // where they indicate that a group has not yet matched anything. | 2472 // where they indicate that a group has not yet matched anything. |
| 2496 fStack->removeAllElements(); | 2473 fStack->removeAllElements(); |
| 2497 | 2474 |
| 2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame
Size, fDeferredStatus); | 2475 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame
Size, fDeferredStatus); |
| 2499 int32_t i; | 2476 int32_t i; |
| 2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { | 2477 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
| 2501 iFrame->fExtra[i] = -1; | 2478 iFrame->fExtra[i] = -1; |
| 2502 } | 2479 } |
| 2503 return iFrame; | 2480 return iFrame; |
| 2504 } | 2481 } |
| 2505 | 2482 |
| 2506 | 2483 |
| 2507 | 2484 |
| 2508 //------------------------------------------------------------------------------
-- | 2485 //------------------------------------------------------------------------------
-- |
| 2509 // | 2486 // |
| 2510 // isWordBoundary | 2487 // isWordBoundary |
| 2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 | 2488 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 |
| 2512 // For us, | 2489 // For us, |
| 2513 // If the current char is a combining mark, | 2490 // If the current char is a combining mark, |
| 2514 // \b is FALSE. | 2491 // \b is FALSE. |
| 2515 // Else Scan backwards to the first non-combining char. | 2492 // Else Scan backwards to the first non-combining char. |
| 2516 // We are at a boundary if the this char and the orig
inal chars are | 2493 // We are at a boundary if the this char and the orig
inal chars are |
| 2517 // opposite in membership in \w set | 2494 // opposite in membership in \w set |
| 2518 // | 2495 // |
| 2519 // parameters: pos - the current position in the input buffer | 2496 // parameters: pos - the current position in the input buffer |
| 2520 // | 2497 // |
| 2521 // TODO: double-check edge cases at region boundaries. | 2498 // TODO: double-check edge cases at region boundaries. |
| 2522 // | 2499 // |
| 2523 //------------------------------------------------------------------------------
-- | 2500 //------------------------------------------------------------------------------
-- |
| 2524 UBool RegexMatcher::isWordBoundary(int64_t pos) { | 2501 UBool RegexMatcher::isWordBoundary(int64_t pos) { |
| 2525 UBool isBoundary = FALSE; | 2502 UBool isBoundary = FALSE; |
| 2526 UBool cIsWord = FALSE; | 2503 UBool cIsWord = FALSE; |
| 2527 | 2504 |
| 2528 if (pos >= fLookLimit) { | 2505 if (pos >= fLookLimit) { |
| 2529 fHitEnd = TRUE; | 2506 fHitEnd = TRUE; |
| 2530 } else { | 2507 } else { |
| 2531 // Determine whether char c at current position is a member of the word
set of chars. | 2508 // Determine whether char c at current position is a member of the word
set of chars. |
| 2532 // If we're off the end of the string, behave as though we're not at a w
ord char. | 2509 // If we're off the end of the string, behave as though we're not at a w
ord char. |
| 2533 UTEXT_SETNATIVEINDEX(fInputText, pos); | 2510 UTEXT_SETNATIVEINDEX(fInputText, pos); |
| 2534 UChar32 c = UTEXT_CURRENT32(fInputText); | 2511 UChar32 c = UTEXT_CURRENT32(fInputText); |
| 2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { | 2512 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
| 2536 // Current char is a combining one. Not a boundary. | 2513 // Current char is a combining one. Not a boundary. |
| 2537 return FALSE; | 2514 return FALSE; |
| 2538 } | 2515 } |
| 2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); | 2516 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| 2540 } | 2517 } |
| 2541 | 2518 |
| 2542 // Back up until we come to a non-combining char, determine whether | 2519 // Back up until we come to a non-combining char, determine whether |
| 2543 // that char is a word char. | 2520 // that char is a word char. |
| 2544 UBool prevCIsWord = FALSE; | 2521 UBool prevCIsWord = FALSE; |
| 2545 for (;;) { | 2522 for (;;) { |
| 2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { | 2523 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { |
| 2547 break; | 2524 break; |
| 2548 } | 2525 } |
| 2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); | 2526 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); |
| 2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) | 2527 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| 2551 || u_charType(prevChar) == U_FORMAT_CHAR)) { | 2528 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| 2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); | 2529 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
| 2553 break; | 2530 break; |
| 2554 } | 2531 } |
| 2555 } | 2532 } |
| 2556 isBoundary = cIsWord ^ prevCIsWord; | 2533 isBoundary = cIsWord ^ prevCIsWord; |
| 2557 return isBoundary; | 2534 return isBoundary; |
| 2558 } | 2535 } |
| 2559 | 2536 |
| 2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { | 2537 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { |
| 2561 UBool isBoundary = FALSE; | 2538 UBool isBoundary = FALSE; |
| 2562 UBool cIsWord = FALSE; | 2539 UBool cIsWord = FALSE; |
| 2563 | 2540 |
| 2564 const UChar *inputBuf = fInputText->chunkContents; | 2541 const UChar *inputBuf = fInputText->chunkContents; |
| 2565 | 2542 |
| 2566 if (pos >= fLookLimit) { | 2543 if (pos >= fLookLimit) { |
| 2567 fHitEnd = TRUE; | 2544 fHitEnd = TRUE; |
| 2568 } else { | 2545 } else { |
| 2569 // Determine whether char c at current position is a member of the word
set of chars. | 2546 // Determine whether char c at current position is a member of the word
set of chars. |
| 2570 // If we're off the end of the string, behave as though we're not at a w
ord char. | 2547 // If we're off the end of the string, behave as though we're not at a w
ord char. |
| 2571 UChar32 c; | 2548 UChar32 c; |
| 2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); | 2549 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); |
| 2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { | 2550 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
| 2574 // Current char is a combining one. Not a boundary. | 2551 // Current char is a combining one. Not a boundary. |
| 2575 return FALSE; | 2552 return FALSE; |
| 2576 } | 2553 } |
| 2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); | 2554 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| 2578 } | 2555 } |
| 2579 | 2556 |
| 2580 // Back up until we come to a non-combining char, determine whether | 2557 // Back up until we come to a non-combining char, determine whether |
| 2581 // that char is a word char. | 2558 // that char is a word char. |
| 2582 UBool prevCIsWord = FALSE; | 2559 UBool prevCIsWord = FALSE; |
| 2583 for (;;) { | 2560 for (;;) { |
| 2584 if (pos <= fLookStart) { | 2561 if (pos <= fLookStart) { |
| 2585 break; | 2562 break; |
| 2586 } | 2563 } |
| 2587 UChar32 prevChar; | 2564 UChar32 prevChar; |
| 2588 U16_PREV(inputBuf, fLookStart, pos, prevChar); | 2565 U16_PREV(inputBuf, fLookStart, pos, prevChar); |
| 2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) | 2566 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| 2590 || u_charType(prevChar) == U_FORMAT_CHAR)) { | 2567 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| 2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); | 2568 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
| 2592 break; | 2569 break; |
| 2593 } | 2570 } |
| 2594 } | 2571 } |
| 2595 isBoundary = cIsWord ^ prevCIsWord; | 2572 isBoundary = cIsWord ^ prevCIsWord; |
| 2596 return isBoundary; | 2573 return isBoundary; |
| 2597 } | 2574 } |
| 2598 | 2575 |
| 2599 //------------------------------------------------------------------------------
-- | 2576 //------------------------------------------------------------------------------
-- |
| 2600 // | 2577 // |
| 2601 // isUWordBoundary | 2578 // isUWordBoundary |
| 2602 // | 2579 // |
| 2603 // Test for a word boundary using RBBI word break. | 2580 // Test for a word boundary using RBBI word break. |
| 2604 // | 2581 // |
| 2605 // parameters: pos - the current position in the input buffer | 2582 // parameters: pos - the current position in the input buffer |
| 2606 // | 2583 // |
| 2607 //------------------------------------------------------------------------------
-- | 2584 //------------------------------------------------------------------------------
-- |
| 2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) { | 2585 UBool RegexMatcher::isUWordBoundary(int64_t pos) { |
| 2609 UBool returnVal = FALSE; | 2586 UBool returnVal = FALSE; |
| 2610 #if UCONFIG_NO_BREAK_ITERATION==0 | 2587 #if UCONFIG_NO_BREAK_ITERATION==0 |
| 2611 | 2588 |
| 2612 // If we haven't yet created a break iterator for this matcher, do it now. | 2589 // If we haven't yet created a break iterator for this matcher, do it now. |
| 2613 if (fWordBreakItr == NULL) { | 2590 if (fWordBreakItr == NULL) { |
| 2614 fWordBreakItr = | 2591 fWordBreakItr = |
| 2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::
getEnglish(), fDeferredStatus); | 2592 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::
getEnglish(), fDeferredStatus); |
| 2616 if (U_FAILURE(fDeferredStatus)) { | 2593 if (U_FAILURE(fDeferredStatus)) { |
| 2617 return FALSE; | 2594 return FALSE; |
| 2618 } | 2595 } |
| 2619 fWordBreakItr->setText(fInputText, fDeferredStatus); | 2596 fWordBreakItr->setText(fInputText, fDeferredStatus); |
| 2620 } | 2597 } |
| 2621 | 2598 |
| 2622 if (pos >= fLookLimit) { | 2599 if (pos >= fLookLimit) { |
| 2623 fHitEnd = TRUE; | 2600 fHitEnd = TRUE; |
| 2624 returnVal = TRUE; // With Unicode word rules, only positions within th
e interior of "real" | 2601 returnVal = TRUE; // With Unicode word rules, only positions within th
e interior of "real" |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2656 return; | 2633 return; |
| 2657 } | 2634 } |
| 2658 } | 2635 } |
| 2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) { | 2636 if (fTimeLimit > 0 && fTime >= fTimeLimit) { |
| 2660 status = U_REGEX_TIME_OUT; | 2637 status = U_REGEX_TIME_OUT; |
| 2661 } | 2638 } |
| 2662 } | 2639 } |
| 2663 | 2640 |
| 2664 //------------------------------------------------------------------------------
-- | 2641 //------------------------------------------------------------------------------
-- |
| 2665 // | 2642 // |
| 2666 // ReportFindProgress This function is called once for each advance in the
target | |
| 2667 // string from the find() function, and calls the user
progress callback | |
| 2668 // function if there is one installed. | |
| 2669 // | |
| 2670 // NOTE: | |
| 2671 // | |
| 2672 // If the match operation needs to be aborted because t
he user | |
| 2673 // callback asked for it, just set an error status. | |
| 2674 // The engine will pick that up and stop in its outer l
oop. | |
| 2675 // | |
| 2676 //------------------------------------------------------------------------------
-- | |
| 2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { | |
| 2678 if (fFindProgressCallbackFn != NULL) { | |
| 2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex)
== FALSE) { | |
| 2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; | |
| 2681 return FALSE; | |
| 2682 } | |
| 2683 } | |
| 2684 return TRUE; | |
| 2685 } | |
| 2686 | |
| 2687 //------------------------------------------------------------------------------
-- | |
| 2688 // | |
| 2689 // StateSave | 2643 // StateSave |
| 2690 // Make a new stack frame, initialized as a copy of the current stack fram
e. | 2644 // Make a new stack frame, initialized as a copy of the current stack fram
e. |
| 2691 // Set the pattern index in the original stack frame from the operand valu
e | 2645 // Set the pattern index in the original stack frame from the operand valu
e |
| 2692 // in the opcode. Execution of the engine continues with the state in | 2646 // in the opcode. Execution of the engine continues with the state in |
| 2693 // the newly created stack frame | 2647 // the newly created stack frame |
| 2694 // | 2648 // |
| 2695 // Note that reserveBlock() may grow the stack, resulting in the | 2649 // Note that reserveBlock() may grow the stack, resulting in the |
| 2696 // whole thing being relocated in memory. | 2650 // whole thing being relocated in memory. |
| 2697 // | 2651 // |
| 2698 // Parameters: | 2652 // Parameters: |
| 2699 // fp The top frame pointer when called. At return, a new | 2653 // fp The top frame pointer when called. At return, a new |
| 2700 // fame will be present | 2654 // fame will be present |
| 2701 // savePatIdx An index into the compiled pattern. Goes into the origina
l | 2655 // savePatIdx An index into the compiled pattern. Goes into the origina
l |
| 2702 // (not new) frame. If execution ever back-tracks out of the | 2656 // (not new) frame. If execution ever back-tracks out of the |
| 2703 // new frame, this will be where we continue from in the patt
ern. | 2657 // new frame, this will be where we continue from in the patt
ern. |
| 2704 // Return | 2658 // Return |
| 2705 // The new frame pointer. | 2659 // The new frame pointer. |
| 2706 // | 2660 // |
| 2707 //------------------------------------------------------------------------------
-- | 2661 //------------------------------------------------------------------------------
-- |
| 2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
x, UErrorCode &status) { | 2662 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
x, UErrorCode &status) { |
| 2709 // push storage for a new frame. | 2663 // push storage for a new frame. |
| 2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); | 2664 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
| 2711 if (newFP == NULL) { | 2665 if (newFP == NULL) { |
| 2712 // Failure on attempted stack expansion. | 2666 // Failure on attempted stack expansion. |
| 2713 // Stack function set some other error code, change it to a more | 2667 // Stack function set some other error code, change it to a more |
| 2714 // specific one for regular expressions. | 2668 // specific one for regular expressions. |
| 2715 status = U_REGEX_STACK_OVERFLOW; | 2669 status = U_REGEX_STACK_OVERFLOW; |
| 2716 // We need to return a writable stack frame, so just return the | 2670 // We need to return a writable stack frame, so just return the |
| 2717 // previous frame. The match operation will stop quickly | 2671 // previous frame. The match operation will stop quickly |
| 2718 // because of the error status, after which the frame will never | 2672 // because of the error status, after which the frame will never |
| 2719 // be looked at again. | 2673 // be looked at again. |
| 2720 return fp; | 2674 return fp; |
| 2721 } | 2675 } |
| 2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. | 2676 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. |
| 2723 | 2677 |
| 2724 // New stack frame = copy of old top frame. | 2678 // New stack frame = copy of old top frame. |
| 2725 int64_t *source = (int64_t *)fp; | 2679 int64_t *source = (int64_t *)fp; |
| 2726 int64_t *dest = newFP; | 2680 int64_t *dest = newFP; |
| 2727 for (;;) { | 2681 for (;;) { |
| 2728 *dest++ = *source++; | 2682 *dest++ = *source++; |
| 2729 if (source == newFP) { | 2683 if (source == newFP) { |
| 2730 break; | 2684 break; |
| 2731 } | 2685 } |
| 2732 } | 2686 } |
| 2733 | 2687 |
| 2734 fTickCounter--; | 2688 fTickCounter--; |
| 2735 if (fTickCounter <= 0) { | 2689 if (fTickCounter <= 0) { |
| 2736 IncrementTime(status); // Re-initializes fTickCounter | 2690 IncrementTime(status); // Re-initializes fTickCounter |
| 2737 } | 2691 } |
| 2738 fp->fPatIdx = savePatIdx; | 2692 fp->fPatIdx = savePatIdx; |
| 2739 return (REStackFrame *)newFP; | 2693 return (REStackFrame *)newFP; |
| 2740 } | 2694 } |
| 2741 | 2695 |
| 2742 | 2696 |
| 2743 //------------------------------------------------------------------------------
-- | 2697 //------------------------------------------------------------------------------
-- |
| 2744 // | 2698 // |
| 2745 // MatchAt This is the actual matching engine. | 2699 // MatchAt This is the actual matching engine. |
| 2746 // | 2700 // |
| 2747 // startIdx: begin matching a this index. | 2701 // startIdx: begin matching a this index. |
| 2748 // toEnd: if true, match must extend to end of the input
region | 2702 // toEnd: if true, match must extend to end of the input
region |
| 2749 // | 2703 // |
| 2750 //------------------------------------------------------------------------------
-- | 2704 //------------------------------------------------------------------------------
-- |
| 2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { | 2705 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
| 2752 UBool isMatch = FALSE; // True if the we have a match. | 2706 UBool isMatch = FALSE; // True if the we have a match. |
| 2753 | 2707 |
| 2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara
cter matches for searching backwards | 2708 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara
cter matches for searching backwards |
| 2755 | 2709 |
| 2756 int32_t op; // Operation from the compiled pattern, s
plit into | 2710 int32_t op; // Operation from the compiled pattern, s
plit into |
| 2757 int32_t opType; // the opcode | 2711 int32_t opType; // the opcode |
| 2758 int32_t opValue; // and the operand value. | 2712 int32_t opValue; // and the operand value. |
| 2759 | 2713 |
| 2760 #ifdef REGEX_RUN_DEBUG | 2714 #ifdef REGEX_RUN_DEBUG |
| 2761 if (fTraceDebug) | 2715 if (fTraceDebug) |
| 2762 { | 2716 { |
| 2763 printf("MatchAt(startIdx=%ld)\n", startIdx); | 2717 printf("MatchAt(startIdx=%ld)\n", startIdx); |
| 2764 printf("Original Pattern: "); | 2718 printf("Original Pattern: "); |
| 2765 UChar32 c = utext_next32From(fPattern->fPattern, 0); | 2719 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| 2766 while (c != U_SENTINEL) { | 2720 while (c != U_SENTINEL) { |
| 2767 if (c<32 || c>256) { | 2721 if (c<32 || c>256) { |
| 2768 c = '.'; | 2722 c = '.'; |
| 2769 } | 2723 } |
| 2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 2724 printf("%c", c); |
| 2771 | 2725 |
| 2772 c = UTEXT_NEXT32(fPattern->fPattern); | 2726 c = UTEXT_NEXT32(fPattern->fPattern); |
| 2773 } | 2727 } |
| 2774 printf("\n"); | 2728 printf("\n"); |
| 2775 printf("Input String: "); | 2729 printf("Input String: "); |
| 2776 c = utext_next32From(fInputText, 0); | 2730 c = utext_next32From(fInputText, 0); |
| 2777 while (c != U_SENTINEL) { | 2731 while (c != U_SENTINEL) { |
| 2778 if (c<32 || c>256) { | 2732 if (c<32 || c>256) { |
| 2779 c = '.'; | 2733 c = '.'; |
| 2780 } | 2734 } |
| 2781 printf("%c", c); | 2735 printf("%c", c); |
| 2782 | 2736 |
| 2783 c = UTEXT_NEXT32(fInputText); | 2737 c = UTEXT_NEXT32(fInputText); |
| 2784 } | 2738 } |
| 2785 printf("\n"); | 2739 printf("\n"); |
| 2786 printf("\n"); | 2740 printf("\n"); |
| 2787 } | 2741 } |
| 2788 #endif | 2742 #endif |
| 2789 | 2743 |
| 2790 if (U_FAILURE(status)) { | 2744 if (U_FAILURE(status)) { |
| 2791 return; | 2745 return; |
| 2792 } | 2746 } |
| 2793 | 2747 |
| 2794 // Cache frequently referenced items from the compiled pattern | 2748 // Cache frequently referenced items from the compiled pattern |
| 2795 // | 2749 // |
| 2796 int64_t *pat = fPattern->fCompiledPat->getBuffer(); | 2750 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| 2797 | 2751 |
| 2798 const UChar *litText = fPattern->fLiteralText.getBuffer(); | 2752 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| 2799 UVector *sets = fPattern->fSets; | 2753 UVector *sets = fPattern->fSets; |
| 2800 | 2754 |
| 2801 fFrameSize = fPattern->fFrameSize; | 2755 fFrameSize = fPattern->fFrameSize; |
| 2802 REStackFrame *fp = resetStack(); | 2756 REStackFrame *fp = resetStack(); |
| 2803 | 2757 |
| 2804 fp->fPatIdx = 0; | 2758 fp->fPatIdx = 0; |
| 2805 fp->fInputIdx = startIdx; | 2759 fp->fInputIdx = startIdx; |
| 2806 | 2760 |
| 2807 // Zero out the pattern's static data | 2761 // Zero out the pattern's static data |
| 2808 int32_t i; | 2762 int32_t i; |
| 2809 for (i = 0; i<fPattern->fDataSize; i++) { | 2763 for (i = 0; i<fPattern->fDataSize; i++) { |
| 2810 fData[i] = 0; | 2764 fData[i] = 0; |
| 2811 } | 2765 } |
| 2812 | 2766 |
| 2813 // | 2767 // |
| 2814 // Main loop for interpreting the compiled pattern. | 2768 // Main loop for interpreting the compiled pattern. |
| 2815 // One iteration of the loop per pattern operation performed. | 2769 // One iteration of the loop per pattern operation performed. |
| 2816 // | 2770 // |
| 2817 for (;;) { | 2771 for (;;) { |
| 2818 #if 0 | |
| 2819 if (_heapchk() != _HEAPOK) { | |
| 2820 fprintf(stderr, "Heap Trouble\n"); | |
| 2821 } | |
| 2822 #endif | |
| 2823 | |
| 2824 op = (int32_t)pat[fp->fPatIdx]; | 2772 op = (int32_t)pat[fp->fPatIdx]; |
| 2825 opType = URX_TYPE(op); | 2773 opType = URX_TYPE(op); |
| 2826 opValue = URX_VAL(op); | 2774 opValue = URX_VAL(op); |
| 2827 #ifdef REGEX_RUN_DEBUG | 2775 #ifdef REGEX_RUN_DEBUG |
| 2828 if (fTraceDebug) { | 2776 if (fTraceDebug) { |
| 2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2777 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, | 2778 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, |
| 2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(),
fActiveLimit); | 2779 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(),
fActiveLimit); |
| 2832 fPattern->dumpOp(fp->fPatIdx); | 2780 fPattern->dumpOp(fp->fPatIdx); |
| 2833 } | 2781 } |
| 2834 #endif | 2782 #endif |
| 2835 fp->fPatIdx++; | 2783 fp->fPatIdx++; |
| 2836 | 2784 |
| 2837 switch (opType) { | 2785 switch (opType) { |
| 2838 | 2786 |
| 2839 | 2787 |
| 2840 case URX_NOP: | 2788 case URX_NOP: |
| 2841 break; | 2789 break; |
| 2842 | 2790 |
| 2843 | 2791 |
| 2844 case URX_BACKTRACK: | 2792 case URX_BACKTRACK: |
| 2845 // Force a backtrack. In some circumstances, the pattern compiler | 2793 // Force a backtrack. In some circumstances, the pattern compiler |
| 2846 // will notice that the pattern can't possibly match anything, and
will | 2794 // will notice that the pattern can't possibly match anything, and
will |
| (...skipping 23 matching lines...) Expand all Loading... |
| 2870 // Strings require two slots in the compiled pattern, one for th
e | 2818 // Strings require two slots in the compiled pattern, one for th
e |
| 2871 // offset to the string text, and one for the length. | 2819 // offset to the string text, and one for the length. |
| 2872 | 2820 |
| 2873 int32_t stringStartIdx = opValue; | 2821 int32_t stringStartIdx = opValue; |
| 2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand | 2822 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
| 2875 fp->fPatIdx++; | 2823 fp->fPatIdx++; |
| 2876 opType = URX_TYPE(op); | 2824 opType = URX_TYPE(op); |
| 2877 int32_t stringLen = URX_VAL(op); | 2825 int32_t stringLen = URX_VAL(op); |
| 2878 U_ASSERT(opType == URX_STRING_LEN); | 2826 U_ASSERT(opType == URX_STRING_LEN); |
| 2879 U_ASSERT(stringLen >= 2); | 2827 U_ASSERT(stringLen >= 2); |
| 2880 | 2828 |
| 2881 const UChar *patternString = litText+stringStartIdx; | 2829 const UChar *patternString = litText+stringStartIdx; |
| 2882 int32_t patternStringIndex = 0; | 2830 int32_t patternStringIndex = 0; |
| 2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2831 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2884 UChar32 inputChar; | 2832 UChar32 inputChar; |
| 2885 UChar32 patternChar; | 2833 UChar32 patternChar; |
| 2886 UBool success = TRUE; | 2834 UBool success = TRUE; |
| 2887 while (patternStringIndex < stringLen) { | 2835 while (patternStringIndex < stringLen) { |
| 2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { | 2836 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { |
| 2889 success = FALSE; | 2837 success = FALSE; |
| 2890 fHitEnd = TRUE; | 2838 fHitEnd = TRUE; |
| 2891 break; | 2839 break; |
| 2892 } | 2840 } |
| 2893 inputChar = UTEXT_NEXT32(fInputText); | 2841 inputChar = UTEXT_NEXT32(fInputText); |
| 2894 U16_NEXT(patternString, patternStringIndex, stringLen, patte
rnChar); | 2842 U16_NEXT(patternString, patternStringIndex, stringLen, patte
rnChar); |
| 2895 if (patternChar != inputChar) { | 2843 if (patternChar != inputChar) { |
| 2896 success = FALSE; | 2844 success = FALSE; |
| 2897 break; | 2845 break; |
| 2898 } | 2846 } |
| 2899 } | 2847 } |
| 2900 | 2848 |
| 2901 if (success) { | 2849 if (success) { |
| 2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 2850 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 2903 } else { | 2851 } else { |
| 2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 2852 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 2905 } | 2853 } |
| 2906 } | 2854 } |
| 2907 break; | 2855 break; |
| 2908 | 2856 |
| 2909 | 2857 |
| 2910 case URX_STATE_SAVE: | 2858 case URX_STATE_SAVE: |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2945 | 2893 |
| 2946 case URX_DOLLAR: // $, test for End of line | 2894 case URX_DOLLAR: // $, test for End of line |
| 2947 // or for position before new lin
e at end of input | 2895 // or for position before new lin
e at end of input |
| 2948 { | 2896 { |
| 2949 if (fp->fInputIdx >= fAnchorLimit) { | 2897 if (fp->fInputIdx >= fAnchorLimit) { |
| 2950 // We really are at the end of input. Success. | 2898 // We really are at the end of input. Success. |
| 2951 fHitEnd = TRUE; | 2899 fHitEnd = TRUE; |
| 2952 fRequireEnd = TRUE; | 2900 fRequireEnd = TRUE; |
| 2953 break; | 2901 break; |
| 2954 } | 2902 } |
| 2955 | 2903 |
| 2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2904 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 2957 | 2905 |
| 2958 // If we are positioned just before a new-line that is located a
t the | 2906 // If we are positioned just before a new-line that is located a
t the |
| 2959 // end of input, succeed. | 2907 // end of input, succeed. |
| 2960 UChar32 c = UTEXT_NEXT32(fInputText); | 2908 UChar32 c = UTEXT_NEXT32(fInputText); |
| 2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { | 2909 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| 2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202
9) { | 2910 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202
9) { |
| 2963 // If not in the middle of a CR/LF sequence | 2911 // If not in the middle of a CR/LF sequence |
| 2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE
XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { | 2912 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE
XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
| 2965 // At new-line at end of input. Success | 2913 // At new-line at end of input. Success |
| 2966 fHitEnd = TRUE; | 2914 fHitEnd = TRUE; |
| 2967 fRequireEnd = TRUE; | 2915 fRequireEnd = TRUE; |
| 2968 | 2916 |
| 2969 break; | 2917 break; |
| 2970 } | 2918 } |
| 2971 } | 2919 } |
| 2972 } else { | 2920 } else { |
| 2973 UChar32 nextC = UTEXT_NEXT32(fInputText); | 2921 UChar32 nextC = UTEXT_NEXT32(fInputText); |
| 2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu
tText) >= fAnchorLimit) { | 2922 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu
tText) >= fAnchorLimit) { |
| 2975 fHitEnd = TRUE; | 2923 fHitEnd = TRUE; |
| 2976 fRequireEnd = TRUE; | 2924 fRequireEnd = TRUE; |
| 2977 break; // At CR/LF at end of inp
ut. Success | 2925 break; // At CR/LF at end of inp
ut. Success |
| 2978 } | 2926 } |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3058 | 3006 |
| 3059 case URX_CARET_M: // ^, test for start of line in muli
t-line mode | 3007 case URX_CARET_M: // ^, test for start of line in muli
t-line mode |
| 3060 { | 3008 { |
| 3061 if (fp->fInputIdx == fAnchorStart) { | 3009 if (fp->fInputIdx == fAnchorStart) { |
| 3062 // We are at the start input. Success. | 3010 // We are at the start input. Success. |
| 3063 break; | 3011 break; |
| 3064 } | 3012 } |
| 3065 // Check whether character just before the current pos is a new-l
ine | 3013 // Check whether character just before the current pos is a new-l
ine |
| 3066 // unless we are at the end of input | 3014 // unless we are at the end of input |
| 3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3015 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3068 UChar32 c = UTEXT_PREVIOUS32(fInputText); | 3016 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| 3069 if ((fp->fInputIdx < fAnchorLimit) && | 3017 if ((fp->fInputIdx < fAnchorLimit) && |
| 3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { | 3018 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
| 3071 // It's a new-line. ^ is true. Success. | 3019 // It's a new-line. ^ is true. Success. |
| 3072 // TODO: what should be done with positions between a CR an
d LF? | 3020 // TODO: what should be done with positions between a CR an
d LF? |
| 3073 break; | 3021 break; |
| 3074 } | 3022 } |
| 3075 // Not at the start of a line. Fail. | 3023 // Not at the start of a line. Fail. |
| 3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3024 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3077 } | 3025 } |
| 3078 break; | 3026 break; |
| 3079 | 3027 |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3141 break; | 3089 break; |
| 3142 | 3090 |
| 3143 | 3091 |
| 3144 case URX_BACKSLASH_G: // Test for position at end of previous m
atch | 3092 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
| 3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { | 3093 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
| 3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3147 } | 3095 } |
| 3148 break; | 3096 break; |
| 3149 | 3097 |
| 3150 | 3098 |
| 3151 case URX_BACKSLASH_X: | 3099 case URX_BACKSLASH_X: |
| 3152 // Match a Grapheme, as defined by Unicode TR 29. | 3100 // Match a Grapheme, as defined by Unicode TR 29. |
| 3153 // Differs slightly from Perl, which consumes combining marks indep
endently | 3101 // Differs slightly from Perl, which consumes combining marks indep
endently |
| 3154 // of context. | 3102 // of context. |
| 3155 { | 3103 { |
| 3156 | 3104 |
| 3157 // Fail if at end of input | 3105 // Fail if at end of input |
| 3158 if (fp->fInputIdx >= fActiveLimit) { | 3106 if (fp->fInputIdx >= fActiveLimit) { |
| 3159 fHitEnd = TRUE; | 3107 fHitEnd = TRUE; |
| 3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3161 break; | 3109 break; |
| 3162 } | 3110 } |
| 3163 | 3111 |
| 3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3112 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3165 | 3113 |
| 3166 // Examine (and consume) the current char. | 3114 // Examine (and consume) the current char. |
| 3167 // Dispatch into a little state machine, based on the char. | 3115 // Dispatch into a little state machine, based on the char. |
| 3168 UChar32 c; | 3116 UChar32 c; |
| 3169 c = UTEXT_NEXT32(fInputText); | 3117 c = UTEXT_NEXT32(fInputText); |
| 3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3118 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3171 UnicodeSet **sets = fPattern->fStaticSets; | 3119 UnicodeSet **sets = fPattern->fStaticSets; |
| 3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; | 3120 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
| 3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; | 3121 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3220 c = UTEXT_CURRENT32(fInputText); | 3168 c = UTEXT_CURRENT32(fInputText); |
| 3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { | 3169 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| 3222 break; | 3170 break; |
| 3223 } | 3171 } |
| 3224 (void)UTEXT_NEXT32(fInputText); | 3172 (void)UTEXT_NEXT32(fInputText); |
| 3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3173 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3226 } | 3174 } |
| 3227 goto GC_Done; | 3175 goto GC_Done; |
| 3228 | 3176 |
| 3229 GC_Control: | 3177 GC_Control: |
| 3230 // Most control chars stand alone (don't combine with combining
chars), | 3178 // Most control chars stand alone (don't combine with combining
chars), |
| 3231 // except for that CR/LF sequence is a single grapheme cluster
. | 3179 // except for that CR/LF sequence is a single grapheme cluster
. |
| 3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32
(fInputText) == 0x0a) { | 3180 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32
(fInputText) == 0x0a) { |
| 3233 c = UTEXT_NEXT32(fInputText); | 3181 c = UTEXT_NEXT32(fInputText); |
| 3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3235 } | 3183 } |
| 3236 | 3184 |
| 3237 GC_Done: | 3185 GC_Done: |
| 3238 if (fp->fInputIdx >= fActiveLimit) { | 3186 if (fp->fInputIdx >= fActiveLimit) { |
| 3239 fHitEnd = TRUE; | 3187 fHitEnd = TRUE; |
| 3240 } | 3188 } |
| 3241 break; | 3189 break; |
| 3242 } | 3190 } |
| 3243 | |
| 3244 | 3191 |
| 3245 | 3192 |
| 3246 | 3193 |
| 3194 |
| 3247 case URX_BACKSLASH_Z: // Test for end of Input | 3195 case URX_BACKSLASH_Z: // Test for end of Input |
| 3248 if (fp->fInputIdx < fAnchorLimit) { | 3196 if (fp->fInputIdx < fAnchorLimit) { |
| 3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3250 } else { | 3198 } else { |
| 3251 fHitEnd = TRUE; | 3199 fHitEnd = TRUE; |
| 3252 fRequireEnd = TRUE; | 3200 fRequireEnd = TRUE; |
| 3253 } | 3201 } |
| 3254 break; | 3202 break; |
| 3255 | 3203 |
| 3256 | 3204 |
| 3257 | 3205 |
| 3258 case URX_STATIC_SETREF: | 3206 case URX_STATIC_SETREF: |
| 3259 { | 3207 { |
| 3260 // Test input character against one of the predefined sets | 3208 // Test input character against one of the predefined sets |
| 3261 // (Word Characters, for example) | 3209 // (Word Characters, for example) |
| 3262 // The high bit of the op value is a flag for the match polarity
. | 3210 // The high bit of the op value is a flag for the match polarity
. |
| 3263 // 0: success if input char is in set. | 3211 // 0: success if input char is in set. |
| 3264 // 1: success if input char is not in set. | 3212 // 1: success if input char is not in set. |
| 3265 if (fp->fInputIdx >= fActiveLimit) { | 3213 if (fp->fInputIdx >= fActiveLimit) { |
| 3266 fHitEnd = TRUE; | 3214 fHitEnd = TRUE; |
| 3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3215 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3268 break; | 3216 break; |
| 3269 } | 3217 } |
| 3270 | 3218 |
| 3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); | 3219 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| 3272 opValue &= ~URX_NEG_SET; | 3220 opValue &= ~URX_NEG_SET; |
| 3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 3221 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 3274 | 3222 |
| 3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3223 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3276 UChar32 c = UTEXT_NEXT32(fInputText); | 3224 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3277 if (c < 256) { | 3225 if (c < 256) { |
| 3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 3226 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 3279 if (s8->contains(c)) { | 3227 if (s8->contains(c)) { |
| 3280 success = !success; | 3228 success = !success; |
| 3281 } | 3229 } |
| 3282 } else { | 3230 } else { |
| 3283 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 3231 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 3284 if (s->contains(c)) { | 3232 if (s->contains(c)) { |
| 3285 success = !success; | 3233 success = !success; |
| 3286 } | 3234 } |
| 3287 } | 3235 } |
| 3288 if (success) { | 3236 if (success) { |
| 3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3237 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3290 } else { | 3238 } else { |
| 3291 // the character wasn't in the set. | 3239 // the character wasn't in the set. |
| 3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3240 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3293 } | 3241 } |
| 3294 } | 3242 } |
| 3295 break; | 3243 break; |
| 3296 | 3244 |
| 3297 | 3245 |
| 3298 case URX_STAT_SETREF_N: | 3246 case URX_STAT_SETREF_N: |
| 3299 { | 3247 { |
| 3300 // Test input character for NOT being a member of one of | 3248 // Test input character for NOT being a member of one of |
| 3301 // the predefined sets (Word Characters, for example) | 3249 // the predefined sets (Word Characters, for example) |
| 3302 if (fp->fInputIdx >= fActiveLimit) { | 3250 if (fp->fInputIdx >= fActiveLimit) { |
| 3303 fHitEnd = TRUE; | 3251 fHitEnd = TRUE; |
| 3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3252 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3305 break; | 3253 break; |
| 3306 } | 3254 } |
| 3307 | 3255 |
| 3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 3256 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 3309 | 3257 |
| 3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3258 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3311 | 3259 |
| 3312 UChar32 c = UTEXT_NEXT32(fInputText); | 3260 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3313 if (c < 256) { | 3261 if (c < 256) { |
| 3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 3262 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 3315 if (s8->contains(c) == FALSE) { | 3263 if (s8->contains(c) == FALSE) { |
| 3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3264 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3317 break; | 3265 break; |
| 3318 } | 3266 } |
| 3319 } else { | 3267 } else { |
| 3320 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 3268 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 3321 if (s->contains(c) == FALSE) { | 3269 if (s->contains(c) == FALSE) { |
| 3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3323 break; | 3271 break; |
| 3324 } | 3272 } |
| 3325 } | 3273 } |
| 3326 // the character wasn't in the set. | 3274 // the character wasn't in the set. |
| 3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3275 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3328 } | 3276 } |
| 3329 break; | 3277 break; |
| 3330 | 3278 |
| 3331 | 3279 |
| 3332 case URX_SETREF: | 3280 case URX_SETREF: |
| 3333 if (fp->fInputIdx >= fActiveLimit) { | 3281 if (fp->fInputIdx >= fActiveLimit) { |
| 3334 fHitEnd = TRUE; | 3282 fHitEnd = TRUE; |
| 3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3283 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3336 break; | 3284 break; |
| 3337 } else { | 3285 } else { |
| 3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3286 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3339 | 3287 |
| 3340 // There is input left. Pick up one char and test it for set me
mbership. | 3288 // There is input left. Pick up one char and test it for set me
mbership. |
| 3341 UChar32 c = UTEXT_NEXT32(fInputText); | 3289 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3342 U_ASSERT(opValue > 0 && opValue < sets->size()); | 3290 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 3343 if (c<256) { | 3291 if (c<256) { |
| 3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 3292 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 3345 if (s8->contains(c)) { | 3293 if (s8->contains(c)) { |
| 3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3294 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3347 break; | 3295 break; |
| 3348 } | 3296 } |
| 3349 } else { | 3297 } else { |
| 3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 3298 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 3351 if (s->contains(c)) { | 3299 if (s->contains(c)) { |
| 3352 // The character is in the set. A Match. | 3300 // The character is in the set. A Match. |
| 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3301 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3354 break; | 3302 break; |
| 3355 } | 3303 } |
| 3356 } | 3304 } |
| 3357 | 3305 |
| 3358 // the character wasn't in the set. | 3306 // the character wasn't in the set. |
| 3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3307 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3360 } | 3308 } |
| 3361 break; | 3309 break; |
| 3362 | 3310 |
| 3363 | 3311 |
| 3364 case URX_DOTANY: | 3312 case URX_DOTANY: |
| 3365 { | 3313 { |
| 3366 // . matches anything, but stops at end-of-line. | 3314 // . matches anything, but stops at end-of-line. |
| 3367 if (fp->fInputIdx >= fActiveLimit) { | 3315 if (fp->fInputIdx >= fActiveLimit) { |
| 3368 // At end of input. Match failed. Backtrack out. | 3316 // At end of input. Match failed. Backtrack out. |
| 3369 fHitEnd = TRUE; | 3317 fHitEnd = TRUE; |
| 3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3318 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3371 break; | 3319 break; |
| 3372 } | 3320 } |
| 3373 | 3321 |
| 3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3322 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3375 | 3323 |
| 3376 // There is input left. Advance over one char, unless we've hit
end-of-line | 3324 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 3377 UChar32 c = UTEXT_NEXT32(fInputText); | 3325 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible | 3326 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 3327 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 3380 // End of line in normal mode. . does not match. | 3328 // End of line in normal mode. . does not match. |
| 3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3382 break; | 3330 break; |
| 3383 } | 3331 } |
| 3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3385 } | 3333 } |
| 3386 break; | 3334 break; |
| 3387 | 3335 |
| 3388 | 3336 |
| 3389 case URX_DOTANY_ALL: | 3337 case URX_DOTANY_ALL: |
| 3390 { | 3338 { |
| 3391 // ., in dot-matches-all (including new lines) mode | 3339 // ., in dot-matches-all (including new lines) mode |
| 3392 if (fp->fInputIdx >= fActiveLimit) { | 3340 if (fp->fInputIdx >= fActiveLimit) { |
| 3393 // At end of input. Match failed. Backtrack out. | 3341 // At end of input. Match failed. Backtrack out. |
| 3394 fHitEnd = TRUE; | 3342 fHitEnd = TRUE; |
| 3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3343 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3396 break; | 3344 break; |
| 3397 } | 3345 } |
| 3398 | 3346 |
| 3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3400 | 3348 |
| 3401 // There is input left. Advance over one char, except if we are | 3349 // There is input left. Advance over one char, except if we are |
| 3402 // at a cr/lf, advance over both of them. | 3350 // at a cr/lf, advance over both of them. |
| 3403 UChar32 c; | 3351 UChar32 c; |
| 3404 c = UTEXT_NEXT32(fInputText); | 3352 c = UTEXT_NEXT32(fInputText); |
| 3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) { | 3354 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| 3407 // In the case of a CR/LF, we need to advance over both. | 3355 // In the case of a CR/LF, we need to advance over both. |
| 3408 UChar32 nextc = UTEXT_CURRENT32(fInputText); | 3356 UChar32 nextc = UTEXT_CURRENT32(fInputText); |
| 3409 if (nextc == 0x0a) { | 3357 if (nextc == 0x0a) { |
| 3410 (void)UTEXT_NEXT32(fInputText); | 3358 (void)UTEXT_NEXT32(fInputText); |
| 3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3412 } | 3360 } |
| 3413 } | 3361 } |
| 3414 } | 3362 } |
| 3415 break; | 3363 break; |
| 3416 | 3364 |
| 3417 | 3365 |
| 3418 case URX_DOTANY_UNIX: | 3366 case URX_DOTANY_UNIX: |
| 3419 { | 3367 { |
| 3420 // '.' operator, matches all, but stops at end-of-line. | 3368 // '.' operator, matches all, but stops at end-of-line. |
| 3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. | 3369 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
| 3422 if (fp->fInputIdx >= fActiveLimit) { | 3370 if (fp->fInputIdx >= fActiveLimit) { |
| 3423 // At end of input. Match failed. Backtrack out. | 3371 // At end of input. Match failed. Backtrack out. |
| 3424 fHitEnd = TRUE; | 3372 fHitEnd = TRUE; |
| 3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3373 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3426 break; | 3374 break; |
| 3427 } | 3375 } |
| 3428 | 3376 |
| 3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3377 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3430 | 3378 |
| 3431 // There is input left. Advance over one char, unless we've hit
end-of-line | 3379 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 3432 UChar32 c = UTEXT_NEXT32(fInputText); | 3380 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3433 if (c == 0x0a) { | 3381 if (c == 0x0a) { |
| 3434 // End of line in normal mode. '.' does not match the \n | 3382 // End of line in normal mode. '.' does not match the \n |
| 3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3383 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3436 } else { | 3384 } else { |
| 3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3385 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3438 } | 3386 } |
| 3439 } | 3387 } |
| 3440 break; | 3388 break; |
| (...skipping 24 matching lines...) Expand all Loading... |
| 3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); | 3413 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| 3466 int32_t frameLoc = URX_VAL(stoOp); | 3414 int32_t frameLoc = URX_VAL(stoOp); |
| 3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); | 3415 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| 3468 int64_t prevInputIdx = fp->fExtra[frameLoc]; | 3416 int64_t prevInputIdx = fp->fExtra[frameLoc]; |
| 3469 U_ASSERT(prevInputIdx <= fp->fInputIdx); | 3417 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| 3470 if (prevInputIdx < fp->fInputIdx) { | 3418 if (prevInputIdx < fp->fInputIdx) { |
| 3471 // The match did make progress. Repeat the loop. | 3419 // The match did make progress. Repeat the loop. |
| 3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current | 3420 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
| 3473 fp->fPatIdx = opValue; | 3421 fp->fPatIdx = opValue; |
| 3474 fp->fExtra[frameLoc] = fp->fInputIdx; | 3422 fp->fExtra[frameLoc] = fp->fInputIdx; |
| 3475 } | 3423 } |
| 3476 // If the input position did not advance, we do nothing here, | 3424 // If the input position did not advance, we do nothing here, |
| 3477 // execution will fall out of the loop. | 3425 // execution will fall out of the loop. |
| 3478 } | 3426 } |
| 3479 break; | 3427 break; |
| 3480 | 3428 |
| 3481 case URX_CTR_INIT: | 3429 case URX_CTR_INIT: |
| 3482 { | 3430 { |
| 3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 3431 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 3484 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 3432 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
| 3485 | 3433 |
| 3486 // Pick up the three extra operands that CTR_INIT has, and | 3434 // Pick up the three extra operands that CTR_INIT has, and |
| 3487 // skip the pattern location counter past | 3435 // skip the pattern location counter past |
| 3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3436 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 3489 fp->fPatIdx += 3; | 3437 fp->fPatIdx += 3; |
| 3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 3438 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 3439 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 3440 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 3493 U_ASSERT(minCount>=0); | 3441 U_ASSERT(minCount>=0); |
| 3494 U_ASSERT(maxCount>=minCount || maxCount==-1); | 3442 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 3495 U_ASSERT(loopLoc>=fp->fPatIdx); | 3443 U_ASSERT(loopLoc>=fp->fPatIdx); |
| 3496 | 3444 |
| 3497 if (minCount == 0) { | 3445 if (minCount == 0) { |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3535 } | 3483 } |
| 3536 break; | 3484 break; |
| 3537 | 3485 |
| 3538 case URX_CTR_INIT_NG: | 3486 case URX_CTR_INIT_NG: |
| 3539 { | 3487 { |
| 3540 // Initialize a non-greedy loop | 3488 // Initialize a non-greedy loop |
| 3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 3489 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 3542 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 3490 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
| 3543 | 3491 |
| 3544 // Pick up the three extra operands that CTR_INIT_NG has, and | 3492 // Pick up the three extra operands that CTR_INIT_NG has, and |
| 3545 // skip the pattern location counter past | 3493 // skip the pattern location counter past |
| 3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3494 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 3547 fp->fPatIdx += 3; | 3495 fp->fPatIdx += 3; |
| 3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 3496 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 3497 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 3498 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 3551 U_ASSERT(minCount>=0); | 3499 U_ASSERT(minCount>=0); |
| 3552 U_ASSERT(maxCount>=minCount || maxCount==-1); | 3500 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 3553 U_ASSERT(loopLoc>fp->fPatIdx); | 3501 U_ASSERT(loopLoc>fp->fPatIdx); |
| 3554 if (maxCount == -1) { | 3502 if (maxCount == -1) { |
| 3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. | 3503 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. |
| 3556 } | 3504 } |
| 3557 | 3505 |
| 3558 if (minCount == 0) { | 3506 if (minCount == 0) { |
| 3559 if (maxCount != 0) { | 3507 if (maxCount != 0) { |
| 3560 fp = StateSave(fp, fp->fPatIdx, status); | 3508 fp = StateSave(fp, fp->fPatIdx, status); |
| 3561 } | 3509 } |
| 3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block | 3510 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
| 3563 } | 3511 } |
| 3564 } | 3512 } |
| 3565 break; | 3513 break; |
| 3566 | 3514 |
| 3567 case URX_CTR_LOOP_NG: | 3515 case URX_CTR_LOOP_NG: |
| 3568 { | 3516 { |
| 3569 // Non-greedy {min, max} loops | 3517 // Non-greedy {min, max} loops |
| 3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 3518 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 3571 int32_t initOp = (int32_t)pat[opValue]; | 3519 int32_t initOp = (int32_t)pat[opValue]; |
| 3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); | 3520 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| 3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 3521 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3640 U_ASSERT(groupStartIdx <= groupEndIdx); | 3588 U_ASSERT(groupStartIdx <= groupEndIdx); |
| 3641 if (groupStartIdx < 0) { | 3589 if (groupStartIdx < 0) { |
| 3642 // This capture group has not participated in the match thus
far, | 3590 // This capture group has not participated in the match thus
far, |
| 3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 3591 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 3644 break; | 3592 break; |
| 3645 } | 3593 } |
| 3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); | 3594 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); |
| 3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3595 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3648 | 3596 |
| 3649 // Note: if the capture group match was of an empty string the
backref | 3597 // Note: if the capture group match was of an empty string the
backref |
| 3650 // match succeeds. Verified by testing: Perl matches s
ucceed | 3598 // match succeeds. Verified by testing: Perl matches s
ucceed |
| 3651 // in this case, so we do too. | 3599 // in this case, so we do too. |
| 3652 | 3600 |
| 3653 UBool success = TRUE; | 3601 UBool success = TRUE; |
| 3654 for (;;) { | 3602 for (;;) { |
| 3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { | 3603 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { |
| 3656 success = TRUE; | 3604 success = TRUE; |
| 3657 break; | 3605 break; |
| 3658 } | 3606 } |
| 3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { | 3607 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { |
| 3660 success = FALSE; | 3608 success = FALSE; |
| 3661 fHitEnd = TRUE; | 3609 fHitEnd = TRUE; |
| 3662 break; | 3610 break; |
| (...skipping 26 matching lines...) Expand all Loading... |
| 3689 // This capture group has not participated in the match thus
far, | 3637 // This capture group has not participated in the match thus
far, |
| 3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 3638 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 3691 break; | 3639 break; |
| 3692 } | 3640 } |
| 3693 utext_setNativeIndex(fAltInputText, groupStartIdx); | 3641 utext_setNativeIndex(fAltInputText, groupStartIdx); |
| 3694 utext_setNativeIndex(fInputText, fp->fInputIdx); | 3642 utext_setNativeIndex(fInputText, fp->fInputIdx); |
| 3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); | 3643 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); |
| 3696 CaseFoldingUTextIterator inputItr(*fInputText); | 3644 CaseFoldingUTextIterator inputItr(*fInputText); |
| 3697 | 3645 |
| 3698 // Note: if the capture group match was of an empty string the
backref | 3646 // Note: if the capture group match was of an empty string the
backref |
| 3699 // match succeeds. Verified by testing: Perl matches s
ucceed | 3647 // match succeeds. Verified by testing: Perl matches s
ucceed |
| 3700 // in this case, so we do too. | 3648 // in this case, so we do too. |
| 3701 | 3649 |
| 3702 UBool success = TRUE; | 3650 UBool success = TRUE; |
| 3703 for (;;) { | 3651 for (;;) { |
| 3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f
AltInputText) >= groupEndIdx) { | 3652 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f
AltInputText) >= groupEndIdx) { |
| 3705 success = TRUE; | 3653 success = TRUE; |
| 3706 break; | 3654 break; |
| 3707 } | 3655 } |
| 3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe
xt) >= fActiveLimit) { | 3656 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe
xt) >= fActiveLimit) { |
| 3709 success = FALSE; | 3657 success = FALSE; |
| 3710 fHitEnd = TRUE; | 3658 fHitEnd = TRUE; |
| 3711 break; | 3659 break; |
| 3712 } | 3660 } |
| 3713 UChar32 captureGroupChar = captureGroupItr.next(); | 3661 UChar32 captureGroupChar = captureGroupItr.next(); |
| 3714 UChar32 inputChar = inputItr.next(); | 3662 UChar32 inputChar = inputItr.next(); |
| 3715 if (inputChar != captureGroupChar) { | 3663 if (inputChar != captureGroupChar) { |
| 3716 success = FALSE; | 3664 success = FALSE; |
| 3717 break; | 3665 break; |
| 3718 } | 3666 } |
| 3719 } | 3667 } |
| 3720 | 3668 |
| 3721 if (success && inputItr.inExpansion()) { | 3669 if (success && inputItr.inExpansion()) { |
| 3722 // We otained a match by consuming part of a string obtained
from | 3670 // We otained a match by consuming part of a string obtained
from |
| 3723 // case-folding a single code point of the input text. | 3671 // case-folding a single code point of the input text. |
| 3724 // This does not count as an overall match. | 3672 // This does not count as an overall match. |
| 3725 success = FALSE; | 3673 success = FALSE; |
| 3726 } | 3674 } |
| 3727 | 3675 |
| 3728 if (success) { | 3676 if (success) { |
| 3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3677 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3730 } else { | 3678 } else { |
| 3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3732 } | 3680 } |
| 3733 | 3681 |
| 3734 } | 3682 } |
| 3735 break; | 3683 break; |
| 3736 | 3684 |
| 3737 case URX_STO_INP_LOC: | 3685 case URX_STO_INP_LOC: |
| 3738 { | 3686 { |
| 3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize); | 3687 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| 3740 fp->fExtra[opValue] = fp->fInputIdx; | 3688 fp->fExtra[opValue] = fp->fInputIdx; |
| 3741 } | 3689 } |
| 3742 break; | 3690 break; |
| 3743 | 3691 |
| 3744 case URX_JMPX: | 3692 case URX_JMPX: |
| 3745 { | 3693 { |
| 3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3694 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3754 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3807 | 3755 |
| 3808 UChar32 c = UTEXT_NEXT32(fInputText); | 3756 UChar32 c = UTEXT_NEXT32(fInputText); |
| 3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { | 3757 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3758 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 3811 break; | 3759 break; |
| 3812 } | 3760 } |
| 3813 } else { | 3761 } else { |
| 3814 fHitEnd = TRUE; | 3762 fHitEnd = TRUE; |
| 3815 } | 3763 } |
| 3816 | 3764 |
| 3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3765 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3818 break; | 3766 break; |
| 3819 | 3767 |
| 3820 case URX_STRING_I: | 3768 case URX_STRING_I: |
| 3821 { | 3769 { |
| 3822 // Case-insensitive test input against a literal string. | 3770 // Case-insensitive test input against a literal string. |
| 3823 // Strings require two slots in the compiled pattern, one for th
e | 3771 // Strings require two slots in the compiled pattern, one for th
e |
| 3824 // offset to the string text, and one for the length. | 3772 // offset to the string text, and one for the length. |
| 3825 // The compiled string has already been case folded. | 3773 // The compiled string has already been case folded. |
| 3826 { | 3774 { |
| 3827 const UChar *patternString = litText + opValue; | 3775 const UChar *patternString = litText + opValue; |
| 3828 int32_t patternStringIdx = 0; | 3776 int32_t patternStringIdx = 0; |
| 3829 | 3777 |
| 3830 op = (int32_t)pat[fp->fPatIdx]; | 3778 op = (int32_t)pat[fp->fPatIdx]; |
| 3831 fp->fPatIdx++; | 3779 fp->fPatIdx++; |
| 3832 opType = URX_TYPE(op); | 3780 opType = URX_TYPE(op); |
| 3833 opValue = URX_VAL(op); | 3781 opValue = URX_VAL(op); |
| 3834 U_ASSERT(opType == URX_STRING_LEN); | 3782 U_ASSERT(opType == URX_STRING_LEN); |
| 3835 int32_t patternStringLen = opValue; // Length of the string
from the pattern. | 3783 int32_t patternStringLen = opValue; // Length of the string
from the pattern. |
| 3836 | 3784 |
| 3837 | 3785 |
| 3838 UChar32 cPattern; | 3786 UChar32 cPattern; |
| 3839 UChar32 cText; | 3787 UChar32 cText; |
| 3840 UBool success = TRUE; | 3788 UBool success = TRUE; |
| 3841 | 3789 |
| 3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3790 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 3843 CaseFoldingUTextIterator inputIterator(*fInputText); | 3791 CaseFoldingUTextIterator inputIterator(*fInputText); |
| 3844 while (patternStringIdx < patternStringLen) { | 3792 while (patternStringIdx < patternStringLen) { |
| 3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX
(fInputText) >= fActiveLimit) { | 3793 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX
(fInputText) >= fActiveLimit) { |
| 3846 success = FALSE; | 3794 success = FALSE; |
| 3847 fHitEnd = TRUE; | 3795 fHitEnd = TRUE; |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3942 // The look-behind expression matched, but the match did no
t | 3890 // The look-behind expression matched, but the match did no
t |
| 3943 // extend all the way to the point that we are looking be
hind from. | 3891 // extend all the way to the point that we are looking be
hind from. |
| 3944 // FAIL out of here, which will take us back to the LB_CONT
, which | 3892 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 3945 // will retry the match starting at another position or
fail | 3893 // will retry the match starting at another position or
fail |
| 3946 // the look-behind altogether, whichever is appropriate. | 3894 // the look-behind altogether, whichever is appropriate. |
| 3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3895 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 3948 break; | 3896 break; |
| 3949 } | 3897 } |
| 3950 | 3898 |
| 3951 // Look-behind match is good. Restore the orignal input string
length, | 3899 // Look-behind match is good. Restore the orignal input string
length, |
| 3952 // which had been truncated to pin the end of the lookbehind m
atch to the | 3900 // which had been truncated to pin the end of the lookbehind m
atch to the |
| 3953 // position being looked-behind. | 3901 // position being looked-behind. |
| 3954 int64_t originalInputLen = fData[opValue+3]; | 3902 int64_t originalInputLen = fData[opValue+3]; |
| 3955 U_ASSERT(originalInputLen >= fActiveLimit); | 3903 U_ASSERT(originalInputLen >= fActiveLimit); |
| 3956 U_ASSERT(originalInputLen <= fInputLength); | 3904 U_ASSERT(originalInputLen <= fInputLength); |
| 3957 fActiveLimit = originalInputLen; | 3905 fActiveLimit = originalInputLen; |
| 3958 } | 3906 } |
| 3959 break; | 3907 break; |
| 3960 | 3908 |
| 3961 | 3909 |
| 3962 case URX_LBN_CONT: | 3910 case URX_LBN_CONT: |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4019 // extend all the way to the point that we are looking be
hind from. | 3967 // extend all the way to the point that we are looking be
hind from. |
| 4020 // FAIL out of here, which will take us back to the LB_CONT
, which | 3968 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 4021 // will retry the match starting at another position or
succeed | 3969 // will retry the match starting at another position or
succeed |
| 4022 // the look-behind altogether, whichever is appropriate. | 3970 // the look-behind altogether, whichever is appropriate. |
| 4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3971 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4024 break; | 3972 break; |
| 4025 } | 3973 } |
| 4026 | 3974 |
| 4027 // Look-behind expression matched, which means look-behind test
as | 3975 // Look-behind expression matched, which means look-behind test
as |
| 4028 // a whole Fails | 3976 // a whole Fails |
| 4029 | 3977 |
| 4030 // Restore the orignal input string length, which had been tru
ncated | 3978 // Restore the orignal input string length, which had been tru
ncated |
| 4031 // inorder to pin the end of the lookbehind match | 3979 // inorder to pin the end of the lookbehind match |
| 4032 // to the position being looked-behind. | 3980 // to the position being looked-behind. |
| 4033 int64_t originalInputLen = fData[opValue+3]; | 3981 int64_t originalInputLen = fData[opValue+3]; |
| 4034 U_ASSERT(originalInputLen >= fActiveLimit); | 3982 U_ASSERT(originalInputLen >= fActiveLimit); |
| 4035 U_ASSERT(originalInputLen <= fInputLength); | 3983 U_ASSERT(originalInputLen <= fInputLength); |
| 4036 fActiveLimit = originalInputLen; | 3984 fActiveLimit = originalInputLen; |
| 4037 | 3985 |
| 4038 // Restore original stack position, discarding any state saved | 3986 // Restore original stack position, discarding any state saved |
| 4039 // by the successful pattern match. | 3987 // by the successful pattern match. |
| 4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 3988 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 4041 int32_t newStackSize = (int32_t)fData[opValue]; | 3989 int32_t newStackSize = (int32_t)fData[opValue]; |
| 4042 U_ASSERT(fStack->size() > newStackSize); | 3990 U_ASSERT(fStack->size() > newStackSize); |
| 4043 fStack->setSize(newStackSize); | 3991 fStack->setSize(newStackSize); |
| 4044 | 3992 |
| 4045 // FAIL, which will take control back to someplace | 3993 // FAIL, which will take control back to someplace |
| 4046 // prior to entering the look-behind test. | 3994 // prior to entering the look-behind test. |
| 4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3995 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4048 } | 3996 } |
| 4049 break; | 3997 break; |
| 4050 | 3998 |
| 4051 | 3999 |
| 4052 case URX_LOOP_SR_I: | 4000 case URX_LOOP_SR_I: |
| 4053 // Loop Initialization for the optimized implementation of | 4001 // Loop Initialization for the optimized implementation of |
| 4054 // [some character set]* | 4002 // [some character set]* |
| 4055 // This op scans through all matching input. | 4003 // This op scans through all matching input. |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4168 break; | 4116 break; |
| 4169 | 4117 |
| 4170 | 4118 |
| 4171 case URX_LOOP_C: | 4119 case URX_LOOP_C: |
| 4172 { | 4120 { |
| 4173 U_ASSERT(opValue>=0 && opValue<fFrameSize); | 4121 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| 4174 backSearchIndex = fp->fExtra[opValue]; | 4122 backSearchIndex = fp->fExtra[opValue]; |
| 4175 U_ASSERT(backSearchIndex <= fp->fInputIdx); | 4123 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| 4176 if (backSearchIndex == fp->fInputIdx) { | 4124 if (backSearchIndex == fp->fInputIdx) { |
| 4177 // We've backed up the input idx to the point that the loop
started. | 4125 // We've backed up the input idx to the point that the loop
started. |
| 4178 // The loop is done. Leave here without saving state. | 4126 // The loop is done. Leave here without saving state. |
| 4179 // Subsequent failures won't come back here. | 4127 // Subsequent failures won't come back here. |
| 4180 break; | 4128 break; |
| 4181 } | 4129 } |
| 4182 // Set up for the next iteration of the loop, with input index | 4130 // Set up for the next iteration of the loop, with input index |
| 4183 // backed up by one from the last time through, | 4131 // backed up by one from the last time through, |
| 4184 // and a state save to this instruction in case the following
code fails again. | 4132 // and a state save to this instruction in case the following
code fails again. |
| 4185 // (We're going backwards because this loop emulates stack unw
inding, not | 4133 // (We're going backwards because this loop emulates stack unw
inding, not |
| 4186 // the initial scan forward.) | 4134 // the initial scan forward.) |
| 4187 U_ASSERT(fp->fInputIdx > 0); | 4135 U_ASSERT(fp->fInputIdx > 0); |
| 4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 4136 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); | 4137 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); |
| 4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 4138 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4191 | 4139 |
| 4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); | 4140 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); |
| 4193 if (prevC == 0x0a && | 4141 if (prevC == 0x0a && |
| 4194 fp->fInputIdx > backSearchIndex && | 4142 fp->fInputIdx > backSearchIndex && |
| 4195 twoPrevC == 0x0d) { | 4143 twoPrevC == 0x0d) { |
| 4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; | 4144 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| 4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { | 4145 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| 4198 // .*, stepping back over CRLF pair. | 4146 // .*, stepping back over CRLF pair. |
| 4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| 4200 } | 4148 } |
| 4201 } | 4149 } |
| 4202 | 4150 |
| 4203 | 4151 |
| 4204 fp = StateSave(fp, fp->fPatIdx-1, status); | 4152 fp = StateSave(fp, fp->fPatIdx-1, status); |
| 4205 } | 4153 } |
| 4206 break; | 4154 break; |
| 4207 | 4155 |
| 4208 | 4156 |
| 4209 | 4157 |
| 4210 default: | 4158 default: |
| 4211 // Trouble. The compiled pattern contains an entry with an | 4159 // Trouble. The compiled pattern contains an entry with an |
| 4212 // unrecognized type tag. | 4160 // unrecognized type tag. |
| 4213 U_ASSERT(FALSE); | 4161 U_ASSERT(FALSE); |
| 4214 } | 4162 } |
| 4215 | 4163 |
| 4216 if (U_FAILURE(status)) { | 4164 if (U_FAILURE(status)) { |
| 4217 isMatch = FALSE; | 4165 isMatch = FALSE; |
| 4218 break; | 4166 break; |
| 4219 } | 4167 } |
| 4220 } | 4168 } |
| 4221 | 4169 |
| 4222 breakFromLoop: | 4170 breakFromLoop: |
| 4223 fMatch = isMatch; | 4171 fMatch = isMatch; |
| 4224 if (isMatch) { | 4172 if (isMatch) { |
| 4225 fLastMatchEnd = fMatchEnd; | 4173 fLastMatchEnd = fMatchEnd; |
| 4226 fMatchStart = startIdx; | 4174 fMatchStart = startIdx; |
| 4227 fMatchEnd = fp->fInputIdx; | 4175 fMatchEnd = fp->fInputIdx; |
| 4228 if (fTraceDebug) { | 4176 } |
| 4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta
rt, fMatchEnd)); | 4177 |
| 4178 #ifdef REGEX_RUN_DEBUG |
| 4179 if (fTraceDebug) { |
| 4180 if (isMatch) { |
| 4181 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| 4182 } else { |
| 4183 printf("No match\n\n"); |
| 4230 } | 4184 } |
| 4231 } | 4185 } |
| 4232 else | 4186 #endif |
| 4233 { | |
| 4234 if (fTraceDebug) { | |
| 4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); | |
| 4236 } | |
| 4237 } | |
| 4238 | 4187 |
| 4239 fFrame = fp; // The active stack frame when the engine stoppe
d. | 4188 fFrame = fp; // The active stack frame when the engine stoppe
d. |
| 4240 // Contains the capture group results that we
need to | 4189 // Contains the capture group results that we
need to |
| 4241 // access later. | 4190 // access later. |
| 4242 return; | 4191 return; |
| 4243 } | 4192 } |
| 4244 | 4193 |
| 4245 | 4194 |
| 4246 //------------------------------------------------------------------------------
-- | 4195 //------------------------------------------------------------------------------
-- |
| 4247 // | 4196 // |
| 4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with t
he | 4197 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with t
he |
| 4249 // assumption that the entire string is available in the UText'
s | 4198 // assumption that the entire string is available in the UText'
s |
| 4250 // chunk buffer. For now, that means we can use int32_t indexes
, | 4199 // chunk buffer. For now, that means we can use int32_t indexes
, |
| 4251 // except for anything that needs to be saved (like group start
s | 4200 // except for anything that needs to be saved (like group start
s |
| 4252 // and ends). | 4201 // and ends). |
| 4253 // | 4202 // |
| 4254 // startIdx: begin matching at this index. | 4203 // startIdx: begin matching at this index. |
| 4255 // toEnd: if true, match must extend to end of the input
region | 4204 // toEnd: if true, match must extend to end of the input
region |
| 4256 // | 4205 // |
| 4257 //------------------------------------------------------------------------------
-- | 4206 //------------------------------------------------------------------------------
-- |
| 4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
s) { | 4207 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
s) { |
| 4259 UBool isMatch = FALSE; // True if the we have a match. | 4208 UBool isMatch = FALSE; // True if the we have a match. |
| 4260 | 4209 |
| 4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact
er matches for searching backwards | 4210 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact
er matches for searching backwards |
| 4262 | 4211 |
| 4263 int32_t op; // Operation from the compiled pattern, s
plit into | 4212 int32_t op; // Operation from the compiled pattern, s
plit into |
| 4264 int32_t opType; // the opcode | 4213 int32_t opType; // the opcode |
| 4265 int32_t opValue; // and the operand value. | 4214 int32_t opValue; // and the operand value. |
| 4266 | 4215 |
| 4267 #ifdef REGEX_RUN_DEBUG | 4216 #ifdef REGEX_RUN_DEBUG |
| 4268 if (fTraceDebug) | 4217 if (fTraceDebug) { |
| 4269 { | |
| 4270 printf("MatchAt(startIdx=%d)\n", startIdx); | 4218 printf("MatchAt(startIdx=%d)\n", startIdx); |
| 4271 printf("Original Pattern: "); | 4219 printf("Original Pattern: "); |
| 4272 UChar32 c = utext_next32From(fPattern->fPattern, 0); | 4220 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| 4273 while (c != U_SENTINEL) { | 4221 while (c != U_SENTINEL) { |
| 4274 if (c<32 || c>256) { | 4222 if (c<32 || c>256) { |
| 4275 c = '.'; | 4223 c = '.'; |
| 4276 } | 4224 } |
| 4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 4225 printf("%c", c); |
| 4278 | 4226 |
| 4279 c = UTEXT_NEXT32(fPattern->fPattern); | 4227 c = UTEXT_NEXT32(fPattern->fPattern); |
| 4280 } | 4228 } |
| 4281 printf("\n"); | 4229 printf("\n"); |
| 4282 printf("Input String: "); | 4230 printf("Input String: "); |
| 4283 c = utext_next32From(fInputText, 0); | 4231 c = utext_next32From(fInputText, 0); |
| 4284 while (c != U_SENTINEL) { | 4232 while (c != U_SENTINEL) { |
| 4285 if (c<32 || c>256) { | 4233 if (c<32 || c>256) { |
| 4286 c = '.'; | 4234 c = '.'; |
| 4287 } | 4235 } |
| 4288 printf("%c", c); | 4236 printf("%c", c); |
| 4289 | 4237 |
| 4290 c = UTEXT_NEXT32(fInputText); | 4238 c = UTEXT_NEXT32(fInputText); |
| 4291 } | 4239 } |
| 4292 printf("\n"); | 4240 printf("\n"); |
| 4293 printf("\n"); | 4241 printf("\n"); |
| 4294 } | 4242 } |
| 4295 #endif | 4243 #endif |
| 4296 | 4244 |
| 4297 if (U_FAILURE(status)) { | 4245 if (U_FAILURE(status)) { |
| 4298 return; | 4246 return; |
| 4299 } | 4247 } |
| 4300 | 4248 |
| 4301 // Cache frequently referenced items from the compiled pattern | 4249 // Cache frequently referenced items from the compiled pattern |
| 4302 // | 4250 // |
| 4303 int64_t *pat = fPattern->fCompiledPat->getBuffer(); | 4251 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| 4304 | 4252 |
| 4305 const UChar *litText = fPattern->fLiteralText.getBuffer(); | 4253 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| 4306 UVector *sets = fPattern->fSets; | 4254 UVector *sets = fPattern->fSets; |
| 4307 | 4255 |
| 4308 const UChar *inputBuf = fInputText->chunkContents; | 4256 const UChar *inputBuf = fInputText->chunkContents; |
| 4309 | 4257 |
| 4310 fFrameSize = fPattern->fFrameSize; | 4258 fFrameSize = fPattern->fFrameSize; |
| 4311 REStackFrame *fp = resetStack(); | 4259 REStackFrame *fp = resetStack(); |
| 4312 | 4260 |
| 4313 fp->fPatIdx = 0; | 4261 fp->fPatIdx = 0; |
| 4314 fp->fInputIdx = startIdx; | 4262 fp->fInputIdx = startIdx; |
| 4315 | 4263 |
| 4316 // Zero out the pattern's static data | 4264 // Zero out the pattern's static data |
| 4317 int32_t i; | 4265 int32_t i; |
| 4318 for (i = 0; i<fPattern->fDataSize; i++) { | 4266 for (i = 0; i<fPattern->fDataSize; i++) { |
| 4319 fData[i] = 0; | 4267 fData[i] = 0; |
| 4320 } | 4268 } |
| 4321 | 4269 |
| 4322 // | 4270 // |
| 4323 // Main loop for interpreting the compiled pattern. | 4271 // Main loop for interpreting the compiled pattern. |
| 4324 // One iteration of the loop per pattern operation performed. | 4272 // One iteration of the loop per pattern operation performed. |
| 4325 // | 4273 // |
| 4326 for (;;) { | 4274 for (;;) { |
| 4327 #if 0 | |
| 4328 if (_heapchk() != _HEAPOK) { | |
| 4329 fprintf(stderr, "Heap Trouble\n"); | |
| 4330 } | |
| 4331 #endif | |
| 4332 | |
| 4333 op = (int32_t)pat[fp->fPatIdx]; | 4275 op = (int32_t)pat[fp->fPatIdx]; |
| 4334 opType = URX_TYPE(op); | 4276 opType = URX_TYPE(op); |
| 4335 opValue = URX_VAL(op); | 4277 opValue = URX_VAL(op); |
| 4336 #ifdef REGEX_RUN_DEBUG | 4278 #ifdef REGEX_RUN_DEBUG |
| 4337 if (fTraceDebug) { | 4279 if (fTraceDebug) { |
| 4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 4280 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| 4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, | 4281 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, |
| 4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(
), fActiveLimit); | 4282 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(
), fActiveLimit); |
| 4341 fPattern->dumpOp(fp->fPatIdx); | 4283 fPattern->dumpOp(fp->fPatIdx); |
| 4342 } | 4284 } |
| 4343 #endif | 4285 #endif |
| 4344 fp->fPatIdx++; | 4286 fp->fPatIdx++; |
| 4345 | 4287 |
| 4346 switch (opType) { | 4288 switch (opType) { |
| 4347 | 4289 |
| 4348 | 4290 |
| 4349 case URX_NOP: | 4291 case URX_NOP: |
| 4350 break; | 4292 break; |
| 4351 | 4293 |
| 4352 | 4294 |
| 4353 case URX_BACKTRACK: | 4295 case URX_BACKTRACK: |
| 4354 // Force a backtrack. In some circumstances, the pattern compiler | 4296 // Force a backtrack. In some circumstances, the pattern compiler |
| 4355 // will notice that the pattern can't possibly match anything, and
will | 4297 // will notice that the pattern can't possibly match anything, and
will |
| 4356 // emit one of these at that point. | 4298 // emit one of these at that point. |
| 4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4299 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4358 break; | 4300 break; |
| 4359 | 4301 |
| 4360 | 4302 |
| 4361 case URX_ONECHAR: | 4303 case URX_ONECHAR: |
| 4362 if (fp->fInputIdx < fActiveLimit) { | 4304 if (fp->fInputIdx < fActiveLimit) { |
| 4363 UChar32 c; | 4305 UChar32 c; |
| 4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4306 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4365 if (c == opValue) { | 4307 if (c == opValue) { |
| 4366 break; | 4308 break; |
| 4367 } | 4309 } |
| 4368 } else { | 4310 } else { |
| 4369 fHitEnd = TRUE; | 4311 fHitEnd = TRUE; |
| 4370 } | 4312 } |
| 4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4313 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4372 break; | 4314 break; |
| 4373 | 4315 |
| 4374 | 4316 |
| 4375 case URX_STRING: | 4317 case URX_STRING: |
| 4376 { | 4318 { |
| 4377 // Test input against a literal string. | 4319 // Test input against a literal string. |
| 4378 // Strings require two slots in the compiled pattern, one for th
e | 4320 // Strings require two slots in the compiled pattern, one for th
e |
| 4379 // offset to the string text, and one for the length. | 4321 // offset to the string text, and one for the length. |
| 4380 int32_t stringStartIdx = opValue; | 4322 int32_t stringStartIdx = opValue; |
| 4381 int32_t stringLen; | 4323 int32_t stringLen; |
| 4382 | 4324 |
| 4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand | 4325 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
| 4384 fp->fPatIdx++; | 4326 fp->fPatIdx++; |
| 4385 opType = URX_TYPE(op); | 4327 opType = URX_TYPE(op); |
| 4386 stringLen = URX_VAL(op); | 4328 stringLen = URX_VAL(op); |
| 4387 U_ASSERT(opType == URX_STRING_LEN); | 4329 U_ASSERT(opType == URX_STRING_LEN); |
| 4388 U_ASSERT(stringLen >= 2); | 4330 U_ASSERT(stringLen >= 2); |
| 4389 | 4331 |
| 4390 const UChar * pInp = inputBuf + fp->fInputIdx; | 4332 const UChar * pInp = inputBuf + fp->fInputIdx; |
| 4391 const UChar * pInpLimit = inputBuf + fActiveLimit; | 4333 const UChar * pInpLimit = inputBuf + fActiveLimit; |
| 4392 const UChar * pPat = litText+stringStartIdx; | 4334 const UChar * pPat = litText+stringStartIdx; |
| 4393 const UChar * pEnd = pInp + stringLen; | 4335 const UChar * pEnd = pInp + stringLen; |
| 4394 UBool success = TRUE; | 4336 UBool success = TRUE; |
| 4395 while (pInp < pEnd) { | 4337 while (pInp < pEnd) { |
| 4396 if (pInp >= pInpLimit) { | 4338 if (pInp >= pInpLimit) { |
| 4397 fHitEnd = TRUE; | 4339 fHitEnd = TRUE; |
| 4398 success = FALSE; | 4340 success = FALSE; |
| 4399 break; | 4341 break; |
| 4400 } | 4342 } |
| 4401 if (*pInp++ != *pPat++) { | 4343 if (*pInp++ != *pPat++) { |
| 4402 success = FALSE; | 4344 success = FALSE; |
| 4403 break; | 4345 break; |
| 4404 } | 4346 } |
| 4405 } | 4347 } |
| 4406 | 4348 |
| 4407 if (success) { | 4349 if (success) { |
| 4408 fp->fInputIdx += stringLen; | 4350 fp->fInputIdx += stringLen; |
| 4409 } else { | 4351 } else { |
| 4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4352 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4411 } | 4353 } |
| 4412 } | 4354 } |
| 4413 break; | 4355 break; |
| 4414 | 4356 |
| 4415 | 4357 |
| 4416 case URX_STATE_SAVE: | 4358 case URX_STATE_SAVE: |
| 4417 fp = StateSave(fp, opValue, status); | 4359 fp = StateSave(fp, opValue, status); |
| 4418 break; | 4360 break; |
| 4419 | 4361 |
| 4420 | 4362 |
| 4421 case URX_END: | 4363 case URX_END: |
| 4422 // The match loop will exit via this path on a successful match, | 4364 // The match loop will exit via this path on a successful match, |
| 4423 // when we reach the end of the pattern. | 4365 // when we reach the end of the pattern. |
| 4424 if (toEnd && fp->fInputIdx != fActiveLimit) { | 4366 if (toEnd && fp->fInputIdx != fActiveLimit) { |
| 4425 // The pattern matched, but not to the end of input. Try some m
ore. | 4367 // The pattern matched, but not to the end of input. Try some m
ore. |
| 4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4368 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4427 break; | 4369 break; |
| 4428 } | 4370 } |
| 4429 isMatch = TRUE; | 4371 isMatch = TRUE; |
| 4430 goto breakFromLoop; | 4372 goto breakFromLoop; |
| 4431 | 4373 |
| 4432 // Start and End Capture stack frame variables are laid out out like
this: | 4374 // Start and End Capture stack frame variables are laid out out like
this: |
| 4433 // fp->fExtra[opValue] - The start of a completed capture group | 4375 // fp->fExtra[opValue] - The start of a completed capture group |
| 4434 // opValue+1 - The end of a completed capture group | 4376 // opValue+1 - The end of a completed capture group |
| 4435 // opValue+2 - the start of a capture group whose end | 4377 // opValue+2 - the start of a capture group whose end |
| 4436 // has not yet been reached (and might not
ever be). | 4378 // has not yet been reached (and might not
ever be). |
| 4437 case URX_START_CAPTURE: | 4379 case URX_START_CAPTURE: |
| 4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); | 4380 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 4439 fp->fExtra[opValue+2] = fp->fInputIdx; | 4381 fp->fExtra[opValue+2] = fp->fInputIdx; |
| 4440 break; | 4382 break; |
| 4441 | 4383 |
| 4442 | 4384 |
| 4443 case URX_END_CAPTURE: | 4385 case URX_END_CAPTURE: |
| 4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); | 4386 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| 4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. | 4387 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. |
| 4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. | 4388 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. |
| 4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position | 4389 fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| 4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); | 4390 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| 4449 break; | 4391 break; |
| 4450 | 4392 |
| 4451 | 4393 |
| 4452 case URX_DOLLAR: // $, test for End of line | 4394 case URX_DOLLAR: // $, test for End of line |
| 4453 // or for position before new line at end of input | 4395 // or for position before new line at end of input |
| 4454 if (fp->fInputIdx < fAnchorLimit-2) { | 4396 if (fp->fInputIdx < fAnchorLimit-2) { |
| 4455 // We are no where near the end of input. Fail. | 4397 // We are no where near the end of input. Fail. |
| 4456 // This is the common case. Keep it first. | 4398 // This is the common case. Keep it first. |
| 4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4399 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4458 break; | 4400 break; |
| 4459 } | 4401 } |
| 4460 if (fp->fInputIdx >= fAnchorLimit) { | 4402 if (fp->fInputIdx >= fAnchorLimit) { |
| 4461 // We really are at the end of input. Success. | 4403 // We really are at the end of input. Success. |
| 4462 fHitEnd = TRUE; | 4404 fHitEnd = TRUE; |
| 4463 fRequireEnd = TRUE; | 4405 fRequireEnd = TRUE; |
| 4464 break; | 4406 break; |
| 4465 } | 4407 } |
| 4466 | 4408 |
| 4467 // If we are positioned just before a new-line that is located at th
e | 4409 // If we are positioned just before a new-line that is located at th
e |
| 4468 // end of input, succeed. | 4410 // end of input, succeed. |
| 4469 if (fp->fInputIdx == fAnchorLimit-1) { | 4411 if (fp->fInputIdx == fAnchorLimit-1) { |
| 4470 UChar32 c; | 4412 UChar32 c; |
| 4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); | 4413 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); |
| 4472 | 4414 |
| 4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { | 4415 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { |
| 4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { | 4416 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
| 4475 // At new-line at end of input. Success | 4417 // At new-line at end of input. Success |
| 4476 fHitEnd = TRUE; | 4418 fHitEnd = TRUE; |
| 4477 fRequireEnd = TRUE; | 4419 fRequireEnd = TRUE; |
| 4478 break; | 4420 break; |
| 4479 } | 4421 } |
| 4480 } | 4422 } |
| 4481 } else if (fp->fInputIdx == fAnchorLimit-2 && | 4423 } else if (fp->fInputIdx == fAnchorLimit-2 && |
| 4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a
) { | 4424 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a
) { |
| 4483 fHitEnd = TRUE; | 4425 fHitEnd = TRUE; |
| 4484 fRequireEnd = TRUE; | 4426 fRequireEnd = TRUE; |
| 4485 break; // At CR/LF at end of input.
Success | 4427 break; // At CR/LF at end of input.
Success |
| 4486 } | 4428 } |
| 4487 | 4429 |
| 4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4430 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4489 | 4431 |
| 4490 break; | 4432 break; |
| 4491 | 4433 |
| 4492 | 4434 |
| 4493 case URX_DOLLAR_D: // $, test for End of Line, in UNI
X_LINES mode. | 4435 case URX_DOLLAR_D: // $, test for End of Line, in UNI
X_LINES mode. |
| 4494 if (fp->fInputIdx >= fAnchorLimit-1) { | 4436 if (fp->fInputIdx >= fAnchorLimit-1) { |
| 4495 // Either at the last character of input, or off the end. | 4437 // Either at the last character of input, or off the end. |
| 4496 if (fp->fInputIdx == fAnchorLimit-1) { | 4438 if (fp->fInputIdx == fAnchorLimit-1) { |
| 4497 // At last char of input. Success if it's a new line. | 4439 // At last char of input. Success if it's a new line. |
| 4498 if (inputBuf[fp->fInputIdx] == 0x0a) { | 4440 if (inputBuf[fp->fInputIdx] == 0x0a) { |
| 4499 fHitEnd = TRUE; | 4441 fHitEnd = TRUE; |
| 4500 fRequireEnd = TRUE; | 4442 fRequireEnd = TRUE; |
| 4501 break; | 4443 break; |
| 4502 } | 4444 } |
| 4503 } else { | 4445 } else { |
| 4504 // Off the end of input. Success. | 4446 // Off the end of input. Success. |
| 4505 fHitEnd = TRUE; | 4447 fHitEnd = TRUE; |
| 4506 fRequireEnd = TRUE; | 4448 fRequireEnd = TRUE; |
| 4507 break; | 4449 break; |
| 4508 } | 4450 } |
| 4509 } | 4451 } |
| 4510 | 4452 |
| 4511 // Not at end of input. Back-track out. | 4453 // Not at end of input. Back-track out. |
| 4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4454 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4513 break; | 4455 break; |
| 4514 | 4456 |
| 4515 | 4457 |
| 4516 case URX_DOLLAR_M: // $, test for End of line in multi-l
ine mode | 4458 case URX_DOLLAR_M: // $, test for End of line in multi-l
ine mode |
| 4517 { | 4459 { |
| 4518 if (fp->fInputIdx >= fAnchorLimit) { | 4460 if (fp->fInputIdx >= fAnchorLimit) { |
| 4519 // We really are at the end of input. Success. | 4461 // We really are at the end of input. Success. |
| 4520 fHitEnd = TRUE; | 4462 fHitEnd = TRUE; |
| 4521 fRequireEnd = TRUE; | 4463 fRequireEnd = TRUE; |
| 4522 break; | 4464 break; |
| 4523 } | 4465 } |
| 4524 // If we are positioned just before a new-line, succeed. | 4466 // If we are positioned just before a new-line, succeed. |
| 4525 // It makes no difference where the new-line is within the input
. | 4467 // It makes no difference where the new-line is within the input
. |
| 4526 UChar32 c = inputBuf[fp->fInputIdx]; | 4468 UChar32 c = inputBuf[fp->fInputIdx]; |
| 4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { | 4469 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
| 4528 // At a line end, except for the odd chance of being in the
middle of a CR/LF sequence | 4470 // At a line end, except for the odd chance of being in the
middle of a CR/LF sequence |
| 4529 // In multi-line mode, hitting a new-line just before the e
nd of input does not | 4471 // In multi-line mode, hitting a new-line just before the e
nd of input does not |
| 4530 // set the hitEnd or requireEnd flags | 4472 // set the hitEnd or requireEnd flags |
| 4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { | 4473 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
| 4532 break; | 4474 break; |
| 4533 } | 4475 } |
| 4534 } | 4476 } |
| 4535 // not at a new line. Fail. | 4477 // not at a new line. Fail. |
| 4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4478 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4537 } | 4479 } |
| 4538 break; | 4480 break; |
| 4539 | 4481 |
| 4540 | 4482 |
| 4541 case URX_DOLLAR_MD: // $, test for End of line in multi-
line and UNIX_LINES mode | 4483 case URX_DOLLAR_MD: // $, test for End of line in multi-
line and UNIX_LINES mode |
| 4542 { | 4484 { |
| 4543 if (fp->fInputIdx >= fAnchorLimit) { | 4485 if (fp->fInputIdx >= fAnchorLimit) { |
| 4544 // We really are at the end of input. Success. | 4486 // We really are at the end of input. Success. |
| 4545 fHitEnd = TRUE; | 4487 fHitEnd = TRUE; |
| 4546 fRequireEnd = TRUE; // Java set requireEnd in this case, ev
en though | 4488 fRequireEnd = TRUE; // Java set requireEnd in this case, ev
en though |
| 4547 break; // adding a new-line would not lose t
he match. | 4489 break; // adding a new-line would not lose t
he match. |
| 4548 } | 4490 } |
| 4549 // If we are not positioned just before a new-line, the test fai
ls; backtrack out. | 4491 // If we are not positioned just before a new-line, the test fai
ls; backtrack out. |
| 4550 // It makes no difference where the new-line is within the input
. | 4492 // It makes no difference where the new-line is within the input
. |
| 4551 if (inputBuf[fp->fInputIdx] != 0x0a) { | 4493 if (inputBuf[fp->fInputIdx] != 0x0a) { |
| 4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4494 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4553 } | 4495 } |
| 4554 } | 4496 } |
| 4555 break; | 4497 break; |
| 4556 | 4498 |
| 4557 | 4499 |
| 4558 case URX_CARET: // ^, test for start of line | 4500 case URX_CARET: // ^, test for start of line |
| 4559 if (fp->fInputIdx != fAnchorStart) { | 4501 if (fp->fInputIdx != fAnchorStart) { |
| 4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4502 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4561 } | 4503 } |
| 4562 break; | 4504 break; |
| 4563 | 4505 |
| 4564 | 4506 |
| 4565 case URX_CARET_M: // ^, test for start of line in mul
ti-line mode | 4507 case URX_CARET_M: // ^, test for start of line in mul
ti-line mode |
| 4566 { | 4508 { |
| 4567 if (fp->fInputIdx == fAnchorStart) { | 4509 if (fp->fInputIdx == fAnchorStart) { |
| 4568 // We are at the start of input. Success. | 4510 // We are at the start of input. Success. |
| 4569 break; | 4511 break; |
| 4570 } | 4512 } |
| 4571 // Check whether character just before the current pos is a new-
line | 4513 // Check whether character just before the current pos is a new-
line |
| 4572 // unless we are at the end of input | 4514 // unless we are at the end of input |
| 4573 UChar c = inputBuf[fp->fInputIdx - 1]; | 4515 UChar c = inputBuf[fp->fInputIdx - 1]; |
| 4574 if ((fp->fInputIdx < fAnchorLimit) && | 4516 if ((fp->fInputIdx < fAnchorLimit) && |
| 4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 4517 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 4576 // It's a new-line. ^ is true. Success. | 4518 // It's a new-line. ^ is true. Success. |
| 4577 // TODO: what should be done with positions between a CR a
nd LF? | 4519 // TODO: what should be done with positions between a CR a
nd LF? |
| 4578 break; | 4520 break; |
| 4579 } | 4521 } |
| 4580 // Not at the start of a line. Fail. | 4522 // Not at the start of a line. Fail. |
| 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4523 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4582 } | 4524 } |
| 4583 break; | 4525 break; |
| 4584 | 4526 |
| 4585 | 4527 |
| 4586 case URX_CARET_M_UNIX: // ^, test for start of line in multi-line
+ Unix-line mode | 4528 case URX_CARET_M_UNIX: // ^, test for start of line in multi-line
+ Unix-line mode |
| 4587 { | 4529 { |
| 4588 U_ASSERT(fp->fInputIdx >= fAnchorStart); | 4530 U_ASSERT(fp->fInputIdx >= fAnchorStart); |
| 4589 if (fp->fInputIdx <= fAnchorStart) { | 4531 if (fp->fInputIdx <= fAnchorStart) { |
| 4590 // We are at the start of input. Success. | 4532 // We are at the start of input. Success. |
| 4591 break; | 4533 break; |
| 4592 } | 4534 } |
| 4593 // Check whether character just before the current pos is a new-
line | 4535 // Check whether character just before the current pos is a new-
line |
| 4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit); | 4536 U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
| 4595 UChar c = inputBuf[fp->fInputIdx - 1]; | 4537 UChar c = inputBuf[fp->fInputIdx - 1]; |
| 4596 if (c != 0x0a) { | 4538 if (c != 0x0a) { |
| 4597 // Not at the start of a line. Back-track out. | 4539 // Not at the start of a line. Back-track out. |
| 4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4540 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4599 } | 4541 } |
| 4600 } | 4542 } |
| 4601 break; | 4543 break; |
| 4602 | 4544 |
| 4603 case URX_BACKSLASH_B: // Test for word boundaries | 4545 case URX_BACKSLASH_B: // Test for word boundaries |
| 4604 { | 4546 { |
| 4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); | 4547 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); |
| 4606 success ^= (UBool)(opValue != 0); // flip sense for \B | 4548 success ^= (UBool)(opValue != 0); // flip sense for \B |
| 4607 if (!success) { | 4549 if (!success) { |
| 4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4550 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4609 } | 4551 } |
| 4610 } | 4552 } |
| 4611 break; | 4553 break; |
| 4612 | 4554 |
| 4613 | 4555 |
| 4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le | 4556 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le |
| 4615 { | 4557 { |
| 4616 UBool success = isUWordBoundary(fp->fInputIdx); | 4558 UBool success = isUWordBoundary(fp->fInputIdx); |
| 4617 success ^= (UBool)(opValue != 0); // flip sense for \B | 4559 success ^= (UBool)(opValue != 0); // flip sense for \B |
| 4618 if (!success) { | 4560 if (!success) { |
| 4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4561 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4620 } | 4562 } |
| 4621 } | 4563 } |
| 4622 break; | 4564 break; |
| 4623 | 4565 |
| 4624 | 4566 |
| 4625 case URX_BACKSLASH_D: // Test for decimal digit | 4567 case URX_BACKSLASH_D: // Test for decimal digit |
| 4626 { | 4568 { |
| 4627 if (fp->fInputIdx >= fActiveLimit) { | 4569 if (fp->fInputIdx >= fActiveLimit) { |
| 4628 fHitEnd = TRUE; | 4570 fHitEnd = TRUE; |
| 4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4571 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4630 break; | 4572 break; |
| 4631 } | 4573 } |
| 4632 | 4574 |
| 4633 UChar32 c; | 4575 UChar32 c; |
| 4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4576 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4635 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. | 4577 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. |
| 4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); | 4578 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
| 4637 success ^= (UBool)(opValue != 0); // flip sense for \D | 4579 success ^= (UBool)(opValue != 0); // flip sense for \D |
| 4638 if (!success) { | 4580 if (!success) { |
| 4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4640 } | 4582 } |
| 4641 } | 4583 } |
| 4642 break; | 4584 break; |
| 4643 | 4585 |
| 4644 | 4586 |
| 4645 case URX_BACKSLASH_G: // Test for position at end of previous m
atch | 4587 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
| 4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { | 4588 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
| 4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4648 } | 4590 } |
| 4649 break; | 4591 break; |
| 4650 | 4592 |
| 4651 | 4593 |
| 4652 case URX_BACKSLASH_X: | 4594 case URX_BACKSLASH_X: |
| 4653 // Match a Grapheme, as defined by Unicode TR 29. | 4595 // Match a Grapheme, as defined by Unicode TR 29. |
| 4654 // Differs slightly from Perl, which consumes combining marks independe
ntly | 4596 // Differs slightly from Perl, which consumes combining marks independe
ntly |
| 4655 // of context. | 4597 // of context. |
| 4656 { | 4598 { |
| 4657 | 4599 |
| 4658 // Fail if at end of input | 4600 // Fail if at end of input |
| 4659 if (fp->fInputIdx >= fActiveLimit) { | 4601 if (fp->fInputIdx >= fActiveLimit) { |
| 4660 fHitEnd = TRUE; | 4602 fHitEnd = TRUE; |
| 4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4603 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4662 break; | 4604 break; |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4711 } | 4653 } |
| 4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4654 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { | 4655 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| 4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx); | 4656 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| 4715 break; | 4657 break; |
| 4716 } | 4658 } |
| 4717 } | 4659 } |
| 4718 goto GC_Done; | 4660 goto GC_Done; |
| 4719 | 4661 |
| 4720 GC_Control: | 4662 GC_Control: |
| 4721 // Most control chars stand alone (don't combine with combining char
s), | 4663 // Most control chars stand alone (don't combine with combining char
s), |
| 4722 // except that a CR/LF sequence is a single grapheme cluster. | 4664 // except that a CR/LF sequence is a single grapheme cluster. |
| 4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput
Idx] == 0x0a) { | 4665 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput
Idx] == 0x0a) { |
| 4724 fp->fInputIdx++; | 4666 fp->fInputIdx++; |
| 4725 } | 4667 } |
| 4726 | 4668 |
| 4727 GC_Done: | 4669 GC_Done: |
| 4728 if (fp->fInputIdx >= fActiveLimit) { | 4670 if (fp->fInputIdx >= fActiveLimit) { |
| 4729 fHitEnd = TRUE; | 4671 fHitEnd = TRUE; |
| 4730 } | 4672 } |
| 4731 break; | 4673 break; |
| 4732 } | 4674 } |
| 4733 | 4675 |
| 4734 | 4676 |
| 4735 | 4677 |
| 4736 | 4678 |
| 4737 case URX_BACKSLASH_Z: // Test for end of Input | 4679 case URX_BACKSLASH_Z: // Test for end of Input |
| 4738 if (fp->fInputIdx < fAnchorLimit) { | 4680 if (fp->fInputIdx < fAnchorLimit) { |
| 4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4681 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4740 } else { | 4682 } else { |
| 4741 fHitEnd = TRUE; | 4683 fHitEnd = TRUE; |
| 4742 fRequireEnd = TRUE; | 4684 fRequireEnd = TRUE; |
| 4743 } | 4685 } |
| 4744 break; | 4686 break; |
| 4745 | 4687 |
| 4746 | 4688 |
| 4747 | 4689 |
| 4748 case URX_STATIC_SETREF: | 4690 case URX_STATIC_SETREF: |
| 4749 { | 4691 { |
| 4750 // Test input character against one of the predefined sets | 4692 // Test input character against one of the predefined sets |
| 4751 // (Word Characters, for example) | 4693 // (Word Characters, for example) |
| 4752 // The high bit of the op value is a flag for the match polarity
. | 4694 // The high bit of the op value is a flag for the match polarity
. |
| 4753 // 0: success if input char is in set. | 4695 // 0: success if input char is in set. |
| 4754 // 1: success if input char is not in set. | 4696 // 1: success if input char is not in set. |
| 4755 if (fp->fInputIdx >= fActiveLimit) { | 4697 if (fp->fInputIdx >= fActiveLimit) { |
| 4756 fHitEnd = TRUE; | 4698 fHitEnd = TRUE; |
| 4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4699 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4758 break; | 4700 break; |
| 4759 } | 4701 } |
| 4760 | 4702 |
| 4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); | 4703 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| 4762 opValue &= ~URX_NEG_SET; | 4704 opValue &= ~URX_NEG_SET; |
| 4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 4705 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 4764 | 4706 |
| 4765 UChar32 c; | 4707 UChar32 c; |
| 4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4708 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4767 if (c < 256) { | 4709 if (c < 256) { |
| 4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 4710 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 4769 if (s8->contains(c)) { | 4711 if (s8->contains(c)) { |
| 4770 success = !success; | 4712 success = !success; |
| 4771 } | 4713 } |
| 4772 } else { | 4714 } else { |
| 4773 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 4715 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 4774 if (s->contains(c)) { | 4716 if (s->contains(c)) { |
| 4775 success = !success; | 4717 success = !success; |
| 4776 } | 4718 } |
| 4777 } | 4719 } |
| 4778 if (!success) { | 4720 if (!success) { |
| 4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4721 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4780 } | 4722 } |
| 4781 } | 4723 } |
| 4782 break; | 4724 break; |
| 4783 | 4725 |
| 4784 | 4726 |
| 4785 case URX_STAT_SETREF_N: | 4727 case URX_STAT_SETREF_N: |
| 4786 { | 4728 { |
| 4787 // Test input character for NOT being a member of one of | 4729 // Test input character for NOT being a member of one of |
| 4788 // the predefined sets (Word Characters, for example) | 4730 // the predefined sets (Word Characters, for example) |
| 4789 if (fp->fInputIdx >= fActiveLimit) { | 4731 if (fp->fInputIdx >= fActiveLimit) { |
| 4790 fHitEnd = TRUE; | 4732 fHitEnd = TRUE; |
| 4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4792 break; | 4734 break; |
| 4793 } | 4735 } |
| 4794 | 4736 |
| 4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 4737 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| 4796 | 4738 |
| 4797 UChar32 c; | 4739 UChar32 c; |
| 4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4740 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4799 if (c < 256) { | 4741 if (c < 256) { |
| 4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 4742 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| 4801 if (s8->contains(c) == FALSE) { | 4743 if (s8->contains(c) == FALSE) { |
| 4802 break; | 4744 break; |
| 4803 } | 4745 } |
| 4804 } else { | 4746 } else { |
| 4805 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 4747 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| 4806 if (s->contains(c) == FALSE) { | 4748 if (s->contains(c) == FALSE) { |
| 4807 break; | 4749 break; |
| 4808 } | 4750 } |
| 4809 } | 4751 } |
| 4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4752 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4811 } | 4753 } |
| 4812 break; | 4754 break; |
| 4813 | 4755 |
| 4814 | 4756 |
| 4815 case URX_SETREF: | 4757 case URX_SETREF: |
| 4816 { | 4758 { |
| 4817 if (fp->fInputIdx >= fActiveLimit) { | 4759 if (fp->fInputIdx >= fActiveLimit) { |
| 4818 fHitEnd = TRUE; | 4760 fHitEnd = TRUE; |
| 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4820 break; | 4762 break; |
| 4821 } | 4763 } |
| 4822 | 4764 |
| 4823 U_ASSERT(opValue > 0 && opValue < sets->size()); | 4765 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 4824 | 4766 |
| 4825 // There is input left. Pick up one char and test it for set me
mbership. | 4767 // There is input left. Pick up one char and test it for set me
mbership. |
| 4826 UChar32 c; | 4768 UChar32 c; |
| 4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4769 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4828 if (c<256) { | 4770 if (c<256) { |
| 4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 4771 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 4830 if (s8->contains(c)) { | 4772 if (s8->contains(c)) { |
| 4831 // The character is in the set. A Match. | 4773 // The character is in the set. A Match. |
| 4832 break; | 4774 break; |
| 4833 } | 4775 } |
| 4834 } else { | 4776 } else { |
| 4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 4777 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 4836 if (s->contains(c)) { | 4778 if (s->contains(c)) { |
| 4837 // The character is in the set. A Match. | 4779 // The character is in the set. A Match. |
| 4838 break; | 4780 break; |
| 4839 } | 4781 } |
| 4840 } | 4782 } |
| 4841 | 4783 |
| 4842 // the character wasn't in the set. | 4784 // the character wasn't in the set. |
| 4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4785 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4844 } | 4786 } |
| 4845 break; | 4787 break; |
| 4846 | 4788 |
| 4847 | 4789 |
| 4848 case URX_DOTANY: | 4790 case URX_DOTANY: |
| 4849 { | 4791 { |
| 4850 // . matches anything, but stops at end-of-line. | 4792 // . matches anything, but stops at end-of-line. |
| 4851 if (fp->fInputIdx >= fActiveLimit) { | 4793 if (fp->fInputIdx >= fActiveLimit) { |
| 4852 // At end of input. Match failed. Backtrack out. | 4794 // At end of input. Match failed. Backtrack out. |
| 4853 fHitEnd = TRUE; | 4795 fHitEnd = TRUE; |
| 4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4796 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4855 break; | 4797 break; |
| 4856 } | 4798 } |
| 4857 | 4799 |
| 4858 // There is input left. Advance over one char, unless we've hit
end-of-line | 4800 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 4859 UChar32 c; | 4801 UChar32 c; |
| 4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4802 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible | 4803 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
| 4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 4804 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
| 4863 // End of line in normal mode. . does not match. | 4805 // End of line in normal mode. . does not match. |
| 4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4865 break; | 4807 break; |
| 4866 } | 4808 } |
| 4867 } | 4809 } |
| 4868 break; | 4810 break; |
| 4869 | 4811 |
| 4870 | 4812 |
| 4871 case URX_DOTANY_ALL: | 4813 case URX_DOTANY_ALL: |
| 4872 { | 4814 { |
| 4873 // . in dot-matches-all (including new lines) mode | 4815 // . in dot-matches-all (including new lines) mode |
| 4874 if (fp->fInputIdx >= fActiveLimit) { | 4816 if (fp->fInputIdx >= fActiveLimit) { |
| 4875 // At end of input. Match failed. Backtrack out. | 4817 // At end of input. Match failed. Backtrack out. |
| 4876 fHitEnd = TRUE; | 4818 fHitEnd = TRUE; |
| 4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4878 break; | 4820 break; |
| 4879 } | 4821 } |
| 4880 | 4822 |
| 4881 // There is input left. Advance over one char, except if we are | 4823 // There is input left. Advance over one char, except if we are |
| 4882 // at a cr/lf, advance over both of them. | 4824 // at a cr/lf, advance over both of them. |
| 4883 UChar32 c; | 4825 UChar32 c; |
| 4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4826 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) { | 4827 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| 4886 // In the case of a CR/LF, we need to advance over both. | 4828 // In the case of a CR/LF, we need to advance over both. |
| 4887 if (inputBuf[fp->fInputIdx] == 0x0a) { | 4829 if (inputBuf[fp->fInputIdx] == 0x0a) { |
| 4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); | 4830 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); |
| 4889 } | 4831 } |
| 4890 } | 4832 } |
| 4891 } | 4833 } |
| 4892 break; | 4834 break; |
| 4893 | 4835 |
| 4894 | 4836 |
| 4895 case URX_DOTANY_UNIX: | 4837 case URX_DOTANY_UNIX: |
| 4896 { | 4838 { |
| 4897 // '.' operator, matches all, but stops at end-of-line. | 4839 // '.' operator, matches all, but stops at end-of-line. |
| 4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. | 4840 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
| 4899 if (fp->fInputIdx >= fActiveLimit) { | 4841 if (fp->fInputIdx >= fActiveLimit) { |
| 4900 // At end of input. Match failed. Backtrack out. | 4842 // At end of input. Match failed. Backtrack out. |
| 4901 fHitEnd = TRUE; | 4843 fHitEnd = TRUE; |
| 4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4844 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4903 break; | 4845 break; |
| 4904 } | 4846 } |
| 4905 | 4847 |
| 4906 // There is input left. Advance over one char, unless we've hit
end-of-line | 4848 // There is input left. Advance over one char, unless we've hit
end-of-line |
| 4907 UChar32 c; | 4849 UChar32 c; |
| 4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4850 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 4909 if (c == 0x0a) { | 4851 if (c == 0x0a) { |
| 4910 // End of line in UNIX_LINES mode. '.' does not match the \n | 4852 // End of line in UNIX_LINES mode. '.' does not match the \n |
| 4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4912 } | 4854 } |
| 4913 } | 4855 } |
| 4914 break; | 4856 break; |
| 4915 | 4857 |
| 4916 | 4858 |
| 4917 case URX_JMP: | 4859 case URX_JMP: |
| 4918 fp->fPatIdx = opValue; | 4860 fp->fPatIdx = opValue; |
| 4919 break; | 4861 break; |
| 4920 | 4862 |
| 4921 case URX_FAIL: | 4863 case URX_FAIL: |
| 4922 isMatch = FALSE; | 4864 isMatch = FALSE; |
| 4923 goto breakFromLoop; | 4865 goto breakFromLoop; |
| 4924 | 4866 |
| 4925 case URX_JMP_SAV: | 4867 case URX_JMP_SAV: |
| 4926 U_ASSERT(opValue < fPattern->fCompiledPat->size()); | 4868 U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
| 4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current | 4869 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current |
| 4928 fp->fPatIdx = opValue; // Then JMP. | 4870 fp->fPatIdx = opValue; // Then JMP. |
| 4929 break; | 4871 break; |
| 4930 | 4872 |
| 4931 case URX_JMP_SAV_X: | 4873 case URX_JMP_SAV_X: |
| 4932 // This opcode is used with (x)+, when x can match a zero length str
ing. | 4874 // This opcode is used with (x)+, when x can match a zero length str
ing. |
| 4933 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. | 4875 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. |
| 4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the | 4876 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the |
| 4935 // data address of the input position at the start of the loop. | 4877 // data address of the input position at the start of the loop. |
| 4936 { | 4878 { |
| 4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); | 4879 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); |
| 4938 int32_t stoOp = (int32_t)pat[opValue-1]; | 4880 int32_t stoOp = (int32_t)pat[opValue-1]; |
| 4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); | 4881 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| 4940 int32_t frameLoc = URX_VAL(stoOp); | 4882 int32_t frameLoc = URX_VAL(stoOp); |
| 4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); | 4883 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| 4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; | 4884 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; |
| 4943 U_ASSERT(prevInputIdx <= fp->fInputIdx); | 4885 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| 4944 if (prevInputIdx < fp->fInputIdx) { | 4886 if (prevInputIdx < fp->fInputIdx) { |
| 4945 // The match did make progress. Repeat the loop. | 4887 // The match did make progress. Repeat the loop. |
| 4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current | 4888 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
| 4947 fp->fPatIdx = opValue; | 4889 fp->fPatIdx = opValue; |
| 4948 fp->fExtra[frameLoc] = fp->fInputIdx; | 4890 fp->fExtra[frameLoc] = fp->fInputIdx; |
| 4949 } | 4891 } |
| 4950 // If the input position did not advance, we do nothing here, | 4892 // If the input position did not advance, we do nothing here, |
| 4951 // execution will fall out of the loop. | 4893 // execution will fall out of the loop. |
| 4952 } | 4894 } |
| 4953 break; | 4895 break; |
| 4954 | 4896 |
| 4955 case URX_CTR_INIT: | 4897 case URX_CTR_INIT: |
| 4956 { | 4898 { |
| 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 4899 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 4958 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 4900 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
| 4959 | 4901 |
| 4960 // Pick up the three extra operands that CTR_INIT has, and | 4902 // Pick up the three extra operands that CTR_INIT has, and |
| 4961 // skip the pattern location counter past | 4903 // skip the pattern location counter past |
| 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 4904 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 4963 fp->fPatIdx += 3; | 4905 fp->fPatIdx += 3; |
| 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 4906 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 4907 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 4908 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 4967 U_ASSERT(minCount>=0); | 4909 U_ASSERT(minCount>=0); |
| 4968 U_ASSERT(maxCount>=minCount || maxCount==-1); | 4910 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 4969 U_ASSERT(loopLoc>=fp->fPatIdx); | 4911 U_ASSERT(loopLoc>=fp->fPatIdx); |
| 4970 | 4912 |
| 4971 if (minCount == 0) { | 4913 if (minCount == 0) { |
| 4972 fp = StateSave(fp, loopLoc+1, status); | 4914 fp = StateSave(fp, loopLoc+1, status); |
| 4973 } | 4915 } |
| 4974 if (maxCount == -1) { | 4916 if (maxCount == -1) { |
| 4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki
ng. | 4917 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki
ng. |
| 4976 } else if (maxCount == 0) { | 4918 } else if (maxCount == 0) { |
| 4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4919 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 4978 } | 4920 } |
| 4979 } | 4921 } |
| 4980 break; | 4922 break; |
| 4981 | 4923 |
| 4982 case URX_CTR_LOOP: | 4924 case URX_CTR_LOOP: |
| 4983 { | 4925 { |
| 4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 4926 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 4985 int32_t initOp = (int32_t)pat[opValue]; | 4927 int32_t initOp = (int32_t)pat[opValue]; |
| 4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); | 4928 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
| 4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 4929 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 4988 int32_t minCount = (int32_t)pat[opValue+2]; | 4930 int32_t minCount = (int32_t)pat[opValue+2]; |
| 4989 int32_t maxCount = (int32_t)pat[opValue+3]; | 4931 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 4990 (*pCounter)++; | 4932 (*pCounter)++; |
| 4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ | 4933 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ |
| 4992 U_ASSERT(*pCounter == maxCount); | 4934 U_ASSERT(*pCounter == maxCount); |
| 4993 break; | 4935 break; |
| 4994 } | 4936 } |
| 4995 if (*pCounter >= minCount) { | 4937 if (*pCounter >= minCount) { |
| 4996 if (maxCount == -1) { | 4938 if (maxCount == -1) { |
| 4997 // Loop has no hard upper bound. | 4939 // Loop has no hard upper bound. |
| 4998 // Check that it is progressing through the input, break
if it is not. | 4940 // Check that it is progressing through the input, break
if it is not. |
| 4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; | 4941 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; |
| 5000 if (fp->fInputIdx == *pLastInputIdx) { | 4942 if (fp->fInputIdx == *pLastInputIdx) { |
| 5001 break; | 4943 break; |
| 5002 } else { | 4944 } else { |
| 5003 *pLastInputIdx = fp->fInputIdx; | 4945 *pLastInputIdx = fp->fInputIdx; |
| 5004 } | 4946 } |
| 5005 } | 4947 } |
| 5006 fp = StateSave(fp, fp->fPatIdx, status); | 4948 fp = StateSave(fp, fp->fPatIdx, status); |
| 5007 } | 4949 } |
| 5008 fp->fPatIdx = opValue + 4; // Loop back. | 4950 fp->fPatIdx = opValue + 4; // Loop back. |
| 5009 } | 4951 } |
| 5010 break; | 4952 break; |
| 5011 | 4953 |
| 5012 case URX_CTR_INIT_NG: | 4954 case URX_CTR_INIT_NG: |
| 5013 { | 4955 { |
| 5014 // Initialize a non-greedy loop | 4956 // Initialize a non-greedy loop |
| 5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| 5016 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 4958 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
| 5017 | 4959 |
| 5018 // Pick up the three extra operands that CTR_INIT_NG has, and | 4960 // Pick up the three extra operands that CTR_INIT_NG has, and |
| 5019 // skip the pattern location counter past | 4961 // skip the pattern location counter past |
| 5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 5021 fp->fPatIdx += 3; | 4963 fp->fPatIdx += 3; |
| 5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| 5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| 5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| 5025 U_ASSERT(minCount>=0); | 4967 U_ASSERT(minCount>=0); |
| 5026 U_ASSERT(maxCount>=minCount || maxCount==-1); | 4968 U_ASSERT(maxCount>=minCount || maxCount==-1); |
| 5027 U_ASSERT(loopLoc>fp->fPatIdx); | 4969 U_ASSERT(loopLoc>fp->fPatIdx); |
| 5028 if (maxCount == -1) { | 4970 if (maxCount == -1) { |
| 5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. | 4971 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. |
| 5030 } | 4972 } |
| 5031 | 4973 |
| 5032 if (minCount == 0) { | 4974 if (minCount == 0) { |
| 5033 if (maxCount != 0) { | 4975 if (maxCount != 0) { |
| 5034 fp = StateSave(fp, fp->fPatIdx, status); | 4976 fp = StateSave(fp, fp->fPatIdx, status); |
| 5035 } | 4977 } |
| 5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block | 4978 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
| 5037 } | 4979 } |
| 5038 } | 4980 } |
| 5039 break; | 4981 break; |
| 5040 | 4982 |
| 5041 case URX_CTR_LOOP_NG: | 4983 case URX_CTR_LOOP_NG: |
| 5042 { | 4984 { |
| 5043 // Non-greedy {min, max} loops | 4985 // Non-greedy {min, max} loops |
| 5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 4986 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| 5045 int32_t initOp = (int32_t)pat[opValue]; | 4987 int32_t initOp = (int32_t)pat[opValue]; |
| 5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); | 4988 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| 5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 4989 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| 5048 int32_t minCount = (int32_t)pat[opValue+2]; | 4990 int32_t minCount = (int32_t)pat[opValue+2]; |
| 5049 int32_t maxCount = (int32_t)pat[opValue+3]; | 4991 int32_t maxCount = (int32_t)pat[opValue+3]; |
| 5050 | 4992 |
| 5051 (*pCounter)++; | 4993 (*pCounter)++; |
| 5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ | 4994 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ |
| 5053 // The loop has matched the maximum permitted number of time
s. | 4995 // The loop has matched the maximum permitted number of time
s. |
| 5054 // Break out of here with no action. Matching will | 4996 // Break out of here with no action. Matching will |
| 5055 // continue with the following pattern. | 4997 // continue with the following pattern. |
| 5056 U_ASSERT(*pCounter == maxCount); | 4998 U_ASSERT(*pCounter == maxCount); |
| 5057 break; | 4999 break; |
| 5058 } | 5000 } |
| 5059 | 5001 |
| 5060 if (*pCounter < minCount) { | 5002 if (*pCounter < minCount) { |
| 5061 // We haven't met the minimum number of matches yet. | 5003 // We haven't met the minimum number of matches yet. |
| 5062 // Loop back for another one. | 5004 // Loop back for another one. |
| 5063 fp->fPatIdx = opValue + 4; // Loop back. | 5005 fp->fPatIdx = opValue + 4; // Loop back. |
| 5064 } else { | 5006 } else { |
| 5065 // We do have the minimum number of matches. | 5007 // We do have the minimum number of matches. |
| 5066 | 5008 |
| 5067 // If there is no upper bound on the loop iterations, check
that the input index | 5009 // If there is no upper bound on the loop iterations, check
that the input index |
| 5068 // is progressing, and stop the loop if it is not. | 5010 // is progressing, and stop the loop if it is not. |
| 5069 if (maxCount == -1) { | 5011 if (maxCount == -1) { |
| 5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; | 5012 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; |
| 5071 if (fp->fInputIdx == *pLastInputIdx) { | 5013 if (fp->fInputIdx == *pLastInputIdx) { |
| 5072 break; | 5014 break; |
| 5073 } | 5015 } |
| 5074 *pLastInputIdx = fp->fInputIdx; | 5016 *pLastInputIdx = fp->fInputIdx; |
| 5075 } | 5017 } |
| 5076 | 5018 |
| 5077 // Loop Continuation: we will fall into the pattern followin
g the loop | 5019 // Loop Continuation: we will fall into the pattern followin
g the loop |
| 5078 // (non-greedy, don't execute loop body first), but first
do | 5020 // (non-greedy, don't execute loop body first), but first
do |
| 5079 // a state save to the top of the loop, so that a match fa
ilure | 5021 // a state save to the top of the loop, so that a match fa
ilure |
| 5080 // in the following pattern will try another iteration of
the loop. | 5022 // in the following pattern will try another iteration of
the loop. |
| 5081 fp = StateSave(fp, opValue + 4, status); | 5023 fp = StateSave(fp, opValue + 4, status); |
| 5082 } | 5024 } |
| 5083 } | 5025 } |
| 5084 break; | 5026 break; |
| 5085 | 5027 |
| 5086 case URX_STO_SP: | 5028 case URX_STO_SP: |
| 5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); | 5029 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 5088 fData[opValue] = fStack->size(); | 5030 fData[opValue] = fStack->size(); |
| 5089 break; | 5031 break; |
| 5090 | 5032 |
| 5091 case URX_LD_SP: | 5033 case URX_LD_SP: |
| 5092 { | 5034 { |
| 5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); | 5035 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| 5094 int32_t newStackSize = (int32_t)fData[opValue]; | 5036 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5095 U_ASSERT(newStackSize <= fStack->size()); | 5037 U_ASSERT(newStackSize <= fStack->size()); |
| 5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; | 5038 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; |
| 5097 if (newFP == (int64_t *)fp) { | 5039 if (newFP == (int64_t *)fp) { |
| 5098 break; | 5040 break; |
| 5099 } | 5041 } |
| 5100 int32_t i; | 5042 int32_t i; |
| 5101 for (i=0; i<fFrameSize; i++) { | 5043 for (i=0; i<fFrameSize; i++) { |
| 5102 newFP[i] = ((int64_t *)fp)[i]; | 5044 newFP[i] = ((int64_t *)fp)[i]; |
| 5103 } | 5045 } |
| 5104 fp = (REStackFrame *)newFP; | 5046 fp = (REStackFrame *)newFP; |
| 5105 fStack->setSize(newStackSize); | 5047 fStack->setSize(newStackSize); |
| 5106 } | 5048 } |
| 5107 break; | 5049 break; |
| 5108 | 5050 |
| 5109 case URX_BACKREF: | 5051 case URX_BACKREF: |
| 5110 { | 5052 { |
| 5111 U_ASSERT(opValue < fFrameSize); | 5053 U_ASSERT(opValue < fFrameSize); |
| 5112 int64_t groupStartIdx = fp->fExtra[opValue]; | 5054 int64_t groupStartIdx = fp->fExtra[opValue]; |
| 5113 int64_t groupEndIdx = fp->fExtra[opValue+1]; | 5055 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| 5114 U_ASSERT(groupStartIdx <= groupEndIdx); | 5056 U_ASSERT(groupStartIdx <= groupEndIdx); |
| 5115 int64_t inputIndex = fp->fInputIdx; | 5057 int64_t inputIndex = fp->fInputIdx; |
| 5116 if (groupStartIdx < 0) { | 5058 if (groupStartIdx < 0) { |
| 5117 // This capture group has not participated in the match thus
far, | 5059 // This capture group has not participated in the match thus
far, |
| 5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 5060 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| (...skipping 11 matching lines...) Expand all Loading... |
| 5130 break; | 5072 break; |
| 5131 } | 5073 } |
| 5132 } | 5074 } |
| 5133 if (success) { | 5075 if (success) { |
| 5134 fp->fInputIdx = inputIndex; | 5076 fp->fInputIdx = inputIndex; |
| 5135 } else { | 5077 } else { |
| 5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5078 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5137 } | 5079 } |
| 5138 } | 5080 } |
| 5139 break; | 5081 break; |
| 5140 | 5082 |
| 5141 case URX_BACKREF_I: | 5083 case URX_BACKREF_I: |
| 5142 { | 5084 { |
| 5143 U_ASSERT(opValue < fFrameSize); | 5085 U_ASSERT(opValue < fFrameSize); |
| 5144 int64_t groupStartIdx = fp->fExtra[opValue]; | 5086 int64_t groupStartIdx = fp->fExtra[opValue]; |
| 5145 int64_t groupEndIdx = fp->fExtra[opValue+1]; | 5087 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| 5146 U_ASSERT(groupStartIdx <= groupEndIdx); | 5088 U_ASSERT(groupStartIdx <= groupEndIdx); |
| 5147 if (groupStartIdx < 0) { | 5089 if (groupStartIdx < 0) { |
| 5148 // This capture group has not participated in the match thus
far, | 5090 // This capture group has not participated in the match thus
far, |
| 5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 5091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
| 5150 break; | 5092 break; |
| 5151 } | 5093 } |
| 5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx
, groupEndIdx); | 5094 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx
, groupEndIdx); |
| 5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi
veLimit); | 5095 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi
veLimit); |
| 5154 | 5096 |
| 5155 // Note: if the capture group match was of an empty string the
backref | 5097 // Note: if the capture group match was of an empty string the
backref |
| 5156 // match succeeds. Verified by testing: Perl matches s
ucceed | 5098 // match succeeds. Verified by testing: Perl matches s
ucceed |
| 5157 // in this case, so we do too. | 5099 // in this case, so we do too. |
| 5158 | 5100 |
| 5159 UBool success = TRUE; | 5101 UBool success = TRUE; |
| 5160 for (;;) { | 5102 for (;;) { |
| 5161 UChar32 captureGroupChar = captureGroupItr.next(); | 5103 UChar32 captureGroupChar = captureGroupItr.next(); |
| 5162 if (captureGroupChar == U_SENTINEL) { | 5104 if (captureGroupChar == U_SENTINEL) { |
| 5163 success = TRUE; | 5105 success = TRUE; |
| 5164 break; | 5106 break; |
| 5165 } | 5107 } |
| 5166 UChar32 inputChar = inputItr.next(); | 5108 UChar32 inputChar = inputItr.next(); |
| 5167 if (inputChar == U_SENTINEL) { | 5109 if (inputChar == U_SENTINEL) { |
| 5168 success = FALSE; | 5110 success = FALSE; |
| 5169 fHitEnd = TRUE; | 5111 fHitEnd = TRUE; |
| 5170 break; | 5112 break; |
| 5171 } | 5113 } |
| 5172 if (inputChar != captureGroupChar) { | 5114 if (inputChar != captureGroupChar) { |
| 5173 success = FALSE; | 5115 success = FALSE; |
| 5174 break; | 5116 break; |
| 5175 } | 5117 } |
| 5176 } | 5118 } |
| 5177 | 5119 |
| 5178 if (success && inputItr.inExpansion()) { | 5120 if (success && inputItr.inExpansion()) { |
| 5179 // We obtained a match by consuming part of a string obtained
from | 5121 // We obtained a match by consuming part of a string obtained
from |
| 5180 // case-folding a single code point of the input text. | 5122 // case-folding a single code point of the input text. |
| 5181 // This does not count as an overall match. | 5123 // This does not count as an overall match. |
| 5182 success = FALSE; | 5124 success = FALSE; |
| 5183 } | 5125 } |
| 5184 | 5126 |
| 5185 if (success) { | 5127 if (success) { |
| 5186 fp->fInputIdx = inputItr.getIndex(); | 5128 fp->fInputIdx = inputItr.getIndex(); |
| 5187 } else { | 5129 } else { |
| 5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5130 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5189 } | 5131 } |
| 5190 } | 5132 } |
| 5191 break; | 5133 break; |
| 5192 | 5134 |
| 5193 case URX_STO_INP_LOC: | 5135 case URX_STO_INP_LOC: |
| 5194 { | 5136 { |
| 5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize); | 5137 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| 5196 fp->fExtra[opValue] = fp->fInputIdx; | 5138 fp->fExtra[opValue] = fp->fInputIdx; |
| 5197 } | 5139 } |
| 5198 break; | 5140 break; |
| 5199 | 5141 |
| 5200 case URX_JMPX: | 5142 case URX_JMPX: |
| 5201 { | 5143 { |
| 5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 5144 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| 5203 fp->fPatIdx += 1; | 5145 fp->fPatIdx += 1; |
| 5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); | 5146 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
| 5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); | 5147 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
| 5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; | 5148 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; |
| 5207 U_ASSERT(savedInputIdx <= fp->fInputIdx); | 5149 U_ASSERT(savedInputIdx <= fp->fInputIdx); |
| 5208 if (savedInputIdx < fp->fInputIdx) { | 5150 if (savedInputIdx < fp->fInputIdx) { |
| 5209 fp->fPatIdx = opValue; // JMP | 5151 fp->fPatIdx = opValue; // JMP |
| 5210 } else { | 5152 } else { |
| 5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no progress in loop. | 5153 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no progress in loop. |
| 5212 } | 5154 } |
| 5213 } | 5155 } |
| 5214 break; | 5156 break; |
| 5215 | 5157 |
| 5216 case URX_LA_START: | 5158 case URX_LA_START: |
| 5217 { | 5159 { |
| 5218 // Entering a lookahead block. | 5160 // Entering a lookahead block. |
| 5219 // Save Stack Ptr, Input Pos. | 5161 // Save Stack Ptr, Input Pos. |
| 5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5162 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5221 fData[opValue] = fStack->size(); | 5163 fData[opValue] = fStack->size(); |
| 5222 fData[opValue+1] = fp->fInputIdx; | 5164 fData[opValue+1] = fp->fInputIdx; |
| 5223 fActiveStart = fLookStart; // Set the match region
change for | 5165 fActiveStart = fLookStart; // Set the match region
change for |
| 5224 fActiveLimit = fLookLimit; // transparent bounds. | 5166 fActiveLimit = fLookLimit; // transparent bounds. |
| 5225 } | 5167 } |
| 5226 break; | 5168 break; |
| 5227 | 5169 |
| 5228 case URX_LA_END: | 5170 case URX_LA_END: |
| 5229 { | 5171 { |
| 5230 // Leaving a look-ahead block. | 5172 // Leaving a look-ahead block. |
| 5231 // restore Stack Ptr, Input Pos to positions they had on entry
to block. | 5173 // restore Stack Ptr, Input Pos to positions they had on entry
to block. |
| 5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5174 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5233 int32_t stackSize = fStack->size(); | 5175 int32_t stackSize = fStack->size(); |
| 5234 int32_t newStackSize = (int32_t)fData[opValue]; | 5176 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5235 U_ASSERT(stackSize >= newStackSize); | 5177 U_ASSERT(stackSize >= newStackSize); |
| 5236 if (stackSize > newStackSize) { | 5178 if (stackSize > newStackSize) { |
| 5237 // Copy the current top frame back to the new (cut back) top
frame. | 5179 // Copy the current top frame back to the new (cut back) top
frame. |
| 5238 // This makes the capture groups from within the look-ahea
d | 5180 // This makes the capture groups from within the look-ahea
d |
| 5239 // expression available. | 5181 // expression available. |
| 5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; | 5182 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; |
| 5241 int32_t i; | 5183 int32_t i; |
| 5242 for (i=0; i<fFrameSize; i++) { | 5184 for (i=0; i<fFrameSize; i++) { |
| 5243 newFP[i] = ((int64_t *)fp)[i]; | 5185 newFP[i] = ((int64_t *)fp)[i]; |
| 5244 } | 5186 } |
| 5245 fp = (REStackFrame *)newFP; | 5187 fp = (REStackFrame *)newFP; |
| 5246 fStack->setSize(newStackSize); | 5188 fStack->setSize(newStackSize); |
| 5247 } | 5189 } |
| 5248 fp->fInputIdx = fData[opValue+1]; | 5190 fp->fInputIdx = fData[opValue+1]; |
| 5249 | 5191 |
| 5250 // Restore the active region bounds in the input string; they ma
y have | 5192 // Restore the active region bounds in the input string; they ma
y have |
| 5251 // been changed because of transparent bounds on a Region. | 5193 // been changed because of transparent bounds on a Region. |
| 5252 fActiveStart = fRegionStart; | 5194 fActiveStart = fRegionStart; |
| 5253 fActiveLimit = fRegionLimit; | 5195 fActiveLimit = fRegionLimit; |
| 5254 } | 5196 } |
| 5255 break; | 5197 break; |
| 5256 | 5198 |
| 5257 case URX_ONECHAR_I: | 5199 case URX_ONECHAR_I: |
| 5258 if (fp->fInputIdx < fActiveLimit) { | 5200 if (fp->fInputIdx < fActiveLimit) { |
| 5259 UChar32 c; | 5201 UChar32 c; |
| 5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 5202 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| 5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { | 5203 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| 5262 break; | 5204 break; |
| 5263 } | 5205 } |
| 5264 } else { | 5206 } else { |
| 5265 fHitEnd = TRUE; | 5207 fHitEnd = TRUE; |
| 5266 } | 5208 } |
| 5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5209 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5268 break; | 5210 break; |
| 5269 | 5211 |
| 5270 case URX_STRING_I: | 5212 case URX_STRING_I: |
| 5271 // Case-insensitive test input against a literal string. | 5213 // Case-insensitive test input against a literal string. |
| 5272 // Strings require two slots in the compiled pattern, one for the | 5214 // Strings require two slots in the compiled pattern, one for the |
| 5273 // offset to the string text, and one for the length. | 5215 // offset to the string text, and one for the length. |
| 5274 // The compiled string has already been case folded. | 5216 // The compiled string has already been case folded. |
| 5275 { | 5217 { |
| 5276 const UChar *patternString = litText + opValue; | 5218 const UChar *patternString = litText + opValue; |
| 5277 | 5219 |
| 5278 op = (int32_t)pat[fp->fPatIdx]; | 5220 op = (int32_t)pat[fp->fPatIdx]; |
| 5279 fp->fPatIdx++; | 5221 fp->fPatIdx++; |
| 5280 opType = URX_TYPE(op); | 5222 opType = URX_TYPE(op); |
| 5281 opValue = URX_VAL(op); | 5223 opValue = URX_VAL(op); |
| 5282 U_ASSERT(opType == URX_STRING_LEN); | 5224 U_ASSERT(opType == URX_STRING_LEN); |
| 5283 int32_t patternStringLen = opValue; // Length of the string fro
m the pattern. | 5225 int32_t patternStringLen = opValue; // Length of the string fro
m the pattern. |
| 5284 | 5226 |
| 5285 UChar32 cText; | 5227 UChar32 cText; |
| 5286 UChar32 cPattern; | 5228 UChar32 cPattern; |
| 5287 UBool success = TRUE; | 5229 UBool success = TRUE; |
| 5288 int32_t patternStringIdx = 0; | 5230 int32_t patternStringIdx = 0; |
| 5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx,
fActiveLimit); | 5231 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx,
fActiveLimit); |
| 5290 while (patternStringIdx < patternStringLen) { | 5232 while (patternStringIdx < patternStringLen) { |
| 5291 U16_NEXT(patternString, patternStringIdx, patternStringLen,
cPattern); | 5233 U16_NEXT(patternString, patternStringIdx, patternStringLen,
cPattern); |
| 5292 cText = inputIterator.next(); | 5234 cText = inputIterator.next(); |
| 5293 if (cText != cPattern) { | 5235 if (cText != cPattern) { |
| 5294 success = FALSE; | 5236 success = FALSE; |
| (...skipping 24 matching lines...) Expand all Loading... |
| 5319 fData[opValue] = fStack->size(); | 5261 fData[opValue] = fStack->size(); |
| 5320 fData[opValue+1] = fp->fInputIdx; | 5262 fData[opValue+1] = fp->fInputIdx; |
| 5321 // Init the variable containing the start index for attempted ma
tches. | 5263 // Init the variable containing the start index for attempted ma
tches. |
| 5322 fData[opValue+2] = -1; | 5264 fData[opValue+2] = -1; |
| 5323 // Save input string length, then reset to pin any matches to en
d at | 5265 // Save input string length, then reset to pin any matches to en
d at |
| 5324 // the current position. | 5266 // the current position. |
| 5325 fData[opValue+3] = fActiveLimit; | 5267 fData[opValue+3] = fActiveLimit; |
| 5326 fActiveLimit = fp->fInputIdx; | 5268 fActiveLimit = fp->fInputIdx; |
| 5327 } | 5269 } |
| 5328 break; | 5270 break; |
| 5329 | 5271 |
| 5330 | 5272 |
| 5331 case URX_LB_CONT: | 5273 case URX_LB_CONT: |
| 5332 { | 5274 { |
| 5333 // Positive Look-Behind, at top of loop checking for matches of
LB expression | 5275 // Positive Look-Behind, at top of loop checking for matches of
LB expression |
| 5334 // at all possible input starting positions. | 5276 // at all possible input starting positions. |
| 5335 | 5277 |
| 5336 // Fetch the min and max possible match lengths. They are the o
perands | 5278 // Fetch the min and max possible match lengths. They are the o
perands |
| 5337 // of this op in the pattern. | 5279 // of this op in the pattern. |
| 5338 int32_t minML = (int32_t)pat[fp->fPatIdx++]; | 5280 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; | 5281 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 5340 U_ASSERT(minML <= maxML); | 5282 U_ASSERT(minML <= maxML); |
| 5341 U_ASSERT(minML >= 0); | 5283 U_ASSERT(minML >= 0); |
| 5342 | 5284 |
| 5343 // Fetch (from data) the last input index where a match was atte
mpted. | 5285 // Fetch (from data) the last input index where a match was atte
mpted. |
| 5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5286 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5345 int64_t *lbStartIdx = &fData[opValue+2]; | 5287 int64_t *lbStartIdx = &fData[opValue+2]; |
| 5346 if (*lbStartIdx < 0) { | 5288 if (*lbStartIdx < 0) { |
| 5347 // First time through loop. | 5289 // First time through loop. |
| 5348 *lbStartIdx = fp->fInputIdx - minML; | 5290 *lbStartIdx = fp->fInputIdx - minML; |
| 5349 } else { | 5291 } else { |
| 5350 // 2nd through nth time through the loop. | 5292 // 2nd through nth time through the loop. |
| 5351 // Back up start position for match by one. | 5293 // Back up start position for match by one. |
| 5352 if (*lbStartIdx == 0) { | 5294 if (*lbStartIdx == 0) { |
| 5353 (*lbStartIdx)--; | 5295 (*lbStartIdx)--; |
| 5354 } else { | 5296 } else { |
| 5355 U16_BACK_1(inputBuf, 0, *lbStartIdx); | 5297 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| 5356 } | 5298 } |
| 5357 } | 5299 } |
| 5358 | 5300 |
| 5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { | 5301 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 5360 // We have tried all potential match starting points without | 5302 // We have tried all potential match starting points without |
| 5361 // getting a match. Backtrack out, and out of the | 5303 // getting a match. Backtrack out, and out of the |
| 5362 // Look Behind altogether. | 5304 // Look Behind altogether. |
| 5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5305 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5364 int64_t restoreInputLen = fData[opValue+3]; | 5306 int64_t restoreInputLen = fData[opValue+3]; |
| 5365 U_ASSERT(restoreInputLen >= fActiveLimit); | 5307 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 5366 U_ASSERT(restoreInputLen <= fInputLength); | 5308 U_ASSERT(restoreInputLen <= fInputLength); |
| 5367 fActiveLimit = restoreInputLen; | 5309 fActiveLimit = restoreInputLen; |
| 5368 break; | 5310 break; |
| 5369 } | 5311 } |
| 5370 | 5312 |
| 5371 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. | 5313 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
| 5372 // (successful match will fall off the end of the loop.) | 5314 // (successful match will fall off the end of the loop.) |
| 5373 fp = StateSave(fp, fp->fPatIdx-3, status); | 5315 fp = StateSave(fp, fp->fPatIdx-3, status); |
| 5374 fp->fInputIdx = *lbStartIdx; | 5316 fp->fInputIdx = *lbStartIdx; |
| 5375 } | 5317 } |
| 5376 break; | 5318 break; |
| 5377 | 5319 |
| 5378 case URX_LB_END: | 5320 case URX_LB_END: |
| 5379 // End of a look-behind block, after a successful match. | 5321 // End of a look-behind block, after a successful match. |
| 5380 { | 5322 { |
| 5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5323 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5382 if (fp->fInputIdx != fActiveLimit) { | 5324 if (fp->fInputIdx != fActiveLimit) { |
| 5383 // The look-behind expression matched, but the match did no
t | 5325 // The look-behind expression matched, but the match did no
t |
| 5384 // extend all the way to the point that we are looking be
hind from. | 5326 // extend all the way to the point that we are looking be
hind from. |
| 5385 // FAIL out of here, which will take us back to the LB_CONT
, which | 5327 // FAIL out of here, which will take us back to the LB_CONT
, which |
| 5386 // will retry the match starting at another position or
fail | 5328 // will retry the match starting at another position or
fail |
| 5387 // the look-behind altogether, whichever is appropriate. | 5329 // the look-behind altogether, whichever is appropriate. |
| 5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5330 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5389 break; | 5331 break; |
| 5390 } | 5332 } |
| 5391 | 5333 |
| 5392 // Look-behind match is good. Restore the original input string
length, | 5334 // Look-behind match is good. Restore the original input string
length, |
| 5393 // which had been truncated to pin the end of the lookbehind m
atch to the | 5335 // which had been truncated to pin the end of the lookbehind m
atch to the |
| 5394 // position being looked-behind. | 5336 // position being looked-behind. |
| 5395 int64_t originalInputLen = fData[opValue+3]; | 5337 int64_t originalInputLen = fData[opValue+3]; |
| 5396 U_ASSERT(originalInputLen >= fActiveLimit); | 5338 U_ASSERT(originalInputLen >= fActiveLimit); |
| 5397 U_ASSERT(originalInputLen <= fInputLength); | 5339 U_ASSERT(originalInputLen <= fInputLength); |
| 5398 fActiveLimit = originalInputLen; | 5340 fActiveLimit = originalInputLen; |
| 5399 } | 5341 } |
| 5400 break; | 5342 break; |
| 5401 | 5343 |
| 5402 | 5344 |
| 5403 case URX_LBN_CONT: | 5345 case URX_LBN_CONT: |
| 5404 { | 5346 { |
| 5405 // Negative Look-Behind, at top of loop checking for matches of
LB expression | 5347 // Negative Look-Behind, at top of loop checking for matches of
LB expression |
| 5406 // at all possible input starting positions. | 5348 // at all possible input starting positions. |
| 5407 | 5349 |
| 5408 // Fetch the extra parameters of this op. | 5350 // Fetch the extra parameters of this op. |
| 5409 int32_t minML = (int32_t)pat[fp->fPatIdx++]; | 5351 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| 5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; | 5352 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| 5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; | 5353 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
| 5412 continueLoc = URX_VAL(continueLoc); | 5354 continueLoc = URX_VAL(continueLoc); |
| 5413 U_ASSERT(minML <= maxML); | 5355 U_ASSERT(minML <= maxML); |
| 5414 U_ASSERT(minML >= 0); | 5356 U_ASSERT(minML >= 0); |
| 5415 U_ASSERT(continueLoc > fp->fPatIdx); | 5357 U_ASSERT(continueLoc > fp->fPatIdx); |
| 5416 | 5358 |
| 5417 // Fetch (from data) the last input index where a match was atte
mpted. | 5359 // Fetch (from data) the last input index where a match was atte
mpted. |
| 5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5360 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5419 int64_t *lbStartIdx = &fData[opValue+2]; | 5361 int64_t *lbStartIdx = &fData[opValue+2]; |
| 5420 if (*lbStartIdx < 0) { | 5362 if (*lbStartIdx < 0) { |
| 5421 // First time through loop. | 5363 // First time through loop. |
| 5422 *lbStartIdx = fp->fInputIdx - minML; | 5364 *lbStartIdx = fp->fInputIdx - minML; |
| 5423 } else { | 5365 } else { |
| 5424 // 2nd through nth time through the loop. | 5366 // 2nd through nth time through the loop. |
| 5425 // Back up start position for match by one. | 5367 // Back up start position for match by one. |
| 5426 if (*lbStartIdx == 0) { | 5368 if (*lbStartIdx == 0) { |
| 5427 (*lbStartIdx)--; // Because U16_BACK is unsafe startin
g at 0. | 5369 (*lbStartIdx)--; // Because U16_BACK is unsafe startin
g at 0. |
| 5428 } else { | 5370 } else { |
| 5429 U16_BACK_1(inputBuf, 0, *lbStartIdx); | 5371 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| 5430 } | 5372 } |
| 5431 } | 5373 } |
| 5432 | 5374 |
| 5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { | 5375 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| 5434 // We have tried all potential match starting points without | 5376 // We have tried all potential match starting points without |
| 5435 // getting a match, which means that the negative lookbehin
d as | 5377 // getting a match, which means that the negative lookbehin
d as |
| 5436 // a whole has succeeded. Jump forward to the continue loc
ation | 5378 // a whole has succeeded. Jump forward to the continue loc
ation |
| 5437 int64_t restoreInputLen = fData[opValue+3]; | 5379 int64_t restoreInputLen = fData[opValue+3]; |
| 5438 U_ASSERT(restoreInputLen >= fActiveLimit); | 5380 U_ASSERT(restoreInputLen >= fActiveLimit); |
| 5439 U_ASSERT(restoreInputLen <= fInputLength); | 5381 U_ASSERT(restoreInputLen <= fInputLength); |
| 5440 fActiveLimit = restoreInputLen; | 5382 fActiveLimit = restoreInputLen; |
| 5441 fp->fPatIdx = continueLoc; | 5383 fp->fPatIdx = continueLoc; |
| 5442 break; | 5384 break; |
| 5443 } | 5385 } |
| 5444 | 5386 |
| 5445 // Save state to this URX_LBN_CONT op, so failure to match wil
l repeat the loop. | 5387 // Save state to this URX_LBN_CONT op, so failure to match wil
l repeat the loop. |
| 5446 // (successful match will cause a FAIL out of the loop alto
gether.) | 5388 // (successful match will cause a FAIL out of the loop alto
gether.) |
| 5447 fp = StateSave(fp, fp->fPatIdx-4, status); | 5389 fp = StateSave(fp, fp->fPatIdx-4, status); |
| 5448 fp->fInputIdx = *lbStartIdx; | 5390 fp->fInputIdx = *lbStartIdx; |
| 5449 } | 5391 } |
| 5450 break; | 5392 break; |
| 5451 | 5393 |
| 5452 case URX_LBN_END: | 5394 case URX_LBN_END: |
| 5453 // End of a negative look-behind block, after a successful match. | 5395 // End of a negative look-behind block, after a successful match. |
| 5454 { | 5396 { |
| 5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5397 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5456 if (fp->fInputIdx != fActiveLimit) { | 5398 if (fp->fInputIdx != fActiveLimit) { |
| 5457 // The look-behind expression matched, but the match did no
t | 5399 // The look-behind expression matched, but the match did no
t |
| 5458 // extend all the way to the point that we are looking be
hind from. | 5400 // extend all the way to the point that we are looking be
hind from. |
| 5459 // FAIL out of here, which will take us back to the LBN_CONT
, which | 5401 // FAIL out of here, which will take us back to the LBN_CONT
, which |
| 5460 // will retry the match starting at another position or
succeed | 5402 // will retry the match starting at another position or
succeed |
| 5461 // the look-behind altogether, whichever is appropriate. | 5403 // the look-behind altogether, whichever is appropriate. |
| 5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5404 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5463 break; | 5405 break; |
| 5464 } | 5406 } |
| 5465 | 5407 |
| 5466 // Look-behind expression matched, which means look-behind test
as | 5408 // Look-behind expression matched, which means look-behind test
as |
| 5467 // a whole Fails | 5409 // a whole Fails |
| 5468 | 5410 |
| 5469 // Restore the original input string length, which had been tru
ncated | 5411 // Restore the original input string length, which had been tru
ncated |
| 5470 // in order to pin the end of the lookbehind match | 5412 // in order to pin the end of the lookbehind match |
| 5471 // to the position being looked-behind. | 5413 // to the position being looked-behind. |
| 5472 int64_t originalInputLen = fData[opValue+3]; | 5414 int64_t originalInputLen = fData[opValue+3]; |
| 5473 U_ASSERT(originalInputLen >= fActiveLimit); | 5415 U_ASSERT(originalInputLen >= fActiveLimit); |
| 5474 U_ASSERT(originalInputLen <= fInputLength); | 5416 U_ASSERT(originalInputLen <= fInputLength); |
| 5475 fActiveLimit = originalInputLen; | 5417 fActiveLimit = originalInputLen; |
| 5476 | 5418 |
| 5477 // Restore original stack position, discarding any state saved | 5419 // Restore original stack position, discarding any state saved |
| 5478 // by the successful pattern match. | 5420 // by the successful pattern match. |
| 5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| 5480 int32_t newStackSize = (int32_t)fData[opValue]; | 5422 int32_t newStackSize = (int32_t)fData[opValue]; |
| 5481 U_ASSERT(fStack->size() > newStackSize); | 5423 U_ASSERT(fStack->size() > newStackSize); |
| 5482 fStack->setSize(newStackSize); | 5424 fStack->setSize(newStackSize); |
| 5483 | 5425 |
| 5484 // FAIL, which will take control back to someplace | 5426 // FAIL, which will take control back to someplace |
| 5485 // prior to entering the look-behind test. | 5427 // prior to entering the look-behind test. |
| 5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5428 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| 5487 } | 5429 } |
| 5488 break; | 5430 break; |
| 5489 | 5431 |
| 5490 | 5432 |
| 5491 case URX_LOOP_SR_I: | 5433 case URX_LOOP_SR_I: |
| 5492 // Loop Initialization for the optimized implementation of | 5434 // Loop Initialization for the optimized implementation of |
| 5493 // [some character set]* | 5435 // [some character set]* |
| 5494 // This op scans through all matching input. | 5436 // This op scans through all matching input. |
| 5495 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. | 5437 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 5496 { | 5438 { |
| 5497 U_ASSERT(opValue > 0 && opValue < sets->size()); | 5439 U_ASSERT(opValue > 0 && opValue < sets->size()); |
| 5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 5440 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| 5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 5441 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| 5500 | 5442 |
| 5501 // Loop through input, until either the input is exhausted or | 5443 // Loop through input, until either the input is exhausted or |
| 5502 // we reach a character that is not a member of the set. | 5444 // we reach a character that is not a member of the set. |
| 5503 int32_t ix = (int32_t)fp->fInputIdx; | 5445 int32_t ix = (int32_t)fp->fInputIdx; |
| 5504 for (;;) { | 5446 for (;;) { |
| 5505 if (ix >= fActiveLimit) { | 5447 if (ix >= fActiveLimit) { |
| 5506 fHitEnd = TRUE; | 5448 fHitEnd = TRUE; |
| 5507 break; | 5449 break; |
| 5508 } | 5450 } |
| 5509 UChar32 c; | 5451 UChar32 c; |
| 5510 U16_NEXT(inputBuf, ix, fActiveLimit, c); | 5452 U16_NEXT(inputBuf, ix, fActiveLimit, c); |
| 5511 if (c<256) { | 5453 if (c<256) { |
| 5512 if (s8->contains(c) == FALSE) { | 5454 if (s8->contains(c) == FALSE) { |
| 5513 U16_BACK_1(inputBuf, 0, ix); | 5455 U16_BACK_1(inputBuf, 0, ix); |
| 5514 break; | 5456 break; |
| 5515 } | 5457 } |
| 5516 } else { | 5458 } else { |
| 5517 if (s->contains(c) == FALSE) { | 5459 if (s->contains(c) == FALSE) { |
| 5518 U16_BACK_1(inputBuf, 0, ix); | 5460 U16_BACK_1(inputBuf, 0, ix); |
| 5519 break; | 5461 break; |
| 5520 } | 5462 } |
| 5521 } | 5463 } |
| 5522 } | 5464 } |
| 5523 | 5465 |
| 5524 // If there were no matching characters, skip over the loop alto
gether. | 5466 // If there were no matching characters, skip over the loop alto
gether. |
| 5525 // The loop doesn't run at all, a * op always succeeds. | 5467 // The loop doesn't run at all, a * op always succeeds. |
| 5526 if (ix == fp->fInputIdx) { | 5468 if (ix == fp->fInputIdx) { |
| 5527 fp->fPatIdx++; // skip the URX_LOOP_C op. | 5469 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 5528 break; | 5470 break; |
| 5529 } | 5471 } |
| 5530 | 5472 |
| 5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that | 5473 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 5532 // must follow. Its operand is the stack location | 5474 // must follow. Its operand is the stack location |
| 5533 // that holds the starting input index for the match of this [
set]* | 5475 // that holds the starting input index for the match of this [
set]* |
| 5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; | 5476 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); | 5477 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 5536 int32_t stackLoc = URX_VAL(loopcOp); | 5478 int32_t stackLoc = URX_VAL(loopcOp); |
| 5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); | 5479 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 5538 fp->fExtra[stackLoc] = fp->fInputIdx; | 5480 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 5539 fp->fInputIdx = ix; | 5481 fp->fInputIdx = ix; |
| 5540 | 5482 |
| 5541 // Save State to the URX_LOOP_C op that follows this one, | 5483 // Save State to the URX_LOOP_C op that follows this one, |
| 5542 // so that match failures in the following code will return to
there. | 5484 // so that match failures in the following code will return to
there. |
| 5543 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. | 5485 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 5544 fp = StateSave(fp, fp->fPatIdx, status); | 5486 fp = StateSave(fp, fp->fPatIdx, status); |
| 5545 fp->fPatIdx++; | 5487 fp->fPatIdx++; |
| 5546 } | 5488 } |
| 5547 break; | 5489 break; |
| 5548 | 5490 |
| 5549 | 5491 |
| 5550 case URX_LOOP_DOT_I: | 5492 case URX_LOOP_DOT_I: |
| 5551 // Loop Initialization for the optimized implementation of .* | 5493 // Loop Initialization for the optimized implementation of .* |
| 5552 // This op scans through all remaining input. | 5494 // This op scans through all remaining input. |
| 5553 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. | 5495 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
| 5554 { | 5496 { |
| 5555 // Loop through input until the input is exhausted (we reach an
end-of-line) | 5497 // Loop through input until the input is exhausted (we reach an
end-of-line) |
| 5556 // In DOTALL mode, we can just go straight to the end of the inp
ut. | 5498 // In DOTALL mode, we can just go straight to the end of the inp
ut. |
| 5557 int32_t ix; | 5499 int32_t ix; |
| 5558 if ((opValue & 1) == 1) { | 5500 if ((opValue & 1) == 1) { |
| 5559 // Dot-matches-All mode. Jump straight to the end of the st
ring. | 5501 // Dot-matches-All mode. Jump straight to the end of the st
ring. |
| (...skipping 15 matching lines...) Expand all Loading... |
| 5575 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode | 5517 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode |
| 5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028
|| c==0x2029))) { | 5518 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028
|| c==0x2029))) { |
| 5577 // char is a line ending. Put the input pos ba
ck to the | 5519 // char is a line ending. Put the input pos ba
ck to the |
| 5578 // line ending char, and exit the scanning lo
op. | 5520 // line ending char, and exit the scanning lo
op. |
| 5579 U16_BACK_1(inputBuf, 0, ix); | 5521 U16_BACK_1(inputBuf, 0, ix); |
| 5580 break; | 5522 break; |
| 5581 } | 5523 } |
| 5582 } | 5524 } |
| 5583 } | 5525 } |
| 5584 } | 5526 } |
| 5585 | 5527 |
| 5586 // If there were no matching characters, skip over the loop alto
gether. | 5528 // If there were no matching characters, skip over the loop alto
gether. |
| 5587 // The loop doesn't run at all, a * op always succeeds. | 5529 // The loop doesn't run at all, a * op always succeeds. |
| 5588 if (ix == fp->fInputIdx) { | 5530 if (ix == fp->fInputIdx) { |
| 5589 fp->fPatIdx++; // skip the URX_LOOP_C op. | 5531 fp->fPatIdx++; // skip the URX_LOOP_C op. |
| 5590 break; | 5532 break; |
| 5591 } | 5533 } |
| 5592 | 5534 |
| 5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that | 5535 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| 5594 // must follow. Its operand is the stack location | 5536 // must follow. Its operand is the stack location |
| 5595 // that holds the starting input index for the match of this .
* | 5537 // that holds the starting input index for the match of this .
* |
| 5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; | 5538 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| 5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); | 5539 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| 5598 int32_t stackLoc = URX_VAL(loopcOp); | 5540 int32_t stackLoc = URX_VAL(loopcOp); |
| 5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); | 5541 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| 5600 fp->fExtra[stackLoc] = fp->fInputIdx; | 5542 fp->fExtra[stackLoc] = fp->fInputIdx; |
| 5601 fp->fInputIdx = ix; | 5543 fp->fInputIdx = ix; |
| 5602 | 5544 |
| 5603 // Save State to the URX_LOOP_C op that follows this one, | 5545 // Save State to the URX_LOOP_C op that follows this one, |
| 5604 // so that match failures in the following code will return to
there. | 5546 // so that match failures in the following code will return to
there. |
| 5605 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. | 5547 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
| 5606 fp = StateSave(fp, fp->fPatIdx, status); | 5548 fp = StateSave(fp, fp->fPatIdx, status); |
| 5607 fp->fPatIdx++; | 5549 fp->fPatIdx++; |
| 5608 } | 5550 } |
| 5609 break; | 5551 break; |
| 5610 | 5552 |
| 5611 | 5553 |
| 5612 case URX_LOOP_C: | 5554 case URX_LOOP_C: |
| 5613 { | 5555 { |
| 5614 U_ASSERT(opValue>=0 && opValue<fFrameSize); | 5556 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| 5615 backSearchIndex = (int32_t)fp->fExtra[opValue]; | 5557 backSearchIndex = (int32_t)fp->fExtra[opValue]; |
| 5616 U_ASSERT(backSearchIndex <= fp->fInputIdx); | 5558 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| 5617 if (backSearchIndex == fp->fInputIdx) { | 5559 if (backSearchIndex == fp->fInputIdx) { |
| 5618 // We've backed up the input idx to the point that the loop
started. | 5560 // We've backed up the input idx to the point that the loop
started. |
| 5619 // The loop is done. Leave here without saving state. | 5561 // The loop is done. Leave here without saving state. |
| 5620 // Subsequent failures won't come back here. | 5562 // Subsequent failures won't come back here. |
| 5621 break; | 5563 break; |
| 5622 } | 5564 } |
| 5623 // Set up for the next iteration of the loop, with input index | 5565 // Set up for the next iteration of the loop, with input index |
| 5624 // backed up by one from the last time through, | 5566 // backed up by one from the last time through, |
| 5625 // and a state save to this instruction in case the following
code fails again. | 5567 // and a state save to this instruction in case the following
code fails again. |
| 5626 // (We're going backwards because this loop emulates stack unw
inding, not | 5568 // (We're going backwards because this loop emulates stack unw
inding, not |
| 5627 // the initial scan forward.) | 5569 // the initial scan forward.) |
| 5628 U_ASSERT(fp->fInputIdx > 0); | 5570 U_ASSERT(fp->fInputIdx > 0); |
| 5629 UChar32 prevC; | 5571 UChar32 prevC; |
| 5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this
0 be one of f*Limit? | 5572 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this
0 be one of f*Limit? |
| 5631 | 5573 |
| 5632 if (prevC == 0x0a && | 5574 if (prevC == 0x0a && |
| 5633 fp->fInputIdx > backSearchIndex && | 5575 fp->fInputIdx > backSearchIndex && |
| 5634 inputBuf[fp->fInputIdx-1] == 0x0d) { | 5576 inputBuf[fp->fInputIdx-1] == 0x0d) { |
| 5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; | 5577 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| 5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { | 5578 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| 5637 // .*, stepping back over CRLF pair. | 5579 // .*, stepping back over CRLF pair. |
| 5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx); | 5580 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| 5639 } | 5581 } |
| 5640 } | 5582 } |
| 5641 | 5583 |
| 5642 | 5584 |
| 5643 fp = StateSave(fp, fp->fPatIdx-1, status); | 5585 fp = StateSave(fp, fp->fPatIdx-1, status); |
| 5644 } | 5586 } |
| 5645 break; | 5587 break; |
| 5646 | 5588 |
| 5647 | 5589 |
| 5648 | 5590 |
| 5649 default: | 5591 default: |
| 5650 // Trouble. The compiled pattern contains an entry with an | 5592 // Trouble. The compiled pattern contains an entry with an |
| 5651 // unrecognized type tag. | 5593 // unrecognized type tag. |
| 5652 U_ASSERT(FALSE); | 5594 U_ASSERT(FALSE); |
| 5653 } | 5595 } |
| 5654 | 5596 |
| 5655 if (U_FAILURE(status)) { | 5597 if (U_FAILURE(status)) { |
| 5656 isMatch = FALSE; | 5598 isMatch = FALSE; |
| 5657 break; | 5599 break; |
| 5658 } | 5600 } |
| 5659 } | 5601 } |
| 5660 | 5602 |
| 5661 breakFromLoop: | 5603 breakFromLoop: |
| 5662 fMatch = isMatch; | 5604 fMatch = isMatch; |
| 5663 if (isMatch) { | 5605 if (isMatch) { |
| 5664 fLastMatchEnd = fMatchEnd; | 5606 fLastMatchEnd = fMatchEnd; |
| 5665 fMatchStart = startIdx; | 5607 fMatchStart = startIdx; |
| 5666 fMatchEnd = fp->fInputIdx; | 5608 fMatchEnd = fp->fInputIdx; |
| 5667 if (fTraceDebug) { | 5609 } |
| 5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta
rt, fMatchEnd)); | 5610 |
| 5611 #ifdef REGEX_RUN_DEBUG |
| 5612 if (fTraceDebug) { |
| 5613 if (isMatch) { |
| 5614 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| 5615 } else { |
| 5616 printf("No match\n\n"); |
| 5669 } | 5617 } |
| 5670 } | 5618 } |
| 5671 else | 5619 #endif |
| 5672 { | 5620 |
| 5673 if (fTraceDebug) { | |
| 5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); | |
| 5675 } | |
| 5676 } | |
| 5677 | |
| 5678 fFrame = fp; // The active stack frame when the engine stoppe
d. | 5621 fFrame = fp; // The active stack frame when the engine stoppe
d. |
| 5679 // Contains the capture group results that we need to | 5622 // Contains the capture group results that we
need to |
| 5680 // access later. | 5623 // access later. |
| 5681 | 5624 |
| 5682 return; | 5625 return; |
| 5683 } | 5626 } |
| 5684 | 5627 |
| 5685 | 5628 |
| 5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) | 5629 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) |
| 5687 | 5630 |
| 5688 U_NAMESPACE_END | 5631 U_NAMESPACE_END |
| 5689 | 5632 |
| 5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 5633 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| OLD | NEW |