OLD | NEW |
1 /* | 1 /* |
2 ************************************************************************** | 2 ************************************************************************** |
3 * Copyright (C) 2002-2013 International Business Machines Corporation * | 3 * Copyright (C) 2002-2014 International Business Machines Corporation * |
4 * and others. All rights reserved. * | 4 * and others. All rights reserved. * |
5 ************************************************************************** | 5 ************************************************************************** |
6 */ | 6 */ |
7 // | 7 // |
8 // file: rematch.cpp | 8 // file: rematch.cpp |
9 // | 9 // |
10 // Contains the implementation of class RegexMatcher, | 10 // Contains the implementation of class RegexMatcher, |
11 // which is one of the main API classes for the ICU regular expression p
ackage. | 11 // which is one of the main API classes for the ICU regular expression p
ackage. |
12 // | 12 // |
13 | 13 |
(...skipping 12 matching lines...) Expand all Loading... |
26 #include "uvector.h" | 26 #include "uvector.h" |
27 #include "uvectr32.h" | 27 #include "uvectr32.h" |
28 #include "uvectr64.h" | 28 #include "uvectr64.h" |
29 #include "regeximp.h" | 29 #include "regeximp.h" |
30 #include "regexst.h" | 30 #include "regexst.h" |
31 #include "regextxt.h" | 31 #include "regextxt.h" |
32 #include "ucase.h" | 32 #include "ucase.h" |
33 | 33 |
34 // #include <malloc.h> // Needed for heapcheck testing | 34 // #include <malloc.h> // Needed for heapcheck testing |
35 | 35 |
36 | |
37 // Find progress callback | |
38 // ---------------------- | |
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary
function call. | |
40 // | |
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ | |
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FAL
SE) | |
43 | |
44 | |
45 // Smart Backtracking | |
46 // ------------------ | |
47 // When a failure would go back to a LOOP_C instruction, | |
48 // strings, characters, and setrefs scan backwards for a valid start | |
49 // character themselves, pop the stack, and save state, emulating the | |
50 // LOOP_C's effect but assured that the next character of input is a | |
51 // possible matching character. | |
52 // | |
53 // Good idea in theory; unfortunately it only helps out a few specific | |
54 // cases and slows the engine down a little in the rest. | |
55 | |
56 U_NAMESPACE_BEGIN | 36 U_NAMESPACE_BEGIN |
57 | 37 |
58 // Default limit for the size of the back track stack, to avoid system | 38 // Default limit for the size of the back track stack, to avoid system |
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. | 39 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. |
60 // This value puts ICU's limits higher than most other regexp implementations, | 40 // This value puts ICU's limits higher than most other regexp implementations, |
61 // which use recursion rather than the heap, and take more storage per | 41 // which use recursion rather than the heap, and take more storage per |
62 // backtrack point. | 42 // backtrack point. |
63 // | 43 // |
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; | 44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; |
65 | 45 |
66 // Time limit counter constant. | 46 // Time limit counter constant. |
67 // Time limits for expression evaluation are in terms of quanta of work by | 47 // Time limits for expression evaluation are in terms of quanta of work by |
68 // the engine, each of which is 10,000 state saves. | 48 // the engine, each of which is 10,000 state saves. |
69 // This constant determines that state saves per tick number. | 49 // This constant determines that state saves per tick number. |
70 static const int32_t TIMER_INITIAL_VALUE = 10000; | 50 static const int32_t TIMER_INITIAL_VALUE = 10000; |
71 | 51 |
72 //----------------------------------------------------------------------------- | 52 //----------------------------------------------------------------------------- |
73 // | 53 // |
74 // Constructor and Destructor | 54 // Constructor and Destructor |
75 // | 55 // |
76 //----------------------------------------------------------------------------- | 56 //----------------------------------------------------------------------------- |
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { | 57 RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
78 fDeferredStatus = U_ZERO_ERROR; | 58 fDeferredStatus = U_ZERO_ERROR; |
79 init(fDeferredStatus); | 59 init(fDeferredStatus); |
80 if (U_FAILURE(fDeferredStatus)) { | 60 if (U_FAILURE(fDeferredStatus)) { |
81 return; | 61 return; |
82 } | 62 } |
83 if (pat==NULL) { | 63 if (pat==NULL) { |
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; | 64 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
85 return; | 65 return; |
86 } | 66 } |
87 fPattern = pat; | 67 fPattern = pat; |
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); | 68 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); |
89 } | 69 } |
90 | 70 |
91 | 71 |
92 | 72 |
93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
ut, | 73 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
ut, |
94 uint32_t flags, UErrorCode &status) { | 74 uint32_t flags, UErrorCode &status) { |
95 init(status); | 75 init(status); |
96 if (U_FAILURE(status)) { | 76 if (U_FAILURE(status)) { |
97 return; | 77 return; |
98 } | 78 } |
99 UParseError pe; | 79 UParseError pe; |
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 80 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
101 fPattern = fPatternOwned; | 81 fPattern = fPatternOwned; |
102 | 82 |
103 UText inputText = UTEXT_INITIALIZER; | 83 UText inputText = UTEXT_INITIALIZER; |
104 utext_openConstUnicodeString(&inputText, &input, &status); | 84 utext_openConstUnicodeString(&inputText, &input, &status); |
105 init2(&inputText, status); | 85 init2(&inputText, status); |
106 utext_close(&inputText); | 86 utext_close(&inputText); |
107 | 87 |
108 fInputUniStrMaybeMutable = TRUE; | 88 fInputUniStrMaybeMutable = TRUE; |
109 } | 89 } |
110 | 90 |
111 | 91 |
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, | 92 RegexMatcher::RegexMatcher(UText *regexp, UText *input, |
113 uint32_t flags, UErrorCode &status) { | 93 uint32_t flags, UErrorCode &status) { |
114 init(status); | 94 init(status); |
115 if (U_FAILURE(status)) { | 95 if (U_FAILURE(status)) { |
116 return; | 96 return; |
117 } | 97 } |
118 UParseError pe; | 98 UParseError pe; |
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 99 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
120 if (U_FAILURE(status)) { | 100 if (U_FAILURE(status)) { |
121 return; | 101 return; |
122 } | 102 } |
123 | 103 |
124 fPattern = fPatternOwned; | 104 fPattern = fPatternOwned; |
125 init2(input, status); | 105 init2(input, status); |
126 } | 106 } |
127 | 107 |
128 | 108 |
129 RegexMatcher::RegexMatcher(const UnicodeString &regexp, | 109 RegexMatcher::RegexMatcher(const UnicodeString &regexp, |
130 uint32_t flags, UErrorCode &status) { | 110 uint32_t flags, UErrorCode &status) { |
131 init(status); | 111 init(status); |
132 if (U_FAILURE(status)) { | 112 if (U_FAILURE(status)) { |
133 return; | 113 return; |
134 } | 114 } |
135 UParseError pe; | 115 UParseError pe; |
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 116 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
137 if (U_FAILURE(status)) { | 117 if (U_FAILURE(status)) { |
138 return; | 118 return; |
139 } | 119 } |
140 fPattern = fPatternOwned; | 120 fPattern = fPatternOwned; |
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); | 121 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
142 } | 122 } |
143 | 123 |
144 RegexMatcher::RegexMatcher(UText *regexp, | 124 RegexMatcher::RegexMatcher(UText *regexp, |
145 uint32_t flags, UErrorCode &status) { | 125 uint32_t flags, UErrorCode &status) { |
146 init(status); | 126 init(status); |
147 if (U_FAILURE(status)) { | 127 if (U_FAILURE(status)) { |
148 return; | 128 return; |
149 } | 129 } |
150 UParseError pe; | 130 UParseError pe; |
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); | 131 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
152 if (U_FAILURE(status)) { | 132 if (U_FAILURE(status)) { |
153 return; | 133 return; |
154 } | 134 } |
155 | 135 |
156 fPattern = fPatternOwned; | 136 fPattern = fPatternOwned; |
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); | 137 init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
158 } | 138 } |
159 | 139 |
160 | 140 |
161 | 141 |
162 | 142 |
163 RegexMatcher::~RegexMatcher() { | 143 RegexMatcher::~RegexMatcher() { |
164 delete fStack; | 144 delete fStack; |
165 if (fData != fSmallData) { | 145 if (fData != fSmallData) { |
166 uprv_free(fData); | 146 uprv_free(fData); |
167 fData = NULL; | 147 fData = NULL; |
168 } | 148 } |
169 if (fPatternOwned) { | 149 if (fPatternOwned) { |
170 delete fPatternOwned; | 150 delete fPatternOwned; |
171 fPatternOwned = NULL; | 151 fPatternOwned = NULL; |
172 fPattern = NULL; | 152 fPattern = NULL; |
173 } | 153 } |
174 | 154 |
175 if (fInput) { | 155 if (fInput) { |
176 delete fInput; | 156 delete fInput; |
177 } | 157 } |
178 if (fInputText) { | 158 if (fInputText) { |
179 utext_close(fInputText); | 159 utext_close(fInputText); |
180 } | 160 } |
181 if (fAltInputText) { | 161 if (fAltInputText) { |
182 utext_close(fAltInputText); | 162 utext_close(fAltInputText); |
183 } | 163 } |
184 | 164 |
185 #if UCONFIG_NO_BREAK_ITERATION==0 | 165 #if UCONFIG_NO_BREAK_ITERATION==0 |
186 delete fWordBreakItr; | 166 delete fWordBreakItr; |
187 #endif | 167 #endif |
188 } | 168 } |
189 | 169 |
190 // | 170 // |
191 // init() common initialization for use by all constructors. | 171 // init() common initialization for use by all constructors. |
192 // Initialize all fields, get the object into a consistent state. | 172 // Initialize all fields, get the object into a consistent state. |
193 // This must be done even when the initial status shows an error, | 173 // This must be done even when the initial status shows an error, |
194 // so that the object is initialized sufficiently well for the destru
ctor | 174 // so that the object is initialized sufficiently well for the destru
ctor |
(...skipping 27 matching lines...) Expand all Loading... |
222 fTickCounter = 0; | 202 fTickCounter = 0; |
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; | 203 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; |
224 fCallbackFn = NULL; | 204 fCallbackFn = NULL; |
225 fCallbackContext = NULL; | 205 fCallbackContext = NULL; |
226 fFindProgressCallbackFn = NULL; | 206 fFindProgressCallbackFn = NULL; |
227 fFindProgressCallbackContext = NULL; | 207 fFindProgressCallbackContext = NULL; |
228 fTraceDebug = FALSE; | 208 fTraceDebug = FALSE; |
229 fDeferredStatus = status; | 209 fDeferredStatus = status; |
230 fData = fSmallData; | 210 fData = fSmallData; |
231 fWordBreakItr = NULL; | 211 fWordBreakItr = NULL; |
232 | 212 |
233 fStack = NULL; | 213 fStack = NULL; |
234 fInputText = NULL; | 214 fInputText = NULL; |
235 fAltInputText = NULL; | 215 fAltInputText = NULL; |
236 fInput = NULL; | 216 fInput = NULL; |
237 fInputLength = 0; | 217 fInputLength = 0; |
238 fInputUniStrMaybeMutable = FALSE; | 218 fInputUniStrMaybeMutable = FALSE; |
239 | 219 |
240 if (U_FAILURE(status)) { | 220 if (U_FAILURE(status)) { |
241 fDeferredStatus = status; | 221 fDeferredStatus = status; |
242 } | 222 } |
243 } | 223 } |
244 | 224 |
245 // | 225 // |
246 // init2() Common initialization for use by RegexMatcher constructors, part 2
. | 226 // init2() Common initialization for use by RegexMatcher constructors, part 2
. |
247 // This handles the common setup to be done after the Pattern is avai
lable. | 227 // This handles the common setup to be done after the Pattern is avai
lable. |
248 // | 228 // |
249 void RegexMatcher::init2(UText *input, UErrorCode &status) { | 229 void RegexMatcher::init2(UText *input, UErrorCode &status) { |
250 if (U_FAILURE(status)) { | 230 if (U_FAILURE(status)) { |
251 fDeferredStatus = status; | 231 fDeferredStatus = status; |
252 return; | 232 return; |
253 } | 233 } |
254 | 234 |
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0])
)) { | 235 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0])
)) { |
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); | 236 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
257 if (fData == NULL) { | 237 if (fData == NULL) { |
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 238 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
259 return; | 239 return; |
260 } | 240 } |
261 } | 241 } |
262 | 242 |
263 fStack = new UVector64(status); | 243 fStack = new UVector64(status); |
264 if (fStack == NULL) { | 244 if (fStack == NULL) { |
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 245 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
266 return; | 246 return; |
(...skipping 12 matching lines...) Expand all Loading... |
279 static const UChar DOLLARSIGN = 0x24; | 259 static const UChar DOLLARSIGN = 0x24; |
280 //------------------------------------------------------------------------------
-- | 260 //------------------------------------------------------------------------------
-- |
281 // | 261 // |
282 // appendReplacement | 262 // appendReplacement |
283 // | 263 // |
284 //------------------------------------------------------------------------------
-- | 264 //------------------------------------------------------------------------------
-- |
285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, | 265 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, |
286 const UnicodeString &replacement, | 266 const UnicodeString &replacement, |
287 UErrorCode &status) { | 267 UErrorCode &status) { |
288 UText replacementText = UTEXT_INITIALIZER; | 268 UText replacementText = UTEXT_INITIALIZER; |
289 | 269 |
290 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 270 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
291 if (U_SUCCESS(status)) { | 271 if (U_SUCCESS(status)) { |
292 UText resultText = UTEXT_INITIALIZER; | 272 UText resultText = UTEXT_INITIALIZER; |
293 utext_openUnicodeString(&resultText, &dest, &status); | 273 utext_openUnicodeString(&resultText, &dest, &status); |
294 | 274 |
295 if (U_SUCCESS(status)) { | 275 if (U_SUCCESS(status)) { |
296 appendReplacement(&resultText, &replacementText, status); | 276 appendReplacement(&resultText, &replacementText, status); |
297 utext_close(&resultText); | 277 utext_close(&resultText); |
298 } | 278 } |
299 utext_close(&replacementText); | 279 utext_close(&replacementText); |
300 } | 280 } |
301 | 281 |
302 return *this; | 282 return *this; |
303 } | 283 } |
304 | 284 |
305 // | 285 // |
306 // appendReplacement, UText mode | 286 // appendReplacement, UText mode |
307 // | 287 // |
308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, | 288 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
309 UText *replacement, | 289 UText *replacement, |
310 UErrorCode &status) { | 290 UErrorCode &status) { |
311 if (U_FAILURE(status)) { | 291 if (U_FAILURE(status)) { |
312 return *this; | 292 return *this; |
313 } | 293 } |
314 if (U_FAILURE(fDeferredStatus)) { | 294 if (U_FAILURE(fDeferredStatus)) { |
315 status = fDeferredStatus; | 295 status = fDeferredStatus; |
316 return *this; | 296 return *this; |
317 } | 297 } |
318 if (fMatch == FALSE) { | 298 if (fMatch == FALSE) { |
319 status = U_REGEX_INVALID_STATE; | 299 status = U_REGEX_INVALID_STATE; |
320 return *this; | 300 return *this; |
321 } | 301 } |
322 | 302 |
323 // Copy input string from the end of previous match to start of current matc
h | 303 // Copy input string from the end of previous match to start of current matc
h |
324 int64_t destLen = utext_nativeLength(dest); | 304 int64_t destLen = utext_nativeLength(dest); |
325 if (fMatchStart > fAppendPosition) { | 305 if (fMatchStart > fAppendPosition) { |
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 306 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo
ntents+fAppendPosition, | 307 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo
ntents+fAppendPosition, |
328 (int32_t)(fMatchStart-fAppendPosition), &st
atus); | 308 (int32_t)(fMatchStart-fAppendPosition), &st
atus); |
329 } else { | 309 } else { |
330 int32_t len16; | 310 int32_t len16; |
331 if (UTEXT_USES_U16(fInputText)) { | 311 if (UTEXT_USES_U16(fInputText)) { |
332 len16 = (int32_t)(fMatchStart-fAppendPosition); | 312 len16 = (int32_t)(fMatchStart-fAppendPosition); |
333 } else { | 313 } else { |
334 UErrorCode lengthStatus = U_ZERO_ERROR; | 314 UErrorCode lengthStatus = U_ZERO_ERROR; |
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart,
NULL, 0, &lengthStatus); | 315 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart,
NULL, 0, &lengthStatus); |
336 } | 316 } |
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); | 317 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
338 if (inputChars == NULL) { | 318 if (inputChars == NULL) { |
339 status = U_MEMORY_ALLOCATION_ERROR; | 319 status = U_MEMORY_ALLOCATION_ERROR; |
340 return *this; | 320 return *this; |
341 } | 321 } |
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars,
len16+1, &status); | 322 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars,
len16+1, &status); |
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16,
&status); | 323 destLen += utext_replace(dest, destLen, destLen, inputChars, len16,
&status); |
344 uprv_free(inputChars); | 324 uprv_free(inputChars); |
345 } | 325 } |
346 } | 326 } |
347 fAppendPosition = fMatchEnd; | 327 fAppendPosition = fMatchEnd; |
348 | 328 |
349 | 329 |
350 // scan the replacement text, looking for substitutions ($n) and \escapes. | 330 // scan the replacement text, looking for substitutions ($n) and \escapes. |
351 // TODO: optimize this loop by efficiently scanning for '$' or '\', | 331 // TODO: optimize this loop by efficiently scanning for '$' or '\', |
352 // move entire ranges not containing substitutions. | 332 // move entire ranges not containing substitutions. |
353 UTEXT_SETNATIVEINDEX(replacement, 0); | 333 UTEXT_SETNATIVEINDEX(replacement, 0); |
354 UChar32 c = UTEXT_NEXT32(replacement); | 334 UChar32 c = UTEXT_NEXT32(replacement); |
355 while (c != U_SENTINEL) { | 335 while (c != U_SENTINEL) { |
356 if (c == BACKSLASH) { | 336 if (c == BACKSLASH) { |
357 // Backslash Escape. Copy the following char out without further ch
ecks. | 337 // Backslash Escape. Copy the following char out without further ch
ecks. |
358 // Note: Surrogate pairs don't need any special
handling | 338 // Note: Surrogate pairs don't need any special
handling |
359 // The second half wont be a '$' or a '\',
and | 339 // The second half wont be a '$' or a '\',
and |
360 // will move to the dest normally on the n
ext | 340 // will move to the dest normally on the n
ext |
361 // loop iteration. | 341 // loop iteration. |
362 c = UTEXT_CURRENT32(replacement); | 342 c = UTEXT_CURRENT32(replacement); |
363 if (c == U_SENTINEL) { | 343 if (c == U_SENTINEL) { |
364 break; | 344 break; |
365 } | 345 } |
366 | 346 |
367 if (c==0x55/*U*/ || c==0x75/*u*/) { | 347 if (c==0x55/*U*/ || c==0x75/*u*/) { |
368 // We have a \udddd or \Udddddddd escape sequence. | 348 // We have a \udddd or \Udddddddd escape sequence. |
369 int32_t offset = 0; | 349 int32_t offset = 0; |
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN
ESCAPE_CONTEXT(replacement); | 350 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN
ESCAPE_CONTEXT(replacement); |
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt,
&offset, INT32_MAX, &context); | 351 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt,
&offset, INT32_MAX, &context); |
372 if (escapedChar != (UChar32)0xFFFFFFFF) { | 352 if (escapedChar != (UChar32)0xFFFFFFFF) { |
373 if (U_IS_BMP(escapedChar)) { | 353 if (U_IS_BMP(escapedChar)) { |
374 UChar c16 = (UChar)escapedChar; | 354 UChar c16 = (UChar)escapedChar; |
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1
, &status); | 355 destLen += utext_replace(dest, destLen, destLen, &c16, 1
, &status); |
376 } else { | 356 } else { |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
414 surrogate[0] = U16_LEAD(c); | 394 surrogate[0] = U16_LEAD(c); |
415 surrogate[1] = U16_TRAIL(c); | 395 surrogate[1] = U16_TRAIL(c); |
416 if (U_SUCCESS(status)) { | 396 if (U_SUCCESS(status)) { |
417 destLen += utext_replace(dest, destLen, destLen, surrogate,
2, &status); | 397 destLen += utext_replace(dest, destLen, destLen, surrogate,
2, &status); |
418 } | 398 } |
419 } | 399 } |
420 } else { | 400 } else { |
421 // We've got a $. Pick up a capture group number if one follows. | 401 // We've got a $. Pick up a capture group number if one follows. |
422 // Consume at most the number of digits necessary for the largest ca
pture | 402 // Consume at most the number of digits necessary for the largest ca
pture |
423 // number that is valid for this pattern. | 403 // number that is valid for this pattern. |
424 | 404 |
425 int32_t numDigits = 0; | 405 int32_t numDigits = 0; |
426 int32_t groupNum = 0; | 406 int32_t groupNum = 0; |
427 UChar32 digitC; | 407 UChar32 digitC; |
428 for (;;) { | 408 for (;;) { |
429 digitC = UTEXT_CURRENT32(replacement); | 409 digitC = UTEXT_CURRENT32(replacement); |
430 if (digitC == U_SENTINEL) { | 410 if (digitC == U_SENTINEL) { |
431 break; | 411 break; |
432 } | 412 } |
433 if (u_isdigit(digitC) == FALSE) { | 413 if (u_isdigit(digitC) == FALSE) { |
434 break; | 414 break; |
435 } | 415 } |
436 (void)UTEXT_NEXT32(replacement); | 416 (void)UTEXT_NEXT32(replacement); |
437 groupNum=groupNum*10 + u_charDigitValue(digitC); | 417 groupNum=groupNum*10 + u_charDigitValue(digitC); |
438 numDigits++; | 418 numDigits++; |
439 if (numDigits >= fPattern->fMaxCaptureDigits) { | 419 if (numDigits >= fPattern->fMaxCaptureDigits) { |
440 break; | 420 break; |
441 } | 421 } |
442 } | 422 } |
443 | 423 |
444 | 424 |
445 if (numDigits == 0) { | 425 if (numDigits == 0) { |
446 // The $ didn't introduce a group number at all. | 426 // The $ didn't introduce a group number at all. |
447 // Treat it as just part of the substitution text. | 427 // Treat it as just part of the substitution text. |
448 UChar c16 = DOLLARSIGN; | 428 UChar c16 = DOLLARSIGN; |
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); | 429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu
s); |
450 } else { | 430 } else { |
451 // Finally, append the capture group data to the destination. | 431 // Finally, append the capture group data to the destination. |
452 destLen += appendGroup(groupNum, dest, status); | 432 destLen += appendGroup(groupNum, dest, status); |
453 if (U_FAILURE(status)) { | 433 if (U_FAILURE(status)) { |
454 // Can fail if group number is out of range. | 434 // Can fail if group number is out of range. |
455 break; | 435 break; |
456 } | 436 } |
457 } | 437 } |
458 } | 438 } |
459 | 439 |
460 if (U_FAILURE(status)) { | 440 if (U_FAILURE(status)) { |
461 break; | 441 break; |
462 } else { | 442 } else { |
463 c = UTEXT_NEXT32(replacement); | 443 c = UTEXT_NEXT32(replacement); |
464 } | 444 } |
465 } | 445 } |
466 | 446 |
467 return *this; | 447 return *this; |
468 } | 448 } |
469 | 449 |
470 | 450 |
471 | 451 |
472 //------------------------------------------------------------------------------
-- | 452 //------------------------------------------------------------------------------
-- |
473 // | 453 // |
474 // appendTail Intended to be used in conjunction with appendReplacement() | 454 // appendTail Intended to be used in conjunction with appendReplacement() |
475 // To the destination string, append everything following | 455 // To the destination string, append everything following |
476 // the last match position from the input string. | 456 // the last match position from the input string. |
477 // | 457 // |
478 // Note: Match ranges do not affect appendTail or appendRepla
cement | 458 // Note: Match ranges do not affect appendTail or appendRepla
cement |
479 // | 459 // |
480 //------------------------------------------------------------------------------
-- | 460 //------------------------------------------------------------------------------
-- |
481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { | 461 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { |
482 UErrorCode status = U_ZERO_ERROR; | 462 UErrorCode status = U_ZERO_ERROR; |
483 UText resultText = UTEXT_INITIALIZER; | 463 UText resultText = UTEXT_INITIALIZER; |
484 utext_openUnicodeString(&resultText, &dest, &status); | 464 utext_openUnicodeString(&resultText, &dest, &status); |
485 | 465 |
486 if (U_SUCCESS(status)) { | 466 if (U_SUCCESS(status)) { |
487 appendTail(&resultText, status); | 467 appendTail(&resultText, status); |
488 utext_close(&resultText); | 468 utext_close(&resultText); |
489 } | 469 } |
490 | 470 |
491 return dest; | 471 return dest; |
492 } | 472 } |
493 | 473 |
494 // | 474 // |
495 // appendTail, UText mode | 475 // appendTail, UText mode |
496 // | 476 // |
497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { | 477 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { |
498 UBool bailOut = FALSE; | |
499 if (U_FAILURE(status)) { | 478 if (U_FAILURE(status)) { |
500 bailOut = TRUE; | 479 return dest; |
501 } | 480 } |
502 if (U_FAILURE(fDeferredStatus)) { | 481 if (U_FAILURE(fDeferredStatus)) { |
503 status = fDeferredStatus; | 482 status = fDeferredStatus; |
504 bailOut = TRUE; | 483 return dest; |
505 } | 484 } |
506 | 485 |
507 if (bailOut) { | |
508 // dest must not be NULL | |
509 if (dest) { | |
510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(des
t), NULL, 0, &status); | |
511 return dest; | |
512 } | |
513 } | |
514 | |
515 if (fInputLength > fAppendPosition) { | 486 if (fInputLength > fAppendPosition) { |
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 487 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
517 int64_t destLen = utext_nativeLength(dest); | 488 int64_t destLen = utext_nativeLength(dest); |
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp
endPosition, | 489 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp
endPosition, |
519 (int32_t)(fInputLength-fAppendPosition), &status); | 490 (int32_t)(fInputLength-fAppendPosition), &status); |
520 } else { | 491 } else { |
521 int32_t len16; | 492 int32_t len16; |
522 if (UTEXT_USES_U16(fInputText)) { | 493 if (UTEXT_USES_U16(fInputText)) { |
523 len16 = (int32_t)(fInputLength-fAppendPosition); | 494 len16 = (int32_t)(fInputLength-fAppendPosition); |
524 } else { | 495 } else { |
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength,
NULL, 0, &status); | 496 len16 = utext_extract(fInputText, fAppendPosition, fInputLength,
NULL, 0, &status); |
526 status = U_ZERO_ERROR; // buffer overflow | 497 status = U_ZERO_ERROR; // buffer overflow |
527 } | 498 } |
528 | 499 |
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); | 500 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); |
530 if (inputChars == NULL) { | 501 if (inputChars == NULL) { |
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | 502 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
532 } else { | 503 } else { |
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh
ars, len16, &status); // unterminated | 504 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh
ars, len16, &status); // unterminated |
534 int64_t destLen = utext_nativeLength(dest); | 505 int64_t destLen = utext_nativeLength(dest); |
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status
); | 506 utext_replace(dest, destLen, destLen, inputChars, len16, &status
); |
536 uprv_free(inputChars); | 507 uprv_free(inputChars); |
537 } | 508 } |
538 } | 509 } |
539 } | 510 } |
540 return dest; | 511 return dest; |
541 } | 512 } |
542 | 513 |
543 | 514 |
(...skipping 18 matching lines...) Expand all Loading... |
562 if (fMatch == FALSE) { | 533 if (fMatch == FALSE) { |
563 err = U_REGEX_INVALID_STATE; | 534 err = U_REGEX_INVALID_STATE; |
564 return -1; | 535 return -1; |
565 } | 536 } |
566 if (group < 0 || group > fPattern->fGroupMap->size()) { | 537 if (group < 0 || group > fPattern->fGroupMap->size()) { |
567 err = U_INDEX_OUTOFBOUNDS_ERROR; | 538 err = U_INDEX_OUTOFBOUNDS_ERROR; |
568 return -1; | 539 return -1; |
569 } | 540 } |
570 int64_t e = -1; | 541 int64_t e = -1; |
571 if (group == 0) { | 542 if (group == 0) { |
572 e = fMatchEnd; | 543 e = fMatchEnd; |
573 } else { | 544 } else { |
574 // Get the position within the stack frame of the variables for | 545 // Get the position within the stack frame of the variables for |
575 // this capture group. | 546 // this capture group. |
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); | 547 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
577 U_ASSERT(groupOffset < fPattern->fFrameSize); | 548 U_ASSERT(groupOffset < fPattern->fFrameSize); |
578 U_ASSERT(groupOffset >= 0); | 549 U_ASSERT(groupOffset >= 0); |
579 e = fFrame->fExtra[groupOffset + 1]; | 550 e = fFrame->fExtra[groupOffset + 1]; |
580 } | 551 } |
581 | 552 |
582 return e; | 553 return e; |
583 } | 554 } |
584 | 555 |
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { | 556 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { |
586 return (int32_t)end64(group, err); | 557 return (int32_t)end64(group, err); |
587 } | 558 } |
588 | 559 |
| 560 //------------------------------------------------------------------------------
-- |
| 561 // |
| 562 // findProgressInterrupt This function is called once for each advance in the
target |
| 563 // string from the find() function, and calls the user
progress callback |
| 564 // function if there is one installed. |
| 565 // |
| 566 // Return: TRUE if the find operation is to be terminated. |
| 567 // FALSE if the find operation is to continue running. |
| 568 // |
| 569 //------------------------------------------------------------------------------
-- |
| 570 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { |
| 571 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCall
backContext, pos)) { |
| 572 status = U_REGEX_STOPPED_BY_CALLER; |
| 573 return TRUE; |
| 574 } |
| 575 return FALSE; |
| 576 } |
589 | 577 |
590 //------------------------------------------------------------------------------
-- | 578 //------------------------------------------------------------------------------
-- |
591 // | 579 // |
592 // find() | 580 // find() |
593 // | 581 // |
594 //------------------------------------------------------------------------------
-- | 582 //------------------------------------------------------------------------------
-- |
595 UBool RegexMatcher::find() { | 583 UBool RegexMatcher::find() { |
| 584 if (U_FAILURE(fDeferredStatus)) { |
| 585 return FALSE; |
| 586 } |
| 587 UErrorCode status = U_ZERO_ERROR; |
| 588 UBool result = find(status); |
| 589 return result; |
| 590 } |
| 591 |
| 592 //------------------------------------------------------------------------------
-- |
| 593 // |
| 594 // find() |
| 595 // |
| 596 //------------------------------------------------------------------------------
-- |
| 597 UBool RegexMatcher::find(UErrorCode &status) { |
596 // Start at the position of the last match end. (Will be zero if the | 598 // Start at the position of the last match end. (Will be zero if the |
597 // matcher has been reset.) | 599 // matcher has been reset.) |
598 // | 600 // |
599 if (U_FAILURE(fDeferredStatus)) { | 601 if (U_FAILURE(status)) { |
600 return FALSE; | 602 return FALSE; |
601 } | 603 } |
602 | 604 if (U_FAILURE(fDeferredStatus)) { |
| 605 status = fDeferredStatus; |
| 606 return FALSE; |
| 607 } |
| 608 |
603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
604 return findUsingChunk(); | 610 return findUsingChunk(status); |
605 } | 611 } |
606 | 612 |
607 int64_t startPos = fMatchEnd; | 613 int64_t startPos = fMatchEnd; |
608 if (startPos==0) { | 614 if (startPos==0) { |
609 startPos = fActiveStart; | 615 startPos = fActiveStart; |
610 } | 616 } |
611 | 617 |
612 if (fMatch) { | 618 if (fMatch) { |
613 // Save the position of any previous successful match. | 619 // Save the position of any previous successful match. |
614 fLastMatchEnd = fMatchEnd; | 620 fLastMatchEnd = fMatchEnd; |
(...skipping 27 matching lines...) Expand all Loading... |
642 // Be aware of possible overflows if making changes here. | 648 // Be aware of possible overflows if making changes here. |
643 int64_t testStartLimit; | 649 int64_t testStartLimit; |
644 if (UTEXT_USES_U16(fInputText)) { | 650 if (UTEXT_USES_U16(fInputText)) { |
645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; | 651 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; |
646 if (startPos > testStartLimit) { | 652 if (startPos > testStartLimit) { |
647 fMatch = FALSE; | 653 fMatch = FALSE; |
648 fHitEnd = TRUE; | 654 fHitEnd = TRUE; |
649 return FALSE; | 655 return FALSE; |
650 } | 656 } |
651 } else { | 657 } else { |
652 // For now, let the matcher discover that it can't match on its own | 658 // We don't know exactly how long the minimum match length is in native
characters. |
653 // We don't know how long the match len is in native characters | 659 // Treat anything > 0 as 1. |
654 testStartLimit = fActiveLimit; | 660 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); |
655 } | 661 } |
656 | 662 |
657 UChar32 c; | 663 UChar32 c; |
658 U_ASSERT(startPos >= 0); | 664 U_ASSERT(startPos >= 0); |
659 | 665 |
660 switch (fPattern->fStartType) { | 666 switch (fPattern->fStartType) { |
661 case START_NO_INFO: | 667 case START_NO_INFO: |
662 // No optimization was found. | 668 // No optimization was found. |
663 // Try a match at each input position. | 669 // Try a match at each input position. |
664 for (;;) { | 670 for (;;) { |
665 MatchAt(startPos, FALSE, fDeferredStatus); | 671 MatchAt(startPos, FALSE, status); |
666 if (U_FAILURE(fDeferredStatus)) { | 672 if (U_FAILURE(status)) { |
667 return FALSE; | 673 return FALSE; |
668 } | 674 } |
669 if (fMatch) { | 675 if (fMatch) { |
670 return TRUE; | 676 return TRUE; |
671 } | 677 } |
672 if (startPos >= testStartLimit) { | 678 if (startPos >= testStartLimit) { |
673 fHitEnd = TRUE; | 679 fHitEnd = TRUE; |
674 return FALSE; | 680 return FALSE; |
675 } | 681 } |
676 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 682 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
677 (void)UTEXT_NEXT32(fInputText); | 683 (void)UTEXT_NEXT32(fInputText); |
678 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 684 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
679 // Note that it's perfectly OK for a pattern to have a zero-length | 685 // Note that it's perfectly OK for a pattern to have a zero-length |
680 // match at the end of a string, so we must make sure that the loo
p | 686 // match at the end of a string, so we must make sure that the loo
p |
681 // runs with startPos == testStartLimit the last time through. | 687 // runs with startPos == testStartLimit the last time through. |
682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | 688 if (findProgressInterrupt(startPos, status)) |
683 return FALSE; | 689 return FALSE; |
684 } | 690 } |
685 U_ASSERT(FALSE); | 691 U_ASSERT(FALSE); |
686 | 692 |
687 case START_START: | 693 case START_START: |
688 // Matches are only possible at the start of the input string | 694 // Matches are only possible at the start of the input string |
689 // (pattern begins with ^ or \A) | 695 // (pattern begins with ^ or \A) |
690 if (startPos > fActiveStart) { | 696 if (startPos > fActiveStart) { |
691 fMatch = FALSE; | 697 fMatch = FALSE; |
692 return FALSE; | 698 return FALSE; |
693 } | 699 } |
694 MatchAt(startPos, FALSE, fDeferredStatus); | 700 MatchAt(startPos, FALSE, status); |
695 if (U_FAILURE(fDeferredStatus)) { | 701 if (U_FAILURE(status)) { |
696 return FALSE; | 702 return FALSE; |
697 } | 703 } |
698 return fMatch; | 704 return fMatch; |
699 | 705 |
700 | 706 |
701 case START_SET: | 707 case START_SET: |
702 { | 708 { |
703 // Match may start on any char from a pre-computed set. | 709 // Match may start on any char from a pre-computed set. |
704 U_ASSERT(fPattern->fMinMatchLen > 0); | 710 U_ASSERT(fPattern->fMinMatchLen > 0); |
705 int64_t pos; | |
706 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 711 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
707 for (;;) { | 712 for (;;) { |
| 713 int64_t pos = startPos; |
708 c = UTEXT_NEXT32(fInputText); | 714 c = UTEXT_NEXT32(fInputText); |
709 pos = UTEXT_GETNATIVEINDEX(fInputText); | 715 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
710 // c will be -1 (U_SENTINEL) at end of text, in which case we | 716 // c will be -1 (U_SENTINEL) at end of text, in which case we |
711 // skip this next block (so we don't have a negative array index
) | 717 // skip this next block (so we don't have a negative array index
) |
712 // and handle end of text in the following block. | 718 // and handle end of text in the following block. |
713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c))
|| | 719 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c))
|| |
714 (c>=256 && fPattern->fInitialChars->contains(c))))
{ | 720 (c>=256 && fPattern->fInitialChars->contains(c))))
{ |
715 MatchAt(startPos, FALSE, fDeferredStatus); | 721 MatchAt(pos, FALSE, status); |
716 if (U_FAILURE(fDeferredStatus)) { | 722 if (U_FAILURE(status)) { |
717 return FALSE; | 723 return FALSE; |
718 } | 724 } |
719 if (fMatch) { | 725 if (fMatch) { |
720 return TRUE; | 726 return TRUE; |
721 } | 727 } |
722 UTEXT_SETNATIVEINDEX(fInputText, pos); | 728 UTEXT_SETNATIVEINDEX(fInputText, pos); |
723 } | 729 } |
724 if (startPos >= testStartLimit) { | 730 if (startPos > testStartLimit) { |
725 fMatch = FALSE; | 731 fMatch = FALSE; |
726 fHitEnd = TRUE; | 732 fHitEnd = TRUE; |
727 return FALSE; | 733 return FALSE; |
728 } | 734 } |
729 startPos = pos; | 735 if (findProgressInterrupt(startPos, status)) |
730 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | |
731 return FALSE; | 736 return FALSE; |
732 } | 737 } |
733 } | 738 } |
734 U_ASSERT(FALSE); | 739 U_ASSERT(FALSE); |
735 | 740 |
736 case START_STRING: | 741 case START_STRING: |
737 case START_CHAR: | 742 case START_CHAR: |
738 { | 743 { |
739 // Match starts on exactly one char. | 744 // Match starts on exactly one char. |
740 U_ASSERT(fPattern->fMinMatchLen > 0); | 745 U_ASSERT(fPattern->fMinMatchLen > 0); |
741 UChar32 theChar = fPattern->fInitialChar; | 746 UChar32 theChar = fPattern->fInitialChar; |
742 int64_t pos; | |
743 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 747 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
744 for (;;) { | 748 for (;;) { |
| 749 int64_t pos = startPos; |
745 c = UTEXT_NEXT32(fInputText); | 750 c = UTEXT_NEXT32(fInputText); |
746 pos = UTEXT_GETNATIVEINDEX(fInputText); | 751 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
747 if (c == theChar) { | 752 if (c == theChar) { |
748 MatchAt(startPos, FALSE, fDeferredStatus); | 753 MatchAt(pos, FALSE, status); |
749 if (U_FAILURE(fDeferredStatus)) { | 754 if (U_FAILURE(status)) { |
750 return FALSE; | 755 return FALSE; |
751 } | 756 } |
752 if (fMatch) { | 757 if (fMatch) { |
753 return TRUE; | 758 return TRUE; |
754 } | 759 } |
755 UTEXT_SETNATIVEINDEX(fInputText, pos); | 760 UTEXT_SETNATIVEINDEX(fInputText, pos); |
756 } | 761 } |
757 if (startPos >= testStartLimit) { | 762 if (startPos > testStartLimit) { |
758 fMatch = FALSE; | 763 fMatch = FALSE; |
759 fHitEnd = TRUE; | 764 fHitEnd = TRUE; |
760 return FALSE; | 765 return FALSE; |
761 } | 766 } |
762 startPos = pos; | 767 if (findProgressInterrupt(startPos, status)) |
763 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) | |
764 return FALSE; | 768 return FALSE; |
765 } | 769 } |
766 } | 770 } |
767 U_ASSERT(FALSE); | 771 U_ASSERT(FALSE); |
768 | 772 |
769 case START_LINE: | 773 case START_LINE: |
770 { | 774 { |
771 UChar32 c; | 775 UChar32 c; |
772 if (startPos == fAnchorStart) { | 776 if (startPos == fAnchorStart) { |
773 MatchAt(startPos, FALSE, fDeferredStatus); | 777 MatchAt(startPos, FALSE, status); |
774 if (U_FAILURE(fDeferredStatus)) { | 778 if (U_FAILURE(status)) { |
775 return FALSE; | 779 return FALSE; |
776 } | 780 } |
777 if (fMatch) { | 781 if (fMatch) { |
778 return TRUE; | 782 return TRUE; |
779 } | 783 } |
780 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 784 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
781 c = UTEXT_NEXT32(fInputText); | 785 c = UTEXT_NEXT32(fInputText); |
782 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 786 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
783 } else { | 787 } else { |
784 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 788 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
785 c = UTEXT_PREVIOUS32(fInputText); | 789 c = UTEXT_PREVIOUS32(fInputText); |
786 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 790 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
787 } | 791 } |
788 | 792 |
789 if (fPattern->fFlags & UREGEX_UNIX_LINES) { | 793 if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
790 for (;;) { | 794 for (;;) { |
791 if (c == 0x0a) { | 795 if (c == 0x0a) { |
792 MatchAt(startPos, FALSE, fDeferredStatus); | 796 MatchAt(startPos, FALSE, status); |
793 if (U_FAILURE(fDeferredStatus)) { | 797 if (U_FAILURE(status)) { |
794 return FALSE; | 798 return FALSE; |
795 } | 799 } |
796 if (fMatch) { | 800 if (fMatch) { |
797 return TRUE; | 801 return TRUE; |
798 } | 802 } |
799 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 803 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
800 } | 804 } |
801 if (startPos >= testStartLimit) { | 805 if (startPos >= testStartLimit) { |
802 fMatch = FALSE; | 806 fMatch = FALSE; |
803 fHitEnd = TRUE; | 807 fHitEnd = TRUE; |
804 return FALSE; | 808 return FALSE; |
805 } | 809 } |
806 c = UTEXT_NEXT32(fInputText); | 810 c = UTEXT_NEXT32(fInputText); |
807 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 811 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
808 // Note that it's perfectly OK for a pattern to have a zero-
length | 812 // Note that it's perfectly OK for a pattern to have a zero-
length |
809 // match at the end of a string, so we must make sure that
the loop | 813 // match at the end of a string, so we must make sure that
the loop |
810 // runs with startPos == testStartLimit the last time thro
ugh. | 814 // runs with startPos == testStartLimit the last time thro
ugh. |
811 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) | 815 if (findProgressInterrupt(startPos, status)) |
812 return FALSE; | 816 return FALSE; |
813 } | 817 } |
814 } else { | 818 } else { |
815 for (;;) { | 819 for (;;) { |
816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m
any chars as possible | 820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m
any chars as possible |
817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202
9 )) { | 821 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202
9 )) { |
818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU
RRENT32(fInputText) == 0x0a) { | 822 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU
RRENT32(fInputText) == 0x0a) { |
819 (void)UTEXT_NEXT32(fInputText); | 823 (void)UTEXT_NEXT32(fInputText); |
820 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 824 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
821 } | 825 } |
822 MatchAt(startPos, FALSE, fDeferredStatus); | 826 MatchAt(startPos, FALSE, status); |
823 if (U_FAILURE(fDeferredStatus)) { | 827 if (U_FAILURE(status)) { |
824 return FALSE; | 828 return FALSE; |
825 } | 829 } |
826 if (fMatch) { | 830 if (fMatch) { |
827 return TRUE; | 831 return TRUE; |
828 } | 832 } |
829 UTEXT_SETNATIVEINDEX(fInputText, startPos); | 833 UTEXT_SETNATIVEINDEX(fInputText, startPos); |
830 } | 834 } |
831 if (startPos >= testStartLimit) { | 835 if (startPos >= testStartLimit) { |
832 fMatch = FALSE; | 836 fMatch = FALSE; |
833 fHitEnd = TRUE; | 837 fHitEnd = TRUE; |
834 return FALSE; | 838 return FALSE; |
835 } | 839 } |
836 c = UTEXT_NEXT32(fInputText); | 840 c = UTEXT_NEXT32(fInputText); |
837 startPos = UTEXT_GETNATIVEINDEX(fInputText); | 841 startPos = UTEXT_GETNATIVEINDEX(fInputText); |
838 // Note that it's perfectly OK for a pattern to have a zero-
length | 842 // Note that it's perfectly OK for a pattern to have a zero-
length |
839 // match at the end of a string, so we must make sure that
the loop | 843 // match at the end of a string, so we must make sure that
the loop |
840 // runs with startPos == testStartLimit the last time thro
ugh. | 844 // runs with startPos == testStartLimit the last time thro
ugh. |
841 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred
Status)) | 845 if (findProgressInterrupt(startPos, status)) |
842 return FALSE; | 846 return FALSE; |
843 } | 847 } |
844 } | 848 } |
845 } | 849 } |
846 | 850 |
847 default: | 851 default: |
848 U_ASSERT(FALSE); | 852 U_ASSERT(FALSE); |
849 } | 853 } |
850 | 854 |
851 U_ASSERT(FALSE); | 855 U_ASSERT(FALSE); |
852 return FALSE; | 856 return FALSE; |
853 } | 857 } |
854 | 858 |
855 | 859 |
856 | 860 |
857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { | 861 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { |
858 if (U_FAILURE(status)) { | 862 if (U_FAILURE(status)) { |
859 return FALSE; | 863 return FALSE; |
860 } | 864 } |
861 if (U_FAILURE(fDeferredStatus)) { | 865 if (U_FAILURE(fDeferredStatus)) { |
862 status = fDeferredStatus; | 866 status = fDeferredStatus; |
863 return FALSE; | 867 return FALSE; |
864 } | 868 } |
865 this->reset(); // Note: Reset() is specified by Java
Matcher documentation. | 869 this->reset(); // Note: Reset() is specified by Java
Matcher documentation. |
866 // This will reset the region t
o be the full input length. | 870 // This will reset the region t
o be the full input length. |
867 if (start < 0) { | 871 if (start < 0) { |
868 status = U_INDEX_OUTOFBOUNDS_ERROR; | 872 status = U_INDEX_OUTOFBOUNDS_ERROR; |
869 return FALSE; | 873 return FALSE; |
870 } | 874 } |
871 | 875 |
872 int64_t nativeStart = start; | 876 int64_t nativeStart = start; |
873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { | 877 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
874 status = U_INDEX_OUTOFBOUNDS_ERROR; | 878 status = U_INDEX_OUTOFBOUNDS_ERROR; |
875 return FALSE; | 879 return FALSE; |
876 } | 880 } |
877 fMatchEnd = nativeStart; | 881 fMatchEnd = nativeStart; |
878 return find(); | 882 return find(status); |
879 } | 883 } |
880 | 884 |
881 | 885 |
882 //------------------------------------------------------------------------------
-- | 886 //------------------------------------------------------------------------------
-- |
883 // | 887 // |
884 // findUsingChunk() -- like find(), but with the advance knowledge that the | 888 // findUsingChunk() -- like find(), but with the advance knowledge that the |
885 // entire string is available in the UText's chunk buffer. | 889 // entire string is available in the UText's chunk buffer. |
886 // | 890 // |
887 //------------------------------------------------------------------------------
-- | 891 //------------------------------------------------------------------------------
-- |
// Searches for the next match by indexing directly into the UTF-16 buffer
// fInputText->chunkContents rather than going through the UText iteration API.
//
//   status   Passed to every MatchChunkAt() and findProgressInterrupt() call;
//            the function returns FALSE as soon as it holds a failure code.
//            It is NOT examined on entry, and fDeferredStatus is not examined
//            here at all -- the visible caller, find(UErrorCode &), performs
//            both checks before delegating to this function.
//
//   Return:  TRUE  if MatchChunkAt() left fMatch set at some start position.
//            FALSE if no start position matched, an error was reported, or
//                  the progress callback asked for the search to stop.
//
// The strategy used to pick candidate start positions is selected by
// fPattern->fStartType; see the individual cases below.
UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.)
    //

    int32_t startPos = (int32_t)fMatchEnd;
    if (startPos==0) {
        startPos = (int32_t)fActiveStart;
    }

    const UChar *inputBuf = fInputText->chunkContents;

    if (fMatch) {
        // Save the position of any previous successful match.
        fLastMatchEnd = fMatchEnd;

        if (fMatchStart == fMatchEnd) {
            // Previous match had zero length.  Move start position up one position
            //  to avoid sending find() into a loop on zero-length matches.
            if (startPos >= fActiveLimit) {
                fMatch = FALSE;
                fHitEnd = TRUE;
                return FALSE;
            }
            // NOTE(review): this advance is bounded by fInputLength, whereas every
            //   other advance in this function is bounded by fActiveLimit -- confirm
            //   that the difference is intentional.
            U16_FWD_1(inputBuf, startPos, fInputLength);
        }
    } else {
        if (fLastMatchEnd >= 0) {
            // A previous find() failed to match.  Don't try again.
            //   (without this test, a pattern with a zero-length match
            //    could match again at the end of an input string.)
            fHitEnd = TRUE;
            return FALSE;
        }
    }


    // Compute the position in the input string beyond which a match can not begin, because
    //   the minimum length match would extend past the end of the input.
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    //          Be aware of possible overflows if making changes here.
    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
    int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    if (startPos > testLen) {
        fMatch = FALSE;
        fHitEnd = TRUE;
        return FALSE;
    }

    UChar32 c;
    U_ASSERT(startPos >= 0);

    switch (fPattern->fStartType) {
    case START_NO_INFO:
        // No optimization was found.
        //  Try a match at each input position.
        for (;;) {
            MatchChunkAt(startPos, FALSE, status);
            if (U_FAILURE(status)) {
                return FALSE;
            }
            if (fMatch) {
                return TRUE;
            }
            // The limit is tested AFTER the match attempt, so the position
            //   startPos == testLen is still tried before giving up.
            if (startPos >= testLen) {
                fHitEnd = TRUE;
                return FALSE;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
            // Note that it's perfectly OK for a pattern to have a zero-length
            //   match at the end of a string, so we must make sure that the loop
            //   runs with startPos == testLen the last time through.
            if (findProgressInterrupt(startPos, status))
                return FALSE;
        }
        U_ASSERT(FALSE);

    case START_START:
        // Matches are only possible at the start of the input string
        //   (pattern begins with ^ or \A)
        if (startPos > fActiveStart) {
            fMatch = FALSE;
            return FALSE;
        }
        MatchChunkAt(startPos, FALSE, status);
        if (U_FAILURE(status)) {
            return FALSE;
        }
        return fMatch;


    case START_SET:
    {
        // Match may start on any char from a pre-computed set.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        for (;;) {
            // pos remembers where the character being examined begins;
            //   U16_NEXT moves startPos on to the following position.
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            // Characters below 256 are tested against fInitialChars8, all
            //   others against fInitialChars.
            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                (c>=256 && fPattern->fInitialChars->contains(c))) {
                MatchChunkAt(pos, FALSE, status);
                if (U_FAILURE(status)) {
                    return FALSE;
                }
                if (fMatch) {
                    return TRUE;
                }
            }
            // startPos is now the next candidate; testLen is an inclusive
            //   limit, so stop only once the candidate lies beyond it.
            if (startPos > testLen) {
                fMatch = FALSE;
                fHitEnd = TRUE;
                return FALSE;
            }
            if (findProgressInterrupt(startPos, status))
                return FALSE;
        }
    }
    U_ASSERT(FALSE);

    case START_STRING:
    case START_CHAR:
    {
        // Match starts on exactly one char.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        UChar32 theChar = fPattern->fInitialChar;
        for (;;) {
            // Same scan shape as START_SET, with a single-character test.
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            if (c == theChar) {
                MatchChunkAt(pos, FALSE, status);
                if (U_FAILURE(status)) {
                    return FALSE;
                }
                if (fMatch) {
                    return TRUE;
                }
            }
            if (startPos > testLen) {
                fMatch = FALSE;
                fHitEnd = TRUE;
                return FALSE;
            }
            if (findProgressInterrupt(startPos, status))
                return FALSE;
        }
    }
    U_ASSERT(FALSE);

    case START_LINE:
    {
        UChar32 c;
        // The anchor position itself is tried unconditionally; every later
        //   position is tried only when the preceding character is a line
        //   terminator (tested in the loops below).
        if (startPos == fAnchorStart) {
            MatchChunkAt(startPos, FALSE, status);
            if (U_FAILURE(status)) {
                return FALSE;
            }
            if (fMatch) {
                return TRUE;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
        }

        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
            // UREGEX_UNIX_LINES: only 0x0a is treated as a line terminator.
            for (;;) {
                // Examine the code unit immediately before the candidate position.
                c = inputBuf[startPos-1];
                if (c == 0x0a) {
                    MatchChunkAt(startPos, FALSE, status);
                    if (U_FAILURE(status)) {
                        return FALSE;
                    }
                    if (fMatch) {
                        return TRUE;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = FALSE;
                    fHitEnd = TRUE;
                    return FALSE;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if (findProgressInterrupt(startPos, status))
                    return FALSE;
            }
        } else {
            // Otherwise 0x0a through 0x0d, 0x85, 0x2028 and 0x2029 all count
            //   as line terminators.
            for (;;) {
                // Examine the code unit immediately before the candidate position.
                c = inputBuf[startPos-1];
                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
                    // A 0x0d immediately followed by 0x0a is stepped over as a
                    //   pair, so the match is attempted after the 0x0a.
                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                        startPos++;
                    }
                    MatchChunkAt(startPos, FALSE, status);
                    if (U_FAILURE(status)) {
                        return FALSE;
                    }
                    if (fMatch) {
                        return TRUE;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = FALSE;
                    fHitEnd = TRUE;
                    return FALSE;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if (findProgressInterrupt(startPos, status))
                    return FALSE;
            }
        }
    }

    default:
        U_ASSERT(FALSE);
    }

    // Not reached: every case above returns from inside its own code.
    U_ASSERT(FALSE);
    return FALSE;
}
1111 | 1116 |
1112 | 1117 |
1113 | 1118 |
1114 //------------------------------------------------------------------------------
-- | 1119 //------------------------------------------------------------------------------
-- |
1115 // | 1120 // |
1116 // group() | 1121 // group() |
1117 // | 1122 // |
1118 //------------------------------------------------------------------------------
-- | 1123 //------------------------------------------------------------------------------
-- |
1119 UnicodeString RegexMatcher::group(UErrorCode &status) const { | 1124 UnicodeString RegexMatcher::group(UErrorCode &status) const { |
1120 return group(0, status); | 1125 return group(0, status); |
1121 } | 1126 } |
1122 | 1127 |
1123 // Return immutable shallow clone | 1128 // Return immutable shallow clone |
1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status)
const { | 1129 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status)
const { |
1125 return group(0, dest, group_len, status); | 1130 return group(0, dest, group_len, status); |
1126 } | 1131 } |
1127 | 1132 |
1128 // Return immutable shallow clone | 1133 // Return immutable shallow clone |
1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
rrorCode &status) const { | 1134 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE
rrorCode &status) const { |
1130 group_len = 0; | 1135 group_len = 0; |
1131 UBool bailOut = FALSE; | |
1132 if (U_FAILURE(status)) { | 1136 if (U_FAILURE(status)) { |
1133 return dest; | 1137 return dest; |
1134 } | 1138 } |
1135 if (U_FAILURE(fDeferredStatus)) { | 1139 if (U_FAILURE(fDeferredStatus)) { |
1136 status = fDeferredStatus; | 1140 status = fDeferredStatus; |
1137 bailOut = TRUE; | 1141 } else if (fMatch == FALSE) { |
| 1142 status = U_REGEX_INVALID_STATE; |
| 1143 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1144 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1138 } | 1145 } |
1139 if (fMatch == FALSE) { | 1146 |
1140 status = U_REGEX_INVALID_STATE; | 1147 if (U_FAILURE(status)) { |
1141 bailOut = TRUE; | 1148 return dest; |
1142 } | 1149 } |
1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1150 |
1144 status = U_INDEX_OUTOFBOUNDS_ERROR; | |
1145 bailOut = TRUE; | |
1146 } | |
1147 | |
1148 if (bailOut) { | |
1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); | |
1150 } | |
1151 | |
1152 int64_t s, e; | 1151 int64_t s, e; |
1153 if (groupNum == 0) { | 1152 if (groupNum == 0) { |
1154 s = fMatchStart; | 1153 s = fMatchStart; |
1155 e = fMatchEnd; | 1154 e = fMatchEnd; |
1156 } else { | 1155 } else { |
1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1156 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
1158 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1157 U_ASSERT(groupOffset < fPattern->fFrameSize); |
1159 U_ASSERT(groupOffset >= 0); | 1158 U_ASSERT(groupOffset >= 0); |
1160 s = fFrame->fExtra[groupOffset]; | 1159 s = fFrame->fExtra[groupOffset]; |
1161 e = fFrame->fExtra[groupOffset+1]; | 1160 e = fFrame->fExtra[groupOffset+1]; |
1162 } | 1161 } |
1163 | 1162 |
1164 if (s < 0) { | 1163 if (s < 0) { |
1165 // A capture group wasn't part of the match | 1164 // A capture group wasn't part of the match |
1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status); | 1165 return utext_clone(dest, fInputText, FALSE, TRUE, &status); |
1167 } | 1166 } |
1168 U_ASSERT(s <= e); | 1167 U_ASSERT(s <= e); |
1169 group_len = e - s; | 1168 group_len = e - s; |
1170 | 1169 |
1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); | 1170 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); |
1172 if (dest) | 1171 if (dest) |
1173 UTEXT_SETNATIVEINDEX(dest, s); | 1172 UTEXT_SETNATIVEINDEX(dest, s); |
1174 return dest; | 1173 return dest; |
1175 } | 1174 } |
1176 | 1175 |
1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { | 1176 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
1178 UnicodeString result; | 1177 UnicodeString result; |
1179 if (U_FAILURE(status)) { | 1178 if (U_FAILURE(status)) { |
1180 return result; | 1179 return result; |
1181 } | 1180 } |
1182 UText resultText = UTEXT_INITIALIZER; | 1181 UText resultText = UTEXT_INITIALIZER; |
1183 utext_openUnicodeString(&resultText, &result, &status); | 1182 utext_openUnicodeString(&resultText, &result, &status); |
1184 group(groupNum, &resultText, status); | 1183 group(groupNum, &resultText, status); |
1185 utext_close(&resultText); | 1184 utext_close(&resultText); |
1186 return result; | 1185 return result; |
1187 } | 1186 } |
1188 | 1187 |
1189 | 1188 |
1190 // Return deep (mutable) clone | 1189 // Return deep (mutable) clone |
1191 //» » Technology Preview (as an API), but note that the UnicodeString
API is implemented | 1190 // Technology Preview (as an API), but note that the UnicodeString API is i
mplemented |
1192 //» » using this function. | 1191 // using this function. |
1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co
nst { | 1192 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co
nst { |
1194 UBool bailOut = FALSE; | |
1195 if (U_FAILURE(status)) { | 1193 if (U_FAILURE(status)) { |
1196 return dest; | 1194 return dest; |
1197 } | 1195 } |
| 1196 |
1198 if (U_FAILURE(fDeferredStatus)) { | 1197 if (U_FAILURE(fDeferredStatus)) { |
1199 status = fDeferredStatus; | 1198 status = fDeferredStatus; |
1200 bailOut = TRUE; | 1199 } else if (fMatch == FALSE) { |
| 1200 status = U_REGEX_INVALID_STATE; |
| 1201 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1201 } | 1203 } |
1202 | 1204 if (U_FAILURE(status)) { |
1203 if (fMatch == FALSE) { | 1205 return dest; |
1204 status = U_REGEX_INVALID_STATE; | |
1205 bailOut = TRUE; | |
1206 } | 1206 } |
1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1207 |
1208 status = U_INDEX_OUTOFBOUNDS_ERROR; | |
1209 bailOut = TRUE; | |
1210 } | |
1211 | |
1212 if (bailOut) { | |
1213 if (dest) { | |
1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | |
1215 return dest; | |
1216 } else { | |
1217 return utext_openUChars(NULL, NULL, 0, &status); | |
1218 } | |
1219 } | |
1220 | |
1221 int64_t s, e; | 1208 int64_t s, e; |
1222 if (groupNum == 0) { | 1209 if (groupNum == 0) { |
1223 s = fMatchStart; | 1210 s = fMatchStart; |
1224 e = fMatchEnd; | 1211 e = fMatchEnd; |
1225 } else { | 1212 } else { |
1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1213 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
1227 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1214 U_ASSERT(groupOffset < fPattern->fFrameSize); |
1228 U_ASSERT(groupOffset >= 0); | 1215 U_ASSERT(groupOffset >= 0); |
1229 s = fFrame->fExtra[groupOffset]; | 1216 s = fFrame->fExtra[groupOffset]; |
1230 e = fFrame->fExtra[groupOffset+1]; | 1217 e = fFrame->fExtra[groupOffset+1]; |
1231 } | 1218 } |
1232 | 1219 |
1233 if (s < 0) { | 1220 if (s < 0) { |
1234 // A capture group wasn't part of the match | 1221 // A capture group wasn't part of the match |
1235 if (dest) { | 1222 if (dest) { |
1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | 1223 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); |
1237 return dest; | 1224 return dest; |
1238 } else { | 1225 } else { |
1239 return utext_openUChars(NULL, NULL, 0, &status); | 1226 return utext_openUChars(NULL, NULL, 0, &status); |
1240 } | 1227 } |
1241 } | 1228 } |
1242 U_ASSERT(s <= e); | 1229 U_ASSERT(s <= e); |
1243 | 1230 |
1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1231 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
1245 U_ASSERT(e <= fInputLength); | 1232 U_ASSERT(e <= fInputLength); |
1246 if (dest) { | 1233 if (dest) { |
1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents+s, (int32_t)(e-s), &status); | 1234 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents+s, (int32_t)(e-s), &status); |
1248 } else { | 1235 } else { |
1249 UText groupText = UTEXT_INITIALIZER; | 1236 UText groupText = UTEXT_INITIALIZER; |
1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat
us); | 1237 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat
us); |
1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); | 1238 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
1252 utext_close(&groupText); | 1239 utext_close(&groupText); |
1253 } | 1240 } |
(...skipping 13 matching lines...) Expand all Loading... |
1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status); | 1254 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
1268 | 1255 |
1269 if (dest) { | 1256 if (dest) { |
1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16,
&status); | 1257 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16,
&status); |
1271 } else { | 1258 } else { |
1272 UText groupText = UTEXT_INITIALIZER; | 1259 UText groupText = UTEXT_INITIALIZER; |
1273 utext_openUChars(&groupText, groupChars, len16, &status); | 1260 utext_openUChars(&groupText, groupChars, len16, &status); |
1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); | 1261 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); |
1275 utext_close(&groupText); | 1262 utext_close(&groupText); |
1276 } | 1263 } |
1277 | 1264 |
1278 uprv_free(groupChars); | 1265 uprv_free(groupChars); |
1279 } | 1266 } |
1280 return dest; | 1267 return dest; |
1281 } | 1268 } |
1282 | 1269 |
1283 //------------------------------------------------------------------------------
-- | 1270 //------------------------------------------------------------------------------
-- |
1284 // | 1271 // |
1285 // appendGroup() -- currently internal only, appends a group to a UText rather | 1272 // appendGroup() -- currently internal only, appends a group to a UText rather |
1286 // than replacing its contents | 1273 // than replacing its contents |
1287 // | 1274 // |
1288 //------------------------------------------------------------------------------
-- | 1275 //------------------------------------------------------------------------------
-- |
1289 | 1276 |
1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta
tus) const { | 1277 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta
tus) const { |
1291 if (U_FAILURE(status)) { | 1278 if (U_FAILURE(status)) { |
1292 return 0; | 1279 return 0; |
1293 } | 1280 } |
1294 if (U_FAILURE(fDeferredStatus)) { | 1281 if (U_FAILURE(fDeferredStatus)) { |
1295 status = fDeferredStatus; | 1282 status = fDeferredStatus; |
1296 return 0; | 1283 return 0; |
1297 } | 1284 } |
1298 int64_t destLen = utext_nativeLength(dest); | 1285 int64_t destLen = utext_nativeLength(dest); |
1299 | 1286 |
1300 if (fMatch == FALSE) { | 1287 if (fMatch == FALSE) { |
1301 status = U_REGEX_INVALID_STATE; | 1288 status = U_REGEX_INVALID_STATE; |
1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1289 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
1303 } | 1290 } |
1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { | 1291 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
1305 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1292 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1293 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
1307 } | 1294 } |
1308 | 1295 |
1309 int64_t s, e; | 1296 int64_t s, e; |
1310 if (groupNum == 0) { | 1297 if (groupNum == 0) { |
1311 s = fMatchStart; | 1298 s = fMatchStart; |
1312 e = fMatchEnd; | 1299 e = fMatchEnd; |
1313 } else { | 1300 } else { |
1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); | 1301 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
1315 U_ASSERT(groupOffset < fPattern->fFrameSize); | 1302 U_ASSERT(groupOffset < fPattern->fFrameSize); |
1316 U_ASSERT(groupOffset >= 0); | 1303 U_ASSERT(groupOffset >= 0); |
1317 s = fFrame->fExtra[groupOffset]; | 1304 s = fFrame->fExtra[groupOffset]; |
1318 e = fFrame->fExtra[groupOffset+1]; | 1305 e = fFrame->fExtra[groupOffset+1]; |
1319 } | 1306 } |
1320 | 1307 |
1321 if (s < 0) { | 1308 if (s < 0) { |
1322 // A capture group wasn't part of the match | 1309 // A capture group wasn't part of the match |
1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status); | 1310 return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
1324 } | 1311 } |
1325 U_ASSERT(s <= e); | 1312 U_ASSERT(s <= e); |
1326 | 1313 |
1327 int64_t deltaLen; | 1314 int64_t deltaLen; |
1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1315 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
1329 U_ASSERT(e <= fInputLength); | 1316 U_ASSERT(e <= fInputLength); |
1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten
ts+s, (int32_t)(e-s), &status); | 1317 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten
ts+s, (int32_t)(e-s), &status); |
1331 } else { | 1318 } else { |
1332 int32_t len16; | 1319 int32_t len16; |
1333 if (UTEXT_USES_U16(fInputText)) { | 1320 if (UTEXT_USES_U16(fInputText)) { |
1334 len16 = (int32_t)(e-s); | 1321 len16 = (int32_t)(e-s); |
1335 } else { | 1322 } else { |
1336 UErrorCode lengthStatus = U_ZERO_ERROR; | 1323 UErrorCode lengthStatus = U_ZERO_ERROR; |
1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); | 1324 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
1338 } | 1325 } |
1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); | 1326 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
1340 if (groupChars == NULL) { | 1327 if (groupChars == NULL) { |
1341 status = U_MEMORY_ALLOCATION_ERROR; | 1328 status = U_MEMORY_ALLOCATION_ERROR; |
1342 return 0; | 1329 return 0; |
1343 } | 1330 } |
1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status); | 1331 utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
1345 | 1332 |
1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta
tus); | 1333 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta
tus); |
1347 uprv_free(groupChars); | 1334 uprv_free(groupChars); |
1348 } | 1335 } |
1349 return deltaLen; | 1336 return deltaLen; |
1350 } | 1337 } |
1351 | 1338 |
1352 | 1339 |
1353 | 1340 |
1354 //------------------------------------------------------------------------------
-- | 1341 //------------------------------------------------------------------------------
-- |
1355 // | 1342 // |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1402 if (!fInput) { | 1389 if (!fInput) { |
1403 UErrorCode status = U_ZERO_ERROR; | 1390 UErrorCode status = U_ZERO_ERROR; |
1404 int32_t len16; | 1391 int32_t len16; |
1405 if (UTEXT_USES_U16(fInputText)) { | 1392 if (UTEXT_USES_U16(fInputText)) { |
1406 len16 = (int32_t)fInputLength; | 1393 len16 = (int32_t)fInputLength; |
1407 } else { | 1394 } else { |
1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status)
; | 1395 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status)
; |
1409 status = U_ZERO_ERROR; // overflow, length status | 1396 status = U_ZERO_ERROR; // overflow, length status |
1410 } | 1397 } |
1411 UnicodeString *result = new UnicodeString(len16, 0, 0); | 1398 UnicodeString *result = new UnicodeString(len16, 0, 0); |
1412 | 1399 |
1413 UChar *inputChars = result->getBuffer(len16); | 1400 UChar *inputChars = result->getBuffer(len16); |
1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status);
// unterminated warning | 1401 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status);
// unterminated warning |
1415 result->releaseBuffer(len16); | 1402 result->releaseBuffer(len16); |
1416 | 1403 |
1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath
er than operator= | 1404 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath
er than operator= |
1418 } | 1405 } |
1419 | 1406 |
1420 return *fInput; | 1407 return *fInput; |
1421 } | 1408 } |
1422 | 1409 |
1423 //------------------------------------------------------------------------------
-- | 1410 //------------------------------------------------------------------------------
-- |
1424 // | 1411 // |
1425 // inputText() | 1412 // inputText() |
1426 // | 1413 // |
1427 //------------------------------------------------------------------------------
-- | 1414 //------------------------------------------------------------------------------
-- |
1428 UText *RegexMatcher::inputText() const { | 1415 UText *RegexMatcher::inputText() const { |
1429 return fInputText; | 1416 return fInputText; |
1430 } | 1417 } |
1431 | 1418 |
1432 | 1419 |
1433 //------------------------------------------------------------------------------
-- | 1420 //------------------------------------------------------------------------------
-- |
1434 // | 1421 // |
1435 // getInput() -- like inputText(), but makes a clone or copies into another UTe
xt | 1422 // getInput() -- like inputText(), but makes a clone or copies into another UTe
xt |
1436 // | 1423 // |
1437 //------------------------------------------------------------------------------
-- | 1424 //------------------------------------------------------------------------------
-- |
1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { | 1425 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { |
1439 UBool bailOut = FALSE; | |
1440 if (U_FAILURE(status)) { | 1426 if (U_FAILURE(status)) { |
1441 return dest; | 1427 return dest; |
1442 } | 1428 } |
1443 if (U_FAILURE(fDeferredStatus)) { | 1429 if (U_FAILURE(fDeferredStatus)) { |
1444 status = fDeferredStatus; | 1430 status = fDeferredStatus; |
1445 bailOut = TRUE; | 1431 return dest; |
1446 } | 1432 } |
1447 | 1433 |
1448 if (bailOut) { | |
1449 if (dest) { | |
1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); | |
1451 return dest; | |
1452 } else { | |
1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); | |
1454 } | |
1455 } | |
1456 | |
1457 if (dest) { | 1434 if (dest) { |
1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1435 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents, (int32_t)fInputLength, &status); | 1436 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo
ntents, (int32_t)fInputLength, &status); |
1460 } else { | 1437 } else { |
1461 int32_t input16Len; | 1438 int32_t input16Len; |
1462 if (UTEXT_USES_U16(fInputText)) { | 1439 if (UTEXT_USES_U16(fInputText)) { |
1463 input16Len = (int32_t)fInputLength; | 1440 input16Len = (int32_t)fInputLength; |
1464 } else { | 1441 } else { |
1465 UErrorCode lengthStatus = U_ZERO_ERROR; | 1442 UErrorCode lengthStatus = U_ZERO_ERROR; |
1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0,
&lengthStatus); // buffer overflow error | 1443 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0,
&lengthStatus); // buffer overflow error |
1467 } | 1444 } |
1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len))
; | 1445 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len))
; |
1469 if (inputChars == NULL) { | 1446 if (inputChars == NULL) { |
1470 return dest; | 1447 return dest; |
1471 } | 1448 } |
1472 | 1449 |
1473 status = U_ZERO_ERROR; | 1450 status = U_ZERO_ERROR; |
1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &
status); // not terminated warning | 1451 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &
status); // not terminated warning |
1475 status = U_ZERO_ERROR; | 1452 status = U_ZERO_ERROR; |
1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16
Len, &status); | 1453 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16
Len, &status); |
1477 | 1454 |
1478 uprv_free(inputChars); | 1455 uprv_free(inputChars); |
1479 } | 1456 } |
1480 return dest; | 1457 return dest; |
1481 } else { | 1458 } else { |
1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); | 1459 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
1483 } | 1460 } |
1484 } | 1461 } |
1485 | 1462 |
1486 | 1463 |
1487 static UBool compat_SyncMutableUTextContents(UText *ut); | 1464 static UBool compat_SyncMutableUTextContents(UText *ut); |
1488 static UBool compat_SyncMutableUTextContents(UText *ut) { | 1465 static UBool compat_SyncMutableUTextContents(UText *ut) { |
1489 UBool retVal = FALSE; | 1466 UBool retVal = FALSE; |
1490 | 1467 |
1491 // In the following test, we're really only interested in whether the UText
should switch | 1468 // In the following test, we're really only interested in whether the UText
should switch |
1492 // between heap and stack allocation. If length hasn't changed, we won't,
so the chunkContents | 1469 // between heap and stack allocation. If length hasn't changed, we won't,
so the chunkContents |
1493 // will still point to the correct data. | 1470 // will still point to the correct data. |
1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { | 1471 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { |
1495 UnicodeString *us=(UnicodeString *)ut->context; | 1472 UnicodeString *us=(UnicodeString *)ut->context; |
1496 | 1473 |
1497 // Update to the latest length. | 1474 // Update to the latest length. |
1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). | 1475 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). |
1499 int32_t newLength = us->length(); | 1476 int32_t newLength = us->length(); |
1500 | 1477 |
1501 // Update the chunk description. | 1478 // Update the chunk description. |
1502 // The buffer may have switched between stack- and heap-based. | 1479 // The buffer may have switched between stack- and heap-based. |
1503 ut->chunkContents = us->getBuffer(); | 1480 ut->chunkContents = us->getBuffer(); |
1504 ut->chunkLength = newLength; | 1481 ut->chunkLength = newLength; |
1505 ut->chunkNativeLimit = newLength; | 1482 ut->chunkNativeLimit = newLength; |
1506 ut->nativeIndexingLimit = newLength; | 1483 ut->nativeIndexingLimit = newLength; |
1507 retVal = TRUE; | 1484 retVal = TRUE; |
1508 } | 1485 } |
1509 | 1486 |
1510 return retVal; | 1487 return retVal; |
1511 } | 1488 } |
1512 | 1489 |
1513 //------------------------------------------------------------------------------
-- | 1490 //------------------------------------------------------------------------------
-- |
1514 // | 1491 // |
1515 // lookingAt() | 1492 // lookingAt() |
1516 // | 1493 // |
1517 //------------------------------------------------------------------------------
-- | 1494 //------------------------------------------------------------------------------
-- |
1518 UBool RegexMatcher::lookingAt(UErrorCode &status) { | 1495 UBool RegexMatcher::lookingAt(UErrorCode &status) { |
1519 if (U_FAILURE(status)) { | 1496 if (U_FAILURE(status)) { |
1520 return FALSE; | 1497 return FALSE; |
1521 } | 1498 } |
1522 if (U_FAILURE(fDeferredStatus)) { | 1499 if (U_FAILURE(fDeferredStatus)) { |
1523 status = fDeferredStatus; | 1500 status = fDeferredStatus; |
1524 return FALSE; | 1501 return FALSE; |
1525 } | 1502 } |
1526 | 1503 |
1527 if (fInputUniStrMaybeMutable) { | 1504 if (fInputUniStrMaybeMutable) { |
1528 if (compat_SyncMutableUTextContents(fInputText)) { | 1505 if (compat_SyncMutableUTextContents(fInputText)) { |
1529 fInputLength = utext_nativeLength(fInputText); | 1506 fInputLength = utext_nativeLength(fInputText); |
1530 reset(); | 1507 reset(); |
1531 } | 1508 } |
1532 } | 1509 } |
1533 else { | 1510 else { |
1534 resetPreserveRegion(); | 1511 resetPreserveRegion(); |
1535 } | 1512 } |
1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1513 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status); | 1514 MatchChunkAt((int32_t)fActiveStart, FALSE, status); |
1538 } else { | 1515 } else { |
1539 MatchAt(fActiveStart, FALSE, status); | 1516 MatchAt(fActiveStart, FALSE, status); |
1540 } | 1517 } |
1541 return fMatch; | 1518 return fMatch; |
1542 } | 1519 } |
1543 | 1520 |
1544 | 1521 |
1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { | 1522 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { |
1546 if (U_FAILURE(status)) { | 1523 if (U_FAILURE(status)) { |
1547 return FALSE; | 1524 return FALSE; |
1548 } | 1525 } |
1549 if (U_FAILURE(fDeferredStatus)) { | 1526 if (U_FAILURE(fDeferredStatus)) { |
1550 status = fDeferredStatus; | 1527 status = fDeferredStatus; |
1551 return FALSE; | 1528 return FALSE; |
1552 } | 1529 } |
1553 reset(); | 1530 reset(); |
1554 | 1531 |
1555 if (start < 0) { | 1532 if (start < 0) { |
1556 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1533 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1557 return FALSE; | 1534 return FALSE; |
1558 } | 1535 } |
1559 | 1536 |
1560 if (fInputUniStrMaybeMutable) { | 1537 if (fInputUniStrMaybeMutable) { |
1561 if (compat_SyncMutableUTextContents(fInputText)) { | 1538 if (compat_SyncMutableUTextContents(fInputText)) { |
1562 fInputLength = utext_nativeLength(fInputText); | 1539 fInputLength = utext_nativeLength(fInputText); |
1563 reset(); | 1540 reset(); |
1564 } | 1541 } |
1565 } | 1542 } |
1566 | 1543 |
1567 int64_t nativeStart; | 1544 int64_t nativeStart; |
1568 nativeStart = start; | 1545 nativeStart = start; |
1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { | 1546 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
1570 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1547 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1571 return FALSE; | 1548 return FALSE; |
1572 } | 1549 } |
1573 | 1550 |
1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { | 1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
1575 MatchChunkAt((int32_t)nativeStart, FALSE, status); | 1552 MatchChunkAt((int32_t)nativeStart, FALSE, status); |
1576 } else { | 1553 } else { |
1577 MatchAt(nativeStart, FALSE, status); | 1554 MatchAt(nativeStart, FALSE, status); |
1578 } | 1555 } |
1579 return fMatch; | 1556 return fMatch; |
1580 } | 1557 } |
1581 | 1558 |
1582 | 1559 |
1583 | 1560 |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1616 | 1593 |
1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { | 1594 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { |
1618 if (U_FAILURE(status)) { | 1595 if (U_FAILURE(status)) { |
1619 return FALSE; | 1596 return FALSE; |
1620 } | 1597 } |
1621 if (U_FAILURE(fDeferredStatus)) { | 1598 if (U_FAILURE(fDeferredStatus)) { |
1622 status = fDeferredStatus; | 1599 status = fDeferredStatus; |
1623 return FALSE; | 1600 return FALSE; |
1624 } | 1601 } |
1625 reset(); | 1602 reset(); |
1626 | 1603 |
1627 if (start < 0) { | 1604 if (start < 0) { |
1628 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1605 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1629 return FALSE; | 1606 return FALSE; |
1630 } | 1607 } |
1631 | 1608 |
1632 if (fInputUniStrMaybeMutable) { | 1609 if (fInputUniStrMaybeMutable) { |
1633 if (compat_SyncMutableUTextContents(fInputText)) { | 1610 if (compat_SyncMutableUTextContents(fInputText)) { |
1634 fInputLength = utext_nativeLength(fInputText); | 1611 fInputLength = utext_nativeLength(fInputText); |
1635 reset(); | 1612 reset(); |
1636 } | 1613 } |
(...skipping 29 matching lines...) Expand all Loading... |
1666 | 1643 |
1667 //------------------------------------------------------------------------------
-- | 1644 //------------------------------------------------------------------------------
-- |
1668 // | 1645 // |
1669 // region | 1646 // region |
1670 // | 1647 // |
1671 //------------------------------------------------------------------------------
-- | 1648 //------------------------------------------------------------------------------
-- |
1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int
64_t startIndex, UErrorCode &status) { | 1649 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int
64_t startIndex, UErrorCode &status) { |
1673 if (U_FAILURE(status)) { | 1650 if (U_FAILURE(status)) { |
1674 return *this; | 1651 return *this; |
1675 } | 1652 } |
1676 | 1653 |
1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { | 1654 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { |
1678 status = U_ILLEGAL_ARGUMENT_ERROR; | 1655 status = U_ILLEGAL_ARGUMENT_ERROR; |
1679 } | 1656 } |
1680 | 1657 |
1681 int64_t nativeStart = regionStart; | 1658 int64_t nativeStart = regionStart; |
1682 int64_t nativeLimit = regionLimit; | 1659 int64_t nativeLimit = regionLimit; |
1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) { | 1660 if (nativeStart > fInputLength || nativeLimit > fInputLength) { |
1684 status = U_ILLEGAL_ARGUMENT_ERROR; | 1661 status = U_ILLEGAL_ARGUMENT_ERROR; |
1685 } | 1662 } |
1686 | 1663 |
1687 if (startIndex == -1) | 1664 if (startIndex == -1) |
1688 this->reset(); | 1665 this->reset(); |
1689 else | 1666 else |
1690 resetPreserveRegion(); | 1667 resetPreserveRegion(); |
1691 | 1668 |
1692 fRegionStart = nativeStart; | 1669 fRegionStart = nativeStart; |
1693 fRegionLimit = nativeLimit; | 1670 fRegionLimit = nativeLimit; |
1694 fActiveStart = nativeStart; | 1671 fActiveStart = nativeStart; |
1695 fActiveLimit = nativeLimit; | 1672 fActiveLimit = nativeLimit; |
1696 | 1673 |
1697 if (startIndex != -1) { | 1674 if (startIndex != -1) { |
1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) { | 1675 if (startIndex < fActiveStart || startIndex > fActiveLimit) { |
1699 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1676 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1700 } | 1677 } |
1701 fMatchEnd = startIndex; | 1678 fMatchEnd = startIndex; |
1702 } | 1679 } |
1703 | 1680 |
1704 if (!fTransparentBounds) { | 1681 if (!fTransparentBounds) { |
1705 fLookStart = nativeStart; | 1682 fLookStart = nativeStart; |
1706 fLookLimit = nativeLimit; | 1683 fLookLimit = nativeLimit; |
1707 } | 1684 } |
1708 if (fAnchoringBounds) { | 1685 if (fAnchoringBounds) { |
1709 fAnchorStart = nativeStart; | 1686 fAnchorStart = nativeStart; |
1710 fAnchorLimit = nativeLimit; | 1687 fAnchorLimit = nativeLimit; |
1711 } | 1688 } |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1748 // replaceAll | 1725 // replaceAll |
1749 // | 1726 // |
1750 //------------------------------------------------------------------------------
-- | 1727 //------------------------------------------------------------------------------
-- |
1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC
ode &status) { | 1728 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC
ode &status) { |
1752 UText replacementText = UTEXT_INITIALIZER; | 1729 UText replacementText = UTEXT_INITIALIZER; |
1753 UText resultText = UTEXT_INITIALIZER; | 1730 UText resultText = UTEXT_INITIALIZER; |
1754 UnicodeString resultString; | 1731 UnicodeString resultString; |
1755 if (U_FAILURE(status)) { | 1732 if (U_FAILURE(status)) { |
1756 return resultString; | 1733 return resultString; |
1757 } | 1734 } |
1758 | 1735 |
1759 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 1736 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
1760 utext_openUnicodeString(&resultText, &resultString, &status); | 1737 utext_openUnicodeString(&resultText, &resultString, &status); |
1761 | 1738 |
1762 replaceAll(&replacementText, &resultText, status); | 1739 replaceAll(&replacementText, &resultText, status); |
1763 | 1740 |
1764 utext_close(&resultText); | 1741 utext_close(&resultText); |
1765 utext_close(&replacementText); | 1742 utext_close(&replacementText); |
1766 | 1743 |
1767 return resultString; | 1744 return resultString; |
1768 } | 1745 } |
1769 | 1746 |
1770 | 1747 |
1771 // | 1748 // |
1772 // replaceAll, UText mode | 1749 // replaceAll, UText mode |
1773 // | 1750 // |
1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
tus) { | 1751 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta
tus) { |
1775 if (U_FAILURE(status)) { | 1752 if (U_FAILURE(status)) { |
1776 return dest; | 1753 return dest; |
1777 } | 1754 } |
1778 if (U_FAILURE(fDeferredStatus)) { | 1755 if (U_FAILURE(fDeferredStatus)) { |
1779 status = fDeferredStatus; | 1756 status = fDeferredStatus; |
1780 return dest; | 1757 return dest; |
1781 } | 1758 } |
1782 | 1759 |
1783 if (dest == NULL) { | 1760 if (dest == NULL) { |
1784 UnicodeString emptyString; | 1761 UnicodeString emptyString; |
1785 UText empty = UTEXT_INITIALIZER; | 1762 UText empty = UTEXT_INITIALIZER; |
1786 | 1763 |
1787 utext_openUnicodeString(&empty, &emptyString, &status); | 1764 utext_openUnicodeString(&empty, &emptyString, &status); |
1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); | 1765 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
1789 utext_close(&empty); | 1766 utext_close(&empty); |
1790 } | 1767 } |
1791 | 1768 |
1792 if (U_SUCCESS(status)) { | 1769 if (U_SUCCESS(status)) { |
1793 reset(); | 1770 reset(); |
1794 while (find()) { | 1771 while (find()) { |
1795 appendReplacement(dest, replacement, status); | 1772 appendReplacement(dest, replacement, status); |
1796 if (U_FAILURE(status)) { | 1773 if (U_FAILURE(status)) { |
1797 break; | 1774 break; |
1798 } | 1775 } |
1799 } | 1776 } |
1800 appendTail(dest, status); | 1777 appendTail(dest, status); |
1801 } | 1778 } |
1802 | 1779 |
1803 return dest; | 1780 return dest; |
1804 } | 1781 } |
1805 | 1782 |
1806 | 1783 |
1807 //------------------------------------------------------------------------------
-- | 1784 //------------------------------------------------------------------------------
-- |
1808 // | 1785 // |
1809 // replaceFirst | 1786 // replaceFirst |
1810 // | 1787 // |
1811 //------------------------------------------------------------------------------
-- | 1788 //------------------------------------------------------------------------------
-- |
1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
rCode &status) { | 1789 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro
rCode &status) { |
1813 UText replacementText = UTEXT_INITIALIZER; | 1790 UText replacementText = UTEXT_INITIALIZER; |
1814 UText resultText = UTEXT_INITIALIZER; | 1791 UText resultText = UTEXT_INITIALIZER; |
1815 UnicodeString resultString; | 1792 UnicodeString resultString; |
1816 | 1793 |
1817 utext_openConstUnicodeString(&replacementText, &replacement, &status); | 1794 utext_openConstUnicodeString(&replacementText, &replacement, &status); |
1818 utext_openUnicodeString(&resultText, &resultString, &status); | 1795 utext_openUnicodeString(&resultText, &resultString, &status); |
1819 | 1796 |
1820 replaceFirst(&replacementText, &resultText, status); | 1797 replaceFirst(&replacementText, &resultText, status); |
1821 | 1798 |
1822 utext_close(&resultText); | 1799 utext_close(&resultText); |
1823 utext_close(&replacementText); | 1800 utext_close(&replacementText); |
1824 | 1801 |
1825 return resultString; | 1802 return resultString; |
1826 } | 1803 } |
1827 | 1804 |
1828 // | 1805 // |
1829 // replaceFirst, UText mode | 1806 // replaceFirst, UText mode |
1830 // | 1807 // |
1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
tatus) { | 1808 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s
tatus) { |
1832 if (U_FAILURE(status)) { | 1809 if (U_FAILURE(status)) { |
1833 return dest; | 1810 return dest; |
1834 } | 1811 } |
1835 if (U_FAILURE(fDeferredStatus)) { | 1812 if (U_FAILURE(fDeferredStatus)) { |
1836 status = fDeferredStatus; | 1813 status = fDeferredStatus; |
1837 return dest; | 1814 return dest; |
1838 } | 1815 } |
1839 | 1816 |
1840 reset(); | 1817 reset(); |
1841 if (!find()) { | 1818 if (!find()) { |
1842 return getInput(dest, status); | 1819 return getInput(dest, status); |
1843 } | 1820 } |
1844 | 1821 |
1845 if (dest == NULL) { | 1822 if (dest == NULL) { |
1846 UnicodeString emptyString; | 1823 UnicodeString emptyString; |
1847 UText empty = UTEXT_INITIALIZER; | 1824 UText empty = UTEXT_INITIALIZER; |
1848 | 1825 |
1849 utext_openUnicodeString(&empty, &emptyString, &status); | 1826 utext_openUnicodeString(&empty, &emptyString, &status); |
1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); | 1827 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
1851 utext_close(&empty); | 1828 utext_close(&empty); |
1852 } | 1829 } |
1853 | 1830 |
1854 appendReplacement(dest, replacement, status); | 1831 appendReplacement(dest, replacement, status); |
1855 appendTail(dest, status); | 1832 appendTail(dest, status); |
1856 | 1833 |
1857 return dest; | 1834 return dest; |
1858 } | 1835 } |
1859 | 1836 |
1860 | 1837 |
1861 //------------------------------------------------------------------------------
-- | 1838 //------------------------------------------------------------------------------
-- |
1862 // | 1839 // |
1863 // requireEnd | 1840 // requireEnd |
1864 // | 1841 // |
1865 //------------------------------------------------------------------------------
-- | 1842 //------------------------------------------------------------------------------
-- |
1866 UBool RegexMatcher::requireEnd() const { | 1843 UBool RegexMatcher::requireEnd() const { |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1901 //resetStack(); // more expensive than it looks... | 1878 //resetStack(); // more expensive than it looks... |
1902 } | 1879 } |
1903 | 1880 |
1904 | 1881 |
1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { | 1882 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat
us); | 1883 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat
us); |
1907 if (fPattern->fNeedsAltInput) { | 1884 if (fPattern->fNeedsAltInput) { |
1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe
ferredStatus); | 1885 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe
ferredStatus); |
1909 } | 1886 } |
1910 fInputLength = utext_nativeLength(fInputText); | 1887 fInputLength = utext_nativeLength(fInputText); |
1911 | 1888 |
1912 reset(); | 1889 reset(); |
1913 delete fInput; | 1890 delete fInput; |
1914 fInput = NULL; | 1891 fInput = NULL; |
1915 | 1892 |
1916 // Do the following for any UnicodeString. | 1893 // Do the following for any UnicodeString. |
1917 // This is for compatibility for those clients who modify the input string
"live" during regex operations. | 1894 // This is for compatibility for those clients who modify the input string
"live" during regex operations. |
1918 fInputUniStrMaybeMutable = TRUE; | 1895 fInputUniStrMaybeMutable = TRUE; |
1919 | 1896 |
1920 if (fWordBreakItr != NULL) { | 1897 if (fWordBreakItr != NULL) { |
1921 #if UCONFIG_NO_BREAK_ITERATION==0 | 1898 #if UCONFIG_NO_BREAK_ITERATION==0 |
1922 UErrorCode status = U_ZERO_ERROR; | 1899 UErrorCode status = U_ZERO_ERROR; |
1923 fWordBreakItr->setText(fInputText, status); | 1900 fWordBreakItr->setText(fInputText, status); |
1924 #endif | 1901 #endif |
1925 } | 1902 } |
1926 return *this; | 1903 return *this; |
1927 } | 1904 } |
1928 | 1905 |
1929 | 1906 |
1930 RegexMatcher &RegexMatcher::reset(UText *input) { | 1907 RegexMatcher &RegexMatcher::reset(UText *input) { |
1931 if (fInputText != input) { | 1908 if (fInputText != input) { |
1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu
s); | 1909 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu
s); |
1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText,
fInputText, FALSE, TRUE, &fDeferredStatus); | 1910 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText,
fInputText, FALSE, TRUE, &fDeferredStatus); |
1934 fInputLength = utext_nativeLength(fInputText); | 1911 fInputLength = utext_nativeLength(fInputText); |
1935 | 1912 |
1936 delete fInput; | 1913 delete fInput; |
1937 fInput = NULL; | 1914 fInput = NULL; |
1938 | 1915 |
1939 if (fWordBreakItr != NULL) { | 1916 if (fWordBreakItr != NULL) { |
1940 #if UCONFIG_NO_BREAK_ITERATION==0 | 1917 #if UCONFIG_NO_BREAK_ITERATION==0 |
1941 UErrorCode status = U_ZERO_ERROR; | 1918 UErrorCode status = U_ZERO_ERROR; |
1942 fWordBreakItr->setText(input, status); | 1919 fWordBreakItr->setText(input, status); |
1943 #endif | 1920 #endif |
1944 } | 1921 } |
1945 } | 1922 } |
1946 reset(); | 1923 reset(); |
1947 fInputUniStrMaybeMutable = FALSE; | 1924 fInputUniStrMaybeMutable = FALSE; |
1948 | 1925 |
1949 return *this; | 1926 return *this; |
1950 } | 1927 } |
1951 | 1928 |
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) { | 1929 /*RegexMatcher &RegexMatcher::reset(const UChar *) { |
1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; | 1930 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; |
1954 return *this; | 1931 return *this; |
1955 }*/ | 1932 }*/ |
1956 | 1933 |
1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { | 1934 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { |
1958 if (U_FAILURE(status)) { | 1935 if (U_FAILURE(status)) { |
1959 return *this; | 1936 return *this; |
1960 } | 1937 } |
1961 reset(); // Reset also resets the region to be the entire string. | 1938 reset(); // Reset also resets the region to be the entire string. |
1962 | 1939 |
1963 if (position < 0 || position > fActiveLimit) { | 1940 if (position < 0 || position > fActiveLimit) { |
1964 status = U_INDEX_OUTOFBOUNDS_ERROR; | 1941 status = U_INDEX_OUTOFBOUNDS_ERROR; |
1965 return *this; | 1942 return *this; |
1966 } | 1943 } |
1967 fMatchEnd = position; | 1944 fMatchEnd = position; |
1968 return *this; | 1945 return *this; |
1969 } | 1946 } |
1970 | 1947 |
1971 | 1948 |
1972 //------------------------------------------------------------------------------
-- | 1949 //------------------------------------------------------------------------------
-- |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2036 | 2013 |
2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); | 2014 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); |
2038 if (destText == NULL) { | 2015 if (destText == NULL) { |
2039 status = U_MEMORY_ALLOCATION_ERROR; | 2016 status = U_MEMORY_ALLOCATION_ERROR; |
2040 return 0; | 2017 return 0; |
2041 } | 2018 } |
2042 int32_t i; | 2019 int32_t i; |
2043 for (i = 0; i < destCapacity; i++) { | 2020 for (i = 0; i < destCapacity; i++) { |
2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); | 2021 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); |
2045 } | 2022 } |
2046 | 2023 |
2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status); | 2024 int32_t fieldCount = split(&inputText, destText, destCapacity, status); |
2048 | 2025 |
2049 for (i = 0; i < destCapacity; i++) { | 2026 for (i = 0; i < destCapacity; i++) { |
2050 utext_close(destText[i]); | 2027 utext_close(destText[i]); |
2051 } | 2028 } |
2052 | 2029 |
2053 uprv_free(destText); | 2030 uprv_free(destText); |
2054 utext_close(&inputText); | 2031 utext_close(&inputText); |
2055 return fieldCount; | 2032 return fieldCount; |
2056 } | 2033 } |
2057 | 2034 |
2058 // | 2035 // |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2094 // There is one or zero output string left. | 2071 // There is one or zero output string left. |
2095 // Fill the last output string with whatever is left from the input,
then exit the loop. | 2072 // Fill the last output string with whatever is left from the input,
then exit the loop. |
2096 // ( i will be == destCapacity if we filled the output array while
processing | 2073 // ( i will be == destCapacity if we filled the output array while
processing |
2097 // capture groups of the delimiter expression, in which case we w
ill discard the | 2074 // capture groups of the delimiter expression, in which case we w
ill discard the |
2098 // last capture group saved in favor of the unprocessed remainder
of the | 2075 // last capture group saved in favor of the unprocessed remainder
of the |
2099 // input string.) | 2076 // input string.) |
2100 i = destCapacity-1; | 2077 i = destCapacity-1; |
2101 if (fActiveLimit > nextOutputStringStart) { | 2078 if (fActiveLimit > nextOutputStringStart) { |
2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2079 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
2103 if (dest[i]) { | 2080 if (dest[i]) { |
2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2081 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
2105 input->chunkContents+nextOutputStringStart
, | 2082 input->chunkContents+nextOutputStringStart
, |
2106 (int32_t)(fActiveLimit-nextOutputStringSta
rt), &status); | 2083 (int32_t)(fActiveLimit-nextOutputStringSta
rt), &status); |
2107 } else { | 2084 } else { |
2108 UText remainingText = UTEXT_INITIALIZER; | 2085 UText remainingText = UTEXT_INITIALIZER; |
2109 utext_openUChars(&remainingText, input->chunkContents+ne
xtOutputStringStart, | 2086 utext_openUChars(&remainingText, input->chunkContents+ne
xtOutputStringStart, |
2110 fActiveLimit-nextOutputStringStart, &st
atus); | 2087 fActiveLimit-nextOutputStringStart, &st
atus); |
2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); | 2088 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
2112 utext_close(&remainingText); | 2089 utext_close(&remainingText); |
2113 } | 2090 } |
2114 } else { | 2091 } else { |
2115 UErrorCode lengthStatus = U_ZERO_ERROR; | 2092 UErrorCode lengthStatus = U_ZERO_ERROR; |
2116 int32_t remaining16Length = | 2093 int32_t remaining16Length = |
2117 utext_extract(input, nextOutputStringStart, fActiveLimit
, NULL, 0, &lengthStatus); | 2094 utext_extract(input, nextOutputStringStart, fActiveLimit
, NULL, 0, &lengthStatus); |
2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(
remaining16Length+1)); | 2095 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(
remaining16Length+1)); |
2119 if (remainingChars == NULL) { | 2096 if (remainingChars == NULL) { |
2120 status = U_MEMORY_ALLOCATION_ERROR; | 2097 status = U_MEMORY_ALLOCATION_ERROR; |
2121 break; | 2098 break; |
2122 } | 2099 } |
2123 | 2100 |
2124 utext_extract(input, nextOutputStringStart, fActiveLimit, re
mainingChars, remaining16Length+1, &status); | 2101 utext_extract(input, nextOutputStringStart, fActiveLimit, re
mainingChars, remaining16Length+1, &status); |
2125 if (dest[i]) { | 2102 if (dest[i]) { |
2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r
emainingChars, remaining16Length, &status); | 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r
emainingChars, remaining16Length, &status); |
2127 } else { | 2104 } else { |
2128 UText remainingText = UTEXT_INITIALIZER; | 2105 UText remainingText = UTEXT_INITIALIZER; |
2129 utext_openUChars(&remainingText, remainingChars, remaini
ng16Length, &status); | 2106 utext_openUChars(&remainingText, remainingChars, remaini
ng16Length, &status); |
2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); | 2107 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE,
&status); |
2131 utext_close(&remainingText); | 2108 utext_close(&remainingText); |
2132 } | 2109 } |
2133 | 2110 |
2134 uprv_free(remainingChars); | 2111 uprv_free(remainingChars); |
2135 } | 2112 } |
2136 } | 2113 } |
2137 break; | 2114 break; |
2138 } | 2115 } |
2139 if (find()) { | 2116 if (find()) { |
2140 // We found another delimiter. Move everything from where we starte
d looking | 2117 // We found another delimiter. Move everything from where we starte
d looking |
2141 // up until the start of the delimiter into the next output string. | 2118 // up until the start of the delimiter into the next output string. |
2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2119 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
2143 if (dest[i]) { | 2120 if (dest[i]) { |
2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2121 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
2145 input->chunkContents+nextOutputStringStart, | 2122 input->chunkContents+nextOutputStringStart, |
2146 (int32_t)(fMatchStart-nextOutputStringStart),
&status); | 2123 (int32_t)(fMatchStart-nextOutputStringStart),
&status); |
2147 } else { | 2124 } else { |
2148 UText remainingText = UTEXT_INITIALIZER; | 2125 UText remainingText = UTEXT_INITIALIZER; |
2149 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, | 2126 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
2150 fMatchStart-nextOutputStringStart, &status
); | 2127 fMatchStart-nextOutputStringStart, &status
); |
2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2128 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
2152 utext_close(&remainingText); | 2129 utext_close(&remainingText); |
2153 } | 2130 } |
2154 } else { | 2131 } else { |
2155 UErrorCode lengthStatus = U_ZERO_ERROR; | 2132 UErrorCode lengthStatus = U_ZERO_ERROR; |
2156 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fMatchStart, NULL, 0, &lengthStatus); | 2133 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fMatchStart, NULL, 0, &lengthStatus); |
2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); | 2134 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
2158 if (remainingChars == NULL) { | 2135 if (remainingChars == NULL) { |
2159 status = U_MEMORY_ALLOCATION_ERROR; | 2136 status = U_MEMORY_ALLOCATION_ERROR; |
2160 break; | 2137 break; |
2161 } | 2138 } |
2162 utext_extract(input, nextOutputStringStart, fMatchStart, remaini
ngChars, remaining16Length+1, &status); | 2139 utext_extract(input, nextOutputStringStart, fMatchStart, remaini
ngChars, remaining16Length+1, &status); |
2163 if (dest[i]) { | 2140 if (dest[i]) { |
2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); | 2141 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
2165 } else { | 2142 } else { |
2166 UText remainingText = UTEXT_INITIALIZER; | 2143 UText remainingText = UTEXT_INITIALIZER; |
2167 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); | 2144 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2145 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
2169 utext_close(&remainingText); | 2146 utext_close(&remainingText); |
2170 } | 2147 } |
2171 | 2148 |
2172 uprv_free(remainingChars); | 2149 uprv_free(remainingChars); |
2173 } | 2150 } |
2174 nextOutputStringStart = fMatchEnd; | 2151 nextOutputStringStart = fMatchEnd; |
2175 | 2152 |
2176 // If the delimiter pattern has capturing parentheses, the captured | 2153 // If the delimiter pattern has capturing parentheses, the captured |
2177 // text goes out into the next n destination strings. | 2154 // text goes out into the next n destination strings. |
2178 int32_t groupNum; | 2155 int32_t groupNum; |
2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { | 2156 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
2180 if (i >= destCapacity-2) { | 2157 if (i >= destCapacity-2) { |
2181 // Never fill the last available output string with capture
group text. | 2158 // Never fill the last available output string with capture
group text. |
(...skipping 12 matching lines...) Expand all Loading... |
2194 if (i+1 < destCapacity) { | 2171 if (i+1 < destCapacity) { |
2195 ++i; | 2172 ++i; |
2196 if (dest[i] == NULL) { | 2173 if (dest[i] == NULL) { |
2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status); | 2174 dest[i] = utext_openUChars(NULL, NULL, 0, &status); |
2198 } else { | 2175 } else { |
2199 static UChar emptyString[] = {(UChar)0}; | 2176 static UChar emptyString[] = {(UChar)0}; |
2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e
mptyString, 0, &status); | 2177 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e
mptyString, 0, &status); |
2201 } | 2178 } |
2202 } | 2179 } |
2203 break; | 2180 break; |
2204 | 2181 |
2205 } | 2182 } |
2206 } | 2183 } |
2207 else | 2184 else |
2208 { | 2185 { |
2209 // We ran off the end of the input while looking for the next delimi
ter. | 2186 // We ran off the end of the input while looking for the next delimi
ter. |
2210 // All the remaining text goes into the current output string. | 2187 // All the remaining text goes into the current output string. |
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { | 2188 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
2212 if (dest[i]) { | 2189 if (dest[i]) { |
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), | 2190 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
2214 input->chunkContents+nextOutputStringStart, | 2191 input->chunkContents+nextOutputStringStart, |
2215 (int32_t)(fActiveLimit-nextOutputStringStart),
&status); | 2192 (int32_t)(fActiveLimit-nextOutputStringStart),
&status); |
2216 } else { | 2193 } else { |
2217 UText remainingText = UTEXT_INITIALIZER; | 2194 UText remainingText = UTEXT_INITIALIZER; |
2218 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, | 2195 utext_openUChars(&remainingText, input->chunkContents+nextOu
tputStringStart, |
2219 fActiveLimit-nextOutputStringStart, &status
); | 2196 fActiveLimit-nextOutputStringStart, &status
); |
2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2197 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
2221 utext_close(&remainingText); | 2198 utext_close(&remainingText); |
2222 } | 2199 } |
2223 } else { | 2200 } else { |
2224 UErrorCode lengthStatus = U_ZERO_ERROR; | 2201 UErrorCode lengthStatus = U_ZERO_ERROR; |
2225 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fActiveLimit, NULL, 0, &lengthStatus); | 2202 int32_t remaining16Length = utext_extract(input, nextOutputStrin
gStart, fActiveLimit, NULL, 0, &lengthStatus); |
2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); | 2203 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema
ining16Length+1)); |
2227 if (remainingChars == NULL) { | 2204 if (remainingChars == NULL) { |
2228 status = U_MEMORY_ALLOCATION_ERROR; | 2205 status = U_MEMORY_ALLOCATION_ERROR; |
2229 break; | 2206 break; |
2230 } | 2207 } |
2231 | 2208 |
2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remain
ingChars, remaining16Length+1, &status); | 2209 utext_extract(input, nextOutputStringStart, fActiveLimit, remain
ingChars, remaining16Length+1, &status); |
2233 if (dest[i]) { | 2210 if (dest[i]) { |
2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); | 2211 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai
ningChars, remaining16Length, &status); |
2235 } else { | 2212 } else { |
2236 UText remainingText = UTEXT_INITIALIZER; | 2213 UText remainingText = UTEXT_INITIALIZER; |
2237 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); | 2214 utext_openUChars(&remainingText, remainingChars, remaining16
Length, &status); |
2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); | 2215 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st
atus); |
2239 utext_close(&remainingText); | 2216 utext_close(&remainingText); |
2240 } | 2217 } |
2241 | 2218 |
2242 uprv_free(remainingChars); | 2219 uprv_free(remainingChars); |
2243 } | 2220 } |
2244 break; | 2221 break; |
2245 } | 2222 } |
2246 if (U_FAILURE(status)) { | 2223 if (U_FAILURE(status)) { |
2247 break; | 2224 break; |
2248 } | 2225 } |
2249 } // end of for loop | 2226 } // end of for loop |
2250 return i+1; | 2227 return i+1; |
2251 } | 2228 } |
(...skipping 29 matching lines...) Expand all Loading... |
2281 if (fMatch == FALSE) { | 2258 if (fMatch == FALSE) { |
2282 status = U_REGEX_INVALID_STATE; | 2259 status = U_REGEX_INVALID_STATE; |
2283 return -1; | 2260 return -1; |
2284 } | 2261 } |
2285 if (group < 0 || group > fPattern->fGroupMap->size()) { | 2262 if (group < 0 || group > fPattern->fGroupMap->size()) { |
2286 status = U_INDEX_OUTOFBOUNDS_ERROR; | 2263 status = U_INDEX_OUTOFBOUNDS_ERROR; |
2287 return -1; | 2264 return -1; |
2288 } | 2265 } |
2289 int64_t s; | 2266 int64_t s; |
2290 if (group == 0) { | 2267 if (group == 0) { |
2291 s = fMatchStart; | 2268 s = fMatchStart; |
2292 } else { | 2269 } else { |
2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); | 2270 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
2294 U_ASSERT(groupOffset < fPattern->fFrameSize); | 2271 U_ASSERT(groupOffset < fPattern->fFrameSize); |
2295 U_ASSERT(groupOffset >= 0); | 2272 U_ASSERT(groupOffset >= 0); |
2296 s = fFrame->fExtra[groupOffset]; | 2273 s = fFrame->fExtra[groupOffset]; |
2297 } | 2274 } |
2298 | 2275 |
2299 return s; | 2276 return s; |
2300 } | 2277 } |
2301 | 2278 |
2302 | 2279 |
2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { | 2280 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { |
2304 return (int32_t)start64(group, status); | 2281 return (int32_t)start64(group, status); |
2305 } | 2282 } |
2306 | 2283 |
2307 //------------------------------------------------------------------------------
-- | 2284 //------------------------------------------------------------------------------
-- |
2308 // | 2285 // |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2370 return; | 2347 return; |
2371 } | 2348 } |
2372 if (U_FAILURE(fDeferredStatus)) { | 2349 if (U_FAILURE(fDeferredStatus)) { |
2373 status = fDeferredStatus; | 2350 status = fDeferredStatus; |
2374 return; | 2351 return; |
2375 } | 2352 } |
2376 if (limit < 0) { | 2353 if (limit < 0) { |
2377 status = U_ILLEGAL_ARGUMENT_ERROR; | 2354 status = U_ILLEGAL_ARGUMENT_ERROR; |
2378 return; | 2355 return; |
2379 } | 2356 } |
2380 | 2357 |
2381 // Reset the matcher. This is needed here in case there is a current match | 2358 // Reset the matcher. This is needed here in case there is a current match |
2382 // whose final stack frame (containing the match results, pointed to by f
Frame) | 2359 // whose final stack frame (containing the match results, pointed to by f
Frame) |
2383 // would be lost by resizing to a smaller stack size. | 2360 // would be lost by resizing to a smaller stack size. |
2384 reset(); | 2361 reset(); |
2385 | 2362 |
2386 if (limit == 0) { | 2363 if (limit == 0) { |
2387 // Unlimited stack expansion | 2364 // Unlimited stack expansion |
2388 fStack->setMaxCapacity(0); | 2365 fStack->setMaxCapacity(0); |
2389 } else { | 2366 } else { |
2390 // Change the units of the limit from bytes to ints, and bump the size
up | 2367 // Change the units of the limit from bytes to ints, and bump the size
up |
2391 // to be big enough to hold at least one stack frame for the pattern, | 2368 // to be big enough to hold at least one stack frame for the pattern, |
2392 // if it isn't there already. | 2369 // if it isn't there already. |
2393 int32_t adjustedLimit = limit / sizeof(int32_t); | 2370 int32_t adjustedLimit = limit / sizeof(int32_t); |
2394 if (adjustedLimit < fPattern->fFrameSize) { | 2371 if (adjustedLimit < fPattern->fFrameSize) { |
2395 adjustedLimit = fPattern->fFrameSize; | 2372 adjustedLimit = fPattern->fFrameSize; |
2396 } | 2373 } |
2397 fStack->setMaxCapacity(adjustedLimit); | 2374 fStack->setMaxCapacity(adjustedLimit); |
2398 } | 2375 } |
2399 fStackLimit = limit; | 2376 fStackLimit = limit; |
2400 } | 2377 } |
2401 | 2378 |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2479 // Code following this point in this file is the internal | 2456 // Code following this point in this file is the internal |
2480 // Match Engine Implementation. | 2457 // Match Engine Implementation. |
2481 // | 2458 // |
2482 //==============================================================================
== | 2459 //==============================================================================
== |
2483 | 2460 |
2484 | 2461 |
2485 //------------------------------------------------------------------------------
-- | 2462 //------------------------------------------------------------------------------
-- |
2486 // | 2463 // |
2487 // resetStack | 2464 // resetStack |
2488 // Discard any previous contents of the state save stack, and initiali
ze a | 2465 // Discard any previous contents of the state save stack, and initiali
ze a |
2489 // new stack frame to all -1. The -1s are needed for capture group li
mits, | 2466 // new stack frame to all -1. The -1s are needed for capture group li
mits, |
2490 // where they indicate that a group has not yet matched anything. | 2467 // where they indicate that a group has not yet matched anything. |
2491 //------------------------------------------------------------------------------
-- | 2468 //------------------------------------------------------------------------------
-- |
2492 REStackFrame *RegexMatcher::resetStack() { | 2469 REStackFrame *RegexMatcher::resetStack() { |
2493 // Discard any previous contents of the state save stack, and initialize a | 2470 // Discard any previous contents of the state save stack, and initialize a |
2494 // new stack frame with all -1 data. The -1s are needed for capture group
limits, | 2471 // new stack frame with all -1 data. The -1s are needed for capture group
limits, |
2495 // where they indicate that a group has not yet matched anything. | 2472 // where they indicate that a group has not yet matched anything. |
2496 fStack->removeAllElements(); | 2473 fStack->removeAllElements(); |
2497 | 2474 |
2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame
Size, fDeferredStatus); | 2475 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame
Size, fDeferredStatus); |
2499 int32_t i; | 2476 int32_t i; |
2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { | 2477 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
2501 iFrame->fExtra[i] = -1; | 2478 iFrame->fExtra[i] = -1; |
2502 } | 2479 } |
2503 return iFrame; | 2480 return iFrame; |
2504 } | 2481 } |
2505 | 2482 |
2506 | 2483 |
2507 | 2484 |
2508 //------------------------------------------------------------------------------
-- | 2485 //------------------------------------------------------------------------------
-- |
2509 // | 2486 // |
2510 // isWordBoundary | 2487 // isWordBoundary |
2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 | 2488 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 |
2512 // For us, | 2489 // For us, |
2513 // If the current char is a combining mark, | 2490 // If the current char is a combining mark, |
2514 // \b is FALSE. | 2491 // \b is FALSE. |
2515 // Else Scan backwards to the first non-combining char. | 2492 // Else Scan backwards to the first non-combining char. |
2516 // We are at a boundary if the this char and the orig
inal chars are | 2493 // We are at a boundary if the this char and the orig
inal chars are |
2517 // opposite in membership in \w set | 2494 // opposite in membership in \w set |
2518 // | 2495 // |
2519 // parameters: pos - the current position in the input buffer | 2496 // parameters: pos - the current position in the input buffer |
2520 // | 2497 // |
2521 // TODO: double-check edge cases at region boundaries. | 2498 // TODO: double-check edge cases at region boundaries. |
2522 // | 2499 // |
2523 //------------------------------------------------------------------------------
-- | 2500 //------------------------------------------------------------------------------
-- |
2524 UBool RegexMatcher::isWordBoundary(int64_t pos) { | 2501 UBool RegexMatcher::isWordBoundary(int64_t pos) { |
2525 UBool isBoundary = FALSE; | 2502 UBool isBoundary = FALSE; |
2526 UBool cIsWord = FALSE; | 2503 UBool cIsWord = FALSE; |
2527 | 2504 |
2528 if (pos >= fLookLimit) { | 2505 if (pos >= fLookLimit) { |
2529 fHitEnd = TRUE; | 2506 fHitEnd = TRUE; |
2530 } else { | 2507 } else { |
2531 // Determine whether char c at current position is a member of the word
set of chars. | 2508 // Determine whether char c at current position is a member of the word
set of chars. |
2532 // If we're off the end of the string, behave as though we're not at a w
ord char. | 2509 // If we're off the end of the string, behave as though we're not at a w
ord char. |
2533 UTEXT_SETNATIVEINDEX(fInputText, pos); | 2510 UTEXT_SETNATIVEINDEX(fInputText, pos); |
2534 UChar32 c = UTEXT_CURRENT32(fInputText); | 2511 UChar32 c = UTEXT_CURRENT32(fInputText); |
2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { | 2512 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
2536 // Current char is a combining one. Not a boundary. | 2513 // Current char is a combining one. Not a boundary. |
2537 return FALSE; | 2514 return FALSE; |
2538 } | 2515 } |
2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); | 2516 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
2540 } | 2517 } |
2541 | 2518 |
2542 // Back up until we come to a non-combining char, determine whether | 2519 // Back up until we come to a non-combining char, determine whether |
2543 // that char is a word char. | 2520 // that char is a word char. |
2544 UBool prevCIsWord = FALSE; | 2521 UBool prevCIsWord = FALSE; |
2545 for (;;) { | 2522 for (;;) { |
2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { | 2523 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { |
2547 break; | 2524 break; |
2548 } | 2525 } |
2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); | 2526 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); |
2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) | 2527 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
2551 || u_charType(prevChar) == U_FORMAT_CHAR)) { | 2528 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); | 2529 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
2553 break; | 2530 break; |
2554 } | 2531 } |
2555 } | 2532 } |
2556 isBoundary = cIsWord ^ prevCIsWord; | 2533 isBoundary = cIsWord ^ prevCIsWord; |
2557 return isBoundary; | 2534 return isBoundary; |
2558 } | 2535 } |
2559 | 2536 |
2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { | 2537 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { |
2561 UBool isBoundary = FALSE; | 2538 UBool isBoundary = FALSE; |
2562 UBool cIsWord = FALSE; | 2539 UBool cIsWord = FALSE; |
2563 | 2540 |
2564 const UChar *inputBuf = fInputText->chunkContents; | 2541 const UChar *inputBuf = fInputText->chunkContents; |
2565 | 2542 |
2566 if (pos >= fLookLimit) { | 2543 if (pos >= fLookLimit) { |
2567 fHitEnd = TRUE; | 2544 fHitEnd = TRUE; |
2568 } else { | 2545 } else { |
2569 // Determine whether char c at current position is a member of the word
set of chars. | 2546 // Determine whether char c at current position is a member of the word
set of chars. |
2570 // If we're off the end of the string, behave as though we're not at a w
ord char. | 2547 // If we're off the end of the string, behave as though we're not at a w
ord char. |
2571 UChar32 c; | 2548 UChar32 c; |
2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); | 2549 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); |
2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { | 2550 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_
FORMAT_CHAR) { |
2574 // Current char is a combining one. Not a boundary. | 2551 // Current char is a combining one. Not a boundary. |
2575 return FALSE; | 2552 return FALSE; |
2576 } | 2553 } |
2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); | 2554 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
2578 } | 2555 } |
2579 | 2556 |
2580 // Back up until we come to a non-combining char, determine whether | 2557 // Back up until we come to a non-combining char, determine whether |
2581 // that char is a word char. | 2558 // that char is a word char. |
2582 UBool prevCIsWord = FALSE; | 2559 UBool prevCIsWord = FALSE; |
2583 for (;;) { | 2560 for (;;) { |
2584 if (pos <= fLookStart) { | 2561 if (pos <= fLookStart) { |
2585 break; | 2562 break; |
2586 } | 2563 } |
2587 UChar32 prevChar; | 2564 UChar32 prevChar; |
2588 U16_PREV(inputBuf, fLookStart, pos, prevChar); | 2565 U16_PREV(inputBuf, fLookStart, pos, prevChar); |
2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) | 2566 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
2590 || u_charType(prevChar) == U_FORMAT_CHAR)) { | 2567 || u_charType(prevChar) == U_FORMAT_CHAR)) { |
2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); | 2568 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh
ar); |
2592 break; | 2569 break; |
2593 } | 2570 } |
2594 } | 2571 } |
2595 isBoundary = cIsWord ^ prevCIsWord; | 2572 isBoundary = cIsWord ^ prevCIsWord; |
2596 return isBoundary; | 2573 return isBoundary; |
2597 } | 2574 } |
2598 | 2575 |
2599 //------------------------------------------------------------------------------
-- | 2576 //------------------------------------------------------------------------------
-- |
2600 // | 2577 // |
2601 // isUWordBoundary | 2578 // isUWordBoundary |
2602 // | 2579 // |
2603 // Test for a word boundary using RBBI word break. | 2580 // Test for a word boundary using RBBI word break. |
2604 // | 2581 // |
2605 // parameters: pos - the current position in the input buffer | 2582 // parameters: pos - the current position in the input buffer |
2606 // | 2583 // |
2607 //------------------------------------------------------------------------------
-- | 2584 //------------------------------------------------------------------------------
-- |
2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) { | 2585 UBool RegexMatcher::isUWordBoundary(int64_t pos) { |
2609 UBool returnVal = FALSE; | 2586 UBool returnVal = FALSE; |
2610 #if UCONFIG_NO_BREAK_ITERATION==0 | 2587 #if UCONFIG_NO_BREAK_ITERATION==0 |
2611 | 2588 |
2612 // If we haven't yet created a break iterator for this matcher, do it now. | 2589 // If we haven't yet created a break iterator for this matcher, do it now. |
2613 if (fWordBreakItr == NULL) { | 2590 if (fWordBreakItr == NULL) { |
2614 fWordBreakItr = | 2591 fWordBreakItr = |
2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::
getEnglish(), fDeferredStatus); | 2592 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::
getEnglish(), fDeferredStatus); |
2616 if (U_FAILURE(fDeferredStatus)) { | 2593 if (U_FAILURE(fDeferredStatus)) { |
2617 return FALSE; | 2594 return FALSE; |
2618 } | 2595 } |
2619 fWordBreakItr->setText(fInputText, fDeferredStatus); | 2596 fWordBreakItr->setText(fInputText, fDeferredStatus); |
2620 } | 2597 } |
2621 | 2598 |
2622 if (pos >= fLookLimit) { | 2599 if (pos >= fLookLimit) { |
2623 fHitEnd = TRUE; | 2600 fHitEnd = TRUE; |
2624 returnVal = TRUE; // With Unicode word rules, only positions within th
e interior of "real" | 2601 returnVal = TRUE; // With Unicode word rules, only positions within th
e interior of "real" |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2656 return; | 2633 return; |
2657 } | 2634 } |
2658 } | 2635 } |
2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) { | 2636 if (fTimeLimit > 0 && fTime >= fTimeLimit) { |
2660 status = U_REGEX_TIME_OUT; | 2637 status = U_REGEX_TIME_OUT; |
2661 } | 2638 } |
2662 } | 2639 } |
2663 | 2640 |
2664 //------------------------------------------------------------------------------
-- | 2641 //------------------------------------------------------------------------------
-- |
2665 // | 2642 // |
2666 // ReportFindProgress This function is called once for each advance in the
target | |
2667 // string from the find() function, and calls the user
progress callback | |
2668 // function if there is one installed. | |
2669 // | |
2670 // NOTE: | |
2671 // | |
2672 // If the match operation needs to be aborted because t
he user | |
2673 // callback asked for it, just set an error status. | |
2674 // The engine will pick that up and stop in its outer l
oop. | |
2675 // | |
2676 //------------------------------------------------------------------------------
-- | |
2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { | |
2678 if (fFindProgressCallbackFn != NULL) { | |
2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex)
== FALSE) { | |
2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; | |
2681 return FALSE; | |
2682 } | |
2683 } | |
2684 return TRUE; | |
2685 } | |
2686 | |
2687 //------------------------------------------------------------------------------
-- | |
2688 // | |
2689 // StateSave | 2643 // StateSave |
2690 // Make a new stack frame, initialized as a copy of the current stack fram
e. | 2644 // Make a new stack frame, initialized as a copy of the current stack fram
e. |
2691 // Set the pattern index in the original stack frame from the operand valu
e | 2645 // Set the pattern index in the original stack frame from the operand valu
e |
2692 // in the opcode. Execution of the engine continues with the state in | 2646 // in the opcode. Execution of the engine continues with the state in |
2693 // the newly created stack frame | 2647 // the newly created stack frame |
2694 // | 2648 // |
2695 // Note that reserveBlock() may grow the stack, resulting in the | 2649 // Note that reserveBlock() may grow the stack, resulting in the |
2696 // whole thing being relocated in memory. | 2650 // whole thing being relocated in memory. |
2697 // | 2651 // |
2698 // Parameters: | 2652 // Parameters: |
2699 // fp The top frame pointer when called. At return, a new | 2653 // fp The top frame pointer when called. At return, a new |
2700 // fame will be present | 2654 // fame will be present |
2701 // savePatIdx An index into the compiled pattern. Goes into the origina
l | 2655 // savePatIdx An index into the compiled pattern. Goes into the origina
l |
2702 // (not new) frame. If execution ever back-tracks out of the | 2656 // (not new) frame. If execution ever back-tracks out of the |
2703 // new frame, this will be where we continue from in the patt
ern. | 2657 // new frame, this will be where we continue from in the patt
ern. |
2704 // Return | 2658 // Return |
2705 // The new frame pointer. | 2659 // The new frame pointer. |
2706 // | 2660 // |
2707 //------------------------------------------------------------------------------
-- | 2661 //------------------------------------------------------------------------------
-- |
2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
x, UErrorCode &status) { | 2662 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
x, UErrorCode &status) { |
2709 // push storage for a new frame. | 2663 // push storage for a new frame. |
2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); | 2664 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
2711 if (newFP == NULL) { | 2665 if (newFP == NULL) { |
2712 // Failure on attempted stack expansion. | 2666 // Failure on attempted stack expansion. |
2713 // Stack function set some other error code, change it to a more | 2667 // Stack function set some other error code, change it to a more |
2714 // specific one for regular expressions. | 2668 // specific one for regular expressions. |
2715 status = U_REGEX_STACK_OVERFLOW; | 2669 status = U_REGEX_STACK_OVERFLOW; |
2716 // We need to return a writable stack frame, so just return the | 2670 // We need to return a writable stack frame, so just return the |
2717 // previous frame. The match operation will stop quickly | 2671 // previous frame. The match operation will stop quickly |
2718 // because of the error status, after which the frame will never | 2672 // because of the error status, after which the frame will never |
2719 // be looked at again. | 2673 // be looked at again. |
2720 return fp; | 2674 return fp; |
2721 } | 2675 } |
2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. | 2676 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. |
2723 | 2677 |
2724 // New stack frame = copy of old top frame. | 2678 // New stack frame = copy of old top frame. |
2725 int64_t *source = (int64_t *)fp; | 2679 int64_t *source = (int64_t *)fp; |
2726 int64_t *dest = newFP; | 2680 int64_t *dest = newFP; |
2727 for (;;) { | 2681 for (;;) { |
2728 *dest++ = *source++; | 2682 *dest++ = *source++; |
2729 if (source == newFP) { | 2683 if (source == newFP) { |
2730 break; | 2684 break; |
2731 } | 2685 } |
2732 } | 2686 } |
2733 | 2687 |
2734 fTickCounter--; | 2688 fTickCounter--; |
2735 if (fTickCounter <= 0) { | 2689 if (fTickCounter <= 0) { |
2736 IncrementTime(status); // Re-initializes fTickCounter | 2690 IncrementTime(status); // Re-initializes fTickCounter |
2737 } | 2691 } |
2738 fp->fPatIdx = savePatIdx; | 2692 fp->fPatIdx = savePatIdx; |
2739 return (REStackFrame *)newFP; | 2693 return (REStackFrame *)newFP; |
2740 } | 2694 } |
2741 | 2695 |
2742 | 2696 |
2743 //------------------------------------------------------------------------------
-- | 2697 //------------------------------------------------------------------------------
-- |
2744 // | 2698 // |
2745 // MatchAt This is the actual matching engine. | 2699 // MatchAt This is the actual matching engine. |
2746 // | 2700 // |
2747 // startIdx: begin matching a this index. | 2701 // startIdx: begin matching a this index. |
2748 // toEnd: if true, match must extend to end of the input
region | 2702 // toEnd: if true, match must extend to end of the input
region |
2749 // | 2703 // |
2750 //------------------------------------------------------------------------------
-- | 2704 //------------------------------------------------------------------------------
-- |
2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { | 2705 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
2752 UBool isMatch = FALSE; // True if the we have a match. | 2706 UBool isMatch = FALSE; // True if the we have a match. |
2753 | 2707 |
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara
cter matches for searching backwards | 2708 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara
cter matches for searching backwards |
2755 | 2709 |
2756 int32_t op; // Operation from the compiled pattern, s
plit into | 2710 int32_t op; // Operation from the compiled pattern, s
plit into |
2757 int32_t opType; // the opcode | 2711 int32_t opType; // the opcode |
2758 int32_t opValue; // and the operand value. | 2712 int32_t opValue; // and the operand value. |
2759 | 2713 |
2760 #ifdef REGEX_RUN_DEBUG | 2714 #ifdef REGEX_RUN_DEBUG |
2761 if (fTraceDebug) | 2715 if (fTraceDebug) |
2762 { | 2716 { |
2763 printf("MatchAt(startIdx=%ld)\n", startIdx); | 2717 printf("MatchAt(startIdx=%ld)\n", startIdx); |
2764 printf("Original Pattern: "); | 2718 printf("Original Pattern: "); |
2765 UChar32 c = utext_next32From(fPattern->fPattern, 0); | 2719 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
2766 while (c != U_SENTINEL) { | 2720 while (c != U_SENTINEL) { |
2767 if (c<32 || c>256) { | 2721 if (c<32 || c>256) { |
2768 c = '.'; | 2722 c = '.'; |
2769 } | 2723 } |
2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 2724 printf("%c", c); |
2771 | 2725 |
2772 c = UTEXT_NEXT32(fPattern->fPattern); | 2726 c = UTEXT_NEXT32(fPattern->fPattern); |
2773 } | 2727 } |
2774 printf("\n"); | 2728 printf("\n"); |
2775 printf("Input String: "); | 2729 printf("Input String: "); |
2776 c = utext_next32From(fInputText, 0); | 2730 c = utext_next32From(fInputText, 0); |
2777 while (c != U_SENTINEL) { | 2731 while (c != U_SENTINEL) { |
2778 if (c<32 || c>256) { | 2732 if (c<32 || c>256) { |
2779 c = '.'; | 2733 c = '.'; |
2780 } | 2734 } |
2781 printf("%c", c); | 2735 printf("%c", c); |
2782 | 2736 |
2783 c = UTEXT_NEXT32(fInputText); | 2737 c = UTEXT_NEXT32(fInputText); |
2784 } | 2738 } |
2785 printf("\n"); | 2739 printf("\n"); |
2786 printf("\n"); | 2740 printf("\n"); |
2787 } | 2741 } |
2788 #endif | 2742 #endif |
2789 | 2743 |
2790 if (U_FAILURE(status)) { | 2744 if (U_FAILURE(status)) { |
2791 return; | 2745 return; |
2792 } | 2746 } |
2793 | 2747 |
2794 // Cache frequently referenced items from the compiled pattern | 2748 // Cache frequently referenced items from the compiled pattern |
2795 // | 2749 // |
2796 int64_t *pat = fPattern->fCompiledPat->getBuffer(); | 2750 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
2797 | 2751 |
2798 const UChar *litText = fPattern->fLiteralText.getBuffer(); | 2752 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
2799 UVector *sets = fPattern->fSets; | 2753 UVector *sets = fPattern->fSets; |
2800 | 2754 |
2801 fFrameSize = fPattern->fFrameSize; | 2755 fFrameSize = fPattern->fFrameSize; |
2802 REStackFrame *fp = resetStack(); | 2756 REStackFrame *fp = resetStack(); |
2803 | 2757 |
2804 fp->fPatIdx = 0; | 2758 fp->fPatIdx = 0; |
2805 fp->fInputIdx = startIdx; | 2759 fp->fInputIdx = startIdx; |
2806 | 2760 |
2807 // Zero out the pattern's static data | 2761 // Zero out the pattern's static data |
2808 int32_t i; | 2762 int32_t i; |
2809 for (i = 0; i<fPattern->fDataSize; i++) { | 2763 for (i = 0; i<fPattern->fDataSize; i++) { |
2810 fData[i] = 0; | 2764 fData[i] = 0; |
2811 } | 2765 } |
2812 | 2766 |
2813 // | 2767 // |
2814 // Main loop for interpreting the compiled pattern. | 2768 // Main loop for interpreting the compiled pattern. |
2815 // One iteration of the loop per pattern operation performed. | 2769 // One iteration of the loop per pattern operation performed. |
2816 // | 2770 // |
2817 for (;;) { | 2771 for (;;) { |
2818 #if 0 | |
2819 if (_heapchk() != _HEAPOK) { | |
2820 fprintf(stderr, "Heap Trouble\n"); | |
2821 } | |
2822 #endif | |
2823 | |
2824 op = (int32_t)pat[fp->fPatIdx]; | 2772 op = (int32_t)pat[fp->fPatIdx]; |
2825 opType = URX_TYPE(op); | 2773 opType = URX_TYPE(op); |
2826 opValue = URX_VAL(op); | 2774 opValue = URX_VAL(op); |
2827 #ifdef REGEX_RUN_DEBUG | 2775 #ifdef REGEX_RUN_DEBUG |
2828 if (fTraceDebug) { | 2776 if (fTraceDebug) { |
2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2777 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, | 2778 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, |
2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(),
fActiveLimit); | 2779 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(),
fActiveLimit); |
2832 fPattern->dumpOp(fp->fPatIdx); | 2780 fPattern->dumpOp(fp->fPatIdx); |
2833 } | 2781 } |
2834 #endif | 2782 #endif |
2835 fp->fPatIdx++; | 2783 fp->fPatIdx++; |
2836 | 2784 |
2837 switch (opType) { | 2785 switch (opType) { |
2838 | 2786 |
2839 | 2787 |
2840 case URX_NOP: | 2788 case URX_NOP: |
2841 break; | 2789 break; |
2842 | 2790 |
2843 | 2791 |
2844 case URX_BACKTRACK: | 2792 case URX_BACKTRACK: |
2845 // Force a backtrack. In some circumstances, the pattern compiler | 2793 // Force a backtrack. In some circumstances, the pattern compiler |
2846 // will notice that the pattern can't possibly match anything, and
will | 2794 // will notice that the pattern can't possibly match anything, and
will |
(...skipping 23 matching lines...) Expand all Loading... |
2870 // Strings require two slots in the compiled pattern, one for th
e | 2818 // Strings require two slots in the compiled pattern, one for th
e |
2871 // offset to the string text, and one for the length. | 2819 // offset to the string text, and one for the length. |
2872 | 2820 |
2873 int32_t stringStartIdx = opValue; | 2821 int32_t stringStartIdx = opValue; |
2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand | 2822 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
2875 fp->fPatIdx++; | 2823 fp->fPatIdx++; |
2876 opType = URX_TYPE(op); | 2824 opType = URX_TYPE(op); |
2877 int32_t stringLen = URX_VAL(op); | 2825 int32_t stringLen = URX_VAL(op); |
2878 U_ASSERT(opType == URX_STRING_LEN); | 2826 U_ASSERT(opType == URX_STRING_LEN); |
2879 U_ASSERT(stringLen >= 2); | 2827 U_ASSERT(stringLen >= 2); |
2880 | 2828 |
2881 const UChar *patternString = litText+stringStartIdx; | 2829 const UChar *patternString = litText+stringStartIdx; |
2882 int32_t patternStringIndex = 0; | 2830 int32_t patternStringIndex = 0; |
2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2831 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
2884 UChar32 inputChar; | 2832 UChar32 inputChar; |
2885 UChar32 patternChar; | 2833 UChar32 patternChar; |
2886 UBool success = TRUE; | 2834 UBool success = TRUE; |
2887 while (patternStringIndex < stringLen) { | 2835 while (patternStringIndex < stringLen) { |
2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { | 2836 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { |
2889 success = FALSE; | 2837 success = FALSE; |
2890 fHitEnd = TRUE; | 2838 fHitEnd = TRUE; |
2891 break; | 2839 break; |
2892 } | 2840 } |
2893 inputChar = UTEXT_NEXT32(fInputText); | 2841 inputChar = UTEXT_NEXT32(fInputText); |
2894 U16_NEXT(patternString, patternStringIndex, stringLen, patte
rnChar); | 2842 U16_NEXT(patternString, patternStringIndex, stringLen, patte
rnChar); |
2895 if (patternChar != inputChar) { | 2843 if (patternChar != inputChar) { |
2896 success = FALSE; | 2844 success = FALSE; |
2897 break; | 2845 break; |
2898 } | 2846 } |
2899 } | 2847 } |
2900 | 2848 |
2901 if (success) { | 2849 if (success) { |
2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 2850 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
2903 } else { | 2851 } else { |
2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 2852 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
2905 } | 2853 } |
2906 } | 2854 } |
2907 break; | 2855 break; |
2908 | 2856 |
2909 | 2857 |
2910 case URX_STATE_SAVE: | 2858 case URX_STATE_SAVE: |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2945 | 2893 |
2946 case URX_DOLLAR: // $, test for End of line | 2894 case URX_DOLLAR: // $, test for End of line |
2947 // or for position before new lin
e at end of input | 2895 // or for position before new lin
e at end of input |
2948 { | 2896 { |
2949 if (fp->fInputIdx >= fAnchorLimit) { | 2897 if (fp->fInputIdx >= fAnchorLimit) { |
2950 // We really are at the end of input. Success. | 2898 // We really are at the end of input. Success. |
2951 fHitEnd = TRUE; | 2899 fHitEnd = TRUE; |
2952 fRequireEnd = TRUE; | 2900 fRequireEnd = TRUE; |
2953 break; | 2901 break; |
2954 } | 2902 } |
2955 | 2903 |
2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 2904 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
2957 | 2905 |
2958 // If we are positioned just before a new-line that is located a
t the | 2906 // If we are positioned just before a new-line that is located a
t the |
2959 // end of input, succeed. | 2907 // end of input, succeed. |
2960 UChar32 c = UTEXT_NEXT32(fInputText); | 2908 UChar32 c = UTEXT_NEXT32(fInputText); |
2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { | 2909 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202
9) { | 2910 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202
9) { |
2963 // If not in the middle of a CR/LF sequence | 2911 // If not in the middle of a CR/LF sequence |
2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE
XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { | 2912 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE
XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
2965 // At new-line at end of input. Success | 2913 // At new-line at end of input. Success |
2966 fHitEnd = TRUE; | 2914 fHitEnd = TRUE; |
2967 fRequireEnd = TRUE; | 2915 fRequireEnd = TRUE; |
2968 | 2916 |
2969 break; | 2917 break; |
2970 } | 2918 } |
2971 } | 2919 } |
2972 } else { | 2920 } else { |
2973 UChar32 nextC = UTEXT_NEXT32(fInputText); | 2921 UChar32 nextC = UTEXT_NEXT32(fInputText); |
2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu
tText) >= fAnchorLimit) { | 2922 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu
tText) >= fAnchorLimit) { |
2975 fHitEnd = TRUE; | 2923 fHitEnd = TRUE; |
2976 fRequireEnd = TRUE; | 2924 fRequireEnd = TRUE; |
2977 break; // At CR/LF at end of inp
ut. Success | 2925 break; // At CR/LF at end of inp
ut. Success |
2978 } | 2926 } |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3058 | 3006 |
3059 case URX_CARET_M: // ^, test for start of line in muli
t-line mode | 3007 case URX_CARET_M: // ^, test for start of line in muli
t-line mode |
3060 { | 3008 { |
3061 if (fp->fInputIdx == fAnchorStart) { | 3009 if (fp->fInputIdx == fAnchorStart) { |
3062 // We are at the start input. Success. | 3010 // We are at the start input. Success. |
3063 break; | 3011 break; |
3064 } | 3012 } |
3065 // Check whether character just before the current pos is a new-l
ine | 3013 // Check whether character just before the current pos is a new-l
ine |
3066 // unless we are at the end of input | 3014 // unless we are at the end of input |
3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3015 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3068 UChar32 c = UTEXT_PREVIOUS32(fInputText); | 3016 UChar32 c = UTEXT_PREVIOUS32(fInputText); |
3069 if ((fp->fInputIdx < fAnchorLimit) && | 3017 if ((fp->fInputIdx < fAnchorLimit) && |
3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { | 3018 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { |
3071 // It's a new-line. ^ is true. Success. | 3019 // It's a new-line. ^ is true. Success. |
3072 // TODO: what should be done with positions between a CR an
d LF? | 3020 // TODO: what should be done with positions between a CR an
d LF? |
3073 break; | 3021 break; |
3074 } | 3022 } |
3075 // Not at the start of a line. Fail. | 3023 // Not at the start of a line. Fail. |
3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3024 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3077 } | 3025 } |
3078 break; | 3026 break; |
3079 | 3027 |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3141 break; | 3089 break; |
3142 | 3090 |
3143 | 3091 |
3144 case URX_BACKSLASH_G: // Test for position at end of previous m
atch | 3092 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { | 3093 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3147 } | 3095 } |
3148 break; | 3096 break; |
3149 | 3097 |
3150 | 3098 |
3151 case URX_BACKSLASH_X: | 3099 case URX_BACKSLASH_X: |
3152 // Match a Grapheme, as defined by Unicode TR 29. | 3100 // Match a Grapheme, as defined by Unicode TR 29. |
3153 // Differs slightly from Perl, which consumes combining marks indep
endently | 3101 // Differs slightly from Perl, which consumes combining marks indep
endently |
3154 // of context. | 3102 // of context. |
3155 { | 3103 { |
3156 | 3104 |
3157 // Fail if at end of input | 3105 // Fail if at end of input |
3158 if (fp->fInputIdx >= fActiveLimit) { | 3106 if (fp->fInputIdx >= fActiveLimit) { |
3159 fHitEnd = TRUE; | 3107 fHitEnd = TRUE; |
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3161 break; | 3109 break; |
3162 } | 3110 } |
3163 | 3111 |
3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3112 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3165 | 3113 |
3166 // Examine (and consume) the current char. | 3114 // Examine (and consume) the current char. |
3167 // Dispatch into a little state machine, based on the char. | 3115 // Dispatch into a little state machine, based on the char. |
3168 UChar32 c; | 3116 UChar32 c; |
3169 c = UTEXT_NEXT32(fInputText); | 3117 c = UTEXT_NEXT32(fInputText); |
3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3118 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3171 UnicodeSet **sets = fPattern->fStaticSets; | 3119 UnicodeSet **sets = fPattern->fStaticSets; |
3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; | 3120 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; | 3121 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3220 c = UTEXT_CURRENT32(fInputText); | 3168 c = UTEXT_CURRENT32(fInputText); |
3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { | 3169 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
3222 break; | 3170 break; |
3223 } | 3171 } |
3224 (void)UTEXT_NEXT32(fInputText); | 3172 (void)UTEXT_NEXT32(fInputText); |
3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3173 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3226 } | 3174 } |
3227 goto GC_Done; | 3175 goto GC_Done; |
3228 | 3176 |
3229 GC_Control: | 3177 GC_Control: |
3230 // Most control chars stand alone (don't combine with combining
chars), | 3178 // Most control chars stand alone (don't combine with combining
chars), |
3231 // except for that CR/LF sequence is a single grapheme cluster
. | 3179 // except for that CR/LF sequence is a single grapheme cluster
. |
3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32
(fInputText) == 0x0a) { | 3180 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32
(fInputText) == 0x0a) { |
3233 c = UTEXT_NEXT32(fInputText); | 3181 c = UTEXT_NEXT32(fInputText); |
3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3235 } | 3183 } |
3236 | 3184 |
3237 GC_Done: | 3185 GC_Done: |
3238 if (fp->fInputIdx >= fActiveLimit) { | 3186 if (fp->fInputIdx >= fActiveLimit) { |
3239 fHitEnd = TRUE; | 3187 fHitEnd = TRUE; |
3240 } | 3188 } |
3241 break; | 3189 break; |
3242 } | 3190 } |
3243 | |
3244 | 3191 |
3245 | 3192 |
3246 | 3193 |
| 3194 |
3247 case URX_BACKSLASH_Z: // Test for end of Input | 3195 case URX_BACKSLASH_Z: // Test for end of Input |
3248 if (fp->fInputIdx < fAnchorLimit) { | 3196 if (fp->fInputIdx < fAnchorLimit) { |
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3250 } else { | 3198 } else { |
3251 fHitEnd = TRUE; | 3199 fHitEnd = TRUE; |
3252 fRequireEnd = TRUE; | 3200 fRequireEnd = TRUE; |
3253 } | 3201 } |
3254 break; | 3202 break; |
3255 | 3203 |
3256 | 3204 |
3257 | 3205 |
3258 case URX_STATIC_SETREF: | 3206 case URX_STATIC_SETREF: |
3259 { | 3207 { |
3260 // Test input character against one of the predefined sets | 3208 // Test input character against one of the predefined sets |
3261 // (Word Characters, for example) | 3209 // (Word Characters, for example) |
3262 // The high bit of the op value is a flag for the match polarity
. | 3210 // The high bit of the op value is a flag for the match polarity
. |
3263 // 0: success if input char is in set. | 3211 // 0: success if input char is in set. |
3264 // 1: success if input char is not in set. | 3212 // 1: success if input char is not in set. |
3265 if (fp->fInputIdx >= fActiveLimit) { | 3213 if (fp->fInputIdx >= fActiveLimit) { |
3266 fHitEnd = TRUE; | 3214 fHitEnd = TRUE; |
3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3215 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3268 break; | 3216 break; |
3269 } | 3217 } |
3270 | 3218 |
3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); | 3219 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
3272 opValue &= ~URX_NEG_SET; | 3220 opValue &= ~URX_NEG_SET; |
3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 3221 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
3274 | 3222 |
3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3223 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3276 UChar32 c = UTEXT_NEXT32(fInputText); | 3224 UChar32 c = UTEXT_NEXT32(fInputText); |
3277 if (c < 256) { | 3225 if (c < 256) { |
3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 3226 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
3279 if (s8->contains(c)) { | 3227 if (s8->contains(c)) { |
3280 success = !success; | 3228 success = !success; |
3281 } | 3229 } |
3282 } else { | 3230 } else { |
3283 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 3231 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
3284 if (s->contains(c)) { | 3232 if (s->contains(c)) { |
3285 success = !success; | 3233 success = !success; |
3286 } | 3234 } |
3287 } | 3235 } |
3288 if (success) { | 3236 if (success) { |
3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3237 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3290 } else { | 3238 } else { |
3291 // the character wasn't in the set. | 3239 // the character wasn't in the set. |
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3240 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3293 } | 3241 } |
3294 } | 3242 } |
3295 break; | 3243 break; |
3296 | 3244 |
3297 | 3245 |
3298 case URX_STAT_SETREF_N: | 3246 case URX_STAT_SETREF_N: |
3299 { | 3247 { |
3300 // Test input character for NOT being a member of one of | 3248 // Test input character for NOT being a member of one of |
3301 // the predefined sets (Word Characters, for example) | 3249 // the predefined sets (Word Characters, for example) |
3302 if (fp->fInputIdx >= fActiveLimit) { | 3250 if (fp->fInputIdx >= fActiveLimit) { |
3303 fHitEnd = TRUE; | 3251 fHitEnd = TRUE; |
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3252 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3305 break; | 3253 break; |
3306 } | 3254 } |
3307 | 3255 |
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 3256 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
3309 | 3257 |
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3258 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3311 | 3259 |
3312 UChar32 c = UTEXT_NEXT32(fInputText); | 3260 UChar32 c = UTEXT_NEXT32(fInputText); |
3313 if (c < 256) { | 3261 if (c < 256) { |
3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 3262 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
3315 if (s8->contains(c) == FALSE) { | 3263 if (s8->contains(c) == FALSE) { |
3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3264 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3317 break; | 3265 break; |
3318 } | 3266 } |
3319 } else { | 3267 } else { |
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 3268 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
3321 if (s->contains(c) == FALSE) { | 3269 if (s->contains(c) == FALSE) { |
3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3323 break; | 3271 break; |
3324 } | 3272 } |
3325 } | 3273 } |
3326 // the character wasn't in the set. | 3274 // the character wasn't in the set. |
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3275 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3328 } | 3276 } |
3329 break; | 3277 break; |
3330 | 3278 |
3331 | 3279 |
3332 case URX_SETREF: | 3280 case URX_SETREF: |
3333 if (fp->fInputIdx >= fActiveLimit) { | 3281 if (fp->fInputIdx >= fActiveLimit) { |
3334 fHitEnd = TRUE; | 3282 fHitEnd = TRUE; |
3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3283 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3336 break; | 3284 break; |
3337 } else { | 3285 } else { |
3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3286 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3339 | 3287 |
3340 // There is input left. Pick up one char and test it for set me
mbership. | 3288 // There is input left. Pick up one char and test it for set me
mbership. |
3341 UChar32 c = UTEXT_NEXT32(fInputText); | 3289 UChar32 c = UTEXT_NEXT32(fInputText); |
3342 U_ASSERT(opValue > 0 && opValue < sets->size()); | 3290 U_ASSERT(opValue > 0 && opValue < sets->size()); |
3343 if (c<256) { | 3291 if (c<256) { |
3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 3292 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
3345 if (s8->contains(c)) { | 3293 if (s8->contains(c)) { |
3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3294 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3347 break; | 3295 break; |
3348 } | 3296 } |
3349 } else { | 3297 } else { |
3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 3298 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
3351 if (s->contains(c)) { | 3299 if (s->contains(c)) { |
3352 // The character is in the set. A Match. | 3300 // The character is in the set. A Match. |
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3301 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3354 break; | 3302 break; |
3355 } | 3303 } |
3356 } | 3304 } |
3357 | 3305 |
3358 // the character wasn't in the set. | 3306 // the character wasn't in the set. |
3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3307 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3360 } | 3308 } |
3361 break; | 3309 break; |
3362 | 3310 |
3363 | 3311 |
3364 case URX_DOTANY: | 3312 case URX_DOTANY: |
3365 { | 3313 { |
3366 // . matches anything, but stops at end-of-line. | 3314 // . matches anything, but stops at end-of-line. |
3367 if (fp->fInputIdx >= fActiveLimit) { | 3315 if (fp->fInputIdx >= fActiveLimit) { |
3368 // At end of input. Match failed. Backtrack out. | 3316 // At end of input. Match failed. Backtrack out. |
3369 fHitEnd = TRUE; | 3317 fHitEnd = TRUE; |
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3318 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3371 break; | 3319 break; |
3372 } | 3320 } |
3373 | 3321 |
3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3322 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3375 | 3323 |
3376 // There is input left. Advance over one char, unless we've hit
end-of-line | 3324 // There is input left. Advance over one char, unless we've hit
end-of-line |
3377 UChar32 c = UTEXT_NEXT32(fInputText); | 3325 UChar32 c = UTEXT_NEXT32(fInputText); |
3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible | 3326 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 3327 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
3380 // End of line in normal mode. . does not match. | 3328 // End of line in normal mode. . does not match. |
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3382 break; | 3330 break; |
3383 } | 3331 } |
3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3385 } | 3333 } |
3386 break; | 3334 break; |
3387 | 3335 |
3388 | 3336 |
3389 case URX_DOTANY_ALL: | 3337 case URX_DOTANY_ALL: |
3390 { | 3338 { |
3391 // ., in dot-matches-all (including new lines) mode | 3339 // ., in dot-matches-all (including new lines) mode |
3392 if (fp->fInputIdx >= fActiveLimit) { | 3340 if (fp->fInputIdx >= fActiveLimit) { |
3393 // At end of input. Match failed. Backtrack out. | 3341 // At end of input. Match failed. Backtrack out. |
3394 fHitEnd = TRUE; | 3342 fHitEnd = TRUE; |
3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3343 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3396 break; | 3344 break; |
3397 } | 3345 } |
3398 | 3346 |
3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3400 | 3348 |
3401 // There is input left. Advance over one char, except if we are | 3349 // There is input left. Advance over one char, except if we are |
3402 // at a cr/lf, advance over both of them. | 3350 // at a cr/lf, advance over both of them. |
3403 UChar32 c; | 3351 UChar32 c; |
3404 c = UTEXT_NEXT32(fInputText); | 3352 c = UTEXT_NEXT32(fInputText); |
3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) { | 3354 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
3407 // In the case of a CR/LF, we need to advance over both. | 3355 // In the case of a CR/LF, we need to advance over both. |
3408 UChar32 nextc = UTEXT_CURRENT32(fInputText); | 3356 UChar32 nextc = UTEXT_CURRENT32(fInputText); |
3409 if (nextc == 0x0a) { | 3357 if (nextc == 0x0a) { |
3410 (void)UTEXT_NEXT32(fInputText); | 3358 (void)UTEXT_NEXT32(fInputText); |
3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3412 } | 3360 } |
3413 } | 3361 } |
3414 } | 3362 } |
3415 break; | 3363 break; |
3416 | 3364 |
3417 | 3365 |
3418 case URX_DOTANY_UNIX: | 3366 case URX_DOTANY_UNIX: |
3419 { | 3367 { |
3420 // '.' operator, matches all, but stops at end-of-line. | 3368 // '.' operator, matches all, but stops at end-of-line. |
3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. | 3369 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
3422 if (fp->fInputIdx >= fActiveLimit) { | 3370 if (fp->fInputIdx >= fActiveLimit) { |
3423 // At end of input. Match failed. Backtrack out. | 3371 // At end of input. Match failed. Backtrack out. |
3424 fHitEnd = TRUE; | 3372 fHitEnd = TRUE; |
3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3373 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3426 break; | 3374 break; |
3427 } | 3375 } |
3428 | 3376 |
3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3377 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3430 | 3378 |
3431 // There is input left. Advance over one char, unless we've hit
end-of-line | 3379 // There is input left. Advance over one char, unless we've hit
end-of-line |
3432 UChar32 c = UTEXT_NEXT32(fInputText); | 3380 UChar32 c = UTEXT_NEXT32(fInputText); |
3433 if (c == 0x0a) { | 3381 if (c == 0x0a) { |
3434 // End of line in UNIX_LINES mode. '.' does not match the \n | 3382 // End of line in UNIX_LINES mode. '.' does not match the \n |
3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3383 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3436 } else { | 3384 } else { |
3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3385 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3438 } | 3386 } |
3439 } | 3387 } |
3440 break; | 3388 break; |
(...skipping 24 matching lines...) Expand all Loading... |
3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); | 3413 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
3466 int32_t frameLoc = URX_VAL(stoOp); | 3414 int32_t frameLoc = URX_VAL(stoOp); |
3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); | 3415 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
3468 int64_t prevInputIdx = fp->fExtra[frameLoc]; | 3416 int64_t prevInputIdx = fp->fExtra[frameLoc]; |
3469 U_ASSERT(prevInputIdx <= fp->fInputIdx); | 3417 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
3470 if (prevInputIdx < fp->fInputIdx) { | 3418 if (prevInputIdx < fp->fInputIdx) { |
3471 // The match did make progress. Repeat the loop. | 3419 // The match did make progress. Repeat the loop. |
3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current | 3420 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
3473 fp->fPatIdx = opValue; | 3421 fp->fPatIdx = opValue; |
3474 fp->fExtra[frameLoc] = fp->fInputIdx; | 3422 fp->fExtra[frameLoc] = fp->fInputIdx; |
3475 } | 3423 } |
3476 // If the input position did not advance, we do nothing here, | 3424 // If the input position did not advance, we do nothing here, |
3477 // execution will fall out of the loop. | 3425 // execution will fall out of the loop. |
3478 } | 3426 } |
3479 break; | 3427 break; |
3480 | 3428 |
3481 case URX_CTR_INIT: | 3429 case URX_CTR_INIT: |
3482 { | 3430 { |
3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 3431 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
3484 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 3432 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
3485 | 3433 |
3486 // Pick up the three extra operands that CTR_INIT has, and | 3434 // Pick up the three extra operands that CTR_INIT has, and |
3487 // skip the pattern location counter past | 3435 // skip the pattern location counter past |
3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3436 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
3489 fp->fPatIdx += 3; | 3437 fp->fPatIdx += 3; |
3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 3438 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 3439 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 3440 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
3493 U_ASSERT(minCount>=0); | 3441 U_ASSERT(minCount>=0); |
3494 U_ASSERT(maxCount>=minCount || maxCount==-1); | 3442 U_ASSERT(maxCount>=minCount || maxCount==-1); |
3495 U_ASSERT(loopLoc>=fp->fPatIdx); | 3443 U_ASSERT(loopLoc>=fp->fPatIdx); |
3496 | 3444 |
3497 if (minCount == 0) { | 3445 if (minCount == 0) { |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3535 } | 3483 } |
3536 break; | 3484 break; |
3537 | 3485 |
3538 case URX_CTR_INIT_NG: | 3486 case URX_CTR_INIT_NG: |
3539 { | 3487 { |
3540 // Initialize a non-greedy loop | 3488 // Initialize a non-greedy loop |
3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 3489 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
3542 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 3490 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
3543 | 3491 |
3544 // Pick up the three extra operands that CTR_INIT_NG has, and | 3492 // Pick up the three extra operands that CTR_INIT_NG has, and |
3545 // skip the pattern location counter past | 3493 // skip the pattern location counter past |
3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3494 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
3547 fp->fPatIdx += 3; | 3495 fp->fPatIdx += 3; |
3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 3496 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 3497 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 3498 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
3551 U_ASSERT(minCount>=0); | 3499 U_ASSERT(minCount>=0); |
3552 U_ASSERT(maxCount>=minCount || maxCount==-1); | 3500 U_ASSERT(maxCount>=minCount || maxCount==-1); |
3553 U_ASSERT(loopLoc>fp->fPatIdx); | 3501 U_ASSERT(loopLoc>fp->fPatIdx); |
3554 if (maxCount == -1) { | 3502 if (maxCount == -1) { |
3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. | 3503 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. |
3556 } | 3504 } |
3557 | 3505 |
3558 if (minCount == 0) { | 3506 if (minCount == 0) { |
3559 if (maxCount != 0) { | 3507 if (maxCount != 0) { |
3560 fp = StateSave(fp, fp->fPatIdx, status); | 3508 fp = StateSave(fp, fp->fPatIdx, status); |
3561 } | 3509 } |
3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block | 3510 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
3563 } | 3511 } |
3564 } | 3512 } |
3565 break; | 3513 break; |
3566 | 3514 |
3567 case URX_CTR_LOOP_NG: | 3515 case URX_CTR_LOOP_NG: |
3568 { | 3516 { |
3569 // Non-greedy {min, max} loops | 3517 // Non-greedy {min, max} loops |
3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 3518 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
3571 int32_t initOp = (int32_t)pat[opValue]; | 3519 int32_t initOp = (int32_t)pat[opValue]; |
3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); | 3520 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 3521 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3640 U_ASSERT(groupStartIdx <= groupEndIdx); | 3588 U_ASSERT(groupStartIdx <= groupEndIdx); |
3641 if (groupStartIdx < 0) { | 3589 if (groupStartIdx < 0) { |
3642 // This capture group has not participated in the match thus
far, | 3590 // This capture group has not participated in the match thus
far, |
3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 3591 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
3644 break; | 3592 break; |
3645 } | 3593 } |
3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); | 3594 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); |
3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3595 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3648 | 3596 |
3649 // Note: if the capture group match was of an empty string the
backref | 3597 // Note: if the capture group match was of an empty string the
backref |
3650 // match succeeds. Verified by testing: Perl matches s
ucceed | 3598 // match succeeds. Verified by testing: Perl matches s
ucceed |
3651 // in this case, so we do too. | 3599 // in this case, so we do too. |
3652 | 3600 |
3653 UBool success = TRUE; | 3601 UBool success = TRUE; |
3654 for (;;) { | 3602 for (;;) { |
3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { | 3603 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { |
3656 success = TRUE; | 3604 success = TRUE; |
3657 break; | 3605 break; |
3658 } | 3606 } |
3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { | 3607 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { |
3660 success = FALSE; | 3608 success = FALSE; |
3661 fHitEnd = TRUE; | 3609 fHitEnd = TRUE; |
3662 break; | 3610 break; |
(...skipping 26 matching lines...) Expand all Loading... |
3689 // This capture group has not participated in the match thus
far, | 3637 // This capture group has not participated in the match thus
far, |
3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 3638 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
3691 break; | 3639 break; |
3692 } | 3640 } |
3693 utext_setNativeIndex(fAltInputText, groupStartIdx); | 3641 utext_setNativeIndex(fAltInputText, groupStartIdx); |
3694 utext_setNativeIndex(fInputText, fp->fInputIdx); | 3642 utext_setNativeIndex(fInputText, fp->fInputIdx); |
3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); | 3643 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); |
3696 CaseFoldingUTextIterator inputItr(*fInputText); | 3644 CaseFoldingUTextIterator inputItr(*fInputText); |
3697 | 3645 |
3698 // Note: if the capture group match was of an empty string the
backref | 3646 // Note: if the capture group match was of an empty string the
backref |
3699 // match succeeds. Verified by testing: Perl matches s
ucceed | 3647 // match succeeds. Verified by testing: Perl matches s
ucceed |
3700 // in this case, so we do too. | 3648 // in this case, so we do too. |
3701 | 3649 |
3702 UBool success = TRUE; | 3650 UBool success = TRUE; |
3703 for (;;) { | 3651 for (;;) { |
3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f
AltInputText) >= groupEndIdx) { | 3652 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f
AltInputText) >= groupEndIdx) { |
3705 success = TRUE; | 3653 success = TRUE; |
3706 break; | 3654 break; |
3707 } | 3655 } |
3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe
xt) >= fActiveLimit) { | 3656 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe
xt) >= fActiveLimit) { |
3709 success = FALSE; | 3657 success = FALSE; |
3710 fHitEnd = TRUE; | 3658 fHitEnd = TRUE; |
3711 break; | 3659 break; |
3712 } | 3660 } |
3713 UChar32 captureGroupChar = captureGroupItr.next(); | 3661 UChar32 captureGroupChar = captureGroupItr.next(); |
3714 UChar32 inputChar = inputItr.next(); | 3662 UChar32 inputChar = inputItr.next(); |
3715 if (inputChar != captureGroupChar) { | 3663 if (inputChar != captureGroupChar) { |
3716 success = FALSE; | 3664 success = FALSE; |
3717 break; | 3665 break; |
3718 } | 3666 } |
3719 } | 3667 } |
3720 | 3668 |
3721 if (success && inputItr.inExpansion()) { | 3669 if (success && inputItr.inExpansion()) { |
3722 // We obtained a match by consuming part of a string obtained
from | 3670 // We obtained a match by consuming part of a string obtained
from |
3723 // case-folding a single code point of the input text. | 3671 // case-folding a single code point of the input text. |
3724 // This does not count as an overall match. | 3672 // This does not count as an overall match. |
3725 success = FALSE; | 3673 success = FALSE; |
3726 } | 3674 } |
3727 | 3675 |
3728 if (success) { | 3676 if (success) { |
3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3677 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3730 } else { | 3678 } else { |
3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3732 } | 3680 } |
3733 | 3681 |
3734 } | 3682 } |
3735 break; | 3683 break; |
3736 | 3684 |
3737 case URX_STO_INP_LOC: | 3685 case URX_STO_INP_LOC: |
3738 { | 3686 { |
3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize); | 3687 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
3740 fp->fExtra[opValue] = fp->fInputIdx; | 3688 fp->fExtra[opValue] = fp->fInputIdx; |
3741 } | 3689 } |
3742 break; | 3690 break; |
3743 | 3691 |
3744 case URX_JMPX: | 3692 case URX_JMPX: |
3745 { | 3693 { |
3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 3694 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3754 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3807 | 3755 |
3808 UChar32 c = UTEXT_NEXT32(fInputText); | 3756 UChar32 c = UTEXT_NEXT32(fInputText); |
3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { | 3757 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 3758 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
3811 break; | 3759 break; |
3812 } | 3760 } |
3813 } else { | 3761 } else { |
3814 fHitEnd = TRUE; | 3762 fHitEnd = TRUE; |
3815 } | 3763 } |
3816 | 3764 |
3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3765 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3818 break; | 3766 break; |
3819 | 3767 |
3820 case URX_STRING_I: | 3768 case URX_STRING_I: |
3821 { | 3769 { |
3822 // Case-insensitive test input against a literal string. | 3770 // Case-insensitive test input against a literal string. |
3823 // Strings require two slots in the compiled pattern, one for th
e | 3771 // Strings require two slots in the compiled pattern, one for th
e |
3824 // offset to the string text, and one for the length. | 3772 // offset to the string text, and one for the length. |
3825 // The compiled string has already been case folded. | 3773 // The compiled string has already been case folded. |
3826 { | 3774 { |
3827 const UChar *patternString = litText + opValue; | 3775 const UChar *patternString = litText + opValue; |
3828 int32_t patternStringIdx = 0; | 3776 int32_t patternStringIdx = 0; |
3829 | 3777 |
3830 op = (int32_t)pat[fp->fPatIdx]; | 3778 op = (int32_t)pat[fp->fPatIdx]; |
3831 fp->fPatIdx++; | 3779 fp->fPatIdx++; |
3832 opType = URX_TYPE(op); | 3780 opType = URX_TYPE(op); |
3833 opValue = URX_VAL(op); | 3781 opValue = URX_VAL(op); |
3834 U_ASSERT(opType == URX_STRING_LEN); | 3782 U_ASSERT(opType == URX_STRING_LEN); |
3835 int32_t patternStringLen = opValue; // Length of the string
from the pattern. | 3783 int32_t patternStringLen = opValue; // Length of the string
from the pattern. |
3836 | 3784 |
3837 | 3785 |
3838 UChar32 cPattern; | 3786 UChar32 cPattern; |
3839 UChar32 cText; | 3787 UChar32 cText; |
3840 UBool success = TRUE; | 3788 UBool success = TRUE; |
3841 | 3789 |
3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 3790 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
3843 CaseFoldingUTextIterator inputIterator(*fInputText); | 3791 CaseFoldingUTextIterator inputIterator(*fInputText); |
3844 while (patternStringIdx < patternStringLen) { | 3792 while (patternStringIdx < patternStringLen) { |
3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX
(fInputText) >= fActiveLimit) { | 3793 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX
(fInputText) >= fActiveLimit) { |
3846 success = FALSE; | 3794 success = FALSE; |
3847 fHitEnd = TRUE; | 3795 fHitEnd = TRUE; |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3942 // The look-behind expression matched, but the match did no
t | 3890 // The look-behind expression matched, but the match did no
t |
3943 // extend all the way to the point that we are looking be
hind from. | 3891 // extend all the way to the point that we are looking be
hind from. |
3944 // FAIL out of here, which will take us back to the LB_CONT
, which | 3892 // FAIL out of here, which will take us back to the LB_CONT
, which |
3945 // will retry the match starting at another position or
fail | 3893 // will retry the match starting at another position or
fail |
3946 // the look-behind altogether, whichever is appropriate. | 3894 // the look-behind altogether, whichever is appropriate. |
3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3895 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
3948 break; | 3896 break; |
3949 } | 3897 } |
3950 | 3898 |
3951 // Look-behind match is good. Restore the orignal input string
length, | 3899 // Look-behind match is good. Restore the orignal input string
length, |
3952 // which had been truncated to pin the end of the lookbehind m
atch to the | 3900 // which had been truncated to pin the end of the lookbehind m
atch to the |
3953 // position being looked-behind. | 3901 // position being looked-behind. |
3954 int64_t originalInputLen = fData[opValue+3]; | 3902 int64_t originalInputLen = fData[opValue+3]; |
3955 U_ASSERT(originalInputLen >= fActiveLimit); | 3903 U_ASSERT(originalInputLen >= fActiveLimit); |
3956 U_ASSERT(originalInputLen <= fInputLength); | 3904 U_ASSERT(originalInputLen <= fInputLength); |
3957 fActiveLimit = originalInputLen; | 3905 fActiveLimit = originalInputLen; |
3958 } | 3906 } |
3959 break; | 3907 break; |
3960 | 3908 |
3961 | 3909 |
3962 case URX_LBN_CONT: | 3910 case URX_LBN_CONT: |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4019 // extend all the way to the point that we are looking be
hind from. | 3967 // extend all the way to the point that we are looking be
hind from. |
4020 // FAIL out of here, which will take us back to the LB_CONT
, which | 3968 // FAIL out of here, which will take us back to the LB_CONT
, which |
4021 // will retry the match starting at another position or
succeed | 3969 // will retry the match starting at another position or
succeed |
4022 // the look-behind altogether, whichever is appropriate. | 3970 // the look-behind altogether, whichever is appropriate. |
4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3971 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4024 break; | 3972 break; |
4025 } | 3973 } |
4026 | 3974 |
4027 // Look-behind expression matched, which means look-behind test
as | 3975 // Look-behind expression matched, which means look-behind test
as |
4028 // a whole Fails | 3976 // a whole Fails |
4029 | 3977 |
4030 // Restore the orignal input string length, which had been tru
ncated | 3978 // Restore the orignal input string length, which had been tru
ncated |
4031 // inorder to pin the end of the lookbehind match | 3979 // inorder to pin the end of the lookbehind match |
4032 // to the position being looked-behind. | 3980 // to the position being looked-behind. |
4033 int64_t originalInputLen = fData[opValue+3]; | 3981 int64_t originalInputLen = fData[opValue+3]; |
4034 U_ASSERT(originalInputLen >= fActiveLimit); | 3982 U_ASSERT(originalInputLen >= fActiveLimit); |
4035 U_ASSERT(originalInputLen <= fInputLength); | 3983 U_ASSERT(originalInputLen <= fInputLength); |
4036 fActiveLimit = originalInputLen; | 3984 fActiveLimit = originalInputLen; |
4037 | 3985 |
4038 // Restore original stack position, discarding any state saved | 3986 // Restore original stack position, discarding any state saved |
4039 // by the successful pattern match. | 3987 // by the successful pattern match. |
4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 3988 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
4041 int32_t newStackSize = (int32_t)fData[opValue]; | 3989 int32_t newStackSize = (int32_t)fData[opValue]; |
4042 U_ASSERT(fStack->size() > newStackSize); | 3990 U_ASSERT(fStack->size() > newStackSize); |
4043 fStack->setSize(newStackSize); | 3991 fStack->setSize(newStackSize); |
4044 | 3992 |
4045 // FAIL, which will take control back to someplace | 3993 // FAIL, which will take control back to someplace |
4046 // prior to entering the look-behind test. | 3994 // prior to entering the look-behind test. |
4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 3995 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4048 } | 3996 } |
4049 break; | 3997 break; |
4050 | 3998 |
4051 | 3999 |
4052 case URX_LOOP_SR_I: | 4000 case URX_LOOP_SR_I: |
4053 // Loop Initialization for the optimized implementation of | 4001 // Loop Initialization for the optimized implementation of |
4054 // [some character set]* | 4002 // [some character set]* |
4055 // This op scans through all matching input. | 4003 // This op scans through all matching input. |
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4168 break; | 4116 break; |
4169 | 4117 |
4170 | 4118 |
4171 case URX_LOOP_C: | 4119 case URX_LOOP_C: |
4172 { | 4120 { |
4173 U_ASSERT(opValue>=0 && opValue<fFrameSize); | 4121 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
4174 backSearchIndex = fp->fExtra[opValue]; | 4122 backSearchIndex = fp->fExtra[opValue]; |
4175 U_ASSERT(backSearchIndex <= fp->fInputIdx); | 4123 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
4176 if (backSearchIndex == fp->fInputIdx) { | 4124 if (backSearchIndex == fp->fInputIdx) { |
4177 // We've backed up the input idx to the point that the loop
started. | 4125 // We've backed up the input idx to the point that the loop
started. |
4178 // The loop is done. Leave here without saving state. | 4126 // The loop is done. Leave here without saving state. |
4179 // Subsequent failures won't come back here. | 4127 // Subsequent failures won't come back here. |
4180 break; | 4128 break; |
4181 } | 4129 } |
4182 // Set up for the next iteration of the loop, with input index | 4130 // Set up for the next iteration of the loop, with input index |
4183 // backed up by one from the last time through, | 4131 // backed up by one from the last time through, |
4184 // and a state save to this instruction in case the following
code fails again. | 4132 // and a state save to this instruction in case the following
code fails again. |
4185 // (We're going backwards because this loop emulates stack unw
inding, not | 4133 // (We're going backwards because this loop emulates stack unw
inding, not |
4186 // the initial scan forward.) | 4134 // the initial scan forward.) |
4187 U_ASSERT(fp->fInputIdx > 0); | 4135 U_ASSERT(fp->fInputIdx > 0); |
4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 4136 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); | 4137 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); |
4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 4138 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
4191 | 4139 |
4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); | 4140 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); |
4193 if (prevC == 0x0a && | 4141 if (prevC == 0x0a && |
4194 fp->fInputIdx > backSearchIndex && | 4142 fp->fInputIdx > backSearchIndex && |
4195 twoPrevC == 0x0d) { | 4143 twoPrevC == 0x0d) { |
4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; | 4144 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { | 4145 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
4198 // .*, stepping back over CRLF pair. | 4146 // .*, stepping back over CRLF pair. |
4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); | 4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
4200 } | 4148 } |
4201 } | 4149 } |
4202 | 4150 |
4203 | 4151 |
4204 fp = StateSave(fp, fp->fPatIdx-1, status); | 4152 fp = StateSave(fp, fp->fPatIdx-1, status); |
4205 } | 4153 } |
4206 break; | 4154 break; |
4207 | 4155 |
4208 | 4156 |
4209 | 4157 |
4210 default: | 4158 default: |
4211 // Trouble. The compiled pattern contains an entry with an | 4159 // Trouble. The compiled pattern contains an entry with an |
4212 // unrecognized type tag. | 4160 // unrecognized type tag. |
4213 U_ASSERT(FALSE); | 4161 U_ASSERT(FALSE); |
4214 } | 4162 } |
4215 | 4163 |
4216 if (U_FAILURE(status)) { | 4164 if (U_FAILURE(status)) { |
4217 isMatch = FALSE; | 4165 isMatch = FALSE; |
4218 break; | 4166 break; |
4219 } | 4167 } |
4220 } | 4168 } |
4221 | 4169 |
4222 breakFromLoop: | 4170 breakFromLoop: |
4223 fMatch = isMatch; | 4171 fMatch = isMatch; |
4224 if (isMatch) { | 4172 if (isMatch) { |
4225 fLastMatchEnd = fMatchEnd; | 4173 fLastMatchEnd = fMatchEnd; |
4226 fMatchStart = startIdx; | 4174 fMatchStart = startIdx; |
4227 fMatchEnd = fp->fInputIdx; | 4175 fMatchEnd = fp->fInputIdx; |
4228 if (fTraceDebug) { | 4176 } |
4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta
rt, fMatchEnd)); | 4177 |
| 4178 #ifdef REGEX_RUN_DEBUG |
| 4179 if (fTraceDebug) { |
| 4180 if (isMatch) { |
| 4181 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| 4182 } else { |
| 4183 printf("No match\n\n"); |
4230 } | 4184 } |
4231 } | 4185 } |
4232 else | 4186 #endif |
4233 { | |
4234 if (fTraceDebug) { | |
4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); | |
4236 } | |
4237 } | |
4238 | 4187 |
4239 fFrame = fp; // The active stack frame when the engine stoppe
d. | 4188 fFrame = fp; // The active stack frame when the engine stoppe
d. |
4240 // Contains the capture group results that we
need to | 4189 // Contains the capture group results that we
need to |
4241 // access later. | 4190 // access later. |
4242 return; | 4191 return; |
4243 } | 4192 } |
4244 | 4193 |
4245 | 4194 |
4246 //------------------------------------------------------------------------------
-- | 4195 //------------------------------------------------------------------------------
-- |
4247 // | 4196 // |
4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with t
he | 4197 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with t
he |
4249 // assumption that the entire string is available in the UText'
s | 4198 // assumption that the entire string is available in the UText'
s |
4250 // chunk buffer. For now, that means we can use int32_t indexes
, | 4199 // chunk buffer. For now, that means we can use int32_t indexes
, |
4251 // except for anything that needs to be saved (like group start
s | 4200 // except for anything that needs to be saved (like group start
s |
4252 // and ends). | 4201 // and ends). |
4253 // | 4202 // |
4254 // startIdx: begin matching at this index. | 4203 // startIdx: begin matching at this index. |
4255 // toEnd: if true, match must extend to end of the input
region | 4204 // toEnd: if true, match must extend to end of the input
region |
4256 // | 4205 // |
4257 //------------------------------------------------------------------------------
-- | 4206 //------------------------------------------------------------------------------
-- |
4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
s) { | 4207 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
s) { |
4259 UBool isMatch = FALSE; // True if the we have a match. | 4208 UBool isMatch = FALSE; // True if the we have a match. |
4260 | 4209 |
4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact
er matches for searching backwards | 4210 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact
er matches for searching backwards |
4262 | 4211 |
4263 int32_t op; // Operation from the compiled pattern, s
plit into | 4212 int32_t op; // Operation from the compiled pattern, s
plit into |
4264 int32_t opType; // the opcode | 4213 int32_t opType; // the opcode |
4265 int32_t opValue; // and the operand value. | 4214 int32_t opValue; // and the operand value. |
4266 | 4215 |
4267 #ifdef REGEX_RUN_DEBUG | 4216 #ifdef REGEX_RUN_DEBUG |
4268 if (fTraceDebug) | 4217 if (fTraceDebug) { |
4269 { | |
4270 printf("MatchAt(startIdx=%d)\n", startIdx); | 4218 printf("MatchAt(startIdx=%d)\n", startIdx); |
4271 printf("Original Pattern: "); | 4219 printf("Original Pattern: "); |
4272 UChar32 c = utext_next32From(fPattern->fPattern, 0); | 4220 UChar32 c = utext_next32From(fPattern->fPattern, 0); |
4273 while (c != U_SENTINEL) { | 4221 while (c != U_SENTINEL) { |
4274 if (c<32 || c>256) { | 4222 if (c<32 || c>256) { |
4275 c = '.'; | 4223 c = '.'; |
4276 } | 4224 } |
4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 4225 printf("%c", c); |
4278 | 4226 |
4279 c = UTEXT_NEXT32(fPattern->fPattern); | 4227 c = UTEXT_NEXT32(fPattern->fPattern); |
4280 } | 4228 } |
4281 printf("\n"); | 4229 printf("\n"); |
4282 printf("Input String: "); | 4230 printf("Input String: "); |
4283 c = utext_next32From(fInputText, 0); | 4231 c = utext_next32From(fInputText, 0); |
4284 while (c != U_SENTINEL) { | 4232 while (c != U_SENTINEL) { |
4285 if (c<32 || c>256) { | 4233 if (c<32 || c>256) { |
4286 c = '.'; | 4234 c = '.'; |
4287 } | 4235 } |
4288 printf("%c", c); | 4236 printf("%c", c); |
4289 | 4237 |
4290 c = UTEXT_NEXT32(fInputText); | 4238 c = UTEXT_NEXT32(fInputText); |
4291 } | 4239 } |
4292 printf("\n"); | 4240 printf("\n"); |
4293 printf("\n"); | 4241 printf("\n"); |
4294 } | 4242 } |
4295 #endif | 4243 #endif |
4296 | 4244 |
4297 if (U_FAILURE(status)) { | 4245 if (U_FAILURE(status)) { |
4298 return; | 4246 return; |
4299 } | 4247 } |
4300 | 4248 |
4301 // Cache frequently referenced items from the compiled pattern | 4249 // Cache frequently referenced items from the compiled pattern |
4302 // | 4250 // |
4303 int64_t *pat = fPattern->fCompiledPat->getBuffer(); | 4251 int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
4304 | 4252 |
4305 const UChar *litText = fPattern->fLiteralText.getBuffer(); | 4253 const UChar *litText = fPattern->fLiteralText.getBuffer(); |
4306 UVector *sets = fPattern->fSets; | 4254 UVector *sets = fPattern->fSets; |
4307 | 4255 |
4308 const UChar *inputBuf = fInputText->chunkContents; | 4256 const UChar *inputBuf = fInputText->chunkContents; |
4309 | 4257 |
4310 fFrameSize = fPattern->fFrameSize; | 4258 fFrameSize = fPattern->fFrameSize; |
4311 REStackFrame *fp = resetStack(); | 4259 REStackFrame *fp = resetStack(); |
4312 | 4260 |
4313 fp->fPatIdx = 0; | 4261 fp->fPatIdx = 0; |
4314 fp->fInputIdx = startIdx; | 4262 fp->fInputIdx = startIdx; |
4315 | 4263 |
4316 // Zero out the pattern's static data | 4264 // Zero out the pattern's static data |
4317 int32_t i; | 4265 int32_t i; |
4318 for (i = 0; i<fPattern->fDataSize; i++) { | 4266 for (i = 0; i<fPattern->fDataSize; i++) { |
4319 fData[i] = 0; | 4267 fData[i] = 0; |
4320 } | 4268 } |
4321 | 4269 |
4322 // | 4270 // |
4323 // Main loop for interpreting the compiled pattern. | 4271 // Main loop for interpreting the compiled pattern. |
4324 // One iteration of the loop per pattern operation performed. | 4272 // One iteration of the loop per pattern operation performed. |
4325 // | 4273 // |
4326 for (;;) { | 4274 for (;;) { |
4327 #if 0 | |
4328 if (_heapchk() != _HEAPOK) { | |
4329 fprintf(stderr, "Heap Trouble\n"); | |
4330 } | |
4331 #endif | |
4332 | |
4333 op = (int32_t)pat[fp->fPatIdx]; | 4275 op = (int32_t)pat[fp->fPatIdx]; |
4334 opType = URX_TYPE(op); | 4276 opType = URX_TYPE(op); |
4335 opValue = URX_VAL(op); | 4277 opValue = URX_VAL(op); |
4336 #ifdef REGEX_RUN_DEBUG | 4278 #ifdef REGEX_RUN_DEBUG |
4337 if (fTraceDebug) { | 4279 if (fTraceDebug) { |
4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); | 4280 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, | 4281 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ",
fp->fInputIdx, |
4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(
), fActiveLimit); | 4282 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(
), fActiveLimit); |
4341 fPattern->dumpOp(fp->fPatIdx); | 4283 fPattern->dumpOp(fp->fPatIdx); |
4342 } | 4284 } |
4343 #endif | 4285 #endif |
4344 fp->fPatIdx++; | 4286 fp->fPatIdx++; |
4345 | 4287 |
4346 switch (opType) { | 4288 switch (opType) { |
4347 | 4289 |
4348 | 4290 |
4349 case URX_NOP: | 4291 case URX_NOP: |
4350 break; | 4292 break; |
4351 | 4293 |
4352 | 4294 |
4353 case URX_BACKTRACK: | 4295 case URX_BACKTRACK: |
4354 // Force a backtrack. In some circumstances, the pattern compiler | 4296 // Force a backtrack. In some circumstances, the pattern compiler |
4355 // will notice that the pattern can't possibly match anything, and
will | 4297 // will notice that the pattern can't possibly match anything, and
will |
4356 // emit one of these at that point. | 4298 // emit one of these at that point. |
4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4299 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4358 break; | 4300 break; |
4359 | 4301 |
4360 | 4302 |
4361 case URX_ONECHAR: | 4303 case URX_ONECHAR: |
4362 if (fp->fInputIdx < fActiveLimit) { | 4304 if (fp->fInputIdx < fActiveLimit) { |
4363 UChar32 c; | 4305 UChar32 c; |
4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4306 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4365 if (c == opValue) { | 4307 if (c == opValue) { |
4366 break; | 4308 break; |
4367 } | 4309 } |
4368 } else { | 4310 } else { |
4369 fHitEnd = TRUE; | 4311 fHitEnd = TRUE; |
4370 } | 4312 } |
4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4313 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4372 break; | 4314 break; |
4373 | 4315 |
4374 | 4316 |
4375 case URX_STRING: | 4317 case URX_STRING: |
4376 { | 4318 { |
4377 // Test input against a literal string. | 4319 // Test input against a literal string. |
4378 // Strings require two slots in the compiled pattern, one for th
e | 4320 // Strings require two slots in the compiled pattern, one for th
e |
4379 // offset to the string text, and one for the length. | 4321 // offset to the string text, and one for the length. |
4380 int32_t stringStartIdx = opValue; | 4322 int32_t stringStartIdx = opValue; |
4381 int32_t stringLen; | 4323 int32_t stringLen; |
4382 | 4324 |
4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand | 4325 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope
rand |
4384 fp->fPatIdx++; | 4326 fp->fPatIdx++; |
4385 opType = URX_TYPE(op); | 4327 opType = URX_TYPE(op); |
4386 stringLen = URX_VAL(op); | 4328 stringLen = URX_VAL(op); |
4387 U_ASSERT(opType == URX_STRING_LEN); | 4329 U_ASSERT(opType == URX_STRING_LEN); |
4388 U_ASSERT(stringLen >= 2); | 4330 U_ASSERT(stringLen >= 2); |
4389 | 4331 |
4390 const UChar * pInp = inputBuf + fp->fInputIdx; | 4332 const UChar * pInp = inputBuf + fp->fInputIdx; |
4391 const UChar * pInpLimit = inputBuf + fActiveLimit; | 4333 const UChar * pInpLimit = inputBuf + fActiveLimit; |
4392 const UChar * pPat = litText+stringStartIdx; | 4334 const UChar * pPat = litText+stringStartIdx; |
4393 const UChar * pEnd = pInp + stringLen; | 4335 const UChar * pEnd = pInp + stringLen; |
4394 UBool success = TRUE; | 4336 UBool success = TRUE; |
4395 while (pInp < pEnd) { | 4337 while (pInp < pEnd) { |
4396 if (pInp >= pInpLimit) { | 4338 if (pInp >= pInpLimit) { |
4397 fHitEnd = TRUE; | 4339 fHitEnd = TRUE; |
4398 success = FALSE; | 4340 success = FALSE; |
4399 break; | 4341 break; |
4400 } | 4342 } |
4401 if (*pInp++ != *pPat++) { | 4343 if (*pInp++ != *pPat++) { |
4402 success = FALSE; | 4344 success = FALSE; |
4403 break; | 4345 break; |
4404 } | 4346 } |
4405 } | 4347 } |
4406 | 4348 |
4407 if (success) { | 4349 if (success) { |
4408 fp->fInputIdx += stringLen; | 4350 fp->fInputIdx += stringLen; |
4409 } else { | 4351 } else { |
4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4352 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4411 } | 4353 } |
4412 } | 4354 } |
4413 break; | 4355 break; |
4414 | 4356 |
4415 | 4357 |
4416 case URX_STATE_SAVE: | 4358 case URX_STATE_SAVE: |
4417 fp = StateSave(fp, opValue, status); | 4359 fp = StateSave(fp, opValue, status); |
4418 break; | 4360 break; |
4419 | 4361 |
4420 | 4362 |
4421 case URX_END: | 4363 case URX_END: |
4422 // The match loop will exit via this path on a successful match, | 4364 // The match loop will exit via this path on a successful match, |
4423 // when we reach the end of the pattern. | 4365 // when we reach the end of the pattern. |
4424 if (toEnd && fp->fInputIdx != fActiveLimit) { | 4366 if (toEnd && fp->fInputIdx != fActiveLimit) { |
4425 // The pattern matched, but not to the end of input. Try some m
ore. | 4367 // The pattern matched, but not to the end of input. Try some m
ore. |
4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4368 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4427 break; | 4369 break; |
4428 } | 4370 } |
4429 isMatch = TRUE; | 4371 isMatch = TRUE; |
4430 goto breakFromLoop; | 4372 goto breakFromLoop; |
4431 | 4373 |
4432 // Start and End Capture stack frame variables are laid out out like
this: | 4374 // Start and End Capture stack frame variables are laid out out like
this: |
4433 // fp->fExtra[opValue] - The start of a completed capture group | 4375 // fp->fExtra[opValue] - The start of a completed capture group |
4434 // opValue+1 - The end of a completed capture group | 4376 // opValue+1 - The end of a completed capture group |
4435 // opValue+2 - the start of a capture group whose end | 4377 // opValue+2 - the start of a capture group whose end |
4436 // has not yet been reached (and might not
ever be). | 4378 // has not yet been reached (and might not
ever be). |
4437 case URX_START_CAPTURE: | 4379 case URX_START_CAPTURE: |
4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); | 4380 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
4439 fp->fExtra[opValue+2] = fp->fInputIdx; | 4381 fp->fExtra[opValue+2] = fp->fInputIdx; |
4440 break; | 4382 break; |
4441 | 4383 |
4442 | 4384 |
4443 case URX_END_CAPTURE: | 4385 case URX_END_CAPTURE: |
4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); | 4386 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. | 4387 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th
is group must be set. |
4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. | 4388 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start
becomes real. |
4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position | 4389 fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); | 4390 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
4449 break; | 4391 break; |
4450 | 4392 |
4451 | 4393 |
4452 case URX_DOLLAR: // $, test for End of line | 4394 case URX_DOLLAR: // $, test for End of line |
4453 // or for position before new line at end of input | 4395 // or for position before new line at end of input |
4454 if (fp->fInputIdx < fAnchorLimit-2) { | 4396 if (fp->fInputIdx < fAnchorLimit-2) { |
4455 // We are no where near the end of input. Fail. | 4397 // We are no where near the end of input. Fail. |
4456 // This is the common case. Keep it first. | 4398 // This is the common case. Keep it first. |
4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4399 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4458 break; | 4400 break; |
4459 } | 4401 } |
4460 if (fp->fInputIdx >= fAnchorLimit) { | 4402 if (fp->fInputIdx >= fAnchorLimit) { |
4461 // We really are at the end of input. Success. | 4403 // We really are at the end of input. Success. |
4462 fHitEnd = TRUE; | 4404 fHitEnd = TRUE; |
4463 fRequireEnd = TRUE; | 4405 fRequireEnd = TRUE; |
4464 break; | 4406 break; |
4465 } | 4407 } |
4466 | 4408 |
4467 // If we are positioned just before a new-line that is located at th
e | 4409 // If we are positioned just before a new-line that is located at th
e |
4468 // end of input, succeed. | 4410 // end of input, succeed. |
4469 if (fp->fInputIdx == fAnchorLimit-1) { | 4411 if (fp->fInputIdx == fAnchorLimit-1) { |
4470 UChar32 c; | 4412 UChar32 c; |
4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); | 4413 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); |
4472 | 4414 |
4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { | 4415 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { |
4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { | 4416 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
4475 // At new-line at end of input. Success | 4417 // At new-line at end of input. Success |
4476 fHitEnd = TRUE; | 4418 fHitEnd = TRUE; |
4477 fRequireEnd = TRUE; | 4419 fRequireEnd = TRUE; |
4478 break; | 4420 break; |
4479 } | 4421 } |
4480 } | 4422 } |
4481 } else if (fp->fInputIdx == fAnchorLimit-2 && | 4423 } else if (fp->fInputIdx == fAnchorLimit-2 && |
4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a
) { | 4424 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a
) { |
4483 fHitEnd = TRUE; | 4425 fHitEnd = TRUE; |
4484 fRequireEnd = TRUE; | 4426 fRequireEnd = TRUE; |
4485 break; // At CR/LF at end of input.
Success | 4427 break; // At CR/LF at end of input.
Success |
4486 } | 4428 } |
4487 | 4429 |
4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4430 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4489 | 4431 |
4490 break; | 4432 break; |
4491 | 4433 |
4492 | 4434 |
4493 case URX_DOLLAR_D: // $, test for End of Line, in UNI
X_LINES mode. | 4435 case URX_DOLLAR_D: // $, test for End of Line, in UNI
X_LINES mode. |
4494 if (fp->fInputIdx >= fAnchorLimit-1) { | 4436 if (fp->fInputIdx >= fAnchorLimit-1) { |
4495 // Either at the last character of input, or off the end. | 4437 // Either at the last character of input, or off the end. |
4496 if (fp->fInputIdx == fAnchorLimit-1) { | 4438 if (fp->fInputIdx == fAnchorLimit-1) { |
4497 // At last char of input. Success if it's a new line. | 4439 // At last char of input. Success if it's a new line. |
4498 if (inputBuf[fp->fInputIdx] == 0x0a) { | 4440 if (inputBuf[fp->fInputIdx] == 0x0a) { |
4499 fHitEnd = TRUE; | 4441 fHitEnd = TRUE; |
4500 fRequireEnd = TRUE; | 4442 fRequireEnd = TRUE; |
4501 break; | 4443 break; |
4502 } | 4444 } |
4503 } else { | 4445 } else { |
4504 // Off the end of input. Success. | 4446 // Off the end of input. Success. |
4505 fHitEnd = TRUE; | 4447 fHitEnd = TRUE; |
4506 fRequireEnd = TRUE; | 4448 fRequireEnd = TRUE; |
4507 break; | 4449 break; |
4508 } | 4450 } |
4509 } | 4451 } |
4510 | 4452 |
4511 // Not at end of input. Back-track out. | 4453 // Not at end of input. Back-track out. |
4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4454 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4513 break; | 4455 break; |
4514 | 4456 |
4515 | 4457 |
4516 case URX_DOLLAR_M: // $, test for End of line in multi-l
ine mode | 4458 case URX_DOLLAR_M: // $, test for End of line in multi-l
ine mode |
4517 { | 4459 { |
4518 if (fp->fInputIdx >= fAnchorLimit) { | 4460 if (fp->fInputIdx >= fAnchorLimit) { |
4519 // We really are at the end of input. Success. | 4461 // We really are at the end of input. Success. |
4520 fHitEnd = TRUE; | 4462 fHitEnd = TRUE; |
4521 fRequireEnd = TRUE; | 4463 fRequireEnd = TRUE; |
4522 break; | 4464 break; |
4523 } | 4465 } |
4524 // If we are positioned just before a new-line, succeed. | 4466 // If we are positioned just before a new-line, succeed. |
4525 // It makes no difference where the new-line is within the input
. | 4467 // It makes no difference where the new-line is within the input
. |
4526 UChar32 c = inputBuf[fp->fInputIdx]; | 4468 UChar32 c = inputBuf[fp->fInputIdx]; |
4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { | 4469 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { |
4528 // At a line end, except for the odd chance of being in the
middle of a CR/LF sequence | 4470 // At a line end, except for the odd chance of being in the
middle of a CR/LF sequence |
4529 // In multi-line mode, hitting a new-line just before the e
nd of input does not | 4471 // In multi-line mode, hitting a new-line just before the e
nd of input does not |
4530 // set the hitEnd or requireEnd flags | 4472 // set the hitEnd or requireEnd flags |
4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { | 4473 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp-
>fInputIdx-1]==0x0d)) { |
4532 break; | 4474 break; |
4533 } | 4475 } |
4534 } | 4476 } |
4535 // not at a new line. Fail. | 4477 // not at a new line. Fail. |
4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4478 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4537 } | 4479 } |
4538 break; | 4480 break; |
4539 | 4481 |
4540 | 4482 |
4541 case URX_DOLLAR_MD: // $, test for End of line in multi-
line and UNIX_LINES mode | 4483 case URX_DOLLAR_MD: // $, test for End of line in multi-
line and UNIX_LINES mode |
4542 { | 4484 { |
4543 if (fp->fInputIdx >= fAnchorLimit) { | 4485 if (fp->fInputIdx >= fAnchorLimit) { |
4544 // We really are at the end of input. Success. | 4486 // We really are at the end of input. Success. |
4545 fHitEnd = TRUE; | 4487 fHitEnd = TRUE; |
4546 fRequireEnd = TRUE; // Java set requireEnd in this case, ev
en though | 4488 fRequireEnd = TRUE; // Java set requireEnd in this case, ev
en though |
4547 break; // adding a new-line would not lose t
he match. | 4489 break; // adding a new-line would not lose t
he match. |
4548 } | 4490 } |
4549 // If we are not positioned just before a new-line, the test fai
ls; backtrack out. | 4491 // If we are not positioned just before a new-line, the test fai
ls; backtrack out. |
4550 // It makes no difference where the new-line is within the input
. | 4492 // It makes no difference where the new-line is within the input
. |
4551 if (inputBuf[fp->fInputIdx] != 0x0a) { | 4493 if (inputBuf[fp->fInputIdx] != 0x0a) { |
4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4494 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4553 } | 4495 } |
4554 } | 4496 } |
4555 break; | 4497 break; |
4556 | 4498 |
4557 | 4499 |
4558 case URX_CARET: // ^, test for start of line | 4500 case URX_CARET: // ^, test for start of line |
4559 if (fp->fInputIdx != fAnchorStart) { | 4501 if (fp->fInputIdx != fAnchorStart) { |
4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4502 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4561 } | 4503 } |
4562 break; | 4504 break; |
4563 | 4505 |
4564 | 4506 |
4565 case URX_CARET_M: // ^, test for start of line in mul
it-line mode | 4507 case URX_CARET_M: // ^, test for start of line in mul
it-line mode |
4566 { | 4508 { |
4567 if (fp->fInputIdx == fAnchorStart) { | 4509 if (fp->fInputIdx == fAnchorStart) { |
4568 // We are at the start of input. Success. | 4510 // We are at the start of input. Success. |
4569 break; | 4511 break; |
4570 } | 4512 } |
4571 // Check whether character just before the current pos is a new-
line | 4513 // Check whether character just before the current pos is a new-
line |
4572 // unless we are at the end of input | 4514 // unless we are at the end of input |
4573 UChar c = inputBuf[fp->fInputIdx - 1]; | 4515 UChar c = inputBuf[fp->fInputIdx - 1]; |
4574 if ((fp->fInputIdx < fAnchorLimit) && | 4516 if ((fp->fInputIdx < fAnchorLimit) && |
4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 4517 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
4576 // It's a new-line. ^ is true. Success. | 4518 // It's a new-line. ^ is true. Success. |
4577 // TODO: what should be done with positions between a CR a
nd LF? | 4519 // TODO: what should be done with positions between a CR a
nd LF? |
4578 break; | 4520 break; |
4579 } | 4521 } |
4580 // Not at the start of a line. Fail. | 4522 // Not at the start of a line. Fail. |
4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4523 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4582 } | 4524 } |
4583 break; | 4525 break; |
4584 | 4526 |
4585 | 4527 |
4586 case URX_CARET_M_UNIX: // ^, test for start of line in multi-line
+ Unix-line mode | 4528 case URX_CARET_M_UNIX: // ^, test for start of line in multi-line
+ Unix-line mode |
4587 { | 4529 { |
4588 U_ASSERT(fp->fInputIdx >= fAnchorStart); | 4530 U_ASSERT(fp->fInputIdx >= fAnchorStart); |
4589 if (fp->fInputIdx <= fAnchorStart) { | 4531 if (fp->fInputIdx <= fAnchorStart) { |
4590 // We are at the start of input. Success. | 4532 // We are at the start of input. Success. |
4591 break; | 4533 break; |
4592 } | 4534 } |
4593 // Check whether character just before the current pos is a new-
line | 4535 // Check whether character just before the current pos is a new-
line |
4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit); | 4536 U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
4595 UChar c = inputBuf[fp->fInputIdx - 1]; | 4537 UChar c = inputBuf[fp->fInputIdx - 1]; |
4596 if (c != 0x0a) { | 4538 if (c != 0x0a) { |
4597 // Not at the start of a line. Back-track out. | 4539 // Not at the start of a line. Back-track out. |
4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4540 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4599 } | 4541 } |
4600 } | 4542 } |
4601 break; | 4543 break; |
4602 | 4544 |
4603 case URX_BACKSLASH_B: // Test for word boundaries | 4545 case URX_BACKSLASH_B: // Test for word boundaries |
4604 { | 4546 { |
4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); | 4547 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); |
4606 success ^= (UBool)(opValue != 0); // flip sense for \B | 4548 success ^= (UBool)(opValue != 0); // flip sense for \B |
4607 if (!success) { | 4549 if (!success) { |
4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4550 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4609 } | 4551 } |
4610 } | 4552 } |
4611 break; | 4553 break; |
4612 | 4554 |
4613 | 4555 |
4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le | 4556 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty
le |
4615 { | 4557 { |
4616 UBool success = isUWordBoundary(fp->fInputIdx); | 4558 UBool success = isUWordBoundary(fp->fInputIdx); |
4617 success ^= (UBool)(opValue != 0); // flip sense for \B | 4559 success ^= (UBool)(opValue != 0); // flip sense for \B |
4618 if (!success) { | 4560 if (!success) { |
4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4561 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4620 } | 4562 } |
4621 } | 4563 } |
4622 break; | 4564 break; |
4623 | 4565 |
4624 | 4566 |
4625 case URX_BACKSLASH_D: // Test for decimal digit | 4567 case URX_BACKSLASH_D: // Test for decimal digit |
4626 { | 4568 { |
4627 if (fp->fInputIdx >= fActiveLimit) { | 4569 if (fp->fInputIdx >= fActiveLimit) { |
4628 fHitEnd = TRUE; | 4570 fHitEnd = TRUE; |
4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4571 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4630 break; | 4572 break; |
4631 } | 4573 } |
4632 | 4574 |
4633 UChar32 c; | 4575 UChar32 c; |
4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4576 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4635 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. | 4577 int8_t ctype = u_charType(c); // TODO: make a unicode set f
or this. Will be faster. |
4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); | 4578 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
4637 success ^= (UBool)(opValue != 0); // flip sense for \D | 4579 success ^= (UBool)(opValue != 0); // flip sense for \D |
4638 if (!success) { | 4580 if (!success) { |
4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4640 } | 4582 } |
4641 } | 4583 } |
4642 break; | 4584 break; |
4643 | 4585 |
4644 | 4586 |
4645 case URX_BACKSLASH_G: // Test for position at end of previous m
atch | 4587 case URX_BACKSLASH_G: // Test for position at end of previous m
atch |
4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { | 4588 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->
fInputIdx==fActiveStart))) { |
4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4648 } | 4590 } |
4649 break; | 4591 break; |
4650 | 4592 |
4651 | 4593 |
4652 case URX_BACKSLASH_X: | 4594 case URX_BACKSLASH_X: |
4653 // Match a Grapheme, as defined by Unicode TR 29. | 4595 // Match a Grapheme, as defined by Unicode TR 29. |
4654 // Differs slightly from Perl, which consumes combining marks independe
ntly | 4596 // Differs slightly from Perl, which consumes combining marks independe
ntly |
4655 // of context. | 4597 // of context. |
4656 { | 4598 { |
4657 | 4599 |
4658 // Fail if at end of input | 4600 // Fail if at end of input |
4659 if (fp->fInputIdx >= fActiveLimit) { | 4601 if (fp->fInputIdx >= fActiveLimit) { |
4660 fHitEnd = TRUE; | 4602 fHitEnd = TRUE; |
4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4603 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4662 break; | 4604 break; |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4711 } | 4653 } |
4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4654 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { | 4655 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx); | 4656 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
4715 break; | 4657 break; |
4716 } | 4658 } |
4717 } | 4659 } |
4718 goto GC_Done; | 4660 goto GC_Done; |
4719 | 4661 |
4720 GC_Control: | 4662 GC_Control: |
4721 // Most control chars stand alone (don't combine with combining char
s), | 4663 // Most control chars stand alone (don't combine with combining char
s), |
4722 // except that a CR/LF sequence is a single grapheme cluster. | 4664 // except that a CR/LF sequence is a single grapheme cluster. |
4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput
Idx] == 0x0a) { | 4665 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput
Idx] == 0x0a) { |
4724 fp->fInputIdx++; | 4666 fp->fInputIdx++; |
4725 } | 4667 } |
4726 | 4668 |
4727 GC_Done: | 4669 GC_Done: |
4728 if (fp->fInputIdx >= fActiveLimit) { | 4670 if (fp->fInputIdx >= fActiveLimit) { |
4729 fHitEnd = TRUE; | 4671 fHitEnd = TRUE; |
4730 } | 4672 } |
4731 break; | 4673 break; |
4732 } | 4674 } |
4733 | 4675 |
4734 | 4676 |
4735 | 4677 |
4736 | 4678 |
4737 case URX_BACKSLASH_Z: // Test for end of Input | 4679 case URX_BACKSLASH_Z: // Test for end of Input |
4738 if (fp->fInputIdx < fAnchorLimit) { | 4680 if (fp->fInputIdx < fAnchorLimit) { |
4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4681 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4740 } else { | 4682 } else { |
4741 fHitEnd = TRUE; | 4683 fHitEnd = TRUE; |
4742 fRequireEnd = TRUE; | 4684 fRequireEnd = TRUE; |
4743 } | 4685 } |
4744 break; | 4686 break; |
4745 | 4687 |
4746 | 4688 |
4747 | 4689 |
4748 case URX_STATIC_SETREF: | 4690 case URX_STATIC_SETREF: |
4749 { | 4691 { |
4750 // Test input character against one of the predefined sets | 4692 // Test input character against one of the predefined sets |
4751 // (Word Characters, for example) | 4693 // (Word Characters, for example) |
4752 // The high bit of the op value is a flag for the match polarity
. | 4694 // The high bit of the op value is a flag for the match polarity
. |
4753 // 0: success if input char is in set. | 4695 // 0: success if input char is in set. |
4754 // 1: success if input char is not in set. | 4696 // 1: success if input char is not in set. |
4755 if (fp->fInputIdx >= fActiveLimit) { | 4697 if (fp->fInputIdx >= fActiveLimit) { |
4756 fHitEnd = TRUE; | 4698 fHitEnd = TRUE; |
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4699 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4758 break; | 4700 break; |
4759 } | 4701 } |
4760 | 4702 |
4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); | 4703 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
4762 opValue &= ~URX_NEG_SET; | 4704 opValue &= ~URX_NEG_SET; |
4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 4705 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
4764 | 4706 |
4765 UChar32 c; | 4707 UChar32 c; |
4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4708 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4767 if (c < 256) { | 4709 if (c < 256) { |
4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 4710 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
4769 if (s8->contains(c)) { | 4711 if (s8->contains(c)) { |
4770 success = !success; | 4712 success = !success; |
4771 } | 4713 } |
4772 } else { | 4714 } else { |
4773 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 4715 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
4774 if (s->contains(c)) { | 4716 if (s->contains(c)) { |
4775 success = !success; | 4717 success = !success; |
4776 } | 4718 } |
4777 } | 4719 } |
4778 if (!success) { | 4720 if (!success) { |
4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4721 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4780 } | 4722 } |
4781 } | 4723 } |
4782 break; | 4724 break; |
4783 | 4725 |
4784 | 4726 |
4785 case URX_STAT_SETREF_N: | 4727 case URX_STAT_SETREF_N: |
4786 { | 4728 { |
4787 // Test input character for NOT being a member of one of | 4729 // Test input character for NOT being a member of one of |
4788 // the predefined sets (Word Characters, for example) | 4730 // the predefined sets (Word Characters, for example) |
4789 if (fp->fInputIdx >= fActiveLimit) { | 4731 if (fp->fInputIdx >= fActiveLimit) { |
4790 fHitEnd = TRUE; | 4732 fHitEnd = TRUE; |
4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4792 break; | 4734 break; |
4793 } | 4735 } |
4794 | 4736 |
4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); | 4737 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
4796 | 4738 |
4797 UChar32 c; | 4739 UChar32 c; |
4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4740 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4799 if (c < 256) { | 4741 if (c < 256) { |
4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; | 4742 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
4801 if (s8->contains(c) == FALSE) { | 4743 if (s8->contains(c) == FALSE) { |
4802 break; | 4744 break; |
4803 } | 4745 } |
4804 } else { | 4746 } else { |
4805 const UnicodeSet *s = fPattern->fStaticSets[opValue]; | 4747 const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
4806 if (s->contains(c) == FALSE) { | 4748 if (s->contains(c) == FALSE) { |
4807 break; | 4749 break; |
4808 } | 4750 } |
4809 } | 4751 } |
4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4752 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4811 } | 4753 } |
4812 break; | 4754 break; |
4813 | 4755 |
4814 | 4756 |
4815 case URX_SETREF: | 4757 case URX_SETREF: |
4816 { | 4758 { |
4817 if (fp->fInputIdx >= fActiveLimit) { | 4759 if (fp->fInputIdx >= fActiveLimit) { |
4818 fHitEnd = TRUE; | 4760 fHitEnd = TRUE; |
4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4820 break; | 4762 break; |
4821 } | 4763 } |
4822 | 4764 |
4823 U_ASSERT(opValue > 0 && opValue < sets->size()); | 4765 U_ASSERT(opValue > 0 && opValue < sets->size()); |
4824 | 4766 |
4825 // There is input left. Pick up one char and test it for set me
mbership. | 4767 // There is input left. Pick up one char and test it for set me
mbership. |
4826 UChar32 c; | 4768 UChar32 c; |
4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4769 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4828 if (c<256) { | 4770 if (c<256) { |
4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 4771 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
4830 if (s8->contains(c)) { | 4772 if (s8->contains(c)) { |
4831 // The character is in the set. A Match. | 4773 // The character is in the set. A Match. |
4832 break; | 4774 break; |
4833 } | 4775 } |
4834 } else { | 4776 } else { |
4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 4777 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
4836 if (s->contains(c)) { | 4778 if (s->contains(c)) { |
4837 // The character is in the set. A Match. | 4779 // The character is in the set. A Match. |
4838 break; | 4780 break; |
4839 } | 4781 } |
4840 } | 4782 } |
4841 | 4783 |
4842 // the character wasn't in the set. | 4784 // the character wasn't in the set. |
4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4785 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4844 } | 4786 } |
4845 break; | 4787 break; |
4846 | 4788 |
4847 | 4789 |
4848 case URX_DOTANY: | 4790 case URX_DOTANY: |
4849 { | 4791 { |
4850 // . matches anything, but stops at end-of-line. | 4792 // . matches anything, but stops at end-of-line. |
4851 if (fp->fInputIdx >= fActiveLimit) { | 4793 if (fp->fInputIdx >= fActiveLimit) { |
4852 // At end of input. Match failed. Backtrack out. | 4794 // At end of input. Match failed. Backtrack out. |
4853 fHitEnd = TRUE; | 4795 fHitEnd = TRUE; |
4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4796 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4855 break; | 4797 break; |
4856 } | 4798 } |
4857 | 4799 |
4858 // There is input left. Advance over one char, unless we've hit
end-of-line | 4800 // There is input left. Advance over one char, unless we've hit
end-of-line |
4859 UChar32 c; | 4801 UChar32 c; |
4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4802 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible | 4803 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many
chars as possible |
4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ | 4804 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029))
{ |
4863 // End of line in normal mode. . does not match. | 4805 // End of line in normal mode. . does not match. |
4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4865 break; | 4807 break; |
4866 } | 4808 } |
4867 } | 4809 } |
4868 break; | 4810 break; |
4869 | 4811 |
4870 | 4812 |
4871 case URX_DOTANY_ALL: | 4813 case URX_DOTANY_ALL: |
4872 { | 4814 { |
4873 // . in dot-matches-all (including new lines) mode | 4815 // . in dot-matches-all (including new lines) mode |
4874 if (fp->fInputIdx >= fActiveLimit) { | 4816 if (fp->fInputIdx >= fActiveLimit) { |
4875 // At end of input. Match failed. Backtrack out. | 4817 // At end of input. Match failed. Backtrack out. |
4876 fHitEnd = TRUE; | 4818 fHitEnd = TRUE; |
4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4878 break; | 4820 break; |
4879 } | 4821 } |
4880 | 4822 |
4881 // There is input left. Advance over one char, except if we are | 4823 // There is input left. Advance over one char, except if we are |
4882 // at a cr/lf, advance over both of them. | 4824 // at a cr/lf, advance over both of them. |
4883 UChar32 c; | 4825 UChar32 c; |
4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4826 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) { | 4827 if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
4886 // In the case of a CR/LF, we need to advance over both. | 4828 // In the case of a CR/LF, we need to advance over both. |
4887 if (inputBuf[fp->fInputIdx] == 0x0a) { | 4829 if (inputBuf[fp->fInputIdx] == 0x0a) { |
4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); | 4830 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); |
4889 } | 4831 } |
4890 } | 4832 } |
4891 } | 4833 } |
4892 break; | 4834 break; |
4893 | 4835 |
4894 | 4836 |
4895 case URX_DOTANY_UNIX: | 4837 case URX_DOTANY_UNIX: |
4896 { | 4838 { |
4897 // '.' operator, matches all, but stops at end-of-line. | 4839 // '.' operator, matches all, but stops at end-of-line. |
4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. | 4840 // UNIX_LINES mode, so 0x0a is the only recognized line ending
. |
4899 if (fp->fInputIdx >= fActiveLimit) { | 4841 if (fp->fInputIdx >= fActiveLimit) { |
4900 // At end of input. Match failed. Backtrack out. | 4842 // At end of input. Match failed. Backtrack out. |
4901 fHitEnd = TRUE; | 4843 fHitEnd = TRUE; |
4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4844 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4903 break; | 4845 break; |
4904 } | 4846 } |
4905 | 4847 |
4906 // There is input left. Advance over one char, unless we've hit
end-of-line | 4848 // There is input left. Advance over one char, unless we've hit
end-of-line |
4907 UChar32 c; | 4849 UChar32 c; |
4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 4850 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
4909 if (c == 0x0a) { | 4851 if (c == 0x0a) { |
4910 // End of line in UNIX_LINES mode. '.' does not match the \n | 4852 // End of line in UNIX_LINES mode. '.' does not match the \n |
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4912 } | 4854 } |
4913 } | 4855 } |
4914 break; | 4856 break; |
4915 | 4857 |
4916 | 4858 |
4917 case URX_JMP: | 4859 case URX_JMP: |
4918 fp->fPatIdx = opValue; | 4860 fp->fPatIdx = opValue; |
4919 break; | 4861 break; |
4920 | 4862 |
4921 case URX_FAIL: | 4863 case URX_FAIL: |
4922 isMatch = FALSE; | 4864 isMatch = FALSE; |
4923 goto breakFromLoop; | 4865 goto breakFromLoop; |
4924 | 4866 |
4925 case URX_JMP_SAV: | 4867 case URX_JMP_SAV: |
4926 U_ASSERT(opValue < fPattern->fCompiledPat->size()); | 4868 U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current | 4869 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc
following current |
4928 fp->fPatIdx = opValue; // Then JMP. | 4870 fp->fPatIdx = opValue; // Then JMP. |
4929 break; | 4871 break; |
4930 | 4872 |
4931 case URX_JMP_SAV_X: | 4873 case URX_JMP_SAV_X: |
4932 // This opcode is used with (x)+, when x can match a zero length str
ing. | 4874 // This opcode is used with (x)+, when x can match a zero length str
ing. |
4933 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. | 4875 // Same as JMP_SAV, except conditional on the match having made forw
ard progress. |
4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the | 4876 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g
et the |
4935 // data address of the input position at the start of the loop. | 4877 // data address of the input position at the start of the loop. |
4936 { | 4878 { |
4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); | 4879 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()
); |
4938 int32_t stoOp = (int32_t)pat[opValue-1]; | 4880 int32_t stoOp = (int32_t)pat[opValue-1]; |
4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); | 4881 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
4940 int32_t frameLoc = URX_VAL(stoOp); | 4882 int32_t frameLoc = URX_VAL(stoOp); |
4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); | 4883 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; | 4884 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; |
4943 U_ASSERT(prevInputIdx <= fp->fInputIdx); | 4885 U_ASSERT(prevInputIdx <= fp->fInputIdx); |
4944 if (prevInputIdx < fp->fInputIdx) { | 4886 if (prevInputIdx < fp->fInputIdx) { |
4945 // The match did make progress. Repeat the loop. | 4887 // The match did make progress. Repeat the loop. |
4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current | 4888 fp = StateSave(fp, fp->fPatIdx, status); // State save to l
oc following current |
4947 fp->fPatIdx = opValue; | 4889 fp->fPatIdx = opValue; |
4948 fp->fExtra[frameLoc] = fp->fInputIdx; | 4890 fp->fExtra[frameLoc] = fp->fInputIdx; |
4949 } | 4891 } |
4950 // If the input position did not advance, we do nothing here, | 4892 // If the input position did not advance, we do nothing here, |
4951 // execution will fall out of the loop. | 4893 // execution will fall out of the loop. |
4952 } | 4894 } |
4953 break; | 4895 break; |
4954 | 4896 |
4955 case URX_CTR_INIT: | 4897 case URX_CTR_INIT: |
4956 { | 4898 { |
4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 4899 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
4958 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 4900 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
4959 | 4901 |
4960 // Pick up the three extra operands that CTR_INIT has, and | 4902 // Pick up the three extra operands that CTR_INIT has, and |
4961 // skip the pattern location counter past | 4903 // skip the pattern location counter past |
4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 4904 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
4963 fp->fPatIdx += 3; | 4905 fp->fPatIdx += 3; |
4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 4906 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 4907 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 4908 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
4967 U_ASSERT(minCount>=0); | 4909 U_ASSERT(minCount>=0); |
4968 U_ASSERT(maxCount>=minCount || maxCount==-1); | 4910 U_ASSERT(maxCount>=minCount || maxCount==-1); |
4969 U_ASSERT(loopLoc>=fp->fPatIdx); | 4911 U_ASSERT(loopLoc>=fp->fPatIdx); |
4970 | 4912 |
4971 if (minCount == 0) { | 4913 if (minCount == 0) { |
4972 fp = StateSave(fp, loopLoc+1, status); | 4914 fp = StateSave(fp, loopLoc+1, status); |
4973 } | 4915 } |
4974 if (maxCount == -1) { | 4916 if (maxCount == -1) { |
4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki
ng. | 4917 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki
ng. |
4976 } else if (maxCount == 0) { | 4918 } else if (maxCount == 0) { |
4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 4919 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
4978 } | 4920 } |
4979 } | 4921 } |
4980 break; | 4922 break; |
4981 | 4923 |
4982 case URX_CTR_LOOP: | 4924 case URX_CTR_LOOP: |
4983 { | 4925 { |
4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 4926 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
4985 int32_t initOp = (int32_t)pat[opValue]; | 4927 int32_t initOp = (int32_t)pat[opValue]; |
4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); | 4928 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 4929 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
4988 int32_t minCount = (int32_t)pat[opValue+2]; | 4930 int32_t minCount = (int32_t)pat[opValue+2]; |
4989 int32_t maxCount = (int32_t)pat[opValue+3]; | 4931 int32_t maxCount = (int32_t)pat[opValue+3]; |
4990 (*pCounter)++; | 4932 (*pCounter)++; |
4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ | 4933 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ |
4992 U_ASSERT(*pCounter == maxCount); | 4934 U_ASSERT(*pCounter == maxCount); |
4993 break; | 4935 break; |
4994 } | 4936 } |
4995 if (*pCounter >= minCount) { | 4937 if (*pCounter >= minCount) { |
4996 if (maxCount == -1) { | 4938 if (maxCount == -1) { |
4997 // Loop has no hard upper bound. | 4939 // Loop has no hard upper bound. |
4998 // Check that it is progressing through the input, break
if it is not. | 4940 // Check that it is progressing through the input, break
if it is not. |
4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; | 4941 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; |
5000 if (fp->fInputIdx == *pLastInputIdx) { | 4942 if (fp->fInputIdx == *pLastInputIdx) { |
5001 break; | 4943 break; |
5002 } else { | 4944 } else { |
5003 *pLastInputIdx = fp->fInputIdx; | 4945 *pLastInputIdx = fp->fInputIdx; |
5004 } | 4946 } |
5005 } | 4947 } |
5006 fp = StateSave(fp, fp->fPatIdx, status); | 4948 fp = StateSave(fp, fp->fPatIdx, status); |
5007 } | 4949 } |
5008 fp->fPatIdx = opValue + 4; // Loop back. | 4950 fp->fPatIdx = opValue + 4; // Loop back. |
5009 } | 4951 } |
5010 break; | 4952 break; |
5011 | 4953 |
5012 case URX_CTR_INIT_NG: | 4954 case URX_CTR_INIT_NG: |
5013 { | 4955 { |
5014 // Initialize a non-greedy loop | 4956 // Initialize a non-greedy loop |
5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); | 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
5016 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero | 4958 fp->fExtra[opValue] = 0; // Set the loop counte
r variable to zero |
5017 | 4959 |
5018 // Pick up the three extra operands that CTR_INIT_NG has, and | 4960 // Pick up the three extra operands that CTR_INIT_NG has, and |
5019 // skip the pattern location counter past | 4961 // skip the pattern location counter past |
5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
5021 fp->fPatIdx += 3; | 4963 fp->fPatIdx += 3; |
5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); | 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; | 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; | 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
5025 U_ASSERT(minCount>=0); | 4967 U_ASSERT(minCount>=0); |
5026 U_ASSERT(maxCount>=minCount || maxCount==-1); | 4968 U_ASSERT(maxCount>=minCount || maxCount==-1); |
5027 U_ASSERT(loopLoc>fp->fPatIdx); | 4969 U_ASSERT(loopLoc>fp->fPatIdx); |
5028 if (maxCount == -1) { | 4970 if (maxCount == -1) { |
5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. | 4971 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in
put index for loop breaking. |
5030 } | 4972 } |
5031 | 4973 |
5032 if (minCount == 0) { | 4974 if (minCount == 0) { |
5033 if (maxCount != 0) { | 4975 if (maxCount != 0) { |
5034 fp = StateSave(fp, fp->fPatIdx, status); | 4976 fp = StateSave(fp, fp->fPatIdx, status); |
5035 } | 4977 } |
5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block | 4978 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe
ated block |
5037 } | 4979 } |
5038 } | 4980 } |
5039 break; | 4981 break; |
5040 | 4982 |
5041 case URX_CTR_LOOP_NG: | 4983 case URX_CTR_LOOP_NG: |
5042 { | 4984 { |
5043 // Non-greedy {min, max} loops | 4985 // Non-greedy {min, max} loops |
5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); | 4986 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
5045 int32_t initOp = (int32_t)pat[opValue]; | 4987 int32_t initOp = (int32_t)pat[opValue]; |
5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); | 4988 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; | 4989 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
5048 int32_t minCount = (int32_t)pat[opValue+2]; | 4990 int32_t minCount = (int32_t)pat[opValue+2]; |
5049 int32_t maxCount = (int32_t)pat[opValue+3]; | 4991 int32_t maxCount = (int32_t)pat[opValue+3]; |
5050 | 4992 |
5051 (*pCounter)++; | 4993 (*pCounter)++; |
5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ | 4994 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1)
{ |
5053 // The loop has matched the maximum permitted number of time
s. | 4995 // The loop has matched the maximum permitted number of time
s. |
5054 // Break out of here with no action. Matching will | 4996 // Break out of here with no action. Matching will |
5055 // continue with the following pattern. | 4997 // continue with the following pattern. |
5056 U_ASSERT(*pCounter == maxCount); | 4998 U_ASSERT(*pCounter == maxCount); |
5057 break; | 4999 break; |
5058 } | 5000 } |
5059 | 5001 |
5060 if (*pCounter < minCount) { | 5002 if (*pCounter < minCount) { |
5061 // We haven't met the minimum number of matches yet. | 5003 // We haven't met the minimum number of matches yet. |
5062 // Loop back for another one. | 5004 // Loop back for another one. |
5063 fp->fPatIdx = opValue + 4; // Loop back. | 5005 fp->fPatIdx = opValue + 4; // Loop back. |
5064 } else { | 5006 } else { |
5065 // We do have the minimum number of matches. | 5007 // We do have the minimum number of matches. |
5066 | 5008 |
5067 // If there is no upper bound on the loop iterations, check
that the input index | 5009 // If there is no upper bound on the loop iterations, check
that the input index |
5068 // is progressing, and stop the loop if it is not. | 5010 // is progressing, and stop the loop if it is not. |
5069 if (maxCount == -1) { | 5011 if (maxCount == -1) { |
5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; | 5012 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) +
1]; |
5071 if (fp->fInputIdx == *pLastInputIdx) { | 5013 if (fp->fInputIdx == *pLastInputIdx) { |
5072 break; | 5014 break; |
5073 } | 5015 } |
5074 *pLastInputIdx = fp->fInputIdx; | 5016 *pLastInputIdx = fp->fInputIdx; |
5075 } | 5017 } |
5076 | 5018 |
5077 // Loop Continuation: we will fall into the pattern followin
g the loop | 5019 // Loop Continuation: we will fall into the pattern followin
g the loop |
5078 // (non-greedy, don't execute loop body first), but first
do | 5020 // (non-greedy, don't execute loop body first), but first
do |
5079 // a state save to the top of the loop, so that a match fa
ilure | 5021 // a state save to the top of the loop, so that a match fa
ilure |
5080 // in the following pattern will try another iteration of
the loop. | 5022 // in the following pattern will try another iteration of
the loop. |
5081 fp = StateSave(fp, opValue + 4, status); | 5023 fp = StateSave(fp, opValue + 4, status); |
5082 } | 5024 } |
5083 } | 5025 } |
5084 break; | 5026 break; |
5085 | 5027 |
5086 case URX_STO_SP: | 5028 case URX_STO_SP: |
5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); | 5029 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
5088 fData[opValue] = fStack->size(); | 5030 fData[opValue] = fStack->size(); |
5089 break; | 5031 break; |
5090 | 5032 |
5091 case URX_LD_SP: | 5033 case URX_LD_SP: |
5092 { | 5034 { |
5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); | 5035 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
5094 int32_t newStackSize = (int32_t)fData[opValue]; | 5036 int32_t newStackSize = (int32_t)fData[opValue]; |
5095 U_ASSERT(newStackSize <= fStack->size()); | 5037 U_ASSERT(newStackSize <= fStack->size()); |
5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; | 5038 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize
; |
5097 if (newFP == (int64_t *)fp) { | 5039 if (newFP == (int64_t *)fp) { |
5098 break; | 5040 break; |
5099 } | 5041 } |
5100 int32_t i; | 5042 int32_t i; |
5101 for (i=0; i<fFrameSize; i++) { | 5043 for (i=0; i<fFrameSize; i++) { |
5102 newFP[i] = ((int64_t *)fp)[i]; | 5044 newFP[i] = ((int64_t *)fp)[i]; |
5103 } | 5045 } |
5104 fp = (REStackFrame *)newFP; | 5046 fp = (REStackFrame *)newFP; |
5105 fStack->setSize(newStackSize); | 5047 fStack->setSize(newStackSize); |
5106 } | 5048 } |
5107 break; | 5049 break; |
5108 | 5050 |
5109 case URX_BACKREF: | 5051 case URX_BACKREF: |
5110 { | 5052 { |
5111 U_ASSERT(opValue < fFrameSize); | 5053 U_ASSERT(opValue < fFrameSize); |
5112 int64_t groupStartIdx = fp->fExtra[opValue]; | 5054 int64_t groupStartIdx = fp->fExtra[opValue]; |
5113 int64_t groupEndIdx = fp->fExtra[opValue+1]; | 5055 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
5114 U_ASSERT(groupStartIdx <= groupEndIdx); | 5056 U_ASSERT(groupStartIdx <= groupEndIdx); |
5115 int64_t inputIndex = fp->fInputIdx; | 5057 int64_t inputIndex = fp->fInputIdx; |
5116 if (groupStartIdx < 0) { | 5058 if (groupStartIdx < 0) { |
5117 // This capture group has not participated in the match thus
far, | 5059 // This capture group has not participated in the match thus
far, |
5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 5060 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
(...skipping 11 matching lines...) Expand all Loading... |
5130 break; | 5072 break; |
5131 } | 5073 } |
5132 } | 5074 } |
5133 if (success) { | 5075 if (success) { |
5134 fp->fInputIdx = inputIndex; | 5076 fp->fInputIdx = inputIndex; |
5135 } else { | 5077 } else { |
5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5078 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5137 } | 5079 } |
5138 } | 5080 } |
5139 break; | 5081 break; |
5140 | 5082 |
5141 case URX_BACKREF_I: | 5083 case URX_BACKREF_I: |
5142 { | 5084 { |
5143 U_ASSERT(opValue < fFrameSize); | 5085 U_ASSERT(opValue < fFrameSize); |
5144 int64_t groupStartIdx = fp->fExtra[opValue]; | 5086 int64_t groupStartIdx = fp->fExtra[opValue]; |
5145 int64_t groupEndIdx = fp->fExtra[opValue+1]; | 5087 int64_t groupEndIdx = fp->fExtra[opValue+1]; |
5146 U_ASSERT(groupStartIdx <= groupEndIdx); | 5088 U_ASSERT(groupStartIdx <= groupEndIdx); |
5147 if (groupStartIdx < 0) { | 5089 if (groupStartIdx < 0) { |
5148 // This capture group has not participated in the match thus
far, | 5090 // This capture group has not participated in the match thus
far, |
5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. | 5091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no match. |
5150 break; | 5092 break; |
5151 } | 5093 } |
5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx
, groupEndIdx); | 5094 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx
, groupEndIdx); |
5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi
veLimit); | 5095 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi
veLimit); |
5154 | 5096 |
5155 // Note: if the capture group match was of an empty string the
backref | 5097 // Note: if the capture group match was of an empty string the
backref |
5156 // match succeeds. Verified by testing: Perl matches s
ucceed | 5098 // match succeeds. Verified by testing: Perl matches s
ucceed |
5157 // in this case, so we do too. | 5099 // in this case, so we do too. |
5158 | 5100 |
5159 UBool success = TRUE; | 5101 UBool success = TRUE; |
5160 for (;;) { | 5102 for (;;) { |
5161 UChar32 captureGroupChar = captureGroupItr.next(); | 5103 UChar32 captureGroupChar = captureGroupItr.next(); |
5162 if (captureGroupChar == U_SENTINEL) { | 5104 if (captureGroupChar == U_SENTINEL) { |
5163 success = TRUE; | 5105 success = TRUE; |
5164 break; | 5106 break; |
5165 } | 5107 } |
5166 UChar32 inputChar = inputItr.next(); | 5108 UChar32 inputChar = inputItr.next(); |
5167 if (inputChar == U_SENTINEL) { | 5109 if (inputChar == U_SENTINEL) { |
5168 success = FALSE; | 5110 success = FALSE; |
5169 fHitEnd = TRUE; | 5111 fHitEnd = TRUE; |
5170 break; | 5112 break; |
5171 } | 5113 } |
5172 if (inputChar != captureGroupChar) { | 5114 if (inputChar != captureGroupChar) { |
5173 success = FALSE; | 5115 success = FALSE; |
5174 break; | 5116 break; |
5175 } | 5117 } |
5176 } | 5118 } |
5177 | 5119 |
5178 if (success && inputItr.inExpansion()) { | 5120 if (success && inputItr.inExpansion()) { |
5179 // We obtained a match by consuming part of a string obtained
from | 5121 // We obtained a match by consuming part of a string obtained
from |
5180 // case-folding a single code point of the input text. | 5122 // case-folding a single code point of the input text. |
5181 // This does not count as an overall match. | 5123 // This does not count as an overall match. |
5182 success = FALSE; | 5124 success = FALSE; |
5183 } | 5125 } |
5184 | 5126 |
5185 if (success) { | 5127 if (success) { |
5186 fp->fInputIdx = inputItr.getIndex(); | 5128 fp->fInputIdx = inputItr.getIndex(); |
5187 } else { | 5129 } else { |
5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5130 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5189 } | 5131 } |
5190 } | 5132 } |
5191 break; | 5133 break; |
5192 | 5134 |
5193 case URX_STO_INP_LOC: | 5135 case URX_STO_INP_LOC: |
5194 { | 5136 { |
5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize); | 5137 U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
5196 fp->fExtra[opValue] = fp->fInputIdx; | 5138 fp->fExtra[opValue] = fp->fInputIdx; |
5197 } | 5139 } |
5198 break; | 5140 break; |
5199 | 5141 |
5200 case URX_JMPX: | 5142 case URX_JMPX: |
5201 { | 5143 { |
5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; | 5144 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
5203 fp->fPatIdx += 1; | 5145 fp->fPatIdx += 1; |
5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); | 5146 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); | 5147 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; | 5148 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; |
5207 U_ASSERT(savedInputIdx <= fp->fInputIdx); | 5149 U_ASSERT(savedInputIdx <= fp->fInputIdx); |
5208 if (savedInputIdx < fp->fInputIdx) { | 5150 if (savedInputIdx < fp->fInputIdx) { |
5209 fp->fPatIdx = opValue; // JMP | 5151 fp->fPatIdx = opValue; // JMP |
5210 } else { | 5152 } else { |
5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no progress in loop. | 5153 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL
, no progress in loop. |
5212 } | 5154 } |
5213 } | 5155 } |
5214 break; | 5156 break; |
5215 | 5157 |
5216 case URX_LA_START: | 5158 case URX_LA_START: |
5217 { | 5159 { |
5218 // Entering a lookahead block. | 5160 // Entering a lookahead block. |
5219 // Save Stack Ptr, Input Pos. | 5161 // Save Stack Ptr, Input Pos. |
5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5162 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5221 fData[opValue] = fStack->size(); | 5163 fData[opValue] = fStack->size(); |
5222 fData[opValue+1] = fp->fInputIdx; | 5164 fData[opValue+1] = fp->fInputIdx; |
5223 fActiveStart = fLookStart; // Set the match region
change for | 5165 fActiveStart = fLookStart; // Set the match region
change for |
5224 fActiveLimit = fLookLimit; // transparent bounds. | 5166 fActiveLimit = fLookLimit; // transparent bounds. |
5225 } | 5167 } |
5226 break; | 5168 break; |
5227 | 5169 |
5228 case URX_LA_END: | 5170 case URX_LA_END: |
5229 { | 5171 { |
5230 // Leaving a look-ahead block. | 5172 // Leaving a look-ahead block. |
5231 // restore Stack Ptr, Input Pos to positions they had on entry
to block. | 5173 // restore Stack Ptr, Input Pos to positions they had on entry
to block. |
5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5174 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5233 int32_t stackSize = fStack->size(); | 5175 int32_t stackSize = fStack->size(); |
5234 int32_t newStackSize = (int32_t)fData[opValue]; | 5176 int32_t newStackSize = (int32_t)fData[opValue]; |
5235 U_ASSERT(stackSize >= newStackSize); | 5177 U_ASSERT(stackSize >= newStackSize); |
5236 if (stackSize > newStackSize) { | 5178 if (stackSize > newStackSize) { |
5237 // Copy the current top frame back to the new (cut back) top
frame. | 5179 // Copy the current top frame back to the new (cut back) top
frame. |
5238 // This makes the capture groups from within the look-ahea
d | 5180 // This makes the capture groups from within the look-ahea
d |
5239 // expression available. | 5181 // expression available. |
5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; | 5182 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame
Size; |
5241 int32_t i; | 5183 int32_t i; |
5242 for (i=0; i<fFrameSize; i++) { | 5184 for (i=0; i<fFrameSize; i++) { |
5243 newFP[i] = ((int64_t *)fp)[i]; | 5185 newFP[i] = ((int64_t *)fp)[i]; |
5244 } | 5186 } |
5245 fp = (REStackFrame *)newFP; | 5187 fp = (REStackFrame *)newFP; |
5246 fStack->setSize(newStackSize); | 5188 fStack->setSize(newStackSize); |
5247 } | 5189 } |
5248 fp->fInputIdx = fData[opValue+1]; | 5190 fp->fInputIdx = fData[opValue+1]; |
5249 | 5191 |
5250 // Restore the active region bounds in the input string; they ma
y have | 5192 // Restore the active region bounds in the input string; they ma
y have |
5251 // been changed because of transparent bounds on a Region. | 5193 // been changed because of transparent bounds on a Region. |
5252 fActiveStart = fRegionStart; | 5194 fActiveStart = fRegionStart; |
5253 fActiveLimit = fRegionLimit; | 5195 fActiveLimit = fRegionLimit; |
5254 } | 5196 } |
5255 break; | 5197 break; |
5256 | 5198 |
5257 case URX_ONECHAR_I: | 5199 case URX_ONECHAR_I: |
5258 if (fp->fInputIdx < fActiveLimit) { | 5200 if (fp->fInputIdx < fActiveLimit) { |
5259 UChar32 c; | 5201 UChar32 c; |
5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); | 5202 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { | 5203 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
5262 break; | 5204 break; |
5263 } | 5205 } |
5264 } else { | 5206 } else { |
5265 fHitEnd = TRUE; | 5207 fHitEnd = TRUE; |
5266 } | 5208 } |
5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5209 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5268 break; | 5210 break; |
5269 | 5211 |
5270 case URX_STRING_I: | 5212 case URX_STRING_I: |
5271 // Case-insensitive test input against a literal string. | 5213 // Case-insensitive test input against a literal string. |
5272 // Strings require two slots in the compiled pattern, one for the | 5214 // Strings require two slots in the compiled pattern, one for the |
5273 // offset to the string text, and one for the length. | 5215 // offset to the string text, and one for the length. |
5274 // The compiled string has already been case folded. | 5216 // The compiled string has already been case folded. |
5275 { | 5217 { |
5276 const UChar *patternString = litText + opValue; | 5218 const UChar *patternString = litText + opValue; |
5277 | 5219 |
5278 op = (int32_t)pat[fp->fPatIdx]; | 5220 op = (int32_t)pat[fp->fPatIdx]; |
5279 fp->fPatIdx++; | 5221 fp->fPatIdx++; |
5280 opType = URX_TYPE(op); | 5222 opType = URX_TYPE(op); |
5281 opValue = URX_VAL(op); | 5223 opValue = URX_VAL(op); |
5282 U_ASSERT(opType == URX_STRING_LEN); | 5224 U_ASSERT(opType == URX_STRING_LEN); |
5283 int32_t patternStringLen = opValue; // Length of the string fro
m the pattern. | 5225 int32_t patternStringLen = opValue; // Length of the string fro
m the pattern. |
5284 | 5226 |
5285 UChar32 cText; | 5227 UChar32 cText; |
5286 UChar32 cPattern; | 5228 UChar32 cPattern; |
5287 UBool success = TRUE; | 5229 UBool success = TRUE; |
5288 int32_t patternStringIdx = 0; | 5230 int32_t patternStringIdx = 0; |
5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx,
fActiveLimit); | 5231 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx,
fActiveLimit); |
5290 while (patternStringIdx < patternStringLen) { | 5232 while (patternStringIdx < patternStringLen) { |
5291 U16_NEXT(patternString, patternStringIdx, patternStringLen,
cPattern); | 5233 U16_NEXT(patternString, patternStringIdx, patternStringLen,
cPattern); |
5292 cText = inputIterator.next(); | 5234 cText = inputIterator.next(); |
5293 if (cText != cPattern) { | 5235 if (cText != cPattern) { |
5294 success = FALSE; | 5236 success = FALSE; |
(...skipping 24 matching lines...) Expand all Loading... |
5319 fData[opValue] = fStack->size(); | 5261 fData[opValue] = fStack->size(); |
5320 fData[opValue+1] = fp->fInputIdx; | 5262 fData[opValue+1] = fp->fInputIdx; |
5321 // Init the variable containing the start index for attempted ma
tches. | 5263 // Init the variable containing the start index for attempted ma
tches. |
5322 fData[opValue+2] = -1; | 5264 fData[opValue+2] = -1; |
5323 // Save input string length, then reset to pin any matches to en
d at | 5265 // Save input string length, then reset to pin any matches to en
d at |
5324 // the current position. | 5266 // the current position. |
5325 fData[opValue+3] = fActiveLimit; | 5267 fData[opValue+3] = fActiveLimit; |
5326 fActiveLimit = fp->fInputIdx; | 5268 fActiveLimit = fp->fInputIdx; |
5327 } | 5269 } |
5328 break; | 5270 break; |
5329 | 5271 |
5330 | 5272 |
5331 case URX_LB_CONT: | 5273 case URX_LB_CONT: |
5332 { | 5274 { |
5333 // Positive Look-Behind, at top of loop checking for matches of
LB expression | 5275 // Positive Look-Behind, at top of loop checking for matches of
LB expression |
5334 // at all possible input starting positions. | 5276 // at all possible input starting positions. |
5335 | 5277 |
5336 // Fetch the min and max possible match lengths. They are the o
perands | 5278 // Fetch the min and max possible match lengths. They are the o
perands |
5337 // of this op in the pattern. | 5279 // of this op in the pattern. |
5338 int32_t minML = (int32_t)pat[fp->fPatIdx++]; | 5280 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; | 5281 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
5340 U_ASSERT(minML <= maxML); | 5282 U_ASSERT(minML <= maxML); |
5341 U_ASSERT(minML >= 0); | 5283 U_ASSERT(minML >= 0); |
5342 | 5284 |
5343 // Fetch (from data) the last input index where a match was atte
mpted. | 5285 // Fetch (from data) the last input index where a match was atte
mpted. |
5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5286 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5345 int64_t *lbStartIdx = &fData[opValue+2]; | 5287 int64_t *lbStartIdx = &fData[opValue+2]; |
5346 if (*lbStartIdx < 0) { | 5288 if (*lbStartIdx < 0) { |
5347 // First time through loop. | 5289 // First time through loop. |
5348 *lbStartIdx = fp->fInputIdx - minML; | 5290 *lbStartIdx = fp->fInputIdx - minML; |
5349 } else { | 5291 } else { |
5350 // 2nd through nth time through the loop. | 5292 // 2nd through nth time through the loop. |
5351 // Back up start position for match by one. | 5293 // Back up start position for match by one. |
5352 if (*lbStartIdx == 0) { | 5294 if (*lbStartIdx == 0) { |
5353 (*lbStartIdx)--; | 5295 (*lbStartIdx)--; |
5354 } else { | 5296 } else { |
5355 U16_BACK_1(inputBuf, 0, *lbStartIdx); | 5297 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
5356 } | 5298 } |
5357 } | 5299 } |
5358 | 5300 |
5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { | 5301 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
5360 // We have tried all potential match starting points without | 5302 // We have tried all potential match starting points without |
5361 // getting a match. Backtrack out, and out of the | 5303 // getting a match. Backtrack out, and out of the |
5362 // Look Behind altogether. | 5304 // Look Behind altogether. |
5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5305 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5364 int64_t restoreInputLen = fData[opValue+3]; | 5306 int64_t restoreInputLen = fData[opValue+3]; |
5365 U_ASSERT(restoreInputLen >= fActiveLimit); | 5307 U_ASSERT(restoreInputLen >= fActiveLimit); |
5366 U_ASSERT(restoreInputLen <= fInputLength); | 5308 U_ASSERT(restoreInputLen <= fInputLength); |
5367 fActiveLimit = restoreInputLen; | 5309 fActiveLimit = restoreInputLen; |
5368 break; | 5310 break; |
5369 } | 5311 } |
5370 | 5312 |
5371 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. | 5313 // Save state to this URX_LB_CONT op, so failure to match wil
l repeat the loop. |
5372 // (successful match will fall off the end of the loop.) | 5314 // (successful match will fall off the end of the loop.) |
5373 fp = StateSave(fp, fp->fPatIdx-3, status); | 5315 fp = StateSave(fp, fp->fPatIdx-3, status); |
5374 fp->fInputIdx = *lbStartIdx; | 5316 fp->fInputIdx = *lbStartIdx; |
5375 } | 5317 } |
5376 break; | 5318 break; |
5377 | 5319 |
5378 case URX_LB_END: | 5320 case URX_LB_END: |
5379 // End of a look-behind block, after a successful match. | 5321 // End of a look-behind block, after a successful match. |
5380 { | 5322 { |
5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5323 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5382 if (fp->fInputIdx != fActiveLimit) { | 5324 if (fp->fInputIdx != fActiveLimit) { |
5383 // The look-behind expression matched, but the match did no
t | 5325 // The look-behind expression matched, but the match did no
t |
5384 // extend all the way to the point that we are looking be
hind from. | 5326 // extend all the way to the point that we are looking be
hind from. |
5385 // FAIL out of here, which will take us back to the LB_CONT
, which | 5327 // FAIL out of here, which will take us back to the LB_CONT
, which |
5386 // will retry the match starting at another position or
fail | 5328 // will retry the match starting at another position or
fail |
5387 // the look-behind altogether, whichever is appropriate. | 5329 // the look-behind altogether, whichever is appropriate. |
5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5330 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5389 break; | 5331 break; |
5390 } | 5332 } |
5391 | 5333 |
5392 // Look-behind match is good. Restore the original input string
length, | 5334 // Look-behind match is good. Restore the original input string
length, |
5393 // which had been truncated to pin the end of the lookbehind m
atch to the | 5335 // which had been truncated to pin the end of the lookbehind m
atch to the |
5394 // position being looked-behind. | 5336 // position being looked-behind. |
5395 int64_t originalInputLen = fData[opValue+3]; | 5337 int64_t originalInputLen = fData[opValue+3]; |
5396 U_ASSERT(originalInputLen >= fActiveLimit); | 5338 U_ASSERT(originalInputLen >= fActiveLimit); |
5397 U_ASSERT(originalInputLen <= fInputLength); | 5339 U_ASSERT(originalInputLen <= fInputLength); |
5398 fActiveLimit = originalInputLen; | 5340 fActiveLimit = originalInputLen; |
5399 } | 5341 } |
5400 break; | 5342 break; |
5401 | 5343 |
5402 | 5344 |
5403 case URX_LBN_CONT: | 5345 case URX_LBN_CONT: |
5404 { | 5346 { |
5405 // Negative Look-Behind, at top of loop checking for matches of
LB expression | 5347 // Negative Look-Behind, at top of loop checking for matches of
LB expression |
5406 // at all possible input starting positions. | 5348 // at all possible input starting positions. |
5407 | 5349 |
5408 // Fetch the extra parameters of this op. | 5350 // Fetch the extra parameters of this op. |
5409 int32_t minML = (int32_t)pat[fp->fPatIdx++]; | 5351 int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; | 5352 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; | 5353 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
5412 continueLoc = URX_VAL(continueLoc); | 5354 continueLoc = URX_VAL(continueLoc); |
5413 U_ASSERT(minML <= maxML); | 5355 U_ASSERT(minML <= maxML); |
5414 U_ASSERT(minML >= 0); | 5356 U_ASSERT(minML >= 0); |
5415 U_ASSERT(continueLoc > fp->fPatIdx); | 5357 U_ASSERT(continueLoc > fp->fPatIdx); |
5416 | 5358 |
5417 // Fetch (from data) the last input index where a match was atte
mpted. | 5359 // Fetch (from data) the last input index where a match was atte
mpted. |
5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5360 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5419 int64_t *lbStartIdx = &fData[opValue+2]; | 5361 int64_t *lbStartIdx = &fData[opValue+2]; |
5420 if (*lbStartIdx < 0) { | 5362 if (*lbStartIdx < 0) { |
5421 // First time through loop. | 5363 // First time through loop. |
5422 *lbStartIdx = fp->fInputIdx - minML; | 5364 *lbStartIdx = fp->fInputIdx - minML; |
5423 } else { | 5365 } else { |
5424 // 2nd through nth time through the loop. | 5366 // 2nd through nth time through the loop. |
5425 // Back up start position for match by one. | 5367 // Back up start position for match by one. |
5426 if (*lbStartIdx == 0) { | 5368 if (*lbStartIdx == 0) { |
5427 (*lbStartIdx)--; // Because U16_BACK is unsafe startin
g at 0. | 5369 (*lbStartIdx)--; // Because U16_BACK is unsafe startin
g at 0. |
5428 } else { | 5370 } else { |
5429 U16_BACK_1(inputBuf, 0, *lbStartIdx); | 5371 U16_BACK_1(inputBuf, 0, *lbStartIdx); |
5430 } | 5372 } |
5431 } | 5373 } |
5432 | 5374 |
5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { | 5375 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
5434 // We have tried all potential match starting points without | 5376 // We have tried all potential match starting points without |
5435 // getting a match, which means that the negative lookbehin
d as | 5377 // getting a match, which means that the negative lookbehin
d as |
5436 // a whole has succeeded. Jump forward to the continue loc
ation | 5378 // a whole has succeeded. Jump forward to the continue loc
ation |
5437 int64_t restoreInputLen = fData[opValue+3]; | 5379 int64_t restoreInputLen = fData[opValue+3]; |
5438 U_ASSERT(restoreInputLen >= fActiveLimit); | 5380 U_ASSERT(restoreInputLen >= fActiveLimit); |
5439 U_ASSERT(restoreInputLen <= fInputLength); | 5381 U_ASSERT(restoreInputLen <= fInputLength); |
5440 fActiveLimit = restoreInputLen; | 5382 fActiveLimit = restoreInputLen; |
5441 fp->fPatIdx = continueLoc; | 5383 fp->fPatIdx = continueLoc; |
5442 break; | 5384 break; |
5443 } | 5385 } |
5444 | 5386 |
5445 // Save state to this URX_LBN_CONT op, so failure to match wil
l repeat the loop. | 5387 // Save state to this URX_LBN_CONT op, so failure to match wil
l repeat the loop. |
5446 // (successful match will cause a FAIL out of the loop alto
gether.) | 5388 // (successful match will cause a FAIL out of the loop alto
gether.) |
5447 fp = StateSave(fp, fp->fPatIdx-4, status); | 5389 fp = StateSave(fp, fp->fPatIdx-4, status); |
5448 fp->fInputIdx = *lbStartIdx; | 5390 fp->fInputIdx = *lbStartIdx; |
5449 } | 5391 } |
5450 break; | 5392 break; |
5451 | 5393 |
5452 case URX_LBN_END: | 5394 case URX_LBN_END: |
5453 // End of a negative look-behind block, after a successful match. | 5395 // End of a negative look-behind block, after a successful match. |
5454 { | 5396 { |
5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5397 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5456 if (fp->fInputIdx != fActiveLimit) { | 5398 if (fp->fInputIdx != fActiveLimit) { |
5457 // The look-behind expression matched, but the match did no
t | 5399 // The look-behind expression matched, but the match did no
t |
5458 // extend all the way to the point that we are looking be
hind from. | 5400 // extend all the way to the point that we are looking be
hind from. |
5459 // FAIL out of here, which will take us back to the LB_CONT
, which | 5401 // FAIL out of here, which will take us back to the LB_CONT
, which |
5460 // will retry the match starting at another position or
succeed | 5402 // will retry the match starting at another position or
succeed |
5461 // the look-behind altogether, whichever is appropriate. | 5403 // the look-behind altogether, whichever is appropriate. |
5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5404 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5463 break; | 5405 break; |
5464 } | 5406 } |
5465 | 5407 |
5466 // Look-behind expression matched, which means look-behind test
as | 5408 // Look-behind expression matched, which means look-behind test
as |
5467 // a whole Fails | 5409 // a whole Fails |
5468 | 5410 |
5469 // Restore the original input string length, which had been tru
ncated | 5411 // Restore the original input string length, which had been tru
ncated |
5470 // in order to pin the end of the lookbehind match | 5412 // in order to pin the end of the lookbehind match |
5471 // to the position being looked-behind. | 5413 // to the position being looked-behind. |
5472 int64_t originalInputLen = fData[opValue+3]; | 5414 int64_t originalInputLen = fData[opValue+3]; |
5473 U_ASSERT(originalInputLen >= fActiveLimit); | 5415 U_ASSERT(originalInputLen >= fActiveLimit); |
5474 U_ASSERT(originalInputLen <= fInputLength); | 5416 U_ASSERT(originalInputLen <= fInputLength); |
5475 fActiveLimit = originalInputLen; | 5417 fActiveLimit = originalInputLen; |
5476 | 5418 |
5477 // Restore original stack position, discarding any state saved | 5419 // Restore original stack position, discarding any state saved |
5478 // by the successful pattern match. | 5420 // by the successful pattern match. |
5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); | 5421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
5480 int32_t newStackSize = (int32_t)fData[opValue]; | 5422 int32_t newStackSize = (int32_t)fData[opValue]; |
5481 U_ASSERT(fStack->size() > newStackSize); | 5423 U_ASSERT(fStack->size() > newStackSize); |
5482 fStack->setSize(newStackSize); | 5424 fStack->setSize(newStackSize); |
5483 | 5425 |
5484 // FAIL, which will take control back to someplace | 5426 // FAIL, which will take control back to someplace |
5485 // prior to entering the look-behind test. | 5427 // prior to entering the look-behind test. |
5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize); | 5428 fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
5487 } | 5429 } |
5488 break; | 5430 break; |
5489 | 5431 |
5490 | 5432 |
5491 case URX_LOOP_SR_I: | 5433 case URX_LOOP_SR_I: |
5492 // Loop Initialization for the optimized implementation of | 5434 // Loop Initialization for the optimized implementation of |
5493 // [some character set]* | 5435 // [some character set]* |
5494 // This op scans through all matching input. | 5436 // This op scans through all matching input. |
5495 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. | 5437 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
5496 { | 5438 { |
5497 U_ASSERT(opValue > 0 && opValue < sets->size()); | 5439 U_ASSERT(opValue > 0 && opValue < sets->size()); |
5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; | 5440 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); | 5441 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
5500 | 5442 |
5501 // Loop through input, until either the input is exhausted or | 5443 // Loop through input, until either the input is exhausted or |
5502 // we reach a character that is not a member of the set. | 5444 // we reach a character that is not a member of the set. |
5503 int32_t ix = (int32_t)fp->fInputIdx; | 5445 int32_t ix = (int32_t)fp->fInputIdx; |
5504 for (;;) { | 5446 for (;;) { |
5505 if (ix >= fActiveLimit) { | 5447 if (ix >= fActiveLimit) { |
5506 fHitEnd = TRUE; | 5448 fHitEnd = TRUE; |
5507 break; | 5449 break; |
5508 } | 5450 } |
5509 UChar32 c; | 5451 UChar32 c; |
5510 U16_NEXT(inputBuf, ix, fActiveLimit, c); | 5452 U16_NEXT(inputBuf, ix, fActiveLimit, c); |
5511 if (c<256) { | 5453 if (c<256) { |
5512 if (s8->contains(c) == FALSE) { | 5454 if (s8->contains(c) == FALSE) { |
5513 U16_BACK_1(inputBuf, 0, ix); | 5455 U16_BACK_1(inputBuf, 0, ix); |
5514 break; | 5456 break; |
5515 } | 5457 } |
5516 } else { | 5458 } else { |
5517 if (s->contains(c) == FALSE) { | 5459 if (s->contains(c) == FALSE) { |
5518 U16_BACK_1(inputBuf, 0, ix); | 5460 U16_BACK_1(inputBuf, 0, ix); |
5519 break; | 5461 break; |
5520 } | 5462 } |
5521 } | 5463 } |
5522 } | 5464 } |
5523 | 5465 |
5524 // If there were no matching characters, skip over the loop alto
gether. | 5466 // If there were no matching characters, skip over the loop alto
gether. |
5525 // The loop doesn't run at all, a * op always succeeds. | 5467 // The loop doesn't run at all, a * op always succeeds. |
5526 if (ix == fp->fInputIdx) { | 5468 if (ix == fp->fInputIdx) { |
5527 fp->fPatIdx++; // skip the URX_LOOP_C op. | 5469 fp->fPatIdx++; // skip the URX_LOOP_C op. |
5528 break; | 5470 break; |
5529 } | 5471 } |
5530 | 5472 |
5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that | 5473 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
5532 // must follow. Its operand is the stack location | 5474 // must follow. Its operand is the stack location |
5533 // that holds the starting input index for the match of this [
set]* | 5475 // that holds the starting input index for the match of this [
set]* |
5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; | 5476 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); | 5477 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
5536 int32_t stackLoc = URX_VAL(loopcOp); | 5478 int32_t stackLoc = URX_VAL(loopcOp); |
5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); | 5479 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
5538 fp->fExtra[stackLoc] = fp->fInputIdx; | 5480 fp->fExtra[stackLoc] = fp->fInputIdx; |
5539 fp->fInputIdx = ix; | 5481 fp->fInputIdx = ix; |
5540 | 5482 |
5541 // Save State to the URX_LOOP_C op that follows this one, | 5483 // Save State to the URX_LOOP_C op that follows this one, |
5542 // so that match failures in the following code will return to
there. | 5484 // so that match failures in the following code will return to
there. |
5543 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. | 5485 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
5544 fp = StateSave(fp, fp->fPatIdx, status); | 5486 fp = StateSave(fp, fp->fPatIdx, status); |
5545 fp->fPatIdx++; | 5487 fp->fPatIdx++; |
5546 } | 5488 } |
5547 break; | 5489 break; |
5548 | 5490 |
5549 | 5491 |
5550 case URX_LOOP_DOT_I: | 5492 case URX_LOOP_DOT_I: |
5551 // Loop Initialization for the optimized implementation of .* | 5493 // Loop Initialization for the optimized implementation of .* |
5552 // This op scans through all remaining input. | 5494 // This op scans through all remaining input. |
5553 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. | 5495 // The following LOOP_C op emulates stack unwinding if the followi
ng pattern fails. |
5554 { | 5496 { |
5555 // Loop through input until the input is exhausted (we reach an
end-of-line) | 5497 // Loop through input until the input is exhausted (we reach an
end-of-line) |
5556 // In DOTALL mode, we can just go straight to the end of the inp
ut. | 5498 // In DOTALL mode, we can just go straight to the end of the inp
ut. |
5557 int32_t ix; | 5499 int32_t ix; |
5558 if ((opValue & 1) == 1) { | 5500 if ((opValue & 1) == 1) { |
5559 // Dot-matches-All mode. Jump straight to the end of the st
ring. | 5501 // Dot-matches-All mode. Jump straight to the end of the st
ring. |
(...skipping 15 matching lines...) Expand all Loading... |
5575 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode | 5517 (((opValue & 2) == 0) && // IF not UNIX_LINES
mode |
5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028
|| c==0x2029))) { | 5518 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028
|| c==0x2029))) { |
5577 // char is a line ending. Put the input pos ba
ck to the | 5519 // char is a line ending. Put the input pos ba
ck to the |
5578 // line ending char, and exit the scanning lo
op. | 5520 // line ending char, and exit the scanning lo
op. |
5579 U16_BACK_1(inputBuf, 0, ix); | 5521 U16_BACK_1(inputBuf, 0, ix); |
5580 break; | 5522 break; |
5581 } | 5523 } |
5582 } | 5524 } |
5583 } | 5525 } |
5584 } | 5526 } |
5585 | 5527 |
5586 // If there were no matching characters, skip over the loop alto
gether. | 5528 // If there were no matching characters, skip over the loop alto
gether. |
5587 // The loop doesn't run at all, a * op always succeeds. | 5529 // The loop doesn't run at all, a * op always succeeds. |
5588 if (ix == fp->fInputIdx) { | 5530 if (ix == fp->fInputIdx) { |
5589 fp->fPatIdx++; // skip the URX_LOOP_C op. | 5531 fp->fPatIdx++; // skip the URX_LOOP_C op. |
5590 break; | 5532 break; |
5591 } | 5533 } |
5592 | 5534 |
5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that | 5535 // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
5594 // must follow. Its operand is the stack location | 5536 // must follow. Its operand is the stack location |
5595 // that holds the starting input index for the match of this .
* | 5537 // that holds the starting input index for the match of this .
* |
5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; | 5538 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); | 5539 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
5598 int32_t stackLoc = URX_VAL(loopcOp); | 5540 int32_t stackLoc = URX_VAL(loopcOp); |
5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); | 5541 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
5600 fp->fExtra[stackLoc] = fp->fInputIdx; | 5542 fp->fExtra[stackLoc] = fp->fInputIdx; |
5601 fp->fInputIdx = ix; | 5543 fp->fInputIdx = ix; |
5602 | 5544 |
5603 // Save State to the URX_LOOP_C op that follows this one, | 5545 // Save State to the URX_LOOP_C op that follows this one, |
5604 // so that match failures in the following code will return to
there. | 5546 // so that match failures in the following code will return to
there. |
5605 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. | 5547 // Then bump the pattern idx so the LOOP_C is skipped on the w
ay out of here. |
5606 fp = StateSave(fp, fp->fPatIdx, status); | 5548 fp = StateSave(fp, fp->fPatIdx, status); |
5607 fp->fPatIdx++; | 5549 fp->fPatIdx++; |
5608 } | 5550 } |
5609 break; | 5551 break; |
5610 | 5552 |
5611 | 5553 |
5612 case URX_LOOP_C: | 5554 case URX_LOOP_C: |
5613 { | 5555 { |
5614 U_ASSERT(opValue>=0 && opValue<fFrameSize); | 5556 U_ASSERT(opValue>=0 && opValue<fFrameSize); |
5615 backSearchIndex = (int32_t)fp->fExtra[opValue]; | 5557 backSearchIndex = (int32_t)fp->fExtra[opValue]; |
5616 U_ASSERT(backSearchIndex <= fp->fInputIdx); | 5558 U_ASSERT(backSearchIndex <= fp->fInputIdx); |
5617 if (backSearchIndex == fp->fInputIdx) { | 5559 if (backSearchIndex == fp->fInputIdx) { |
5618 // We've backed up the input idx to the point that the loop
started. | 5560 // We've backed up the input idx to the point that the loop
started. |
5619 // The loop is done. Leave here without saving state. | 5561 // The loop is done. Leave here without saving state. |
5620 // Subsequent failures won't come back here. | 5562 // Subsequent failures won't come back here. |
5621 break; | 5563 break; |
5622 } | 5564 } |
5623 // Set up for the next iteration of the loop, with input index | 5565 // Set up for the next iteration of the loop, with input index |
5624 // backed up by one from the last time through, | 5566 // backed up by one from the last time through, |
5625 // and a state save to this instruction in case the following
code fails again. | 5567 // and a state save to this instruction in case the following
code fails again. |
5626 // (We're going backwards because this loop emulates stack unw
inding, not | 5568 // (We're going backwards because this loop emulates stack unw
inding, not |
5627 // the initial scan forward.) | 5569 // the initial scan forward.) |
5628 U_ASSERT(fp->fInputIdx > 0); | 5570 U_ASSERT(fp->fInputIdx > 0); |
5629 UChar32 prevC; | 5571 UChar32 prevC; |
5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this
0 be one of f*Limit? | 5572 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this
0 be one of f*Limit? |
5631 | 5573 |
5632 if (prevC == 0x0a && | 5574 if (prevC == 0x0a && |
5633 fp->fInputIdx > backSearchIndex && | 5575 fp->fInputIdx > backSearchIndex && |
5634 inputBuf[fp->fInputIdx-1] == 0x0d) { | 5576 inputBuf[fp->fInputIdx-1] == 0x0d) { |
5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; | 5577 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { | 5578 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
5637 // .*, stepping back over CRLF pair. | 5579 // .*, stepping back over CRLF pair. |
5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx); | 5580 U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
5639 } | 5581 } |
5640 } | 5582 } |
5641 | 5583 |
5642 | 5584 |
5643 fp = StateSave(fp, fp->fPatIdx-1, status); | 5585 fp = StateSave(fp, fp->fPatIdx-1, status); |
5644 } | 5586 } |
5645 break; | 5587 break; |
5646 | 5588 |
5647 | 5589 |
5648 | 5590 |
5649 default: | 5591 default: |
5650 // Trouble. The compiled pattern contains an entry with an | 5592 // Trouble. The compiled pattern contains an entry with an |
5651 // unrecognized type tag. | 5593 // unrecognized type tag. |
5652 U_ASSERT(FALSE); | 5594 U_ASSERT(FALSE); |
5653 } | 5595 } |
5654 | 5596 |
5655 if (U_FAILURE(status)) { | 5597 if (U_FAILURE(status)) { |
5656 isMatch = FALSE; | 5598 isMatch = FALSE; |
5657 break; | 5599 break; |
5658 } | 5600 } |
5659 } | 5601 } |
5660 | 5602 |
5661 breakFromLoop: | 5603 breakFromLoop: |
5662 fMatch = isMatch; | 5604 fMatch = isMatch; |
5663 if (isMatch) { | 5605 if (isMatch) { |
5664 fLastMatchEnd = fMatchEnd; | 5606 fLastMatchEnd = fMatchEnd; |
5665 fMatchStart = startIdx; | 5607 fMatchStart = startIdx; |
5666 fMatchEnd = fp->fInputIdx; | 5608 fMatchEnd = fp->fInputIdx; |
5667 if (fTraceDebug) { | 5609 } |
5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta
rt, fMatchEnd)); | 5610 |
| 5611 #ifdef REGEX_RUN_DEBUG |
| 5612 if (fTraceDebug) { |
| 5613 if (isMatch) { |
| 5614 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| 5615 } else { |
| 5616 printf("No match\n\n"); |
5669 } | 5617 } |
5670 } | 5618 } |
5671 else | 5619 #endif |
5672 { | 5620 |
5673 if (fTraceDebug) { | |
5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); | |
5675 } | |
5676 } | |
5677 | |
5678 fFrame = fp; // The active stack frame when the engine stoppe
d. | 5621 fFrame = fp; // The active stack frame when the engine stoppe
d. |
5679 // Contains the capture group results that we need to | 5622 // Contains the capture group results that we
need to |
5680 // access later. | 5623 // access later. |
5681 | 5624 |
5682 return; | 5625 return; |
5683 } | 5626 } |
5684 | 5627 |
5685 | 5628 |
5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) | 5629 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) |
5687 | 5630 |
5688 U_NAMESPACE_END | 5631 U_NAMESPACE_END |
5689 | 5632 |
5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 5633 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
OLD | NEW |