Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(990)

Side by Side Diff: source/i18n/rematch.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/reldtfmt.cpp ('k') | source/i18n/repattrn.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ************************************************************************** 2 **************************************************************************
3 * Copyright (C) 2002-2013 International Business Machines Corporation * 3 * Copyright (C) 2002-2014 International Business Machines Corporation *
4 * and others. All rights reserved. * 4 * and others. All rights reserved. *
5 ************************************************************************** 5 **************************************************************************
6 */ 6 */
7 // 7 //
8 // file: rematch.cpp 8 // file: rematch.cpp
9 // 9 //
10 // Contains the implementation of class RegexMatcher, 10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package. 11 // which is one of the main API classes for the ICU regular expression package.
12 // 12 //
13 13
(...skipping 12 matching lines...) Expand all
26 #include "uvector.h" 26 #include "uvector.h"
27 #include "uvectr32.h" 27 #include "uvectr32.h"
28 #include "uvectr64.h" 28 #include "uvectr64.h"
29 #include "regeximp.h" 29 #include "regeximp.h"
30 #include "regexst.h" 30 #include "regexst.h"
31 #include "regextxt.h" 31 #include "regextxt.h"
32 #include "ucase.h" 32 #include "ucase.h"
33 33
34 // #include <malloc.h> // Needed for heapcheck testing 34 // #include <malloc.h> // Needed for heapcheck testing
35 35
36
37 // Find progress callback
38 // ----------------------
39 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
40 //
41 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
42 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FAL SE)
43
44
45 // Smart Backtracking
46 // ------------------
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
52 //
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
55
56 U_NAMESPACE_BEGIN 36 U_NAMESPACE_BEGIN
57 37
58 // Default limit for the size of the back track stack, to avoid system 38 // Default limit for the size of the back track stack, to avoid system
59 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes. 39 // failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations, 40 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per 41 // which use recursion rather than the heap, and take more storage per
62 // backtrack point. 42 // backtrack point.
63 // 43 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65 45
66 // Time limit counter constant. 46 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by 47 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves. 48 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number. 49 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000; 50 static const int32_t TIMER_INITIAL_VALUE = 10000;
71 51
72 //----------------------------------------------------------------------------- 52 //-----------------------------------------------------------------------------
73 // 53 //
74 // Constructor and Destructor 54 // Constructor and Destructor
75 // 55 //
76 //----------------------------------------------------------------------------- 56 //-----------------------------------------------------------------------------
77 RegexMatcher::RegexMatcher(const RegexPattern *pat) { 57 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR; 58 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus); 59 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) { 60 if (U_FAILURE(fDeferredStatus)) {
81 return; 61 return;
82 } 62 }
83 if (pat==NULL) { 63 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 64 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return; 65 return;
86 } 66 }
87 fPattern = pat; 67 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 68 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 } 69 }
90 70
91 71
92 72
93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp ut, 73 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp ut,
94 uint32_t flags, UErrorCode &status) { 74 uint32_t flags, UErrorCode &status) {
95 init(status); 75 init(status);
96 if (U_FAILURE(status)) { 76 if (U_FAILURE(status)) {
97 return; 77 return;
98 } 78 }
99 UParseError pe; 79 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 80 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned; 81 fPattern = fPatternOwned;
102 82
103 UText inputText = UTEXT_INITIALIZER; 83 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status); 84 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status); 85 init2(&inputText, status);
106 utext_close(&inputText); 86 utext_close(&inputText);
107 87
108 fInputUniStrMaybeMutable = TRUE; 88 fInputUniStrMaybeMutable = TRUE;
109 } 89 }
110 90
111 91
112 RegexMatcher::RegexMatcher(UText *regexp, UText *input, 92 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) { 93 uint32_t flags, UErrorCode &status) {
114 init(status); 94 init(status);
115 if (U_FAILURE(status)) { 95 if (U_FAILURE(status)) {
116 return; 96 return;
117 } 97 }
118 UParseError pe; 98 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 99 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) { 100 if (U_FAILURE(status)) {
121 return; 101 return;
122 } 102 }
123 103
124 fPattern = fPatternOwned; 104 fPattern = fPatternOwned;
125 init2(input, status); 105 init2(input, status);
126 } 106 }
127 107
128 108
129 RegexMatcher::RegexMatcher(const UnicodeString &regexp, 109 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130 uint32_t flags, UErrorCode &status) { 110 uint32_t flags, UErrorCode &status) {
131 init(status); 111 init(status);
132 if (U_FAILURE(status)) { 112 if (U_FAILURE(status)) {
133 return; 113 return;
134 } 114 }
135 UParseError pe; 115 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 116 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) { 117 if (U_FAILURE(status)) {
138 return; 118 return;
139 } 119 }
140 fPattern = fPatternOwned; 120 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 121 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 } 122 }
143 123
144 RegexMatcher::RegexMatcher(UText *regexp, 124 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) { 125 uint32_t flags, UErrorCode &status) {
146 init(status); 126 init(status);
147 if (U_FAILURE(status)) { 127 if (U_FAILURE(status)) {
148 return; 128 return;
149 } 129 }
150 UParseError pe; 130 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 131 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) { 132 if (U_FAILURE(status)) {
153 return; 133 return;
154 } 134 }
155 135
156 fPattern = fPatternOwned; 136 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 137 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 } 138 }
159 139
160 140
161 141
162 142
163 RegexMatcher::~RegexMatcher() { 143 RegexMatcher::~RegexMatcher() {
164 delete fStack; 144 delete fStack;
165 if (fData != fSmallData) { 145 if (fData != fSmallData) {
166 uprv_free(fData); 146 uprv_free(fData);
167 fData = NULL; 147 fData = NULL;
168 } 148 }
169 if (fPatternOwned) { 149 if (fPatternOwned) {
170 delete fPatternOwned; 150 delete fPatternOwned;
171 fPatternOwned = NULL; 151 fPatternOwned = NULL;
172 fPattern = NULL; 152 fPattern = NULL;
173 } 153 }
174 154
175 if (fInput) { 155 if (fInput) {
176 delete fInput; 156 delete fInput;
177 } 157 }
178 if (fInputText) { 158 if (fInputText) {
179 utext_close(fInputText); 159 utext_close(fInputText);
180 } 160 }
181 if (fAltInputText) { 161 if (fAltInputText) {
182 utext_close(fAltInputText); 162 utext_close(fAltInputText);
183 } 163 }
184 164
185 #if UCONFIG_NO_BREAK_ITERATION==0 165 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr; 166 delete fWordBreakItr;
187 #endif 167 #endif
188 } 168 }
189 169
190 // 170 //
191 // init() common initialization for use by all constructors. 171 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state. 172 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error, 173 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor 174 // so that the object is initialized sufficiently well for the destructor
(...skipping 27 matching lines...) Expand all
222 fTickCounter = 0; 202 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 203 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL; 204 fCallbackFn = NULL;
225 fCallbackContext = NULL; 205 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL; 206 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL; 207 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE; 208 fTraceDebug = FALSE;
229 fDeferredStatus = status; 209 fDeferredStatus = status;
230 fData = fSmallData; 210 fData = fSmallData;
231 fWordBreakItr = NULL; 211 fWordBreakItr = NULL;
232 212
233 fStack = NULL; 213 fStack = NULL;
234 fInputText = NULL; 214 fInputText = NULL;
235 fAltInputText = NULL; 215 fAltInputText = NULL;
236 fInput = NULL; 216 fInput = NULL;
237 fInputLength = 0; 217 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE; 218 fInputUniStrMaybeMutable = FALSE;
239 219
240 if (U_FAILURE(status)) { 220 if (U_FAILURE(status)) {
241 fDeferredStatus = status; 221 fDeferredStatus = status;
242 } 222 }
243 } 223 }
244 224
245 // 225 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2. 226 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available. 227 // This handles the common setup to be done after the Pattern is available.
248 // 228 //
249 void RegexMatcher::init2(UText *input, UErrorCode &status) { 229 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) { 230 if (U_FAILURE(status)) {
251 fDeferredStatus = status; 231 fDeferredStatus = status;
252 return; 232 return;
253 } 233 }
254 234
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]) )) { 235 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]) )) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 236 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) { 237 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 238 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return; 239 return;
260 } 240 }
261 } 241 }
262 242
263 fStack = new UVector64(status); 243 fStack = new UVector64(status);
264 if (fStack == NULL) { 244 if (fStack == NULL) {
265 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 245 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266 return; 246 return;
(...skipping 12 matching lines...) Expand all
279 static const UChar DOLLARSIGN = 0x24; 259 static const UChar DOLLARSIGN = 0x24;
280 //------------------------------------------------------------------------------ -- 260 //------------------------------------------------------------------------------ --
281 // 261 //
282 // appendReplacement 262 // appendReplacement
283 // 263 //
284 //------------------------------------------------------------------------------ -- 264 //------------------------------------------------------------------------------ --
285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 265 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286 const UnicodeString &replacement, 266 const UnicodeString &replacement,
287 UErrorCode &status) { 267 UErrorCode &status) {
288 UText replacementText = UTEXT_INITIALIZER; 268 UText replacementText = UTEXT_INITIALIZER;
289 269
290 utext_openConstUnicodeString(&replacementText, &replacement, &status); 270 utext_openConstUnicodeString(&replacementText, &replacement, &status);
291 if (U_SUCCESS(status)) { 271 if (U_SUCCESS(status)) {
292 UText resultText = UTEXT_INITIALIZER; 272 UText resultText = UTEXT_INITIALIZER;
293 utext_openUnicodeString(&resultText, &dest, &status); 273 utext_openUnicodeString(&resultText, &dest, &status);
294 274
295 if (U_SUCCESS(status)) { 275 if (U_SUCCESS(status)) {
296 appendReplacement(&resultText, &replacementText, status); 276 appendReplacement(&resultText, &replacementText, status);
297 utext_close(&resultText); 277 utext_close(&resultText);
298 } 278 }
299 utext_close(&replacementText); 279 utext_close(&replacementText);
300 } 280 }
301 281
302 return *this; 282 return *this;
303 } 283 }
304 284
305 // 285 //
306 // appendReplacement, UText mode 286 // appendReplacement, UText mode
307 // 287 //
308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 288 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309 UText *replacement, 289 UText *replacement,
310 UErrorCode &status) { 290 UErrorCode &status) {
311 if (U_FAILURE(status)) { 291 if (U_FAILURE(status)) {
312 return *this; 292 return *this;
313 } 293 }
314 if (U_FAILURE(fDeferredStatus)) { 294 if (U_FAILURE(fDeferredStatus)) {
315 status = fDeferredStatus; 295 status = fDeferredStatus;
316 return *this; 296 return *this;
317 } 297 }
318 if (fMatch == FALSE) { 298 if (fMatch == FALSE) {
319 status = U_REGEX_INVALID_STATE; 299 status = U_REGEX_INVALID_STATE;
320 return *this; 300 return *this;
321 } 301 }
322 302
323 // Copy input string from the end of previous match to start of current match 303 // Copy input string from the end of previous match to start of current match
324 int64_t destLen = utext_nativeLength(dest); 304 int64_t destLen = utext_nativeLength(dest);
325 if (fMatchStart > fAppendPosition) { 305 if (fMatchStart > fAppendPosition) {
326 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 306 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo ntents+fAppendPosition, 307 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkCo ntents+fAppendPosition,
328 (int32_t)(fMatchStart-fAppendPosition), &st atus); 308 (int32_t)(fMatchStart-fAppendPosition), &st atus);
329 } else { 309 } else {
330 int32_t len16; 310 int32_t len16;
331 if (UTEXT_USES_U16(fInputText)) { 311 if (UTEXT_USES_U16(fInputText)) {
332 len16 = (int32_t)(fMatchStart-fAppendPosition); 312 len16 = (int32_t)(fMatchStart-fAppendPosition);
333 } else { 313 } else {
334 UErrorCode lengthStatus = U_ZERO_ERROR; 314 UErrorCode lengthStatus = U_ZERO_ERROR;
335 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 315 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336 } 316 }
337 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 317 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338 if (inputChars == NULL) { 318 if (inputChars == NULL) {
339 status = U_MEMORY_ALLOCATION_ERROR; 319 status = U_MEMORY_ALLOCATION_ERROR;
340 return *this; 320 return *this;
341 } 321 }
342 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 322 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 323 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344 uprv_free(inputChars); 324 uprv_free(inputChars);
345 } 325 }
346 } 326 }
347 fAppendPosition = fMatchEnd; 327 fAppendPosition = fMatchEnd;
348 328
349 329
350 // scan the replacement text, looking for substitutions ($n) and \escapes. 330 // scan the replacement text, looking for substitutions ($n) and \escapes.
351 // TODO: optimize this loop by efficiently scanning for '$' or '\', 331 // TODO: optimize this loop by efficiently scanning for '$' or '\',
352 // move entire ranges not containing substitutions. 332 // move entire ranges not containing substitutions.
353 UTEXT_SETNATIVEINDEX(replacement, 0); 333 UTEXT_SETNATIVEINDEX(replacement, 0);
354 UChar32 c = UTEXT_NEXT32(replacement); 334 UChar32 c = UTEXT_NEXT32(replacement);
355 while (c != U_SENTINEL) { 335 while (c != U_SENTINEL) {
356 if (c == BACKSLASH) { 336 if (c == BACKSLASH) {
357 // Backslash Escape. Copy the following char out without further checks. 337 // Backslash Escape. Copy the following char out without further checks.
358 // Note: Surrogate pairs don't need any special handling 338 // Note: Surrogate pairs don't need any special handling
359 // The second half won't be a '$' or a '\', and 339 // The second half won't be a '$' or a '\', and
360 // will move to the dest normally on the next 340 // will move to the dest normally on the next
361 // loop iteration. 341 // loop iteration.
362 c = UTEXT_CURRENT32(replacement); 342 c = UTEXT_CURRENT32(replacement);
363 if (c == U_SENTINEL) { 343 if (c == U_SENTINEL) {
364 break; 344 break;
365 } 345 }
366 346
367 if (c==0x55/*U*/ || c==0x75/*u*/) { 347 if (c==0x55/*U*/ || c==0x75/*u*/) {
368 // We have a \udddd or \Udddddddd escape sequence. 348 // We have a \udddd or \Udddddddd escape sequence.
369 int32_t offset = 0; 349 int32_t offset = 0;
370 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN ESCAPE_CONTEXT(replacement); 350 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UN ESCAPE_CONTEXT(replacement);
371 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 351 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372 if (escapedChar != (UChar32)0xFFFFFFFF) { 352 if (escapedChar != (UChar32)0xFFFFFFFF) {
373 if (U_IS_BMP(escapedChar)) { 353 if (U_IS_BMP(escapedChar)) {
374 UChar c16 = (UChar)escapedChar; 354 UChar c16 = (UChar)escapedChar;
375 destLen += utext_replace(dest, destLen, destLen, &c16, 1 , &status); 355 destLen += utext_replace(dest, destLen, destLen, &c16, 1 , &status);
376 } else { 356 } else {
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
414 surrogate[0] = U16_LEAD(c); 394 surrogate[0] = U16_LEAD(c);
415 surrogate[1] = U16_TRAIL(c); 395 surrogate[1] = U16_TRAIL(c);
416 if (U_SUCCESS(status)) { 396 if (U_SUCCESS(status)) {
417 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418 } 398 }
419 } 399 }
420 } else { 400 } else {
421 // We've got a $. Pick up a capture group number if one follows. 401 // We've got a $. Pick up a capture group number if one follows.
422 // Consume at most the number of digits necessary for the largest capture 402 // Consume at most the number of digits necessary for the largest capture
423 // number that is valid for this pattern. 403 // number that is valid for this pattern.
424 404
425 int32_t numDigits = 0; 405 int32_t numDigits = 0;
426 int32_t groupNum = 0; 406 int32_t groupNum = 0;
427 UChar32 digitC; 407 UChar32 digitC;
428 for (;;) { 408 for (;;) {
429 digitC = UTEXT_CURRENT32(replacement); 409 digitC = UTEXT_CURRENT32(replacement);
430 if (digitC == U_SENTINEL) { 410 if (digitC == U_SENTINEL) {
431 break; 411 break;
432 } 412 }
433 if (u_isdigit(digitC) == FALSE) { 413 if (u_isdigit(digitC) == FALSE) {
434 break; 414 break;
435 } 415 }
436 (void)UTEXT_NEXT32(replacement); 416 (void)UTEXT_NEXT32(replacement);
437 groupNum=groupNum*10 + u_charDigitValue(digitC); 417 groupNum=groupNum*10 + u_charDigitValue(digitC);
438 numDigits++; 418 numDigits++;
439 if (numDigits >= fPattern->fMaxCaptureDigits) { 419 if (numDigits >= fPattern->fMaxCaptureDigits) {
440 break; 420 break;
441 } 421 }
442 } 422 }
443 423
444 424
445 if (numDigits == 0) { 425 if (numDigits == 0) {
446 // The $ didn't introduce a group number at all. 426 // The $ didn't introduce a group number at all.
447 // Treat it as just part of the substitution text. 427 // Treat it as just part of the substitution text.
448 UChar c16 = DOLLARSIGN; 428 UChar c16 = DOLLARSIGN;
449 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s); 429 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &statu s);
450 } else { 430 } else {
451 // Finally, append the capture group data to the destination. 431 // Finally, append the capture group data to the destination.
452 destLen += appendGroup(groupNum, dest, status); 432 destLen += appendGroup(groupNum, dest, status);
453 if (U_FAILURE(status)) { 433 if (U_FAILURE(status)) {
454 // Can fail if group number is out of range. 434 // Can fail if group number is out of range.
455 break; 435 break;
456 } 436 }
457 } 437 }
458 } 438 }
459 439
460 if (U_FAILURE(status)) { 440 if (U_FAILURE(status)) {
461 break; 441 break;
462 } else { 442 } else {
463 c = UTEXT_NEXT32(replacement); 443 c = UTEXT_NEXT32(replacement);
464 } 444 }
465 } 445 }
466 446
467 return *this; 447 return *this;
468 } 448 }
469 449
470 450
471 451
472 //------------------------------------------------------------------------------ -- 452 //------------------------------------------------------------------------------ --
473 // 453 //
474 // appendTail Intended to be used in conjunction with appendReplacement() 454 // appendTail Intended to be used in conjunction with appendReplacement()
475 // To the destination string, append everything following 455 // To the destination string, append everything following
476 // the last match position from the input string. 456 // the last match position from the input string.
477 // 457 //
478 // Note: Match ranges do not affect appendTail or appendReplacement 458 // Note: Match ranges do not affect appendTail or appendReplacement
479 // 459 //
480 //------------------------------------------------------------------------------ -- 460 //------------------------------------------------------------------------------ --
481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 461 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482 UErrorCode status = U_ZERO_ERROR; 462 UErrorCode status = U_ZERO_ERROR;
483 UText resultText = UTEXT_INITIALIZER; 463 UText resultText = UTEXT_INITIALIZER;
484 utext_openUnicodeString(&resultText, &dest, &status); 464 utext_openUnicodeString(&resultText, &dest, &status);
485 465
486 if (U_SUCCESS(status)) { 466 if (U_SUCCESS(status)) {
487 appendTail(&resultText, status); 467 appendTail(&resultText, status);
488 utext_close(&resultText); 468 utext_close(&resultText);
489 } 469 }
490 470
491 return dest; 471 return dest;
492 } 472 }
493 473
494 // 474 //
495 // appendTail, UText mode 475 // appendTail, UText mode
496 // 476 //
497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 477 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498 UBool bailOut = FALSE;
499 if (U_FAILURE(status)) { 478 if (U_FAILURE(status)) {
500 bailOut = TRUE; 479 return dest;
501 } 480 }
502 if (U_FAILURE(fDeferredStatus)) { 481 if (U_FAILURE(fDeferredStatus)) {
503 status = fDeferredStatus; 482 status = fDeferredStatus;
504 bailOut = TRUE; 483 return dest;
505 } 484 }
506 485
507 if (bailOut) {
508 // dest must not be NULL
509 if (dest) {
510 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(des t), NULL, 0, &status);
511 return dest;
512 }
513 }
514
515 if (fInputLength > fAppendPosition) { 486 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 487 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest); 488 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp endPosition, 489 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fApp endPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status); 490 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else { 491 } else {
521 int32_t len16; 492 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) { 493 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition); 494 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else { 495 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 496 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow 497 status = U_ZERO_ERROR; // buffer overflow
527 } 498 }
528 499
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 500 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) { 501 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 502 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else { 503 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh ars, len16, &status); // unterminated 504 utext_extract(fInputText, fAppendPosition, fInputLength, inputCh ars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest); 505 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status ); 506 utext_replace(dest, destLen, destLen, inputChars, len16, &status );
536 uprv_free(inputChars); 507 uprv_free(inputChars);
537 } 508 }
538 } 509 }
539 } 510 }
540 return dest; 511 return dest;
541 } 512 }
542 513
543 514
(...skipping 18 matching lines...) Expand all
562 if (fMatch == FALSE) { 533 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE; 534 err = U_REGEX_INVALID_STATE;
564 return -1; 535 return -1;
565 } 536 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) { 537 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR; 538 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1; 539 return -1;
569 } 540 }
570 int64_t e = -1; 541 int64_t e = -1;
571 if (group == 0) { 542 if (group == 0) {
572 e = fMatchEnd; 543 e = fMatchEnd;
573 } else { 544 } else {
574 // Get the position within the stack frame of the variables for 545 // Get the position within the stack frame of the variables for
575 // this capture group. 546 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 547 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize); 548 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0); 549 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1]; 550 e = fFrame->fExtra[groupOffset + 1];
580 } 551 }
581 552
582 return e; 553 return e;
583 } 554 }
584 555
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 556 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err); 557 return (int32_t)end64(group, err);
587 } 558 }
588 559
560 //------------------------------------------------------------------------------ --
561 //
562 // findProgressInterrupt This function is called once for each advance in the target
563 // string from the find() function, and calls the user progress callback
564 // function if there is one installed.
565 //
566 // Return: TRUE if the find operation is to be terminated.
567 // FALSE if the find operation is to continue running.
568 //
569 //------------------------------------------------------------------------------ --
570 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
571 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCall backContext, pos)) {
572 status = U_REGEX_STOPPED_BY_CALLER;
573 return TRUE;
574 }
575 return FALSE;
576 }
589 577
590 //------------------------------------------------------------------------------ -- 578 //------------------------------------------------------------------------------ --
591 // 579 //
592 // find() 580 // find()
593 // 581 //
594 //------------------------------------------------------------------------------ -- 582 //------------------------------------------------------------------------------ --
595 UBool RegexMatcher::find() { 583 UBool RegexMatcher::find() {
584 if (U_FAILURE(fDeferredStatus)) {
585 return FALSE;
586 }
587 UErrorCode status = U_ZERO_ERROR;
588 UBool result = find(status);
589 return result;
590 }
591
592 //------------------------------------------------------------------------------ --
593 //
594 // find()
595 //
596 //------------------------------------------------------------------------------ --
597 UBool RegexMatcher::find(UErrorCode &status) {
596 // Start at the position of the last match end. (Will be zero if the 598 // Start at the position of the last match end. (Will be zero if the
597 // matcher has been reset.) 599 // matcher has been reset.)
598 // 600 //
599 if (U_FAILURE(fDeferredStatus)) { 601 if (U_FAILURE(status)) {
600 return FALSE; 602 return FALSE;
601 } 603 }
602 604 if (U_FAILURE(fDeferredStatus)) {
605 status = fDeferredStatus;
606 return FALSE;
607 }
608
603 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 609 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
604 return findUsingChunk(); 610 return findUsingChunk(status);
605 } 611 }
606 612
607 int64_t startPos = fMatchEnd; 613 int64_t startPos = fMatchEnd;
608 if (startPos==0) { 614 if (startPos==0) {
609 startPos = fActiveStart; 615 startPos = fActiveStart;
610 } 616 }
611 617
612 if (fMatch) { 618 if (fMatch) {
613 // Save the position of any previous successful match. 619 // Save the position of any previous successful match.
614 fLastMatchEnd = fMatchEnd; 620 fLastMatchEnd = fMatchEnd;
(...skipping 27 matching lines...) Expand all
642 // Be aware of possible overflows if making changes here. 648 // Be aware of possible overflows if making changes here.
643 int64_t testStartLimit; 649 int64_t testStartLimit;
644 if (UTEXT_USES_U16(fInputText)) { 650 if (UTEXT_USES_U16(fInputText)) {
645 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 651 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
646 if (startPos > testStartLimit) { 652 if (startPos > testStartLimit) {
647 fMatch = FALSE; 653 fMatch = FALSE;
648 fHitEnd = TRUE; 654 fHitEnd = TRUE;
649 return FALSE; 655 return FALSE;
650 } 656 }
651 } else { 657 } else {
652 // For now, let the matcher discover that it can't match on its own 658 // We don't know exactly how long the minimum match length is in native characters.
653 // We don't know how long the match len is in native characters 659 // Treat anything > 0 as 1.
654 testStartLimit = fActiveLimit; 660 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
655 } 661 }
656 662
657 UChar32 c; 663 UChar32 c;
658 U_ASSERT(startPos >= 0); 664 U_ASSERT(startPos >= 0);
659 665
660 switch (fPattern->fStartType) { 666 switch (fPattern->fStartType) {
661 case START_NO_INFO: 667 case START_NO_INFO:
662 // No optimization was found. 668 // No optimization was found.
663 // Try a match at each input position. 669 // Try a match at each input position.
664 for (;;) { 670 for (;;) {
665 MatchAt(startPos, FALSE, fDeferredStatus); 671 MatchAt(startPos, FALSE, status);
666 if (U_FAILURE(fDeferredStatus)) { 672 if (U_FAILURE(status)) {
667 return FALSE; 673 return FALSE;
668 } 674 }
669 if (fMatch) { 675 if (fMatch) {
670 return TRUE; 676 return TRUE;
671 } 677 }
672 if (startPos >= testStartLimit) { 678 if (startPos >= testStartLimit) {
673 fHitEnd = TRUE; 679 fHitEnd = TRUE;
674 return FALSE; 680 return FALSE;
675 } 681 }
676 UTEXT_SETNATIVEINDEX(fInputText, startPos); 682 UTEXT_SETNATIVEINDEX(fInputText, startPos);
677 (void)UTEXT_NEXT32(fInputText); 683 (void)UTEXT_NEXT32(fInputText);
678 startPos = UTEXT_GETNATIVEINDEX(fInputText); 684 startPos = UTEXT_GETNATIVEINDEX(fInputText);
679 // Note that it's perfectly OK for a pattern to have a zero-length 685 // Note that it's perfectly OK for a pattern to have a zero-length
680 // match at the end of a string, so we must make sure that the loop 686 // match at the end of a string, so we must make sure that the loop
681 // runs with startPos == testStartLimit the last time through. 687 // runs with startPos == testStartLimit the last time through.
682 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 688 if (findProgressInterrupt(startPos, status))
683 return FALSE; 689 return FALSE;
684 } 690 }
685 U_ASSERT(FALSE); 691 U_ASSERT(FALSE);
686 692
687 case START_START: 693 case START_START:
688 // Matches are only possible at the start of the input string 694 // Matches are only possible at the start of the input string
689 // (pattern begins with ^ or \A) 695 // (pattern begins with ^ or \A)
690 if (startPos > fActiveStart) { 696 if (startPos > fActiveStart) {
691 fMatch = FALSE; 697 fMatch = FALSE;
692 return FALSE; 698 return FALSE;
693 } 699 }
694 MatchAt(startPos, FALSE, fDeferredStatus); 700 MatchAt(startPos, FALSE, status);
695 if (U_FAILURE(fDeferredStatus)) { 701 if (U_FAILURE(status)) {
696 return FALSE; 702 return FALSE;
697 } 703 }
698 return fMatch; 704 return fMatch;
699 705
700 706
701 case START_SET: 707 case START_SET:
702 { 708 {
703 // Match may start on any char from a pre-computed set. 709 // Match may start on any char from a pre-computed set.
704 U_ASSERT(fPattern->fMinMatchLen > 0); 710 U_ASSERT(fPattern->fMinMatchLen > 0);
705 int64_t pos;
706 UTEXT_SETNATIVEINDEX(fInputText, startPos); 711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
707 for (;;) { 712 for (;;) {
713 int64_t pos = startPos;
708 c = UTEXT_NEXT32(fInputText); 714 c = UTEXT_NEXT32(fInputText);
709 pos = UTEXT_GETNATIVEINDEX(fInputText); 715 startPos = UTEXT_GETNATIVEINDEX(fInputText);
710 // c will be -1 (U_SENTINEL) at end of text, in which case we 716 // c will be -1 (U_SENTINEL) at end of text, in which case we
711 // skip this next block (so we don't have a negative array index ) 717 // skip this next block (so we don't have a negative array index )
712 // and handle end of text in the following block. 718 // and handle end of text in the following block.
713 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 719 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
714 (c>=256 && fPattern->fInitialChars->contains(c)))) { 720 (c>=256 && fPattern->fInitialChars->contains(c)))) {
715 MatchAt(startPos, FALSE, fDeferredStatus); 721 MatchAt(pos, FALSE, status);
716 if (U_FAILURE(fDeferredStatus)) { 722 if (U_FAILURE(status)) {
717 return FALSE; 723 return FALSE;
718 } 724 }
719 if (fMatch) { 725 if (fMatch) {
720 return TRUE; 726 return TRUE;
721 } 727 }
722 UTEXT_SETNATIVEINDEX(fInputText, pos); 728 UTEXT_SETNATIVEINDEX(fInputText, pos);
723 } 729 }
724 if (startPos >= testStartLimit) { 730 if (startPos > testStartLimit) {
725 fMatch = FALSE; 731 fMatch = FALSE;
726 fHitEnd = TRUE; 732 fHitEnd = TRUE;
727 return FALSE; 733 return FALSE;
728 } 734 }
729 startPos = pos; 735 if (findProgressInterrupt(startPos, status))
730 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
731 return FALSE; 736 return FALSE;
732 } 737 }
733 } 738 }
734 U_ASSERT(FALSE); 739 U_ASSERT(FALSE);
735 740
736 case START_STRING: 741 case START_STRING:
737 case START_CHAR: 742 case START_CHAR:
738 { 743 {
739 // Match starts on exactly one char. 744 // Match starts on exactly one char.
740 U_ASSERT(fPattern->fMinMatchLen > 0); 745 U_ASSERT(fPattern->fMinMatchLen > 0);
741 UChar32 theChar = fPattern->fInitialChar; 746 UChar32 theChar = fPattern->fInitialChar;
742 int64_t pos;
743 UTEXT_SETNATIVEINDEX(fInputText, startPos); 747 UTEXT_SETNATIVEINDEX(fInputText, startPos);
744 for (;;) { 748 for (;;) {
749 int64_t pos = startPos;
745 c = UTEXT_NEXT32(fInputText); 750 c = UTEXT_NEXT32(fInputText);
746 pos = UTEXT_GETNATIVEINDEX(fInputText); 751 startPos = UTEXT_GETNATIVEINDEX(fInputText);
747 if (c == theChar) { 752 if (c == theChar) {
748 MatchAt(startPos, FALSE, fDeferredStatus); 753 MatchAt(pos, FALSE, status);
749 if (U_FAILURE(fDeferredStatus)) { 754 if (U_FAILURE(status)) {
750 return FALSE; 755 return FALSE;
751 } 756 }
752 if (fMatch) { 757 if (fMatch) {
753 return TRUE; 758 return TRUE;
754 } 759 }
755 UTEXT_SETNATIVEINDEX(fInputText, pos); 760 UTEXT_SETNATIVEINDEX(fInputText, pos);
756 } 761 }
757 if (startPos >= testStartLimit) { 762 if (startPos > testStartLimit) {
758 fMatch = FALSE; 763 fMatch = FALSE;
759 fHitEnd = TRUE; 764 fHitEnd = TRUE;
760 return FALSE; 765 return FALSE;
761 } 766 }
762 startPos = pos; 767 if (findProgressInterrupt(startPos, status))
763 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
764 return FALSE; 768 return FALSE;
765 } 769 }
766 } 770 }
767 U_ASSERT(FALSE); 771 U_ASSERT(FALSE);
768 772
769 case START_LINE: 773 case START_LINE:
770 { 774 {
771 UChar32 c; 775 UChar32 c;
772 if (startPos == fAnchorStart) { 776 if (startPos == fAnchorStart) {
773 MatchAt(startPos, FALSE, fDeferredStatus); 777 MatchAt(startPos, FALSE, status);
774 if (U_FAILURE(fDeferredStatus)) { 778 if (U_FAILURE(status)) {
775 return FALSE; 779 return FALSE;
776 } 780 }
777 if (fMatch) { 781 if (fMatch) {
778 return TRUE; 782 return TRUE;
779 } 783 }
780 UTEXT_SETNATIVEINDEX(fInputText, startPos); 784 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 c = UTEXT_NEXT32(fInputText); 785 c = UTEXT_NEXT32(fInputText);
782 startPos = UTEXT_GETNATIVEINDEX(fInputText); 786 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783 } else { 787 } else {
784 UTEXT_SETNATIVEINDEX(fInputText, startPos); 788 UTEXT_SETNATIVEINDEX(fInputText, startPos);
785 c = UTEXT_PREVIOUS32(fInputText); 789 c = UTEXT_PREVIOUS32(fInputText);
786 UTEXT_SETNATIVEINDEX(fInputText, startPos); 790 UTEXT_SETNATIVEINDEX(fInputText, startPos);
787 } 791 }
788 792
789 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 793 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
790 for (;;) { 794 for (;;) {
791 if (c == 0x0a) { 795 if (c == 0x0a) {
792 MatchAt(startPos, FALSE, fDeferredStatus); 796 MatchAt(startPos, FALSE, status);
793 if (U_FAILURE(fDeferredStatus)) { 797 if (U_FAILURE(status)) {
794 return FALSE; 798 return FALSE;
795 } 799 }
796 if (fMatch) { 800 if (fMatch) {
797 return TRUE; 801 return TRUE;
798 } 802 }
799 UTEXT_SETNATIVEINDEX(fInputText, startPos); 803 UTEXT_SETNATIVEINDEX(fInputText, startPos);
800 } 804 }
801 if (startPos >= testStartLimit) { 805 if (startPos >= testStartLimit) {
802 fMatch = FALSE; 806 fMatch = FALSE;
803 fHitEnd = TRUE; 807 fHitEnd = TRUE;
804 return FALSE; 808 return FALSE;
805 } 809 }
806 c = UTEXT_NEXT32(fInputText); 810 c = UTEXT_NEXT32(fInputText);
807 startPos = UTEXT_GETNATIVEINDEX(fInputText); 811 startPos = UTEXT_GETNATIVEINDEX(fInputText);
808 // Note that it's perfectly OK for a pattern to have a zero-length 812 // Note that it's perfectly OK for a pattern to have a zero-length
809 // match at the end of a string, so we must make sure that the loop 813 // match at the end of a string, so we must make sure that the loop
810 // runs with startPos == testStartLimit the last time through. 814 // runs with startPos == testStartLimit the last time through.
811 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred Status)) 815 if (findProgressInterrupt(startPos, status))
812 return FALSE; 816 return FALSE;
813 } 817 }
814 } else { 818 } else {
815 for (;;) { 819 for (;;) {
816 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m any chars as possible 820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as m any chars as possible
817 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202 9 )) { 821 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x202 9 )) {
818 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU RRENT32(fInputText) == 0x0a) { 822 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CU RRENT32(fInputText) == 0x0a) {
819 (void)UTEXT_NEXT32(fInputText); 823 (void)UTEXT_NEXT32(fInputText);
820 startPos = UTEXT_GETNATIVEINDEX(fInputText); 824 startPos = UTEXT_GETNATIVEINDEX(fInputText);
821 } 825 }
822 MatchAt(startPos, FALSE, fDeferredStatus); 826 MatchAt(startPos, FALSE, status);
823 if (U_FAILURE(fDeferredStatus)) { 827 if (U_FAILURE(status)) {
824 return FALSE; 828 return FALSE;
825 } 829 }
826 if (fMatch) { 830 if (fMatch) {
827 return TRUE; 831 return TRUE;
828 } 832 }
829 UTEXT_SETNATIVEINDEX(fInputText, startPos); 833 UTEXT_SETNATIVEINDEX(fInputText, startPos);
830 } 834 }
831 if (startPos >= testStartLimit) { 835 if (startPos >= testStartLimit) {
832 fMatch = FALSE; 836 fMatch = FALSE;
833 fHitEnd = TRUE; 837 fHitEnd = TRUE;
834 return FALSE; 838 return FALSE;
835 } 839 }
836 c = UTEXT_NEXT32(fInputText); 840 c = UTEXT_NEXT32(fInputText);
837 startPos = UTEXT_GETNATIVEINDEX(fInputText); 841 startPos = UTEXT_GETNATIVEINDEX(fInputText);
838 // Note that it's perfectly OK for a pattern to have a zero-length 842 // Note that it's perfectly OK for a pattern to have a zero-length
839 // match at the end of a string, so we must make sure that the loop 843 // match at the end of a string, so we must make sure that the loop
840 // runs with startPos == testStartLimit the last time through. 844 // runs with startPos == testStartLimit the last time through.
841 » » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferred Status)) 845 if (findProgressInterrupt(startPos, status))
842 return FALSE; 846 return FALSE;
843 } 847 }
844 } 848 }
845 } 849 }
846 850
847 default: 851 default:
848 U_ASSERT(FALSE); 852 U_ASSERT(FALSE);
849 } 853 }
850 854
851 U_ASSERT(FALSE); 855 U_ASSERT(FALSE);
852 return FALSE; 856 return FALSE;
853 } 857 }
854 858
855 859
856 860
857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 861 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
858 if (U_FAILURE(status)) { 862 if (U_FAILURE(status)) {
859 return FALSE; 863 return FALSE;
860 } 864 }
861 if (U_FAILURE(fDeferredStatus)) { 865 if (U_FAILURE(fDeferredStatus)) {
862 status = fDeferredStatus; 866 status = fDeferredStatus;
863 return FALSE; 867 return FALSE;
864 } 868 }
865 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 869 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
866 // This will reset the region to be the full input length. 870 // This will reset the region to be the full input length.
867 if (start < 0) { 871 if (start < 0) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR; 872 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE; 873 return FALSE;
870 } 874 }
871 875
872 int64_t nativeStart = start; 876 int64_t nativeStart = start;
873 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 877 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
874 status = U_INDEX_OUTOFBOUNDS_ERROR; 878 status = U_INDEX_OUTOFBOUNDS_ERROR;
875 return FALSE; 879 return FALSE;
876 } 880 }
877 fMatchEnd = nativeStart; 881 fMatchEnd = nativeStart;
878 return find(); 882 return find(status);
879 } 883 }
880 884
881 885
882 //------------------------------------------------------------------------------ -- 886 //------------------------------------------------------------------------------ --
883 // 887 //
884 // findUsingChunk() -- like find(), but with the advance knowledge that the 888 // findUsingChunk() -- like find(), but with the advance knowledge that the
885 // entire string is available in the UText's chunk buffer. 889 // entire string is available in the UText's chunk buffer.
886 // 890 //
887 //------------------------------------------------------------------------------ -- 891 //------------------------------------------------------------------------------ --
888 UBool RegexMatcher::findUsingChunk() { 892 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
889 // Start at the position of the last match end. (Will be zero if the 893 // Start at the position of the last match end. (Will be zero if the
890 // matcher has been reset. 894 // matcher has been reset.
891 // 895 //
892 896
893 int32_t startPos = (int32_t)fMatchEnd; 897 int32_t startPos = (int32_t)fMatchEnd;
894 if (startPos==0) { 898 if (startPos==0) {
895 startPos = (int32_t)fActiveStart; 899 startPos = (int32_t)fActiveStart;
896 } 900 }
897 901
898 const UChar *inputBuf = fInputText->chunkContents; 902 const UChar *inputBuf = fInputText->chunkContents;
899 903
900 if (fMatch) { 904 if (fMatch) {
901 // Save the position of any previous successful match. 905 // Save the position of any previous successful match.
902 fLastMatchEnd = fMatchEnd; 906 fLastMatchEnd = fMatchEnd;
903 907
904 if (fMatchStart == fMatchEnd) { 908 if (fMatchStart == fMatchEnd) {
905 // Previous match had zero length. Move start position up one position 909 // Previous match had zero length. Move start position up one position
906 // to avoid sending find() into a loop on zero-length matches. 910 // to avoid sending find() into a loop on zero-length matches.
907 if (startPos >= fActiveLimit) { 911 if (startPos >= fActiveLimit) {
908 fMatch = FALSE; 912 fMatch = FALSE;
909 fHitEnd = TRUE; 913 fHitEnd = TRUE;
910 return FALSE; 914 return FALSE;
911 } 915 }
912 U16_FWD_1(inputBuf, startPos, fInputLength); 916 U16_FWD_1(inputBuf, startPos, fInputLength);
913 } 917 }
914 } else { 918 } else {
915 if (fLastMatchEnd >= 0) { 919 if (fLastMatchEnd >= 0) {
916 // A previous find() failed to match. Don't try again. 920 // A previous find() failed to match. Don't try again.
917 // (without this test, a pattern with a zero-length match 921 // (without this test, a pattern with a zero-length match
918 // could match again at the end of an input string.) 922 // could match again at the end of an input string.)
919 fHitEnd = TRUE; 923 fHitEnd = TRUE;
920 return FALSE; 924 return FALSE;
921 } 925 }
922 } 926 }
923 927
924 928
925 // Compute the position in the input string beyond which a match can not begin, because 929 // Compute the position in the input string beyond which a match can not begin, because
926 // the minimum length match would extend past the end of the input. 930 // the minimum length match would extend past the end of the input.
927 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 931 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
928 // Be aware of possible overflows if making changes here. 932 // Be aware of possible overflows if making changes here.
933 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
929 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 934 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
930 if (startPos > testLen) { 935 if (startPos > testLen) {
931 fMatch = FALSE; 936 fMatch = FALSE;
932 fHitEnd = TRUE; 937 fHitEnd = TRUE;
933 return FALSE; 938 return FALSE;
934 } 939 }
935 940
936 UChar32 c; 941 UChar32 c;
937 U_ASSERT(startPos >= 0); 942 U_ASSERT(startPos >= 0);
938 943
939 switch (fPattern->fStartType) { 944 switch (fPattern->fStartType) {
940 case START_NO_INFO: 945 case START_NO_INFO:
941 // No optimization was found. 946 // No optimization was found.
942 // Try a match at each input position. 947 // Try a match at each input position.
943 for (;;) { 948 for (;;) {
944 MatchChunkAt(startPos, FALSE, fDeferredStatus); 949 MatchChunkAt(startPos, FALSE, status);
945 if (U_FAILURE(fDeferredStatus)) { 950 if (U_FAILURE(status)) {
946 return FALSE; 951 return FALSE;
947 } 952 }
948 if (fMatch) { 953 if (fMatch) {
949 return TRUE; 954 return TRUE;
950 } 955 }
951 if (startPos >= testLen) { 956 if (startPos >= testLen) {
952 fHitEnd = TRUE; 957 fHitEnd = TRUE;
953 return FALSE; 958 return FALSE;
954 } 959 }
955 U16_FWD_1(inputBuf, startPos, fActiveLimit); 960 U16_FWD_1(inputBuf, startPos, fActiveLimit);
956 // Note that it's perfectly OK for a pattern to have a zero-length 961 // Note that it's perfectly OK for a pattern to have a zero-length
957 // match at the end of a string, so we must make sure that the loop 962 // match at the end of a string, so we must make sure that the loop
958 // runs with startPos == testLen the last time through. 963 // runs with startPos == testLen the last time through.
959 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 964 if (findProgressInterrupt(startPos, status))
960 return FALSE; 965 return FALSE;
961 } 966 }
962 U_ASSERT(FALSE); 967 U_ASSERT(FALSE);
963 968
964 case START_START: 969 case START_START:
965 // Matches are only possible at the start of the input string 970 // Matches are only possible at the start of the input string
966 // (pattern begins with ^ or \A) 971 // (pattern begins with ^ or \A)
967 if (startPos > fActiveStart) { 972 if (startPos > fActiveStart) {
968 fMatch = FALSE; 973 fMatch = FALSE;
969 return FALSE; 974 return FALSE;
970 } 975 }
971 MatchChunkAt(startPos, FALSE, fDeferredStatus); 976 MatchChunkAt(startPos, FALSE, status);
972 if (U_FAILURE(fDeferredStatus)) { 977 if (U_FAILURE(status)) {
973 return FALSE; 978 return FALSE;
974 } 979 }
975 return fMatch; 980 return fMatch;
976 981
977 982
978 case START_SET: 983 case START_SET:
979 { 984 {
980 // Match may start on any char from a pre-computed set. 985 // Match may start on any char from a pre-computed set.
981 U_ASSERT(fPattern->fMinMatchLen > 0); 986 U_ASSERT(fPattern->fMinMatchLen > 0);
982 for (;;) { 987 for (;;) {
983 int32_t pos = startPos; 988 int32_t pos = startPos;
984 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf [startPos++]; 989 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf [startPos++];
985 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 990 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
986 (c>=256 && fPattern->fInitialChars->contains(c))) { 991 (c>=256 && fPattern->fInitialChars->contains(c))) {
987 MatchChunkAt(pos, FALSE, fDeferredStatus); 992 MatchChunkAt(pos, FALSE, status);
988 if (U_FAILURE(fDeferredStatus)) { 993 if (U_FAILURE(status)) {
989 return FALSE; 994 return FALSE;
990 } 995 }
991 if (fMatch) { 996 if (fMatch) {
992 return TRUE; 997 return TRUE;
993 } 998 }
994 } 999 }
995 if (pos >= testLen) { 1000 if (startPos > testLen) {
996 fMatch = FALSE; 1001 fMatch = FALSE;
997 fHitEnd = TRUE; 1002 fHitEnd = TRUE;
998 return FALSE; 1003 return FALSE;
999 } 1004 }
1000 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1005 if (findProgressInterrupt(startPos, status))
1001 return FALSE; 1006 return FALSE;
1002 } 1007 }
1003 } 1008 }
1004 U_ASSERT(FALSE); 1009 U_ASSERT(FALSE);
1005 1010
1006 case START_STRING: 1011 case START_STRING:
1007 case START_CHAR: 1012 case START_CHAR:
1008 { 1013 {
1009 // Match starts on exactly one char. 1014 // Match starts on exactly one char.
1010 U_ASSERT(fPattern->fMinMatchLen > 0); 1015 U_ASSERT(fPattern->fMinMatchLen > 0);
1011 UChar32 theChar = fPattern->fInitialChar; 1016 UChar32 theChar = fPattern->fInitialChar;
1012 for (;;) { 1017 for (;;) {
1013 int32_t pos = startPos; 1018 int32_t pos = startPos;
1014 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf [startPos++]; 1019 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf [startPos++];
1015 if (c == theChar) { 1020 if (c == theChar) {
1016 MatchChunkAt(pos, FALSE, fDeferredStatus); 1021 MatchChunkAt(pos, FALSE, status);
1017 if (U_FAILURE(fDeferredStatus)) { 1022 if (U_FAILURE(status)) {
1018 return FALSE; 1023 return FALSE;
1019 } 1024 }
1020 if (fMatch) { 1025 if (fMatch) {
1021 return TRUE; 1026 return TRUE;
1022 } 1027 }
1023 } 1028 }
1024 if (pos >= testLen) { 1029 if (startPos > testLen) {
1025 fMatch = FALSE; 1030 fMatch = FALSE;
1026 fHitEnd = TRUE; 1031 fHitEnd = TRUE;
1027 return FALSE; 1032 return FALSE;
1028 } 1033 }
1029 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1034 if (findProgressInterrupt(startPos, status))
1030 return FALSE; 1035 return FALSE;
1031 } 1036 }
1032 } 1037 }
1033 U_ASSERT(FALSE); 1038 U_ASSERT(FALSE);
1034 1039
1035 case START_LINE: 1040 case START_LINE:
1036 { 1041 {
1037 UChar32 c; 1042 UChar32 c;
1038 if (startPos == fAnchorStart) { 1043 if (startPos == fAnchorStart) {
1039 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1044 MatchChunkAt(startPos, FALSE, status);
1040 if (U_FAILURE(fDeferredStatus)) { 1045 if (U_FAILURE(status)) {
1041 return FALSE; 1046 return FALSE;
1042 } 1047 }
1043 if (fMatch) { 1048 if (fMatch) {
1044 return TRUE; 1049 return TRUE;
1045 } 1050 }
1046 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1051 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1047 } 1052 }
1048 1053
1049 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1054 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1050 for (;;) { 1055 for (;;) {
1051 c = inputBuf[startPos-1]; 1056 c = inputBuf[startPos-1];
1052 if (c == 0x0a) { 1057 if (c == 0x0a) {
1053 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1058 MatchChunkAt(startPos, FALSE, status);
1054 if (U_FAILURE(fDeferredStatus)) { 1059 if (U_FAILURE(status)) {
1055 return FALSE; 1060 return FALSE;
1056 } 1061 }
1057 if (fMatch) { 1062 if (fMatch) {
1058 return TRUE; 1063 return TRUE;
1059 } 1064 }
1060 } 1065 }
1061 if (startPos >= testLen) { 1066 if (startPos >= testLen) {
1062 fMatch = FALSE; 1067 fMatch = FALSE;
1063 fHitEnd = TRUE; 1068 fHitEnd = TRUE;
1064 return FALSE; 1069 return FALSE;
1065 } 1070 }
1066 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1071 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1067 // Note that it's perfectly OK for a pattern to have a zero-length 1072 // Note that it's perfectly OK for a pattern to have a zero-length
1068 // match at the end of a string, so we must make sure that the loop 1073 // match at the end of a string, so we must make sure that the loop
1069 // runs with startPos == testLen the last time through. 1074 // runs with startPos == testLen the last time through.
1070 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1075 if (findProgressInterrupt(startPos, status))
1071 return FALSE; 1076 return FALSE;
1072 } 1077 }
1073 } else { 1078 } else {
1074 for (;;) { 1079 for (;;) {
1075 c = inputBuf[startPos-1]; 1080 c = inputBuf[startPos-1];
1076 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1081 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1077 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1082 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1078 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) { 1083 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPo s] == 0x0a) {
1079 startPos++; 1084 startPos++;
1080 } 1085 }
1081 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1086 MatchChunkAt(startPos, FALSE, status);
1082 if (U_FAILURE(fDeferredStatus)) { 1087 if (U_FAILURE(status)) {
1083 return FALSE; 1088 return FALSE;
1084 } 1089 }
1085 if (fMatch) { 1090 if (fMatch) {
1086 return TRUE; 1091 return TRUE;
1087 } 1092 }
1088 } 1093 }
1089 if (startPos >= testLen) { 1094 if (startPos >= testLen) {
1090 fMatch = FALSE; 1095 fMatch = FALSE;
1091 fHitEnd = TRUE; 1096 fHitEnd = TRUE;
1092 return FALSE; 1097 return FALSE;
1093 } 1098 }
1094 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1099 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1095 // Note that it's perfectly OK for a pattern to have a zero-length 1100 // Note that it's perfectly OK for a pattern to have a zero-length
1096 // match at the end of a string, so we must make sure that the loop 1101 // match at the end of a string, so we must make sure that the loop
1097 // runs with startPos == testLen the last time through. 1102 // runs with startPos == testLen the last time through.
1098 » if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1103 if (findProgressInterrupt(startPos, status))
1099 return FALSE; 1104 return FALSE;
1100 } 1105 }
1101 } 1106 }
1102 } 1107 }
1103 1108
1104 default: 1109 default:
1105 U_ASSERT(FALSE); 1110 U_ASSERT(FALSE);
1106 } 1111 }
1107 1112
1108 U_ASSERT(FALSE); 1113 U_ASSERT(FALSE);
1109 return FALSE; 1114 return FALSE;
1110 } 1115 }
1111 1116
1112 1117
1113 1118
1114 //------------------------------------------------------------------------------ -- 1119 //------------------------------------------------------------------------------ --
1115 // 1120 //
1116 // group() 1121 // group()
1117 // 1122 //
1118 //------------------------------------------------------------------------------ -- 1123 //------------------------------------------------------------------------------ --
1119 UnicodeString RegexMatcher::group(UErrorCode &status) const { 1124 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1120 return group(0, status); 1125 return group(0, status);
1121 } 1126 }
1122 1127
1123 // Return immutable shallow clone 1128 // Return immutable shallow clone
1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1129 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1125 return group(0, dest, group_len, status); 1130 return group(0, dest, group_len, status);
1126 } 1131 }
1127 1132
1128 // Return immutable shallow clone 1133 // Return immutable shallow clone
1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE rrorCode &status) const { 1134 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UE rrorCode &status) const {
1130 group_len = 0; 1135 group_len = 0;
1131 UBool bailOut = FALSE;
1132 if (U_FAILURE(status)) { 1136 if (U_FAILURE(status)) {
1133 return dest; 1137 return dest;
1134 } 1138 }
1135 if (U_FAILURE(fDeferredStatus)) { 1139 if (U_FAILURE(fDeferredStatus)) {
1136 status = fDeferredStatus; 1140 status = fDeferredStatus;
1137 bailOut = TRUE; 1141 } else if (fMatch == FALSE) {
1142 status = U_REGEX_INVALID_STATE;
1143 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1144 status = U_INDEX_OUTOFBOUNDS_ERROR;
1138 } 1145 }
1139 if (fMatch == FALSE) { 1146
1140 status = U_REGEX_INVALID_STATE; 1147 if (U_FAILURE(status)) {
1141 bailOut = TRUE; 1148 return dest;
1142 } 1149 }
1143 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1150
1144 status = U_INDEX_OUTOFBOUNDS_ERROR;
1145 bailOut = TRUE;
1146 }
1147
1148 if (bailOut) {
1149 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1150 }
1151
1152 int64_t s, e; 1151 int64_t s, e;
1153 if (groupNum == 0) { 1152 if (groupNum == 0) {
1154 s = fMatchStart; 1153 s = fMatchStart;
1155 e = fMatchEnd; 1154 e = fMatchEnd;
1156 } else { 1155 } else {
1157 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1156 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1158 U_ASSERT(groupOffset < fPattern->fFrameSize); 1157 U_ASSERT(groupOffset < fPattern->fFrameSize);
1159 U_ASSERT(groupOffset >= 0); 1158 U_ASSERT(groupOffset >= 0);
1160 s = fFrame->fExtra[groupOffset]; 1159 s = fFrame->fExtra[groupOffset];
1161 e = fFrame->fExtra[groupOffset+1]; 1160 e = fFrame->fExtra[groupOffset+1];
1162 } 1161 }
1163 1162
1164 if (s < 0) { 1163 if (s < 0) {
1165 // A capture group wasn't part of the match 1164 // A capture group wasn't part of the match
1166 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1165 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1167 } 1166 }
1168 U_ASSERT(s <= e); 1167 U_ASSERT(s <= e);
1169 group_len = e - s; 1168 group_len = e - s;
1170 1169
1171 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1170 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1172 if (dest) 1171 if (dest)
1173 UTEXT_SETNATIVEINDEX(dest, s); 1172 UTEXT_SETNATIVEINDEX(dest, s);
1174 return dest; 1173 return dest;
1175 } 1174 }
1176 1175
1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1176 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1178 UnicodeString result; 1177 UnicodeString result;
1179 if (U_FAILURE(status)) { 1178 if (U_FAILURE(status)) {
1180 return result; 1179 return result;
1181 } 1180 }
1182 UText resultText = UTEXT_INITIALIZER; 1181 UText resultText = UTEXT_INITIALIZER;
1183 utext_openUnicodeString(&resultText, &result, &status); 1182 utext_openUnicodeString(&resultText, &result, &status);
1184 group(groupNum, &resultText, status); 1183 group(groupNum, &resultText, status);
1185 utext_close(&resultText); 1184 utext_close(&resultText);
1186 return result; 1185 return result;
1187 } 1186 }
1188 1187
1189 1188
1190 // Return deep (mutable) clone 1189 // Return deep (mutable) clone
1191 //» » Technology Preview (as an API), but note that the UnicodeString API is implemented 1190 // Technology Preview (as an API), but note that the UnicodeString API is i mplemented
1192 //» » using this function. 1191 // using this function.
1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co nst { 1192 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) co nst {
1194 UBool bailOut = FALSE;
1195 if (U_FAILURE(status)) { 1193 if (U_FAILURE(status)) {
1196 return dest; 1194 return dest;
1197 } 1195 }
1196
1198 if (U_FAILURE(fDeferredStatus)) { 1197 if (U_FAILURE(fDeferredStatus)) {
1199 status = fDeferredStatus; 1198 status = fDeferredStatus;
1200 bailOut = TRUE; 1199 } else if (fMatch == FALSE) {
1200 status = U_REGEX_INVALID_STATE;
1201 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202 status = U_INDEX_OUTOFBOUNDS_ERROR;
1201 } 1203 }
1202 1204 if (U_FAILURE(status)) {
1203 if (fMatch == FALSE) { 1205 return dest;
1204 status = U_REGEX_INVALID_STATE;
1205 bailOut = TRUE;
1206 } 1206 }
1207 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1207
1208 status = U_INDEX_OUTOFBOUNDS_ERROR;
1209 bailOut = TRUE;
1210 }
1211
1212 if (bailOut) {
1213 if (dest) {
1214 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1215 return dest;
1216 } else {
1217 return utext_openUChars(NULL, NULL, 0, &status);
1218 }
1219 }
1220
1221 int64_t s, e; 1208 int64_t s, e;
1222 if (groupNum == 0) { 1209 if (groupNum == 0) {
1223 s = fMatchStart; 1210 s = fMatchStart;
1224 e = fMatchEnd; 1211 e = fMatchEnd;
1225 } else { 1212 } else {
1226 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1213 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1227 U_ASSERT(groupOffset < fPattern->fFrameSize); 1214 U_ASSERT(groupOffset < fPattern->fFrameSize);
1228 U_ASSERT(groupOffset >= 0); 1215 U_ASSERT(groupOffset >= 0);
1229 s = fFrame->fExtra[groupOffset]; 1216 s = fFrame->fExtra[groupOffset];
1230 e = fFrame->fExtra[groupOffset+1]; 1217 e = fFrame->fExtra[groupOffset+1];
1231 } 1218 }
1232 1219
1233 if (s < 0) { 1220 if (s < 0) {
1234 // A capture group wasn't part of the match 1221 // A capture group wasn't part of the match
1235 if (dest) { 1222 if (dest) {
1236 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1223 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1237 return dest; 1224 return dest;
1238 } else { 1225 } else {
1239 return utext_openUChars(NULL, NULL, 0, &status); 1226 return utext_openUChars(NULL, NULL, 0, &status);
1240 } 1227 }
1241 } 1228 }
1242 U_ASSERT(s <= e); 1229 U_ASSERT(s <= e);
1243 1230
1244 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1231 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1245 U_ASSERT(e <= fInputLength); 1232 U_ASSERT(e <= fInputLength);
1246 if (dest) { 1233 if (dest) {
1247 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents+s, (int32_t)(e-s), &status); 1234 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents+s, (int32_t)(e-s), &status);
1248 } else { 1235 } else {
1249 UText groupText = UTEXT_INITIALIZER; 1236 UText groupText = UTEXT_INITIALIZER;
1250 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat us); 1237 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &stat us);
1251 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1238 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1252 utext_close(&groupText); 1239 utext_close(&groupText);
1253 } 1240 }
(...skipping 13 matching lines...) Expand all
1267 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1254 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1268 1255
1269 if (dest) { 1256 if (dest) {
1270 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1257 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1271 } else { 1258 } else {
1272 UText groupText = UTEXT_INITIALIZER; 1259 UText groupText = UTEXT_INITIALIZER;
1273 utext_openUChars(&groupText, groupChars, len16, &status); 1260 utext_openUChars(&groupText, groupChars, len16, &status);
1274 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1261 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1275 utext_close(&groupText); 1262 utext_close(&groupText);
1276 } 1263 }
1277 1264
1278 uprv_free(groupChars); 1265 uprv_free(groupChars);
1279 } 1266 }
1280 return dest; 1267 return dest;
1281 } 1268 }
1282 1269
1283 //------------------------------------------------------------------------------ -- 1270 //------------------------------------------------------------------------------ --
1284 // 1271 //
1285 // appendGroup() -- currently internal only, appends a group to a UText rather 1272 // appendGroup() -- currently internal only, appends a group to a UText rather
1286 // than replacing its contents 1273 // than replacing its contents
1287 // 1274 //
1288 //------------------------------------------------------------------------------ -- 1275 //------------------------------------------------------------------------------ --
1289 1276
1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const { 1277 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &sta tus) const {
1291 if (U_FAILURE(status)) { 1278 if (U_FAILURE(status)) {
1292 return 0; 1279 return 0;
1293 } 1280 }
1294 if (U_FAILURE(fDeferredStatus)) { 1281 if (U_FAILURE(fDeferredStatus)) {
1295 status = fDeferredStatus; 1282 status = fDeferredStatus;
1296 return 0; 1283 return 0;
1297 } 1284 }
1298 int64_t destLen = utext_nativeLength(dest); 1285 int64_t destLen = utext_nativeLength(dest);
1299 1286
1300 if (fMatch == FALSE) { 1287 if (fMatch == FALSE) {
1301 status = U_REGEX_INVALID_STATE; 1288 status = U_REGEX_INVALID_STATE;
1302 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1289 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1303 } 1290 }
1304 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1291 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1305 status = U_INDEX_OUTOFBOUNDS_ERROR; 1292 status = U_INDEX_OUTOFBOUNDS_ERROR;
1306 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1293 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1307 } 1294 }
1308 1295
1309 int64_t s, e; 1296 int64_t s, e;
1310 if (groupNum == 0) { 1297 if (groupNum == 0) {
1311 s = fMatchStart; 1298 s = fMatchStart;
1312 e = fMatchEnd; 1299 e = fMatchEnd;
1313 } else { 1300 } else {
1314 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1301 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1315 U_ASSERT(groupOffset < fPattern->fFrameSize); 1302 U_ASSERT(groupOffset < fPattern->fFrameSize);
1316 U_ASSERT(groupOffset >= 0); 1303 U_ASSERT(groupOffset >= 0);
1317 s = fFrame->fExtra[groupOffset]; 1304 s = fFrame->fExtra[groupOffset];
1318 e = fFrame->fExtra[groupOffset+1]; 1305 e = fFrame->fExtra[groupOffset+1];
1319 } 1306 }
1320 1307
1321 if (s < 0) { 1308 if (s < 0) {
1322 // A capture group wasn't part of the match 1309 // A capture group wasn't part of the match
1323 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1310 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1324 } 1311 }
1325 U_ASSERT(s <= e); 1312 U_ASSERT(s <= e);
1326 1313
1327 int64_t deltaLen; 1314 int64_t deltaLen;
1328 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1315 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1329 U_ASSERT(e <= fInputLength); 1316 U_ASSERT(e <= fInputLength);
1330 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten ts+s, (int32_t)(e-s), &status); 1317 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkConten ts+s, (int32_t)(e-s), &status);
1331 } else { 1318 } else {
1332 int32_t len16; 1319 int32_t len16;
1333 if (UTEXT_USES_U16(fInputText)) { 1320 if (UTEXT_USES_U16(fInputText)) {
1334 len16 = (int32_t)(e-s); 1321 len16 = (int32_t)(e-s);
1335 } else { 1322 } else {
1336 UErrorCode lengthStatus = U_ZERO_ERROR; 1323 UErrorCode lengthStatus = U_ZERO_ERROR;
1337 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1324 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1338 } 1325 }
1339 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1326 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1340 if (groupChars == NULL) { 1327 if (groupChars == NULL) {
1341 status = U_MEMORY_ALLOCATION_ERROR; 1328 status = U_MEMORY_ALLOCATION_ERROR;
1342 return 0; 1329 return 0;
1343 } 1330 }
1344 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1331 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1345 1332
1346 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta tus); 1333 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &sta tus);
1347 uprv_free(groupChars); 1334 uprv_free(groupChars);
1348 } 1335 }
1349 return deltaLen; 1336 return deltaLen;
1350 } 1337 }
1351 1338
1352 1339
1353 1340
1354 //------------------------------------------------------------------------------ -- 1341 //------------------------------------------------------------------------------ --
1355 // 1342 //
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
1402 if (!fInput) { 1389 if (!fInput) {
1403 UErrorCode status = U_ZERO_ERROR; 1390 UErrorCode status = U_ZERO_ERROR;
1404 int32_t len16; 1391 int32_t len16;
1405 if (UTEXT_USES_U16(fInputText)) { 1392 if (UTEXT_USES_U16(fInputText)) {
1406 len16 = (int32_t)fInputLength; 1393 len16 = (int32_t)fInputLength;
1407 } else { 1394 } else {
1408 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status) ; 1395 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status) ;
1409 status = U_ZERO_ERROR; // overflow, length status 1396 status = U_ZERO_ERROR; // overflow, length status
1410 } 1397 }
1411 UnicodeString *result = new UnicodeString(len16, 0, 0); 1398 UnicodeString *result = new UnicodeString(len16, 0, 0);
1412 1399
1413 UChar *inputChars = result->getBuffer(len16); 1400 UChar *inputChars = result->getBuffer(len16);
1414 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1401 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1415 result->releaseBuffer(len16); 1402 result->releaseBuffer(len16);
1416 1403
1417 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath er than operator= 1404 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rath er than operator=
1418 } 1405 }
1419 1406
1420 return *fInput; 1407 return *fInput;
1421 } 1408 }
1422 1409
1423 //------------------------------------------------------------------------------ -- 1410 //------------------------------------------------------------------------------ --
1424 // 1411 //
1425 // inputText() 1412 // inputText()
1426 // 1413 //
1427 //------------------------------------------------------------------------------ -- 1414 //------------------------------------------------------------------------------ --
1428 UText *RegexMatcher::inputText() const { 1415 UText *RegexMatcher::inputText() const {
1429 return fInputText; 1416 return fInputText;
1430 } 1417 }
1431 1418
1432 1419
1433 //------------------------------------------------------------------------------ -- 1420 //------------------------------------------------------------------------------ --
1434 // 1421 //
1435 // getInput() -- like inputText(), but makes a clone or copies into another UTe xt 1422 // getInput() -- like inputText(), but makes a clone or copies into another UTe xt
1436 // 1423 //
1437 //------------------------------------------------------------------------------ -- 1424 //------------------------------------------------------------------------------ --
1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1425 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1439 UBool bailOut = FALSE;
1440 if (U_FAILURE(status)) { 1426 if (U_FAILURE(status)) {
1441 return dest; 1427 return dest;
1442 } 1428 }
1443 if (U_FAILURE(fDeferredStatus)) { 1429 if (U_FAILURE(fDeferredStatus)) {
1444 status = fDeferredStatus; 1430 status = fDeferredStatus;
1445 bailOut = TRUE; 1431 return dest;
1446 } 1432 }
1447 1433
1448 if (bailOut) {
1449 if (dest) {
1450 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1451 return dest;
1452 } else {
1453 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1454 }
1455 }
1456
1457 if (dest) { 1434 if (dest) {
1458 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1435 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1459 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents, (int32_t)fInputLength, &status); 1436 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkCo ntents, (int32_t)fInputLength, &status);
1460 } else { 1437 } else {
1461 int32_t input16Len; 1438 int32_t input16Len;
1462 if (UTEXT_USES_U16(fInputText)) { 1439 if (UTEXT_USES_U16(fInputText)) {
1463 input16Len = (int32_t)fInputLength; 1440 input16Len = (int32_t)fInputLength;
1464 } else { 1441 } else {
1465 UErrorCode lengthStatus = U_ZERO_ERROR; 1442 UErrorCode lengthStatus = U_ZERO_ERROR;
1466 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1443 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1467 } 1444 }
1468 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)) ; 1445 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)) ;
1469 if (inputChars == NULL) { 1446 if (inputChars == NULL) {
1470 return dest; 1447 return dest;
1471 } 1448 }
1472 1449
1473 status = U_ZERO_ERROR; 1450 status = U_ZERO_ERROR;
1474 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, & status); // not terminated warning 1451 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, & status); // not terminated warning
1475 status = U_ZERO_ERROR; 1452 status = U_ZERO_ERROR;
1476 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16 Len, &status); 1453 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16 Len, &status);
1477 1454
1478 uprv_free(inputChars); 1455 uprv_free(inputChars);
1479 } 1456 }
1480 return dest; 1457 return dest;
1481 } else { 1458 } else {
1482 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1459 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1483 } 1460 }
1484 } 1461 }
1485 1462
1486 1463
1487 static UBool compat_SyncMutableUTextContents(UText *ut); 1464 static UBool compat_SyncMutableUTextContents(UText *ut);
1488 static UBool compat_SyncMutableUTextContents(UText *ut) { 1465 static UBool compat_SyncMutableUTextContents(UText *ut) {
1489 UBool retVal = FALSE; 1466 UBool retVal = FALSE;
1490 1467
1491 // In the following test, we're really only interested in whether the UText should switch 1468 // In the following test, we're really only interested in whether the UText should switch
1492 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1469 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1493 // will still point to the correct data. 1470 // will still point to the correct data.
1494 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1471 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1495 UnicodeString *us=(UnicodeString *)ut->context; 1472 UnicodeString *us=(UnicodeString *)ut->context;
1496 1473
1497 // Update to the latest length. 1474 // Update to the latest length.
1498 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1475 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499 int32_t newLength = us->length(); 1476 int32_t newLength = us->length();
1500 1477
1501 // Update the chunk description. 1478 // Update the chunk description.
1502 // The buffer may have switched between stack- and heap-based. 1479 // The buffer may have switched between stack- and heap-based.
1503 ut->chunkContents = us->getBuffer(); 1480 ut->chunkContents = us->getBuffer();
1504 ut->chunkLength = newLength; 1481 ut->chunkLength = newLength;
1505 ut->chunkNativeLimit = newLength; 1482 ut->chunkNativeLimit = newLength;
1506 ut->nativeIndexingLimit = newLength; 1483 ut->nativeIndexingLimit = newLength;
1507 retVal = TRUE; 1484 retVal = TRUE;
1508 } 1485 }
1509 1486
1510 return retVal; 1487 return retVal;
1511 } 1488 }
1512 1489
1513 //------------------------------------------------------------------------------ -- 1490 //------------------------------------------------------------------------------ --
1514 // 1491 //
1515 // lookingAt() 1492 // lookingAt()
1516 // 1493 //
1517 //------------------------------------------------------------------------------ -- 1494 //------------------------------------------------------------------------------ --
1518 UBool RegexMatcher::lookingAt(UErrorCode &status) { 1495 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1519 if (U_FAILURE(status)) { 1496 if (U_FAILURE(status)) {
1520 return FALSE; 1497 return FALSE;
1521 } 1498 }
1522 if (U_FAILURE(fDeferredStatus)) { 1499 if (U_FAILURE(fDeferredStatus)) {
1523 status = fDeferredStatus; 1500 status = fDeferredStatus;
1524 return FALSE; 1501 return FALSE;
1525 } 1502 }
1526 1503
1527 if (fInputUniStrMaybeMutable) { 1504 if (fInputUniStrMaybeMutable) {
1528 if (compat_SyncMutableUTextContents(fInputText)) { 1505 if (compat_SyncMutableUTextContents(fInputText)) {
1529 fInputLength = utext_nativeLength(fInputText); 1506 fInputLength = utext_nativeLength(fInputText);
1530 reset(); 1507 reset();
1531 } 1508 }
1532 } 1509 }
1533 else { 1510 else {
1534 resetPreserveRegion(); 1511 resetPreserveRegion();
1535 } 1512 }
1536 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1513 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1537 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1514 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1538 } else { 1515 } else {
1539 MatchAt(fActiveStart, FALSE, status); 1516 MatchAt(fActiveStart, FALSE, status);
1540 } 1517 }
1541 return fMatch; 1518 return fMatch;
1542 } 1519 }
1543 1520
1544 1521
1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1522 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1546 if (U_FAILURE(status)) { 1523 if (U_FAILURE(status)) {
1547 return FALSE; 1524 return FALSE;
1548 } 1525 }
1549 if (U_FAILURE(fDeferredStatus)) { 1526 if (U_FAILURE(fDeferredStatus)) {
1550 status = fDeferredStatus; 1527 status = fDeferredStatus;
1551 return FALSE; 1528 return FALSE;
1552 } 1529 }
1553 reset(); 1530 reset();
1554 1531
1555 if (start < 0) { 1532 if (start < 0) {
1556 status = U_INDEX_OUTOFBOUNDS_ERROR; 1533 status = U_INDEX_OUTOFBOUNDS_ERROR;
1557 return FALSE; 1534 return FALSE;
1558 } 1535 }
1559 1536
1560 if (fInputUniStrMaybeMutable) { 1537 if (fInputUniStrMaybeMutable) {
1561 if (compat_SyncMutableUTextContents(fInputText)) { 1538 if (compat_SyncMutableUTextContents(fInputText)) {
1562 fInputLength = utext_nativeLength(fInputText); 1539 fInputLength = utext_nativeLength(fInputText);
1563 reset(); 1540 reset();
1564 } 1541 }
1565 } 1542 }
1566 1543
1567 int64_t nativeStart; 1544 int64_t nativeStart;
1568 nativeStart = start; 1545 nativeStart = start;
1569 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1546 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1570 status = U_INDEX_OUTOFBOUNDS_ERROR; 1547 status = U_INDEX_OUTOFBOUNDS_ERROR;
1571 return FALSE; 1548 return FALSE;
1572 } 1549 }
1573 1550
1574 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1551 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1575 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1552 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1576 } else { 1553 } else {
1577 MatchAt(nativeStart, FALSE, status); 1554 MatchAt(nativeStart, FALSE, status);
1578 } 1555 }
1579 return fMatch; 1556 return fMatch;
1580 } 1557 }
1581 1558
1582 1559
1583 1560
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
1616 1593
1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1594 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1618 if (U_FAILURE(status)) { 1595 if (U_FAILURE(status)) {
1619 return FALSE; 1596 return FALSE;
1620 } 1597 }
1621 if (U_FAILURE(fDeferredStatus)) { 1598 if (U_FAILURE(fDeferredStatus)) {
1622 status = fDeferredStatus; 1599 status = fDeferredStatus;
1623 return FALSE; 1600 return FALSE;
1624 } 1601 }
1625 reset(); 1602 reset();
1626 1603
1627 if (start < 0) { 1604 if (start < 0) {
1628 status = U_INDEX_OUTOFBOUNDS_ERROR; 1605 status = U_INDEX_OUTOFBOUNDS_ERROR;
1629 return FALSE; 1606 return FALSE;
1630 } 1607 }
1631 1608
1632 if (fInputUniStrMaybeMutable) { 1609 if (fInputUniStrMaybeMutable) {
1633 if (compat_SyncMutableUTextContents(fInputText)) { 1610 if (compat_SyncMutableUTextContents(fInputText)) {
1634 fInputLength = utext_nativeLength(fInputText); 1611 fInputLength = utext_nativeLength(fInputText);
1635 reset(); 1612 reset();
1636 } 1613 }
(...skipping 29 matching lines...) Expand all
1666 1643
1667 //------------------------------------------------------------------------------ -- 1644 //------------------------------------------------------------------------------ --
1668 // 1645 //
1669 // region 1646 // region
1670 // 1647 //
1671 //------------------------------------------------------------------------------ -- 1648 //------------------------------------------------------------------------------ --
1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int 64_t startIndex, UErrorCode &status) { 1649 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int 64_t startIndex, UErrorCode &status) {
1673 if (U_FAILURE(status)) { 1650 if (U_FAILURE(status)) {
1674 return *this; 1651 return *this;
1675 } 1652 }
1676 1653
1677 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1654 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR; 1655 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 } 1656 }
1680 1657
1681 int64_t nativeStart = regionStart; 1658 int64_t nativeStart = regionStart;
1682 int64_t nativeLimit = regionLimit; 1659 int64_t nativeLimit = regionLimit;
1683 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1660 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1684 status = U_ILLEGAL_ARGUMENT_ERROR; 1661 status = U_ILLEGAL_ARGUMENT_ERROR;
1685 } 1662 }
1686 1663
1687 if (startIndex == -1) 1664 if (startIndex == -1)
1688 this->reset(); 1665 this->reset();
1689 else 1666 else
1690 resetPreserveRegion(); 1667 resetPreserveRegion();
1691 1668
1692 fRegionStart = nativeStart; 1669 fRegionStart = nativeStart;
1693 fRegionLimit = nativeLimit; 1670 fRegionLimit = nativeLimit;
1694 fActiveStart = nativeStart; 1671 fActiveStart = nativeStart;
1695 fActiveLimit = nativeLimit; 1672 fActiveLimit = nativeLimit;
1696 1673
1697 if (startIndex != -1) { 1674 if (startIndex != -1) {
1698 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1675 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1699 status = U_INDEX_OUTOFBOUNDS_ERROR; 1676 status = U_INDEX_OUTOFBOUNDS_ERROR;
1700 } 1677 }
1701 fMatchEnd = startIndex; 1678 fMatchEnd = startIndex;
1702 } 1679 }
1703 1680
1704 if (!fTransparentBounds) { 1681 if (!fTransparentBounds) {
1705 fLookStart = nativeStart; 1682 fLookStart = nativeStart;
1706 fLookLimit = nativeLimit; 1683 fLookLimit = nativeLimit;
1707 } 1684 }
1708 if (fAnchoringBounds) { 1685 if (fAnchoringBounds) {
1709 fAnchorStart = nativeStart; 1686 fAnchorStart = nativeStart;
1710 fAnchorLimit = nativeLimit; 1687 fAnchorLimit = nativeLimit;
1711 } 1688 }
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
1748 // replaceAll 1725 // replaceAll
1749 // 1726 //
1750 //------------------------------------------------------------------------------ -- 1727 //------------------------------------------------------------------------------ --
1751 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC ode &status) { 1728 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorC ode &status) {
1752 UText replacementText = UTEXT_INITIALIZER; 1729 UText replacementText = UTEXT_INITIALIZER;
1753 UText resultText = UTEXT_INITIALIZER; 1730 UText resultText = UTEXT_INITIALIZER;
1754 UnicodeString resultString; 1731 UnicodeString resultString;
1755 if (U_FAILURE(status)) { 1732 if (U_FAILURE(status)) {
1756 return resultString; 1733 return resultString;
1757 } 1734 }
1758 1735
1759 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1736 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1760 utext_openUnicodeString(&resultText, &resultString, &status); 1737 utext_openUnicodeString(&resultText, &resultString, &status);
1761 1738
1762 replaceAll(&replacementText, &resultText, status); 1739 replaceAll(&replacementText, &resultText, status);
1763 1740
1764 utext_close(&resultText); 1741 utext_close(&resultText);
1765 utext_close(&replacementText); 1742 utext_close(&replacementText);
1766 1743
1767 return resultString; 1744 return resultString;
1768 } 1745 }
1769 1746
1770 1747
1771 // 1748 //
1772 // replaceAll, UText mode 1749 // replaceAll, UText mode
1773 // 1750 //
1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta tus) { 1751 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &sta tus) {
1775 if (U_FAILURE(status)) { 1752 if (U_FAILURE(status)) {
1776 return dest; 1753 return dest;
1777 } 1754 }
1778 if (U_FAILURE(fDeferredStatus)) { 1755 if (U_FAILURE(fDeferredStatus)) {
1779 status = fDeferredStatus; 1756 status = fDeferredStatus;
1780 return dest; 1757 return dest;
1781 } 1758 }
1782 1759
1783 if (dest == NULL) { 1760 if (dest == NULL) {
1784 UnicodeString emptyString; 1761 UnicodeString emptyString;
1785 UText empty = UTEXT_INITIALIZER; 1762 UText empty = UTEXT_INITIALIZER;
1786 1763
1787 utext_openUnicodeString(&empty, &emptyString, &status); 1764 utext_openUnicodeString(&empty, &emptyString, &status);
1788 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1765 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1789 utext_close(&empty); 1766 utext_close(&empty);
1790 } 1767 }
1791 1768
1792 if (U_SUCCESS(status)) { 1769 if (U_SUCCESS(status)) {
1793 reset(); 1770 reset();
1794 while (find()) { 1771 while (find()) {
1795 appendReplacement(dest, replacement, status); 1772 appendReplacement(dest, replacement, status);
1796 if (U_FAILURE(status)) { 1773 if (U_FAILURE(status)) {
1797 break; 1774 break;
1798 } 1775 }
1799 } 1776 }
1800 appendTail(dest, status); 1777 appendTail(dest, status);
1801 } 1778 }
1802 1779
1803 return dest; 1780 return dest;
1804 } 1781 }
1805 1782
1806 1783
1807 //------------------------------------------------------------------------------ -- 1784 //------------------------------------------------------------------------------ --
1808 // 1785 //
1809 // replaceFirst 1786 // replaceFirst
1810 // 1787 //
1811 //------------------------------------------------------------------------------ -- 1788 //------------------------------------------------------------------------------ --
1812 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro rCode &status) { 1789 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErro rCode &status) {
1813 UText replacementText = UTEXT_INITIALIZER; 1790 UText replacementText = UTEXT_INITIALIZER;
1814 UText resultText = UTEXT_INITIALIZER; 1791 UText resultText = UTEXT_INITIALIZER;
1815 UnicodeString resultString; 1792 UnicodeString resultString;
1816 1793
1817 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1794 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1818 utext_openUnicodeString(&resultText, &resultString, &status); 1795 utext_openUnicodeString(&resultText, &resultString, &status);
1819 1796
1820 replaceFirst(&replacementText, &resultText, status); 1797 replaceFirst(&replacementText, &resultText, status);
1821 1798
1822 utext_close(&resultText); 1799 utext_close(&resultText);
1823 utext_close(&replacementText); 1800 utext_close(&replacementText);
1824 1801
1825 return resultString; 1802 return resultString;
1826 } 1803 }
1827 1804
1828 // 1805 //
1829 // replaceFirst, UText mode 1806 // replaceFirst, UText mode
1830 // 1807 //
1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s tatus) { 1808 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &s tatus) {
1832 if (U_FAILURE(status)) { 1809 if (U_FAILURE(status)) {
1833 return dest; 1810 return dest;
1834 } 1811 }
1835 if (U_FAILURE(fDeferredStatus)) { 1812 if (U_FAILURE(fDeferredStatus)) {
1836 status = fDeferredStatus; 1813 status = fDeferredStatus;
1837 return dest; 1814 return dest;
1838 } 1815 }
1839 1816
1840 reset(); 1817 reset();
1841 if (!find()) { 1818 if (!find()) {
1842 return getInput(dest, status); 1819 return getInput(dest, status);
1843 } 1820 }
1844 1821
1845 if (dest == NULL) { 1822 if (dest == NULL) {
1846 UnicodeString emptyString; 1823 UnicodeString emptyString;
1847 UText empty = UTEXT_INITIALIZER; 1824 UText empty = UTEXT_INITIALIZER;
1848 1825
1849 utext_openUnicodeString(&empty, &emptyString, &status); 1826 utext_openUnicodeString(&empty, &emptyString, &status);
1850 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1827 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1851 utext_close(&empty); 1828 utext_close(&empty);
1852 } 1829 }
1853 1830
1854 appendReplacement(dest, replacement, status); 1831 appendReplacement(dest, replacement, status);
1855 appendTail(dest, status); 1832 appendTail(dest, status);
1856 1833
1857 return dest; 1834 return dest;
1858 } 1835 }
1859 1836
1860 1837
1861 //------------------------------------------------------------------------------ -- 1838 //------------------------------------------------------------------------------ --
1862 // 1839 //
1863 // requireEnd 1840 // requireEnd
1864 // 1841 //
1865 //------------------------------------------------------------------------------ -- 1842 //------------------------------------------------------------------------------ --
1866 UBool RegexMatcher::requireEnd() const { 1843 UBool RegexMatcher::requireEnd() const {
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
1901 //resetStack(); // more expensive than it looks... 1878 //resetStack(); // more expensive than it looks...
1902 } 1879 }
1903 1880
1904 1881
1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1882 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1906 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us); 1883 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStat us);
1907 if (fPattern->fNeedsAltInput) { 1884 if (fPattern->fNeedsAltInput) {
1908 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus); 1885 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDe ferredStatus);
1909 } 1886 }
1910 fInputLength = utext_nativeLength(fInputText); 1887 fInputLength = utext_nativeLength(fInputText);
1911 1888
1912 reset(); 1889 reset();
1913 delete fInput; 1890 delete fInput;
1914 fInput = NULL; 1891 fInput = NULL;
1915 1892
1916 // Do the following for any UnicodeString. 1893 // Do the following for any UnicodeString.
1917 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1894 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1918 fInputUniStrMaybeMutable = TRUE; 1895 fInputUniStrMaybeMutable = TRUE;
1919 1896
1920 if (fWordBreakItr != NULL) { 1897 if (fWordBreakItr != NULL) {
1921 #if UCONFIG_NO_BREAK_ITERATION==0 1898 #if UCONFIG_NO_BREAK_ITERATION==0
1922 UErrorCode status = U_ZERO_ERROR; 1899 UErrorCode status = U_ZERO_ERROR;
1923 fWordBreakItr->setText(fInputText, status); 1900 fWordBreakItr->setText(fInputText, status);
1924 #endif 1901 #endif
1925 } 1902 }
1926 return *this; 1903 return *this;
1927 } 1904 }
1928 1905
1929 1906
1930 RegexMatcher &RegexMatcher::reset(UText *input) { 1907 RegexMatcher &RegexMatcher::reset(UText *input) {
1931 if (fInputText != input) { 1908 if (fInputText != input) {
1932 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s); 1909 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatu s);
1933 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1910 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1934 fInputLength = utext_nativeLength(fInputText); 1911 fInputLength = utext_nativeLength(fInputText);
1935 1912
1936 delete fInput; 1913 delete fInput;
1937 fInput = NULL; 1914 fInput = NULL;
1938 1915
1939 if (fWordBreakItr != NULL) { 1916 if (fWordBreakItr != NULL) {
1940 #if UCONFIG_NO_BREAK_ITERATION==0 1917 #if UCONFIG_NO_BREAK_ITERATION==0
1941 UErrorCode status = U_ZERO_ERROR; 1918 UErrorCode status = U_ZERO_ERROR;
1942 fWordBreakItr->setText(input, status); 1919 fWordBreakItr->setText(input, status);
1943 #endif 1920 #endif
1944 } 1921 }
1945 } 1922 }
1946 reset(); 1923 reset();
1947 fInputUniStrMaybeMutable = FALSE; 1924 fInputUniStrMaybeMutable = FALSE;
1948 1925
1949 return *this; 1926 return *this;
1950 } 1927 }
1951 1928
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) { 1929 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1930 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1954 return *this; 1931 return *this;
1955 }*/ 1932 }*/
1956 1933
1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1934 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1958 if (U_FAILURE(status)) { 1935 if (U_FAILURE(status)) {
1959 return *this; 1936 return *this;
1960 } 1937 }
1961 reset(); // Reset also resets the region to be the entire string. 1938 reset(); // Reset also resets the region to be the entire string.
1962 1939
1963 if (position < 0 || position > fActiveLimit) { 1940 if (position < 0 || position > fActiveLimit) {
1964 status = U_INDEX_OUTOFBOUNDS_ERROR; 1941 status = U_INDEX_OUTOFBOUNDS_ERROR;
1965 return *this; 1942 return *this;
1966 } 1943 }
1967 fMatchEnd = position; 1944 fMatchEnd = position;
1968 return *this; 1945 return *this;
1969 } 1946 }
1970 1947
1971 1948
1972 //------------------------------------------------------------------------------ -- 1949 //------------------------------------------------------------------------------ --
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
2036 2013
2037 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2014 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2038 if (destText == NULL) { 2015 if (destText == NULL) {
2039 status = U_MEMORY_ALLOCATION_ERROR; 2016 status = U_MEMORY_ALLOCATION_ERROR;
2040 return 0; 2017 return 0;
2041 } 2018 }
2042 int32_t i; 2019 int32_t i;
2043 for (i = 0; i < destCapacity; i++) { 2020 for (i = 0; i < destCapacity; i++) {
2044 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2021 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2045 } 2022 }
2046 2023
2047 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2024 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2048 2025
2049 for (i = 0; i < destCapacity; i++) { 2026 for (i = 0; i < destCapacity; i++) {
2050 utext_close(destText[i]); 2027 utext_close(destText[i]);
2051 } 2028 }
2052 2029
2053 uprv_free(destText); 2030 uprv_free(destText);
2054 utext_close(&inputText); 2031 utext_close(&inputText);
2055 return fieldCount; 2032 return fieldCount;
2056 } 2033 }
2057 2034
2058 // 2035 //
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
2094 // There is one or zero output string left. 2071 // There is one or zero output string left.
2095 // Fill the last output string with whatever is left from the input, then exit the loop. 2072 // Fill the last output string with whatever is left from the input, then exit the loop.
2096 // ( i will be == destCapacity if we filled the output array while processing 2073 // ( i will be == destCapacity if we filled the output array while processing
2097 // capture groups of the delimiter expression, in which case we will discard the 2074 // capture groups of the delimiter expression, in which case we will discard the
2098 // last capture group saved in favor of the unprocessed remainder of the 2075 // last capture group saved in favor of the unprocessed remainder of the
2099 // input string.) 2076 // input string.)
2100 i = destCapacity-1; 2077 i = destCapacity-1;
2101 if (fActiveLimit > nextOutputStringStart) { 2078 if (fActiveLimit > nextOutputStringStart) {
2102 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2079 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2103 if (dest[i]) { 2080 if (dest[i]) {
2104 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2081 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2105 input->chunkContents+nextOutputStringStart , 2082 input->chunkContents+nextOutputStringStart ,
2106 (int32_t)(fActiveLimit-nextOutputStringSta rt), &status); 2083 (int32_t)(fActiveLimit-nextOutputStringSta rt), &status);
2107 } else { 2084 } else {
2108 UText remainingText = UTEXT_INITIALIZER; 2085 UText remainingText = UTEXT_INITIALIZER;
2109 utext_openUChars(&remainingText, input->chunkContents+ne xtOutputStringStart, 2086 utext_openUChars(&remainingText, input->chunkContents+ne xtOutputStringStart,
2110 fActiveLimit-nextOutputStringStart, &st atus); 2087 fActiveLimit-nextOutputStringStart, &st atus);
2111 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2088 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2112 utext_close(&remainingText); 2089 utext_close(&remainingText);
2113 } 2090 }
2114 } else { 2091 } else {
2115 UErrorCode lengthStatus = U_ZERO_ERROR; 2092 UErrorCode lengthStatus = U_ZERO_ERROR;
2116 int32_t remaining16Length = 2093 int32_t remaining16Length =
2117 utext_extract(input, nextOutputStringStart, fActiveLimit , NULL, 0, &lengthStatus); 2094 utext_extract(input, nextOutputStringStart, fActiveLimit , NULL, 0, &lengthStatus);
2118 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*( remaining16Length+1)); 2095 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*( remaining16Length+1));
2119 if (remainingChars == NULL) { 2096 if (remainingChars == NULL) {
2120 status = U_MEMORY_ALLOCATION_ERROR; 2097 status = U_MEMORY_ALLOCATION_ERROR;
2121 break; 2098 break;
2122 } 2099 }
2123 2100
2124 utext_extract(input, nextOutputStringStart, fActiveLimit, re mainingChars, remaining16Length+1, &status); 2101 utext_extract(input, nextOutputStringStart, fActiveLimit, re mainingChars, remaining16Length+1, &status);
2125 if (dest[i]) { 2102 if (dest[i]) {
2126 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r emainingChars, remaining16Length, &status); 2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), r emainingChars, remaining16Length, &status);
2127 } else { 2104 } else {
2128 UText remainingText = UTEXT_INITIALIZER; 2105 UText remainingText = UTEXT_INITIALIZER;
2129 utext_openUChars(&remainingText, remainingChars, remaini ng16Length, &status); 2106 utext_openUChars(&remainingText, remainingChars, remaini ng16Length, &status);
2130 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2107 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2131 utext_close(&remainingText); 2108 utext_close(&remainingText);
2132 } 2109 }
2133 2110
2134 uprv_free(remainingChars); 2111 uprv_free(remainingChars);
2135 } 2112 }
2136 } 2113 }
2137 break; 2114 break;
2138 } 2115 }
2139 if (find()) { 2116 if (find()) {
2140 // We found another delimiter. Move everything from where we started looking 2117 // We found another delimiter. Move everything from where we started looking
2141 // up until the start of the delimiter into the next output string. 2118 // up until the start of the delimiter into the next output string.
2142 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2119 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2143 if (dest[i]) { 2120 if (dest[i]) {
2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2121 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2145 input->chunkContents+nextOutputStringStart, 2122 input->chunkContents+nextOutputStringStart,
2146 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2123 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2147 } else { 2124 } else {
2148 UText remainingText = UTEXT_INITIALIZER; 2125 UText remainingText = UTEXT_INITIALIZER;
2149 utext_openUChars(&remainingText, input->chunkContents+nextOu tputStringStart, 2126 utext_openUChars(&remainingText, input->chunkContents+nextOu tputStringStart,
2150 fMatchStart-nextOutputStringStart, &status ); 2127 fMatchStart-nextOutputStringStart, &status );
2151 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus); 2128 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus);
2152 utext_close(&remainingText); 2129 utext_close(&remainingText);
2153 } 2130 }
2154 } else { 2131 } else {
2155 UErrorCode lengthStatus = U_ZERO_ERROR; 2132 UErrorCode lengthStatus = U_ZERO_ERROR;
2156 int32_t remaining16Length = utext_extract(input, nextOutputStrin gStart, fMatchStart, NULL, 0, &lengthStatus); 2133 int32_t remaining16Length = utext_extract(input, nextOutputStrin gStart, fMatchStart, NULL, 0, &lengthStatus);
2157 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema ining16Length+1)); 2134 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema ining16Length+1));
2158 if (remainingChars == NULL) { 2135 if (remainingChars == NULL) {
2159 status = U_MEMORY_ALLOCATION_ERROR; 2136 status = U_MEMORY_ALLOCATION_ERROR;
2160 break; 2137 break;
2161 } 2138 }
2162 utext_extract(input, nextOutputStringStart, fMatchStart, remaini ngChars, remaining16Length+1, &status); 2139 utext_extract(input, nextOutputStringStart, fMatchStart, remaini ngChars, remaining16Length+1, &status);
2163 if (dest[i]) { 2140 if (dest[i]) {
2164 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai ningChars, remaining16Length, &status); 2141 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai ningChars, remaining16Length, &status);
2165 } else { 2142 } else {
2166 UText remainingText = UTEXT_INITIALIZER; 2143 UText remainingText = UTEXT_INITIALIZER;
2167 utext_openUChars(&remainingText, remainingChars, remaining16 Length, &status); 2144 utext_openUChars(&remainingText, remainingChars, remaining16 Length, &status);
2168 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus); 2145 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus);
2169 utext_close(&remainingText); 2146 utext_close(&remainingText);
2170 } 2147 }
2171 2148
2172 uprv_free(remainingChars); 2149 uprv_free(remainingChars);
2173 } 2150 }
2174 nextOutputStringStart = fMatchEnd; 2151 nextOutputStringStart = fMatchEnd;
2175 2152
2176 // If the delimiter pattern has capturing parentheses, the captured 2153 // If the delimiter pattern has capturing parentheses, the captured
2177 // text goes out into the next n destination strings. 2154 // text goes out into the next n destination strings.
2178 int32_t groupNum; 2155 int32_t groupNum;
2179 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2156 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2180 if (i >= destCapacity-2) { 2157 if (i >= destCapacity-2) {
2181 // Never fill the last available output string with capture group text. 2158 // Never fill the last available output string with capture group text.
(...skipping 12 matching lines...) Expand all
2194 if (i+1 < destCapacity) { 2171 if (i+1 < destCapacity) {
2195 ++i; 2172 ++i;
2196 if (dest[i] == NULL) { 2173 if (dest[i] == NULL) {
2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status); 2174 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198 } else { 2175 } else {
2199 static UChar emptyString[] = {(UChar)0}; 2176 static UChar emptyString[] = {(UChar)0};
2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e mptyString, 0, &status); 2177 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), e mptyString, 0, &status);
2201 } 2178 }
2202 } 2179 }
2203 break; 2180 break;
2204 2181
2205 } 2182 }
2206 } 2183 }
2207 else 2184 else
2208 { 2185 {
2209 // We ran off the end of the input while looking for the next delimiter. 2186 // We ran off the end of the input while looking for the next delimiter.
2210 // All the remaining text goes into the current output string. 2187 // All the remaining text goes into the current output string.
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2188 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212 if (dest[i]) { 2189 if (dest[i]) {
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2190 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214 input->chunkContents+nextOutputStringStart, 2191 input->chunkContents+nextOutputStringStart,
2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2192 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216 } else { 2193 } else {
2217 UText remainingText = UTEXT_INITIALIZER; 2194 UText remainingText = UTEXT_INITIALIZER;
2218 utext_openUChars(&remainingText, input->chunkContents+nextOu tputStringStart, 2195 utext_openUChars(&remainingText, input->chunkContents+nextOu tputStringStart,
2219 fActiveLimit-nextOutputStringStart, &status ); 2196 fActiveLimit-nextOutputStringStart, &status );
2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus); 2197 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus);
2221 utext_close(&remainingText); 2198 utext_close(&remainingText);
2222 } 2199 }
2223 } else { 2200 } else {
2224 UErrorCode lengthStatus = U_ZERO_ERROR; 2201 UErrorCode lengthStatus = U_ZERO_ERROR;
2225 int32_t remaining16Length = utext_extract(input, nextOutputStrin gStart, fActiveLimit, NULL, 0, &lengthStatus); 2202 int32_t remaining16Length = utext_extract(input, nextOutputStrin gStart, fActiveLimit, NULL, 0, &lengthStatus);
2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema ining16Length+1)); 2203 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(rema ining16Length+1));
2227 if (remainingChars == NULL) { 2204 if (remainingChars == NULL) {
2228 status = U_MEMORY_ALLOCATION_ERROR; 2205 status = U_MEMORY_ALLOCATION_ERROR;
2229 break; 2206 break;
2230 } 2207 }
2231 2208
2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remain ingChars, remaining16Length+1, &status); 2209 utext_extract(input, nextOutputStringStart, fActiveLimit, remain ingChars, remaining16Length+1, &status);
2233 if (dest[i]) { 2210 if (dest[i]) {
2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai ningChars, remaining16Length, &status); 2211 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remai ningChars, remaining16Length, &status);
2235 } else { 2212 } else {
2236 UText remainingText = UTEXT_INITIALIZER; 2213 UText remainingText = UTEXT_INITIALIZER;
2237 utext_openUChars(&remainingText, remainingChars, remaining16 Length, &status); 2214 utext_openUChars(&remainingText, remainingChars, remaining16 Length, &status);
2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus); 2215 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &st atus);
2239 utext_close(&remainingText); 2216 utext_close(&remainingText);
2240 } 2217 }
2241 2218
2242 uprv_free(remainingChars); 2219 uprv_free(remainingChars);
2243 } 2220 }
2244 break; 2221 break;
2245 } 2222 }
2246 if (U_FAILURE(status)) { 2223 if (U_FAILURE(status)) {
2247 break; 2224 break;
2248 } 2225 }
2249 } // end of for loop 2226 } // end of for loop
2250 return i+1; 2227 return i+1;
2251 } 2228 }
(...skipping 29 matching lines...) Expand all
2281 if (fMatch == FALSE) { 2258 if (fMatch == FALSE) {
2282 status = U_REGEX_INVALID_STATE; 2259 status = U_REGEX_INVALID_STATE;
2283 return -1; 2260 return -1;
2284 } 2261 }
2285 if (group < 0 || group > fPattern->fGroupMap->size()) { 2262 if (group < 0 || group > fPattern->fGroupMap->size()) {
2286 status = U_INDEX_OUTOFBOUNDS_ERROR; 2263 status = U_INDEX_OUTOFBOUNDS_ERROR;
2287 return -1; 2264 return -1;
2288 } 2265 }
2289 int64_t s; 2266 int64_t s;
2290 if (group == 0) { 2267 if (group == 0) {
2291 s = fMatchStart; 2268 s = fMatchStart;
2292 } else { 2269 } else {
2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2270 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294 U_ASSERT(groupOffset < fPattern->fFrameSize); 2271 U_ASSERT(groupOffset < fPattern->fFrameSize);
2295 U_ASSERT(groupOffset >= 0); 2272 U_ASSERT(groupOffset >= 0);
2296 s = fFrame->fExtra[groupOffset]; 2273 s = fFrame->fExtra[groupOffset];
2297 } 2274 }
2298 2275
2299 return s; 2276 return s;
2300 } 2277 }
2301 2278
2302 2279
2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2280 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304 return (int32_t)start64(group, status); 2281 return (int32_t)start64(group, status);
2305 } 2282 }
2306 2283
2307 //------------------------------------------------------------------------------ -- 2284 //------------------------------------------------------------------------------ --
2308 // 2285 //
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
2370 return; 2347 return;
2371 } 2348 }
2372 if (U_FAILURE(fDeferredStatus)) { 2349 if (U_FAILURE(fDeferredStatus)) {
2373 status = fDeferredStatus; 2350 status = fDeferredStatus;
2374 return; 2351 return;
2375 } 2352 }
2376 if (limit < 0) { 2353 if (limit < 0) {
2377 status = U_ILLEGAL_ARGUMENT_ERROR; 2354 status = U_ILLEGAL_ARGUMENT_ERROR;
2378 return; 2355 return;
2379 } 2356 }
2380 2357
2381 // Reset the matcher. This is needed here in case there is a current match 2358 // Reset the matcher. This is needed here in case there is a current match
2382 // whose final stack frame (containing the match results, pointed to by fFrame) 2359 // whose final stack frame (containing the match results, pointed to by fFrame)
2383 // would be lost by resizing to a smaller stack size. 2360 // would be lost by resizing to a smaller stack size.
2384 reset(); 2361 reset();
2385 2362
2386 if (limit == 0) { 2363 if (limit == 0) {
2387 // Unlimited stack expansion 2364 // Unlimited stack expansion
2388 fStack->setMaxCapacity(0); 2365 fStack->setMaxCapacity(0);
2389 } else { 2366 } else {
2390 // Change the units of the limit from bytes to ints, and bump the size up 2367 // Change the units of the limit from bytes to ints, and bump the size up
2391 // to be big enough to hold at least one stack frame for the pattern, 2368 // to be big enough to hold at least one stack frame for the pattern,
2392 // if it isn't there already. 2369 // if it isn't there already.
2393 int32_t adjustedLimit = limit / sizeof(int32_t); 2370 int32_t adjustedLimit = limit / sizeof(int32_t);
2394 if (adjustedLimit < fPattern->fFrameSize) { 2371 if (adjustedLimit < fPattern->fFrameSize) {
2395 adjustedLimit = fPattern->fFrameSize; 2372 adjustedLimit = fPattern->fFrameSize;
2396 } 2373 }
2397 fStack->setMaxCapacity(adjustedLimit); 2374 fStack->setMaxCapacity(adjustedLimit);
2398 } 2375 }
2399 fStackLimit = limit; 2376 fStackLimit = limit;
2400 } 2377 }
2401 2378
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
2479 // Code following this point in this file is the internal 2456 // Code following this point in this file is the internal
2480 // Match Engine Implementation. 2457 // Match Engine Implementation.
2481 // 2458 //
2482 //============================================================================== == 2459 //============================================================================== ==
2483 2460
2484 2461
2485 //------------------------------------------------------------------------------ -- 2462 //------------------------------------------------------------------------------ --
2486 // 2463 //
2487 // resetStack 2464 // resetStack
2488 // Discard any previous contents of the state save stack, and initialize a 2465 // Discard any previous contents of the state save stack, and initialize a
2489 // new stack frame to all -1. The -1s are needed for capture group limits, 2466 // new stack frame to all -1. The -1s are needed for capture group limits,
2490 // where they indicate that a group has not yet matched anything. 2467 // where they indicate that a group has not yet matched anything.
2491 //------------------------------------------------------------------------------ -- 2468 //------------------------------------------------------------------------------ --
2492 REStackFrame *RegexMatcher::resetStack() { 2469 REStackFrame *RegexMatcher::resetStack() {
2493 // Discard any previous contents of the state save stack, and initialize a 2470 // Discard any previous contents of the state save stack, and initialize a
2494 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2471 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2495 // where they indicate that a group has not yet matched anything. 2472 // where they indicate that a group has not yet matched anything.
2496 fStack->removeAllElements(); 2473 fStack->removeAllElements();
2497 2474
2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame Size, fDeferredStatus); 2475 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrame Size, fDeferredStatus);
2499 int32_t i; 2476 int32_t i;
2500 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2477 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2501 iFrame->fExtra[i] = -1; 2478 iFrame->fExtra[i] = -1;
2502 } 2479 }
2503 return iFrame; 2480 return iFrame;
2504 } 2481 }
2505 2482
2506 2483
2507 2484
2508 //------------------------------------------------------------------------------ -- 2485 //------------------------------------------------------------------------------ --
2509 // 2486 //
2510 // isWordBoundary 2487 // isWordBoundary
2511 // in perl, "xab..cd..", \b is true at positions 0,3,5,7 2488 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2512 // For us, 2489 // For us,
2513 // If the current char is a combining mark, 2490 // If the current char is a combining mark,
2514 // \b is FALSE. 2491 // \b is FALSE.
2515 // Else Scan backwards to the first non-combining char. 2492 // Else Scan backwards to the first non-combining char.
2516 // We are at a boundary if this char and the original char are 2493 // We are at a boundary if this char and the original char are
2517 // opposite in membership in \w set 2494 // opposite in membership in \w set
2518 // 2495 //
2519 // parameters: pos - the current position in the input buffer 2496 // parameters: pos - the current position in the input buffer
2520 // 2497 //
2521 // TODO: double-check edge cases at region boundaries. 2498 // TODO: double-check edge cases at region boundaries.
2522 // 2499 //
2523 //------------------------------------------------------------------------------ -- 2500 //------------------------------------------------------------------------------ --
2524 UBool RegexMatcher::isWordBoundary(int64_t pos) { 2501 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2525 UBool isBoundary = FALSE; 2502 UBool isBoundary = FALSE;
2526 UBool cIsWord = FALSE; 2503 UBool cIsWord = FALSE;
2527 2504
2528 if (pos >= fLookLimit) { 2505 if (pos >= fLookLimit) {
2529 fHitEnd = TRUE; 2506 fHitEnd = TRUE;
2530 } else { 2507 } else {
2531 // Determine whether char c at current position is a member of the word set of chars. 2508 // Determine whether char c at current position is a member of the word set of chars.
2532 // If we're off the end of the string, behave as though we're not at a word char. 2509 // If we're off the end of the string, behave as though we're not at a word char.
2533 UTEXT_SETNATIVEINDEX(fInputText, pos); 2510 UTEXT_SETNATIVEINDEX(fInputText, pos);
2534 UChar32 c = UTEXT_CURRENT32(fInputText); 2511 UChar32 c = UTEXT_CURRENT32(fInputText);
2535 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_ FORMAT_CHAR) { 2512 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_ FORMAT_CHAR) {
2536 // Current char is a combining one. Not a boundary. 2513 // Current char is a combining one. Not a boundary.
2537 return FALSE; 2514 return FALSE;
2538 } 2515 }
2539 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2516 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2540 } 2517 }
2541 2518
2542 // Back up until we come to a non-combining char, determine whether 2519 // Back up until we come to a non-combining char, determine whether
2543 // that char is a word char. 2520 // that char is a word char.
2544 UBool prevCIsWord = FALSE; 2521 UBool prevCIsWord = FALSE;
2545 for (;;) { 2522 for (;;) {
2546 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2523 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2547 break; 2524 break;
2548 } 2525 }
2549 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2526 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2550 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2527 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2551 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2528 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2552 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh ar); 2529 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh ar);
2553 break; 2530 break;
2554 } 2531 }
2555 } 2532 }
2556 isBoundary = cIsWord ^ prevCIsWord; 2533 isBoundary = cIsWord ^ prevCIsWord;
2557 return isBoundary; 2534 return isBoundary;
2558 } 2535 }
2559 2536
2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2537 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2561 UBool isBoundary = FALSE; 2538 UBool isBoundary = FALSE;
2562 UBool cIsWord = FALSE; 2539 UBool cIsWord = FALSE;
2563 2540
2564 const UChar *inputBuf = fInputText->chunkContents; 2541 const UChar *inputBuf = fInputText->chunkContents;
2565 2542
2566 if (pos >= fLookLimit) { 2543 if (pos >= fLookLimit) {
2567 fHitEnd = TRUE; 2544 fHitEnd = TRUE;
2568 } else { 2545 } else {
2569 // Determine whether char c at current position is a member of the word set of chars. 2546 // Determine whether char c at current position is a member of the word set of chars.
2570 // If we're off the end of the string, behave as though we're not at a word char. 2547 // If we're off the end of the string, behave as though we're not at a word char.
2571 UChar32 c; 2548 UChar32 c;
2572 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2549 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2573 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_ FORMAT_CHAR) { 2550 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_ FORMAT_CHAR) {
2574 // Current char is a combining one. Not a boundary. 2551 // Current char is a combining one. Not a boundary.
2575 return FALSE; 2552 return FALSE;
2576 } 2553 }
2577 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2554 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2578 } 2555 }
2579 2556
2580 // Back up until we come to a non-combining char, determine whether 2557 // Back up until we come to a non-combining char, determine whether
2581 // that char is a word char. 2558 // that char is a word char.
2582 UBool prevCIsWord = FALSE; 2559 UBool prevCIsWord = FALSE;
2583 for (;;) { 2560 for (;;) {
2584 if (pos <= fLookStart) { 2561 if (pos <= fLookStart) {
2585 break; 2562 break;
2586 } 2563 }
2587 UChar32 prevChar; 2564 UChar32 prevChar;
2588 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2565 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2589 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2566 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2590 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2567 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2591 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh ar); 2568 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevCh ar);
2592 break; 2569 break;
2593 } 2570 }
2594 } 2571 }
2595 isBoundary = cIsWord ^ prevCIsWord; 2572 isBoundary = cIsWord ^ prevCIsWord;
2596 return isBoundary; 2573 return isBoundary;
2597 } 2574 }
2598 2575
2599 //------------------------------------------------------------------------------ -- 2576 //------------------------------------------------------------------------------ --
2600 // 2577 //
2601 // isUWordBoundary 2578 // isUWordBoundary
2602 // 2579 //
2603 // Test for a word boundary using RBBI word break. 2580 // Test for a word boundary using RBBI word break.
2604 // 2581 //
2605 // parameters: pos - the current position in the input buffer 2582 // parameters: pos - the current position in the input buffer
2606 // 2583 //
2607 //------------------------------------------------------------------------------ -- 2584 //------------------------------------------------------------------------------ --
2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2585 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2609 UBool returnVal = FALSE; 2586 UBool returnVal = FALSE;
2610 #if UCONFIG_NO_BREAK_ITERATION==0 2587 #if UCONFIG_NO_BREAK_ITERATION==0
2611 2588
2612 // If we haven't yet created a break iterator for this matcher, do it now. 2589 // If we haven't yet created a break iterator for this matcher, do it now.
2613 if (fWordBreakItr == NULL) { 2590 if (fWordBreakItr == NULL) {
2614 fWordBreakItr = 2591 fWordBreakItr =
2615 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale:: getEnglish(), fDeferredStatus); 2592 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale:: getEnglish(), fDeferredStatus);
2616 if (U_FAILURE(fDeferredStatus)) { 2593 if (U_FAILURE(fDeferredStatus)) {
2617 return FALSE; 2594 return FALSE;
2618 } 2595 }
2619 fWordBreakItr->setText(fInputText, fDeferredStatus); 2596 fWordBreakItr->setText(fInputText, fDeferredStatus);
2620 } 2597 }
2621 2598
2622 if (pos >= fLookLimit) { 2599 if (pos >= fLookLimit) {
2623 fHitEnd = TRUE; 2600 fHitEnd = TRUE;
2624 returnVal = TRUE; // With Unicode word rules, only positions within th e interior of "real" 2601 returnVal = TRUE; // With Unicode word rules, only positions within th e interior of "real"
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
2656 return; 2633 return;
2657 } 2634 }
2658 } 2635 }
2659 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2636 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2660 status = U_REGEX_TIME_OUT; 2637 status = U_REGEX_TIME_OUT;
2661 } 2638 }
2662 } 2639 }
2663 2640
2664 //------------------------------------------------------------------------------ -- 2641 //------------------------------------------------------------------------------ --
2665 // 2642 //
2666 // ReportFindProgress This function is called once for each advance in the target
2667 // string from the find() function, and calls the user progress callback
2668 // function if there is one installed.
2669 //
2670 // NOTE:
2671 //
2672 // If the match operation needs to be aborted because the user
2673 // callback asked for it, just set an error status.
2674 // The engine will pick that up and stop in its outer loop.
2675 //
2676 //------------------------------------------------------------------------------ --
2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2678 if (fFindProgressCallbackFn != NULL) {
2679 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2680 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2681 return FALSE;
2682 }
2683 }
2684 return TRUE;
2685 }
2686
2687 //------------------------------------------------------------------------------ --
2688 //
2689 // StateSave 2643 // StateSave
2690 // Make a new stack frame, initialized as a copy of the current stack frame. 2644 // Make a new stack frame, initialized as a copy of the current stack frame.
2691 // Set the pattern index in the original stack frame from the operand value 2645 // Set the pattern index in the original stack frame from the operand value
2692 // in the opcode. Execution of the engine continues with the state in 2646 // in the opcode. Execution of the engine continues with the state in
2693 // the newly created stack frame 2647 // the newly created stack frame
2694 // 2648 //
2695 // Note that reserveBlock() may grow the stack, resulting in the 2649 // Note that reserveBlock() may grow the stack, resulting in the
2696 // whole thing being relocated in memory. 2650 // whole thing being relocated in memory.
2697 // 2651 //
2698 // Parameters: 2652 // Parameters:
2699 // fp The top frame pointer when called. At return, a new 2653 // fp The top frame pointer when called. At return, a new
2700 // frame will be present 2654 // frame will be present
2701 // savePatIdx An index into the compiled pattern. Goes into the original 2655 // savePatIdx An index into the compiled pattern. Goes into the original
2702 // (not new) frame. If execution ever back-tracks out of the 2656 // (not new) frame. If execution ever back-tracks out of the
2703 // new frame, this will be where we continue from in the pattern. 2657 // new frame, this will be where we continue from in the pattern.
2704 // Return 2658 // Return
2705 // The new frame pointer. 2659 // The new frame pointer.
2706 // 2660 //
2707 //------------------------------------------------------------------------------ -- 2661 //------------------------------------------------------------------------------ --
2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId x, UErrorCode &status) { 2662 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId x, UErrorCode &status) {
2709 // push storage for a new frame. 2663 // push storage for a new frame.
2710 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2664 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2711 if (newFP == NULL) { 2665 if (newFP == NULL) {
2712 // Failure on attempted stack expansion. 2666 // Failure on attempted stack expansion.
2713 // Stack function set some other error code, change it to a more 2667 // Stack function set some other error code, change it to a more
2714 // specific one for regular expressions. 2668 // specific one for regular expressions.
2715 status = U_REGEX_STACK_OVERFLOW; 2669 status = U_REGEX_STACK_OVERFLOW;
2716 // We need to return a writable stack frame, so just return the 2670 // We need to return a writable stack frame, so just return the
2717 // previous frame. The match operation will stop quickly 2671 // previous frame. The match operation will stop quickly
2718 // because of the error status, after which the frame will never 2672 // because of the error status, after which the frame will never
2719 // be looked at again. 2673 // be looked at again.
2720 return fp; 2674 return fp;
2721 } 2675 }
2722 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2676 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2723 2677
2724 // New stack frame = copy of old top frame. 2678 // New stack frame = copy of old top frame.
2725 int64_t *source = (int64_t *)fp; 2679 int64_t *source = (int64_t *)fp;
2726 int64_t *dest = newFP; 2680 int64_t *dest = newFP;
2727 for (;;) { 2681 for (;;) {
2728 *dest++ = *source++; 2682 *dest++ = *source++;
2729 if (source == newFP) { 2683 if (source == newFP) {
2730 break; 2684 break;
2731 } 2685 }
2732 } 2686 }
2733 2687
2734 fTickCounter--; 2688 fTickCounter--;
2735 if (fTickCounter <= 0) { 2689 if (fTickCounter <= 0) {
2736 IncrementTime(status); // Re-initializes fTickCounter 2690 IncrementTime(status); // Re-initializes fTickCounter
2737 } 2691 }
2738 fp->fPatIdx = savePatIdx; 2692 fp->fPatIdx = savePatIdx;
2739 return (REStackFrame *)newFP; 2693 return (REStackFrame *)newFP;
2740 } 2694 }
2741 2695
2742 2696
2743 //------------------------------------------------------------------------------ -- 2697 //------------------------------------------------------------------------------ --
2744 // 2698 //
2745 // MatchAt This is the actual matching engine. 2699 // MatchAt This is the actual matching engine.
2746 // 2700 //
2747 // startIdx: begin matching at this index. 2701 // startIdx: begin matching at this index.
2748 // toEnd: if true, match must extend to end of the input region 2702 // toEnd: if true, match must extend to end of the input region
2749 // 2703 //
2750 //------------------------------------------------------------------------------ -- 2704 //------------------------------------------------------------------------------ --
2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2705 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match. 2706 UBool isMatch = FALSE; // True if the we have a match.
2753 2707
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara cter matches for searching backwards 2708 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-chara cter matches for searching backwards
2755 2709
2756 int32_t op; // Operation from the compiled pattern, s plit into 2710 int32_t op; // Operation from the compiled pattern, s plit into
2757 int32_t opType; // the opcode 2711 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value. 2712 int32_t opValue; // and the operand value.
2759 2713
2760 #ifdef REGEX_RUN_DEBUG 2714 #ifdef REGEX_RUN_DEBUG
2761 if (fTraceDebug) 2715 if (fTraceDebug)
2762 { 2716 {
2763 printf("MatchAt(startIdx=%ld)\n", startIdx); 2717 printf("MatchAt(startIdx=%ld)\n", startIdx);
2764 printf("Original Pattern: "); 2718 printf("Original Pattern: ");
2765 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2719 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2766 while (c != U_SENTINEL) { 2720 while (c != U_SENTINEL) {
2767 if (c<32 || c>256) { 2721 if (c<32 || c>256) {
2768 c = '.'; 2722 c = '.';
2769 } 2723 }
2770 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2724 printf("%c", c);
2771 2725
2772 c = UTEXT_NEXT32(fPattern->fPattern); 2726 c = UTEXT_NEXT32(fPattern->fPattern);
2773 } 2727 }
2774 printf("\n"); 2728 printf("\n");
2775 printf("Input String: "); 2729 printf("Input String: ");
2776 c = utext_next32From(fInputText, 0); 2730 c = utext_next32From(fInputText, 0);
2777 while (c != U_SENTINEL) { 2731 while (c != U_SENTINEL) {
2778 if (c<32 || c>256) { 2732 if (c<32 || c>256) {
2779 c = '.'; 2733 c = '.';
2780 } 2734 }
2781 printf("%c", c); 2735 printf("%c", c);
2782 2736
2783 c = UTEXT_NEXT32(fInputText); 2737 c = UTEXT_NEXT32(fInputText);
2784 } 2738 }
2785 printf("\n"); 2739 printf("\n");
2786 printf("\n"); 2740 printf("\n");
2787 } 2741 }
2788 #endif 2742 #endif
2789 2743
2790 if (U_FAILURE(status)) { 2744 if (U_FAILURE(status)) {
2791 return; 2745 return;
2792 } 2746 }
2793 2747
2794 // Cache frequently referenced items from the compiled pattern 2748 // Cache frequently referenced items from the compiled pattern
2795 // 2749 //
2796 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2750 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2797 2751
2798 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2752 const UChar *litText = fPattern->fLiteralText.getBuffer();
2799 UVector *sets = fPattern->fSets; 2753 UVector *sets = fPattern->fSets;
2800 2754
2801 fFrameSize = fPattern->fFrameSize; 2755 fFrameSize = fPattern->fFrameSize;
2802 REStackFrame *fp = resetStack(); 2756 REStackFrame *fp = resetStack();
2803 2757
2804 fp->fPatIdx = 0; 2758 fp->fPatIdx = 0;
2805 fp->fInputIdx = startIdx; 2759 fp->fInputIdx = startIdx;
2806 2760
2807 // Zero out the pattern's static data 2761 // Zero out the pattern's static data
2808 int32_t i; 2762 int32_t i;
2809 for (i = 0; i<fPattern->fDataSize; i++) { 2763 for (i = 0; i<fPattern->fDataSize; i++) {
2810 fData[i] = 0; 2764 fData[i] = 0;
2811 } 2765 }
2812 2766
2813 // 2767 //
2814 // Main loop for interpreting the compiled pattern. 2768 // Main loop for interpreting the compiled pattern.
2815 // One iteration of the loop per pattern operation performed. 2769 // One iteration of the loop per pattern operation performed.
2816 // 2770 //
2817 for (;;) { 2771 for (;;) {
2818 #if 0
2819 if (_heapchk() != _HEAPOK) {
2820 fprintf(stderr, "Heap Trouble\n");
2821 }
2822 #endif
2823
2824 op = (int32_t)pat[fp->fPatIdx]; 2772 op = (int32_t)pat[fp->fPatIdx];
2825 opType = URX_TYPE(op); 2773 opType = URX_TYPE(op);
2826 opValue = URX_VAL(op); 2774 opValue = URX_VAL(op);
2827 #ifdef REGEX_RUN_DEBUG 2775 #ifdef REGEX_RUN_DEBUG
2828 if (fTraceDebug) { 2776 if (fTraceDebug) {
2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2777 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 2778 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2831 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2779 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2832 fPattern->dumpOp(fp->fPatIdx); 2780 fPattern->dumpOp(fp->fPatIdx);
2833 } 2781 }
2834 #endif 2782 #endif
2835 fp->fPatIdx++; 2783 fp->fPatIdx++;
2836 2784
2837 switch (opType) { 2785 switch (opType) {
2838 2786
2839 2787
2840 case URX_NOP: 2788 case URX_NOP:
2841 break; 2789 break;
2842 2790
2843 2791
2844 case URX_BACKTRACK: 2792 case URX_BACKTRACK:
2845 // Force a backtrack. In some circumstances, the pattern compiler 2793 // Force a backtrack. In some circumstances, the pattern compiler
2846 // will notice that the pattern can't possibly match anything, and will 2794 // will notice that the pattern can't possibly match anything, and will
(...skipping 23 matching lines...) Expand all
2870 // Strings require two slots in the compiled pattern, one for th e 2818 // Strings require two slots in the compiled pattern, one for th e
2871 // offset to the string text, and one for the length. 2819 // offset to the string text, and one for the length.
2872 2820
2873 int32_t stringStartIdx = opValue; 2821 int32_t stringStartIdx = opValue;
2874 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope rand 2822 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope rand
2875 fp->fPatIdx++; 2823 fp->fPatIdx++;
2876 opType = URX_TYPE(op); 2824 opType = URX_TYPE(op);
2877 int32_t stringLen = URX_VAL(op); 2825 int32_t stringLen = URX_VAL(op);
2878 U_ASSERT(opType == URX_STRING_LEN); 2826 U_ASSERT(opType == URX_STRING_LEN);
2879 U_ASSERT(stringLen >= 2); 2827 U_ASSERT(stringLen >= 2);
2880 2828
2881 const UChar *patternString = litText+stringStartIdx; 2829 const UChar *patternString = litText+stringStartIdx;
2882 int32_t patternStringIndex = 0; 2830 int32_t patternStringIndex = 0;
2883 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2831 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2884 UChar32 inputChar; 2832 UChar32 inputChar;
2885 UChar32 patternChar; 2833 UChar32 patternChar;
2886 UBool success = TRUE; 2834 UBool success = TRUE;
2887 while (patternStringIndex < stringLen) { 2835 while (patternStringIndex < stringLen) {
2888 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { 2836 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2889 success = FALSE; 2837 success = FALSE;
2890 fHitEnd = TRUE; 2838 fHitEnd = TRUE;
2891 break; 2839 break;
2892 } 2840 }
2893 inputChar = UTEXT_NEXT32(fInputText); 2841 inputChar = UTEXT_NEXT32(fInputText);
2894 U16_NEXT(patternString, patternStringIndex, stringLen, patte rnChar); 2842 U16_NEXT(patternString, patternStringIndex, stringLen, patte rnChar);
2895 if (patternChar != inputChar) { 2843 if (patternChar != inputChar) {
2896 success = FALSE; 2844 success = FALSE;
2897 break; 2845 break;
2898 } 2846 }
2899 } 2847 }
2900 2848
2901 if (success) { 2849 if (success) {
2902 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2850 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2903 } else { 2851 } else {
2904 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2852 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2905 } 2853 }
2906 } 2854 }
2907 break; 2855 break;
2908 2856
2909 2857
2910 case URX_STATE_SAVE: 2858 case URX_STATE_SAVE:
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
2945 2893
2946 case URX_DOLLAR: // $, test for End of line 2894 case URX_DOLLAR: // $, test for End of line
2947 // or for position before new lin e at end of input 2895 // or for position before new lin e at end of input
2948 { 2896 {
2949 if (fp->fInputIdx >= fAnchorLimit) { 2897 if (fp->fInputIdx >= fAnchorLimit) {
2950 // We really are at the end of input. Success. 2898 // We really are at the end of input. Success.
2951 fHitEnd = TRUE; 2899 fHitEnd = TRUE;
2952 fRequireEnd = TRUE; 2900 fRequireEnd = TRUE;
2953 break; 2901 break;
2954 } 2902 }
2955 2903
2956 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2904 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2957 2905
2958 // If we are positioned just before a new-line that is located at the 2906 // If we are positioned just before a new-line that is located at the
2959 // end of input, succeed. 2907 // end of input, succeed.
2960 UChar32 c = UTEXT_NEXT32(fInputText); 2908 UChar32 c = UTEXT_NEXT32(fInputText);
2961 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 2909 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2962 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202 9) { 2910 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x202 9) {
2963 // If not in the middle of a CR/LF sequence 2911 // If not in the middle of a CR/LF sequence
2964 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 2912 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTE XT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2965 // At new-line at end of input. Success 2913 // At new-line at end of input. Success
2966 fHitEnd = TRUE; 2914 fHitEnd = TRUE;
2967 fRequireEnd = TRUE; 2915 fRequireEnd = TRUE;
2968 2916
2969 break; 2917 break;
2970 } 2918 }
2971 } 2919 }
2972 } else { 2920 } else {
2973 UChar32 nextC = UTEXT_NEXT32(fInputText); 2921 UChar32 nextC = UTEXT_NEXT32(fInputText);
2974 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu tText) >= fAnchorLimit) { 2922 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInpu tText) >= fAnchorLimit) {
2975 fHitEnd = TRUE; 2923 fHitEnd = TRUE;
2976 fRequireEnd = TRUE; 2924 fRequireEnd = TRUE;
2977 break; // At CR/LF at end of inp ut. Success 2925 break; // At CR/LF at end of inp ut. Success
2978 } 2926 }
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
3058 3006
3059 case URX_CARET_M: // ^, test for start of line in muli t-line mode 3007 case URX_CARET_M: // ^, test for start of line in muli t-line mode
3060 { 3008 {
3061 if (fp->fInputIdx == fAnchorStart) { 3009 if (fp->fInputIdx == fAnchorStart) {
3062 // We are at the start input. Success. 3010 // We are at the start input. Success.
3063 break; 3011 break;
3064 } 3012 }
3065 // Check whether character just before the current pos is a new-line 3013 // Check whether character just before the current pos is a new-line
3066 // unless we are at the end of input 3014 // unless we are at the end of input
3067 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3015 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3068 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3016 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3069 if ((fp->fInputIdx < fAnchorLimit) && 3017 if ((fp->fInputIdx < fAnchorLimit) &&
3070 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3018 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3071 // It's a new-line. ^ is true. Success. 3019 // It's a new-line. ^ is true. Success.
3072 // TODO: what should be done with positions between a CR and LF? 3020 // TODO: what should be done with positions between a CR and LF?
3073 break; 3021 break;
3074 } 3022 }
3075 // Not at the start of a line. Fail. 3023 // Not at the start of a line. Fail.
3076 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3024 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077 } 3025 }
3078 break; 3026 break;
3079 3027
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
3141 break; 3089 break;
3142 3090
3143 3091
3144 case URX_BACKSLASH_G: // Test for position at end of previous m atch 3092 case URX_BACKSLASH_G: // Test for position at end of previous m atch
3145 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) { 3093 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) {
3146 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3094 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147 } 3095 }
3148 break; 3096 break;
3149 3097
3150 3098
3151 case URX_BACKSLASH_X: 3099 case URX_BACKSLASH_X:
3152 // Match a Grapheme, as defined by Unicode TR 29. 3100 // Match a Grapheme, as defined by Unicode TR 29.
3153 // Differs slightly from Perl, which consumes combining marks independently 3101 // Differs slightly from Perl, which consumes combining marks independently
3154 // of context. 3102 // of context.
3155 { 3103 {
3156 3104
3157 // Fail if at end of input 3105 // Fail if at end of input
3158 if (fp->fInputIdx >= fActiveLimit) { 3106 if (fp->fInputIdx >= fActiveLimit) {
3159 fHitEnd = TRUE; 3107 fHitEnd = TRUE;
3160 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161 break; 3109 break;
3162 } 3110 }
3163 3111
3164 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3112 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3165 3113
3166 // Examine (and consume) the current char. 3114 // Examine (and consume) the current char.
3167 // Dispatch into a little state machine, based on the char. 3115 // Dispatch into a little state machine, based on the char.
3168 UChar32 c; 3116 UChar32 c;
3169 c = UTEXT_NEXT32(fInputText); 3117 c = UTEXT_NEXT32(fInputText);
3170 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3118 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3171 UnicodeSet **sets = fPattern->fStaticSets; 3119 UnicodeSet **sets = fPattern->fStaticSets;
3172 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3120 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3173 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3121 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
3220 c = UTEXT_CURRENT32(fInputText); 3168 c = UTEXT_CURRENT32(fInputText);
3221 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3169 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3222 break; 3170 break;
3223 } 3171 }
3224 (void)UTEXT_NEXT32(fInputText); 3172 (void)UTEXT_NEXT32(fInputText);
3225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3173 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3226 } 3174 }
3227 goto GC_Done; 3175 goto GC_Done;
3228 3176
3229 GC_Control: 3177 GC_Control:
3230 // Most control chars stand alone (don't combine with combining chars), 3178 // Most control chars stand alone (don't combine with combining chars),
3231 // except that a CR/LF sequence is a single grapheme cluster. 3179 // except that a CR/LF sequence is a single grapheme cluster.
3232 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32 (fInputText) == 0x0a) { 3180 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32 (fInputText) == 0x0a) {
3233 c = UTEXT_NEXT32(fInputText); 3181 c = UTEXT_NEXT32(fInputText);
3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3182 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235 } 3183 }
3236 3184
3237 GC_Done: 3185 GC_Done:
3238 if (fp->fInputIdx >= fActiveLimit) { 3186 if (fp->fInputIdx >= fActiveLimit) {
3239 fHitEnd = TRUE; 3187 fHitEnd = TRUE;
3240 } 3188 }
3241 break; 3189 break;
3242 } 3190 }
3243
3244 3191
3245 3192
3246 3193
3194
3247 case URX_BACKSLASH_Z: // Test for end of Input 3195 case URX_BACKSLASH_Z: // Test for end of Input
3248 if (fp->fInputIdx < fAnchorLimit) { 3196 if (fp->fInputIdx < fAnchorLimit) {
3249 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250 } else { 3198 } else {
3251 fHitEnd = TRUE; 3199 fHitEnd = TRUE;
3252 fRequireEnd = TRUE; 3200 fRequireEnd = TRUE;
3253 } 3201 }
3254 break; 3202 break;
3255 3203
3256 3204
3257 3205
3258 case URX_STATIC_SETREF: 3206 case URX_STATIC_SETREF:
3259 { 3207 {
3260 // Test input character against one of the predefined sets 3208 // Test input character against one of the predefined sets
3261 // (Word Characters, for example) 3209 // (Word Characters, for example)
3262 // The high bit of the op value is a flag for the match polarity. 3210 // The high bit of the op value is a flag for the match polarity.
3263 // 0: success if input char is in set. 3211 // 0: success if input char is in set.
3264 // 1: success if input char is not in set. 3212 // 1: success if input char is not in set.
3265 if (fp->fInputIdx >= fActiveLimit) { 3213 if (fp->fInputIdx >= fActiveLimit) {
3266 fHitEnd = TRUE; 3214 fHitEnd = TRUE;
3267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3215 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3268 break; 3216 break;
3269 } 3217 }
3270 3218
3271 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3219 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3272 opValue &= ~URX_NEG_SET; 3220 opValue &= ~URX_NEG_SET;
3273 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3221 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3274 3222
3275 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3223 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3276 UChar32 c = UTEXT_NEXT32(fInputText); 3224 UChar32 c = UTEXT_NEXT32(fInputText);
3277 if (c < 256) { 3225 if (c < 256) {
3278 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3226 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3279 if (s8->contains(c)) { 3227 if (s8->contains(c)) {
3280 success = !success; 3228 success = !success;
3281 } 3229 }
3282 } else { 3230 } else {
3283 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3231 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3284 if (s->contains(c)) { 3232 if (s->contains(c)) {
3285 success = !success; 3233 success = !success;
3286 } 3234 }
3287 } 3235 }
3288 if (success) { 3236 if (success) {
3289 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3237 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3290 } else { 3238 } else {
3291 // the character wasn't in the set. 3239 // the character wasn't in the set.
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3240 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 } 3241 }
3294 } 3242 }
3295 break; 3243 break;
3296 3244
3297 3245
3298 case URX_STAT_SETREF_N: 3246 case URX_STAT_SETREF_N:
3299 { 3247 {
3300 // Test input character for NOT being a member of one of 3248 // Test input character for NOT being a member of one of
3301 // the predefined sets (Word Characters, for example) 3249 // the predefined sets (Word Characters, for example)
3302 if (fp->fInputIdx >= fActiveLimit) { 3250 if (fp->fInputIdx >= fActiveLimit) {
3303 fHitEnd = TRUE; 3251 fHitEnd = TRUE;
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3252 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305 break; 3253 break;
3306 } 3254 }
3307 3255
3308 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3256 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309 3257
3310 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3258 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311 3259
3312 UChar32 c = UTEXT_NEXT32(fInputText); 3260 UChar32 c = UTEXT_NEXT32(fInputText);
3313 if (c < 256) { 3261 if (c < 256) {
3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3262 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315 if (s8->contains(c) == FALSE) { 3263 if (s8->contains(c) == FALSE) {
3316 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3264 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3317 break; 3265 break;
3318 } 3266 }
3319 } else { 3267 } else {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3268 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c) == FALSE) { 3269 if (s->contains(c) == FALSE) {
3322 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3323 break; 3271 break;
3324 } 3272 }
3325 } 3273 }
3326 // the character wasn't in the set. 3274 // the character wasn't in the set.
3327 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3275 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328 } 3276 }
3329 break; 3277 break;
3330 3278
3331 3279
3332 case URX_SETREF: 3280 case URX_SETREF:
3333 if (fp->fInputIdx >= fActiveLimit) { 3281 if (fp->fInputIdx >= fActiveLimit) {
3334 fHitEnd = TRUE; 3282 fHitEnd = TRUE;
3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3283 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336 break; 3284 break;
3337 } else { 3285 } else {
3338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3286 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3339 3287
3340 // There is input left. Pick up one char and test it for set membership. 3288 // There is input left. Pick up one char and test it for set membership.
3341 UChar32 c = UTEXT_NEXT32(fInputText); 3289 UChar32 c = UTEXT_NEXT32(fInputText);
3342 U_ASSERT(opValue > 0 && opValue < sets->size()); 3290 U_ASSERT(opValue > 0 && opValue < sets->size());
3343 if (c<256) { 3291 if (c<256) {
3344 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3292 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3345 if (s8->contains(c)) { 3293 if (s8->contains(c)) {
3346 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3294 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3347 break; 3295 break;
3348 } 3296 }
3349 } else { 3297 } else {
3350 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3298 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3351 if (s->contains(c)) { 3299 if (s->contains(c)) {
3352 // The character is in the set. A Match. 3300 // The character is in the set. A Match.
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3301 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354 break; 3302 break;
3355 } 3303 }
3356 } 3304 }
3357 3305
3358 // the character wasn't in the set. 3306 // the character wasn't in the set.
3359 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3307 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3360 } 3308 }
3361 break; 3309 break;
3362 3310
3363 3311
3364 case URX_DOTANY: 3312 case URX_DOTANY:
3365 { 3313 {
3366 // . matches anything, but stops at end-of-line. 3314 // . matches anything, but stops at end-of-line.
3367 if (fp->fInputIdx >= fActiveLimit) { 3315 if (fp->fInputIdx >= fActiveLimit) {
3368 // At end of input. Match failed. Backtrack out. 3316 // At end of input. Match failed. Backtrack out.
3369 fHitEnd = TRUE; 3317 fHitEnd = TRUE;
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3318 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 break; 3319 break;
3372 } 3320 }
3373 3321
3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3322 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3375 3323
3376 // There is input left. Advance over one char, unless we've hit end-of-line 3324 // There is input left. Advance over one char, unless we've hit end-of-line
3377 UChar32 c = UTEXT_NEXT32(fInputText); 3325 UChar32 c = UTEXT_NEXT32(fInputText);
3378 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3326 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3379 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3327 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3380 // End of line in normal mode. . does not match. 3328 // End of line in normal mode. . does not match.
3381 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382 break; 3330 break;
3383 } 3331 }
3384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3385 } 3333 }
3386 break; 3334 break;
3387 3335
3388 3336
3389 case URX_DOTANY_ALL: 3337 case URX_DOTANY_ALL:
3390 { 3338 {
3391 // ., in dot-matches-all (including new lines) mode 3339 // ., in dot-matches-all (including new lines) mode
3392 if (fp->fInputIdx >= fActiveLimit) { 3340 if (fp->fInputIdx >= fActiveLimit) {
3393 // At end of input. Match failed. Backtrack out. 3341 // At end of input. Match failed. Backtrack out.
3394 fHitEnd = TRUE; 3342 fHitEnd = TRUE;
3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3343 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396 break; 3344 break;
3397 } 3345 }
3398 3346
3399 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3400 3348
3401 // There is input left. Advance over one char, except if we are 3349 // There is input left. Advance over one char, except if we are
3402 // at a cr/lf, advance over both of them. 3350 // at a cr/lf, advance over both of them.
3403 UChar32 c; 3351 UChar32 c;
3404 c = UTEXT_NEXT32(fInputText); 3352 c = UTEXT_NEXT32(fInputText);
3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3354 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3407 // In the case of a CR/LF, we need to advance over both. 3355 // In the case of a CR/LF, we need to advance over both.
3408 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3356 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3409 if (nextc == 0x0a) { 3357 if (nextc == 0x0a) {
3410 (void)UTEXT_NEXT32(fInputText); 3358 (void)UTEXT_NEXT32(fInputText);
3411 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3412 } 3360 }
3413 } 3361 }
3414 } 3362 }
3415 break; 3363 break;
3416 3364
3417 3365
3418 case URX_DOTANY_UNIX: 3366 case URX_DOTANY_UNIX:
3419 { 3367 {
3420 // '.' operator, matches all, but stops at end-of-line. 3368 // '.' operator, matches all, but stops at end-of-line.
3421 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3369 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422 if (fp->fInputIdx >= fActiveLimit) { 3370 if (fp->fInputIdx >= fActiveLimit) {
3423 // At end of input. Match failed. Backtrack out. 3371 // At end of input. Match failed. Backtrack out.
3424 fHitEnd = TRUE; 3372 fHitEnd = TRUE;
3425 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3373 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3426 break; 3374 break;
3427 } 3375 }
3428 3376
3429 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3377 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3430 3378
3431 // There is input left. Advance over one char, unless we've hit end-of-line 3379 // There is input left. Advance over one char, unless we've hit end-of-line
3432 UChar32 c = UTEXT_NEXT32(fInputText); 3380 UChar32 c = UTEXT_NEXT32(fInputText);
3433 if (c == 0x0a) { 3381 if (c == 0x0a) {
3434 // End of line in normal mode. '.' does not match the \n 3382 // End of line in normal mode. '.' does not match the \n
3435 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3383 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3436 } else { 3384 } else {
3437 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3385 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3438 } 3386 }
3439 } 3387 }
3440 break; 3388 break;
(...skipping 24 matching lines...) Expand all
3465 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3413 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3466 int32_t frameLoc = URX_VAL(stoOp); 3414 int32_t frameLoc = URX_VAL(stoOp);
3467 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3415 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3468 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3416 int64_t prevInputIdx = fp->fExtra[frameLoc];
3469 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3417 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3470 if (prevInputIdx < fp->fInputIdx) { 3418 if (prevInputIdx < fp->fInputIdx) {
3471 // The match did make progress. Repeat the loop. 3419 // The match did make progress. Repeat the loop.
3472 fp = StateSave(fp, fp->fPatIdx, status); // State save to l oc following current 3420 fp = StateSave(fp, fp->fPatIdx, status); // State save to l oc following current
3473 fp->fPatIdx = opValue; 3421 fp->fPatIdx = opValue;
3474 fp->fExtra[frameLoc] = fp->fInputIdx; 3422 fp->fExtra[frameLoc] = fp->fInputIdx;
3475 } 3423 }
3476 // If the input position did not advance, we do nothing here, 3424 // If the input position did not advance, we do nothing here,
3477 // execution will fall out of the loop. 3425 // execution will fall out of the loop.
3478 } 3426 }
3479 break; 3427 break;
3480 3428
3481 case URX_CTR_INIT: 3429 case URX_CTR_INIT:
3482 { 3430 {
3483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3431 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3484 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero 3432 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero
3485 3433
3486 // Pick up the three extra operands that CTR_INIT has, and 3434 // Pick up the three extra operands that CTR_INIT has, and
3487 // skip the pattern location counter past 3435 // skip the pattern location counter past
3488 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3436 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3489 fp->fPatIdx += 3; 3437 fp->fPatIdx += 3;
3490 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3438 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3491 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3439 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3492 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3440 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3493 U_ASSERT(minCount>=0); 3441 U_ASSERT(minCount>=0);
3494 U_ASSERT(maxCount>=minCount || maxCount==-1); 3442 U_ASSERT(maxCount>=minCount || maxCount==-1);
3495 U_ASSERT(loopLoc>=fp->fPatIdx); 3443 U_ASSERT(loopLoc>=fp->fPatIdx);
3496 3444
3497 if (minCount == 0) { 3445 if (minCount == 0) {
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
3535 } 3483 }
3536 break; 3484 break;
3537 3485
3538 case URX_CTR_INIT_NG: 3486 case URX_CTR_INIT_NG:
3539 { 3487 {
3540 // Initialize a non-greedy loop 3488 // Initialize a non-greedy loop
3541 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3489 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3542 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero 3490 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero
3543 3491
3544 // Pick up the three extra operands that CTR_INIT_NG has, and 3492 // Pick up the three extra operands that CTR_INIT_NG has, and
3545 // skip the pattern location counter past 3493 // skip the pattern location counter past
3546 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3494 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3547 fp->fPatIdx += 3; 3495 fp->fPatIdx += 3;
3548 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3496 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3549 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3497 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3550 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3498 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3551 U_ASSERT(minCount>=0); 3499 U_ASSERT(minCount>=0);
3552 U_ASSERT(maxCount>=minCount || maxCount==-1); 3500 U_ASSERT(maxCount>=minCount || maxCount==-1);
3553 U_ASSERT(loopLoc>fp->fPatIdx); 3501 U_ASSERT(loopLoc>fp->fPatIdx);
3554 if (maxCount == -1) { 3502 if (maxCount == -1) {
3555 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in put index for loop breaking. 3503 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in put index for loop breaking.
3556 } 3504 }
3557 3505
3558 if (minCount == 0) { 3506 if (minCount == 0) {
3559 if (maxCount != 0) { 3507 if (maxCount != 0) {
3560 fp = StateSave(fp, fp->fPatIdx, status); 3508 fp = StateSave(fp, fp->fPatIdx, status);
3561 } 3509 }
3562 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe ated block 3510 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe ated block
3563 } 3511 }
3564 } 3512 }
3565 break; 3513 break;
3566 3514
3567 case URX_CTR_LOOP_NG: 3515 case URX_CTR_LOOP_NG:
3568 { 3516 {
3569 // Non-greedy {min, max} loops 3517 // Non-greedy {min, max} loops
3570 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3518 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3571 int32_t initOp = (int32_t)pat[opValue]; 3519 int32_t initOp = (int32_t)pat[opValue];
3572 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3520 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3573 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3521 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
3640 U_ASSERT(groupStartIdx <= groupEndIdx); 3588 U_ASSERT(groupStartIdx <= groupEndIdx);
3641 if (groupStartIdx < 0) { 3589 if (groupStartIdx < 0) {
3642 // This capture group has not participated in the match thus far, 3590 // This capture group has not participated in the match thus far,
3643 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match. 3591 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match.
3644 break; 3592 break;
3645 } 3593 }
3646 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3594 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3647 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3595 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3648 3596
3649 // Note: if the capture group match was of an empty string the backref 3597 // Note: if the capture group match was of an empty string the backref
3650 // match succeeds. Verified by testing: Perl matches s ucceed 3598 // match succeeds. Verified by testing: Perl matches s ucceed
3651 // in this case, so we do too. 3599 // in this case, so we do too.
3652 3600
3653 UBool success = TRUE; 3601 UBool success = TRUE;
3654 for (;;) { 3602 for (;;) {
3655 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { 3603 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3656 success = TRUE; 3604 success = TRUE;
3657 break; 3605 break;
3658 } 3606 }
3659 if (utext_getNativeIndex(fInputText) >= fActiveLimit) { 3607 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3660 success = FALSE; 3608 success = FALSE;
3661 fHitEnd = TRUE; 3609 fHitEnd = TRUE;
3662 break; 3610 break;
(...skipping 26 matching lines...) Expand all
3689 // This capture group has not participated in the match thus far, 3637 // This capture group has not participated in the match thus far,
3690 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match. 3638 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match.
3691 break; 3639 break;
3692 } 3640 }
3693 utext_setNativeIndex(fAltInputText, groupStartIdx); 3641 utext_setNativeIndex(fAltInputText, groupStartIdx);
3694 utext_setNativeIndex(fInputText, fp->fInputIdx); 3642 utext_setNativeIndex(fInputText, fp->fInputIdx);
3695 CaseFoldingUTextIterator captureGroupItr(*fAltInputText); 3643 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3696 CaseFoldingUTextIterator inputItr(*fInputText); 3644 CaseFoldingUTextIterator inputItr(*fInputText);
3697 3645
3698 // Note: if the capture group match was of an empty string the backref 3646 // Note: if the capture group match was of an empty string the backref
3699 // match succeeds. Verified by testing: Perl matches s ucceed 3647 // match succeeds. Verified by testing: Perl matches s ucceed
3700 // in this case, so we do too. 3648 // in this case, so we do too.
3701 3649
3702 UBool success = TRUE; 3650 UBool success = TRUE;
3703 for (;;) { 3651 for (;;) {
3704 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f AltInputText) >= groupEndIdx) { 3652 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(f AltInputText) >= groupEndIdx) {
3705 success = TRUE; 3653 success = TRUE;
3706 break; 3654 break;
3707 } 3655 }
3708 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe xt) >= fActiveLimit) { 3656 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputTe xt) >= fActiveLimit) {
3709 success = FALSE; 3657 success = FALSE;
3710 fHitEnd = TRUE; 3658 fHitEnd = TRUE;
3711 break; 3659 break;
3712 } 3660 }
3713 UChar32 captureGroupChar = captureGroupItr.next(); 3661 UChar32 captureGroupChar = captureGroupItr.next();
3714 UChar32 inputChar = inputItr.next(); 3662 UChar32 inputChar = inputItr.next();
3715 if (inputChar != captureGroupChar) { 3663 if (inputChar != captureGroupChar) {
3716 success = FALSE; 3664 success = FALSE;
3717 break; 3665 break;
3718 } 3666 }
3719 } 3667 }
3720 3668
3721 if (success && inputItr.inExpansion()) { 3669 if (success && inputItr.inExpansion()) {
3722 // We otained a match by consuming part of a string obtained from 3670 // We otained a match by consuming part of a string obtained from
3723 // case-folding a single code point of the input text. 3671 // case-folding a single code point of the input text.
3724 // This does not count as an overall match. 3672 // This does not count as an overall match.
3725 success = FALSE; 3673 success = FALSE;
3726 } 3674 }
3727 3675
3728 if (success) { 3676 if (success) {
3729 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3677 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3730 } else { 3678 } else {
3731 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3732 } 3680 }
3733 3681
3734 } 3682 }
3735 break; 3683 break;
3736 3684
3737 case URX_STO_INP_LOC: 3685 case URX_STO_INP_LOC:
3738 { 3686 {
3739 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3687 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3740 fp->fExtra[opValue] = fp->fInputIdx; 3688 fp->fExtra[opValue] = fp->fInputIdx;
3741 } 3689 }
3742 break; 3690 break;
3743 3691
3744 case URX_JMPX: 3692 case URX_JMPX:
3745 { 3693 {
3746 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3694 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
3806 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3754 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3807 3755
3808 UChar32 c = UTEXT_NEXT32(fInputText); 3756 UChar32 c = UTEXT_NEXT32(fInputText);
3809 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3757 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3810 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3758 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3811 break; 3759 break;
3812 } 3760 }
3813 } else { 3761 } else {
3814 fHitEnd = TRUE; 3762 fHitEnd = TRUE;
3815 } 3763 }
3816 3764
3817 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3765 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3818 break; 3766 break;
3819 3767
3820 case URX_STRING_I: 3768 case URX_STRING_I:
3821 { 3769 {
3822 // Case-insensitive test input against a literal string. 3770 // Case-insensitive test input against a literal string.
3823 // Strings require two slots in the compiled pattern, one for th e 3771 // Strings require two slots in the compiled pattern, one for th e
3824 // offset to the string text, and one for the length. 3772 // offset to the string text, and one for the length.
3825 // The compiled string has already been case folded. 3773 // The compiled string has already been case folded.
3826 { 3774 {
3827 const UChar *patternString = litText + opValue; 3775 const UChar *patternString = litText + opValue;
3828 int32_t patternStringIdx = 0; 3776 int32_t patternStringIdx = 0;
3829 3777
3830 op = (int32_t)pat[fp->fPatIdx]; 3778 op = (int32_t)pat[fp->fPatIdx];
3831 fp->fPatIdx++; 3779 fp->fPatIdx++;
3832 opType = URX_TYPE(op); 3780 opType = URX_TYPE(op);
3833 opValue = URX_VAL(op); 3781 opValue = URX_VAL(op);
3834 U_ASSERT(opType == URX_STRING_LEN); 3782 U_ASSERT(opType == URX_STRING_LEN);
3835 int32_t patternStringLen = opValue; // Length of the string from the pattern. 3783 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3836 3784
3837 3785
3838 UChar32 cPattern; 3786 UChar32 cPattern;
3839 UChar32 cText; 3787 UChar32 cText;
3840 UBool success = TRUE; 3788 UBool success = TRUE;
3841 3789
3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3790 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3843 CaseFoldingUTextIterator inputIterator(*fInputText); 3791 CaseFoldingUTextIterator inputIterator(*fInputText);
3844 while (patternStringIdx < patternStringLen) { 3792 while (patternStringIdx < patternStringLen) {
3845 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX (fInputText) >= fActiveLimit) { 3793 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX (fInputText) >= fActiveLimit) {
3846 success = FALSE; 3794 success = FALSE;
3847 fHitEnd = TRUE; 3795 fHitEnd = TRUE;
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
3942 // The look-behind expression matched, but the match did no t 3890 // The look-behind expression matched, but the match did no t
3943 // extend all the way to the point that we are looking be hind from. 3891 // extend all the way to the point that we are looking be hind from.
3944 // FAIL out of here, which will take us back to the LB_CONT , which 3892 // FAIL out of here, which will take us back to the LB_CONT , which
3945 // will retry the match starting at another position or fail 3893 // will retry the match starting at another position or fail
3946 // the look-behind altogether, whichever is appropriate. 3894 // the look-behind altogether, whichever is appropriate.
3947 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3895 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3948 break; 3896 break;
3949 } 3897 }
3950 3898
3951 // Look-behind match is good. Restore the orignal input string length, 3899 // Look-behind match is good. Restore the orignal input string length,
3952 // which had been truncated to pin the end of the lookbehind m atch to the 3900 // which had been truncated to pin the end of the lookbehind m atch to the
3953 // position being looked-behind. 3901 // position being looked-behind.
3954 int64_t originalInputLen = fData[opValue+3]; 3902 int64_t originalInputLen = fData[opValue+3];
3955 U_ASSERT(originalInputLen >= fActiveLimit); 3903 U_ASSERT(originalInputLen >= fActiveLimit);
3956 U_ASSERT(originalInputLen <= fInputLength); 3904 U_ASSERT(originalInputLen <= fInputLength);
3957 fActiveLimit = originalInputLen; 3905 fActiveLimit = originalInputLen;
3958 } 3906 }
3959 break; 3907 break;
3960 3908
3961 3909
3962 case URX_LBN_CONT: 3910 case URX_LBN_CONT:
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
4019 // extend all the way to the point that we are looking be hind from. 3967 // extend all the way to the point that we are looking be hind from.
4020 // FAIL out of here, which will take us back to the LB_CONT , which 3968 // FAIL out of here, which will take us back to the LB_CONT , which
4021 // will retry the match starting at another position or succeed 3969 // will retry the match starting at another position or succeed
4022 // the look-behind altogether, whichever is appropriate. 3970 // the look-behind altogether, whichever is appropriate.
4023 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4024 break; 3972 break;
4025 } 3973 }
4026 3974
4027 // Look-behind expression matched, which means look-behind test as 3975 // Look-behind expression matched, which means look-behind test as
4028 // a whole Fails 3976 // a whole Fails
4029 3977
4030 // Restore the orignal input string length, which had been tru ncated 3978 // Restore the orignal input string length, which had been tru ncated
4031 // inorder to pin the end of the lookbehind match 3979 // inorder to pin the end of the lookbehind match
4032 // to the position being looked-behind. 3980 // to the position being looked-behind.
4033 int64_t originalInputLen = fData[opValue+3]; 3981 int64_t originalInputLen = fData[opValue+3];
4034 U_ASSERT(originalInputLen >= fActiveLimit); 3982 U_ASSERT(originalInputLen >= fActiveLimit);
4035 U_ASSERT(originalInputLen <= fInputLength); 3983 U_ASSERT(originalInputLen <= fInputLength);
4036 fActiveLimit = originalInputLen; 3984 fActiveLimit = originalInputLen;
4037 3985
4038 // Restore original stack position, discarding any state saved 3986 // Restore original stack position, discarding any state saved
4039 // by the successful pattern match. 3987 // by the successful pattern match.
4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3988 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4041 int32_t newStackSize = (int32_t)fData[opValue]; 3989 int32_t newStackSize = (int32_t)fData[opValue];
4042 U_ASSERT(fStack->size() > newStackSize); 3990 U_ASSERT(fStack->size() > newStackSize);
4043 fStack->setSize(newStackSize); 3991 fStack->setSize(newStackSize);
4044 3992
4045 // FAIL, which will take control back to someplace 3993 // FAIL, which will take control back to someplace
4046 // prior to entering the look-behind test. 3994 // prior to entering the look-behind test.
4047 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3995 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4048 } 3996 }
4049 break; 3997 break;
4050 3998
4051 3999
4052 case URX_LOOP_SR_I: 4000 case URX_LOOP_SR_I:
4053 // Loop Initialization for the optimized implementation of 4001 // Loop Initialization for the optimized implementation of
4054 // [some character set]* 4002 // [some character set]*
4055 // This op scans through all matching input. 4003 // This op scans through all matching input.
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
4168 break; 4116 break;
4169 4117
4170 4118
4171 case URX_LOOP_C: 4119 case URX_LOOP_C:
4172 { 4120 {
4173 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4121 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4174 backSearchIndex = fp->fExtra[opValue]; 4122 backSearchIndex = fp->fExtra[opValue];
4175 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4123 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4176 if (backSearchIndex == fp->fInputIdx) { 4124 if (backSearchIndex == fp->fInputIdx) {
4177 // We've backed up the input idx to the point that the loop started. 4125 // We've backed up the input idx to the point that the loop started.
4178 // The loop is done. Leave here without saving state. 4126 // The loop is done. Leave here without saving state.
4179 // Subsequent failures won't come back here. 4127 // Subsequent failures won't come back here.
4180 break; 4128 break;
4181 } 4129 }
4182 // Set up for the next iteration of the loop, with input index 4130 // Set up for the next iteration of the loop, with input index
4183 // backed up by one from the last time through, 4131 // backed up by one from the last time through,
4184 // and a state save to this instruction in case the following code fails again. 4132 // and a state save to this instruction in case the following code fails again.
4185 // (We're going backwards because this loop emulates stack unw inding, not 4133 // (We're going backwards because this loop emulates stack unw inding, not
4186 // the initial scan forward.) 4134 // the initial scan forward.)
4187 U_ASSERT(fp->fInputIdx > 0); 4135 U_ASSERT(fp->fInputIdx > 0);
4188 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4136 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4189 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4137 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4190 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4138 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4191 4139
4192 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4140 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4193 if (prevC == 0x0a && 4141 if (prevC == 0x0a &&
4194 fp->fInputIdx > backSearchIndex && 4142 fp->fInputIdx > backSearchIndex &&
4195 twoPrevC == 0x0d) { 4143 twoPrevC == 0x0d) {
4196 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4144 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4197 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4145 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4198 // .*, stepping back over CRLF pair. 4146 // .*, stepping back over CRLF pair.
4199 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4200 } 4148 }
4201 } 4149 }
4202 4150
4203 4151
4204 fp = StateSave(fp, fp->fPatIdx-1, status); 4152 fp = StateSave(fp, fp->fPatIdx-1, status);
4205 } 4153 }
4206 break; 4154 break;
4207 4155
4208 4156
4209 4157
4210 default: 4158 default:
4211 // Trouble. The compiled pattern contains an entry with an 4159 // Trouble. The compiled pattern contains an entry with an
4212 // unrecognized type tag. 4160 // unrecognized type tag.
4213 U_ASSERT(FALSE); 4161 U_ASSERT(FALSE);
4214 } 4162 }
4215 4163
4216 if (U_FAILURE(status)) { 4164 if (U_FAILURE(status)) {
4217 isMatch = FALSE; 4165 isMatch = FALSE;
4218 break; 4166 break;
4219 } 4167 }
4220 } 4168 }
4221 4169
4222 breakFromLoop: 4170 breakFromLoop:
4223 fMatch = isMatch; 4171 fMatch = isMatch;
4224 if (isMatch) { 4172 if (isMatch) {
4225 fLastMatchEnd = fMatchEnd; 4173 fLastMatchEnd = fMatchEnd;
4226 fMatchStart = startIdx; 4174 fMatchStart = startIdx;
4227 fMatchEnd = fp->fInputIdx; 4175 fMatchEnd = fp->fInputIdx;
4228 if (fTraceDebug) { 4176 }
4229 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta rt, fMatchEnd)); 4177
4178 #ifdef REGEX_RUN_DEBUG
4179 if (fTraceDebug) {
4180 if (isMatch) {
4181 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4182 } else {
4183 printf("No match\n\n");
4230 } 4184 }
4231 } 4185 }
4232 else 4186 #endif
4233 {
4234 if (fTraceDebug) {
4235 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4236 }
4237 }
4238 4187
4239 fFrame = fp; // The active stack frame when the engine stoppe d. 4188 fFrame = fp; // The active stack frame when the engine stoppe d.
4240 // Contains the capture group results that we need to 4189 // Contains the capture group results that we need to
4241 // access later. 4190 // access later.
4242 return; 4191 return;
4243 } 4192 }
4244 4193
4245 4194
4246 //------------------------------------------------------------------------------ -- 4195 //------------------------------------------------------------------------------ --
4247 // 4196 //
4248 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4197 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4249 // assumption that the entire string is available in the UText's 4198 // assumption that the entire string is available in the UText's
4250 // chunk buffer. For now, that means we can use int32_t indexes, 4199 // chunk buffer. For now, that means we can use int32_t indexes,
4251 // except for anything that needs to be saved (like group starts 4200 // except for anything that needs to be saved (like group starts
4252 // and ends). 4201 // and ends).
4253 // 4202 //
4254 // startIdx: begin matching at this index. 4203 // startIdx: begin matching at this index.
4255 // toEnd: if true, match must extend to end of the input region 4204 // toEnd: if true, match must extend to end of the input region
4256 // 4205 //
4257 //------------------------------------------------------------------------------ -- 4206 //------------------------------------------------------------------------------ --
4258 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu s) { 4207 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu s) {
4259 UBool isMatch = FALSE; // True if the we have a match. 4208 UBool isMatch = FALSE; // True if the we have a match.
4260 4209
4261 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact er matches for searching backwards 4210 int32_t backSearchIndex = INT32_MAX; // used after greedy single-charact er matches for searching backwards
4262 4211
4263 int32_t op; // Operation from the compiled pattern, s plit into 4212 int32_t op; // Operation from the compiled pattern, s plit into
4264 int32_t opType; // the opcode 4213 int32_t opType; // the opcode
4265 int32_t opValue; // and the operand value. 4214 int32_t opValue; // and the operand value.
4266 4215
4267 #ifdef REGEX_RUN_DEBUG 4216 #ifdef REGEX_RUN_DEBUG
4268 if (fTraceDebug) 4217 if (fTraceDebug) {
4269 {
4270 printf("MatchAt(startIdx=%d)\n", startIdx); 4218 printf("MatchAt(startIdx=%d)\n", startIdx);
4271 printf("Original Pattern: "); 4219 printf("Original Pattern: ");
4272 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4220 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4273 while (c != U_SENTINEL) { 4221 while (c != U_SENTINEL) {
4274 if (c<32 || c>256) { 4222 if (c<32 || c>256) {
4275 c = '.'; 4223 c = '.';
4276 } 4224 }
4277 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4225 printf("%c", c);
4278 4226
4279 c = UTEXT_NEXT32(fPattern->fPattern); 4227 c = UTEXT_NEXT32(fPattern->fPattern);
4280 } 4228 }
4281 printf("\n"); 4229 printf("\n");
4282 printf("Input String: "); 4230 printf("Input String: ");
4283 c = utext_next32From(fInputText, 0); 4231 c = utext_next32From(fInputText, 0);
4284 while (c != U_SENTINEL) { 4232 while (c != U_SENTINEL) {
4285 if (c<32 || c>256) { 4233 if (c<32 || c>256) {
4286 c = '.'; 4234 c = '.';
4287 } 4235 }
4288 printf("%c", c); 4236 printf("%c", c);
4289 4237
4290 c = UTEXT_NEXT32(fInputText); 4238 c = UTEXT_NEXT32(fInputText);
4291 } 4239 }
4292 printf("\n"); 4240 printf("\n");
4293 printf("\n"); 4241 printf("\n");
4294 } 4242 }
4295 #endif 4243 #endif
4296 4244
4297 if (U_FAILURE(status)) { 4245 if (U_FAILURE(status)) {
4298 return; 4246 return;
4299 } 4247 }
4300 4248
4301 // Cache frequently referenced items from the compiled pattern 4249 // Cache frequently referenced items from the compiled pattern
4302 // 4250 //
4303 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4251 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4304 4252
4305 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4253 const UChar *litText = fPattern->fLiteralText.getBuffer();
4306 UVector *sets = fPattern->fSets; 4254 UVector *sets = fPattern->fSets;
4307 4255
4308 const UChar *inputBuf = fInputText->chunkContents; 4256 const UChar *inputBuf = fInputText->chunkContents;
4309 4257
4310 fFrameSize = fPattern->fFrameSize; 4258 fFrameSize = fPattern->fFrameSize;
4311 REStackFrame *fp = resetStack(); 4259 REStackFrame *fp = resetStack();
4312 4260
4313 fp->fPatIdx = 0; 4261 fp->fPatIdx = 0;
4314 fp->fInputIdx = startIdx; 4262 fp->fInputIdx = startIdx;
4315 4263
4316 // Zero out the pattern's static data 4264 // Zero out the pattern's static data
4317 int32_t i; 4265 int32_t i;
4318 for (i = 0; i<fPattern->fDataSize; i++) { 4266 for (i = 0; i<fPattern->fDataSize; i++) {
4319 fData[i] = 0; 4267 fData[i] = 0;
4320 } 4268 }
4321 4269
4322 // 4270 //
4323 // Main loop for interpreting the compiled pattern. 4271 // Main loop for interpreting the compiled pattern.
4324 // One iteration of the loop per pattern operation performed. 4272 // One iteration of the loop per pattern operation performed.
4325 // 4273 //
4326 for (;;) { 4274 for (;;) {
4327 #if 0
4328 if (_heapchk() != _HEAPOK) {
4329 fprintf(stderr, "Heap Trouble\n");
4330 }
4331 #endif
4332
4333 op = (int32_t)pat[fp->fPatIdx]; 4275 op = (int32_t)pat[fp->fPatIdx];
4334 opType = URX_TYPE(op); 4276 opType = URX_TYPE(op);
4335 opValue = URX_VAL(op); 4277 opValue = URX_VAL(op);
4336 #ifdef REGEX_RUN_DEBUG 4278 #ifdef REGEX_RUN_DEBUG
4337 if (fTraceDebug) { 4279 if (fTraceDebug) {
4338 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4280 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4339 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, 4281 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4340 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer( ), fActiveLimit); 4282 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer( ), fActiveLimit);
4341 fPattern->dumpOp(fp->fPatIdx); 4283 fPattern->dumpOp(fp->fPatIdx);
4342 } 4284 }
4343 #endif 4285 #endif
4344 fp->fPatIdx++; 4286 fp->fPatIdx++;
4345 4287
4346 switch (opType) { 4288 switch (opType) {
4347 4289
4348 4290
4349 case URX_NOP: 4291 case URX_NOP:
4350 break; 4292 break;
4351 4293
4352 4294
4353 case URX_BACKTRACK: 4295 case URX_BACKTRACK:
4354 // Force a backtrack. In some circumstances, the pattern compiler 4296 // Force a backtrack. In some circumstances, the pattern compiler
4355 // will notice that the pattern can't possibly match anything, and will 4297 // will notice that the pattern can't possibly match anything, and will
4356 // emit one of these at that point. 4298 // emit one of these at that point.
4357 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4299 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4358 break; 4300 break;
4359 4301
4360 4302
4361 case URX_ONECHAR: 4303 case URX_ONECHAR:
4362 if (fp->fInputIdx < fActiveLimit) { 4304 if (fp->fInputIdx < fActiveLimit) {
4363 UChar32 c; 4305 UChar32 c;
4364 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4306 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4365 if (c == opValue) { 4307 if (c == opValue) {
4366 break; 4308 break;
4367 } 4309 }
4368 } else { 4310 } else {
4369 fHitEnd = TRUE; 4311 fHitEnd = TRUE;
4370 } 4312 }
4371 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4313 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4372 break; 4314 break;
4373 4315
4374 4316
4375 case URX_STRING: 4317 case URX_STRING:
4376 { 4318 {
4377 // Test input against a literal string. 4319 // Test input against a literal string.
4378 // Strings require two slots in the compiled pattern, one for th e 4320 // Strings require two slots in the compiled pattern, one for th e
4379 // offset to the string text, and one for the length. 4321 // offset to the string text, and one for the length.
4380 int32_t stringStartIdx = opValue; 4322 int32_t stringStartIdx = opValue;
4381 int32_t stringLen; 4323 int32_t stringLen;
4382 4324
4383 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope rand 4325 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second ope rand
4384 fp->fPatIdx++; 4326 fp->fPatIdx++;
4385 opType = URX_TYPE(op); 4327 opType = URX_TYPE(op);
4386 stringLen = URX_VAL(op); 4328 stringLen = URX_VAL(op);
4387 U_ASSERT(opType == URX_STRING_LEN); 4329 U_ASSERT(opType == URX_STRING_LEN);
4388 U_ASSERT(stringLen >= 2); 4330 U_ASSERT(stringLen >= 2);
4389 4331
4390 const UChar * pInp = inputBuf + fp->fInputIdx; 4332 const UChar * pInp = inputBuf + fp->fInputIdx;
4391 const UChar * pInpLimit = inputBuf + fActiveLimit; 4333 const UChar * pInpLimit = inputBuf + fActiveLimit;
4392 const UChar * pPat = litText+stringStartIdx; 4334 const UChar * pPat = litText+stringStartIdx;
4393 const UChar * pEnd = pInp + stringLen; 4335 const UChar * pEnd = pInp + stringLen;
4394 UBool success = TRUE; 4336 UBool success = TRUE;
4395 while (pInp < pEnd) { 4337 while (pInp < pEnd) {
4396 if (pInp >= pInpLimit) { 4338 if (pInp >= pInpLimit) {
4397 fHitEnd = TRUE; 4339 fHitEnd = TRUE;
4398 success = FALSE; 4340 success = FALSE;
4399 break; 4341 break;
4400 } 4342 }
4401 if (*pInp++ != *pPat++) { 4343 if (*pInp++ != *pPat++) {
4402 success = FALSE; 4344 success = FALSE;
4403 break; 4345 break;
4404 } 4346 }
4405 } 4347 }
4406 4348
4407 if (success) { 4349 if (success) {
4408 fp->fInputIdx += stringLen; 4350 fp->fInputIdx += stringLen;
4409 } else { 4351 } else {
4410 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4352 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4411 } 4353 }
4412 } 4354 }
4413 break; 4355 break;
4414 4356
4415 4357
4416 case URX_STATE_SAVE: 4358 case URX_STATE_SAVE:
4417 fp = StateSave(fp, opValue, status); 4359 fp = StateSave(fp, opValue, status);
4418 break; 4360 break;
4419 4361
4420 4362
4421 case URX_END: 4363 case URX_END:
4422 // The match loop will exit via this path on a successful match, 4364 // The match loop will exit via this path on a successful match,
4423 // when we reach the end of the pattern. 4365 // when we reach the end of the pattern.
4424 if (toEnd && fp->fInputIdx != fActiveLimit) { 4366 if (toEnd && fp->fInputIdx != fActiveLimit) {
4425 // The pattern matched, but not to the end of input. Try some m ore. 4367 // The pattern matched, but not to the end of input. Try some m ore.
4426 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4368 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4427 break; 4369 break;
4428 } 4370 }
4429 isMatch = TRUE; 4371 isMatch = TRUE;
4430 goto breakFromLoop; 4372 goto breakFromLoop;
4431 4373
4432 // Start and End Capture stack frame variables are laid out out like this: 4374 // Start and End Capture stack frame variables are laid out out like this:
4433 // fp->fExtra[opValue] - The start of a completed capture group 4375 // fp->fExtra[opValue] - The start of a completed capture group
4434 // opValue+1 - The end of a completed capture group 4376 // opValue+1 - The end of a completed capture group
4435 // opValue+2 - the start of a capture group whose end 4377 // opValue+2 - the start of a capture group whose end
4436 // has not yet been reached (and might not ever be). 4378 // has not yet been reached (and might not ever be).
4437 case URX_START_CAPTURE: 4379 case URX_START_CAPTURE:
4438 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4380 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4439 fp->fExtra[opValue+2] = fp->fInputIdx; 4381 fp->fExtra[opValue+2] = fp->fInputIdx;
4440 break; 4382 break;
4441 4383
4442 4384
4443 case URX_END_CAPTURE: 4385 case URX_END_CAPTURE:
4444 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4386 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4445 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th is group must be set. 4387 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for th is group must be set.
4446 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4388 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4447 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4389 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4448 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4390 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4449 break; 4391 break;
4450 4392
4451 4393
4452 case URX_DOLLAR: // $, test for End of line 4394 case URX_DOLLAR: // $, test for End of line
4453 // or for position before new line at end of input 4395 // or for position before new line at end of input
4454 if (fp->fInputIdx < fAnchorLimit-2) { 4396 if (fp->fInputIdx < fAnchorLimit-2) {
4455 // We are no where near the end of input. Fail. 4397 // We are no where near the end of input. Fail.
4456 // This is the common case. Keep it first. 4398 // This is the common case. Keep it first.
4457 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4399 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4458 break; 4400 break;
4459 } 4401 }
4460 if (fp->fInputIdx >= fAnchorLimit) { 4402 if (fp->fInputIdx >= fAnchorLimit) {
4461 // We really are at the end of input. Success. 4403 // We really are at the end of input. Success.
4462 fHitEnd = TRUE; 4404 fHitEnd = TRUE;
4463 fRequireEnd = TRUE; 4405 fRequireEnd = TRUE;
4464 break; 4406 break;
4465 } 4407 }
4466 4408
4467 // If we are positioned just before a new-line that is located at th e 4409 // If we are positioned just before a new-line that is located at th e
4468 // end of input, succeed. 4410 // end of input, succeed.
4469 if (fp->fInputIdx == fAnchorLimit-1) { 4411 if (fp->fInputIdx == fAnchorLimit-1) {
4470 UChar32 c; 4412 UChar32 c;
4471 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4413 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4472 4414
4473 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4415 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4474 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) { 4416 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) {
4475 // At new-line at end of input. Success 4417 // At new-line at end of input. Success
4476 fHitEnd = TRUE; 4418 fHitEnd = TRUE;
4477 fRequireEnd = TRUE; 4419 fRequireEnd = TRUE;
4478 break; 4420 break;
4479 } 4421 }
4480 } 4422 }
4481 } else if (fp->fInputIdx == fAnchorLimit-2 && 4423 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4482 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a ) { 4424 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a ) {
4483 fHitEnd = TRUE; 4425 fHitEnd = TRUE;
4484 fRequireEnd = TRUE; 4426 fRequireEnd = TRUE;
4485 break; // At CR/LF at end of input. Success 4427 break; // At CR/LF at end of input. Success
4486 } 4428 }
4487 4429
4488 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4430 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4489 4431
4490 break; 4432 break;
4491 4433
4492 4434
4493 case URX_DOLLAR_D: // $, test for End of Line, in UNI X_LINES mode. 4435 case URX_DOLLAR_D: // $, test for End of Line, in UNI X_LINES mode.
4494 if (fp->fInputIdx >= fAnchorLimit-1) { 4436 if (fp->fInputIdx >= fAnchorLimit-1) {
4495 // Either at the last character of input, or off the end. 4437 // Either at the last character of input, or off the end.
4496 if (fp->fInputIdx == fAnchorLimit-1) { 4438 if (fp->fInputIdx == fAnchorLimit-1) {
4497 // At last char of input. Success if it's a new line. 4439 // At last char of input. Success if it's a new line.
4498 if (inputBuf[fp->fInputIdx] == 0x0a) { 4440 if (inputBuf[fp->fInputIdx] == 0x0a) {
4499 fHitEnd = TRUE; 4441 fHitEnd = TRUE;
4500 fRequireEnd = TRUE; 4442 fRequireEnd = TRUE;
4501 break; 4443 break;
4502 } 4444 }
4503 } else { 4445 } else {
4504 // Off the end of input. Success. 4446 // Off the end of input. Success.
4505 fHitEnd = TRUE; 4447 fHitEnd = TRUE;
4506 fRequireEnd = TRUE; 4448 fRequireEnd = TRUE;
4507 break; 4449 break;
4508 } 4450 }
4509 } 4451 }
4510 4452
4511 // Not at end of input. Back-track out. 4453 // Not at end of input. Back-track out.
4512 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4454 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4513 break; 4455 break;
4514 4456
4515 4457
4516 case URX_DOLLAR_M: // $, test for End of line in multi-l ine mode 4458 case URX_DOLLAR_M: // $, test for End of line in multi-l ine mode
4517 { 4459 {
4518 if (fp->fInputIdx >= fAnchorLimit) { 4460 if (fp->fInputIdx >= fAnchorLimit) {
4519 // We really are at the end of input. Success. 4461 // We really are at the end of input. Success.
4520 fHitEnd = TRUE; 4462 fHitEnd = TRUE;
4521 fRequireEnd = TRUE; 4463 fRequireEnd = TRUE;
4522 break; 4464 break;
4523 } 4465 }
4524 // If we are positioned just before a new-line, succeed. 4466 // If we are positioned just before a new-line, succeed.
4525 // It makes no difference where the new-line is within the input . 4467 // It makes no difference where the new-line is within the input .
4526 UChar32 c = inputBuf[fp->fInputIdx]; 4468 UChar32 c = inputBuf[fp->fInputIdx];
4527 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4469 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4528 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4470 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4529 // In multi-line mode, hitting a new-line just before the e nd of input does not 4471 // In multi-line mode, hitting a new-line just before the e nd of input does not
4530 // set the hitEnd or requireEnd flags 4472 // set the hitEnd or requireEnd flags
4531 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) { 4473 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp- >fInputIdx-1]==0x0d)) {
4532 break; 4474 break;
4533 } 4475 }
4534 } 4476 }
4535 // not at a new line. Fail. 4477 // not at a new line. Fail.
4536 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4478 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4537 } 4479 }
4538 break; 4480 break;
4539 4481
4540 4482
4541 case URX_DOLLAR_MD: // $, test for End of line in multi- line and UNIX_LINES mode 4483 case URX_DOLLAR_MD: // $, test for End of line in multi- line and UNIX_LINES mode
4542 { 4484 {
4543 if (fp->fInputIdx >= fAnchorLimit) { 4485 if (fp->fInputIdx >= fAnchorLimit) {
4544 // We really are at the end of input. Success. 4486 // We really are at the end of input. Success.
4545 fHitEnd = TRUE; 4487 fHitEnd = TRUE;
4546 fRequireEnd = TRUE; // Java set requireEnd in this case, ev en though 4488 fRequireEnd = TRUE; // Java set requireEnd in this case, ev en though
4547 break; // adding a new-line would not lose t he match. 4489 break; // adding a new-line would not lose t he match.
4548 } 4490 }
4549 // If we are not positioned just before a new-line, the test fai ls; backtrack out. 4491 // If we are not positioned just before a new-line, the test fai ls; backtrack out.
4550 // It makes no difference where the new-line is within the input . 4492 // It makes no difference where the new-line is within the input .
4551 if (inputBuf[fp->fInputIdx] != 0x0a) { 4493 if (inputBuf[fp->fInputIdx] != 0x0a) {
4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4494 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4553 } 4495 }
4554 } 4496 }
4555 break; 4497 break;
4556 4498
4557 4499
4558 case URX_CARET: // ^, test for start of line 4500 case URX_CARET: // ^, test for start of line
4559 if (fp->fInputIdx != fAnchorStart) { 4501 if (fp->fInputIdx != fAnchorStart) {
4560 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4502 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4561 } 4503 }
4562 break; 4504 break;
4563 4505
4564 4506
4565 case URX_CARET_M: // ^, test for start of line in mul it-line mode 4507 case URX_CARET_M: // ^, test for start of line in mul it-line mode
4566 { 4508 {
4567 if (fp->fInputIdx == fAnchorStart) { 4509 if (fp->fInputIdx == fAnchorStart) {
4568 // We are at the start input. Success. 4510 // We are at the start input. Success.
4569 break; 4511 break;
4570 } 4512 }
4571 // Check whether character just before the current pos is a new- line 4513 // Check whether character just before the current pos is a new- line
4572 // unless we are at the end of input 4514 // unless we are at the end of input
4573 UChar c = inputBuf[fp->fInputIdx - 1]; 4515 UChar c = inputBuf[fp->fInputIdx - 1];
4574 if ((fp->fInputIdx < fAnchorLimit) && 4516 if ((fp->fInputIdx < fAnchorLimit) &&
4575 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4517 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4576 // It's a new-line. ^ is true. Success. 4518 // It's a new-line. ^ is true. Success.
4577 // TODO: what should be done with positions between a CR a nd LF? 4519 // TODO: what should be done with positions between a CR a nd LF?
4578 break; 4520 break;
4579 } 4521 }
4580 // Not at the start of a line. Fail. 4522 // Not at the start of a line. Fail.
4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4582 } 4524 }
4583 break; 4525 break;
4584 4526
4585 4527
4586 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4528 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4587 { 4529 {
4588 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4530 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4589 if (fp->fInputIdx <= fAnchorStart) { 4531 if (fp->fInputIdx <= fAnchorStart) {
4590 // We are at the start input. Success. 4532 // We are at the start input. Success.
4591 break; 4533 break;
4592 } 4534 }
4593 // Check whether character just before the current pos is a new- line 4535 // Check whether character just before the current pos is a new- line
4594 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4536 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4595 UChar c = inputBuf[fp->fInputIdx - 1]; 4537 UChar c = inputBuf[fp->fInputIdx - 1];
4596 if (c != 0x0a) { 4538 if (c != 0x0a) {
4597 // Not at the start of a line. Back-track out. 4539 // Not at the start of a line. Back-track out.
4598 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4540 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4599 } 4541 }
4600 } 4542 }
4601 break; 4543 break;
4602 4544
4603 case URX_BACKSLASH_B: // Test for word boundaries 4545 case URX_BACKSLASH_B: // Test for word boundaries
4604 { 4546 {
4605 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4547 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4606 success ^= (UBool)(opValue != 0); // flip sense for \B 4548 success ^= (UBool)(opValue != 0); // flip sense for \B
4607 if (!success) { 4549 if (!success) {
4608 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4550 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4609 } 4551 }
4610 } 4552 }
4611 break; 4553 break;
4612 4554
4613 4555
4614 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty le 4556 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-sty le
4615 { 4557 {
4616 UBool success = isUWordBoundary(fp->fInputIdx); 4558 UBool success = isUWordBoundary(fp->fInputIdx);
4617 success ^= (UBool)(opValue != 0); // flip sense for \B 4559 success ^= (UBool)(opValue != 0); // flip sense for \B
4618 if (!success) { 4560 if (!success) {
4619 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4561 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4620 } 4562 }
4621 } 4563 }
4622 break; 4564 break;
4623 4565
4624 4566
4625 case URX_BACKSLASH_D: // Test for decimal digit 4567 case URX_BACKSLASH_D: // Test for decimal digit
4626 { 4568 {
4627 if (fp->fInputIdx >= fActiveLimit) { 4569 if (fp->fInputIdx >= fActiveLimit) {
4628 fHitEnd = TRUE; 4570 fHitEnd = TRUE;
4629 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4571 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4630 break; 4572 break;
4631 } 4573 }
4632 4574
4633 UChar32 c; 4575 UChar32 c;
4634 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4576 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4635 int8_t ctype = u_charType(c); // TODO: make a unicode set f or this. Will be faster. 4577 int8_t ctype = u_charType(c); // TODO: make a unicode set f or this. Will be faster.
4636 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4578 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4637 success ^= (UBool)(opValue != 0); // flip sense for \D 4579 success ^= (UBool)(opValue != 0); // flip sense for \D
4638 if (!success) { 4580 if (!success) {
4639 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4640 } 4582 }
4641 } 4583 }
4642 break; 4584 break;
4643 4585
4644 4586
4645 case URX_BACKSLASH_G: // Test for position at end of previous m atch 4587 case URX_BACKSLASH_G: // Test for position at end of previous m atch
4646 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) { 4588 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp-> fInputIdx==fActiveStart))) {
4647 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648 } 4590 }
4649 break; 4591 break;
4650 4592
4651 4593
4652 case URX_BACKSLASH_X: 4594 case URX_BACKSLASH_X:
4653 // Match a Grapheme, as defined by Unicode TR 29. 4595 // Match a Grapheme, as defined by Unicode TR 29.
4654 // Differs slightly from Perl, which consumes combining marks independe ntly 4596 // Differs slightly from Perl, which consumes combining marks independe ntly
4655 // of context. 4597 // of context.
4656 { 4598 {
4657 4599
4658 // Fail if at end of input 4600 // Fail if at end of input
4659 if (fp->fInputIdx >= fActiveLimit) { 4601 if (fp->fInputIdx >= fActiveLimit) {
4660 fHitEnd = TRUE; 4602 fHitEnd = TRUE;
4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4603 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4662 break; 4604 break;
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
4711 } 4653 }
4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4654 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4713 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4655 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4714 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4656 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4715 break; 4657 break;
4716 } 4658 }
4717 } 4659 }
4718 goto GC_Done; 4660 goto GC_Done;
4719 4661
4720 GC_Control: 4662 GC_Control:
4721 // Most control chars stand alone (don't combine with combining char s), 4663 // Most control chars stand alone (don't combine with combining char s),
4722 // except for that CR/LF sequence is a single grapheme cluster. 4664 // except for that CR/LF sequence is a single grapheme cluster.
4723 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput Idx] == 0x0a) { 4665 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInput Idx] == 0x0a) {
4724 fp->fInputIdx++; 4666 fp->fInputIdx++;
4725 } 4667 }
4726 4668
4727 GC_Done: 4669 GC_Done:
4728 if (fp->fInputIdx >= fActiveLimit) { 4670 if (fp->fInputIdx >= fActiveLimit) {
4729 fHitEnd = TRUE; 4671 fHitEnd = TRUE;
4730 } 4672 }
4731 break; 4673 break;
4732 } 4674 }
4733 4675
4734 4676
4735 4677
4736 4678
4737 case URX_BACKSLASH_Z: // Test for end of Input 4679 case URX_BACKSLASH_Z: // Test for end of Input
4738 if (fp->fInputIdx < fAnchorLimit) { 4680 if (fp->fInputIdx < fAnchorLimit) {
4739 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4681 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4740 } else { 4682 } else {
4741 fHitEnd = TRUE; 4683 fHitEnd = TRUE;
4742 fRequireEnd = TRUE; 4684 fRequireEnd = TRUE;
4743 } 4685 }
4744 break; 4686 break;
4745 4687
4746 4688
4747 4689
4748 case URX_STATIC_SETREF: 4690 case URX_STATIC_SETREF:
4749 { 4691 {
4750 // Test input character against one of the predefined sets 4692 // Test input character against one of the predefined sets
4751 // (Word Characters, for example) 4693 // (Word Characters, for example)
4752 // The high bit of the op value is a flag for the match polarity . 4694 // The high bit of the op value is a flag for the match polarity .
4753 // 0: success if input char is in set. 4695 // 0: success if input char is in set.
4754 // 1: success if input char is not in set. 4696 // 1: success if input char is not in set.
4755 if (fp->fInputIdx >= fActiveLimit) { 4697 if (fp->fInputIdx >= fActiveLimit) {
4756 fHitEnd = TRUE; 4698 fHitEnd = TRUE;
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4699 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758 break; 4700 break;
4759 } 4701 }
4760 4702
4761 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 4703 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4762 opValue &= ~URX_NEG_SET; 4704 opValue &= ~URX_NEG_SET;
4763 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4705 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4764 4706
4765 UChar32 c; 4707 UChar32 c;
4766 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4708 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4767 if (c < 256) { 4709 if (c < 256) {
4768 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4710 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4769 if (s8->contains(c)) { 4711 if (s8->contains(c)) {
4770 success = !success; 4712 success = !success;
4771 } 4713 }
4772 } else { 4714 } else {
4773 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4715 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4774 if (s->contains(c)) { 4716 if (s->contains(c)) {
4775 success = !success; 4717 success = !success;
4776 } 4718 }
4777 } 4719 }
4778 if (!success) { 4720 if (!success) {
4779 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4721 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4780 } 4722 }
4781 } 4723 }
4782 break; 4724 break;
4783 4725
4784 4726
4785 case URX_STAT_SETREF_N: 4727 case URX_STAT_SETREF_N:
4786 { 4728 {
4787 // Test input character for NOT being a member of one of 4729 // Test input character for NOT being a member of one of
4788 // the predefined sets (Word Characters, for example) 4730 // the predefined sets (Word Characters, for example)
4789 if (fp->fInputIdx >= fActiveLimit) { 4731 if (fp->fInputIdx >= fActiveLimit) {
4790 fHitEnd = TRUE; 4732 fHitEnd = TRUE;
4791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4792 break; 4734 break;
4793 } 4735 }
4794 4736
4795 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 4737 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4796 4738
4797 UChar32 c; 4739 UChar32 c;
4798 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4740 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4799 if (c < 256) { 4741 if (c < 256) {
4800 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 4742 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4801 if (s8->contains(c) == FALSE) { 4743 if (s8->contains(c) == FALSE) {
4802 break; 4744 break;
4803 } 4745 }
4804 } else { 4746 } else {
4805 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 4747 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4806 if (s->contains(c) == FALSE) { 4748 if (s->contains(c) == FALSE) {
4807 break; 4749 break;
4808 } 4750 }
4809 } 4751 }
4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4752 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4811 } 4753 }
4812 break; 4754 break;
4813 4755
4814 4756
4815 case URX_SETREF: 4757 case URX_SETREF:
4816 { 4758 {
4817 if (fp->fInputIdx >= fActiveLimit) { 4759 if (fp->fInputIdx >= fActiveLimit) {
4818 fHitEnd = TRUE; 4760 fHitEnd = TRUE;
4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4761 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4820 break; 4762 break;
4821 } 4763 }
4822 4764
4823 U_ASSERT(opValue > 0 && opValue < sets->size()); 4765 U_ASSERT(opValue > 0 && opValue < sets->size());
4824 4766
4825 // There is input left. Pick up one char and test it for set me mbership. 4767 // There is input left. Pick up one char and test it for set me mbership.
4826 UChar32 c; 4768 UChar32 c;
4827 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4769 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4828 if (c<256) { 4770 if (c<256) {
4829 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4771 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4830 if (s8->contains(c)) { 4772 if (s8->contains(c)) {
4831 // The character is in the set. A Match. 4773 // The character is in the set. A Match.
4832 break; 4774 break;
4833 } 4775 }
4834 } else { 4776 } else {
4835 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4777 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4836 if (s->contains(c)) { 4778 if (s->contains(c)) {
4837 // The character is in the set. A Match. 4779 // The character is in the set. A Match.
4838 break; 4780 break;
4839 } 4781 }
4840 } 4782 }
4841 4783
4842 // the character wasn't in the set. 4784 // the character wasn't in the set.
4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4785 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844 } 4786 }
4845 break; 4787 break;
4846 4788
4847 4789
4848 case URX_DOTANY: 4790 case URX_DOTANY:
4849 { 4791 {
4850 // . matches anything, but stops at end-of-line. 4792 // . matches anything, but stops at end-of-line.
4851 if (fp->fInputIdx >= fActiveLimit) { 4793 if (fp->fInputIdx >= fActiveLimit) {
4852 // At end of input. Match failed. Backtrack out. 4794 // At end of input. Match failed. Backtrack out.
4853 fHitEnd = TRUE; 4795 fHitEnd = TRUE;
4854 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4796 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4855 break; 4797 break;
4856 } 4798 }
4857 4799
4858 // There is input left. Advance over one char, unless we've hit end-of-line 4800 // There is input left. Advance over one char, unless we've hit end-of-line
4859 UChar32 c; 4801 UChar32 c;
4860 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4802 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4861 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 4803 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
4862 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4804 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4863 // End of line in normal mode. . does not match. 4805 // End of line in normal mode. . does not match.
4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4806 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865 break; 4807 break;
4866 } 4808 }
4867 } 4809 }
4868 break; 4810 break;
4869 4811
4870 4812
4871 case URX_DOTANY_ALL: 4813 case URX_DOTANY_ALL:
4872 { 4814 {
4873 // . in dot-matches-all (including new lines) mode 4815 // . in dot-matches-all (including new lines) mode
4874 if (fp->fInputIdx >= fActiveLimit) { 4816 if (fp->fInputIdx >= fActiveLimit) {
4875 // At end of input. Match failed. Backtrack out. 4817 // At end of input. Match failed. Backtrack out.
4876 fHitEnd = TRUE; 4818 fHitEnd = TRUE;
4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4819 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4878 break; 4820 break;
4879 } 4821 }
4880 4822
4881 // There is input left. Advance over one char, except if we are 4823 // There is input left. Advance over one char, except if we are
4882 // at a cr/lf, advance over both of them. 4824 // at a cr/lf, advance over both of them.
4883 UChar32 c; 4825 UChar32 c;
4884 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4826 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4885 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 4827 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4886 // In the case of a CR/LF, we need to advance over both. 4828 // In the case of a CR/LF, we need to advance over both.
4887 if (inputBuf[fp->fInputIdx] == 0x0a) { 4829 if (inputBuf[fp->fInputIdx] == 0x0a) {
4888 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 4830 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4889 } 4831 }
4890 } 4832 }
4891 } 4833 }
4892 break; 4834 break;
4893 4835
4894 4836
4895 case URX_DOTANY_UNIX: 4837 case URX_DOTANY_UNIX:
4896 { 4838 {
4897 // '.' operator, matches all, but stops at end-of-line. 4839 // '.' operator, matches all, but stops at end-of-line.
4898 // UNIX_LINES mode, so 0x0a is the only recognized line ending . 4840 // UNIX_LINES mode, so 0x0a is the only recognized line ending .
4899 if (fp->fInputIdx >= fActiveLimit) { 4841 if (fp->fInputIdx >= fActiveLimit) {
4900 // At end of input. Match failed. Backtrack out. 4842 // At end of input. Match failed. Backtrack out.
4901 fHitEnd = TRUE; 4843 fHitEnd = TRUE;
4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4844 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4903 break; 4845 break;
4904 } 4846 }
4905 4847
4906 // There is input left. Advance over one char, unless we've hit end-of-line 4848 // There is input left. Advance over one char, unless we've hit end-of-line
4907 UChar32 c; 4849 UChar32 c;
4908 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4850 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4909 if (c == 0x0a) { 4851 if (c == 0x0a) {
4910 // End of line in normal mode. '.' does not match the \n 4852 // End of line in normal mode. '.' does not match the \n
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912 } 4854 }
4913 } 4855 }
4914 break; 4856 break;
4915 4857
4916 4858
4917 case URX_JMP: 4859 case URX_JMP:
4918 fp->fPatIdx = opValue; 4860 fp->fPatIdx = opValue;
4919 break; 4861 break;
4920 4862
4921 case URX_FAIL: 4863 case URX_FAIL:
4922 isMatch = FALSE; 4864 isMatch = FALSE;
4923 goto breakFromLoop; 4865 goto breakFromLoop;
4924 4866
4925 case URX_JMP_SAV: 4867 case URX_JMP_SAV:
4926 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 4868 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4927 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 4869 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4928 fp->fPatIdx = opValue; // Then JMP. 4870 fp->fPatIdx = opValue; // Then JMP.
4929 break; 4871 break;
4930 4872
4931 case URX_JMP_SAV_X: 4873 case URX_JMP_SAV_X:
4932 // This opcode is used with (x)+, when x can match a zero length str ing. 4874 // This opcode is used with (x)+, when x can match a zero length str ing.
4933 // Same as JMP_SAV, except conditional on the match having made forw ard progress. 4875 // Same as JMP_SAV, except conditional on the match having made forw ard progress.
4934 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g et the 4876 // Destination of the JMP must be a URX_STO_INP_LOC, from which we g et the
4935 // data address of the input position at the start of the loop. 4877 // data address of the input position at the start of the loop.
4936 { 4878 {
4937 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size() ); 4879 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size() );
4938 int32_t stoOp = (int32_t)pat[opValue-1]; 4880 int32_t stoOp = (int32_t)pat[opValue-1];
4939 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 4881 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4940 int32_t frameLoc = URX_VAL(stoOp); 4882 int32_t frameLoc = URX_VAL(stoOp);
4941 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 4883 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4942 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 4884 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4943 U_ASSERT(prevInputIdx <= fp->fInputIdx); 4885 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4944 if (prevInputIdx < fp->fInputIdx) { 4886 if (prevInputIdx < fp->fInputIdx) {
4945 // The match did make progress. Repeat the loop. 4887 // The match did make progress. Repeat the loop.
4946 fp = StateSave(fp, fp->fPatIdx, status); // State save to l oc following current 4888 fp = StateSave(fp, fp->fPatIdx, status); // State save to l oc following current
4947 fp->fPatIdx = opValue; 4889 fp->fPatIdx = opValue;
4948 fp->fExtra[frameLoc] = fp->fInputIdx; 4890 fp->fExtra[frameLoc] = fp->fInputIdx;
4949 } 4891 }
4950 // If the input position did not advance, we do nothing here, 4892 // If the input position did not advance, we do nothing here,
4951 // execution will fall out of the loop. 4893 // execution will fall out of the loop.
4952 } 4894 }
4953 break; 4895 break;
4954 4896
4955 case URX_CTR_INIT: 4897 case URX_CTR_INIT:
4956 { 4898 {
4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4899 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4958 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero 4900 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero
4959 4901
4960 // Pick up the three extra operands that CTR_INIT has, and 4902 // Pick up the three extra operands that CTR_INIT has, and
4961 // skip the pattern location counter past 4903 // skip the pattern location counter past
4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 4904 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4963 fp->fPatIdx += 3; 4905 fp->fPatIdx += 3;
4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4906 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 4907 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 4908 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4967 U_ASSERT(minCount>=0); 4909 U_ASSERT(minCount>=0);
4968 U_ASSERT(maxCount>=minCount || maxCount==-1); 4910 U_ASSERT(maxCount>=minCount || maxCount==-1);
4969 U_ASSERT(loopLoc>=fp->fPatIdx); 4911 U_ASSERT(loopLoc>=fp->fPatIdx);
4970 4912
4971 if (minCount == 0) { 4913 if (minCount == 0) {
4972 fp = StateSave(fp, loopLoc+1, status); 4914 fp = StateSave(fp, loopLoc+1, status);
4973 } 4915 }
4974 if (maxCount == -1) { 4916 if (maxCount == -1) {
4975 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki ng. 4917 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaki ng.
4976 } else if (maxCount == 0) { 4918 } else if (maxCount == 0) {
4977 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4919 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4978 } 4920 }
4979 } 4921 }
4980 break; 4922 break;
4981 4923
4982 case URX_CTR_LOOP: 4924 case URX_CTR_LOOP:
4983 { 4925 {
4984 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 4926 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4985 int32_t initOp = (int32_t)pat[opValue]; 4927 int32_t initOp = (int32_t)pat[opValue];
4986 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 4928 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4987 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 4929 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4988 int32_t minCount = (int32_t)pat[opValue+2]; 4930 int32_t minCount = (int32_t)pat[opValue+2];
4989 int32_t maxCount = (int32_t)pat[opValue+3]; 4931 int32_t maxCount = (int32_t)pat[opValue+3];
4990 (*pCounter)++; 4932 (*pCounter)++;
4991 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 4933 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4992 U_ASSERT(*pCounter == maxCount); 4934 U_ASSERT(*pCounter == maxCount);
4993 break; 4935 break;
4994 } 4936 }
4995 if (*pCounter >= minCount) { 4937 if (*pCounter >= minCount) {
4996 if (maxCount == -1) { 4938 if (maxCount == -1) {
4997 // Loop has no hard upper bound. 4939 // Loop has no hard upper bound.
4998 // Check that it is progressing through the input, break if it is not. 4940 // Check that it is progressing through the input, break if it is not.
4999 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 4941 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5000 if (fp->fInputIdx == *pLastInputIdx) { 4942 if (fp->fInputIdx == *pLastInputIdx) {
5001 break; 4943 break;
5002 } else { 4944 } else {
5003 *pLastInputIdx = fp->fInputIdx; 4945 *pLastInputIdx = fp->fInputIdx;
5004 } 4946 }
5005 } 4947 }
5006 fp = StateSave(fp, fp->fPatIdx, status); 4948 fp = StateSave(fp, fp->fPatIdx, status);
5007 } 4949 }
5008 fp->fPatIdx = opValue + 4; // Loop back. 4950 fp->fPatIdx = opValue + 4; // Loop back.
5009 } 4951 }
5010 break; 4952 break;
5011 4953
5012 case URX_CTR_INIT_NG: 4954 case URX_CTR_INIT_NG:
5013 { 4955 {
5014 // Initialize a non-greedy loop 4956 // Initialize a non-greedy loop
5015 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 4957 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5016 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero 4958 fp->fExtra[opValue] = 0; // Set the loop counte r variable to zero
5017 4959
5018 // Pick up the three extra operands that CTR_INIT_NG has, and 4960 // Pick up the three extra operands that CTR_INIT_NG has, and
5019 // skip the pattern location counter past 4961 // skip the pattern location counter past
5020 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 4962 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5021 fp->fPatIdx += 3; 4963 fp->fPatIdx += 3;
5022 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 4964 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5023 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 4965 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5024 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 4966 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5025 U_ASSERT(minCount>=0); 4967 U_ASSERT(minCount>=0);
5026 U_ASSERT(maxCount>=minCount || maxCount==-1); 4968 U_ASSERT(maxCount>=minCount || maxCount==-1);
5027 U_ASSERT(loopLoc>fp->fPatIdx); 4969 U_ASSERT(loopLoc>fp->fPatIdx);
5028 if (maxCount == -1) { 4970 if (maxCount == -1) {
5029 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in put index for loop breaking. 4971 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial in put index for loop breaking.
5030 } 4972 }
5031 4973
5032 if (minCount == 0) { 4974 if (minCount == 0) {
5033 if (maxCount != 0) { 4975 if (maxCount != 0) {
5034 fp = StateSave(fp, fp->fPatIdx, status); 4976 fp = StateSave(fp, fp->fPatIdx, status);
5035 } 4977 }
5036 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe ated block 4978 fp->fPatIdx = loopLoc+1; // Continue with stuff after repe ated block
5037 } 4979 }
5038 } 4980 }
5039 break; 4981 break;
5040 4982
5041 case URX_CTR_LOOP_NG: 4983 case URX_CTR_LOOP_NG:
5042 { 4984 {
5043 // Non-greedy {min, max} loops 4985 // Non-greedy {min, max} loops
5044 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 4986 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5045 int32_t initOp = (int32_t)pat[opValue]; 4987 int32_t initOp = (int32_t)pat[opValue];
5046 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 4988 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5047 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 4989 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5048 int32_t minCount = (int32_t)pat[opValue+2]; 4990 int32_t minCount = (int32_t)pat[opValue+2];
5049 int32_t maxCount = (int32_t)pat[opValue+3]; 4991 int32_t maxCount = (int32_t)pat[opValue+3];
5050 4992
5051 (*pCounter)++; 4993 (*pCounter)++;
5052 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { 4994 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5053 // The loop has matched the maximum permitted number of time s. 4995 // The loop has matched the maximum permitted number of time s.
5054 // Break out of here with no action. Matching will 4996 // Break out of here with no action. Matching will
5055 // continue with the following pattern. 4997 // continue with the following pattern.
5056 U_ASSERT(*pCounter == maxCount); 4998 U_ASSERT(*pCounter == maxCount);
5057 break; 4999 break;
5058 } 5000 }
5059 5001
5060 if (*pCounter < minCount) { 5002 if (*pCounter < minCount) {
5061 // We haven't met the minimum number of matches yet. 5003 // We haven't met the minimum number of matches yet.
5062 // Loop back for another one. 5004 // Loop back for another one.
5063 fp->fPatIdx = opValue + 4; // Loop back. 5005 fp->fPatIdx = opValue + 4; // Loop back.
5064 } else { 5006 } else {
5065 // We do have the minimum number of matches. 5007 // We do have the minimum number of matches.
5066 5008
5067 // If there is no upper bound on the loop iterations, check that the input index 5009 // If there is no upper bound on the loop iterations, check that the input index
5068 // is progressing, and stop the loop if it is not. 5010 // is progressing, and stop the loop if it is not.
5069 if (maxCount == -1) { 5011 if (maxCount == -1) {
5070 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; 5012 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5071 if (fp->fInputIdx == *pLastInputIdx) { 5013 if (fp->fInputIdx == *pLastInputIdx) {
5072 break; 5014 break;
5073 } 5015 }
5074 *pLastInputIdx = fp->fInputIdx; 5016 *pLastInputIdx = fp->fInputIdx;
5075 } 5017 }
5076 5018
5077 // Loop Continuation: we will fall into the pattern followin g the loop 5019 // Loop Continuation: we will fall into the pattern followin g the loop
5078 // (non-greedy, don't execute loop body first), but first do 5020 // (non-greedy, don't execute loop body first), but first do
5079 // a state save to the top of the loop, so that a match fa ilure 5021 // a state save to the top of the loop, so that a match fa ilure
5080 // in the following pattern will try another iteration of the loop. 5022 // in the following pattern will try another iteration of the loop.
5081 fp = StateSave(fp, opValue + 4, status); 5023 fp = StateSave(fp, opValue + 4, status);
5082 } 5024 }
5083 } 5025 }
5084 break; 5026 break;
5085 5027
5086 case URX_STO_SP: 5028 case URX_STO_SP:
5087 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5029 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5088 fData[opValue] = fStack->size(); 5030 fData[opValue] = fStack->size();
5089 break; 5031 break;
5090 5032
5091 case URX_LD_SP: 5033 case URX_LD_SP:
5092 { 5034 {
5093 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5035 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5094 int32_t newStackSize = (int32_t)fData[opValue]; 5036 int32_t newStackSize = (int32_t)fData[opValue];
5095 U_ASSERT(newStackSize <= fStack->size()); 5037 U_ASSERT(newStackSize <= fStack->size());
5096 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize ; 5038 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize ;
5097 if (newFP == (int64_t *)fp) { 5039 if (newFP == (int64_t *)fp) {
5098 break; 5040 break;
5099 } 5041 }
5100 int32_t i; 5042 int32_t i;
5101 for (i=0; i<fFrameSize; i++) { 5043 for (i=0; i<fFrameSize; i++) {
5102 newFP[i] = ((int64_t *)fp)[i]; 5044 newFP[i] = ((int64_t *)fp)[i];
5103 } 5045 }
5104 fp = (REStackFrame *)newFP; 5046 fp = (REStackFrame *)newFP;
5105 fStack->setSize(newStackSize); 5047 fStack->setSize(newStackSize);
5106 } 5048 }
5107 break; 5049 break;
5108 5050
5109 case URX_BACKREF: 5051 case URX_BACKREF:
5110 { 5052 {
5111 U_ASSERT(opValue < fFrameSize); 5053 U_ASSERT(opValue < fFrameSize);
5112 int64_t groupStartIdx = fp->fExtra[opValue]; 5054 int64_t groupStartIdx = fp->fExtra[opValue];
5113 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5055 int64_t groupEndIdx = fp->fExtra[opValue+1];
5114 U_ASSERT(groupStartIdx <= groupEndIdx); 5056 U_ASSERT(groupStartIdx <= groupEndIdx);
5115 int64_t inputIndex = fp->fInputIdx; 5057 int64_t inputIndex = fp->fInputIdx;
5116 if (groupStartIdx < 0) { 5058 if (groupStartIdx < 0) {
5117 // This capture group has not participated in the match thus far, 5059 // This capture group has not participated in the match thus far,
5118 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match. 5060 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match.
(...skipping 11 matching lines...) Expand all
5130 break; 5072 break;
5131 } 5073 }
5132 } 5074 }
5133 if (success) { 5075 if (success) {
5134 fp->fInputIdx = inputIndex; 5076 fp->fInputIdx = inputIndex;
5135 } else { 5077 } else {
5136 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5078 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5137 } 5079 }
5138 } 5080 }
5139 break; 5081 break;
5140 5082
5141 case URX_BACKREF_I: 5083 case URX_BACKREF_I:
5142 { 5084 {
5143 U_ASSERT(opValue < fFrameSize); 5085 U_ASSERT(opValue < fFrameSize);
5144 int64_t groupStartIdx = fp->fExtra[opValue]; 5086 int64_t groupStartIdx = fp->fExtra[opValue];
5145 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5087 int64_t groupEndIdx = fp->fExtra[opValue+1];
5146 U_ASSERT(groupStartIdx <= groupEndIdx); 5088 U_ASSERT(groupStartIdx <= groupEndIdx);
5147 if (groupStartIdx < 0) { 5089 if (groupStartIdx < 0) {
5148 // This capture group has not participated in the match thus far, 5090 // This capture group has not participated in the match thus far,
5149 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match. 5091 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no match.
5150 break; 5092 break;
5151 } 5093 }
5152 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx , groupEndIdx); 5094 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx , groupEndIdx);
5153 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi veLimit); 5095 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActi veLimit);
5154 5096
5155 // Note: if the capture group match was of an empty string the backref 5097 // Note: if the capture group match was of an empty string the backref
5156 // match succeeds. Verified by testing: Perl matches s ucceed 5098 // match succeeds. Verified by testing: Perl matches s ucceed
5157 // in this case, so we do too. 5099 // in this case, so we do too.
5158 5100
5159 UBool success = TRUE; 5101 UBool success = TRUE;
5160 for (;;) { 5102 for (;;) {
5161 UChar32 captureGroupChar = captureGroupItr.next(); 5103 UChar32 captureGroupChar = captureGroupItr.next();
5162 if (captureGroupChar == U_SENTINEL) { 5104 if (captureGroupChar == U_SENTINEL) {
5163 success = TRUE; 5105 success = TRUE;
5164 break; 5106 break;
5165 } 5107 }
5166 UChar32 inputChar = inputItr.next(); 5108 UChar32 inputChar = inputItr.next();
5167 if (inputChar == U_SENTINEL) { 5109 if (inputChar == U_SENTINEL) {
5168 success = FALSE; 5110 success = FALSE;
5169 fHitEnd = TRUE; 5111 fHitEnd = TRUE;
5170 break; 5112 break;
5171 } 5113 }
5172 if (inputChar != captureGroupChar) { 5114 if (inputChar != captureGroupChar) {
5173 success = FALSE; 5115 success = FALSE;
5174 break; 5116 break;
5175 } 5117 }
5176 } 5118 }
5177 5119
5178 if (success && inputItr.inExpansion()) { 5120 if (success && inputItr.inExpansion()) {
5179 // We obtained a match by consuming part of a string obtained from 5121 // We obtained a match by consuming part of a string obtained from
5180 // case-folding a single code point of the input text. 5122 // case-folding a single code point of the input text.
5181 // This does not count as an overall match. 5123 // This does not count as an overall match.
5182 success = FALSE; 5124 success = FALSE;
5183 } 5125 }
5184 5126
5185 if (success) { 5127 if (success) {
5186 fp->fInputIdx = inputItr.getIndex(); 5128 fp->fInputIdx = inputItr.getIndex();
5187 } else { 5129 } else {
5188 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5189 } 5131 }
5190 } 5132 }
5191 break; 5133 break;
5192 5134
5193 case URX_STO_INP_LOC: 5135 case URX_STO_INP_LOC:
5194 { 5136 {
5195 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5137 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5196 fp->fExtra[opValue] = fp->fInputIdx; 5138 fp->fExtra[opValue] = fp->fInputIdx;
5197 } 5139 }
5198 break; 5140 break;
5199 5141
5200 case URX_JMPX: 5142 case URX_JMPX:
5201 { 5143 {
5202 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5144 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5203 fp->fPatIdx += 1; 5145 fp->fPatIdx += 1;
5204 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5146 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5205 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5147 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5206 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5148 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5207 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5149 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5208 if (savedInputIdx < fp->fInputIdx) { 5150 if (savedInputIdx < fp->fInputIdx) {
5209 fp->fPatIdx = opValue; // JMP 5151 fp->fPatIdx = opValue; // JMP
5210 } else { 5152 } else {
5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no progress in loop. 5153 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL , no progress in loop.
5212 } 5154 }
5213 } 5155 }
5214 break; 5156 break;
5215 5157
5216 case URX_LA_START: 5158 case URX_LA_START:
5217 { 5159 {
5218 // Entering a lookahead block. 5160 // Entering a lookahead block.
5219 // Save Stack Ptr, Input Pos. 5161 // Save Stack Ptr, Input Pos.
5220 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5162 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5221 fData[opValue] = fStack->size(); 5163 fData[opValue] = fStack->size();
5222 fData[opValue+1] = fp->fInputIdx; 5164 fData[opValue+1] = fp->fInputIdx;
5223 fActiveStart = fLookStart; // Set the match region change for 5165 fActiveStart = fLookStart; // Set the match region change for
5224 fActiveLimit = fLookLimit; // transparent bounds. 5166 fActiveLimit = fLookLimit; // transparent bounds.
5225 } 5167 }
5226 break; 5168 break;
5227 5169
5228 case URX_LA_END: 5170 case URX_LA_END:
5229 { 5171 {
5230 // Leaving a look-ahead block. 5172 // Leaving a look-ahead block.
5231 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5173 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5232 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5174 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5233 int32_t stackSize = fStack->size(); 5175 int32_t stackSize = fStack->size();
5234 int32_t newStackSize = (int32_t)fData[opValue]; 5176 int32_t newStackSize = (int32_t)fData[opValue];
5235 U_ASSERT(stackSize >= newStackSize); 5177 U_ASSERT(stackSize >= newStackSize);
5236 if (stackSize > newStackSize) { 5178 if (stackSize > newStackSize) {
5237 // Copy the current top frame back to the new (cut back) top frame. 5179 // Copy the current top frame back to the new (cut back) top frame.
5238 // This makes the capture groups from within the look-ahea d 5180 // This makes the capture groups from within the look-ahea d
5239 // expression available. 5181 // expression available.
5240 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame Size; 5182 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrame Size;
5241 int32_t i; 5183 int32_t i;
5242 for (i=0; i<fFrameSize; i++) { 5184 for (i=0; i<fFrameSize; i++) {
5243 newFP[i] = ((int64_t *)fp)[i]; 5185 newFP[i] = ((int64_t *)fp)[i];
5244 } 5186 }
5245 fp = (REStackFrame *)newFP; 5187 fp = (REStackFrame *)newFP;
5246 fStack->setSize(newStackSize); 5188 fStack->setSize(newStackSize);
5247 } 5189 }
5248 fp->fInputIdx = fData[opValue+1]; 5190 fp->fInputIdx = fData[opValue+1];
5249 5191
5250 // Restore the active region bounds in the input string; they ma y have 5192 // Restore the active region bounds in the input string; they ma y have
5251 // been changed because of transparent bounds on a Region. 5193 // been changed because of transparent bounds on a Region.
5252 fActiveStart = fRegionStart; 5194 fActiveStart = fRegionStart;
5253 fActiveLimit = fRegionLimit; 5195 fActiveLimit = fRegionLimit;
5254 } 5196 }
5255 break; 5197 break;
5256 5198
5257 case URX_ONECHAR_I: 5199 case URX_ONECHAR_I:
5258 if (fp->fInputIdx < fActiveLimit) { 5200 if (fp->fInputIdx < fActiveLimit) {
5259 UChar32 c; 5201 UChar32 c;
5260 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5202 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5261 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5203 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5262 break; 5204 break;
5263 } 5205 }
5264 } else { 5206 } else {
5265 fHitEnd = TRUE; 5207 fHitEnd = TRUE;
5266 } 5208 }
5267 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5209 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5268 break; 5210 break;
5269 5211
5270 case URX_STRING_I: 5212 case URX_STRING_I:
5271 // Case-insensitive test input against a literal string. 5213 // Case-insensitive test input against a literal string.
5272 // Strings require two slots in the compiled pattern, one for the 5214 // Strings require two slots in the compiled pattern, one for the
5273 // offset to the string text, and one for the length. 5215 // offset to the string text, and one for the length.
5274 // The compiled string has already been case folded. 5216 // The compiled string has already been case folded.
5275 { 5217 {
5276 const UChar *patternString = litText + opValue; 5218 const UChar *patternString = litText + opValue;
5277 5219
5278 op = (int32_t)pat[fp->fPatIdx]; 5220 op = (int32_t)pat[fp->fPatIdx];
5279 fp->fPatIdx++; 5221 fp->fPatIdx++;
5280 opType = URX_TYPE(op); 5222 opType = URX_TYPE(op);
5281 opValue = URX_VAL(op); 5223 opValue = URX_VAL(op);
5282 U_ASSERT(opType == URX_STRING_LEN); 5224 U_ASSERT(opType == URX_STRING_LEN);
5283 int32_t patternStringLen = opValue; // Length of the string fro m the pattern. 5225 int32_t patternStringLen = opValue; // Length of the string fro m the pattern.
5284 5226
5285 UChar32 cText; 5227 UChar32 cText;
5286 UChar32 cPattern; 5228 UChar32 cPattern;
5287 UBool success = TRUE; 5229 UBool success = TRUE;
5288 int32_t patternStringIdx = 0; 5230 int32_t patternStringIdx = 0;
5289 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); 5231 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5290 while (patternStringIdx < patternStringLen) { 5232 while (patternStringIdx < patternStringLen) {
5291 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); 5233 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5292 cText = inputIterator.next(); 5234 cText = inputIterator.next();
5293 if (cText != cPattern) { 5235 if (cText != cPattern) {
5294 success = FALSE; 5236 success = FALSE;
(...skipping 24 matching lines...) Expand all
5319 fData[opValue] = fStack->size(); 5261 fData[opValue] = fStack->size();
5320 fData[opValue+1] = fp->fInputIdx; 5262 fData[opValue+1] = fp->fInputIdx;
5321 // Init the variable containing the start index for attempted ma tches. 5263 // Init the variable containing the start index for attempted ma tches.
5322 fData[opValue+2] = -1; 5264 fData[opValue+2] = -1;
5323 // Save input string length, then reset to pin any matches to en d at 5265 // Save input string length, then reset to pin any matches to en d at
5324 // the current position. 5266 // the current position.
5325 fData[opValue+3] = fActiveLimit; 5267 fData[opValue+3] = fActiveLimit;
5326 fActiveLimit = fp->fInputIdx; 5268 fActiveLimit = fp->fInputIdx;
5327 } 5269 }
5328 break; 5270 break;
5329 5271
5330 5272
5331 case URX_LB_CONT: 5273 case URX_LB_CONT:
5332 { 5274 {
5333 // Positive Look-Behind, at top of loop checking for matches of LB expression 5275 // Positive Look-Behind, at top of loop checking for matches of LB expression
5334 // at all possible input starting positions. 5276 // at all possible input starting positions.
5335 5277
5336 // Fetch the min and max possible match lengths. They are the o perands 5278 // Fetch the min and max possible match lengths. They are the o perands
5337 // of this op in the pattern. 5279 // of this op in the pattern.
5338 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5280 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5339 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5281 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5340 U_ASSERT(minML <= maxML); 5282 U_ASSERT(minML <= maxML);
5341 U_ASSERT(minML >= 0); 5283 U_ASSERT(minML >= 0);
5342 5284
5343 // Fetch (from data) the last input index where a match was atte mpted. 5285 // Fetch (from data) the last input index where a match was atte mpted.
5344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5286 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5345 int64_t *lbStartIdx = &fData[opValue+2]; 5287 int64_t *lbStartIdx = &fData[opValue+2];
5346 if (*lbStartIdx < 0) { 5288 if (*lbStartIdx < 0) {
5347 // First time through loop. 5289 // First time through loop.
5348 *lbStartIdx = fp->fInputIdx - minML; 5290 *lbStartIdx = fp->fInputIdx - minML;
5349 } else { 5291 } else {
5350 // 2nd through nth time through the loop. 5292 // 2nd through nth time through the loop.
5351 // Back up start position for match by one. 5293 // Back up start position for match by one.
5352 if (*lbStartIdx == 0) { 5294 if (*lbStartIdx == 0) {
5353 (*lbStartIdx)--; 5295 (*lbStartIdx)--;
5354 } else { 5296 } else {
5355 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5297 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5356 } 5298 }
5357 } 5299 }
5358 5300
5359 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5301 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5360 // We have tried all potential match starting points without 5302 // We have tried all potential match starting points without
5361 // getting a match. Backtrack out, and out of the 5303 // getting a match. Backtrack out, and out of the
5362 // Look Behind altogether. 5304 // Look Behind altogether.
5363 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5305 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5364 int64_t restoreInputLen = fData[opValue+3]; 5306 int64_t restoreInputLen = fData[opValue+3];
5365 U_ASSERT(restoreInputLen >= fActiveLimit); 5307 U_ASSERT(restoreInputLen >= fActiveLimit);
5366 U_ASSERT(restoreInputLen <= fInputLength); 5308 U_ASSERT(restoreInputLen <= fInputLength);
5367 fActiveLimit = restoreInputLen; 5309 fActiveLimit = restoreInputLen;
5368 break; 5310 break;
5369 } 5311 }
5370 5312
5371 // Save state to this URX_LB_CONT op, so failure to match wil l repeat the loop. 5313 // Save state to this URX_LB_CONT op, so failure to match wil l repeat the loop.
5372 // (successful match will fall off the end of the loop.) 5314 // (successful match will fall off the end of the loop.)
5373 fp = StateSave(fp, fp->fPatIdx-3, status); 5315 fp = StateSave(fp, fp->fPatIdx-3, status);
5374 fp->fInputIdx = *lbStartIdx; 5316 fp->fInputIdx = *lbStartIdx;
5375 } 5317 }
5376 break; 5318 break;
5377 5319
5378 case URX_LB_END: 5320 case URX_LB_END:
5379 // End of a look-behind block, after a successful match. 5321 // End of a look-behind block, after a successful match.
5380 { 5322 {
5381 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5323 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5382 if (fp->fInputIdx != fActiveLimit) { 5324 if (fp->fInputIdx != fActiveLimit) {
5383 // The look-behind expression matched, but the match did no t 5325 // The look-behind expression matched, but the match did no t
5384 // extend all the way to the point that we are looking be hind from. 5326 // extend all the way to the point that we are looking be hind from.
5385 // FAIL out of here, which will take us back to the LB_CONT , which 5327 // FAIL out of here, which will take us back to the LB_CONT , which
5386 // will retry the match starting at another position or fail 5328 // will retry the match starting at another position or fail
5387 // the look-behind altogether, whichever is appropriate. 5329 // the look-behind altogether, whichever is appropriate.
5388 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5330 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5389 break; 5331 break;
5390 } 5332 }
5391 5333
5392 // Look-behind match is good. Restore the original input string length, 5334 // Look-behind match is good. Restore the original input string length,
5393 // which had been truncated to pin the end of the lookbehind m atch to the 5335 // which had been truncated to pin the end of the lookbehind m atch to the
5394 // position being looked-behind. 5336 // position being looked-behind.
5395 int64_t originalInputLen = fData[opValue+3]; 5337 int64_t originalInputLen = fData[opValue+3];
5396 U_ASSERT(originalInputLen >= fActiveLimit); 5338 U_ASSERT(originalInputLen >= fActiveLimit);
5397 U_ASSERT(originalInputLen <= fInputLength); 5339 U_ASSERT(originalInputLen <= fInputLength);
5398 fActiveLimit = originalInputLen; 5340 fActiveLimit = originalInputLen;
5399 } 5341 }
5400 break; 5342 break;
5401 5343
5402 5344
5403 case URX_LBN_CONT: 5345 case URX_LBN_CONT:
5404 { 5346 {
5405 // Negative Look-Behind, at top of loop checking for matches of LB expression 5347 // Negative Look-Behind, at top of loop checking for matches of LB expression
5406 // at all possible input starting positions. 5348 // at all possible input starting positions.
5407 5349
5408 // Fetch the extra parameters of this op. 5350 // Fetch the extra parameters of this op.
5409 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5351 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5410 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5352 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5411 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5353 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5412 continueLoc = URX_VAL(continueLoc); 5354 continueLoc = URX_VAL(continueLoc);
5413 U_ASSERT(minML <= maxML); 5355 U_ASSERT(minML <= maxML);
5414 U_ASSERT(minML >= 0); 5356 U_ASSERT(minML >= 0);
5415 U_ASSERT(continueLoc > fp->fPatIdx); 5357 U_ASSERT(continueLoc > fp->fPatIdx);
5416 5358
5417 // Fetch (from data) the last input index where a match was atte mpted. 5359 // Fetch (from data) the last input index where a match was atte mpted.
5418 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5360 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5419 int64_t *lbStartIdx = &fData[opValue+2]; 5361 int64_t *lbStartIdx = &fData[opValue+2];
5420 if (*lbStartIdx < 0) { 5362 if (*lbStartIdx < 0) {
5421 // First time through loop. 5363 // First time through loop.
5422 *lbStartIdx = fp->fInputIdx - minML; 5364 *lbStartIdx = fp->fInputIdx - minML;
5423 } else { 5365 } else {
5424 // 2nd through nth time through the loop. 5366 // 2nd through nth time through the loop.
5425 // Back up start position for match by one. 5367 // Back up start position for match by one.
5426 if (*lbStartIdx == 0) { 5368 if (*lbStartIdx == 0) {
5427 (*lbStartIdx)--; // Because U16_BACK is unsafe startin g at 0. 5369 (*lbStartIdx)--; // Because U16_BACK is unsafe startin g at 0.
5428 } else { 5370 } else {
5429 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5371 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5430 } 5372 }
5431 } 5373 }
5432 5374
5433 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5375 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5434 // We have tried all potential match starting points without 5376 // We have tried all potential match starting points without
5435 // getting a match, which means that the negative lookbehin d as 5377 // getting a match, which means that the negative lookbehin d as
5436 // a whole has succeeded. Jump forward to the continue loc ation 5378 // a whole has succeeded. Jump forward to the continue loc ation
5437 int64_t restoreInputLen = fData[opValue+3]; 5379 int64_t restoreInputLen = fData[opValue+3];
5438 U_ASSERT(restoreInputLen >= fActiveLimit); 5380 U_ASSERT(restoreInputLen >= fActiveLimit);
5439 U_ASSERT(restoreInputLen <= fInputLength); 5381 U_ASSERT(restoreInputLen <= fInputLength);
5440 fActiveLimit = restoreInputLen; 5382 fActiveLimit = restoreInputLen;
5441 fp->fPatIdx = continueLoc; 5383 fp->fPatIdx = continueLoc;
5442 break; 5384 break;
5443 } 5385 }
5444 5386
5445 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop. 5387 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5446 // (successful match will cause a FAIL out of the loop altogether.) 5388 // (successful match will cause a FAIL out of the loop altogether.)
5447 fp = StateSave(fp, fp->fPatIdx-4, status); 5389 fp = StateSave(fp, fp->fPatIdx-4, status);
5448 fp->fInputIdx = *lbStartIdx; 5390 fp->fInputIdx = *lbStartIdx;
5449 } 5391 }
5450 break; 5392 break;
5451 5393
5452 case URX_LBN_END: 5394 case URX_LBN_END:
5453 // End of a negative look-behind block, after a successful match. 5395 // End of a negative look-behind block, after a successful match.
5454 { 5396 {
5455 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5397 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5456 if (fp->fInputIdx != fActiveLimit) { 5398 if (fp->fInputIdx != fActiveLimit) {
5457 // The look-behind expression matched, but the match did no t 5399 // The look-behind expression matched, but the match did no t
5458 // extend all the way to the point that we are looking be hind from. 5400 // extend all the way to the point that we are looking be hind from.
5459 // FAIL out of here, which will take us back to the LBN_CONT, which 5401 // FAIL out of here, which will take us back to the LBN_CONT, which
5460 // will retry the match starting at another position or succeed 5402 // will retry the match starting at another position or succeed
5461 // the look-behind altogether, whichever is appropriate. 5403 // the look-behind altogether, whichever is appropriate.
5462 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5404 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5463 break; 5405 break;
5464 } 5406 }
5465 5407
5466 // Look-behind expression matched, which means look-behind test as 5408 // Look-behind expression matched, which means look-behind test as
5467 // a whole Fails 5409 // a whole Fails
5468 5410
5469 // Restore the orignal input string length, which had been tru ncated 5411 // Restore the orignal input string length, which had been tru ncated
5470 // inorder to pin the end of the lookbehind match 5412 // inorder to pin the end of the lookbehind match
5471 // to the position being looked-behind. 5413 // to the position being looked-behind.
5472 int64_t originalInputLen = fData[opValue+3]; 5414 int64_t originalInputLen = fData[opValue+3];
5473 U_ASSERT(originalInputLen >= fActiveLimit); 5415 U_ASSERT(originalInputLen >= fActiveLimit);
5474 U_ASSERT(originalInputLen <= fInputLength); 5416 U_ASSERT(originalInputLen <= fInputLength);
5475 fActiveLimit = originalInputLen; 5417 fActiveLimit = originalInputLen;
5476 5418
5477 // Restore original stack position, discarding any state saved 5419 // Restore original stack position, discarding any state saved
5478 // by the successful pattern match. 5420 // by the successful pattern match.
5479 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5421 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5480 int32_t newStackSize = (int32_t)fData[opValue]; 5422 int32_t newStackSize = (int32_t)fData[opValue];
5481 U_ASSERT(fStack->size() > newStackSize); 5423 U_ASSERT(fStack->size() > newStackSize);
5482 fStack->setSize(newStackSize); 5424 fStack->setSize(newStackSize);
5483 5425
5484 // FAIL, which will take control back to someplace 5426 // FAIL, which will take control back to someplace
5485 // prior to entering the look-behind test. 5427 // prior to entering the look-behind test.
5486 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5428 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5487 } 5429 }
5488 break; 5430 break;
5489 5431
5490 5432
5491 case URX_LOOP_SR_I: 5433 case URX_LOOP_SR_I:
5492 // Loop Initialization for the optimized implementation of 5434 // Loop Initialization for the optimized implementation of
5493 // [some character set]* 5435 // [some character set]*
5494 // This op scans through all matching input. 5436 // This op scans through all matching input.
5495 // The following LOOP_C op emulates stack unwinding if the followi ng pattern fails. 5437 // The following LOOP_C op emulates stack unwinding if the followi ng pattern fails.
5496 { 5438 {
5497 U_ASSERT(opValue > 0 && opValue < sets->size()); 5439 U_ASSERT(opValue > 0 && opValue < sets->size());
5498 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5440 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5499 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5441 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5500 5442
5501 // Loop through input, until either the input is exhausted or 5443 // Loop through input, until either the input is exhausted or
5502 // we reach a character that is not a member of the set. 5444 // we reach a character that is not a member of the set.
5503 int32_t ix = (int32_t)fp->fInputIdx; 5445 int32_t ix = (int32_t)fp->fInputIdx;
5504 for (;;) { 5446 for (;;) {
5505 if (ix >= fActiveLimit) { 5447 if (ix >= fActiveLimit) {
5506 fHitEnd = TRUE; 5448 fHitEnd = TRUE;
5507 break; 5449 break;
5508 } 5450 }
5509 UChar32 c; 5451 UChar32 c;
5510 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5452 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5511 if (c<256) { 5453 if (c<256) {
5512 if (s8->contains(c) == FALSE) { 5454 if (s8->contains(c) == FALSE) {
5513 U16_BACK_1(inputBuf, 0, ix); 5455 U16_BACK_1(inputBuf, 0, ix);
5514 break; 5456 break;
5515 } 5457 }
5516 } else { 5458 } else {
5517 if (s->contains(c) == FALSE) { 5459 if (s->contains(c) == FALSE) {
5518 U16_BACK_1(inputBuf, 0, ix); 5460 U16_BACK_1(inputBuf, 0, ix);
5519 break; 5461 break;
5520 } 5462 }
5521 } 5463 }
5522 } 5464 }
5523 5465
5524 // If there were no matching characters, skip over the loop alto gether. 5466 // If there were no matching characters, skip over the loop alto gether.
5525 // The loop doesn't run at all, a * op always succeeds. 5467 // The loop doesn't run at all, a * op always succeeds.
5526 if (ix == fp->fInputIdx) { 5468 if (ix == fp->fInputIdx) {
5527 fp->fPatIdx++; // skip the URX_LOOP_C op. 5469 fp->fPatIdx++; // skip the URX_LOOP_C op.
5528 break; 5470 break;
5529 } 5471 }
5530 5472
5531 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5473 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5532 // must follow. It's operand is the stack location 5474 // must follow. It's operand is the stack location
5533 // that holds the starting input index for the match of this [ set]* 5475 // that holds the starting input index for the match of this [ set]*
5534 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5476 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5535 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5477 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5536 int32_t stackLoc = URX_VAL(loopcOp); 5478 int32_t stackLoc = URX_VAL(loopcOp);
5537 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5479 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5538 fp->fExtra[stackLoc] = fp->fInputIdx; 5480 fp->fExtra[stackLoc] = fp->fInputIdx;
5539 fp->fInputIdx = ix; 5481 fp->fInputIdx = ix;
5540 5482
5541 // Save State to the URX_LOOP_C op that follows this one, 5483 // Save State to the URX_LOOP_C op that follows this one,
5542 // so that match failures in the following code will return to there. 5484 // so that match failures in the following code will return to there.
5543 // Then bump the pattern idx so the LOOP_C is skipped on the w ay out of here. 5485 // Then bump the pattern idx so the LOOP_C is skipped on the w ay out of here.
5544 fp = StateSave(fp, fp->fPatIdx, status); 5486 fp = StateSave(fp, fp->fPatIdx, status);
5545 fp->fPatIdx++; 5487 fp->fPatIdx++;
5546 } 5488 }
5547 break; 5489 break;
5548 5490
5549 5491
5550 case URX_LOOP_DOT_I: 5492 case URX_LOOP_DOT_I:
5551 // Loop Initialization for the optimized implementation of .* 5493 // Loop Initialization for the optimized implementation of .*
5552 // This op scans through all remaining input. 5494 // This op scans through all remaining input.
5553 // The following LOOP_C op emulates stack unwinding if the followi ng pattern fails. 5495 // The following LOOP_C op emulates stack unwinding if the followi ng pattern fails.
5554 { 5496 {
5555 // Loop through input until the input is exhausted (we reach an end-of-line) 5497 // Loop through input until the input is exhausted (we reach an end-of-line)
5556 // In DOTALL mode, we can just go straight to the end of the inp ut. 5498 // In DOTALL mode, we can just go straight to the end of the inp ut.
5557 int32_t ix; 5499 int32_t ix;
5558 if ((opValue & 1) == 1) { 5500 if ((opValue & 1) == 1) {
5559 // Dot-matches-All mode. Jump straight to the end of the st ring. 5501 // Dot-matches-All mode. Jump straight to the end of the st ring.
(...skipping 15 matching lines...) Expand all
5575 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 5517 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5576 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 5518 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5577 // char is a line ending. Put the input pos ba ck to the 5519 // char is a line ending. Put the input pos ba ck to the
5578 // line ending char, and exit the scanning lo op. 5520 // line ending char, and exit the scanning lo op.
5579 U16_BACK_1(inputBuf, 0, ix); 5521 U16_BACK_1(inputBuf, 0, ix);
5580 break; 5522 break;
5581 } 5523 }
5582 } 5524 }
5583 } 5525 }
5584 } 5526 }
5585 5527
5586 // If there were no matching characters, skip over the loop alto gether. 5528 // If there were no matching characters, skip over the loop alto gether.
5587 // The loop doesn't run at all, a * op always succeeds. 5529 // The loop doesn't run at all, a * op always succeeds.
5588 if (ix == fp->fInputIdx) { 5530 if (ix == fp->fInputIdx) {
5589 fp->fPatIdx++; // skip the URX_LOOP_C op. 5531 fp->fPatIdx++; // skip the URX_LOOP_C op.
5590 break; 5532 break;
5591 } 5533 }
5592 5534
5593 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5535 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5594 // must follow. It's operand is the stack location 5536 // must follow. It's operand is the stack location
5595 // that holds the starting input index for the match of this . * 5537 // that holds the starting input index for the match of this . *
5596 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5538 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5597 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5539 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5598 int32_t stackLoc = URX_VAL(loopcOp); 5540 int32_t stackLoc = URX_VAL(loopcOp);
5599 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5541 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5600 fp->fExtra[stackLoc] = fp->fInputIdx; 5542 fp->fExtra[stackLoc] = fp->fInputIdx;
5601 fp->fInputIdx = ix; 5543 fp->fInputIdx = ix;
5602 5544
5603 // Save State to the URX_LOOP_C op that follows this one, 5545 // Save State to the URX_LOOP_C op that follows this one,
5604 // so that match failures in the following code will return to there. 5546 // so that match failures in the following code will return to there.
5605 // Then bump the pattern idx so the LOOP_C is skipped on the w ay out of here. 5547 // Then bump the pattern idx so the LOOP_C is skipped on the w ay out of here.
5606 fp = StateSave(fp, fp->fPatIdx, status); 5548 fp = StateSave(fp, fp->fPatIdx, status);
5607 fp->fPatIdx++; 5549 fp->fPatIdx++;
5608 } 5550 }
5609 break; 5551 break;
5610 5552
5611 5553
5612 case URX_LOOP_C: 5554 case URX_LOOP_C:
5613 { 5555 {
5614 U_ASSERT(opValue>=0 && opValue<fFrameSize); 5556 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5615 backSearchIndex = (int32_t)fp->fExtra[opValue]; 5557 backSearchIndex = (int32_t)fp->fExtra[opValue];
5616 U_ASSERT(backSearchIndex <= fp->fInputIdx); 5558 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5617 if (backSearchIndex == fp->fInputIdx) { 5559 if (backSearchIndex == fp->fInputIdx) {
5618 // We've backed up the input idx to the point that the loop started. 5560 // We've backed up the input idx to the point that the loop started.
5619 // The loop is done. Leave here without saving state. 5561 // The loop is done. Leave here without saving state.
5620 // Subsequent failures won't come back here. 5562 // Subsequent failures won't come back here.
5621 break; 5563 break;
5622 } 5564 }
5623 // Set up for the next iteration of the loop, with input index 5565 // Set up for the next iteration of the loop, with input index
5624 // backed up by one from the last time through, 5566 // backed up by one from the last time through,
5625 // and a state save to this instruction in case the following code fails again. 5567 // and a state save to this instruction in case the following code fails again.
5626 // (We're going backwards because this loop emulates stack unw inding, not 5568 // (We're going backwards because this loop emulates stack unw inding, not
5627 // the initial scan forward.) 5569 // the initial scan forward.)
5628 U_ASSERT(fp->fInputIdx > 0); 5570 U_ASSERT(fp->fInputIdx > 0);
5629 UChar32 prevC; 5571 UChar32 prevC;
5630 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 5572 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5631 5573
5632 if (prevC == 0x0a && 5574 if (prevC == 0x0a &&
5633 fp->fInputIdx > backSearchIndex && 5575 fp->fInputIdx > backSearchIndex &&
5634 inputBuf[fp->fInputIdx-1] == 0x0d) { 5576 inputBuf[fp->fInputIdx-1] == 0x0d) {
5635 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 5577 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5636 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 5578 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5637 // .*, stepping back over CRLF pair. 5579 // .*, stepping back over CRLF pair.
5638 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 5580 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5639 } 5581 }
5640 } 5582 }
5641 5583
5642 5584
5643 fp = StateSave(fp, fp->fPatIdx-1, status); 5585 fp = StateSave(fp, fp->fPatIdx-1, status);
5644 } 5586 }
5645 break; 5587 break;
5646 5588
5647 5589
5648 5590
5649 default: 5591 default:
5650 // Trouble. The compiled pattern contains an entry with an 5592 // Trouble. The compiled pattern contains an entry with an
5651 // unrecognized type tag. 5593 // unrecognized type tag.
5652 U_ASSERT(FALSE); 5594 U_ASSERT(FALSE);
5653 } 5595 }
5654 5596
5655 if (U_FAILURE(status)) { 5597 if (U_FAILURE(status)) {
5656 isMatch = FALSE; 5598 isMatch = FALSE;
5657 break; 5599 break;
5658 } 5600 }
5659 } 5601 }
5660 5602
5661 breakFromLoop: 5603 breakFromLoop:
5662 fMatch = isMatch; 5604 fMatch = isMatch;
5663 if (isMatch) { 5605 if (isMatch) {
5664 fLastMatchEnd = fMatchEnd; 5606 fLastMatchEnd = fMatchEnd;
5665 fMatchStart = startIdx; 5607 fMatchStart = startIdx;
5666 fMatchEnd = fp->fInputIdx; 5608 fMatchEnd = fp->fInputIdx;
5667 if (fTraceDebug) { 5609 }
5668 REGEX_RUN_DEBUG_PRINTF(("Match. start=%ld end=%ld\n\n", fMatchSta rt, fMatchEnd)); 5610
5611 #ifdef REGEX_RUN_DEBUG
5612 if (fTraceDebug) {
5613 if (isMatch) {
5614 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5615 } else {
5616 printf("No match\n\n");
5669 } 5617 }
5670 } 5618 }
5671 else 5619 #endif
5672 { 5620
5673 if (fTraceDebug) {
5674 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5675 }
5676 }
5677
5678 fFrame = fp; // The active stack frame when the engine stoppe d. 5621 fFrame = fp; // The active stack frame when the engine stoppe d.
5679 // Contains the capture group results that we need to 5622 // Contains the capture group results that we need to
5680 // access later. 5623 // access later.
5681 5624
5682 return; 5625 return;
5683 } 5626 }
5684 5627
5685 5628
5686 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 5629 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5687 5630
5688 U_NAMESPACE_END 5631 U_NAMESPACE_END
5689 5632
5690 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 5633 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
OLDNEW
« no previous file with comments | « source/i18n/reldtfmt.cpp ('k') | source/i18n/repattrn.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698