OLD | NEW |
| (Empty) |
1 /* This is JavaScriptCore's variant of the PCRE library. While this library | |
2 started out as a copy of PCRE, many of the features of PCRE have been | |
3 removed. This library now supports only the regular expression features | |
4 required by the JavaScript language specification, and has only the functions | |
5 needed by JavaScriptCore and the rest of WebKit. | |
6 | |
7 Originally written by Philip Hazel | |
8 Copyright (c) 1997-2006 University of Cambridge | |
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved. | |
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org> | |
11 | |
12 ----------------------------------------------------------------------------- | |
13 Redistribution and use in source and binary forms, with or without | |
14 modification, are permitted provided that the following conditions are met: | |
15 | |
16 * Redistributions of source code must retain the above copyright notice, | |
17 this list of conditions and the following disclaimer. | |
18 | |
19 * Redistributions in binary form must reproduce the above copyright | |
20 notice, this list of conditions and the following disclaimer in the | |
21 documentation and/or other materials provided with the distribution. | |
22 | |
23 * Neither the name of the University of Cambridge nor the names of its | |
24 contributors may be used to endorse or promote products derived from | |
25 this software without specific prior written permission. | |
26 | |
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
37 POSSIBILITY OF SUCH DAMAGE. | |
38 ----------------------------------------------------------------------------- | |
39 */ | |
40 | |
41 /* This module contains jsRegExpExecute(), the externally visible function | |
42 that does pattern matching using an NFA algorithm, following the rules from | |
43 the JavaScript specification. There are also some supporting functions. */ | |
44 | |
45 #include "config.h" | |
46 | |
47 #include "pcre_internal.h" | |
48 | |
49 #include "ASCIICType.h" | |
50 | |
51 #include <ctype.h> | |
52 #include <limits.h> | |
53 #include <string.h> /* for memcpy */ | |
54 | |
55 #ifdef __GNUC__ | |
56 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
57 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
58 #endif | |
59 | |
60 /* Avoid warnings on Windows. */ | |
61 #undef min | |
62 #undef max | |
63 | |
64 namespace dart { namespace jscre { | |
65 | |
66 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
67 typedef int ReturnLocation; | |
68 #else | |
69 typedef void* ReturnLocation; | |
70 #endif | |
71 | |
72 /* Structure for building a chain of data for holding the values of | |
73 the subject pointer at the start of each bracket, used to detect when | |
74 an empty string has been matched by a bracket to break infinite loops. */ | |
75 struct BracketChainNode { | |
76 BracketChainNode* previousBracket; | |
77 const UChar* bracketStart; | |
78 }; | |
79 | |
80 struct MatchFrame { | |
81 ReturnLocation returnLocation; | |
82 struct MatchFrame* previousFrame; | |
83 | |
84 /* Function arguments that may change */ | |
85 struct { | |
86 const UChar* subjectPtr; | |
87 const unsigned char* instructionPtr; | |
88 int offsetTop; | |
89 BracketChainNode* bracketChain; | |
90 } args; | |
91 | |
92 | |
93 /* PCRE uses "fake" recursion built off of gotos, thus | |
94 stack-based local variables are not safe to use. Instead we have to | |
95 store local variables on the current MatchFrame. */ | |
96 struct { | |
97 const unsigned char* data; | |
98 const unsigned char* startOfRepeatingBracket; | |
99 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stas
h away a subjectPtr here for later compare | |
100 const unsigned char* instructionPtrAtStartOfOnce; | |
101 | |
102 int repeatOthercase; | |
103 | |
104 int ctype; | |
105 int fc; | |
106 int fi; | |
107 int length; | |
108 int max; | |
109 int number; | |
110 int offset; | |
111 int saveOffset1; | |
112 int saveOffset2; | |
113 int saveOffset3; | |
114 | |
115 BracketChainNode bracketChainNode; | |
116 } locals; | |
117 }; | |
118 | |
119 /* Structure for passing "static" information around between the functions | |
120 doing traditional NFA matching, so that they are thread-safe. */ | |
121 | |
122 struct MatchData { | |
123 int* offsetVector; /* Offset vector */ | |
124 int offsetEnd; /* One past the end */ | |
125 int offsetMax; /* The maximum usable for return data */ | |
126 bool offsetOverflow; /* Set if too many extractions */ | |
127 const UChar* startSubject; /* Start of the subject string */ | |
128 const UChar* endSubject; /* End of the subject string */ | |
129 const UChar* endMatchPtr; /* Subject position at end match */ | |
130 int endOffsetTop; /* Highwater mark at end of match */ | |
131 bool multiline; | |
132 bool ignoreCase; | |
133 }; | |
134 | |
135 /* The maximum remaining length of subject we are prepared to search for a | |
136 req_byte match. */ | |
137 | |
138 #define REQ_BYTE_MAX 1000 | |
139 | |
140 /* The below limit restricts the number of "recursive" match calls in order to | |
141 avoid spending exponential time on complex regular expressions. */ | |
142 | |
143 static const unsigned matchLimit = 100000; | |
144 | |
145 #ifdef DEBUG | |
146 /************************************************* | |
147 * Debugging function to print chars * | |
148 *************************************************/ | |
149 | |
150 /* Print a sequence of chars in printable format, stopping at the end of the | |
151 subject if the requested. | |
152 | |
153 Arguments: | |
154 p points to characters | |
155 length number to print | |
156 isSubject true if printing from within md.startSubject | |
157 md pointer to matching data block, if isSubject is true | |
158 */ | |
159 | |
160 static void pchars(const UChar* p, int length, bool isSubject, const MatchData&
md) | |
161 { | |
162 if (isSubject && length > md.endSubject - p) | |
163 length = md.endSubject - p; | |
164 while (length-- > 0) { | |
165 int c; | |
166 if (isprint(c = *(p++))) | |
167 printf("%c", c); | |
168 else if (c < 256) | |
169 printf("\\x%02x", c); | |
170 else | |
171 printf("\\x{%x}", c); | |
172 } | |
173 } | |
174 #endif | |
175 | |
176 /************************************************* | |
177 * Match a back-reference * | |
178 *************************************************/ | |
179 | |
180 /* If a back reference hasn't been set, the length that is passed is greater | |
181 than the number of characters left in the string, so the match fails. | |
182 | |
183 Arguments: | |
184 offset index into the offset vector | |
185 subjectPtr points into the subject | |
186 length length to be matched | |
187 md points to match data block | |
188 | |
189 Returns: true if matched | |
190 */ | |
191 | |
192 static bool matchRef(int offset, const UChar* subjectPtr, int length, const Matc
hData& md) | |
193 { | |
194 const UChar* p = md.startSubject + md.offsetVector[offset]; | |
195 | |
196 #ifdef DEBUG | |
197 if (subjectPtr >= md.endSubject) | |
198 printf("matching subject <null>"); | |
199 else { | |
200 printf("matching subject "); | |
201 pchars(subjectPtr, length, true, md); | |
202 } | |
203 printf(" against backref "); | |
204 pchars(p, length, false, md); | |
205 printf("\n"); | |
206 #endif | |
207 | |
208 /* Always fail if not enough characters left */ | |
209 | |
210 if (length > md.endSubject - subjectPtr) | |
211 return false; | |
212 | |
213 /* Separate the caselesss case for speed */ | |
214 | |
215 if (md.ignoreCase) { | |
216 while (length-- > 0) { | |
217 UChar c = *p++; | |
218 int othercase = kjs_pcre_ucp_othercase(c); | |
219 UChar d = *subjectPtr++; | |
220 if (c != d && othercase != d) | |
221 return false; | |
222 } | |
223 } | |
224 else { | |
225 while (length-- > 0) | |
226 if (*p++ != *subjectPtr++) | |
227 return false; | |
228 } | |
229 | |
230 return true; | |
231 } | |
232 | |
233 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
234 | |
235 /* Use numbered labels and switch statement at the bottom of the match function.
*/ | |
236 | |
237 #define RMATCH_WHERE(num) num | |
238 #define RRETURN_LABEL RRETURN_SWITCH | |
239 | |
240 #else | |
241 | |
242 /* Use GCC's computed goto extension. */ | |
243 | |
244 /* For one test case this is more than 40% faster than the switch statement. | |
245 We could avoid the use of the num argument entirely by using local labels, | |
246 but using it for the GCC case as well as the non-GCC case allows us to share | |
247 a bit more code and notice if we use conflicting numbers.*/ | |
248 | |
249 #define RMATCH_WHERE(num) &&RRETURN_##num | |
250 #define RRETURN_LABEL *stack.currentFrame->returnLocation | |
251 | |
252 #endif | |
253 | |
254 #define RECURSIVE_MATCH_COMMON(num) \ | |
255 goto RECURSE;\ | |
256 RRETURN_##num: \ | |
257 stack.popCurrentFrame(); | |
258 | |
259 #define RECURSIVE_MATCH(num, ra, rb) \ | |
260 do { \ | |
261 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
262 RECURSIVE_MATCH_COMMON(num) \ | |
263 } while (0) | |
264 | |
265 #define RECURSIVE_MATCH_STARTNG_NEW_GROUP(num, ra, rb) \ | |
266 do { \ | |
267 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ | |
268 startNewGroup(stack.currentFrame); \ | |
269 RECURSIVE_MATCH_COMMON(num) \ | |
270 } while (0) | |
271 | |
272 #define RRETURN goto RRETURN_LABEL | |
273 | |
274 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) | |
275 | |
276 /************************************************* | |
277 * Match from current position * | |
278 *************************************************/ | |
279 | |
280 /* On entry instructionPtr points to the first opcode, and subjectPtr to the fir
st character | |
281 in the subject string, while substringStart holds the value of subjectPtr at the
start of the | |
282 last bracketed group - used for breaking infinite loops matching zero-length | |
283 strings. This function is called recursively in many circumstances. Whenever it | |
284 returns a negative (error) response, the outer match() call must also return the | |
285 same response. | |
286 | |
287 Arguments: | |
288 subjectPtr pointer in subject | |
289 instructionPtr position in code | |
290 offsetTop current top pointer | |
291 md pointer to "static" info for the match | |
292 | |
293 Returns: 1 if matched ) these values are >= 0 | |
294 0 if failed to match ) | |
295 a negative error value if aborted by an error condition | |
296 (e.g. stopped by repeated call or recursion limit) | |
297 */ | |
298 | |
299 static const unsigned FRAMES_ON_STACK = 16; | |
300 | |
301 struct MatchStack { | |
302 MatchStack() | |
303 : framesEnd(frames + FRAMES_ON_STACK) | |
304 , currentFrame(frames) | |
305 , size(1) // match() creates accesses the first frame w/o calling pushNe
wFrame | |
306 { | |
307 ASSERT((sizeof(frames) / sizeof(frames[0])) == FRAMES_ON_STACK); | |
308 } | |
309 | |
310 MatchFrame frames[FRAMES_ON_STACK]; | |
311 MatchFrame* framesEnd; | |
312 MatchFrame* currentFrame; | |
313 unsigned size; | |
314 | |
315 inline bool canUseStackBufferForNextFrame() | |
316 { | |
317 return size < FRAMES_ON_STACK; | |
318 } | |
319 | |
320 inline MatchFrame* allocateNextFrame() | |
321 { | |
322 if (canUseStackBufferForNextFrame()) | |
323 return currentFrame + 1; | |
324 return new MatchFrame; | |
325 } | |
326 | |
327 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNo
de* bracketChain, ReturnLocation returnLocation) | |
328 { | |
329 MatchFrame* newframe = allocateNextFrame(); | |
330 newframe->previousFrame = currentFrame; | |
331 | |
332 newframe->args.subjectPtr = currentFrame->args.subjectPtr; | |
333 newframe->args.offsetTop = currentFrame->args.offsetTop; | |
334 newframe->args.instructionPtr = instructionPtr; | |
335 newframe->args.bracketChain = bracketChain; | |
336 newframe->returnLocation = returnLocation; | |
337 size++; | |
338 | |
339 currentFrame = newframe; | |
340 } | |
341 | |
342 inline void popCurrentFrame() | |
343 { | |
344 MatchFrame* oldFrame = currentFrame; | |
345 currentFrame = currentFrame->previousFrame; | |
346 if (size > FRAMES_ON_STACK) | |
347 delete oldFrame; | |
348 size--; | |
349 } | |
350 | |
351 void popAllFrames() | |
352 { | |
353 while (size) | |
354 popCurrentFrame(); | |
355 } | |
356 }; | |
357 | |
358 static int matchError(int errorCode, MatchStack& stack) | |
359 { | |
360 stack.popAllFrames(); | |
361 return errorCode; | |
362 } | |
363 | |
364 /* Get the next UTF-8 character, not advancing the pointer, incrementing length | |
365 if there are extra bytes. This is called when we know we are in UTF-8 mode. */ | |
366 | |
367 static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* su
bjectPtr, int& len) | |
368 { | |
369 c = *subjectPtr; | |
370 if ((c & 0xc0) == 0xc0) { | |
371 int gcaa = kjs_pcre_utf8_table4[c & 0x3f]; /* Number of additional byte
s */ | |
372 int gcss = 6 * gcaa; | |
373 c = (c & kjs_pcre_utf8_table3[gcaa]) << gcss; | |
374 for (int gcii = 1; gcii <= gcaa; gcii++) { | |
375 gcss -= 6; | |
376 c |= (subjectPtr[gcii] & 0x3f) << gcss; | |
377 } | |
378 len += gcaa; | |
379 } | |
380 } | |
381 | |
382 static inline void startNewGroup(MatchFrame* currentFrame) | |
383 { | |
384 /* At the start of a bracketed group, add the current subject pointer to the | |
385 stack of such pointers, to be re-instated at the end of the group when we h
it | |
386 the closing ket. When match() is called in other circumstances, we don't ad
d to | |
387 this stack. */ | |
388 | |
389 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.b
racketChain; | |
390 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subj
ectPtr; | |
391 currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; | |
392 } | |
393 | |
394 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for
"greedy" to be less confusing | |
395 static inline void repeatInformationFromInstructionOffset(short instructionOffse
t, bool& minimize, int& minimumRepeats, int& maximumRepeats) | |
396 { | |
397 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_
NOTSTAR | |
398 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0
}; | |
399 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX,
INT_MAX, INT_MAX, 1, 1 }; | |
400 | |
401 ASSERT(instructionOffset >= 0); | |
402 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); | |
403 | |
404 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, M
inimizeInstruction, Instruction2, MinimizeInstruction2 | |
405 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; | |
406 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; | |
407 } | |
408 | |
409 static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, i
nt offsetTop, MatchData& md) | |
410 { | |
411 bool isMatch = false; | |
412 int min; | |
413 bool minimize = false; /* Initialization not really needed, but some compile
rs think so. */ | |
414 unsigned matchCount = 0; | |
415 | |
416 MatchStack stack; | |
417 | |
418 /* The opcode jump table. */ | |
419 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
420 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, | |
421 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY)
}; | |
422 #undef EMIT_JUMP_TABLE_ENTRY | |
423 #endif | |
424 | |
425 /* One-time setup of the opcode jump table. */ | |
426 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
427 for (int i = 255; !opcodeJumpTable[i]; i--) | |
428 opcodeJumpTable[i] = &&CAPTURING_BRACKET; | |
429 #endif | |
430 | |
431 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
432 // Shark shows this as a hot line | |
433 // Using a static const here makes this line disappear, but makes later acce
ss hotter (not sure why) | |
434 stack.currentFrame->returnLocation = &&RETURN; | |
435 #else | |
436 stack.currentFrame->returnLocation = 0; | |
437 #endif | |
438 stack.currentFrame->args.subjectPtr = subjectPtr; | |
439 stack.currentFrame->args.instructionPtr = instructionPtr; | |
440 stack.currentFrame->args.offsetTop = offsetTop; | |
441 stack.currentFrame->args.bracketChain = 0; | |
442 startNewGroup(stack.currentFrame); | |
443 | |
444 /* This is where control jumps back to to effect "recursion" */ | |
445 | |
446 RECURSE: | |
447 if (++matchCount > matchLimit) | |
448 return matchError(JSRegExpErrorHitLimit, stack); | |
449 | |
450 /* Now start processing the operations. */ | |
451 | |
452 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
453 while (true) | |
454 #endif | |
455 { | |
456 | |
457 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
458 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode | |
459 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionP
tr] | |
460 #else | |
461 #define BEGIN_OPCODE(opcode) case OP_##opcode | |
462 #define NEXT_OPCODE continue | |
463 #endif | |
464 | |
465 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
466 NEXT_OPCODE; | |
467 #else | |
468 switch (*stack.currentFrame->args.instructionPtr) | |
469 #endif | |
470 { | |
471 /* Non-capturing bracket: optimized */ | |
472 | |
473 BEGIN_OPCODE(BRA): | |
474 NON_CAPTURING_BRACKET: | |
475 DPRINTF(("start bracket 0\n")); | |
476 do { | |
477 RECURSIVE_MATCH_STARTNG_NEW_GROUP(2, stack.currentFrame->arg
s.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
478 if (isMatch) | |
479 RRETURN; | |
480 stack.currentFrame->args.instructionPtr += getLinkValue(stac
k.currentFrame->args.instructionPtr + 1); | |
481 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
482 DPRINTF(("bracket 0 failed\n")); | |
483 RRETURN; | |
484 | |
485 /* Skip over large extraction number data if encountered. */ | |
486 | |
487 BEGIN_OPCODE(BRANUMBER): | |
488 stack.currentFrame->args.instructionPtr += 3; | |
489 NEXT_OPCODE; | |
490 | |
491 /* End of the pattern. */ | |
492 | |
493 BEGIN_OPCODE(END): | |
494 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /
* Record where we ended */ | |
495 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and h
ow many extracts were taken */ | |
496 isMatch = true; | |
497 RRETURN; | |
498 | |
499 /* Assertion brackets. Check the alternative branches in turn - the | |
500 matching won't pass the KET for an assertion. If any one branch mat
ches, | |
501 the assertion is true. Lookbehind assertions have an OP_REVERSE ite
m at the | |
502 start of each branch to move the current point backwards, so the co
de at | |
503 this level is identical to the lookahead case. */ | |
504 | |
505 BEGIN_OPCODE(ASSERT): | |
506 do { | |
507 RECURSIVE_MATCH_STARTNG_NEW_GROUP(6, stack.currentFrame->arg
s.instructionPtr + 1 + LINK_SIZE, NULL); | |
508 if (isMatch) | |
509 break; | |
510 stack.currentFrame->args.instructionPtr += getLinkValue(stac
k.currentFrame->args.instructionPtr + 1); | |
511 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
512 if (*stack.currentFrame->args.instructionPtr == OP_KET) | |
513 RRETURN_NO_MATCH; | |
514 | |
515 /* Continue from after the assertion, updating the offsets high
water | |
516 mark, since extracts may have been taken during the assertion.
*/ | |
517 | |
518 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
519 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
520 stack.currentFrame->args.offsetTop = md.endOffsetTop; | |
521 NEXT_OPCODE; | |
522 | |
523 /* Negative assertion: all branches must fail to match */ | |
524 | |
525 BEGIN_OPCODE(ASSERT_NOT): | |
526 do { | |
527 RECURSIVE_MATCH_STARTNG_NEW_GROUP(7, stack.currentFrame->arg
s.instructionPtr + 1 + LINK_SIZE, NULL); | |
528 if (isMatch) | |
529 RRETURN_NO_MATCH; | |
530 stack.currentFrame->args.instructionPtr += getLinkValue(stac
k.currentFrame->args.instructionPtr + 1); | |
531 } while (*stack.currentFrame->args.instructionPtr == OP_ALT); | |
532 | |
533 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
534 NEXT_OPCODE; | |
535 | |
536 /* An alternation is the end of a branch; scan along to find the end
of the | |
537 bracketed group and go to there. */ | |
538 | |
539 BEGIN_OPCODE(ALT): | |
540 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr); | |
541 NEXT_OPCODE; | |
542 | |
543 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicat
ing | |
544 that it may occur zero times. It may repeat infinitely, or not at a
ll - | |
545 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upp
er | |
546 repeat limits are compiled as a number of copies, with the optional
ones | |
547 preceded by BRAZERO or BRAMINZERO. */ | |
548 | |
549 BEGIN_OPCODE(BRAZERO): { | |
550 stack.currentFrame->locals.startOfRepeatingBracket = stack.curre
ntFrame->args.instructionPtr + 1; | |
551 RECURSIVE_MATCH_STARTNG_NEW_GROUP(14, stack.currentFrame->locals
.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); | |
552 if (isMatch) | |
553 RRETURN; | |
554 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatin
gBracket); | |
555 stack.currentFrame->args.instructionPtr = stack.currentFrame->lo
cals.startOfRepeatingBracket + 1 + LINK_SIZE; | |
556 NEXT_OPCODE; | |
557 } | |
558 | |
559 BEGIN_OPCODE(BRAMINZERO): { | |
560 stack.currentFrame->locals.startOfRepeatingBracket = stack.curre
ntFrame->args.instructionPtr + 1; | |
561 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatin
gBracket); | |
562 RECURSIVE_MATCH_STARTNG_NEW_GROUP(15, stack.currentFrame->locals
.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain)
; | |
563 if (isMatch) | |
564 RRETURN; | |
565 stack.currentFrame->args.instructionPtr++; | |
566 NEXT_OPCODE; | |
567 } | |
568 | |
569 /* End of a group, repeated or non-repeating. If we are at the end o
f | |
570 an assertion "group", stop matching and return 1, but record the | |
571 current high water mark for use by positive assertions. Do this als
o | |
572 for the "once" (not-backup up) groups. */ | |
573 | |
574 BEGIN_OPCODE(KET): | |
575 BEGIN_OPCODE(KETRMIN): | |
576 BEGIN_OPCODE(KETRMAX): | |
577 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.c
urrentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instruc
tionPtr + 1); | |
578 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stac
k.currentFrame->args.bracketChain->bracketStart; | |
579 | |
580 /* Back up the stack of bracket start pointers. */ | |
581 | |
582 stack.currentFrame->args.bracketChain = stack.currentFrame->args
.bracketChain->previousBracket; | |
583 | |
584 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == O
P_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT
_NOT) { | |
585 md.endOffsetTop = stack.currentFrame->args.offsetTop; | |
586 isMatch = true; | |
587 RRETURN; | |
588 } | |
589 | |
590 /* In all other cases except a conditional group we have to chec
k the | |
591 group number back at the start and if necessary complete handli
ng an | |
592 extraction by setting the offsets and bumping the high water ma
rk. */ | |
593 | |
594 stack.currentFrame->locals.number = *stack.currentFrame->locals.
instructionPtrAtStartOfOnce - OP_BRA; | |
595 | |
596 /* For extended extraction brackets (large number), we have to f
ish out | |
597 the number from a dummy opcode at the start. */ | |
598 | |
599 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
600 stack.currentFrame->locals.number = get2ByteValue(stack.curr
entFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); | |
601 stack.currentFrame->locals.offset = stack.currentFrame->locals.n
umber << 1; | |
602 | |
603 #ifdef DEBUG | |
604 printf("end bracket %d", stack.currentFrame->locals.number); | |
605 printf("\n"); | |
606 #endif | |
607 | |
608 /* Test for a numbered group. This includes groups called as a r
esult | |
609 of recursion. Note that whole-pattern recursion is coded as a r
ecurse | |
610 into group 0, so it won't be picked up here. Instead, we catch
it when | |
611 the OP_END is reached. */ | |
612 | |
613 if (stack.currentFrame->locals.number > 0) { | |
614 if (stack.currentFrame->locals.offset >= md.offsetMax) | |
615 md.offsetOverflow = true; | |
616 else { | |
617 md.offsetVector[stack.currentFrame->locals.offset] = | |
618 md.offsetVector[md.offsetEnd - stack.currentFrame->local
s.number]; | |
619 md.offsetVector[stack.currentFrame->locals.offset+1] = s
tack.currentFrame->args.subjectPtr - md.startSubject; | |
620 if (stack.currentFrame->args.offsetTop <= stack.currentF
rame->locals.offset) | |
621 stack.currentFrame->args.offsetTop = stack.currentFr
ame->locals.offset + 2; | |
622 } | |
623 } | |
624 | |
625 /* For a non-repeating ket, just continue at this level. This al
so | |
626 happens for a repeating ket if no characters were matched in th
e group. | |
627 This is the forcible breaking of infinite loops as implemented
in Perl | |
628 5.005. If there is an options reset, it will get obeyed in the
normal | |
629 course of events. */ | |
630 | |
631 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.
currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfI
nstruction) { | |
632 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; | |
633 NEXT_OPCODE; | |
634 } | |
635 | |
636 /* The repeating kets try the rest of the pattern or restart fro
m the | |
637 preceding bracket, in the appropriate order. */ | |
638 | |
639 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { | |
640 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr
+ 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
641 if (isMatch) | |
642 RRETURN; | |
643 RECURSIVE_MATCH_STARTNG_NEW_GROUP(17, stack.currentFrame->lo
cals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); | |
644 if (isMatch) | |
645 RRETURN; | |
646 } else { /* OP_KETRMAX */ | |
647 RECURSIVE_MATCH_STARTNG_NEW_GROUP(18, stack.currentFrame->lo
cals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); | |
648 if (isMatch) | |
649 RRETURN; | |
650 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr
+ 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
651 if (isMatch) | |
652 RRETURN; | |
653 } | |
654 RRETURN; | |
655 | |
656 /* Start of subject. */ | |
657 | |
658 BEGIN_OPCODE(CIRC): | |
659 if (stack.currentFrame->args.subjectPtr != md.startSubject) | |
660 RRETURN_NO_MATCH; | |
661 stack.currentFrame->args.instructionPtr++; | |
662 NEXT_OPCODE; | |
663 | |
664 /* After internal newline if multiline. */ | |
665 | |
666 BEGIN_OPCODE(BOL): | |
667 if (stack.currentFrame->args.subjectPtr != md.startSubject && !i
sNewline(stack.currentFrame->args.subjectPtr[-1])) | |
668 RRETURN_NO_MATCH; | |
669 stack.currentFrame->args.instructionPtr++; | |
670 NEXT_OPCODE; | |
671 | |
672 /* End of subject. */ | |
673 | |
674 BEGIN_OPCODE(DOLL): | |
675 if (stack.currentFrame->args.subjectPtr < md.endSubject) | |
676 RRETURN_NO_MATCH; | |
677 stack.currentFrame->args.instructionPtr++; | |
678 NEXT_OPCODE; | |
679 | |
680 /* Before internal newline if multiline. */ | |
681 | |
682 BEGIN_OPCODE(EOL): | |
683 if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNe
wline(*stack.currentFrame->args.subjectPtr)) | |
684 RRETURN_NO_MATCH; | |
685 stack.currentFrame->args.instructionPtr++; | |
686 NEXT_OPCODE; | |
687 | |
688 /* Word boundary assertions */ | |
689 | |
690 BEGIN_OPCODE(NOT_WORD_BOUNDARY): | |
691 BEGIN_OPCODE(WORD_BOUNDARY): { | |
692 bool currentCharIsWordChar = false; | |
693 bool previousCharIsWordChar = false; | |
694 | |
695 if (stack.currentFrame->args.subjectPtr > md.startSubject) | |
696 previousCharIsWordChar = isWordChar(stack.currentFrame->args
.subjectPtr[-1]); | |
697 if (stack.currentFrame->args.subjectPtr < md.endSubject) | |
698 currentCharIsWordChar = isWordChar(*stack.currentFrame->args
.subjectPtr); | |
699 | |
700 /* Now see if the situation is what we want */ | |
701 bool wordBoundaryDesired = (*stack.currentFrame->args.instructio
nPtr++ == OP_WORD_BOUNDARY); | |
702 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharI
sWordChar : currentCharIsWordChar != previousCharIsWordChar) | |
703 RRETURN_NO_MATCH; | |
704 NEXT_OPCODE; | |
705 } | |
706 | |
707 /* Match a single character type; inline for speed */ | |
708 | |
709 BEGIN_OPCODE(NOT_NEWLINE): | |
710 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
711 RRETURN_NO_MATCH; | |
712 if (isNewline(*stack.currentFrame->args.subjectPtr++)) | |
713 RRETURN_NO_MATCH; | |
714 stack.currentFrame->args.instructionPtr++; | |
715 NEXT_OPCODE; | |
716 | |
717 BEGIN_OPCODE(NOT_DIGIT): | |
718 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
719 RRETURN_NO_MATCH; | |
720 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
721 RRETURN_NO_MATCH; | |
722 stack.currentFrame->args.instructionPtr++; | |
723 NEXT_OPCODE; | |
724 | |
725 BEGIN_OPCODE(DIGIT): | |
726 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
727 RRETURN_NO_MATCH; | |
728 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++)) | |
729 RRETURN_NO_MATCH; | |
730 stack.currentFrame->args.instructionPtr++; | |
731 NEXT_OPCODE; | |
732 | |
733 BEGIN_OPCODE(NOT_WHITESPACE): | |
734 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
735 RRETURN_NO_MATCH; | |
736 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
737 RRETURN_NO_MATCH; | |
738 stack.currentFrame->args.instructionPtr++; | |
739 NEXT_OPCODE; | |
740 | |
741 BEGIN_OPCODE(WHITESPACE): | |
742 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
743 RRETURN_NO_MATCH; | |
744 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++)) | |
745 RRETURN_NO_MATCH; | |
746 stack.currentFrame->args.instructionPtr++; | |
747 NEXT_OPCODE; | |
748 | |
749 BEGIN_OPCODE(NOT_WORDCHAR): | |
750 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
751 RRETURN_NO_MATCH; | |
752 if (isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
753 RRETURN_NO_MATCH; | |
754 stack.currentFrame->args.instructionPtr++; | |
755 NEXT_OPCODE; | |
756 | |
757 BEGIN_OPCODE(WORDCHAR): | |
758 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
759 RRETURN_NO_MATCH; | |
760 if (!isWordChar(*stack.currentFrame->args.subjectPtr++)) | |
761 RRETURN_NO_MATCH; | |
762 stack.currentFrame->args.instructionPtr++; | |
763 NEXT_OPCODE; | |
764 | |
765 /* Match a back reference, possibly repeatedly. Look past the end of
the | |
766 item to see if there is repeat information following. The code is s
imilar | |
767 to that for character classes, but repeated for efficiency. Then ob
ey | |
768 similar code to character type repeats - written out again for spee
d. | |
769 However, if the referenced string is the empty string, always treat | |
770 it as matched, any number of times (otherwise there could be infini
te | |
771 loops). */ | |
772 | |
773 BEGIN_OPCODE(REF): | |
774 stack.currentFrame->locals.offset = get2ByteValue(stack.currentF
rame->args.instructionPtr + 1) << 1; /* Doubled ref number */ | |
775 stack.currentFrame->args.instructionPtr += 3;
/* Advance past item */ | |
776 | |
777 /* If the reference is unset, set the length to be longer than t
he amount | |
778 of subject left; this ensures that every attempt at a match fai
ls. We | |
779 can't just fail here, because of the possibility of quantifiers
with zero | |
780 minima. */ | |
781 | |
782 if (stack.currentFrame->locals.offset >= stack.currentFrame->arg
s.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) | |
783 stack.currentFrame->locals.length = 0; | |
784 else | |
785 stack.currentFrame->locals.length = md.offsetVector[stack.cu
rrentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset
]; | |
786 | |
787 /* Set up for repetition, or handle the non-repeated case */ | |
788 | |
789 switch (*stack.currentFrame->args.instructionPtr) { | |
790 case OP_CRSTAR: | |
791 case OP_CRMINSTAR: | |
792 case OP_CRPLUS: | |
793 case OP_CRMINPLUS: | |
794 case OP_CRQUERY: | |
795 case OP_CRMINQUERY: | |
796 repeatInformationFromInstructionOffset(*stack.currentFra
me->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals
.max); | |
797 break; | |
798 | |
799 case OP_CRRANGE: | |
800 case OP_CRMINRANGE: | |
801 minimize = (*stack.currentFrame->args.instructionPtr ==
OP_CRMINRANGE); | |
802 min = get2ByteValue(stack.currentFrame->args.instruction
Ptr + 1); | |
803 stack.currentFrame->locals.max = get2ByteValue(stack.cur
rentFrame->args.instructionPtr + 3); | |
804 if (stack.currentFrame->locals.max == 0) | |
805 stack.currentFrame->locals.max = INT_MAX; | |
806 stack.currentFrame->args.instructionPtr += 5; | |
807 break; | |
808 | |
809 default: /* No repeat follows */ | |
810 if (!matchRef(stack.currentFrame->locals.offset, stack.c
urrentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
811 RRETURN_NO_MATCH; | |
812 stack.currentFrame->args.subjectPtr += stack.currentFram
e->locals.length; | |
813 NEXT_OPCODE; | |
814 } | |
815 | |
816 /* If the length of the reference is zero, just continue with th
e | |
817 main loop. */ | |
818 | |
819 if (stack.currentFrame->locals.length == 0) | |
820 NEXT_OPCODE; | |
821 | |
822 /* First, ensure the minimum number of matches are present. */ | |
823 | |
824 for (int i = 1; i <= min; i++) { | |
825 if (!matchRef(stack.currentFrame->locals.offset, stack.curre
ntFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
826 RRETURN_NO_MATCH; | |
827 stack.currentFrame->args.subjectPtr += stack.currentFrame->l
ocals.length; | |
828 } | |
829 | |
830 /* If min = max, continue at the same level without recursion. | |
831 They are not both allowed to be zero. */ | |
832 | |
833 if (min == stack.currentFrame->locals.max) | |
834 NEXT_OPCODE; | |
835 | |
836 /* If minimizing, keep trying and advancing the pointer */ | |
837 | |
838 if (minimize) { | |
839 for (stack.currentFrame->locals.fi = min;; stack.currentFram
e->locals.fi++) { | |
840 RECURSIVE_MATCH(20, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
841 if (isMatch) | |
842 RRETURN; | |
843 if (stack.currentFrame->locals.fi >= stack.currentFrame-
>locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->
args.subjectPtr, stack.currentFrame->locals.length, md)) | |
844 RRETURN; | |
845 stack.currentFrame->args.subjectPtr += stack.currentFram
e->locals.length; | |
846 } | |
847 /* Control never reaches here */ | |
848 } | |
849 | |
850 /* If maximizing, find the longest string and work backwards */ | |
851 | |
852 else { | |
853 stack.currentFrame->locals.subjectPtrAtStartOfInstruction =
stack.currentFrame->args.subjectPtr; | |
854 for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
855 if (!matchRef(stack.currentFrame->locals.offset, stack.c
urrentFrame->args.subjectPtr, stack.currentFrame->locals.length, md)) | |
856 break; | |
857 stack.currentFrame->args.subjectPtr += stack.currentFram
e->locals.length; | |
858 } | |
859 while (stack.currentFrame->args.subjectPtr >= stack.currentF
rame->locals.subjectPtrAtStartOfInstruction) { | |
860 RECURSIVE_MATCH(21, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
861 if (isMatch) | |
862 RRETURN; | |
863 stack.currentFrame->args.subjectPtr -= stack.currentFram
e->locals.length; | |
864 } | |
865 RRETURN_NO_MATCH; | |
866 } | |
867 /* Control never reaches here */ | |
868 | |
869 /* Match a bit-mapped character class, possibly repeatedly. This op
code is | |
870 used when all the characters in the class have values in the range
0-255, | |
871 and either the matching is caseful, or the characters are in the ra
nge | |
872 0-127 when UTF-8 processing is enabled. The only difference between | |
873 OP_CLASS and OP_NCLASS occurs when a data character outside the ran
ge is | |
874 encountered. | |
875 | |
876 First, look past the end of the item to see if there is repeat info
rmation | |
877 following. Then obey similar code to character type repeats - writt
en out | |
878 again for speed. */ | |
879 | |
880 BEGIN_OPCODE(NCLASS): | |
881 BEGIN_OPCODE(CLASS): | |
882 stack.currentFrame->locals.data = stack.currentFrame->args.instr
uctionPtr + 1; /* Save for matching */ | |
883 stack.currentFrame->args.instructionPtr += 33;
/* Advance past the item */ | |
884 | |
885 switch (*stack.currentFrame->args.instructionPtr) { | |
886 case OP_CRSTAR: | |
887 case OP_CRMINSTAR: | |
888 case OP_CRPLUS: | |
889 case OP_CRMINPLUS: | |
890 case OP_CRQUERY: | |
891 case OP_CRMINQUERY: | |
892 repeatInformationFromInstructionOffset(*stack.currentFra
me->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals
.max); | |
893 break; | |
894 | |
895 case OP_CRRANGE: | |
896 case OP_CRMINRANGE: | |
897 minimize = (*stack.currentFrame->args.instructionPtr ==
OP_CRMINRANGE); | |
898 min = get2ByteValue(stack.currentFrame->args.instruction
Ptr + 1); | |
899 stack.currentFrame->locals.max = get2ByteValue(stack.cur
rentFrame->args.instructionPtr + 3); | |
900 if (stack.currentFrame->locals.max == 0) | |
901 stack.currentFrame->locals.max = INT_MAX; | |
902 stack.currentFrame->args.instructionPtr += 5; | |
903 break; | |
904 | |
905 default: /* No repeat follows */ | |
906 min = stack.currentFrame->locals.max = 1; | |
907 break; | |
908 } | |
909 | |
910 /* First, ensure the minimum number of matches are present. */ | |
911 | |
912 for (int i = 1; i <= min; i++) { | |
913 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
914 RRETURN_NO_MATCH; | |
915 int c = *stack.currentFrame->args.subjectPtr++; | |
916 if (c > 255) { | |
917 if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
918 RRETURN_NO_MATCH; | |
919 } else { | |
920 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c
& 7)))) | |
921 RRETURN_NO_MATCH; | |
922 } | |
923 } | |
924 | |
925 /* If max == min we can continue with the main loop without the | |
926 need to recurse. */ | |
927 | |
928 if (min == stack.currentFrame->locals.max) | |
929 NEXT_OPCODE; | |
930 | |
931 /* If minimizing, keep testing the rest of the expression and ad
vancing | |
932 the pointer while it matches the class. */ | |
933 if (minimize) { | |
934 for (stack.currentFrame->locals.fi = min;; stack.currentFram
e->locals.fi++) { | |
935 RECURSIVE_MATCH(22, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
936 if (isMatch) | |
937 RRETURN; | |
938 if (stack.currentFrame->locals.fi >= stack.currentFrame-
>locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
939 RRETURN; | |
940 int c = *stack.currentFrame->args.subjectPtr++; | |
941 if (c > 255) { | |
942 if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
943 RRETURN; | |
944 } else { | |
945 if ((stack.currentFrame->locals.data[c/8] & (1 << (c
&7))) == 0) | |
946 RRETURN; | |
947 } | |
948 } | |
949 /* Control never reaches here */ | |
950 } | |
951 /* If maximizing, find the longest possible run, then work backw
ards. */ | |
952 else { | |
953 stack.currentFrame->locals.subjectPtrAtStartOfInstruction =
stack.currentFrame->args.subjectPtr; | |
954 | |
955 for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
956 if (stack.currentFrame->args.subjectPtr >= md.endSubject
) | |
957 break; | |
958 int c = *stack.currentFrame->args.subjectPtr; | |
959 if (c > 255) { | |
960 if (stack.currentFrame->locals.data[-1] == OP_CLASS) | |
961 break; | |
962 } else { | |
963 if (!(stack.currentFrame->locals.data[c / 8] & (1 <<
(c & 7)))) | |
964 break; | |
965 } | |
966 ++stack.currentFrame->args.subjectPtr; | |
967 } | |
968 for (;;) { | |
969 RECURSIVE_MATCH(24, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
970 if (isMatch) | |
971 RRETURN; | |
972 if (stack.currentFrame->args.subjectPtr-- == stack.curre
ntFrame->locals.subjectPtrAtStartOfInstruction) | |
973 break; /* Stop if tried at original pos */ | |
974 } | |
975 | |
976 RRETURN; | |
977 } | |
978 /* Control never reaches here */ | |
979 | |
980 /* Match an extended character class. */ | |
981 | |
982 BEGIN_OPCODE(XCLASS): | |
983 stack.currentFrame->locals.data = stack.currentFrame->args.instr
uctionPtr + 1 + LINK_SIZE; /* Save for matching */ | |
984 stack.currentFrame->args.instructionPtr += getLinkValue(stack.cu
rrentFrame->args.instructionPtr + 1); /* Advance past the i
tem */ | |
985 | |
986 switch (*stack.currentFrame->args.instructionPtr) { | |
987 case OP_CRSTAR: | |
988 case OP_CRMINSTAR: | |
989 case OP_CRPLUS: | |
990 case OP_CRMINPLUS: | |
991 case OP_CRQUERY: | |
992 case OP_CRMINQUERY: | |
993 repeatInformationFromInstructionOffset(*stack.currentFra
me->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals
.max); | |
994 break; | |
995 | |
996 case OP_CRRANGE: | |
997 case OP_CRMINRANGE: | |
998 minimize = (*stack.currentFrame->args.instructionPtr ==
OP_CRMINRANGE); | |
999 min = get2ByteValue(stack.currentFrame->args.instruction
Ptr + 1); | |
1000 stack.currentFrame->locals.max = get2ByteValue(stack.cur
rentFrame->args.instructionPtr + 3); | |
1001 if (stack.currentFrame->locals.max == 0) | |
1002 stack.currentFrame->locals.max = INT_MAX; | |
1003 stack.currentFrame->args.instructionPtr += 5; | |
1004 break; | |
1005 | |
1006 default: /* No repeat follows */ | |
1007 min = stack.currentFrame->locals.max = 1; | |
1008 } | |
1009 | |
1010 /* First, ensure the minimum number of matches are present. */ | |
1011 | |
1012 for (int i = 1; i <= min; i++) { | |
1013 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1014 RRETURN_NO_MATCH; | |
1015 int c = *stack.currentFrame->args.subjectPtr++; | |
1016 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data)) | |
1017 RRETURN_NO_MATCH; | |
1018 } | |
1019 | |
1020 /* If max == min we can continue with the main loop without the | |
1021 need to recurse. */ | |
1022 | |
1023 if (min == stack.currentFrame->locals.max) | |
1024 NEXT_OPCODE; | |
1025 | |
1026 /* If minimizing, keep testing the rest of the expression and ad
vancing | |
1027 the pointer while it matches the class. */ | |
1028 | |
1029 if (minimize) { | |
1030 for (stack.currentFrame->locals.fi = min;; stack.currentFram
e->locals.fi++) { | |
1031 RECURSIVE_MATCH(26, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
1032 if (isMatch) | |
1033 RRETURN; | |
1034 if (stack.currentFrame->locals.fi >= stack.currentFrame-
>locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1035 RRETURN; | |
1036 int c = *stack.currentFrame->args.subjectPtr++; | |
1037 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data)
) | |
1038 RRETURN; | |
1039 } | |
1040 /* Control never reaches here */ | |
1041 } | |
1042 | |
1043 /* If maximizing, find the longest possible run, then work backw
ards. */ | |
1044 | |
1045 else { | |
1046 stack.currentFrame->locals.subjectPtrAtStartOfInstruction =
stack.currentFrame->args.subjectPtr; | |
1047 for (int i = min; i < stack.currentFrame->locals.max; i++) { | |
1048 if (stack.currentFrame->args.subjectPtr >= md.endSubject
) | |
1049 break; | |
1050 int c = *stack.currentFrame->args.subjectPtr; | |
1051 if (!kjs_pcre_xclass(c, stack.currentFrame->locals.data)
) | |
1052 break; | |
1053 ++stack.currentFrame->args.subjectPtr; | |
1054 } | |
1055 for(;;) { | |
1056 RECURSIVE_MATCH(27, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
1057 if (isMatch) | |
1058 RRETURN; | |
1059 if (stack.currentFrame->args.subjectPtr-- == stack.curre
ntFrame->locals.subjectPtrAtStartOfInstruction) | |
1060 break; /* Stop if tried at original pos */ | |
1061 } | |
1062 RRETURN; | |
1063 } | |
1064 | |
1065 /* Control never reaches here */ | |
1066 | |
1067 /* Match a single character, casefully */ | |
1068 | |
1069 BEGIN_OPCODE(CHAR): | |
1070 stack.currentFrame->locals.length = 1; | |
1071 stack.currentFrame->args.instructionPtr++; | |
1072 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, sta
ck.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1073 stack.currentFrame->args.instructionPtr += stack.currentFrame->l
ocals.length; | |
1074 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1075 RRETURN_NO_MATCH; | |
1076 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.s
ubjectPtr++) | |
1077 RRETURN_NO_MATCH; | |
1078 NEXT_OPCODE; | |
1079 | |
1080 /* Match a single character, caselessly */ | |
1081 | |
1082 BEGIN_OPCODE(CHAR_IGNORING_CASE): { | |
1083 stack.currentFrame->locals.length = 1; | |
1084 stack.currentFrame->args.instructionPtr++; | |
1085 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, sta
ck.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1086 stack.currentFrame->args.instructionPtr += stack.currentFrame->l
ocals.length; | |
1087 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1088 RRETURN_NO_MATCH; | |
1089 int dc = *stack.currentFrame->args.subjectPtr++; | |
1090 if (stack.currentFrame->locals.fc != dc && kjs_pcre_ucp_othercas
e(stack.currentFrame->locals.fc) != dc) | |
1091 RRETURN_NO_MATCH; | |
1092 NEXT_OPCODE; | |
1093 } | |
1094 | |
1095 /* Match a single ASCII character. */ | |
1096 | |
1097 BEGIN_OPCODE(ASCII_CHAR): | |
1098 if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1099 RRETURN_NO_MATCH; | |
1100 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->
args.instructionPtr[1]) | |
1101 RRETURN_NO_MATCH; | |
1102 ++stack.currentFrame->args.subjectPtr; | |
1103 stack.currentFrame->args.instructionPtr += 2; | |
1104 NEXT_OPCODE; | |
1105 | |
1106 /* Match one of two cases of an ASCII letter. */ | |
1107 | |
1108 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): | |
1109 if (md.endSubject == stack.currentFrame->args.subjectPtr) | |
1110 RRETURN_NO_MATCH; | |
1111 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.curre
ntFrame->args.instructionPtr[1]) | |
1112 RRETURN_NO_MATCH; | |
1113 ++stack.currentFrame->args.subjectPtr; | |
1114 stack.currentFrame->args.instructionPtr += 2; | |
1115 NEXT_OPCODE; | |
1116 | |
1117 /* Match a single character repeatedly; different opcodes share code
. */ | |
1118 | |
1119 BEGIN_OPCODE(EXACT): | |
1120 min = stack.currentFrame->locals.max = get2ByteValue(stack.curre
ntFrame->args.instructionPtr + 1); | |
1121 minimize = false; | |
1122 stack.currentFrame->args.instructionPtr += 3; | |
1123 goto REPEATCHAR; | |
1124 | |
1125 BEGIN_OPCODE(UPTO): | |
1126 BEGIN_OPCODE(MINUPTO): | |
1127 min = 0; | |
1128 stack.currentFrame->locals.max = get2ByteValue(stack.currentFram
e->args.instructionPtr + 1); | |
1129 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPT
O; | |
1130 stack.currentFrame->args.instructionPtr += 3; | |
1131 goto REPEATCHAR; | |
1132 | |
1133 BEGIN_OPCODE(STAR): | |
1134 BEGIN_OPCODE(MINSTAR): | |
1135 BEGIN_OPCODE(PLUS): | |
1136 BEGIN_OPCODE(MINPLUS): | |
1137 BEGIN_OPCODE(QUERY): | |
1138 BEGIN_OPCODE(MINQUERY): | |
1139 repeatInformationFromInstructionOffset(*stack.currentFrame->args
.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max); | |
1140 | |
1141 /* Common code for all repeated single-character matches. We can
give | |
1142 up quickly if there are fewer than the minimum number of charac
ters left in | |
1143 the subject. */ | |
1144 | |
1145 REPEATCHAR: | |
1146 | |
1147 stack.currentFrame->locals.length = 1; | |
1148 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, sta
ck.currentFrame->args.instructionPtr, stack.currentFrame->locals.length); | |
1149 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.
endSubject - stack.currentFrame->args.subjectPtr) | |
1150 RRETURN_NO_MATCH; | |
1151 stack.currentFrame->args.instructionPtr += stack.currentFrame->l
ocals.length; | |
1152 | |
1153 if (stack.currentFrame->locals.fc <= 0xFFFF) { | |
1154 int othercase; | |
1155 othercase = md.ignoreCase ? kjs_pcre_ucp_othercase(stack.cur
rentFrame->locals.fc) : -1; | |
1156 | |
1157 for (int i = 1; i <= min; i++) { | |
1158 if (*stack.currentFrame->args.subjectPtr != stack.curren
tFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1159 RRETURN_NO_MATCH; | |
1160 ++stack.currentFrame->args.subjectPtr; | |
1161 } | |
1162 | |
1163 if (min == stack.currentFrame->locals.max) | |
1164 NEXT_OPCODE; | |
1165 | |
1166 if (minimize) { | |
1167 stack.currentFrame->locals.repeatOthercase = othercase; | |
1168 for (stack.currentFrame->locals.fi = min;; stack.current
Frame->locals.fi++) { | |
1169 RECURSIVE_MATCH(28, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1170 if (isMatch) | |
1171 RRETURN; | |
1172 if (stack.currentFrame->locals.fi >= stack.currentFr
ame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1173 RRETURN; | |
1174 if (*stack.currentFrame->args.subjectPtr != stack.cu
rrentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFr
ame->locals.repeatOthercase) | |
1175 RRETURN; | |
1176 ++stack.currentFrame->args.subjectPtr; | |
1177 } | |
1178 /* Control never reaches here */ | |
1179 } else { | |
1180 stack.currentFrame->locals.subjectPtrAtStartOfInstructio
n = stack.currentFrame->args.subjectPtr; | |
1181 for (int i = min; i < stack.currentFrame->locals.max; i+
+) { | |
1182 if (stack.currentFrame->args.subjectPtr >= md.endSub
ject) | |
1183 break; | |
1184 if (*stack.currentFrame->args.subjectPtr != stack.cu
rrentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) | |
1185 break; | |
1186 ++stack.currentFrame->args.subjectPtr; | |
1187 } | |
1188 while (stack.currentFrame->args.subjectPtr >= stack.curr
entFrame->locals.subjectPtrAtStartOfInstruction) { | |
1189 RECURSIVE_MATCH(29, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1190 if (isMatch) | |
1191 RRETURN; | |
1192 --stack.currentFrame->args.subjectPtr; | |
1193 } | |
1194 RRETURN_NO_MATCH; | |
1195 } | |
1196 /* Control never reaches here */ | |
1197 } else { | |
1198 /* No case on surrogate pairs, so no need to bother with "ot
hercase". */ | |
1199 | |
1200 for (int i = 1; i <= min; i++) { | |
1201 if (*stack.currentFrame->args.subjectPtr != stack.curren
tFrame->locals.fc) | |
1202 RRETURN_NO_MATCH; | |
1203 stack.currentFrame->args.subjectPtr += 2; | |
1204 } | |
1205 | |
1206 if (min == stack.currentFrame->locals.max) | |
1207 NEXT_OPCODE; | |
1208 | |
1209 if (minimize) { | |
1210 for (stack.currentFrame->locals.fi = min;; stack.current
Frame->locals.fi++) { | |
1211 RECURSIVE_MATCH(30, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1212 if (isMatch) | |
1213 RRETURN; | |
1214 if (stack.currentFrame->locals.fi >= stack.currentFr
ame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1215 RRETURN; | |
1216 if (*stack.currentFrame->args.subjectPtr != stack.cu
rrentFrame->locals.fc) | |
1217 RRETURN; | |
1218 stack.currentFrame->args.subjectPtr += 2; | |
1219 } | |
1220 /* Control never reaches here */ | |
1221 } else { | |
1222 stack.currentFrame->locals.subjectPtrAtStartOfInstructio
n = stack.currentFrame->args.subjectPtr; | |
1223 for (int i = min; i < stack.currentFrame->locals.max; i+
+) { | |
1224 if (stack.currentFrame->args.subjectPtr > md.endSubj
ect - 2) | |
1225 break; | |
1226 if (*stack.currentFrame->args.subjectPtr != stack.cu
rrentFrame->locals.fc) | |
1227 break; | |
1228 stack.currentFrame->args.subjectPtr += 2; | |
1229 } | |
1230 while (stack.currentFrame->args.subjectPtr >= stack.curr
entFrame->locals.subjectPtrAtStartOfInstruction) { | |
1231 RECURSIVE_MATCH(31, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1232 if (isMatch) | |
1233 RRETURN; | |
1234 stack.currentFrame->args.subjectPtr -= 2; | |
1235 } | |
1236 RRETURN_NO_MATCH; | |
1237 } | |
1238 /* Control never reaches here */ | |
1239 } | |
1240 /* Control never reaches here */ | |
1241 | |
1242 /* Match a negated single one-byte character. */ | |
1243 | |
1244 BEGIN_OPCODE(NOT): { | |
1245 if (stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1246 RRETURN_NO_MATCH; | |
1247 stack.currentFrame->args.instructionPtr++; | |
1248 int c = *stack.currentFrame->args.subjectPtr++; | |
1249 if (md.ignoreCase) { | |
1250 if (c < 128) | |
1251 c = toLowerCase(c); | |
1252 if (toLowerCase(*stack.currentFrame->args.instructionPtr++)
== c) | |
1253 RRETURN_NO_MATCH; | |
1254 } else { | |
1255 if (*stack.currentFrame->args.instructionPtr++ == c) | |
1256 RRETURN_NO_MATCH; | |
1257 } | |
1258 NEXT_OPCODE; | |
1259 } | |
1260 | |
1261 /* Match a negated single one-byte character repeatedly. This is alm
ost a | |
1262 repeat of the code for a repeated single character, but I haven't f
ound a | |
1263 nice way of commoning these up that doesn't require a test of the | |
1264 positive/negative option for each character match. Maybe that would
n't add | |
1265 very much to the time taken, but character matching *is* what this
is all | |
1266 about... */ | |
1267 | |
1268 BEGIN_OPCODE(NOTEXACT): | |
1269 min = stack.currentFrame->locals.max = get2ByteValue(stack.curre
ntFrame->args.instructionPtr + 1); | |
1270 minimize = false; | |
1271 stack.currentFrame->args.instructionPtr += 3; | |
1272 goto REPEATNOTCHAR; | |
1273 | |
1274 BEGIN_OPCODE(NOTUPTO): | |
1275 BEGIN_OPCODE(NOTMINUPTO): | |
1276 min = 0; | |
1277 stack.currentFrame->locals.max = get2ByteValue(stack.currentFram
e->args.instructionPtr + 1); | |
1278 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMIN
UPTO; | |
1279 stack.currentFrame->args.instructionPtr += 3; | |
1280 goto REPEATNOTCHAR; | |
1281 | |
1282 BEGIN_OPCODE(NOTSTAR): | |
1283 BEGIN_OPCODE(NOTMINSTAR): | |
1284 BEGIN_OPCODE(NOTPLUS): | |
1285 BEGIN_OPCODE(NOTMINPLUS): | |
1286 BEGIN_OPCODE(NOTQUERY): | |
1287 BEGIN_OPCODE(NOTMINQUERY): | |
1288 repeatInformationFromInstructionOffset(*stack.currentFrame->args
.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max); | |
1289 | |
1290 /* Common code for all repeated single-byte matches. We can give up
quickly | |
1291 if there are fewer than the minimum number of bytes left in the | |
1292 subject. */ | |
1293 | |
1294 REPEATNOTCHAR: | |
1295 if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1296 RRETURN_NO_MATCH; | |
1297 stack.currentFrame->locals.fc = *stack.currentFrame->args.instru
ctionPtr++; | |
1298 | |
1299 /* The code is duplicated for the caseless and caseful cases, fo
r speed, | |
1300 since matching characters is likely to be quite common. First,
ensure the | |
1301 minimum number of matches are present. If min = max, continue a
t the same | |
1302 level without recursing. Otherwise, if minimizing, keep trying
the rest of | |
1303 the expression and advancing one matching character if failing,
up to the | |
1304 maximum. Alternatively, if maximizing, find the maximum number
of | |
1305 characters and work backwards. */ | |
1306 | |
1307 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->lo
cals.fc, min, stack.currentFrame->locals.max)); | |
1308 | |
1309 if (md.ignoreCase) { | |
1310 if (stack.currentFrame->locals.fc < 128) | |
1311 stack.currentFrame->locals.fc = toLowerCase(stack.curren
tFrame->locals.fc); | |
1312 | |
1313 for (int i = 1; i <= min; i++) { | |
1314 int d = *stack.currentFrame->args.subjectPtr++; | |
1315 if (d < 128) | |
1316 d = toLowerCase(d); | |
1317 if (stack.currentFrame->locals.fc == d) | |
1318 RRETURN_NO_MATCH; | |
1319 } | |
1320 | |
1321 if (min == stack.currentFrame->locals.max) | |
1322 NEXT_OPCODE; | |
1323 | |
1324 if (minimize) { | |
1325 for (stack.currentFrame->locals.fi = min;; stack.current
Frame->locals.fi++) { | |
1326 RECURSIVE_MATCH(38, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1327 if (isMatch) | |
1328 RRETURN; | |
1329 int d = *stack.currentFrame->args.subjectPtr++; | |
1330 if (d < 128) | |
1331 d = toLowerCase(d); | |
1332 if (stack.currentFrame->locals.fi >= stack.currentFr
ame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack
.currentFrame->locals.fc == d) | |
1333 RRETURN; | |
1334 } | |
1335 /* Control never reaches here */ | |
1336 } | |
1337 | |
1338 /* Maximize case */ | |
1339 | |
1340 else { | |
1341 stack.currentFrame->locals.subjectPtrAtStartOfInstructio
n = stack.currentFrame->args.subjectPtr; | |
1342 | |
1343 for (int i = min; i < stack.currentFrame->locals.max; i+
+) { | |
1344 if (stack.currentFrame->args.subjectPtr >= md.endSub
ject) | |
1345 break; | |
1346 int d = *stack.currentFrame->args.subjectPtr; | |
1347 if (d < 128) | |
1348 d = toLowerCase(d); | |
1349 if (stack.currentFrame->locals.fc == d) | |
1350 break; | |
1351 ++stack.currentFrame->args.subjectPtr; | |
1352 } | |
1353 for (;;) { | |
1354 RECURSIVE_MATCH(40, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1355 if (isMatch) | |
1356 RRETURN; | |
1357 if (stack.currentFrame->args.subjectPtr-- == stack.c
urrentFrame->locals.subjectPtrAtStartOfInstruction) | |
1358 break; /* Stop if tried at original pos *
/ | |
1359 } | |
1360 | |
1361 RRETURN; | |
1362 } | |
1363 /* Control never reaches here */ | |
1364 } | |
1365 | |
1366 /* Caseful comparisons */ | |
1367 | |
1368 else { | |
1369 for (int i = 1; i <= min; i++) { | |
1370 int d = *stack.currentFrame->args.subjectPtr++; | |
1371 if (stack.currentFrame->locals.fc == d) | |
1372 RRETURN_NO_MATCH; | |
1373 } | |
1374 | |
1375 if (min == stack.currentFrame->locals.max) | |
1376 NEXT_OPCODE; | |
1377 | |
1378 if (minimize) { | |
1379 for (stack.currentFrame->locals.fi = min;; stack.current
Frame->locals.fi++) { | |
1380 RECURSIVE_MATCH(42, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1381 if (isMatch) | |
1382 RRETURN; | |
1383 int d = *stack.currentFrame->args.subjectPtr++; | |
1384 if (stack.currentFrame->locals.fi >= stack.currentFr
ame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack
.currentFrame->locals.fc == d) | |
1385 RRETURN; | |
1386 } | |
1387 /* Control never reaches here */ | |
1388 } | |
1389 | |
1390 /* Maximize case */ | |
1391 | |
1392 else { | |
1393 stack.currentFrame->locals.subjectPtrAtStartOfInstructio
n = stack.currentFrame->args.subjectPtr; | |
1394 | |
1395 for (int i = min; i < stack.currentFrame->locals.max; i+
+) { | |
1396 if (stack.currentFrame->args.subjectPtr >= md.endSub
ject) | |
1397 break; | |
1398 int d = *stack.currentFrame->args.subjectPtr; | |
1399 if (stack.currentFrame->locals.fc == d) | |
1400 break; | |
1401 ++stack.currentFrame->args.subjectPtr; | |
1402 } | |
1403 for (;;) { | |
1404 RECURSIVE_MATCH(44, stack.currentFrame->args.instruc
tionPtr, stack.currentFrame->args.bracketChain); | |
1405 if (isMatch) | |
1406 RRETURN; | |
1407 if (stack.currentFrame->args.subjectPtr-- == stack.c
urrentFrame->locals.subjectPtrAtStartOfInstruction) | |
1408 break; /* Stop if tried at original pos *
/ | |
1409 } | |
1410 | |
1411 RRETURN; | |
1412 } | |
1413 } | |
1414 /* Control never reaches here */ | |
1415 | |
1416 /* Match a single character type repeatedly; several different opcod
es | |
1417 share code. This is very similar to the code for single characters,
but we | |
1418 repeat it in the interests of efficiency. */ | |
1419 | |
1420 BEGIN_OPCODE(TYPEEXACT): | |
1421 min = stack.currentFrame->locals.max = get2ByteValue(stack.curre
ntFrame->args.instructionPtr + 1); | |
1422 minimize = true; | |
1423 stack.currentFrame->args.instructionPtr += 3; | |
1424 goto REPEATTYPE; | |
1425 | |
1426 BEGIN_OPCODE(TYPEUPTO): | |
1427 BEGIN_OPCODE(TYPEMINUPTO): | |
1428 min = 0; | |
1429 stack.currentFrame->locals.max = get2ByteValue(stack.currentFram
e->args.instructionPtr + 1); | |
1430 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMI
NUPTO; | |
1431 stack.currentFrame->args.instructionPtr += 3; | |
1432 goto REPEATTYPE; | |
1433 | |
1434 BEGIN_OPCODE(TYPESTAR): | |
1435 BEGIN_OPCODE(TYPEMINSTAR): | |
1436 BEGIN_OPCODE(TYPEPLUS): | |
1437 BEGIN_OPCODE(TYPEMINPLUS): | |
1438 BEGIN_OPCODE(TYPEQUERY): | |
1439 BEGIN_OPCODE(TYPEMINQUERY): | |
1440 repeatInformationFromInstructionOffset(*stack.currentFrame->args
.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max); | |
1441 | |
1442 /* Common code for all repeated single character type matches. N
ote that | |
1443 in UTF-8 mode, '.' matches a character of any length, but for t
he other | |
1444 character types, the valid characters are all one-byte long. */ | |
1445 | |
1446 REPEATTYPE: | |
1447 stack.currentFrame->locals.ctype = *stack.currentFrame->args.ins
tructionPtr++; /* Code for the character type */ | |
1448 | |
1449 /* First, ensure the minimum number of matches are present. Use
inline | |
1450 code for maximizing the speed, and do the type test once at the
start | |
1451 (i.e. keep it out of the loop). Also we can test that there are
at least | |
1452 the minimum number of characters before we start. */ | |
1453 | |
1454 if (min > md.endSubject - stack.currentFrame->args.subjectPtr) | |
1455 RRETURN_NO_MATCH; | |
1456 if (min > 0) { | |
1457 switch (stack.currentFrame->locals.ctype) { | |
1458 case OP_NOT_NEWLINE: | |
1459 for (int i = 1; i <= min; i++) { | |
1460 if (isNewline(*stack.currentFrame->args.subjectP
tr)) | |
1461 RRETURN_NO_MATCH; | |
1462 ++stack.currentFrame->args.subjectPtr; | |
1463 } | |
1464 break; | |
1465 | |
1466 case OP_NOT_DIGIT: | |
1467 for (int i = 1; i <= min; i++) { | |
1468 if (isASCIIDigit(*stack.currentFrame->args.subje
ctPtr)) | |
1469 RRETURN_NO_MATCH; | |
1470 ++stack.currentFrame->args.subjectPtr; | |
1471 } | |
1472 break; | |
1473 | |
1474 case OP_DIGIT: | |
1475 for (int i = 1; i <= min; i++) { | |
1476 if (!isASCIIDigit(*stack.currentFrame->args.subj
ectPtr)) | |
1477 RRETURN_NO_MATCH; | |
1478 ++stack.currentFrame->args.subjectPtr; | |
1479 } | |
1480 break; | |
1481 | |
1482 case OP_NOT_WHITESPACE: | |
1483 for (int i = 1; i <= min; i++) { | |
1484 if (isSpaceChar(*stack.currentFrame->args.subjec
tPtr)) | |
1485 RRETURN_NO_MATCH; | |
1486 ++stack.currentFrame->args.subjectPtr; | |
1487 } | |
1488 break; | |
1489 | |
1490 case OP_WHITESPACE: | |
1491 for (int i = 1; i <= min; i++) { | |
1492 if (!isSpaceChar(*stack.currentFrame->args.subje
ctPtr)) | |
1493 RRETURN_NO_MATCH; | |
1494 ++stack.currentFrame->args.subjectPtr; | |
1495 } | |
1496 break; | |
1497 | |
1498 case OP_NOT_WORDCHAR: | |
1499 for (int i = 1; i <= min; i++) { | |
1500 if (isWordChar(*stack.currentFrame->args.subject
Ptr)) | |
1501 RRETURN_NO_MATCH; | |
1502 ++stack.currentFrame->args.subjectPtr; | |
1503 } | |
1504 break; | |
1505 | |
1506 case OP_WORDCHAR: | |
1507 for (int i = 1; i <= min; i++) { | |
1508 if (!isWordChar(*stack.currentFrame->args.subjec
tPtr)) | |
1509 RRETURN_NO_MATCH; | |
1510 ++stack.currentFrame->args.subjectPtr; | |
1511 } | |
1512 break; | |
1513 | |
1514 default: | |
1515 ASSERT_NOT_REACHED(); | |
1516 return matchError(JSRegExpErrorInternal, stack); | |
1517 } /* End switch(stack.currentFrame->locals.ctype) */ | |
1518 } | |
1519 | |
1520 /* If min = max, continue at the same level without recursing */ | |
1521 | |
1522 if (min == stack.currentFrame->locals.max) | |
1523 NEXT_OPCODE; | |
1524 | |
1525 /* If minimizing, we have to test the rest of the pattern before
each | |
1526 subsequent match. */ | |
1527 | |
1528 if (minimize) { | |
1529 for (stack.currentFrame->locals.fi = min;; stack.currentFram
e->locals.fi++) { | |
1530 RECURSIVE_MATCH(48, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
1531 if (isMatch) | |
1532 RRETURN; | |
1533 if (stack.currentFrame->locals.fi >= stack.currentFrame-
>locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) | |
1534 RRETURN; | |
1535 | |
1536 int c = *stack.currentFrame->args.subjectPtr++; | |
1537 switch (stack.currentFrame->locals.ctype) { | |
1538 case OP_NOT_NEWLINE: | |
1539 if (isNewline(c)) | |
1540 RRETURN; | |
1541 break; | |
1542 | |
1543 case OP_NOT_DIGIT: | |
1544 if (isASCIIDigit(c)) | |
1545 RRETURN; | |
1546 break; | |
1547 | |
1548 case OP_DIGIT: | |
1549 if (!isASCIIDigit(c)) | |
1550 RRETURN; | |
1551 break; | |
1552 | |
1553 case OP_NOT_WHITESPACE: | |
1554 if (isSpaceChar(c)) | |
1555 RRETURN; | |
1556 break; | |
1557 | |
1558 case OP_WHITESPACE: | |
1559 if (!isSpaceChar(c)) | |
1560 RRETURN; | |
1561 break; | |
1562 | |
1563 case OP_NOT_WORDCHAR: | |
1564 if (isWordChar(c)) | |
1565 RRETURN; | |
1566 break; | |
1567 | |
1568 case OP_WORDCHAR: | |
1569 if (!isWordChar(c)) | |
1570 RRETURN; | |
1571 break; | |
1572 | |
1573 default: | |
1574 ASSERT_NOT_REACHED(); | |
1575 return matchError(JSRegExpErrorInternal, stack); | |
1576 } | |
1577 } | |
1578 /* Control never reaches here */ | |
1579 } | |
1580 | |
1581 /* If maximizing it is worth using inline code for speed, doing
the type | |
1582 test once at the start (i.e. keep it out of the loop). */ | |
1583 | |
1584 else { | |
1585 stack.currentFrame->locals.subjectPtrAtStartOfInstruction =
stack.currentFrame->args.subjectPtr; /* Remember where we started */ | |
1586 | |
1587 switch (stack.currentFrame->locals.ctype) { | |
1588 case OP_NOT_NEWLINE: | |
1589 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1590 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject || isNewline(*stack.currentFrame->args.subjectPtr)) | |
1591 break; | |
1592 stack.currentFrame->args.subjectPtr++; | |
1593 } | |
1594 break; | |
1595 | |
1596 case OP_NOT_DIGIT: | |
1597 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1598 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1599 break; | |
1600 int c = *stack.currentFrame->args.subjectPtr; | |
1601 if (isASCIIDigit(c)) | |
1602 break; | |
1603 ++stack.currentFrame->args.subjectPtr; | |
1604 } | |
1605 break; | |
1606 | |
1607 case OP_DIGIT: | |
1608 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1609 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1610 break; | |
1611 int c = *stack.currentFrame->args.subjectPtr; | |
1612 if (!isASCIIDigit(c)) | |
1613 break; | |
1614 ++stack.currentFrame->args.subjectPtr; | |
1615 } | |
1616 break; | |
1617 | |
1618 case OP_NOT_WHITESPACE: | |
1619 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1620 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1621 break; | |
1622 int c = *stack.currentFrame->args.subjectPtr; | |
1623 if (isSpaceChar(c)) | |
1624 break; | |
1625 ++stack.currentFrame->args.subjectPtr; | |
1626 } | |
1627 break; | |
1628 | |
1629 case OP_WHITESPACE: | |
1630 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1631 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1632 break; | |
1633 int c = *stack.currentFrame->args.subjectPtr; | |
1634 if (!isSpaceChar(c)) | |
1635 break; | |
1636 ++stack.currentFrame->args.subjectPtr; | |
1637 } | |
1638 break; | |
1639 | |
1640 case OP_NOT_WORDCHAR: | |
1641 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1642 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1643 break; | |
1644 int c = *stack.currentFrame->args.subjectPtr; | |
1645 if (isWordChar(c)) | |
1646 break; | |
1647 ++stack.currentFrame->args.subjectPtr; | |
1648 } | |
1649 break; | |
1650 | |
1651 case OP_WORDCHAR: | |
1652 for (int i = min; i < stack.currentFrame->locals.max
; i++) { | |
1653 if (stack.currentFrame->args.subjectPtr >= md.en
dSubject) | |
1654 break; | |
1655 int c = *stack.currentFrame->args.subjectPtr; | |
1656 if (!isWordChar(c)) | |
1657 break; | |
1658 ++stack.currentFrame->args.subjectPtr; | |
1659 } | |
1660 break; | |
1661 | |
1662 default: | |
1663 ASSERT_NOT_REACHED(); | |
1664 return matchError(JSRegExpErrorInternal, stack); | |
1665 } | |
1666 | |
1667 /* stack.currentFrame->args.subjectPtr is now past the end o
f the maximum run */ | |
1668 | |
1669 for (;;) { | |
1670 RECURSIVE_MATCH(52, stack.currentFrame->args.instruction
Ptr, stack.currentFrame->args.bracketChain); | |
1671 if (isMatch) | |
1672 RRETURN; | |
1673 if (stack.currentFrame->args.subjectPtr-- == stack.curre
ntFrame->locals.subjectPtrAtStartOfInstruction) | |
1674 break; /* Stop if tried at original pos */ | |
1675 } | |
1676 | |
1677 /* Get here if we can't make it match with any permitted rep
etitions */ | |
1678 | |
1679 RRETURN; | |
1680 } | |
1681 /* Control never reaches here */ | |
1682 | |
1683 BEGIN_OPCODE(CRMINPLUS): | |
1684 BEGIN_OPCODE(CRMINQUERY): | |
1685 BEGIN_OPCODE(CRMINRANGE): | |
1686 BEGIN_OPCODE(CRMINSTAR): | |
1687 BEGIN_OPCODE(CRPLUS): | |
1688 BEGIN_OPCODE(CRQUERY): | |
1689 BEGIN_OPCODE(CRRANGE): | |
1690 BEGIN_OPCODE(CRSTAR): | |
1691 ASSERT_NOT_REACHED(); | |
1692 return matchError(JSRegExpErrorInternal, stack); | |
1693 | |
1694 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP | |
1695 CAPTURING_BRACKET: | |
1696 #else | |
1697 default: | |
1698 #endif | |
1699 /* Opening capturing bracket. If there is space in the offset ve
ctor, save | |
1700 the current subject position in the working slot at the top of
the vector. We | |
1701 mustn't change the current values of the data slot, because the
y may be set | |
1702 from a previous iteration of this group, and be referred to by
a reference | |
1703 inside the group. | |
1704 | |
1705 If the bracket fails to match, we need to restore this value an
d also the | |
1706 values of the final offsets, in case they were set by a previou
s iteration of | |
1707 the same bracket. | |
1708 | |
1709 If there isn't enough space in the offset vector, treat this as
if it were a | |
1710 non-capturing bracket. Don't worry about setting the flag for t
he error case | |
1711 here; that is handled in the code for KET. */ | |
1712 | |
1713 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); | |
1714 | |
1715 stack.currentFrame->locals.number = *stack.currentFrame->args.in
structionPtr - OP_BRA; | |
1716 | |
1717 /* For extended extraction brackets (large number), we have to f
ish out the | |
1718 number from a dummy opcode at the start. */ | |
1719 | |
1720 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) | |
1721 stack.currentFrame->locals.number = get2ByteValue(stack.curr
entFrame->args.instructionPtr + 2 + LINK_SIZE); | |
1722 stack.currentFrame->locals.offset = stack.currentFrame->locals.n
umber << 1; | |
1723 | |
1724 #ifdef DEBUG | |
1725 printf("start bracket %d subject=", stack.currentFrame->locals.n
umber); | |
1726 pchars(stack.currentFrame->args.subjectPtr, 16, true, md); | |
1727 printf("\n"); | |
1728 #endif | |
1729 | |
1730 if (stack.currentFrame->locals.offset < md.offsetMax) { | |
1731 stack.currentFrame->locals.saveOffset1 = md.offsetVector[sta
ck.currentFrame->locals.offset]; | |
1732 stack.currentFrame->locals.saveOffset2 = md.offsetVector[sta
ck.currentFrame->locals.offset + 1]; | |
1733 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.
offsetEnd - stack.currentFrame->locals.number]; | |
1734 | |
1735 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.sav
eOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.sav
eOffset3)); | |
1736 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.nu
mber] = stack.currentFrame->args.subjectPtr - md.startSubject; | |
1737 | |
1738 do { | |
1739 RECURSIVE_MATCH_STARTNG_NEW_GROUP(1, stack.currentFrame-
>args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); | |
1740 if (isMatch) | |
1741 RRETURN; | |
1742 stack.currentFrame->args.instructionPtr += getLinkValue(
stack.currentFrame->args.instructionPtr + 1); | |
1743 } while (*stack.currentFrame->args.instructionPtr == OP_ALT)
; | |
1744 | |
1745 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.n
umber)); | |
1746 | |
1747 md.offsetVector[stack.currentFrame->locals.offset] = stack.c
urrentFrame->locals.saveOffset1; | |
1748 md.offsetVector[stack.currentFrame->locals.offset + 1] = sta
ck.currentFrame->locals.saveOffset2; | |
1749 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.nu
mber] = stack.currentFrame->locals.saveOffset3; | |
1750 | |
1751 RRETURN; | |
1752 } | |
1753 | |
1754 /* Insufficient room for saving captured contents */ | |
1755 | |
1756 goto NON_CAPTURING_BRACKET; | |
1757 } | |
1758 | |
1759 /* Do not stick any code in here without much thought; it is assumed | |
1760 that "continue" in the code above comes out to here to repeat the main | |
1761 loop. */ | |
1762 | |
1763 } /* End of main loop */ | |
1764 | |
1765 ASSERT_NOT_REACHED(); | |
1766 | |
1767 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION | |
1768 | |
1769 RRETURN_SWITCH: | |
1770 switch (stack.currentFrame->returnLocation) { | |
1771 case 0: goto RETURN; | |
1772 case 1: goto RRETURN_1; | |
1773 case 2: goto RRETURN_2; | |
1774 case 6: goto RRETURN_6; | |
1775 case 7: goto RRETURN_7; | |
1776 case 14: goto RRETURN_14; | |
1777 case 15: goto RRETURN_15; | |
1778 case 16: goto RRETURN_16; | |
1779 case 17: goto RRETURN_17; | |
1780 case 18: goto RRETURN_18; | |
1781 case 19: goto RRETURN_19; | |
1782 case 20: goto RRETURN_20; | |
1783 case 21: goto RRETURN_21; | |
1784 case 22: goto RRETURN_22; | |
1785 case 24: goto RRETURN_24; | |
1786 case 26: goto RRETURN_26; | |
1787 case 27: goto RRETURN_27; | |
1788 case 28: goto RRETURN_28; | |
1789 case 29: goto RRETURN_29; | |
1790 case 30: goto RRETURN_30; | |
1791 case 31: goto RRETURN_31; | |
1792 case 38: goto RRETURN_38; | |
1793 case 40: goto RRETURN_40; | |
1794 case 42: goto RRETURN_42; | |
1795 case 44: goto RRETURN_44; | |
1796 case 48: goto RRETURN_48; | |
1797 case 52: goto RRETURN_52; | |
1798 } | |
1799 | |
1800 ASSERT_NOT_REACHED(); | |
1801 return matchError(JSRegExpErrorInternal, stack); | |
1802 | |
1803 #endif | |
1804 | |
1805 RETURN: | |
1806 return isMatch; | |
1807 } | |
1808 | |
1809 | |
1810 /************************************************* | |
1811 * Execute a Regular Expression * | |
1812 *************************************************/ | |
1813 | |
1814 /* This function applies a compiled re to a subject string and picks out | |
1815 portions of the string if it matches. Two elements in the vector are set for | |
1816 each substring: the offsets to the start and end of the substring. | |
1817 | |
1818 Arguments: | |
1819 re points to the compiled expression | |
1820 extra_data points to extra data or is NULL | |
1821 subject points to the subject string | |
1822 length length of subject string (may contain binary zeros) | |
1823 start_offset where to start in the subject string | |
1824 options option bits | |
1825 offsets points to a vector of ints to be filled in with offsets | |
1826 offsetcount the number of elements in the vector | |
1827 | |
1828 Returns: > 0 => success; value is the number of elements filled in | |
1829 = 0 => success, but offsets is not big enough | |
1830 -1 => failed to match | |
1831 < -1 => some kind of unexpected problem | |
1832 */ | |
1833 | |
1834 static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endS
ubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOpti
mization, const UChar* originalSubjectStart) | |
1835 { | |
1836 // If first_byte is set, try scanning to the first instance of that byte | |
1837 // no need to try and match against any earlier part of the subject string. | |
1838 if (first_byte >= 0) { | |
1839 UChar first_char = first_byte; | |
1840 if (first_byte_caseless) | |
1841 while (subjectPtr < endSubject) { | |
1842 int c = *subjectPtr; | |
1843 if (c > 127) | |
1844 break; | |
1845 if (toLowerCase(c) == first_char) | |
1846 break; | |
1847 subjectPtr++; | |
1848 } | |
1849 else { | |
1850 while (subjectPtr < endSubject && *subjectPtr != first_char) | |
1851 subjectPtr++; | |
1852 } | |
1853 } else if (useMultiLineFirstCharOptimization) { | |
1854 /* Or to just after \n for a multiline match if possible */ | |
1855 // I'm not sure why this != originalSubjectStart check is necessary -- e
cs 11/18/07 | |
1856 if (subjectPtr > originalSubjectStart) { | |
1857 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) | |
1858 subjectPtr++; | |
1859 } | |
1860 } | |
1861 } | |
1862 | |
1863 static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* e
ndSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByt
e, const UChar*& reqBytePtr) | |
1864 { | |
1865 /* If req_byte is set, we know that that character must appear in the subjec
t | |
1866 for the match to succeed. If the first character is set, req_byte must be | |
1867 later in the subject; otherwise the test starts at the match point. This | |
1868 optimization can save a huge amount of backtracking in patterns with nested | |
1869 unlimited repeats that aren't going to match. Writing separate code for | |
1870 cased/caseless versions makes it go faster, as does using an autoincrement | |
1871 and backing off on a match. | |
1872 | |
1873 HOWEVER: when the subject string is very, very long, searching to its end c
an | |
1874 take a long time, and give bad performance on quite ordinary patterns. This | |
1875 showed up when somebody was matching /^C/ on a 32-megabyte string... so we | |
1876 don't do this when the string is sufficiently long. | |
1877 */ | |
1878 | |
1879 if (req_byte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { | |
1880 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); | |
1881 | |
1882 /* We don't need to repeat the search if we haven't yet reached the | |
1883 place we found it at last time. */ | |
1884 | |
1885 if (p > reqBytePtr) { | |
1886 if (req_byte_caseless) { | |
1887 while (p < endSubject) { | |
1888 int pp = *p++; | |
1889 if (pp == req_byte || pp == req_byte2) { | |
1890 p--; | |
1891 break; | |
1892 } | |
1893 } | |
1894 } else { | |
1895 while (p < endSubject) { | |
1896 if (*p++ == req_byte) { | |
1897 p--; | |
1898 break; | |
1899 } | |
1900 } | |
1901 } | |
1902 | |
1903 /* If we can't find the required character, break the matching loop
*/ | |
1904 | |
1905 if (p >= endSubject) | |
1906 return true; | |
1907 | |
1908 /* If we have found the required character, save the point where we | |
1909 found it, so that we don't search again next time round the loop if | |
1910 the start hasn't passed this character yet. */ | |
1911 | |
1912 reqBytePtr = p; | |
1913 } | |
1914 } | |
1915 return false; | |
1916 } | |
1917 | |
1918 int jsRegExpExecute(const JSRegExp* re, | |
1919 const UChar* subject, int length, int start_offset, int* off
sets, | |
1920 int offsetcount) | |
1921 { | |
1922 ASSERT(re); | |
1923 ASSERT(subject); | |
1924 ASSERT(offsetcount >= 0); | |
1925 ASSERT(offsets || offsetcount == 0); | |
1926 | |
1927 MatchData matchBlock; | |
1928 matchBlock.startSubject = subject; | |
1929 matchBlock.endSubject = matchBlock.startSubject + length; | |
1930 const UChar* endSubject = matchBlock.endSubject; | |
1931 | |
1932 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); | |
1933 matchBlock.ignoreCase = (re->options & IgnoreCaseOption); | |
1934 | |
1935 /* If the expression has got more back references than the offsets supplied
can | |
1936 hold, we get a temporary chunk of working store to use during the matching. | |
1937 Otherwise, we can use the vector supplied, rounding down its size to a mult
iple | |
1938 of 3. */ | |
1939 | |
1940 int ocount = offsetcount - (offsetcount % 3); | |
1941 | |
1942 // FIXME: This is lame that we have to second-guess our caller here. | |
1943 // The API should change to either fail-hard when we don't have enough offse
t space | |
1944 // or that we shouldn't ask our callers to pre-allocate in the first place. | |
1945 bool using_temporary_offsets = false; | |
1946 if (re->top_backref > 0 && re->top_backref >= ocount/3) { | |
1947 ocount = re->top_backref * 3 + 3; | |
1948 matchBlock.offsetVector = new int[ocount]; | |
1949 if (!matchBlock.offsetVector) | |
1950 return JSRegExpErrorNoMemory; | |
1951 using_temporary_offsets = true; | |
1952 } else | |
1953 matchBlock.offsetVector = offsets; | |
1954 | |
1955 matchBlock.offsetEnd = ocount; | |
1956 matchBlock.offsetMax = (2*ocount)/3; | |
1957 matchBlock.offsetOverflow = false; | |
1958 | |
1959 /* Compute the minimum number of offsets that we need to reset each time. Do
ing | |
1960 this makes a huge difference to execution time when there aren't many brack
ets | |
1961 in the pattern. */ | |
1962 | |
1963 int resetcount = 2 + re->top_bracket * 2; | |
1964 if (resetcount > offsetcount) | |
1965 resetcount = ocount; | |
1966 | |
1967 /* Reset the working variable associated with each extraction. These should | |
1968 never be used unless previously set, but they get saved and restored, and s
o we | |
1969 initialize them to avoid reading uninitialized locations. */ | |
1970 | |
1971 if (matchBlock.offsetVector) { | |
1972 int* iptr = matchBlock.offsetVector + ocount; | |
1973 int* iend = iptr - resetcount/2 + 1; | |
1974 while (--iptr >= iend) | |
1975 *iptr = -1; | |
1976 } | |
1977 | |
1978 /* Set up the first character to match, if available. The first_byte value i
s | |
1979 never set for an anchored regular expression, but the anchoring may be forc
ed | |
1980 at run time, so we have to test for anchoring. The first char may be unset
for | |
1981 an unanchored pattern, of course. If there's no first char and the pattern
was | |
1982 studied, there may be a bitmap of possible first characters. */ | |
1983 | |
1984 bool first_byte_caseless = false; | |
1985 int first_byte = -1; | |
1986 if (re->options & UseFirstByteOptimizationOption) { | |
1987 first_byte = re->first_byte & 255; | |
1988 if ((first_byte_caseless = (re->first_byte & REQ_IGNORE_CASE))) | |
1989 first_byte = toLowerCase(first_byte); | |
1990 } | |
1991 | |
1992 /* For anchored or unanchored matches, there may be a "last known required | |
1993 character" set. */ | |
1994 | |
1995 bool req_byte_caseless = false; | |
1996 int req_byte = -1; | |
1997 int req_byte2 = -1; | |
1998 if (re->options & UseRequiredByteOptimizationOption) { | |
1999 req_byte = re->req_byte & 255; // FIXME: This optimization could be made
to work for UTF16 chars as well... | |
2000 req_byte_caseless = (re->req_byte & REQ_IGNORE_CASE); | |
2001 req_byte2 = flipCase(req_byte); | |
2002 } | |
2003 | |
2004 /* Loop for handling unanchored repeated matching attempts; for anchored reg
exs | |
2005 the loop runs just once. */ | |
2006 | |
2007 const UChar* startMatch = subject + start_offset; | |
2008 const UChar* reqBytePtr = startMatch - 1; | |
2009 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByte
OptimizationOption; | |
2010 | |
2011 do { | |
2012 /* Reset the maximum number of extractions we might see. */ | |
2013 if (matchBlock.offsetVector) { | |
2014 int* iptr = matchBlock.offsetVector; | |
2015 int* iend = iptr + resetcount; | |
2016 while (iptr < iend) | |
2017 *iptr++ = -1; | |
2018 } | |
2019 | |
2020 tryFirstByteOptimization(startMatch, endSubject, first_byte, first_byte_
caseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_off
set); | |
2021 if (tryRequiredByteOptimization(startMatch, endSubject, req_byte, req_by
te2, req_byte_caseless, first_byte >= 0, reqBytePtr)) | |
2022 break; | |
2023 | |
2024 /* When a match occurs, substrings will be set for all internal extracti
ons; | |
2025 we just need to set up the whole thing as substring 0 before returning.
If | |
2026 there were too many extractions, set the return code to zero. In the ca
se | |
2027 where we had to get some local store to hold offsets for backreferences
, copy | |
2028 those back references that we can. In this case there need not be overf
low | |
2029 if certain parts of the pattern were not used. */ | |
2030 | |
2031 /* The code starts after the JSRegExp block and the capture name table.
*/ | |
2032 const unsigned char* start_code = reinterpret_cast<const unsigned char*>
(re + 1); | |
2033 | |
2034 int returnCode = match(startMatch, start_code, 2, matchBlock); | |
2035 | |
2036 /* When the result is no match, advance the pointer to the next characte
r | |
2037 and continue. */ | |
2038 if (returnCode == 0) { | |
2039 startMatch++; | |
2040 continue; | |
2041 } | |
2042 | |
2043 if (returnCode != 1) { | |
2044 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExp
ErrorNoMemory); | |
2045 DPRINTF((">>>> error: returning %d\n", returnCode)); | |
2046 return returnCode; | |
2047 } | |
2048 | |
2049 /* We have a match! Copy the offset information from temporary store if | |
2050 necessary */ | |
2051 | |
2052 if (using_temporary_offsets) { | |
2053 if (offsetcount >= 4) { | |
2054 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetcount -
2) * sizeof(int)); | |
2055 DPRINTF(("Copied offsets from temporary memory\n")); | |
2056 } | |
2057 if (matchBlock.endOffsetTop > offsetcount) | |
2058 matchBlock.offsetOverflow = true; | |
2059 | |
2060 DPRINTF(("Freeing temporary memory\n")); | |
2061 delete [] matchBlock.offsetVector; | |
2062 } | |
2063 | |
2064 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2
; | |
2065 | |
2066 if (offsetcount < 2) | |
2067 returnCode = 0; | |
2068 else { | |
2069 offsets[0] = startMatch - matchBlock.startSubject; | |
2070 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; | |
2071 } | |
2072 | |
2073 DPRINTF((">>>> returning %d\n", returnCode)); | |
2074 return returnCode; | |
2075 } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); | |
2076 | |
2077 if (using_temporary_offsets) { | |
2078 DPRINTF(("Freeing temporary memory\n")); | |
2079 delete [] matchBlock.offsetVector; | |
2080 } | |
2081 | |
2082 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); | |
2083 return JSRegExpErrorNoMatch; | |
2084 } | |
2085 | |
2086 } } // namespace dart::jscre | |
OLD | NEW |