Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: runtime/third_party/jscre/pcre_compile.cpp

Issue 1071713003: - Remove JSCRE from the runtime. (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/
Patch Set: Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function jsRegExpExecute(), along with
42 supporting internal functions that are not used by other modules. */
43
44 #include "config.h"
45
46 #include "pcre_internal.h"
47
48 #include <string.h>
49 #include "ASCIICType.h"
50
/* Negative sentinel values for the "first character" and "required
   character" tracking variables (firstbyte / reqbyte in compileBranch).
   REQ_UNSET means no character-matching item has been seen yet; REQ_NONE
   means an item was seen that does not fix a single character. */

#define REQ_UNSET (-2)
#define REQ_NONE (-1)
55
56 /*************************************************
57 * Code parameters and static tables *
58 *************************************************/
59
/* Maximum number of items on the nested bracket stacks at compile time. This
   applies to the nesting of all kinds of parentheses. It does not limit
   un-nested, non-capturing parentheses. The number can be made bigger if
   necessary - it is used to dimension one int vector and one unsigned char
   vector at compile time. */

#define BRASTACK_SIZE 200
67
68 namespace dart { namespace jscre {
69
70 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
71 are simple data values; negative values are for special things like \d and so
72 on. Zero means further processing is needed (for things like \x), or the escape
73 is invalid. */
74
75 static const short escapes[] = {
76 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
77 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
78 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
79 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
80 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
81 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
82 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
83 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
84 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
85 0, 0, 0 /* x - z */
86 };
87
/* Error code numbers. They are given names so that they can more easily be
   tracked. The numeric value of ERRn is n; errorText() relies on that to
   index the message table. */

enum ErrorCode {
    ERR0,
    ERR1,
    ERR2,
    ERR3,
    ERR4,
    ERR5,
    ERR6,
    ERR7,
    ERR8,
    ERR9,
    ERR10,
    ERR11,
    ERR12,
    ERR13,
    ERR14,
    ERR15,
    ERR16,
    ERR17
};

/* Returns the text of a compile-time error message. The result is a plain
   "const char*" because it is passed to the outside world.

   The messages are stored back to back in one array, separated by NUL bytes,
   with message n belonging to ERRn (n >= 1). ERR0 has no message of its own
   and yields the same text as ERR1.

   Arguments:
     code   one of the ErrorCode values above

   Returns: pointer to a NUL-terminated message inside a static table */

static const char* errorText(ErrorCode code)
{
    static const char errorTexts[] =
        /* 1 */
        "\\ at end of pattern\0"
        "\\c at end of pattern\0"
        "character value in \\x{...} sequence is too large\0"
        "numbers out of order in {} quantifier\0"
        /* 5 */
        "number too big in {} quantifier\0"
        "missing terminating ] for character class\0"
        "internal error: code overflow\0"
        "range out of order in character class\0"
        "nothing to repeat\0"
        /* 10 */
        "unmatched parentheses\0"
        "internal error: unexpected repeat\0"
        "unrecognized character after (?\0"
        "failed to get memory\0"
        "missing )\0"
        /* 15 */
        "reference to non-existent subpattern\0"
        "regular expression too large\0"
        "parentheses nested too deeply"
        ;

    /* Step over (code - 1) complete messages, terminator included. */
    const char* text = errorTexts;
    for (int toSkip = static_cast<int>(code) - 1; toSkip > 0; --toSkip)
        text += strlen(text) + 1;
    return text;
}
131
/* Structure for passing "static" information around between the functions
   doing the compiling. Every field starts out as zero / false. */

struct CompileData {
    CompileData()
        : top_backref(0)
        , backrefMap(0)
        , req_varyopt(0)
        , needOuterBracket(false)
        , numCapturingBrackets(0)
    {
    }

    int top_backref;            /* Maximum back reference */
    unsigned backrefMap;        /* Bitmap of low back refs */
    int req_varyopt;            /* "After variable item" flag for reqbyte */
    bool needOuterBracket;      /* NOTE(review): set/used outside this view - confirm meaning */
    int numCapturingBrackets;   /* Handed to checkEscape() as its bracket count */
};
149
150 /* Definitions to allow mutual recursion */
151
152 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UCha r*, ErrorCode*, int, int*, int*, CompileData&);
153 static bool bracketIsAnchored(const unsigned char* code);
154 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap);
155 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool ina ssert);
156
157 /*************************************************
158 * Handle escapes *
159 *************************************************/
160
161 /* This function is called when a \ has been encountered. It either returns a
162 positive value for a simple escape such as \n, or a negative value which
163 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
164 a positive value greater than 255 may be returned. On entry, ptr is pointing at
165 the \. On exit, it is on the final character of the escape sequence.
166
167 Arguments:
168 ptrptr points to the pattern position pointer
169 errorcodeptr points to the errorcode variable
170 bracount number of previous extracting brackets
171 options the options bits
172 isclass true if inside a character class
173
174 Returns: zero or positive => a data character
175 negative => a special escape sequence
176 on error, errorptr is set
177 */
178
179 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)
180 {
181 const UChar* ptr = *ptrptr + 1;
182
183 /* If backslash is at the end of the pattern, it's an error. */
184 if (ptr == patternEnd) {
185 *errorcodeptr = ERR1;
186 *ptrptr = ptr;
187 return 0;
188 }
189
190 int c = *ptr;
191
192 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
193 a table. A non-zero result is something that can be returned immediately.
194 Otherwise further processing may be required. */
195
196 if (c < '0' || c > 'z') { /* Not alphameric */
197 } else if (int escapeValue = escapes[c - '0']) {
198 c = escapeValue;
199 if (isclass) {
200 if (-c == ESC_b)
201 c = '\b'; /* \b is backslash in a class */
202 else if (-c == ESC_B)
203 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
204 }
205 /* Escapes that need further processing, or are illegal. */
206
207 } else {
208 switch (c) {
209 case '1':
210 case '2':
211 case '3':
212 case '4':
213 case '5':
214 case '6':
215 case '7':
216 case '8':
217 case '9':
218 /* Escape sequences starting with a non-zero digit are backrefer ences,
219 unless there are insufficient brackets, in which case they are octal
220 escape sequences. Those sequences end on the first non-octal ch aracter
221 or when we overflow 0-255, whichever comes first. */
222
223 if (!isclass) {
224 const UChar* oldptr = ptr;
225 c -= '0';
226 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c < = bracount)
227 c = c * 10 + *(++ptr) - '0';
228 if (c <= bracount) {
229 c = -(ESC_REF + c);
230 break;
231 }
232 ptr = oldptr; /* Put the pointer back and fall through */
233 }
234
235 /* Handle an octal number following \. If the first digit is 8 o r 9,
236 this is not octal. */
237
238 if ((c = *ptr) >= '8')
239 break;
240
241 /* \0 always starts an octal number, but we may drop through to here with a
242 larger first octal digit. */
243
244 case '0': {
245 c -= '0';
246 int i;
247 for (i = 1; i <= 2; ++i) {
248 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
249 break;
250 int cc = c * 8 + ptr[i] - '0';
251 if (cc > 255)
252 break;
253 c = cc;
254 }
255 ptr += i - 1;
256 break;
257 }
258
259 case 'x': {
260 c = 0;
261 int i;
262 for (i = 1; i <= 2; ++i) {
263 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
264 c = 'x';
265 i = 1;
266 break;
267 }
268 int cc = ptr[i];
269 if (cc >= 'a')
270 cc -= 32; /* Convert to upper case */
271 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
272 }
273 ptr += i - 1;
274 break;
275 }
276
277 case 'u': {
278 c = 0;
279 int i;
280 for (i = 1; i <= 4; ++i) {
281 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
282 c = 'u';
283 i = 1;
284 break;
285 }
286 int cc = ptr[i];
287 if (cc >= 'a')
288 cc -= 32; /* Convert to upper case */
289 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
290 }
291 ptr += i - 1;
292 break;
293 }
294
295 case 'c':
296 if (++ptr == patternEnd) {
297 *errorcodeptr = ERR2;
298 return 0;
299 }
300 c = *ptr;
301
302 /* A letter is upper-cased; then the 0x40 bit is flipped. This c oding
303 is ASCII-specific, but then the whole concept of \cx is ASCII-s pecific. */
304 c = toASCIIUpper(c) ^ 0x40;
305 break;
306 }
307 }
308
309 *ptrptr = ptr;
310 return c;
311 }
312
313 /*************************************************
314 * Check for counted repeat *
315 *************************************************/
316
317 /* This function is called when a '{' is encountered in a place where it might
318 start a quantifier. It looks ahead to see if it really is a quantifier or not.
319 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
320 where the ddds are digits.
321
322 Arguments:
323 p pointer to the first char after '{'
324
325 Returns: true or false
326 */
327
328 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
329 {
330 if (p >= patternEnd || !isASCIIDigit(*p))
331 return false;
332 p++;
333 while (p < patternEnd && isASCIIDigit(*p))
334 p++;
335 if (p < patternEnd && *p == '}')
336 return true;
337
338 if (p >= patternEnd || *p++ != ',')
339 return false;
340 if (p < patternEnd && *p == '}')
341 return true;
342
343 if (p >= patternEnd || !isASCIIDigit(*p))
344 return false;
345 p++;
346 while (p < patternEnd && isASCIIDigit(*p))
347 p++;
348
349 return (p < patternEnd && *p == '}');
350 }
351
352 /*************************************************
353 * Read repeat counts *
354 *************************************************/
355
356 /* Read an item of the form {n,m} and return the values. This is called only
357 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
358 so the syntax is guaranteed to be correct, but we need to check the values.
359
360 Arguments:
361 p pointer to first char after '{'
362 minp pointer to int for min
363 maxp pointer to int for max
364 returned as -1 if no max
365 errorcodeptr points to error code variable
366
367 Returns: pointer to '}' on success;
368 current ptr on error, with errorcodeptr set non-zero
369 */
370
371 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, Error Code* errorcodeptr)
372 {
373 int min = 0;
374 int max = -1;
375
376 /* Read the minimum value and do a paranoid check: a negative value indicate s
377 an integer overflow. */
378
379 while (isASCIIDigit(*p))
380 min = min * 10 + *p++ - '0';
381 if (min < 0 || min > 65535) {
382 *errorcodeptr = ERR5;
383 return p;
384 }
385
386 /* Read the maximum value if there is one, and again do a paranoid on its si ze.
387 Also, max must not be less than min. */
388
389 if (*p == '}')
390 max = min;
391 else {
392 if (*(++p) != '}') {
393 max = 0;
394 while (isASCIIDigit(*p))
395 max = max * 10 + *p++ - '0';
396 if (max < 0 || max > 65535) {
397 *errorcodeptr = ERR5;
398 return p;
399 }
400 if (max < min) {
401 *errorcodeptr = ERR4;
402 return p;
403 }
404 }
405 }
406
407 /* Fill in the required variables, and pass back the pointer to the terminat ing
408 '}'. */
409
410 *minp = min;
411 *maxp = max;
412 return p;
413 }
414
415 /*************************************************
416 * Find first significant op code *
417 *************************************************/
418
419 /* This is called by several functions that scan a compiled expression looking
420 for a fixed first character, or an anchoring op code etc. It skips over things
421 that do not influence this.
422
423 Arguments:
424 code pointer to the start of the group
425 Returns: pointer to the first significant opcode
426 */
427
428 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
429 {
430 while (*code == OP_BRANUMBER)
431 code += 3;
432 return code;
433 }
434
435 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsig ned char* code)
436 {
437 while (true) {
438 switch (*code) {
439 case OP_ASSERT_NOT:
440 advanceToEndOfBracket(code);
441 code += 1 + LINK_SIZE;
442 break;
443 case OP_WORD_BOUNDARY:
444 case OP_NOT_WORD_BOUNDARY:
445 ++code;
446 break;
447 case OP_BRANUMBER:
448 code += 3;
449 break;
450 default:
451 return code;
452 }
453 }
454 }
455
456 /*************************************************
457 * Get othercase range *
458 *************************************************/
459
460 /* This function is passed the start and end of a class range, in UTF-8 mode
461 with UCP support. It searches up the characters, looking for internal ranges of
462 characters in the "other" case. Each call returns the next one, updating the
463 start address.
464
465 Arguments:
466 cptr points to starting character value; updated
467 d end value
468 ocptr where to put start of othercase range
469 odptr where to put end of othercase range
470
471 Yield: true when range returned; false when no more
472 */
473
474 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
475 {
476 int c, othercase = 0;
477
478 for (c = *cptr; c <= d; c++) {
479 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)
480 break;
481 }
482
483 if (c > d)
484 return false;
485
486 *ocptr = othercase;
487 int next = othercase + 1;
488
489 for (++c; c <= d; c++) {
490 if (kjs_pcre_ucp_othercase(c) != next)
491 break;
492 next++;
493 }
494
495 *odptr = next - 1;
496 *cptr = c;
497
498 return true;
499 }
500
501 /*************************************************
502 * Convert character value to UTF-8 *
503 *************************************************/
504
505 /* This function takes an integer value in the range 0 - 0x7fffffff
506 and encodes it as a UTF-8 character in 0 to 6 bytes.
507
508 Arguments:
509 cvalue the character value
510 buffer pointer to buffer for result - at least 6 bytes long
511
512 Returns: number of characters placed in the buffer
513 */
514
515 static int encodeUTF8(int cvalue, unsigned char *buffer)
516 {
517 int i;
518 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
519 if (cvalue <= kjs_pcre_utf8_table1[i])
520 break;
521 buffer += i;
522 for (int j = i; j > 0; j--) {
523 *buffer-- = 0x80 | (cvalue & 0x3f);
524 cvalue >>= 6;
525 }
526 *buffer = kjs_pcre_utf8_table2[i] | cvalue;
527 return i + 1;
528 }
529
530 /*************************************************
531 * Compile one branch *
532 *************************************************/
533
534 /* Scan the pattern, compiling it into the code vector.
535
536 Arguments:
537 options the option bits
538 brackets points to number of extracting brackets used
539 codeptr points to the pointer to the current code point
540 ptrptr points to the current pattern pointer
541 errorcodeptr points to error code variable
542 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
543 reqbyteptr set to the last literal character required, else < 0
544 cd contains pointers to tables etc.
545
546 Returns: true on success
547 false, with *errorcodeptr set non-zero on error
548 */
549
550 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd , UChar expected)
551 {
552 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
553 }
554
555 static bool
556 compileBranch(int options, int* brackets, unsigned char** codeptr,
557 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorco deptr, int *firstbyteptr,
558 int* reqbyteptr, CompileData& cd)
559 {
560 int repeat_type, op_type;
561 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
562 int bravalue = 0;
563 int reqvary, tempreqvary;
564 int c;
565 unsigned char* code = *codeptr;
566 unsigned char* tempcode;
567 bool groupsetfirstbyte = false;
568 const UChar* ptr = *ptrptr;
569 unsigned char* previous = NULL;
570 unsigned char classbits[32];
571
572 bool class_utf8;
573 unsigned char* class_utf8data;
574 unsigned char utf8_char[6];
575
576 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
577 matching encountered yet". It gets changed to REQ_NONE if we hit something that
578 matches a non-fixed char first char; reqbyte just remains unset if we never
579 find one.
580
581 When we hit a repeat whose minimum is zero, we may have to adjust these val ues
582 to take the zero repeat into account. This is implemented by setting them t o
583 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The indivi dual
584 item types that can be repeated set these backoff variables appropriately. */
585
586 int firstbyte = REQ_UNSET;
587 int reqbyte = REQ_UNSET;
588 int zeroreqbyte = REQ_UNSET;
589 int zerofirstbyte = REQ_UNSET;
590
591 /* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zer o,
592 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
593 value > 255. It is added into the firstbyte or reqbyte variables to record the
594 case status of the value. This is used only for ASCII characters. */
595
596 int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
597
598 /* Switch on next character until the end of the branch */
599
600 for (;; ptr++) {
601 bool negate_class;
602 bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
603 int class_charcount;
604 int class_lastchar;
605 int skipbytes;
606 int subreqbyte;
607 int subfirstbyte;
608 int mclength;
609 unsigned char mcbuffer[8];
610
611 /* Next byte in the pattern */
612
613 c = ptr < patternEnd ? *ptr : 0;
614
615 /* Fill in length of a previous callout, except when the next thing is
616 a quantifier. */
617
618 bool is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && is CountedRepeat(ptr + 1, patternEnd));
619
620 switch (c) {
621 /* The branch terminates at end of string, |, or ). */
622
623 case 0:
624 if (ptr < patternEnd)
625 goto NORMAL_CHAR;
626 // End of string; fall through
627 case '|':
628 case ')':
629 *firstbyteptr = firstbyte;
630 *reqbyteptr = reqbyte;
631 *codeptr = code;
632 *ptrptr = ptr;
633 return true;
634
635 /* Handle single-character metacharacters. In multiline mode, ^ disa bles
636 the setting of any following char as a first character. */
637
638 case '^':
639 if (options & MatchAcrossMultipleLinesOption) {
640 if (firstbyte == REQ_UNSET)
641 firstbyte = REQ_NONE;
642 *code++ = OP_BOL;
643 } else
644 *code++ = OP_CIRC;
645 previous = NULL;
646 break;
647
648 case '$':
649 previous = NULL;
650 if (options & MatchAcrossMultipleLinesOption)
651 *code++ = OP_EOL;
652 else
653 *code++ = OP_DOLL;
654 break;
655
656 /* There can never be a first char if '.' is first, whatever happens about
657 repeats. The value of reqbyte doesn't change either. */
658
659 case '.':
660 if (firstbyte == REQ_UNSET)
661 firstbyte = REQ_NONE;
662 zerofirstbyte = firstbyte;
663 zeroreqbyte = reqbyte;
664 previous = code;
665 *code++ = OP_NOT_NEWLINE;
666 break;
667
668 /* Character classes. If the included characters are all < 256, we b uild a
669 32-byte bitmap of the permitted characters, except in the special c ase
670 where there is only one such character. For negated classes, we bui ld the
671 map as usual, then invert it at the end. However, we use a differen t opcode
672 so that data characters > 255 can be handled correctly.
673
674 If the class contains characters outside the 0-255 range, a differe nt
675 opcode is compiled. It may optionally have a bit map for characters < 256,
676 but those above are are explicitly listed afterwards. A flag byte t ells
677 whether the bitmap is present, and whether this is a negated class or not.
678 */
679
680 case '[': {
681 previous = code;
682 should_flip_negation = false;
683
684 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
685 they are encountered at the top level, so we'll do that too. */
686
687 /* If the first character is '^', set the negation flag and skip it. */
688
689 if (ptr + 1 >= patternEnd) {
690 *errorcodeptr = ERR6;
691 return false;
692 }
693
694 if (ptr[1] == '^') {
695 negate_class = true;
696 ++ptr;
697 } else
698 negate_class = false;
699
700 /* Keep a count of chars with values < 256 so that we can optimi ze the case
701 of just a single character (as long as it's < 256). For higher valued UTF-8
702 characters, we don't yet do any optimization. */
703
704 class_charcount = 0;
705 class_lastchar = -1;
706
707 class_utf8 = false; /* No chars >= 256 */
708 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
709
710 /* Initialize the 32-char bit map to all zeros. We have to build the
711 map in a temporary bit of store, in case the class contains onl y 1
712 character (< 256), because in that case the compiled code doesn 't use the
713 bit map. */
714
715 memset(classbits, 0, 32 * sizeof(unsigned char));
716
717 /* Process characters until ] is reached. The first pass
718 through the regex checked the overall syntax, so we don't need to be very
719 strict here. At the start of the loop, c contains the first byt e of the
720 character. */
721
722 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
723 /* Backslash may introduce a single character, or it may int roduce one
724 of the specials, which just set a flag. Escaped items are c hecked for
725 validity in the pre-compiling pass. The sequence \b is a sp ecial case.
726 Inside a class (and only there) it is treated as backspace. Elsewhere
727 it marks a word boundary. Other escapes have preset maps re ady to
728 or into the one we are building. We assume they have more t han one
729 character in them, so set class_charcount bigger than one. */
730
731 if (c == '\\') {
732 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCa pturingBrackets, true);
733 if (c < 0) {
734 class_charcount += 2; /* Greater than 1 is what matters */
735 switch (-c) {
736 case ESC_d:
737 for (c = 0; c < 32; c++)
738 classbits[c] |= classBitmapForChar(c + c bit_digit);
739 continue;
740
741 case ESC_D:
742 should_flip_negation = true;
743 for (c = 0; c < 32; c++)
744 classbits[c] |= ~classBitmapForChar(c + cbit_digit);
745 continue;
746
747 case ESC_w:
748 for (c = 0; c < 32; c++)
749 classbits[c] |= classBitmapForChar(c + c bit_word);
750 continue;
751
752 case ESC_W:
753 should_flip_negation = true;
754 for (c = 0; c < 32; c++)
755 classbits[c] |= ~classBitmapForChar(c + cbit_word);
756 continue;
757
758 case ESC_s:
759 for (c = 0; c < 32; c++)
760 classbits[c] |= classBitmapForChar(c + cbit_space);
761 continue;
762
763 case ESC_S:
764 should_flip_negation = true;
765 for (c = 0; c < 32; c++)
766 classbits[c] |= ~classBitmapForChar(c + cbit_space);
767 continue;
768
769 /* Unrecognized escapes are faulted if PCRE is running in its
770 strict mode. By default, for compatibility with Perl, they are
771 treated as literals. */
772
773 default:
774 c = *ptr; /* The final characte r */
775 class_charcount -= 2; /* Undo the default c ount from above */
776 }
777 }
778
779 /* Fall through if we have a single character (c >= 0). This may be
780 > 256 in UTF-8 mode. */
781
782 } /* End of backslash handling */
783
784 /* A single character may be followed by '-' to form a range . However,
785 Perl does not permit ']' to be the end of the range. A '-' character
786 here is treated as a literal. */
787
788 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']' ) {
789 ptr += 2;
790
791 int d = *ptr;
792
793 /* The second part of a range can be a single-character escape, but
794 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
795 in such circumstances. */
796
797 if (d == '\\') {
798 const UChar* oldptr = ptr;
799 d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.n umCapturingBrackets, true);
800
801 /* \X is literal X; any other special means the '-' was literal */
802 if (d < 0) {
803 ptr = oldptr - 2;
804 goto LONE_SINGLE_CHARACTER; /* A few lines belo w */
805 }
806 }
807
808 /* The check that the two values are in the correct orde r happens in
809 the pre-pass. Optimize one-character ranges */
810
811 if (d == c)
812 goto LONE_SINGLE_CHARACTER; /* A few lines below */
813
814 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
815 matching, we have to use an XCLASS with extra data item s. Caseless
816 matching for characters > 127 is available only if UCP support is
817 available. */
818
819 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127 ))) {
820 class_utf8 = true;
821
822 /* With UCP support, we can find the other case equi valents of
823 the relevant characters. There may be several range s. Optimize how
824 they fit with the basic range. */
825
826 if (options & IgnoreCaseOption) {
827 int occ, ocd;
828 int cc = c;
829 int origd = d;
830 while (getOthercaseRange(&cc, origd, &occ, &ocd) ) {
831 if (occ >= c && ocd <= d)
832 continue; /* Skip embedded ranges */
833
834 if (occ < c && ocd >= c - 1) /* Exte nd the basic range */
835 { /* if the re is overlap, */
836 c = occ; /* no ting that if occ < c */
837 continue; /* we can't have ocd > d */
838 } /* becaus e a subrange is */
839 if (ocd > d && occ <= d + 1) /* alwa ys shorter than */
840 { /* the ba sic range. */
841 d = ocd;
842 continue;
843 }
844
845 if (occ == ocd)
846 *class_utf8data++ = XCL_SINGLE;
847 else {
848 *class_utf8data++ = XCL_RANGE;
849 class_utf8data += encodeUTF8(occ, class_ utf8data);
850 }
851 class_utf8data += encodeUTF8(ocd, class_utf8 data);
852 }
853 }
854
855 /* Now record the original range, possibly modified for UCP caseless
856 overlapping ranges. */
857
858 *class_utf8data++ = XCL_RANGE;
859 class_utf8data += encodeUTF8(c, class_utf8data);
860 class_utf8data += encodeUTF8(d, class_utf8data);
861
862 /* With UCP support, we are done. Without UCP suppor t, there is no
863 caseless matching for UTF-8 characters > 127; we ca n use the bit map
864 for the smaller ones. */
865
866 continue; /* With next character in the class */
867 }
868
869 /* We use the bit map for all cases when not in UTF-8 mo de; else
870 ranges that lie entirely within 0-127 when there is UCP support; else
871 for partial ranges without UCP support. */
872
873 for (; c <= d; c++) {
874 classbits[c/8] |= (1 << (c&7));
875 if (options & IgnoreCaseOption) {
876 int uc = flipCase(c);
877 classbits[uc/8] |= (1 << (uc&7));
878 }
879 class_charcount++; /* in case a one-c har range */
880 class_lastchar = c;
881 }
882
883 continue; /* Go get the next char in the class */
884 }
885
886 /* Handle a lone single character - we can get here for a no rmal
887 non-escape char, or after \ that introduces a single charac ter or for an
888 apparent range that isn't. */
889
890 LONE_SINGLE_CHARACTER:
891
892 /* Handle a character that cannot go in the bit map */
893
894 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
895 class_utf8 = true;
896 *class_utf8data++ = XCL_SINGLE;
897 class_utf8data += encodeUTF8(c, class_utf8data);
898
899 if (options & IgnoreCaseOption) {
900 int othercase;
901 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {
902 *class_utf8data++ = XCL_SINGLE;
903 class_utf8data += encodeUTF8(othercase, class_ut f8data);
904 }
905 }
906 } else {
907 /* Handle a single-byte character */
908 classbits[c/8] |= (1 << (c&7));
909 if (options & IgnoreCaseOption) {
910 c = flipCase(c);
911 classbits[c/8] |= (1 << (c&7));
912 }
913 class_charcount++;
914 class_lastchar = c;
915 }
916 }
917
918 /* If class_charcount is 1, we saw precisely one character whose value is
919 less than 256. In non-UTF-8 mode we can always optimize. In UTF -8 mode, we
920 can optimize the negative case only if there were no characters >= 128
921 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
922 single-bytes only. This is an historical hangover. Maybe one da y we can
923 tidy these opcodes to handle multi-byte characters.
924
925 The optimization throws away the bit map. We turn the item into a
926 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's neg ative. Note
927 that OP_NOT does not support multibyte characters. In the posit ive case, it
928 can cause firstbyte to be set. Otherwise, there can be no first char if
929 this item is first, whatever repeat count may follow. In the ca se of
930 reqbyte, save the previous value for reinstating. */
931
932 if (class_charcount == 1 && (!class_utf8 && (!negate_class || cl ass_lastchar < 128))) {
933 zeroreqbyte = reqbyte;
934
935 /* The OP_NOT opcode works on one-byte characters only. */
936
937 if (negate_class) {
938 if (firstbyte == REQ_UNSET)
939 firstbyte = REQ_NONE;
940 zerofirstbyte = firstbyte;
941 *code++ = OP_NOT;
942 *code++ = class_lastchar;
943 break;
944 }
945
946 /* For a single, positive character, get the value into c, a nd
947 then we can handle this with the normal one-character code. */
948
949 c = class_lastchar;
950 goto NORMAL_CHAR;
951 } /* End of 1-char optimization */
952
953 /* The general case - not the one-char optimization. If this is the first
954 thing in the branch, there can be no first char setting, whatev er the
955 repeat count. Any reqbyte setting must remain unchanged after a ny kind of
956 repeat. */
957
958 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
959 zerofirstbyte = firstbyte;
960 zeroreqbyte = reqbyte;
961
962 /* If there are characters with values > 255, we have to compile an
963 extended class, with its own opcode. If there are no characters < 256,
964 we can omit the bitmap. */
965
966 if (class_utf8 && !should_flip_negation) {
967 *class_utf8data++ = XCL_END; /* Marks the end of extra da ta */
968 *code++ = OP_XCLASS;
969 code += LINK_SIZE;
970 *code = negate_class? XCL_NOT : 0;
971
972 /* If the map is required, install it, and move on to the en d of
973 the extra data */
974
975 if (class_charcount > 0) {
976 *code++ |= XCL_MAP;
977 memcpy(code, classbits, 32);
978 code = class_utf8data;
979 }
980
981 /* If the map is not required, slide down the extra data. */
982
983 else {
984 int len = class_utf8data - (code + 33);
985 memmove(code + 1, code + 33, len);
986 code += len + 1;
987 }
988
989 /* Now fill in the complete length of the item */
990
991 putLinkValue(previous + 1, code - previous);
992 break; /* End of class handling */
993 }
994
995 /* If there are no characters > 255, negate the 32-byte map if necessary,
996 and copy it into the code vector. If this is the first thing in the branch,
997 there can be no first char setting, whatever the repeat count. Any reqbyte
998 setting must remain unchanged after any kind of repeat. */
999
1000 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP _NCLASS;
1001 if (negate_class)
1002 for (c = 0; c < 32; c++)
1003 code[c] = ~classbits[c];
1004 else
1005 memcpy(code, classbits, 32);
1006 code += 32;
1007 break;
1008 }
1009
1010 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1011 has been tested above. */
1012
1013 case '{':
1014 if (!is_quantifier)
1015 goto NORMAL_CHAR;
1016 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorc odeptr);
1017 if (*errorcodeptr)
1018 goto FAILED;
1019 goto REPEAT;
1020
1021 case '*':
1022 repeat_min = 0;
1023 repeat_max = -1;
1024 goto REPEAT;
1025
1026 case '+':
1027 repeat_min = 1;
1028 repeat_max = -1;
1029 goto REPEAT;
1030
1031 case '?':
1032 repeat_min = 0;
1033 repeat_max = 1;
1034
1035 REPEAT:
1036 if (!previous) {
1037 *errorcodeptr = ERR9;
1038 goto FAILED;
1039 }
1040
1041 if (repeat_min == 0) {
1042 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
1043 reqbyte = zeroreqbyte; /* Ditto */
1044 }
1045
1046 /* Remember whether this is a variable length repeat */
1047
1048 reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;
1049
1050 op_type = 0; /* Default single-char op codes */
1051
1052 /* Save start of previous item, in case we have to move it up to make space
1053 for an inserted OP_ONCE for the additional '+' extension. */
1054 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1055
1056 tempcode = previous;
1057
1058 /* If the next character is '+', we have a possessive quantifier. This
1059 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1060 If the next character is '?' this is a minimizing repeat, by default,
1061 but if PCRE_UNGREEDY is set, it works the other way round. We change the
1062 repeat type to the non-default. */
1063
1064 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
1065 repeat_type = 1;
1066 ptr++;
1067 } else
1068 repeat_type = 0;
1069
1070 /* If previous was a character match, abolish the item and generate a
1071 repeat item instead. If a char item has a minimum of more than one, ensure
1072 that it is set in reqbyte - it might not be if a sequence such as x{3} is
1073 the first thing in a branch because the x will have gone into firstbyte
1074 instead. */
1075
1076 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1077 /* Deal with UTF-8 characters that take up more than one byt e. It's
1078 easier to write this out separately than try to macrify it. Use c to
1079 hold the length of the character in bytes, plus 0x80 to fla g that it's a
1080 length rather than a small character. */
1081
1082 if (code[-1] & 0x80) {
1083 unsigned char *lastchar = code - 1;
1084 while((*lastchar & 0xc0) == 0x80)
1085 lastchar--;
1086 c = code - lastchar; /* Length of UTF-8 chara cter */
1087 memcpy(utf8_char, lastchar, c); /* Save the char */
1088 c |= 0x80; /* Flag c as a length */
1089 }
1090 else {
1091 c = code[-1];
1092 if (repeat_min > 1)
1093 reqbyte = c | req_caseopt | cd.req_varyopt;
1094 }
1095
1096 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single char acter types */
1097 }
1098
1099 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LET TER_IGNORING_CASE) {
1100 c = previous[1];
1101 if (repeat_min > 1)
1102 reqbyte = c | req_caseopt | cd.req_varyopt;
1103 goto OUTPUT_SINGLE_REPEAT;
1104 }
1105
1106 /* If previous was a single negated character ([^a] or similar), we use
1107 one of the special opcodes, replacing it. The code is shared with single-
1108 character repeats by setting op_type to add a suitable offset into
1109 repeat_type. OP_NOT is currently used only for single-byte chars. */
1110
1111 else if (*previous == OP_NOT) {
1112 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1113 c = previous[1];
1114 goto OUTPUT_SINGLE_REPEAT;
1115 }
1116
1117 /* If previous was a character type match (\d or similar), aboli sh it and
1118 create a suitable repeat item. The code is shared with single-c haracter
1119 repeats by setting op_type to add a suitable offset into repeat _type. */
1120
1121 else if (*previous <= OP_NOT_NEWLINE) {
1122 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1123 c = *previous;
1124
1125 OUTPUT_SINGLE_REPEAT:
1126 int prop_type = -1;
1127 int prop_value = -1;
1128
1129 unsigned char* oldcode = code;
1130 code = previous; /* Usually overwrite previ ous item */
1131
1132 /* If the maximum is zero then the minimum must also be zero ; Perl allows
1133 this case, so we do too - by simply omitting the item altog ether. */
1134
1135 if (repeat_max == 0)
1136 goto END_REPEAT;
1137
1138 /* Combine the op_type with the repeat_type */
1139
1140 repeat_type += op_type;
1141
1142 /* A minimum of zero is handled either as the special case * or ?, or as
1143 an UPTO, with the maximum given. */
1144
1145 if (repeat_min == 0) {
1146 if (repeat_max == -1)
1147 *code++ = OP_STAR + repeat_type;
1148 else if (repeat_max == 1)
1149 *code++ = OP_QUERY + repeat_type;
1150 else {
1151 *code++ = OP_UPTO + repeat_type;
1152 put2ByteValueAndAdvance(code, repeat_max);
1153 }
1154 }
1155
1156 /* A repeat minimum of 1 is optimized into some special cases. If the
1157 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
1158 left in place and, if the maximum is greater than 1, we use OP_UPTO with
1159 one less than the maximum. */
1160
1161 else if (repeat_min == 1) {
1162 if (repeat_max == -1)
1163 *code++ = OP_PLUS + repeat_type;
1164 else {
1165 code = oldcode; /* leave previous it em in place */
1166 if (repeat_max == 1)
1167 goto END_REPEAT;
1168 *code++ = OP_UPTO + repeat_type;
1169 put2ByteValueAndAdvance(code, repeat_max - 1);
1170 }
1171 }
1172
1173 /* The case {n,n} is just an EXACT, while the general case { n,m} is
1174 handled as an EXACT followed by an UPTO. */
1175
1176 else {
1177 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1178 put2ByteValueAndAdvance(code, repeat_min);
1179
1180 /* If the maximum is unlimited, insert an OP_STAR. Befor e doing so,
1181 we have to insert the character for the previous code. For a repeated
1182 Unicode property match, there are two extra bytes that define the
1183 required property. In UTF-8 mode, long characters have their length in
1184 c, with the 0x80 bit as a flag. */
1185
1186 if (repeat_max < 0) {
1187 if (c >= 128) {
1188 memcpy(code, utf8_char, c & 7);
1189 code += c & 7;
1190 } else {
1191 *code++ = c;
1192 if (prop_type >= 0) {
1193 *code++ = prop_type;
1194 *code++ = prop_value;
1195 }
1196 }
1197 *code++ = OP_STAR + repeat_type;
1198 }
1199
1200 /* Else insert an UPTO if the max is greater than the mi n, again
1201 preceded by the character, for the previously inserted code. */
1202
1203 else if (repeat_max != repeat_min) {
1204 if (c >= 128) {
1205 memcpy(code, utf8_char, c & 7);
1206 code += c & 7;
1207 } else
1208 *code++ = c;
1209 if (prop_type >= 0) {
1210 *code++ = prop_type;
1211 *code++ = prop_value;
1212 }
1213 repeat_max -= repeat_min;
1214 *code++ = OP_UPTO + repeat_type;
1215 put2ByteValueAndAdvance(code, repeat_max);
1216 }
1217 }
1218
1219 /* The character or character type itself comes last in all cases. */
1220
1221 if (c >= 128) {
1222 memcpy(code, utf8_char, c & 7);
1223 code += c & 7;
1224 } else
1225 *code++ = c;
1226
1227 /* For a repeated Unicode property match, there are two extr a bytes that
1228 define the required property. */
1229
1230 if (prop_type >= 0) {
1231 *code++ = prop_type;
1232 *code++ = prop_value;
1233 }
1234 }
1235
1236 /* If previous was a character class or a back reference, we put the repeat
1237 stuff after it, but just skip the item if the repeat was {0,0}. */
1238
1239 else if (*previous == OP_CLASS ||
1240 *previous == OP_NCLASS ||
1241 *previous == OP_XCLASS ||
1242 *previous == OP_REF)
1243 {
1244 if (repeat_max == 0) {
1245 code = previous;
1246 goto END_REPEAT;
1247 }
1248
1249 if (repeat_min == 0 && repeat_max == -1)
1250 *code++ = OP_CRSTAR + repeat_type;
1251 else if (repeat_min == 1 && repeat_max == -1)
1252 *code++ = OP_CRPLUS + repeat_type;
1253 else if (repeat_min == 0 && repeat_max == 1)
1254 *code++ = OP_CRQUERY + repeat_type;
1255 else {
1256 *code++ = OP_CRRANGE + repeat_type;
1257 put2ByteValueAndAdvance(code, repeat_min);
1258 if (repeat_max == -1)
1259 repeat_max = 0; /* 2-byte encoding for max */
1260 put2ByteValueAndAdvance(code, repeat_max);
1261 }
1262 }
1263
1264 /* If previous was a bracket group, we may have to replicate it in certain
1265 cases. */
1266
1267 else if (*previous >= OP_BRA) {
1268 int ketoffset = 0;
1269 int len = code - previous;
1270 unsigned char* bralink = NULL;
1271
1272 /* If the maximum repeat count is unlimited, find the end of the bracket
1273 by scanning through from the start, and compute the offset back to it
1274 from the current code pointer. There may be an OP_OPT setti ng following
1275 the final KET, so we can't find the end just by going back from the code
1276 pointer. */
1277
1278 if (repeat_max == -1) {
1279 const unsigned char* ket = previous;
1280 advanceToEndOfBracket(ket);
1281 ketoffset = code - ket;
1282 }
1283
1284 /* The case of a zero minimum is special because of the need to stick
1285 OP_BRAZERO in front of it, and because the group appears on ce in the
1286 data, whereas in other cases it appears the minimum number of times. For
1287 this reason, it is simplest to treat this case separately, as otherwise
1288 the code gets far too messy. There are several special subc ases when the
1289 minimum is zero. */
1290
1291 if (repeat_min == 0) {
1292 /* If the maximum is also zero, we just omit the group f rom the output
1293 altogether. */
1294
1295 if (repeat_max == 0) {
1296 code = previous;
1297 goto END_REPEAT;
1298 }
1299
1300 /* If the maximum is 1 or unlimited, we just have to sti ck in the
1301 BRAZERO and do no more at this point. However, we do ne ed to adjust
1302 any OP_RECURSE calls inside the group that refer to the group itself or
1303 any internal group, because the offset is from the star t of the whole
1304 regex. Temporarily terminate the pattern while doing th is. */
1305
1306 if (repeat_max <= 1) {
1307 *code = OP_END;
1308 memmove(previous+1, previous, len);
1309 code++;
1310 *previous++ = OP_BRAZERO + repeat_type;
1311 }
1312
1313 /* If the maximum is greater than 1 and limited, we have to replicate
1314 in a nested fashion, sticking OP_BRAZERO before each se t of brackets.
1315 The first one has to be handled carefully because it's the original
1316 copy, which has to be moved up. The remainder can be ha ndled by code
1317 that is common with the non-zero minimum case below. We have to
1318 adjust the value of repeat_max, since one less copy is required. */
1319
1320 else {
1321 *code = OP_END;
1322 memmove(previous + 2 + LINK_SIZE, previous, len);
1323 code += 2 + LINK_SIZE;
1324 *previous++ = OP_BRAZERO + repeat_type;
1325 *previous++ = OP_BRA;
1326
1327 /* We chain together the bracket offset fields that have to be
1328 filled in later when the ends of the brackets are r eached. */
1329
1330 int offset = (!bralink) ? 0 : previous - bralink;
1331 bralink = previous;
1332 putLinkValueAllowZeroAndAdvance(previous, offset);
1333 }
1334
1335 repeat_max--;
1336 }
1337
1338 /* If the minimum is greater than zero, replicate the group as many
1339 times as necessary, and adjust the maximum to the number of subsequent
1340 copies that we need. If we set a first char from the group, and didn't
1341 set a required char, copy the latter from the former. */
1342
1343 else {
1344 if (repeat_min > 1) {
1345 if (groupsetfirstbyte && reqbyte < 0)
1346 reqbyte = firstbyte;
1347 for (int i = 1; i < repeat_min; i++) {
1348 memcpy(code, previous, len);
1349 code += len;
1350 }
1351 }
1352 if (repeat_max > 0)
1353 repeat_max -= repeat_min;
1354 }
1355
1356 /* This code is common to both the zero and non-zero minimum cases. If
1357 the maximum is limited, it replicates the group in a nested fashion,
1358 remembering the bracket starts on a stack. In the case of a zero minimum,
1359 the first one was set up above. In all cases the repeat_max now specifies
1360 the number of additional copies needed. */
1361
1362 if (repeat_max >= 0) {
1363 for (int i = repeat_max - 1; i >= 0; i--) {
1364 *code++ = OP_BRAZERO + repeat_type;
1365
1366 /* All but the final copy start a new nesting, maint aining the
1367 chain of brackets outstanding. */
1368
1369 if (i != 0) {
1370 *code++ = OP_BRA;
1371 int offset = (!bralink) ? 0 : code - bralink;
1372 bralink = code;
1373 putLinkValueAllowZeroAndAdvance(code, offset);
1374 }
1375
1376 memcpy(code, previous, len);
1377 code += len;
1378 }
1379
1380 /* Now chain through the pending brackets, and fill in t heir length
1381 fields (which are holding the chain links pro tem). */
1382
1383 while (bralink) {
1384 int offset = code - bralink + 1;
1385 unsigned char* bra = code - offset;
1386 int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1387 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkof fset;
1388 *code++ = OP_KET;
1389 putLinkValueAndAdvance(code, offset);
1390 putLinkValue(bra + 1, offset);
1391 }
1392 }
1393
1394 /* If the maximum is unlimited, set a repeater in the final copy. We
1395 can't just offset backwards from the current code point, be cause we
1396 don't know if there's been an options resetting after the k et. The
1397 correct offset was computed above. */
1398
1399 else
1400 code[-ketoffset] = OP_KETRMAX + repeat_type;
1401 }
1402
1403 /* Else there's some kind of shambles */
1404
1405 else {
1406 *errorcodeptr = ERR11;
1407 goto FAILED;
1408 }
1409
1410 /* In all cases we no longer have a previous item. We also set the
1411 "follows varying string" flag for subsequently encountered reqbytes if
1412 it isn't already set and we have just passed a varying length item. */
1413
1414 END_REPEAT:
1415 previous = NULL;
1416 cd.req_varyopt |= reqvary;
1417 break;
1418
1419 /* Start of nested bracket sub-expression, or comment or lookahead o r
1420 lookbehind or option setting or condition. First deal with special things
1421 that can come after a bracket; all are introduced by ?, and the app earance
1422 of any of them means that this is not a referencing group. They wer e
1423 checked for validity in the first pass over the string, so we don't have to
1424 check for syntax errors here. */
1425
1426 case '(':
1427 skipbytes = 0;
1428
1429 if (*(++ptr) == '?') {
1430 switch (*(++ptr)) {
1431 case ':': /* Non-extracting bracket */
1432 bravalue = OP_BRA;
1433 ptr++;
1434 break;
1435
1436 case '=': /* Positive lookahead */
1437 bravalue = OP_ASSERT;
1438 ptr++;
1439 break;
1440
1441 case '!': /* Negative lookahead */
1442 bravalue = OP_ASSERT_NOT;
1443 ptr++;
1444 break;
1445
1446 /* Character after (? not specially recognized */
1447
1448 default:
1449 *errorcodeptr = ERR12;
1450 goto FAILED;
1451 }
1452 }
1453
1454 /* Else we have a referencing group; adjust the opcode. If the b racket
1455 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1456 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1457
1458 else {
1459 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1460 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1461 code[1 + LINK_SIZE] = OP_BRANUMBER;
1462 put2ByteValue(code + 2 + LINK_SIZE, *brackets);
1463 skipbytes = 3;
1464 }
1465 else
1466 bravalue = OP_BRA + *brackets;
1467 }
1468
1469 /* Process nested bracketed re. Assertions may not be repeated, but other
1470 kinds can be. We copy code into a non-variable in order to be a ble
1471 to pass its address because some compilers complain otherwise. Pass in a
1472 new setting for the ims options if they have changed. */
1473
1474 previous = (bravalue >= OP_BRAZERO) ? code : 0;
1475 *code = bravalue;
1476 tempcode = code;
1477 tempreqvary = cd.req_varyopt; /* Save value before bracket * /
1478
1479 if (!compileBracket(
1480 options,
1481 brackets, /* Extracting b racket count */
1482 &tempcode, /* Where to put code (updated) */
1483 &ptr, /* Input pointe r (updated) */
1484 patternEnd,
1485 errorcodeptr, /* Where to put an error message */
1486 skipbytes, /* Skip over OP _BRANUMBER */
1487 &subfirstbyte, /* For possible first char */
1488 &subreqbyte, /* For possible last char */
1489 cd)) /* Tables block */
1490 goto FAILED;
1491
1492 /* At the end of compiling, code is still pointing to the start of the
1493 group, while tempcode has been updated to point past the end of the group
1494 and any option resetting that may follow it. The pattern pointer (ptr)
1495 is on the bracket. */
1496
1497 /* Handle updating of the required and first characters. Update for normal
1498 brackets of all kinds, and conditions with two branches (see code above).
1499 If the bracket is followed by a quantifier with zero repeat, we have to
1500 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
1501 main loop so that they can be accessed for the back off. */
1502
1503 zeroreqbyte = reqbyte;
1504 zerofirstbyte = firstbyte;
1505 groupsetfirstbyte = false;
1506
1507 if (bravalue >= OP_BRA) {
1508 /* If we have not yet set a firstbyte in this branch, take i t from the
1509 subpattern, remembering that it was set here so that a repe at of more
1510 than one can replicate it as reqbyte if necessary. If the s ubpattern has
1511 no firstbyte, set "none" for the whole branch. In both case s, a zero
1512 repeat forces firstbyte to "none". */
1513
1514 if (firstbyte == REQ_UNSET) {
1515 if (subfirstbyte >= 0) {
1516 firstbyte = subfirstbyte;
1517 groupsetfirstbyte = true;
1518 }
1519 else
1520 firstbyte = REQ_NONE;
1521 zerofirstbyte = REQ_NONE;
1522 }
1523
1524 /* If firstbyte was previously set, convert the subpattern's firstbyte
1525 into reqbyte if there wasn't one, using the vary flag that was in
1526 existence beforehand. */
1527
1528 else if (subfirstbyte >= 0 && subreqbyte < 0)
1529 subreqbyte = subfirstbyte | tempreqvary;
1530
1531 /* If the subpattern set a required byte (or set a first byt e that isn't
1532 really the first byte - see above), set it. */
1533
1534 if (subreqbyte >= 0)
1535 reqbyte = subreqbyte;
1536 }
1537
1538 /* For a forward assertion, we take the reqbyte, if set. This can be
1539 helpful if the pattern that follows the assertion doesn't set a different
1540 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
1541 for an assertion, however because it leads to incorrect effect for patterns
1542 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
1543 of a firstbyte. This is overcome by a scan at the end if there's no
1544 firstbyte, looking for an asserted first char. */
1545
1546 else if (bravalue == OP_ASSERT && subreqbyte >= 0)
1547 reqbyte = subreqbyte;
1548
1549 /* Now update the main code pointer to the end of the group. */
1550
1551 code = tempcode;
1552
1553 /* Error if hit end of pattern */
1554
1555 if (ptr >= patternEnd || *ptr != ')') {
1556 *errorcodeptr = ERR14;
1557 goto FAILED;
1558 }
1559 break;
1560
1561 /* Check \ for being a real metacharacter; if not, fall through and handle
1562 it as a data character at the start of a string. Escape items are c hecked
1563 for validity in the pre-compiling pass. */
1564
1565 case '\\':
1566 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingB rackets, false);
1567
1568 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1569 are arranged to be the negation of the corresponding OP_values. For the
1570 back references, the values are ESC_REF plus the reference numb er. Only
1571 back references and those types that consume a character may be repeated.
1572 We can test for values between ESC_b and ESC_w for the latter; this may
1573 have to change if any new ones are ever created. */
1574
1575 if (c < 0) {
1576 /* For metasequences that actually match a character, we dis able the
1577 setting of a first character if it hasn't already been set. */
1578
1579 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1580 firstbyte = REQ_NONE;
1581
1582 /* Set values to reset to if this is followed by a zero repe at. */
1583
1584 zerofirstbyte = firstbyte;
1585 zeroreqbyte = reqbyte;
1586
1587 /* Back references are handled specially */
1588
1589 if (-c >= ESC_REF) {
1590 int number = -c - ESC_REF;
1591 previous = code;
1592 *code++ = OP_REF;
1593 put2ByteValueAndAdvance(code, number);
1594 }
1595
1596 /* For the rest, we can obtain the OP value by negating the escape
1597 value */
1598
1599 else {
1600 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1601 *code++ = -c;
1602 }
1603 continue;
1604 }
1605
1606 /* Fall through. */
1607
1608 /* Handle a literal character. It is guaranteed not to be whites pace or #
1609 when the extended flag is set. If we are in UTF-8 mode, it may be a
1610 multi-byte literal character. */
1611
1612 default:
1613 NORMAL_CHAR:
1614
1615 previous = code;
1616
1617 if (c < 128) {
1618 mclength = 1;
1619 mcbuffer[0] = c;
1620
1621 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1622 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1623 *code++ = c | 0x20;
1624 } else {
1625 *code++ = OP_ASCII_CHAR;
1626 *code++ = c;
1627 }
1628 } else {
1629 mclength = encodeUTF8(c, mcbuffer);
1630
1631 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CA SE : OP_CHAR;
1632 for (c = 0; c < mclength; c++)
1633 *code++ = mcbuffer[c];
1634 }
1635
1636 /* Set the first and required bytes appropriately. If no previou s first
1637 byte, set it from this character, but revert to none on a zero repeat.
1638 Otherwise, leave the firstbyte value alone, and don't change it on a zero
1639 repeat. */
1640
1641 if (firstbyte == REQ_UNSET) {
1642 zerofirstbyte = REQ_NONE;
1643 zeroreqbyte = reqbyte;
1644
1645 /* If the character is more than one byte long, we can set f irstbyte
1646 only if it is not to be matched caselessly. */
1647
1648 if (mclength == 1 || req_caseopt == 0) {
1649 firstbyte = mcbuffer[0] | req_caseopt;
1650 if (mclength != 1)
1651 reqbyte = code[-1] | cd.req_varyopt;
1652 }
1653 else
1654 firstbyte = reqbyte = REQ_NONE;
1655 }
1656
1657 /* firstbyte was previously set; we can set reqbyte only if the length is
1658 1 or the matching is caseful. */
1659
1660 else {
1661 zerofirstbyte = firstbyte;
1662 zeroreqbyte = reqbyte;
1663 if (mclength == 1 || req_caseopt == 0)
1664 reqbyte = code[-1] | req_caseopt | cd.req_varyopt;
1665 }
1666
1667 break; /* End of literal character handling */
1668 }
1669 } /* end of big loop */
1670
1671 /* Control never reaches here by falling through, only by a goto for all the
1672 error states. Pass back the position in the pattern so that it can be displayed
1673 to the user for diagnosing the error. */
1674
1675 FAILED:
1676 *ptrptr = ptr;
1677 return false;
1678 }
1679
1680 /*************************************************
1681 * Compile sequence of alternatives *
1682 *************************************************/
1683
1684 /* On entry, ptr is pointing past the bracket character, but on return
1685 it points to the closing bracket, or vertical bar, or end of string.
1686 The code variable is pointing at the byte into which the BRA operator has been
1687 stored. If the ims options are changed at the start (for a (?ims: group) or
1688 during any branch, we need to insert an OP_OPT item at the start of every
1689 following branch to ensure they get set correctly at run time, and also pass
1690 the new options into every subsequent branch compile.
1691
1692 Argument:
1693 options option bits, including any changes for this subpattern
1694 brackets -> int containing the number of extracting brackets used
1695 codeptr -> the address of the current code pointer
1696 ptrptr -> the address of the current pattern pointer
1697 errorcodeptr -> pointer to error code variable
1698 skipbytes skip this many bytes at start (for OP_BRANUMBER)
1699 firstbyteptr place to put the first required character, or a negative number
1700 reqbyteptr place to put the last required character, or a negative number
1701 cd points to the data block with tables pointers etc.
1702
1703 Returns: true on success
1704 */
1705
1706 static bool
1707 compileBracket(int options, int* brackets, unsigned char** codeptr,
1708 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,
1709 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1710 {
1711 const UChar* ptr = *ptrptr;
1712 unsigned char* code = *codeptr;
1713 unsigned char* last_branch = code;
1714 unsigned char* start_bracket = code;
1715 int firstbyte = REQ_UNSET;
1716 int reqbyte = REQ_UNSET;
1717
1718 /* Offset is set zero to mark that this bracket is still open */
1719
1720 putLinkValueAllowZero(code + 1, 0);
1721 code += 1 + LINK_SIZE + skipbytes;
1722
1723 /* Loop for each alternative branch */
1724
1725 while (true) {
1726 /* Now compile the branch */
1727
1728 int branchfirstbyte;
1729 int branchreqbyte;
1730 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcode ptr,
1731 &branchfirstbyte, &branchreqbyte, cd)) {
1732 *ptrptr = ptr;
1733 return false;
1734 }
1735
1736 /* If this is the first branch, the firstbyte and reqbyte values for the
1737 branch become the values for the regex. */
1738
1739 if (*last_branch != OP_ALT) {
1740 firstbyte = branchfirstbyte;
1741 reqbyte = branchreqbyte;
1742 }
1743
1744 /* If this is not the first branch, the first char and reqbyte have to
1745 match the values from all the previous branches, except that if the pre vious
1746 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
1747 REQ_VARY for the regex. */
1748
1749 else {
1750 /* If we previously had a firstbyte, but it doesn't match the new br anch,
1751 we have to abandon the firstbyte for the regex, but if there was pr eviously
1752 no reqbyte, it takes on the value of the old firstbyte. */
1753
1754 if (firstbyte >= 0 && firstbyte != branchfirstbyte) {
1755 if (reqbyte < 0)
1756 reqbyte = firstbyte;
1757 firstbyte = REQ_NONE;
1758 }
1759
1760 /* If we (now or from before) have no firstbyte, a firstbyte from th e
1761 branch becomes a reqbyte if there isn't a branch reqbyte. */
1762
1763 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
1764 branchreqbyte = branchfirstbyte;
1765
1766 /* Now ensure that the reqbytes match */
1767
1768 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
1769 reqbyte = REQ_NONE;
1770 else
1771 reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
1772 }
1773
1774 /* Reached end of expression, either ')' or end of pattern. Go back thro ugh
1775 the alternative branches and reverse the chain of offsets, with the fie ld in
1776 the BRA item now becoming an offset to the first alternative. If there are
1777 no alternatives, it points to the end of the group. The length in the
1778 terminating ket is always the length of the whole bracketed item. If an y of
1779 the ims options were changed inside the group, compile a resetting op-c ode
1780 following, except at the very end of the pattern. Return leaving the po inter
1781 at the terminating char. */
1782
1783 if (ptr >= patternEnd || *ptr != '|') {
1784 int length = code - last_branch;
1785 do {
1786 int prev_length = getLinkValueAllowZero(last_branch + 1);
1787 putLinkValue(last_branch + 1, length);
1788 length = prev_length;
1789 last_branch -= length;
1790 } while (length > 0);
1791
1792 /* Fill in the ket */
1793
1794 *code = OP_KET;
1795 putLinkValue(code + 1, code - start_bracket);
1796 code += 1 + LINK_SIZE;
1797
1798 /* Set values to pass back */
1799
1800 *codeptr = code;
1801 *ptrptr = ptr;
1802 *firstbyteptr = firstbyte;
1803 *reqbyteptr = reqbyte;
1804 return true;
1805 }
1806
1807 /* Another branch follows; insert an "or" node. Its length field points back
1808 to the previous branch while the bracket remains open. At the end the c hain
1809 is reversed. It's done like this so that the start of the bracket has a
1810 zero offset until it is closed, making it possible to detect recursion. */
1811
1812 *code = OP_ALT;
1813 putLinkValue(code + 1, code - last_branch);
1814 last_branch = code;
1815 code += 1 + LINK_SIZE;
1816 ptr++;
1817 }
1818 ASSERT_NOT_REACHED();
1819 }
1820
1821 /*************************************************
1822 * Check for anchored expression *
1823 *************************************************/
1824
1825 /* Try to find out if this is an anchored regular expression. Consider each
1826 alternative branch. If they all start OP_CIRC, or with a bracket
1827 all of whose alternatives start OP_CIRC (recurse ad lib), then
1828 it's anchored.
1829
1830 Arguments:
1831 code points to start of expression (the bracket)
1832 captureMap a bitmap of which brackets we are inside while testing; this
1833 handles up to substring 31; all brackets after that share
1834 the zero bit
1835 backrefMap the back reference bitmap
1836 */
1837
1838 static bool branchIsAnchored(const unsigned char* code)
1839 {
1840 const unsigned char* scode = firstSignificantOpcode(code);
1841 int op = *scode;
1842
1843 /* Brackets */
1844 if (op >= OP_BRA || op == OP_ASSERT)
1845 return bracketIsAnchored(scode);
1846
1847 /* Check for explicit anchoring */
1848 return op == OP_CIRC;
1849 }
1850
1851 static bool bracketIsAnchored(const unsigned char* code)
1852 {
1853 do {
1854 if (!branchIsAnchored(code + 1 + LINK_SIZE))
1855 return false;
1856 code += getLinkValue(code + 1);
1857 } while (*code == OP_ALT); /* Loop for each alternative */
1858 return true;
1859 }
1860
1861 /*************************************************
1862 * Check for starting with ^ or .* *
1863 *************************************************/
1864
1865 /* This is called to find out if every branch starts with ^ or .* so that
1866 "first char" processing can be done to speed things up in multiline
1867 matching and for non-DOTALL patterns that start with .* (which must start at
1868 the beginning or after \n)
1869
1870 Except when the .* appears inside capturing parentheses, and there is a
1871 subsequent back reference to those parentheses. By keeping a bitmap of the
1872 first 31 back references, we can catch some of the more common cases more
1873 precisely; all the greater back references share a single bit.
1874
1875 Arguments:
1876 code points to start of expression (the bracket)
1877 captureMap a bitmap of which brackets we are inside while testing; this
1878 handles up to substring 31; all brackets after that share
1879 the zero bit
1880 backrefMap the back reference bitmap
1881 */
1882
1883 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1884 {
1885 const unsigned char* scode = firstSignificantOpcode(code);
1886 int op = *scode;
1887
1888 /* Capturing brackets */
1889 if (op > OP_BRA) {
1890 int captureNum = op - OP_BRA;
1891 if (captureNum > EXTRACT_BASIC_MAX)
1892 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1893 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1894 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap );
1895 }
1896
1897 /* Other brackets */
1898 if (op == OP_BRA || op == OP_ASSERT)
1899 return bracketNeedsLineStart(scode, captureMap, backrefMap);
1900
1901 /* .* means "start at start or after \n" if it isn't in brackets that
1902 may be referenced. */
1903
1904 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1905 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1906
1907 /* Explicit ^ */
1908 return op == OP_CIRC || op == OP_BOL;
1909 }
1910
1911 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap)
1912 {
1913 do {
1914 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1915 return false;
1916 code += getLinkValue(code + 1);
1917 } while (*code == OP_ALT); /* Loop for each alternative */
1918 return true;
1919 }
1920
1921 /*************************************************
1922 * Check for asserted fixed first char *
1923 *************************************************/
1924
1925 /* During compilation, the "first char" settings from forward assertions are
1926 discarded, because they can cause conflicts with actual literals that follow.
1927 However, if we end up without a first char setting for an unanchored pattern,
1928 it is worth scanning the regex to see if there is an initial asserted first
1929 char. If all branches start with the same asserted char, or with a bracket all
1930 of whose alternatives start with the same asserted char (recurse ad lib), then
1931 we return that char, otherwise -1.
1932
1933 Arguments:
1934 code points to start of expression (the bracket)
1935 inassert true if the branch being examined lies inside an assertion;
1936 a literal only counts as a first char when this is true
1937
1938 Returns: -1 or the fixed first char
1939 */
1940
1941 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inas sert)
1942 {
1943 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1944 int op = *scode;
1945
1946 if (op >= OP_BRA)
1947 op = OP_BRA;
1948
1949 switch (op) {
1950 default:
1951 return -1;
1952
1953 case OP_BRA:
1954 case OP_ASSERT:
1955 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1956
1957 case OP_EXACT:
1958 scode += 2;
1959 /* Fall through */
1960
1961 case OP_CHAR:
1962 case OP_CHAR_IGNORING_CASE:
1963 case OP_ASCII_CHAR:
1964 case OP_ASCII_LETTER_IGNORING_CASE:
1965 case OP_PLUS:
1966 case OP_MINPLUS:
1967 if (!inassert)
1968 return -1;
1969 return scode[1];
1970 }
1971 }
1972
1973 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool ina ssert)
1974 {
1975 int c = -1;
1976 do {
1977 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert) ;
1978 if (d < 0)
1979 return -1;
1980 if (c < 0)
1981 c = d;
1982 else if (c != d)
1983 return -1;
1984 code += getLinkValue(code + 1);
1985 } while (*code == OP_ALT);
1986 return c;
1987 }
1988
1989 static inline int multiplyWithOverflowCheck(int a, int b)
1990 {
1991 if (!a || !b)
1992 return 0;
1993 if (a > MAX_PATTERN_SIZE / b)
1994 return -1;
1995 return a * b;
1996 }
1997
1998 static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt h, JSRegExpIgnoreCaseOption ignoreCase,
1999 CompileData& cd, ErrorCode& errorcode)
2000 {
2001 /* Make a pass over the pattern to compute the
2002 amount of store required to hold the compiled code. This does not have to b e
2003 perfect as long as errors are overestimates. */
2004
2005 if (patternLength > MAX_PATTERN_SIZE) {
2006 errorcode = ERR16;
2007 return -1;
2008 }
2009
2010 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2011 int branch_extra = 0;
2012 int lastitemlength = 0;
2013 unsigned brastackptr = 0;
2014 int brastack[BRASTACK_SIZE];
2015 unsigned char bralenstack[BRASTACK_SIZE];
2016 int bracount = 0;
2017
2018 const UChar* ptr = reinterpret_cast<const UChar*>(pattern - 1);
2019 const UChar* patternEnd = reinterpret_cast<const UChar*>(pattern + patternLe ngth);
2020
2021 while (++ptr < patternEnd) {
2022 int minRepeats = 0, maxRepeats = 0;
2023 int c = *ptr;
2024
2025 switch (c) {
2026 /* A backslashed item may be an escaped data character or it may be a
2027 character type. */
2028
2029 case '\\':
2030 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBra ckets, false);
2031 if (errorcode != 0)
2032 return -1;
2033
2034 lastitemlength = 1; /* Default length of last item for repea ts */
2035
2036 if (c >= 0) { /* Data character */
2037 length += 2; /* For a one-byte character */
2038
2039 if (c > 127) {
2040 int i;
2041 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2042 if (c <= kjs_pcre_utf8_table1[i]) break;
2043 length += i;
2044 lastitemlength += i;
2045 }
2046
2047 continue;
2048 }
2049
2050 /* Other escapes need one byte */
2051
2052 length++;
2053
2054 /* A back reference needs an additional 2 bytes, plus either one or 5
2055 bytes for a repeat. We also need to keep the value of the highe st
2056 back reference. */
2057
2058 if (c <= -ESC_REF) {
2059 int refnum = -c - ESC_REF;
2060 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2061 if (refnum > cd.top_backref)
2062 cd.top_backref = refnum;
2063 length += 2; /* For single back reference */
2064 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {
2065 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);
2066 if (errorcode)
2067 return -1;
2068 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats = = -1)) ||
2069 (minRepeats == 1 && maxRepeats == -1))
2070 length++;
2071 else
2072 length += 5;
2073 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2074 ptr++;
2075 }
2076 }
2077 continue;
2078
2079 case '^': /* Single-byte metacharacters */
2080 case '.':
2081 case '$':
2082 length++;
2083 lastitemlength = 1;
2084 continue;
2085
2086 case '*': /* These repeats won't be after brackets; */
2087 case '+': /* those are handled separately */
2088 case '?':
2089 length++;
2090 goto POSSESSIVE;
2091
2092 /* This covers the cases of braced repeats after a single char, meta char,
2093 class, or back reference. */
2094
2095 case '{':
2096 if (!isCountedRepeat(ptr + 1, patternEnd))
2097 goto NORMAL_CHAR;
2098 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &error code);
2099 if (errorcode != 0)
2100 return -1;
2101
2102 /* These special cases just insert one extra opcode */
2103
2104 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) | |
2105 (minRepeats == 1 && maxRepeats == -1))
2106 length++;
2107
2108 /* These cases might insert additional copies of a preceding cha racter. */
2109
2110 else {
2111 if (minRepeats != 1) {
2112 length -= lastitemlength; /* Uncount the original char or metachar */
2113 if (minRepeats > 0)
2114 length += 3 + lastitemlength;
2115 }
2116 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
2117 }
2118
2119 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2120 ptr++; /* Needs no extra length */
2121
2122 POSSESSIVE: /* Test for possessive quantifier */
2123 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2124 ptr++;
2125 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
2126 }
2127 continue;
2128
2129 /* An alternation contains an offset to the next branch or ket. If a ny ims
2130 options changed in the previous branch(es), and/or if we are in a
2131 lookbehind assertion, extra space will be needed at the start of th e
2132 branch. This is handled by branch_extra. */
2133
2134 case '|':
2135 if (brastackptr == 0)
2136 cd.needOuterBracket = true;
2137 length += 1 + LINK_SIZE + branch_extra;
2138 continue;
2139
2140 /* A character class uses 33 characters provided that all the charac ter
2141 values are less than 256. Otherwise, it uses a bit map for low valu ed
2142 characters, and individual items for others. Don't worry about char acter
2143 types that aren't allowed in classes - they'll get picked up during the
2144 compile. A character class that contains only one single-byte chara cter
2145 uses 2 or 3 bytes, depending on whether it is negated or not. Notic e this
2146 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2147
2148 case '[': {
2149 int class_optcount;
2150 if (*(++ptr) == '^') {
2151 class_optcount = 10; /* Greater than one */
2152 ptr++;
2153 }
2154 else
2155 class_optcount = 0;
2156
2157 bool class_utf8 = false;
2158
2159 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2160 /* Check for escapes */
2161
2162 if (*ptr == '\\') {
2163 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapt uringBrackets, true);
2164 if (errorcode != 0)
2165 return -1;
2166
2167 /* Handle escapes that turn into characters */
2168
2169 if (c >= 0)
2170 goto NON_SPECIAL_CHARACTER;
2171
2172 /* Escapes that are meta-things. The normal ones just af fect the
2173 bit map, but Unicode properties require an XCLASS exten ded item. */
2174
2175 else
2176 class_optcount = 10; /* \d, \s etc; make sur e > 1 */
2177 }
2178
2179 /* Anything else increments the possible optimization count. We have to
2180 detect ranges here so that we can compute the number of ext ra ranges for
2181 caseless wide characters when UCP support is available. If there are wide
2182 characters, we are going to have to use an XCLASS, even for single
2183 characters. */
2184
2185 else {
2186 c = *ptr;
2187
2188 /* Come here from handling \ above when it escapes to a char value */
2189
2190 NON_SPECIAL_CHARACTER:
2191 class_optcount++;
2192
2193 int d = -1;
2194 if (safelyCheckNextChar(ptr, patternEnd, '-')) {
2195 UChar const *hyptr = ptr++;
2196 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2197 ptr++;
2198 d = checkEscape(&ptr, patternEnd, &errorcode, cd .numCapturingBrackets, true);
2199 if (errorcode != 0)
2200 return -1;
2201 }
2202 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2203 d = *++ptr;
2204 if (d < 0)
2205 ptr = hyptr; /* go back to hyphen as data * /
2206 }
2207
2208 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2209 127 for caseless matching, we will need to use an XCLAS S. */
2210
2211 if (d >= 0) {
2212 class_optcount = 10; /* Ensure > 1 */
2213 if (d < c) {
2214 errorcode = ERR8;
2215 return -1;
2216 }
2217
2218 if ((d > 255 || (ignoreCase && d > 127))) {
2219 unsigned char buffer[6];
2220 if (!class_utf8) /* Allow for XCLASS ove rhead */
2221 {
2222 class_utf8 = true;
2223 length += LINK_SIZE + 2;
2224 }
2225
2226 /* If we have UCP support, find out how many ext ra ranges are
2227 needed to map the other case of characters with in this range. We
2228 have to mimic the range optimization here, beca use extending the
2229 range upwards might push d over a boundary that makes it use
2230 another byte in the UTF-8 representation. */
2231
2232 if (ignoreCase) {
2233 int occ, ocd;
2234 int cc = c;
2235 int origd = d;
2236 while (getOthercaseRange(&cc, origd, &occ, & ocd)) {
2237 if (occ >= c && ocd <= d)
2238 continue; /* Skip embedded */
2239
2240 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2241 { /* if there is overlap, */
2242 c = occ; /* noti ng that if occ < c */
2243 continue; /* we c an't have ocd > d */
2244 } /* because a subrange is */
2245 if (ocd > d && occ <= d + 1) /* always shorter than */
2246 { /* the basi c range. */
2247 d = ocd;
2248 continue;
2249 }
2250
2251 /* An extra item is needed */
2252
2253 length += 1 + encodeUTF8(occ, buffer) +
2254 ((occ == ocd) ? 0 : encodeUTF8(ocd, buff er));
2255 }
2256 }
2257
2258 /* The length of the (possibly extended) range * /
2259
2260 length += 1 + encodeUTF8(c, buffer) + encodeUTF8 (d, buffer);
2261 }
2262
2263 }
2264
2265 /* We have a single character. There is nothing to be do ne unless we
2266 are in UTF-8 mode. If the char is > 255, or 127 when ca seless, we must
2267 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2268 support. */
2269
2270 else {
2271 if ((c > 255 || (ignoreCase && c > 127))) {
2272 unsigned char buffer[6];
2273 class_optcount = 10; /* Ensure > 1 */
2274 if (!class_utf8) /* Allow for XCLASS ove rhead */
2275 {
2276 class_utf8 = true;
2277 length += LINK_SIZE + 2;
2278 }
2279 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8 (c, buffer));
2280 }
2281 }
2282 }
2283 }
2284
2285 if (ptr >= patternEnd) { /* Missing terminating ']' */
2286 errorcode = ERR6;
2287 return -1;
2288 }
2289
2290 /* We can optimize when there was only one optimizable character .
2291 Note that this does not detect the case of a negated single cha racter.
2292 In that case we do an incorrect length computation, but it's no t a serious
2293 problem because the computed length is too large rather than to o small. */
2294
2295 if (class_optcount == 1)
2296 goto NORMAL_CHAR;
2297
2298 /* Here, we handle repeats for the class opcodes. */
2299 {
2300 length += 33;
2301
2302 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2303 we also need extra for wrapping the whole thing in a sub-pa ttern. */
2304
2305 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {
2306 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);
2307 if (errorcode != 0)
2308 return -1;
2309 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats = = -1)) ||
2310 (minRepeats == 1 && maxRepeats == -1))
2311 length++;
2312 else
2313 length += 5;
2314 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2315 ptr++;
2316 length += 2 + 2 * LINK_SIZE;
2317 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2318 ptr++;
2319 }
2320 }
2321 continue;
2322 }
2323
2324 /* Brackets may be genuine groups or special things */
2325
2326 case '(': {
2327 int branch_newextra = 0;
2328 int bracket_length = 1 + LINK_SIZE;
2329 bool capturing = false;
2330
2331 /* Handle special forms of bracket, which all start (? */
2332
2333 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2334 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2335 /* Non-referencing groups and lookaheads just move the p ointer on, and
2336 then behave like a non-special bracket, except that the y don't increment
2337 the count of extracting brackets. Ditto for the "once o nly" bracket,
2338 which is in Perl from version 5.005. */
2339
2340 case ':':
2341 case '=':
2342 case '!':
2343 ptr += 2;
2344 break;
2345
2346 /* Else loop checking valid options until ) is met. Anyt hing else is an
2347 error. If we are without any brackets, i.e. at top leve l, the settings
2348 act as if specified in the options, so massage the opti ons immediately.
2349 This is for backward compatibility with Perl 5.004. */
2350
2351 default:
2352 errorcode = ERR12;
2353 return -1;
2354 }
2355 } else
2356 capturing = 1;
2357
2358 /* Capturing brackets must be counted so we can process escapes in a
2359 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are goi ng to need
2360 an additional 3 bytes of memory per capturing bracket. */
2361
2362 if (capturing) {
2363 bracount++;
2364 if (bracount > EXTRACT_BASIC_MAX)
2365 bracket_length += 3;
2366 }
2367
2368 /* Save length for computing whole length at end if there's a re peat that
2369 requires duplication of the group. Also save the current value of
2370 branch_extra, and start the new group with the new value. If no n-zero, this
2371 will either be 2 for a (?imsx: group, or 3 for a lookbehind ass ertion. */
2372
2373 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2374 errorcode = ERR17;
2375 return -1;
2376 }
2377
2378 bralenstack[brastackptr] = branch_extra;
2379 branch_extra = branch_newextra;
2380
2381 brastack[brastackptr++] = length;
2382 length += bracket_length;
2383 continue;
2384 }
2385
2386 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certai n sets of values we
2387 have to replicate this bracket up to that many times. If brastackpt r is
2388 0 this is an unmatched bracket which will generate an error, but ta ke care
2389 not to try to access brastack[-1] when computing the length and res toring
2390 the branch_extra value. */
2391
2392 case ')': {
2393 int duplength;
2394 length += 1 + LINK_SIZE;
2395 if (brastackptr > 0) {
2396 duplength = length - brastack[--brastackptr];
2397 branch_extra = bralenstack[brastackptr];
2398 }
2399 else
2400 duplength = 0;
2401
2402 /* Leave ptr at the final char; for readRepeatCounts this happen s
2403 automatically; for the others we need an increment. */
2404
2405 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRe peat(ptr + 2, patternEnd)) {
2406 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &e rrorcode);
2407 if (errorcode)
2408 return -1;
2409 } else if (c == '*') {
2410 minRepeats = 0;
2411 maxRepeats = -1;
2412 ptr++;
2413 } else if (c == '+') {
2414 minRepeats = 1;
2415 maxRepeats = -1;
2416 ptr++;
2417 } else if (c == '?') {
2418 minRepeats = 0;
2419 maxRepeats = 1;
2420 ptr++;
2421 } else {
2422 minRepeats = 1;
2423 maxRepeats = 1;
2424 }
2425
2426 /* If the minimum is zero, we have to allow for an OP_BRAZERO be fore the
2427 group, and if the maximum is greater than zero, we have to repl icate
2428 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2429 bracket set. */
2430
2431 int repeatsLength;
2432 if (minRepeats == 0) {
2433 length++;
2434 if (maxRepeats > 0) {
2435 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1 , duplength + 3 + 2 * LINK_SIZE);
2436 if (repeatsLength < 0) {
2437 errorcode = ERR16;
2438 return -1;
2439 }
2440 length += repeatsLength;
2441 if (length > MAX_PATTERN_SIZE) {
2442 errorcode = ERR16;
2443 return -1;
2444 }
2445 }
2446 }
2447
2448 /* When the minimum is greater than zero, we have to replicate u p to
2449 minval-1 times, with no additions required in the copies. Then, if there
2450 is a limited maximum we have to replicate up to maxval-1 times allowing
2451 for a BRAZERO item before each optional copy and nesting bracke ts for all
2452 but one of the optional copies. */
2453
2454 else {
2455 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, du plength);
2456 if (repeatsLength < 0) {
2457 errorcode = ERR16;
2458 return -1;
2459 }
2460 length += repeatsLength;
2461 if (maxRepeats > minRepeats) { /* Need this test as maxRepea ts=-1 means no limit */
2462 repeatsLength = multiplyWithOverflowCheck(maxRepeats - m inRepeats, duplength + 3 + 2 * LINK_SIZE);
2463 if (repeatsLength < 0) {
2464 errorcode = ERR16;
2465 return -1;
2466 }
2467 length += repeatsLength - (2 + 2 * LINK_SIZE);
2468 }
2469 if (length > MAX_PATTERN_SIZE) {
2470 errorcode = ERR16;
2471 return -1;
2472 }
2473 }
2474
2475 /* Allow space for once brackets for "possessive quantifier" */
2476
2477 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2478 ptr++;
2479 length += 2 + 2 * LINK_SIZE;
2480 }
2481 continue;
2482 }
2483
2484 /* Non-special character. It won't be space or # in extended mode, s o it is
2485 always a genuine character. If we are in a \Q...\E sequence, check for the
2486 end; if not, we have a literal. */
2487
2488 default:
2489 NORMAL_CHAR:
2490 length += 2; /* For a one-byte character */
2491 lastitemlength = 1; /* Default length of last item for repeats */
2492
2493 if (c > 127) {
2494 int i;
2495 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)
2496 if (c <= kjs_pcre_utf8_table1[i])
2497 break;
2498 length += i;
2499 lastitemlength += i;
2500 }
2501
2502 continue;
2503 }
2504 }
2505
2506 length += 2 + LINK_SIZE; /* For final KET and END */
2507
2508 cd.numCapturingBrackets = bracount;
2509 return length;
2510 }
2511
2512 /*************************************************
2513 * Compile a Regular Expression *
2514 *************************************************/
2515
2516 /* This function takes a pattern (an array of UChar with an explicit length)
2517 and returns a pointer to a block of store holding a compiled version of the
2518 expression.
2519
2520 Arguments:
2521 pattern the regular expression
2522 patternLength number of UChars in pattern
2523 ignoreCase whether matching ignores case
2524 multiline whether matching is multiline
2525 numSubpatterns if non-NULL, receives the highest bracket number (top_bracket)
2526 errorptr pointer to pointer to error text; if NULL, nothing is compiled
2527 allocate_function used to allocate the compiled data block
2528 free_function used to release the block if compilation fails
2529
2530 Returns: pointer to compiled data block, or NULL on error,
2531 with errorptr set
2532 */
2533
2534 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)
2535 {
2536 *errorptr = errorText(errorcode);
2537 return 0;
2538 }
2539
2540 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2541 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption mul tiline,
2542 unsigned* numSubpatterns, const char** errorptr,
2543 malloc_t* allocate_function, free_t* free_function)
2544 {
2545 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2546 can do is just return NULL, but we can set a code value if there is a code pointer. */
2547 if (!errorptr)
2548 return 0;
2549 *errorptr = NULL;
2550
2551 CompileData cd;
2552
2553 ErrorCode errorcode = ERR0;
2554 /* Call this once just to count the brackets. */
2555 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, error code);
2556 /* Call it again to compute the length. */
2557 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCa se, cd, errorcode);
2558 if (errorcode)
2559 return returnError(errorcode, errorptr);
2560
2561 if (length > MAX_PATTERN_SIZE)
2562 return returnError(ERR16, errorptr);
2563
2564 size_t size = length + sizeof(JSRegExp);
2565 JSRegExp* re = reinterpret_cast<JSRegExp*>((*allocate_function)(size));
2566
2567 if (!re)
2568 return returnError(ERR13, errorptr);
2569
2570 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcross MultipleLinesOption : 0);
2571
2572 /* The starting points of the name/number translation table and of the code are
2573 passed around in the compile data block. */
2574
2575 unsigned char* codeStart = reinterpret_cast<unsigned char*>(re + 1);
2576
2577 /* Set up a starting, non-extracting bracket, then compile the expression. O n
2578 error, errorcode will be set non-zero, so we don't need to look at the resu lt
2579 of the function here. */
2580
2581 const UChar* ptr = reinterpret_cast<const UChar*>(pattern);
2582 const UChar* patternEnd = pattern + patternLength;
2583 unsigned char* code = reinterpret_cast<unsigned char*>(codeStart);
2584 int firstbyte, reqbyte;
2585 int bracketCount = 0;
2586 if (!cd.needOuterBracket)
2587 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &erro rcode, &firstbyte, &reqbyte, cd);
2588 else {
2589 *code = OP_BRA;
2590 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &err orcode, 0, &firstbyte, &reqbyte, cd);
2591 }
2592 re->top_bracket = bracketCount;
2593 re->top_backref = cd.top_backref;
2594
2595 /* If not reached end of pattern on success, there's an excess bracket. */
2596
2597 if (errorcode == 0 && ptr < patternEnd)
2598 errorcode = ERR10;
2599
2600 /* Fill in the terminating state and check for disastrous overflow, but
2601 if debugging, leave the test till after things are printed out. */
2602
2603 *code++ = OP_END;
2604
2605 ASSERT(code - codeStart <= length);
2606 if (code - codeStart > length)
2607 errorcode = ERR7;
2608
2609 /* Give an error if there's back reference to a non-existent capturing
2610 subpattern. */
2611
2612 if (re->top_backref > re->top_bracket)
2613 errorcode = ERR15;
2614
2615 /* Failed to compile, or error while post-processing */
2616
2617 if (errorcode != ERR0) {
2618 (*free_function)(reinterpret_cast<void*>(re));
2619 return returnError(errorcode, errorptr);
2620 }
2621
2622 /* If the anchored option was not passed, set the flag if we can determine t hat
2623 the pattern is anchored by virtue of ^ characters or \A or anything else (s uch
2624 as starting with .* when DOTALL is set).
2625
2626 Otherwise, if we know what the first character has to be, save it, because that
2627 speeds up unanchored matches no end. If not, see if we can set the
2628 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2629 start with ^. and also when all branches start with .* for non-DOTALL match es.
2630 */
2631
2632 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(co deStart))
2633 re->options |= IsAnchoredOption;
2634 else {
2635 if (firstbyte < 0) {
2636 firstbyte = (cd.needOuterBracket
2637 ? bracketFindFirstAssertedCharacter(codeStart, false)
2638 : branchFindFirstAssertedCharacter(codeStart, false))
2639 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2640 }
2641 if (firstbyte >= 0) {
2642 int ch = firstbyte & 255;
2643 if (ch < 127) {
2644 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;
2645 re->options |= UseFirstByteOptimizationOption;
2646 }
2647 } else {
2648 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.bac krefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2649 re->options |= UseMultiLineFirstByteOptimizationOption;
2650 }
2651 }
2652
2653 /* For an anchored pattern, we use the "required byte" only if it follows a
2654 variable length item in the regex. Remove the caseless flag for non-caseabl e
2655 bytes. */
2656
2657 if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VAR Y))) {
2658 int ch = reqbyte & 255;
2659 if (ch < 127) {
2660 re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;
2661 re->options |= UseRequiredByteOptimizationOption;
2662 }
2663 }
2664
2665 if (numSubpatterns)
2666 *numSubpatterns = re->top_bracket;
2667 return re;
2668 }
2669
2670 void jsRegExpFree(JSRegExp* re, free_t* free_function)
2671 {
2672 (*free_function)(reinterpret_cast<void*>(re));
2673 }
2674
2675 } } // namespace dart::jscre
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698