Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix test compile Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/utils.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/regexp/jsregexp.h"
12 #include "src/utils.h"
13
14 namespace v8 {
15 namespace internal {
16
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18 bool multiline, bool unicode, Isolate* isolate,
19 Zone* zone)
20 : isolate_(isolate),
21 zone_(zone),
22 error_(error),
23 captures_(NULL),
24 in_(in),
25 current_(kEndMarker),
26 next_pos_(0),
27 captures_started_(0),
28 capture_count_(0),
29 has_more_(true),
30 multiline_(multiline),
31 unicode_(unicode),
32 simple_(false),
33 contains_anchor_(false),
34 is_scanned_for_captures_(false),
35 failed_(false) {
36 Advance();
37 }
38
39
40 uc32 RegExpParser::Next() {
41 if (has_next()) {
42 return in()->Get(next_pos_);
43 } else {
44 return kEndMarker;
45 }
46 }
47
48
49 void RegExpParser::Advance() {
50 if (next_pos_ < in()->length()) {
51 StackLimitCheck check(isolate());
52 if (check.HasOverflowed()) {
53 ReportError(CStrVector(Isolate::kStackOverflowMessage));
54 } else if (zone()->excess_allocation()) {
55 ReportError(CStrVector("Regular expression too large"));
56 } else {
57 current_ = in()->Get(next_pos_);
58 next_pos_++;
59 }
60 } else {
61 current_ = kEndMarker;
62 // Advance so that position() points to 1-after-the-last-character. This is
63 // important so that Reset() to this position works correctly.
64 next_pos_ = in()->length() + 1;
65 has_more_ = false;
66 }
67 }
68
69
70 void RegExpParser::Reset(int pos) {
71 next_pos_ = pos;
72 has_more_ = (pos < in()->length());
73 Advance();
74 }
75
76
77 void RegExpParser::Advance(int dist) {
78 next_pos_ += dist - 1;
79 Advance();
80 }
81
82
83 bool RegExpParser::simple() { return simple_; }
84
85
86 bool RegExpParser::IsSyntaxCharacter(uc32 c) {
87 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
88 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
89 c == '{' || c == '}' || c == '|';
90 }
91
92
93 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
94 failed_ = true;
95 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
96 // Zip to the end to make sure the no more input is read.
97 current_ = kEndMarker;
98 next_pos_ = in()->length();
99 return NULL;
100 }
101
102
103 #define CHECK_FAILED /**/); \
104 if (failed_) return NULL; \
105 ((void)0
106
107
108 // Pattern ::
109 // Disjunction
110 RegExpTree* RegExpParser::ParsePattern() {
111 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
112 DCHECK(!has_more());
113 // If the result of parsing is a literal string atom, and it has the
114 // same length as the input, then the atom is identical to the input.
115 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
116 simple_ = true;
117 }
118 return result;
119 }
120
121
122 // Disjunction ::
123 // Alternative
124 // Alternative | Disjunction
125 // Alternative ::
126 // [empty]
127 // Term Alternative
128 // Term ::
129 // Assertion
130 // Atom
131 // Atom Quantifier
132 RegExpTree* RegExpParser::ParseDisjunction() {
133 // Used to store current state while parsing subexpressions.
134 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
135 zone());
136 RegExpParserState* state = &initial_state;
137 // Cache the builder in a local variable for quick access.
138 RegExpBuilder* builder = initial_state.builder();
139 while (true) {
140 switch (current()) {
141 case kEndMarker:
142 if (state->IsSubexpression()) {
143 // Inside a parenthesized group when hitting end of input.
144 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
145 }
146 DCHECK_EQ(INITIAL, state->group_type());
147 // Parsing completed successfully.
148 return builder->ToRegExp();
149 case ')': {
150 if (!state->IsSubexpression()) {
151 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
152 }
153 DCHECK_NE(INITIAL, state->group_type());
154
155 Advance();
156 // End disjunction parsing and convert builder content to new single
157 // regexp atom.
158 RegExpTree* body = builder->ToRegExp();
159
160 int end_capture_index = captures_started();
161
162 int capture_index = state->capture_index();
163 SubexpressionType group_type = state->group_type();
164
165 // Build result of subexpression.
166 if (group_type == CAPTURE) {
167 RegExpCapture* capture = GetCapture(capture_index);
168 capture->set_body(body);
169 body = capture;
170 } else if (group_type != GROUPING) {
171 DCHECK(group_type == POSITIVE_LOOKAROUND ||
172 group_type == NEGATIVE_LOOKAROUND);
173 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
174 body = new (zone()) RegExpLookaround(
175 body, is_positive, end_capture_index - capture_index,
176 capture_index, state->lookaround_type());
177 }
178
179 // Restore previous state.
180 state = state->previous_state();
181 builder = state->builder();
182
183 builder->AddAtom(body);
184 // For compatability with JSC and ES3, we allow quantifiers after
185 // lookaheads, and break in all cases.
186 break;
187 }
188 case '|': {
189 Advance();
190 builder->NewAlternative();
191 continue;
192 }
193 case '*':
194 case '+':
195 case '?':
196 return ReportError(CStrVector("Nothing to repeat"));
197 case '^': {
198 Advance();
199 if (multiline_) {
200 builder->AddAssertion(
201 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
202 } else {
203 builder->AddAssertion(
204 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
205 set_contains_anchor();
206 }
207 continue;
208 }
209 case '$': {
210 Advance();
211 RegExpAssertion::AssertionType assertion_type =
212 multiline_ ? RegExpAssertion::END_OF_LINE
213 : RegExpAssertion::END_OF_INPUT;
214 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
215 continue;
216 }
217 case '.': {
218 Advance();
219 // everything except \x0a, \x0d, \u2028 and \u2029
220 ZoneList<CharacterRange>* ranges =
221 new (zone()) ZoneList<CharacterRange>(2, zone());
222 CharacterRange::AddClassEscape('.', ranges, zone());
223 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
224 builder->AddAtom(atom);
225 break;
226 }
227 case '(': {
228 SubexpressionType subexpr_type = CAPTURE;
229 RegExpLookaround::Type lookaround_type = state->lookaround_type();
230 Advance();
231 if (current() == '?') {
232 switch (Next()) {
233 case ':':
234 subexpr_type = GROUPING;
235 break;
236 case '=':
237 lookaround_type = RegExpLookaround::LOOKAHEAD;
238 subexpr_type = POSITIVE_LOOKAROUND;
239 break;
240 case '!':
241 lookaround_type = RegExpLookaround::LOOKAHEAD;
242 subexpr_type = NEGATIVE_LOOKAROUND;
243 break;
244 case '<':
245 if (FLAG_harmony_regexp_lookbehind) {
246 Advance();
247 lookaround_type = RegExpLookaround::LOOKBEHIND;
248 if (Next() == '=') {
249 subexpr_type = POSITIVE_LOOKAROUND;
250 break;
251 } else if (Next() == '!') {
252 subexpr_type = NEGATIVE_LOOKAROUND;
253 break;
254 }
255 }
256 // Fall through.
257 default:
258 ReportError(CStrVector("Invalid group") CHECK_FAILED);
259 break;
260 }
261 Advance(2);
262 } else {
263 if (captures_started_ >= kMaxCaptures) {
264 ReportError(CStrVector("Too many captures") CHECK_FAILED);
265 }
266 captures_started_++;
267 }
268 // Store current state and begin new disjunction parsing.
269 state = new (zone()) RegExpParserState(
270 state, subexpr_type, lookaround_type, captures_started_, zone());
271 builder = state->builder();
272 continue;
273 }
274 case '[': {
275 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
276 builder->AddAtom(atom);
277 break;
278 }
279 // Atom ::
280 // \ AtomEscape
281 case '\\':
282 switch (Next()) {
283 case kEndMarker:
284 return ReportError(CStrVector("\\ at end of pattern"));
285 case 'b':
286 Advance(2);
287 builder->AddAssertion(
288 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
289 continue;
290 case 'B':
291 Advance(2);
292 builder->AddAssertion(
293 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
294 continue;
295 // AtomEscape ::
296 // CharacterClassEscape
297 //
298 // CharacterClassEscape :: one of
299 // d D s S w W
300 case 'd':
301 case 'D':
302 case 's':
303 case 'S':
304 case 'w':
305 case 'W': {
306 uc32 c = Next();
307 Advance(2);
308 ZoneList<CharacterRange>* ranges =
309 new (zone()) ZoneList<CharacterRange>(2, zone());
310 CharacterRange::AddClassEscape(c, ranges, zone());
311 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
312 builder->AddAtom(atom);
313 break;
314 }
315 case '1':
316 case '2':
317 case '3':
318 case '4':
319 case '5':
320 case '6':
321 case '7':
322 case '8':
323 case '9': {
324 int index = 0;
325 if (ParseBackReferenceIndex(&index)) {
326 if (state->IsInsideCaptureGroup(index)) {
327 // The back reference is inside the capture group it refers to.
328 // Nothing can possibly have been captured yet, so we use empty
329 // instead. This ensures that, when checking a back reference,
330 // the capture registers of the referenced capture are either
331 // both set or both cleared.
332 builder->AddEmpty();
333 } else {
334 RegExpCapture* capture = GetCapture(index);
335 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
336 builder->AddAtom(atom);
337 }
338 break;
339 }
340 uc32 first_digit = Next();
341 if (first_digit == '8' || first_digit == '9') {
342 // If the 'u' flag is present, only syntax characters can be
343 // escaped,
344 // no other identity escapes are allowed. If the 'u' flag is not
345 // present, all identity escapes are allowed.
346 if (!FLAG_harmony_unicode_regexps || !unicode_) {
347 builder->AddCharacter(first_digit);
348 Advance(2);
349 } else {
350 return ReportError(CStrVector("Invalid escape"));
351 }
352 break;
353 }
354 }
355 // FALLTHROUGH
356 case '0': {
357 Advance();
358 uc32 octal = ParseOctalLiteral();
359 builder->AddCharacter(octal);
360 break;
361 }
362 // ControlEscape :: one of
363 // f n r t v
364 case 'f':
365 Advance(2);
366 builder->AddCharacter('\f');
367 break;
368 case 'n':
369 Advance(2);
370 builder->AddCharacter('\n');
371 break;
372 case 'r':
373 Advance(2);
374 builder->AddCharacter('\r');
375 break;
376 case 't':
377 Advance(2);
378 builder->AddCharacter('\t');
379 break;
380 case 'v':
381 Advance(2);
382 builder->AddCharacter('\v');
383 break;
384 case 'c': {
385 Advance();
386 uc32 controlLetter = Next();
387 // Special case if it is an ASCII letter.
388 // Convert lower case letters to uppercase.
389 uc32 letter = controlLetter & ~('a' ^ 'A');
390 if (letter < 'A' || 'Z' < letter) {
391 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
392 // This is outside the specification. We match JSC in
393 // reading the backslash as a literal character instead
394 // of as starting an escape.
395 builder->AddCharacter('\\');
396 } else {
397 Advance(2);
398 builder->AddCharacter(controlLetter & 0x1f);
399 }
400 break;
401 }
402 case 'x': {
403 Advance(2);
404 uc32 value;
405 if (ParseHexEscape(2, &value)) {
406 builder->AddCharacter(value);
407 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {
408 builder->AddCharacter('x');
409 } else {
410 // If the 'u' flag is present, invalid escapes are not treated as
411 // identity escapes.
412 return ReportError(CStrVector("Invalid escape"));
413 }
414 break;
415 }
416 case 'u': {
417 Advance(2);
418 uc32 value;
419 if (ParseUnicodeEscape(&value)) {
420 builder->AddCharacter(value);
421 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {
422 builder->AddCharacter('u');
423 } else {
424 // If the 'u' flag is present, invalid escapes are not treated as
425 // identity escapes.
426 return ReportError(CStrVector("Invalid unicode escape"));
427 }
428 break;
429 }
430 default:
431 Advance();
432 // If the 'u' flag is present, only syntax characters can be
433 // escaped, no
434 // other identity escapes are allowed. If the 'u' flag is not
435 // present,
436 // all identity escapes are allowed.
437 if (!FLAG_harmony_unicode_regexps || !unicode_ ||
438 IsSyntaxCharacter(current())) {
439 builder->AddCharacter(current());
440 Advance();
441 } else {
442 return ReportError(CStrVector("Invalid escape"));
443 }
444 break;
445 }
446 break;
447 case '{': {
448 int dummy;
449 if (ParseIntervalQuantifier(&dummy, &dummy)) {
450 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
451 }
452 // fallthrough
453 }
454 default:
455 builder->AddCharacter(current());
456 Advance();
457 break;
458 } // end switch(current())
459
460 int min;
461 int max;
462 switch (current()) {
463 // QuantifierPrefix ::
464 // *
465 // +
466 // ?
467 // {
468 case '*':
469 min = 0;
470 max = RegExpTree::kInfinity;
471 Advance();
472 break;
473 case '+':
474 min = 1;
475 max = RegExpTree::kInfinity;
476 Advance();
477 break;
478 case '?':
479 min = 0;
480 max = 1;
481 Advance();
482 break;
483 case '{':
484 if (ParseIntervalQuantifier(&min, &max)) {
485 if (max < min) {
486 ReportError(CStrVector("numbers out of order in {} quantifier.")
487 CHECK_FAILED);
488 }
489 break;
490 } else {
491 continue;
492 }
493 default:
494 continue;
495 }
496 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
497 if (current() == '?') {
498 quantifier_type = RegExpQuantifier::NON_GREEDY;
499 Advance();
500 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
501 // FLAG_regexp_possessive_quantifier is a debug-only flag.
502 quantifier_type = RegExpQuantifier::POSSESSIVE;
503 Advance();
504 }
505 builder->AddQuantifierToAtom(min, max, quantifier_type);
506 }
507 }
508
509
510 #ifdef DEBUG
511 // Currently only used in an DCHECK.
512 static bool IsSpecialClassEscape(uc32 c) {
513 switch (c) {
514 case 'd':
515 case 'D':
516 case 's':
517 case 'S':
518 case 'w':
519 case 'W':
520 return true;
521 default:
522 return false;
523 }
524 }
525 #endif
526
527
528 // In order to know whether an escape is a backreference or not we have to scan
529 // the entire regexp and find the number of capturing parentheses. However we
530 // don't want to scan the regexp twice unless it is necessary. This mini-parser
531 // is called when needed. It can see the difference between capturing and
532 // noncapturing parentheses and can skip character classes and backslash-escaped
533 // characters.
534 void RegExpParser::ScanForCaptures() {
535 // Start with captures started previous to current position
536 int capture_count = captures_started();
537 // Add count of captures after this position.
538 int n;
539 while ((n = current()) != kEndMarker) {
540 Advance();
541 switch (n) {
542 case '\\':
543 Advance();
544 break;
545 case '[': {
546 int c;
547 while ((c = current()) != kEndMarker) {
548 Advance();
549 if (c == '\\') {
550 Advance();
551 } else {
552 if (c == ']') break;
553 }
554 }
555 break;
556 }
557 case '(':
558 if (current() != '?') capture_count++;
559 break;
560 }
561 }
562 capture_count_ = capture_count;
563 is_scanned_for_captures_ = true;
564 }
565
566
567 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
568 DCHECK_EQ('\\', current());
569 DCHECK('1' <= Next() && Next() <= '9');
570 // Try to parse a decimal literal that is no greater than the total number
571 // of left capturing parentheses in the input.
572 int start = position();
573 int value = Next() - '0';
574 Advance(2);
575 while (true) {
576 uc32 c = current();
577 if (IsDecimalDigit(c)) {
578 value = 10 * value + (c - '0');
579 if (value > kMaxCaptures) {
580 Reset(start);
581 return false;
582 }
583 Advance();
584 } else {
585 break;
586 }
587 }
588 if (value > captures_started()) {
589 if (!is_scanned_for_captures_) {
590 int saved_position = position();
591 ScanForCaptures();
592 Reset(saved_position);
593 }
594 if (value > capture_count_) {
595 Reset(start);
596 return false;
597 }
598 }
599 *index_out = value;
600 return true;
601 }
602
603
604 RegExpCapture* RegExpParser::GetCapture(int index) {
605 // The index for the capture groups are one-based. Its index in the list is
606 // zero-based.
607 int know_captures =
608 is_scanned_for_captures_ ? capture_count_ : captures_started_;
609 DCHECK(index <= know_captures);
610 if (captures_ == NULL) {
611 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
612 }
613 while (captures_->length() < know_captures) {
614 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
615 }
616 return captures_->at(index - 1);
617 }
618
619
620 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
621 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
622 if (s->group_type() != CAPTURE) continue;
623 // Return true if we found the matching capture index.
624 if (index == s->capture_index()) return true;
625 // Abort if index is larger than what has been parsed up till this state.
626 if (index > s->capture_index()) return false;
627 }
628 return false;
629 }
630
631
632 // QuantifierPrefix ::
633 // { DecimalDigits }
634 // { DecimalDigits , }
635 // { DecimalDigits , DecimalDigits }
636 //
637 // Returns true if parsing succeeds, and set the min_out and max_out
638 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
639 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
640 DCHECK_EQ(current(), '{');
641 int start = position();
642 Advance();
643 int min = 0;
644 if (!IsDecimalDigit(current())) {
645 Reset(start);
646 return false;
647 }
648 while (IsDecimalDigit(current())) {
649 int next = current() - '0';
650 if (min > (RegExpTree::kInfinity - next) / 10) {
651 // Overflow. Skip past remaining decimal digits and return -1.
652 do {
653 Advance();
654 } while (IsDecimalDigit(current()));
655 min = RegExpTree::kInfinity;
656 break;
657 }
658 min = 10 * min + next;
659 Advance();
660 }
661 int max = 0;
662 if (current() == '}') {
663 max = min;
664 Advance();
665 } else if (current() == ',') {
666 Advance();
667 if (current() == '}') {
668 max = RegExpTree::kInfinity;
669 Advance();
670 } else {
671 while (IsDecimalDigit(current())) {
672 int next = current() - '0';
673 if (max > (RegExpTree::kInfinity - next) / 10) {
674 do {
675 Advance();
676 } while (IsDecimalDigit(current()));
677 max = RegExpTree::kInfinity;
678 break;
679 }
680 max = 10 * max + next;
681 Advance();
682 }
683 if (current() != '}') {
684 Reset(start);
685 return false;
686 }
687 Advance();
688 }
689 } else {
690 Reset(start);
691 return false;
692 }
693 *min_out = min;
694 *max_out = max;
695 return true;
696 }
697
698
699 uc32 RegExpParser::ParseOctalLiteral() {
700 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
701 // For compatibility with some other browsers (not all), we parse
702 // up to three octal digits with a value below 256.
703 uc32 value = current() - '0';
704 Advance();
705 if ('0' <= current() && current() <= '7') {
706 value = value * 8 + current() - '0';
707 Advance();
708 if (value < 32 && '0' <= current() && current() <= '7') {
709 value = value * 8 + current() - '0';
710 Advance();
711 }
712 }
713 return value;
714 }
715
716
717 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
718 int start = position();
719 uc32 val = 0;
720 for (int i = 0; i < length; ++i) {
721 uc32 c = current();
722 int d = HexValue(c);
723 if (d < 0) {
724 Reset(start);
725 return false;
726 }
727 val = val * 16 + d;
728 Advance();
729 }
730 *value = val;
731 return true;
732 }
733
734
735 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
736 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
737 // allowed). In the latter case, the number of hex digits between { } is
738 // arbitrary. \ and u have already been read.
739 if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) {
740 int start = position();
741 Advance();
742 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
743 if (current() == '}') {
744 Advance();
745 return true;
746 }
747 }
748 Reset(start);
749 return false;
750 }
751 // \u but no {, or \u{...} escapes not allowed.
752 return ParseHexEscape(4, value);
753 }
754
755
756 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
757 uc32 x = 0;
758 int d = HexValue(current());
759 if (d < 0) {
760 return false;
761 }
762 while (d >= 0) {
763 x = x * 16 + d;
764 if (x > max_value) {
765 return false;
766 }
767 Advance();
768 d = HexValue(current());
769 }
770 *value = x;
771 return true;
772 }
773
774
775 uc32 RegExpParser::ParseClassCharacterEscape() {
776 DCHECK(current() == '\\');
777 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
778 Advance();
779 switch (current()) {
780 case 'b':
781 Advance();
782 return '\b';
783 // ControlEscape :: one of
784 // f n r t v
785 case 'f':
786 Advance();
787 return '\f';
788 case 'n':
789 Advance();
790 return '\n';
791 case 'r':
792 Advance();
793 return '\r';
794 case 't':
795 Advance();
796 return '\t';
797 case 'v':
798 Advance();
799 return '\v';
800 case 'c': {
801 uc32 controlLetter = Next();
802 uc32 letter = controlLetter & ~('A' ^ 'a');
803 // For compatibility with JSC, inside a character class
804 // we also accept digits and underscore as control characters.
805 if ((controlLetter >= '0' && controlLetter <= '9') ||
806 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
807 Advance(2);
808 // Control letters mapped to ASCII control characters in the range
809 // 0x00-0x1f.
810 return controlLetter & 0x1f;
811 }
812 // We match JSC in reading the backslash as a literal
813 // character instead of as starting an escape.
814 return '\\';
815 }
816 case '0':
817 case '1':
818 case '2':
819 case '3':
820 case '4':
821 case '5':
822 case '6':
823 case '7':
824 // For compatibility, we interpret a decimal escape that isn't
825 // a back reference (and therefore either \0 or not valid according
826 // to the specification) as a 1..3 digit octal character code.
827 return ParseOctalLiteral();
828 case 'x': {
829 Advance();
830 uc32 value;
831 if (ParseHexEscape(2, &value)) {
832 return value;
833 }
834 if (!FLAG_harmony_unicode_regexps || !unicode_) {
835 // If \x is not followed by a two-digit hexadecimal, treat it
836 // as an identity escape.
837 return 'x';
838 }
839 // If the 'u' flag is present, invalid escapes are not treated as
840 // identity escapes.
841 ReportError(CStrVector("Invalid escape"));
842 return 0;
843 }
844 case 'u': {
845 Advance();
846 uc32 value;
847 if (ParseUnicodeEscape(&value)) {
848 return value;
849 }
850 if (!FLAG_harmony_unicode_regexps || !unicode_) {
851 return 'u';
852 }
853 // If the 'u' flag is present, invalid escapes are not treated as
854 // identity escapes.
855 ReportError(CStrVector("Invalid unicode escape"));
856 return 0;
857 }
858 default: {
859 uc32 result = current();
860 // If the 'u' flag is present, only syntax characters can be escaped, no
861 // other identity escapes are allowed. If the 'u' flag is not present, all
862 // identity escapes are allowed.
863 if (!FLAG_harmony_unicode_regexps || !unicode_ ||
864 IsSyntaxCharacter(result)) {
865 Advance();
866 return result;
867 }
868 ReportError(CStrVector("Invalid escape"));
869 return 0;
870 }
871 }
872 return 0;
873 }
874
875
876 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
877 DCHECK_EQ(0, *char_class);
878 uc32 first = current();
879 if (first == '\\') {
880 switch (Next()) {
881 case 'w':
882 case 'W':
883 case 'd':
884 case 'D':
885 case 's':
886 case 'S': {
887 *char_class = Next();
888 Advance(2);
889 return CharacterRange::Singleton(0); // Return dummy value.
890 }
891 case kEndMarker:
892 return ReportError(CStrVector("\\ at end of pattern"));
893 default:
894 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
895 return CharacterRange::Singleton(c);
896 }
897 } else {
898 Advance();
899 return CharacterRange::Singleton(first);
900 }
901 }
902
903
904 static const uc16 kNoCharClass = 0;
905
906 // Adds range or pre-defined character class to character ranges.
907 // If char_class is not kInvalidClass, it's interpreted as a class
908 // escape (i.e., 's' means whitespace, from '\s').
909 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
910 uc16 char_class, CharacterRange range,
911 Zone* zone) {
912 if (char_class != kNoCharClass) {
913 CharacterRange::AddClassEscape(char_class, ranges, zone);
914 } else {
915 ranges->Add(range, zone);
916 }
917 }
918
919
920 RegExpTree* RegExpParser::ParseCharacterClass() {
921 static const char* kUnterminated = "Unterminated character class";
922 static const char* kRangeOutOfOrder = "Range out of order in character class";
923
924 DCHECK_EQ(current(), '[');
925 Advance();
926 bool is_negated = false;
927 if (current() == '^') {
928 is_negated = true;
929 Advance();
930 }
931 ZoneList<CharacterRange>* ranges =
932 new (zone()) ZoneList<CharacterRange>(2, zone());
933 while (has_more() && current() != ']') {
934 uc16 char_class = kNoCharClass;
935 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
936 if (current() == '-') {
937 Advance();
938 if (current() == kEndMarker) {
939 // If we reach the end we break out of the loop and let the
940 // following code report an error.
941 break;
942 } else if (current() == ']') {
943 AddRangeOrEscape(ranges, char_class, first, zone());
944 ranges->Add(CharacterRange::Singleton('-'), zone());
945 break;
946 }
947 uc16 char_class_2 = kNoCharClass;
948 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
949 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
950 // Either end is an escaped character class. Treat the '-' verbatim.
951 AddRangeOrEscape(ranges, char_class, first, zone());
952 ranges->Add(CharacterRange::Singleton('-'), zone());
953 AddRangeOrEscape(ranges, char_class_2, next, zone());
954 continue;
955 }
956 if (first.from() > next.to()) {
957 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
958 }
959 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
960 } else {
961 AddRangeOrEscape(ranges, char_class, first, zone());
962 }
963 }
964 if (!has_more()) {
965 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
966 }
967 Advance();
968 if (ranges->length() == 0) {
969 ranges->Add(CharacterRange::Everything(), zone());
970 is_negated = !is_negated;
971 }
972 return new (zone()) RegExpCharacterClass(ranges, is_negated);
973 }
974
975
976 #undef CHECK_FAILED
977
978
979 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
980 FlatStringReader* input, bool multiline,
981 bool unicode, RegExpCompileData* result) {
982 DCHECK(result != NULL);
983 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
984 RegExpTree* tree = parser.ParsePattern();
985 if (parser.failed()) {
986 DCHECK(tree == NULL);
987 DCHECK(!result->error.is_null());
988 } else {
989 DCHECK(tree != NULL);
990 DCHECK(result->error.is_null());
991 result->tree = tree;
992 int capture_count = parser.captures_started();
993 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
994 result->contains_anchor = parser.contains_anchor();
995 result->capture_count = capture_count;
996 }
997 return !parser.failed();
998 }
999
1000
1001 RegExpBuilder::RegExpBuilder(Zone* zone)
1002 : zone_(zone),
1003 pending_empty_(false),
1004 characters_(NULL),
1005 terms_(),
1006 alternatives_()
1007 #ifdef DEBUG
1008 ,
1009 last_added_(ADD_NONE)
1010 #endif
1011 {
1012 }
1013
1014
1015 void RegExpBuilder::FlushCharacters() {
1016 pending_empty_ = false;
1017 if (characters_ != NULL) {
1018 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1019 characters_ = NULL;
1020 text_.Add(atom, zone());
1021 LAST(ADD_ATOM);
1022 }
1023 }
1024
1025
1026 void RegExpBuilder::FlushText() {
1027 FlushCharacters();
1028 int num_text = text_.length();
1029 if (num_text == 0) {
1030 return;
1031 } else if (num_text == 1) {
1032 terms_.Add(text_.last(), zone());
1033 } else {
1034 RegExpText* text = new (zone()) RegExpText(zone());
1035 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1036 terms_.Add(text, zone());
1037 }
1038 text_.Clear();
1039 }
1040
1041
1042 void RegExpBuilder::AddCharacter(uc16 c) {
1043 pending_empty_ = false;
1044 if (characters_ == NULL) {
1045 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1046 }
1047 characters_->Add(c, zone());
1048 LAST(ADD_CHAR);
1049 }
1050
1051
1052 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1053
1054
1055 void RegExpBuilder::AddAtom(RegExpTree* term) {
1056 if (term->IsEmpty()) {
1057 AddEmpty();
1058 return;
1059 }
1060 if (term->IsTextElement()) {
1061 FlushCharacters();
1062 text_.Add(term, zone());
1063 } else {
1064 FlushText();
1065 terms_.Add(term, zone());
1066 }
1067 LAST(ADD_ATOM);
1068 }
1069
1070
1071 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1072 FlushText();
1073 terms_.Add(assert, zone());
1074 LAST(ADD_ASSERT);
1075 }
1076
1077
1078 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1079
1080
1081 void RegExpBuilder::FlushTerms() {
1082 FlushText();
1083 int num_terms = terms_.length();
1084 RegExpTree* alternative;
1085 if (num_terms == 0) {
1086 alternative = new (zone()) RegExpEmpty();
1087 } else if (num_terms == 1) {
1088 alternative = terms_.last();
1089 } else {
1090 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1091 }
1092 alternatives_.Add(alternative, zone());
1093 terms_.Clear();
1094 LAST(ADD_NONE);
1095 }
1096
1097
1098 RegExpTree* RegExpBuilder::ToRegExp() {
1099 FlushTerms();
1100 int num_alternatives = alternatives_.length();
1101 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1102 if (num_alternatives == 1) return alternatives_.last();
1103 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1104 }
1105
1106
1107 void RegExpBuilder::AddQuantifierToAtom(
1108 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1109 if (pending_empty_) {
1110 pending_empty_ = false;
1111 return;
1112 }
1113 RegExpTree* atom;
1114 if (characters_ != NULL) {
1115 DCHECK(last_added_ == ADD_CHAR);
1116 // Last atom was character.
1117 Vector<const uc16> char_vector = characters_->ToConstVector();
1118 int num_chars = char_vector.length();
1119 if (num_chars > 1) {
1120 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1121 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1122 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1123 }
1124 characters_ = NULL;
1125 atom = new (zone()) RegExpAtom(char_vector);
1126 FlushText();
1127 } else if (text_.length() > 0) {
1128 DCHECK(last_added_ == ADD_ATOM);
1129 atom = text_.RemoveLast();
1130 FlushText();
1131 } else if (terms_.length() > 0) {
1132 DCHECK(last_added_ == ADD_ATOM);
1133 atom = terms_.RemoveLast();
1134 if (atom->max_match() == 0) {
1135 // Guaranteed to only match an empty string.
1136 LAST(ADD_TERM);
1137 if (min == 0) {
1138 return;
1139 }
1140 terms_.Add(atom, zone());
1141 return;
1142 }
1143 } else {
1144 // Only call immediately after adding an atom or character!
1145 UNREACHABLE();
1146 return;
1147 }
1148 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1149 zone());
1150 LAST(ADD_TERM);
1151 }
1152
1153 } // namespace internal
1154 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/utils.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698