Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(88)

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 678193004: Copy irregexp related code from V8. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: rebase Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/unibrow.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/regexp_parser.h"
6
7 // SNIP
8
9 namespace dart {
10
11 RegExpBuilder::RegExpBuilder(Zone* zone)
12 : zone_(zone),
13 pending_empty_(false),
14 characters_(NULL),
15 terms_(),
16 alternatives_()
17 #ifdef DEBUG
18 , last_added_(ADD_NONE)
19 #endif
20 {}
21
22
23 void RegExpBuilder::FlushCharacters() {
24 pending_empty_ = false;
25 if (characters_ != NULL) {
26 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());
27 characters_ = NULL;
28 text_.Add(atom, zone());
29 LAST(ADD_ATOM);
30 }
31 }
32
33
34 void RegExpBuilder::FlushText() {
35 FlushCharacters();
36 int num_text = text_.length();
37 if (num_text == 0) {
38 return;
39 } else if (num_text == 1) {
40 terms_.Add(text_.last(), zone());
41 } else {
42 RegExpText* text = new(zone()) RegExpText(zone());
43 for (int i = 0; i < num_text; i++)
44 text_.Get(i)->AppendToText(text, zone());
45 terms_.Add(text, zone());
46 }
47 text_.Clear();
48 }
49
50
51 void RegExpBuilder::AddCharacter(uc16 c) {
52 pending_empty_ = false;
53 if (characters_ == NULL) {
54 characters_ = new(zone()) ZoneList<uc16>(4, zone());
55 }
56 characters_->Add(c, zone());
57 LAST(ADD_CHAR);
58 }
59
60
61 void RegExpBuilder::AddEmpty() {
62 pending_empty_ = true;
63 }
64
65
66 void RegExpBuilder::AddAtom(RegExpTree* term) {
67 if (term->IsEmpty()) {
68 AddEmpty();
69 return;
70 }
71 if (term->IsTextElement()) {
72 FlushCharacters();
73 text_.Add(term, zone());
74 } else {
75 FlushText();
76 terms_.Add(term, zone());
77 }
78 LAST(ADD_ATOM);
79 }
80
81
82 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
83 FlushText();
84 terms_.Add(assert, zone());
85 LAST(ADD_ASSERT);
86 }
87
88
89 void RegExpBuilder::NewAlternative() {
90 FlushTerms();
91 }
92
93
94 void RegExpBuilder::FlushTerms() {
95 FlushText();
96 int num_terms = terms_.length();
97 RegExpTree* alternative;
98 if (num_terms == 0) {
99 alternative = RegExpEmpty::GetInstance();
100 } else if (num_terms == 1) {
101 alternative = terms_.last();
102 } else {
103 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));
104 }
105 alternatives_.Add(alternative, zone());
106 terms_.Clear();
107 LAST(ADD_NONE);
108 }
109
110
111 RegExpTree* RegExpBuilder::ToRegExp() {
112 FlushTerms();
113 int num_alternatives = alternatives_.length();
114 if (num_alternatives == 0) {
115 return RegExpEmpty::GetInstance();
116 }
117 if (num_alternatives == 1) {
118 return alternatives_.last();
119 }
120 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));
121 }
122
123
124 void RegExpBuilder::AddQuantifierToAtom(
125 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
126 if (pending_empty_) {
127 pending_empty_ = false;
128 return;
129 }
130 RegExpTree* atom;
131 if (characters_ != NULL) {
132 DCHECK(last_added_ == ADD_CHAR);
133 // Last atom was character.
134 Vector<const uc16> char_vector = characters_->ToConstVector();
135 int num_chars = char_vector.length();
136 if (num_chars > 1) {
137 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
138 text_.Add(new(zone()) RegExpAtom(prefix), zone());
139 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
140 }
141 characters_ = NULL;
142 atom = new(zone()) RegExpAtom(char_vector);
143 FlushText();
144 } else if (text_.length() > 0) {
145 DCHECK(last_added_ == ADD_ATOM);
146 atom = text_.RemoveLast();
147 FlushText();
148 } else if (terms_.length() > 0) {
149 DCHECK(last_added_ == ADD_ATOM);
150 atom = terms_.RemoveLast();
151 if (atom->max_match() == 0) {
152 // Guaranteed to only match an empty string.
153 LAST(ADD_TERM);
154 if (min == 0) {
155 return;
156 }
157 terms_.Add(atom, zone());
158 return;
159 }
160 } else {
161 // Only call immediately after adding an atom or character!
162 UNREACHABLE();
163 return;
164 }
165 terms_.Add(
166 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());
167 LAST(ADD_TERM);
168 }
169
170 // SNIP
171
172 // ----------------------------------------------------------------------------
173 // Regular expressions
174
175 RegExpParser::RegExpParser(FlatStringReader* in,
176 Handle<String>* error,
177 bool multiline,
178 Zone* zone)
179 : isolate_(zone->isolate()),
180 zone_(zone),
181 error_(error),
182 captures_(NULL),
183 in_(in),
184 current_(kEndMarker),
185 next_pos_(0),
186 capture_count_(0),
187 has_more_(true),
188 multiline_(multiline),
189 simple_(false),
190 contains_anchor_(false),
191 is_scanned_for_captures_(false),
192 failed_(false) {
193 Advance();
194 }
195
196
197 uc32 RegExpParser::Next() {
198 if (has_next()) {
199 return in()->Get(next_pos_);
200 } else {
201 return kEndMarker;
202 }
203 }
204
205
206 void RegExpParser::Advance() {
207 if (next_pos_ < in()->length()) {
208 StackLimitCheck check(isolate());
209 if (check.HasOverflowed()) {
210 ReportError(CStrVector(Isolate::kStackOverflowMessage));
211 } else if (zone()->excess_allocation()) {
212 ReportError(CStrVector("Regular expression too large"));
213 } else {
214 current_ = in()->Get(next_pos_);
215 next_pos_++;
216 }
217 } else {
218 current_ = kEndMarker;
219 has_more_ = false;
220 }
221 }
222
223
224 void RegExpParser::Reset(int pos) {
225 next_pos_ = pos;
226 has_more_ = (pos < in()->length());
227 Advance();
228 }
229
230
231 void RegExpParser::Advance(int dist) {
232 next_pos_ += dist - 1;
233 Advance();
234 }
235
236
237 bool RegExpParser::simple() {
238 return simple_;
239 }
240
241
242 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
243 failed_ = true;
244 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
245 // Zip to the end to make sure the no more input is read.
246 current_ = kEndMarker;
247 next_pos_ = in()->length();
248 return NULL;
249 }
250
251
252 // Pattern ::
253 // Disjunction
254 RegExpTree* RegExpParser::ParsePattern() {
255 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
256 DCHECK(!has_more());
257 // If the result of parsing is a literal string atom, and it has the
258 // same length as the input, then the atom is identical to the input.
259 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
260 simple_ = true;
261 }
262 return result;
263 }
264
265
266 // Disjunction ::
267 // Alternative
268 // Alternative | Disjunction
269 // Alternative ::
270 // [empty]
271 // Term Alternative
272 // Term ::
273 // Assertion
274 // Atom
275 // Atom Quantifier
276 RegExpTree* RegExpParser::ParseDisjunction() {
277 // Used to store current state while parsing subexpressions.
278 RegExpParserState initial_state(NULL, INITIAL, 0, zone());
279 RegExpParserState* stored_state = &initial_state;
280 // Cache the builder in a local variable for quick access.
281 RegExpBuilder* builder = initial_state.builder();
282 while (true) {
283 switch (current()) {
284 case kEndMarker:
285 if (stored_state->IsSubexpression()) {
286 // Inside a parenthesized group when hitting end of input.
287 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
288 }
289 DCHECK_EQ(INITIAL, stored_state->group_type());
290 // Parsing completed successfully.
291 return builder->ToRegExp();
292 case ')': {
293 if (!stored_state->IsSubexpression()) {
294 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
295 }
296 DCHECK_NE(INITIAL, stored_state->group_type());
297
298 Advance();
299 // End disjunction parsing and convert builder content to new single
300 // regexp atom.
301 RegExpTree* body = builder->ToRegExp();
302
303 int end_capture_index = captures_started();
304
305 int capture_index = stored_state->capture_index();
306 SubexpressionType group_type = stored_state->group_type();
307
308 // Restore previous state.
309 stored_state = stored_state->previous_state();
310 builder = stored_state->builder();
311
312 // Build result of subexpression.
313 if (group_type == CAPTURE) {
314 RegExpCapture* capture = new(zone()) RegExpCapture(body, capture_index);
315 captures_->at(capture_index - 1) = capture;
316 body = capture;
317 } else if (group_type != GROUPING) {
318 DCHECK(group_type == POSITIVE_LOOKAHEAD ||
319 group_type == NEGATIVE_LOOKAHEAD);
320 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
321 body = new(zone()) RegExpLookahead(body,
322 is_positive,
323 end_capture_index - capture_index,
324 capture_index);
325 }
326 builder->AddAtom(body);
327 // For compatability with JSC and ES3, we allow quantifiers after
328 // lookaheads, and break in all cases.
329 break;
330 }
331 case '|': {
332 Advance();
333 builder->NewAlternative();
334 continue;
335 }
336 case '*':
337 case '+':
338 case '?':
339 return ReportError(CStrVector("Nothing to repeat"));
340 case '^': {
341 Advance();
342 if (multiline_) {
343 builder->AddAssertion(
344 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
345 } else {
346 builder->AddAssertion(
347 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
348 set_contains_anchor();
349 }
350 continue;
351 }
352 case '$': {
353 Advance();
354 RegExpAssertion::AssertionType assertion_type =
355 multiline_ ? RegExpAssertion::END_OF_LINE :
356 RegExpAssertion::END_OF_INPUT;
357 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));
358 continue;
359 }
360 case '.': {
361 Advance();
362 // everything except \x0a, \x0d, \u2028 and \u2029
363 ZoneList<CharacterRange>* ranges =
364 new(zone()) ZoneList<CharacterRange>(2, zone());
365 CharacterRange::AddClassEscape('.', ranges, zone());
366 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
367 builder->AddAtom(atom);
368 break;
369 }
370 case '(': {
371 SubexpressionType subexpr_type = CAPTURE;
372 Advance();
373 if (current() == '?') {
374 switch (Next()) {
375 case ':':
376 subexpr_type = GROUPING;
377 break;
378 case '=':
379 subexpr_type = POSITIVE_LOOKAHEAD;
380 break;
381 case '!':
382 subexpr_type = NEGATIVE_LOOKAHEAD;
383 break;
384 default:
385 ReportError(CStrVector("Invalid group") CHECK_FAILED);
386 break;
387 }
388 Advance(2);
389 } else {
390 if (captures_ == NULL) {
391 captures_ = new(zone()) ZoneList<RegExpCapture*>(2, zone());
392 }
393 if (captures_started() >= kMaxCaptures) {
394 ReportError(CStrVector("Too many captures") CHECK_FAILED);
395 }
396 captures_->Add(NULL, zone());
397 }
398 // Store current state and begin new disjunction parsing.
399 stored_state = new(zone()) RegExpParserState(stored_state, subexpr_type,
400 captures_started(), zone());
401 builder = stored_state->builder();
402 continue;
403 }
404 case '[': {
405 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
406 builder->AddAtom(atom);
407 break;
408 }
409 // Atom ::
410 // \ AtomEscape
411 case '\\':
412 switch (Next()) {
413 case kEndMarker:
414 return ReportError(CStrVector("\\ at end of pattern"));
415 case 'b':
416 Advance(2);
417 builder->AddAssertion(
418 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
419 continue;
420 case 'B':
421 Advance(2);
422 builder->AddAssertion(
423 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
424 continue;
425 // AtomEscape ::
426 // CharacterClassEscape
427 //
428 // CharacterClassEscape :: one of
429 // d D s S w W
430 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
431 uc32 c = Next();
432 Advance(2);
433 ZoneList<CharacterRange>* ranges =
434 new(zone()) ZoneList<CharacterRange>(2, zone());
435 CharacterRange::AddClassEscape(c, ranges, zone());
436 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
437 builder->AddAtom(atom);
438 break;
439 }
440 case '1': case '2': case '3': case '4': case '5': case '6':
441 case '7': case '8': case '9': {
442 int index = 0;
443 if (ParseBackReferenceIndex(&index)) {
444 RegExpCapture* capture = NULL;
445 if (captures_ != NULL && index <= captures_->length()) {
446 capture = captures_->at(index - 1);
447 }
448 if (capture == NULL) {
449 builder->AddEmpty();
450 break;
451 }
452 RegExpTree* atom = new(zone()) RegExpBackReference(capture);
453 builder->AddAtom(atom);
454 break;
455 }
456 uc32 first_digit = Next();
457 if (first_digit == '8' || first_digit == '9') {
458 // Treat as identity escape
459 builder->AddCharacter(first_digit);
460 Advance(2);
461 break;
462 }
463 }
464 // FALLTHROUGH
465 case '0': {
466 Advance();
467 uc32 octal = ParseOctalLiteral();
468 builder->AddCharacter(octal);
469 break;
470 }
471 // ControlEscape :: one of
472 // f n r t v
473 case 'f':
474 Advance(2);
475 builder->AddCharacter('\f');
476 break;
477 case 'n':
478 Advance(2);
479 builder->AddCharacter('\n');
480 break;
481 case 'r':
482 Advance(2);
483 builder->AddCharacter('\r');
484 break;
485 case 't':
486 Advance(2);
487 builder->AddCharacter('\t');
488 break;
489 case 'v':
490 Advance(2);
491 builder->AddCharacter('\v');
492 break;
493 case 'c': {
494 Advance();
495 uc32 controlLetter = Next();
496 // Special case if it is an ASCII letter.
497 // Convert lower case letters to uppercase.
498 uc32 letter = controlLetter & ~('a' ^ 'A');
499 if (letter < 'A' || 'Z' < letter) {
500 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
501 // This is outside the specification. We match JSC in
502 // reading the backslash as a literal character instead
503 // of as starting an escape.
504 builder->AddCharacter('\\');
505 } else {
506 Advance(2);
507 builder->AddCharacter(controlLetter & 0x1f);
508 }
509 break;
510 }
511 case 'x': {
512 Advance(2);
513 uc32 value;
514 if (ParseHexEscape(2, &value)) {
515 builder->AddCharacter(value);
516 } else {
517 builder->AddCharacter('x');
518 }
519 break;
520 }
521 case 'u': {
522 Advance(2);
523 uc32 value;
524 if (ParseHexEscape(4, &value)) {
525 builder->AddCharacter(value);
526 } else {
527 builder->AddCharacter('u');
528 }
529 break;
530 }
531 default:
532 // Identity escape.
533 builder->AddCharacter(Next());
534 Advance(2);
535 break;
536 }
537 break;
538 case '{': {
539 int dummy;
540 if (ParseIntervalQuantifier(&dummy, &dummy)) {
541 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
542 }
543 // fallthrough
544 }
545 default:
546 builder->AddCharacter(current());
547 Advance();
548 break;
549 } // end switch(current())
550
551 int min;
552 int max;
553 switch (current()) {
554 // QuantifierPrefix ::
555 // *
556 // +
557 // ?
558 // {
559 case '*':
560 min = 0;
561 max = RegExpTree::kInfinity;
562 Advance();
563 break;
564 case '+':
565 min = 1;
566 max = RegExpTree::kInfinity;
567 Advance();
568 break;
569 case '?':
570 min = 0;
571 max = 1;
572 Advance();
573 break;
574 case '{':
575 if (ParseIntervalQuantifier(&min, &max)) {
576 if (max < min) {
577 ReportError(CStrVector("numbers out of order in {} quantifier.")
578 CHECK_FAILED);
579 }
580 break;
581 } else {
582 continue;
583 }
584 default:
585 continue;
586 }
587 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
588 if (current() == '?') {
589 quantifier_type = RegExpQuantifier::NON_GREEDY;
590 Advance();
591 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
592 // FLAG_regexp_possessive_quantifier is a debug-only flag.
593 quantifier_type = RegExpQuantifier::POSSESSIVE;
594 Advance();
595 }
596 builder->AddQuantifierToAtom(min, max, quantifier_type);
597 }
598 }
599
600
601 #ifdef DEBUG
602 // Currently only used in an DCHECK.
603 static bool IsSpecialClassEscape(uc32 c) {
604 switch (c) {
605 case 'd': case 'D':
606 case 's': case 'S':
607 case 'w': case 'W':
608 return true;
609 default:
610 return false;
611 }
612 }
613 #endif
614
615
616 // In order to know whether an escape is a backreference or not we have to scan
617 // the entire regexp and find the number of capturing parentheses. However we
618 // don't want to scan the regexp twice unless it is necessary. This mini-parser
619 // is called when needed. It can see the difference between capturing and
620 // noncapturing parentheses and can skip character classes and backslash-escaped
621 // characters.
622 void RegExpParser::ScanForCaptures() {
623 // Start with captures started previous to current position
624 int capture_count = captures_started();
625 // Add count of captures after this position.
626 int n;
627 while ((n = current()) != kEndMarker) {
628 Advance();
629 switch (n) {
630 case '\\':
631 Advance();
632 break;
633 case '[': {
634 int c;
635 while ((c = current()) != kEndMarker) {
636 Advance();
637 if (c == '\\') {
638 Advance();
639 } else {
640 if (c == ']') break;
641 }
642 }
643 break;
644 }
645 case '(':
646 if (current() != '?') capture_count++;
647 break;
648 }
649 }
650 capture_count_ = capture_count;
651 is_scanned_for_captures_ = true;
652 }
653
654
655 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
656 DCHECK_EQ('\\', current());
657 DCHECK('1' <= Next() && Next() <= '9');
658 // Try to parse a decimal literal that is no greater than the total number
659 // of left capturing parentheses in the input.
660 int start = position();
661 int value = Next() - '0';
662 Advance(2);
663 while (true) {
664 uc32 c = current();
665 if (IsDecimalDigit(c)) {
666 value = 10 * value + (c - '0');
667 if (value > kMaxCaptures) {
668 Reset(start);
669 return false;
670 }
671 Advance();
672 } else {
673 break;
674 }
675 }
676 if (value > captures_started()) {
677 if (!is_scanned_for_captures_) {
678 int saved_position = position();
679 ScanForCaptures();
680 Reset(saved_position);
681 }
682 if (value > capture_count_) {
683 Reset(start);
684 return false;
685 }
686 }
687 *index_out = value;
688 return true;
689 }
690
691
692 // QuantifierPrefix ::
693 // { DecimalDigits }
694 // { DecimalDigits , }
695 // { DecimalDigits , DecimalDigits }
696 //
697 // Returns true if parsing succeeds, and set the min_out and max_out
698 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
699 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
700 DCHECK_EQ(current(), '{');
701 int start = position();
702 Advance();
703 int min = 0;
704 if (!IsDecimalDigit(current())) {
705 Reset(start);
706 return false;
707 }
708 while (IsDecimalDigit(current())) {
709 int next = current() - '0';
710 if (min > (RegExpTree::kInfinity - next) / 10) {
711 // Overflow. Skip past remaining decimal digits and return -1.
712 do {
713 Advance();
714 } while (IsDecimalDigit(current()));
715 min = RegExpTree::kInfinity;
716 break;
717 }
718 min = 10 * min + next;
719 Advance();
720 }
721 int max = 0;
722 if (current() == '}') {
723 max = min;
724 Advance();
725 } else if (current() == ',') {
726 Advance();
727 if (current() == '}') {
728 max = RegExpTree::kInfinity;
729 Advance();
730 } else {
731 while (IsDecimalDigit(current())) {
732 int next = current() - '0';
733 if (max > (RegExpTree::kInfinity - next) / 10) {
734 do {
735 Advance();
736 } while (IsDecimalDigit(current()));
737 max = RegExpTree::kInfinity;
738 break;
739 }
740 max = 10 * max + next;
741 Advance();
742 }
743 if (current() != '}') {
744 Reset(start);
745 return false;
746 }
747 Advance();
748 }
749 } else {
750 Reset(start);
751 return false;
752 }
753 *min_out = min;
754 *max_out = max;
755 return true;
756 }
757
758
759 uc32 RegExpParser::ParseOctalLiteral() {
760 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
761 // For compatibility with some other browsers (not all), we parse
762 // up to three octal digits with a value below 256.
763 uc32 value = current() - '0';
764 Advance();
765 if ('0' <= current() && current() <= '7') {
766 value = value * 8 + current() - '0';
767 Advance();
768 if (value < 32 && '0' <= current() && current() <= '7') {
769 value = value * 8 + current() - '0';
770 Advance();
771 }
772 }
773 return value;
774 }
775
776
777 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
778 int start = position();
779 uc32 val = 0;
780 bool done = false;
781 for (int i = 0; !done; i++) {
782 uc32 c = current();
783 int d = HexValue(c);
784 if (d < 0) {
785 Reset(start);
786 return false;
787 }
788 val = val * 16 + d;
789 Advance();
790 if (i == length - 1) {
791 done = true;
792 }
793 }
794 *value = val;
795 return true;
796 }
797
798
799 uc32 RegExpParser::ParseClassCharacterEscape() {
800 DCHECK(current() == '\\');
801 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
802 Advance();
803 switch (current()) {
804 case 'b':
805 Advance();
806 return '\b';
807 // ControlEscape :: one of
808 // f n r t v
809 case 'f':
810 Advance();
811 return '\f';
812 case 'n':
813 Advance();
814 return '\n';
815 case 'r':
816 Advance();
817 return '\r';
818 case 't':
819 Advance();
820 return '\t';
821 case 'v':
822 Advance();
823 return '\v';
824 case 'c': {
825 uc32 controlLetter = Next();
826 uc32 letter = controlLetter & ~('A' ^ 'a');
827 // For compatibility with JSC, inside a character class
828 // we also accept digits and underscore as control characters.
829 if ((controlLetter >= '0' && controlLetter <= '9') ||
830 controlLetter == '_' ||
831 (letter >= 'A' && letter <= 'Z')) {
832 Advance(2);
833 // Control letters mapped to ASCII control characters in the range
834 // 0x00-0x1f.
835 return controlLetter & 0x1f;
836 }
837 // We match JSC in reading the backslash as a literal
838 // character instead of as starting an escape.
839 return '\\';
840 }
841 case '0': case '1': case '2': case '3': case '4': case '5':
842 case '6': case '7':
843 // For compatibility, we interpret a decimal escape that isn't
844 // a back reference (and therefore either \0 or not valid according
845 // to the specification) as a 1..3 digit octal character code.
846 return ParseOctalLiteral();
847 case 'x': {
848 Advance();
849 uc32 value;
850 if (ParseHexEscape(2, &value)) {
851 return value;
852 }
853 // If \x is not followed by a two-digit hexadecimal, treat it
854 // as an identity escape.
855 return 'x';
856 }
857 case 'u': {
858 Advance();
859 uc32 value;
860 if (ParseHexEscape(4, &value)) {
861 return value;
862 }
863 // If \u is not followed by a four-digit hexadecimal, treat it
864 // as an identity escape.
865 return 'u';
866 }
867 default: {
868 // Extended identity escape. We accept any character that hasn't
869 // been matched by a more specific case, not just the subset required
870 // by the ECMAScript specification.
871 uc32 result = current();
872 Advance();
873 return result;
874 }
875 }
876 return 0;
877 }
878
879
880 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
881 DCHECK_EQ(0, *char_class);
882 uc32 first = current();
883 if (first == '\\') {
884 switch (Next()) {
885 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
886 *char_class = Next();
887 Advance(2);
888 return CharacterRange::Singleton(0); // Return dummy value.
889 }
890 case kEndMarker:
891 return ReportError(CStrVector("\\ at end of pattern"));
892 default:
893 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
894 return CharacterRange::Singleton(c);
895 }
896 } else {
897 Advance();
898 return CharacterRange::Singleton(first);
899 }
900 }
901
902
903 static const uc16 kNoCharClass = 0;
904
905 // Adds range or pre-defined character class to character ranges.
906 // If char_class is not kInvalidClass, it's interpreted as a class
907 // escape (i.e., 's' means whitespace, from '\s').
908 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
909 uc16 char_class,
910 CharacterRange range,
911 Zone* zone) {
912 if (char_class != kNoCharClass) {
913 CharacterRange::AddClassEscape(char_class, ranges, zone);
914 } else {
915 ranges->Add(range, zone);
916 }
917 }
918
919
920 RegExpTree* RegExpParser::ParseCharacterClass() {
921 static const char* kUnterminated = "Unterminated character class";
922 static const char* kRangeOutOfOrder = "Range out of order in character class";
923
924 DCHECK_EQ(current(), '[');
925 Advance();
926 bool is_negated = false;
927 if (current() == '^') {
928 is_negated = true;
929 Advance();
930 }
931 ZoneList<CharacterRange>* ranges =
932 new(zone()) ZoneList<CharacterRange>(2, zone());
933 while (has_more() && current() != ']') {
934 uc16 char_class = kNoCharClass;
935 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
936 if (current() == '-') {
937 Advance();
938 if (current() == kEndMarker) {
939 // If we reach the end we break out of the loop and let the
940 // following code report an error.
941 break;
942 } else if (current() == ']') {
943 AddRangeOrEscape(ranges, char_class, first, zone());
944 ranges->Add(CharacterRange::Singleton('-'), zone());
945 break;
946 }
947 uc16 char_class_2 = kNoCharClass;
948 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
949 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
950 // Either end is an escaped character class. Treat the '-' verbatim.
951 AddRangeOrEscape(ranges, char_class, first, zone());
952 ranges->Add(CharacterRange::Singleton('-'), zone());
953 AddRangeOrEscape(ranges, char_class_2, next, zone());
954 continue;
955 }
956 if (first.from() > next.to()) {
957 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
958 }
959 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
960 } else {
961 AddRangeOrEscape(ranges, char_class, first, zone());
962 }
963 }
964 if (!has_more()) {
965 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
966 }
967 Advance();
968 if (ranges->length() == 0) {
969 ranges->Add(CharacterRange::Everything(), zone());
970 is_negated = !is_negated;
971 }
972 return new(zone()) RegExpCharacterClass(ranges, is_negated);
973 }
974
975
976 // ----------------------------------------------------------------------------
977 // The Parser interface.
978
979 bool RegExpParser::ParseRegExp(FlatStringReader* input,
980 bool multiline,
981 RegExpCompileData* result,
982 Zone* zone) {
983 DCHECK(result != NULL);
984 RegExpParser parser(input, &result->error, multiline, zone);
985 RegExpTree* tree = parser.ParsePattern();
986 if (parser.failed()) {
987 DCHECK(tree == NULL);
988 DCHECK(!result->error.is_null());
989 } else {
990 DCHECK(tree != NULL);
991 DCHECK(result->error.is_null());
992 result->tree = tree;
993 int capture_count = parser.captures_started();
994 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
995 result->contains_anchor = parser.contains_anchor();
996 result->capture_count = capture_count;
997 }
998 return !parser.failed();
999 }
1000
1001 // SNIP
1002
1003 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/unibrow.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698