Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(192)

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Addressed Ivan's comments. Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/longjump.h"
6 #include "vm/object_store.h"
7 #include "vm/regexp_parser.h"
8
9 namespace dart {
10
11 #define I isolate()
12
13 // Enables possessive quantifier syntax for testing.
14 static const bool FLAG_regexp_possessive_quantifier = false;
15
16 RegExpBuilder::RegExpBuilder()
17 : isolate_(Isolate::Current()),
18 pending_empty_(false),
19 characters_(NULL),
20 terms_(),
21 text_(),
22 alternatives_()
23 #ifdef DEBUG
24 , last_added_(ADD_NONE)
25 #endif
26 {}
27
28
29 void RegExpBuilder::FlushCharacters() {
30 pending_empty_ = false;
31 if (characters_ != NULL) {
32 RegExpTree* atom = new(I) RegExpAtom(characters_);
33 characters_ = NULL;
34 text_.Add(atom);
35 LAST(ADD_ATOM);
36 }
37 }
38
39
40 void RegExpBuilder::FlushText() {
41 FlushCharacters();
42 intptr_t num_text = text_.length();
43 if (num_text == 0) {
44 return;
45 } else if (num_text == 1) {
46 terms_.Add(text_.Last());
47 } else {
48 RegExpText* text = new(I) RegExpText();
49 for (intptr_t i = 0; i < num_text; i++)
50 text_[i]->AppendToText(text);
51 terms_.Add(text);
52 }
53 text_.Clear();
54 }
55
56
57 void RegExpBuilder::AddCharacter(uint16_t c) {
58 pending_empty_ = false;
59 if (characters_ == NULL) {
60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);
61 }
62 characters_->Add(c);
63 LAST(ADD_CHAR);
64 }
65
66
67 void RegExpBuilder::AddEmpty() {
68 pending_empty_ = true;
69 }
70
71
72 void RegExpBuilder::AddAtom(RegExpTree* term) {
73 if (term->IsEmpty()) {
74 AddEmpty();
75 return;
76 }
77 if (term->IsTextElement()) {
78 FlushCharacters();
79 text_.Add(term);
80 } else {
81 FlushText();
82 terms_.Add(term);
83 }
84 LAST(ADD_ATOM);
85 }
86
87
88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
89 FlushText();
90 terms_.Add(assert);
91 LAST(ADD_ASSERT);
92 }
93
94
95 void RegExpBuilder::NewAlternative() {
96 FlushTerms();
97 }
98
99
100 void RegExpBuilder::FlushTerms() {
101 FlushText();
102 intptr_t num_terms = terms_.length();
103 RegExpTree* alternative;
104 if (num_terms == 0) {
105 alternative = RegExpEmpty::GetInstance();
106 } else if (num_terms == 1) {
107 alternative = terms_.Last();
108 } else {
109 ZoneGrowableArray<RegExpTree*>* terms =
110 new(I) ZoneGrowableArray<RegExpTree*>();
111 for (intptr_t i = 0; i < terms_.length(); i++) {
112 terms->Add(terms_[i]);
113 }
114 alternative = new(I) RegExpAlternative(terms);
115 }
116 alternatives_.Add(alternative);
117 terms_.Clear();
118 LAST(ADD_NONE);
119 }
120
121
122 RegExpTree* RegExpBuilder::ToRegExp() {
123 FlushTerms();
124 intptr_t num_alternatives = alternatives_.length();
125 if (num_alternatives == 0) {
126 return RegExpEmpty::GetInstance();
127 }
128 if (num_alternatives == 1) {
129 return alternatives_.Last();
130 }
131 ZoneGrowableArray<RegExpTree*>* alternatives =
132 new(I) ZoneGrowableArray<RegExpTree*>();
133 for (intptr_t i = 0; i < alternatives_.length(); i++) {
134 alternatives->Add(alternatives_[i]);
135 }
136 return new(I) RegExpDisjunction(alternatives);
137 }
138
139
140 void RegExpBuilder::AddQuantifierToAtom(
141 intptr_t min,
142 intptr_t max,
143 RegExpQuantifier::QuantifierType quantifier_type) {
144 if (pending_empty_) {
145 pending_empty_ = false;
146 return;
147 }
148 RegExpTree* atom;
149 if (characters_ != NULL) {
150 DEBUG_ASSERT(last_added_ == ADD_CHAR);
151 // Last atom was character.
152
153 ZoneGrowableArray<uint16_t> *char_vector =
154 new(I) ZoneGrowableArray<uint16_t>();
155 char_vector->AddArray(*characters_);
156 intptr_t num_chars = char_vector->length();
157 if (num_chars > 1) {
158 ZoneGrowableArray<uint16_t> *prefix =
159 new(I) ZoneGrowableArray<uint16_t>();
160 for (intptr_t i = 0; i < num_chars - 1; i++) {
161 prefix->Add(char_vector->At(i));
162 }
163 text_.Add(new(I) RegExpAtom(prefix));
164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();
165 tail->Add(char_vector->At(num_chars - 1));
166 char_vector = tail;
167 }
168 characters_ = NULL;
169 atom = new(I) RegExpAtom(char_vector);
170 FlushText();
171 } else if (text_.length() > 0) {
172 DEBUG_ASSERT(last_added_ == ADD_ATOM);
173 atom = text_.RemoveLast();
174 FlushText();
175 } else if (terms_.length() > 0) {
176 DEBUG_ASSERT(last_added_ == ADD_ATOM);
177 atom = terms_.RemoveLast();
178 if (atom->max_match() == 0) {
179 // Guaranteed to only match an empty string.
180 LAST(ADD_TERM);
181 if (min == 0) {
182 return;
183 }
184 terms_.Add(atom);
185 return;
186 }
187 } else {
188 // Only call immediately after adding an atom or character!
189 UNREACHABLE();
190 return;
191 }
192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));
193 LAST(ADD_TERM);
194 }
195
196 // ----------------------------------------------------------------------------
197 // Implementation of Parser
198
199 RegExpParser::RegExpParser(const String& in,
200 String* error,
201 bool multiline)
202 : isolate_(Isolate::Current()),
203 error_(error),
204 captures_(NULL),
205 in_(in),
206 current_(kEndMarker),
207 next_pos_(0),
208 capture_count_(0),
209 has_more_(true),
210 multiline_(multiline),
211 simple_(false),
212 contains_anchor_(false),
213 is_scanned_for_captures_(false),
214 failed_(false) {
215 Advance();
216 }
217
218
219 uint32_t RegExpParser::Next() {
220 if (has_next()) {
221 return in().CharAt(next_pos_);
222 } else {
223 return kEndMarker;
224 }
225 }
226
227
228 void RegExpParser::Advance() {
229 if (next_pos_ < in().Length()) {
230 current_ = in().CharAt(next_pos_);
231 next_pos_++;
232 } else {
233 current_ = kEndMarker;
234 has_more_ = false;
235 }
236 }
237
238
239 void RegExpParser::Advance(intptr_t dist) {
240 next_pos_ += dist - 1;
241 Advance();
242 }
243
244
245 void RegExpParser::Reset(intptr_t pos) {
246 next_pos_ = pos;
247 has_more_ = (pos < in().Length());
248 Advance();
249 }
250
251
252 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {
253 Isolate* isolate = parsed_function->isolate();
254 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());
255
256 const String& pattern = String::Handle(regexp.pattern());
257 const bool multiline = regexp.is_multi_line();
258
259 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();
260 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {
261 // Parsing failures are handled in the JSRegExp factory constructor.
262 UNREACHABLE();
263 }
264
265 regexp.set_num_bracket_expressions(compile_data->capture_count);
266 if (compile_data->simple) {
267 regexp.set_is_simple();
268 } else {
269 regexp.set_is_complex();
270 }
271
272 parsed_function->SetRegExpCompileData(compile_data);
273
274 return true;
275 }
276
277
278 bool RegExpParser::ParseRegExp(const String& input,
279 bool multiline,
280 RegExpCompileData* result) {
281 ASSERT(result != NULL);
282 LongJumpScope jump;
283 RegExpParser parser(input, &result->error, multiline);
284 if (setjmp(*jump.Set()) == 0) {
285 RegExpTree* tree = parser.ParsePattern();
286 ASSERT(tree != NULL);
287 ASSERT(result->error.IsNull());
288 result->tree = tree;
289 intptr_t capture_count = parser.captures_started();
290 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
291 result->contains_anchor = parser.contains_anchor();
292 result->capture_count = capture_count;
293 } else {
294 ASSERT(!result->error.IsNull());
295 Isolate::Current()->object_store()->clear_sticky_error();
296
297 // Throw a FormatException on parsing failures.
298 const String& message = String::Handle(
299 String::Concat(result->error, input));
300 const Array& args = Array::Handle(Array::New(1));
301 args.SetAt(0, message);
302
303 Exceptions::ThrowByType(Exceptions::kFormat, args);
304 }
305 return !parser.failed();
306 }
307
308
309 void RegExpParser::ReportError(const char* message) {
310 failed_ = true;
311 *error_ = String::New(message);
312 // Zip to the end to make sure the no more input is read.
313 current_ = kEndMarker;
314 next_pos_ = in().Length();
315
316 const Error& error = Error::Handle(LanguageError::New(*error_));
317 Report::LongJump(error);
318 UNREACHABLE();
319 }
320
321
322 // Pattern ::
323 // Disjunction
324 RegExpTree* RegExpParser::ParsePattern() {
325 RegExpTree* result = ParseDisjunction();
326 ASSERT(!has_more());
327 // If the result of parsing is a literal string atom, and it has the
328 // same length as the input, then the atom is identical to the input.
329 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
330 simple_ = true;
331 }
332 return result;
333 }
334
335
336 // Disjunction ::
337 // Alternative
338 // Alternative | Disjunction
339 // Alternative ::
340 // [empty]
341 // Term Alternative
342 // Term ::
343 // Assertion
344 // Atom
345 // Atom Quantifier
346 RegExpTree* RegExpParser::ParseDisjunction() {
347 // Used to store current state while parsing subexpressions.
348 RegExpParserState initial_state(NULL, INITIAL, 0, I);
349 RegExpParserState* stored_state = &initial_state;
350 // Cache the builder in a local variable for quick access.
351 RegExpBuilder* builder = initial_state.builder();
352 while (true) {
353 switch (current()) {
354 case kEndMarker:
355 if (stored_state->IsSubexpression()) {
356 // Inside a parenthesized group when hitting end of input.
357 ReportError("Unterminated group");
358 UNREACHABLE();
359 }
360 ASSERT(INITIAL == stored_state->group_type());
361 // Parsing completed successfully.
362 return builder->ToRegExp();
363 case ')': {
364 if (!stored_state->IsSubexpression()) {
365 ReportError("Unmatched ')'");
366 UNREACHABLE();
367 }
368 ASSERT(INITIAL != stored_state->group_type());
369
370 Advance();
371 // End disjunction parsing and convert builder content to new single
372 // regexp atom.
373 RegExpTree* body = builder->ToRegExp();
374
375 intptr_t end_capture_index = captures_started();
376
377 intptr_t capture_index = stored_state->capture_index();
378 SubexpressionType group_type = stored_state->group_type();
379
380 // Restore previous state.
381 stored_state = stored_state->previous_state();
382 builder = stored_state->builder();
383
384 // Build result of subexpression.
385 if (group_type == CAPTURE) {
386 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);
387 (*captures_)[capture_index - 1] = capture;
388 body = capture;
389 } else if (group_type != GROUPING) {
390 ASSERT(group_type == POSITIVE_LOOKAHEAD ||
391 group_type == NEGATIVE_LOOKAHEAD);
392 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
393 body = new(I) RegExpLookahead(body,
394 is_positive,
395 end_capture_index - capture_index,
396 capture_index);
397 }
398 builder->AddAtom(body);
399 // For compatibility with JSC and ES3, we allow quantifiers after
400 // lookaheads, and break in all cases.
401 break;
402 }
403 case '|': {
404 Advance();
405 builder->NewAlternative();
406 continue;
407 }
408 case '*':
409 case '+':
410 case '?':
411 ReportError("Nothing to repeat");
412 UNREACHABLE();
413 case '^': {
414 Advance();
415 if (multiline_) {
416 builder->AddAssertion(
417 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));
418 } else {
419 builder->AddAssertion(
420 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
421 set_contains_anchor();
422 }
423 continue;
424 }
425 case '$': {
426 Advance();
427 RegExpAssertion::AssertionType assertion_type =
428 multiline_ ? RegExpAssertion::END_OF_LINE :
429 RegExpAssertion::END_OF_INPUT;
430 builder->AddAssertion(new RegExpAssertion(assertion_type));
431 continue;
432 }
433 case '.': {
434 Advance();
435 // everything except \x0a, \x0d, \u2028 and \u2029
436 ZoneGrowableArray<CharacterRange>* ranges =
437 new ZoneGrowableArray<CharacterRange>(2);
438 CharacterRange::AddClassEscape('.', ranges);
439 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
440 builder->AddAtom(atom);
441 break;
442 }
443 case '(': {
444 SubexpressionType subexpr_type = CAPTURE;
445 Advance();
446 if (current() == '?') {
447 switch (Next()) {
448 case ':':
449 subexpr_type = GROUPING;
450 break;
451 case '=':
452 subexpr_type = POSITIVE_LOOKAHEAD;
453 break;
454 case '!':
455 subexpr_type = NEGATIVE_LOOKAHEAD;
456 break;
457 default:
458 ReportError("Invalid group");
459 UNREACHABLE();
460 }
461 Advance(2);
462 } else {
463 if (captures_ == NULL) {
464 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
465 }
466 if (captures_started() >= kMaxCaptures) {
467 ReportError("Too many captures");
468 UNREACHABLE();
469 }
470 captures_->Add(NULL);
471 }
472 // Store current state and begin new disjunction parsing.
473 stored_state = new RegExpParserState(stored_state, subexpr_type,
474 captures_started(), I);
475 builder = stored_state->builder();
476 continue;
477 }
478 case '[': {
479 RegExpTree* atom = ParseCharacterClass();
480 builder->AddAtom(atom);
481 break;
482 }
483 // Atom ::
484 // \ AtomEscape
485 case '\\':
486 switch (Next()) {
487 case kEndMarker:
488 ReportError("\\ at end of pattern");
489 UNREACHABLE();
490 case 'b':
491 Advance(2);
492 builder->AddAssertion(
493 new RegExpAssertion(RegExpAssertion::BOUNDARY));
494 continue;
495 case 'B':
496 Advance(2);
497 builder->AddAssertion(
498 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
499 continue;
500 // AtomEscape ::
501 // CharacterClassEscape
502 //
503 // CharacterClassEscape :: one of
504 // d D s S w W
505 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
506 uint32_t c = Next();
507 Advance(2);
508 ZoneGrowableArray<CharacterRange>* ranges =
509 new ZoneGrowableArray<CharacterRange>(2);
510 CharacterRange::AddClassEscape(c, ranges);
511 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
512 builder->AddAtom(atom);
513 break;
514 }
515 case '1': case '2': case '3': case '4': case '5': case '6':
516 case '7': case '8': case '9': {
517 intptr_t index = 0;
518 if (ParseBackReferenceIndex(&index)) {
519 RegExpCapture* capture = NULL;
520 if (captures_ != NULL && index <= captures_->length()) {
521 capture = captures_->At(index - 1);
522 }
523 if (capture == NULL) {
524 builder->AddEmpty();
525 break;
526 }
527 RegExpTree* atom = new RegExpBackReference(capture);
528 builder->AddAtom(atom);
529 break;
530 }
531 uint32_t first_digit = Next();
532 if (first_digit == '8' || first_digit == '9') {
533 // Treat as identity escape
534 builder->AddCharacter(first_digit);
535 Advance(2);
536 break;
537 }
538 }
539 // FALLTHROUGH
540 case '0': {
541 Advance();
542 uint32_t octal = ParseOctalLiteral();
543 builder->AddCharacter(octal);
544 break;
545 }
546 // ControlEscape :: one of
547 // f n r t v
548 case 'f':
549 Advance(2);
550 builder->AddCharacter('\f');
551 break;
552 case 'n':
553 Advance(2);
554 builder->AddCharacter('\n');
555 break;
556 case 'r':
557 Advance(2);
558 builder->AddCharacter('\r');
559 break;
560 case 't':
561 Advance(2);
562 builder->AddCharacter('\t');
563 break;
564 case 'v':
565 Advance(2);
566 builder->AddCharacter('\v');
567 break;
568 case 'c': {
569 Advance();
570 uint32_t controlLetter = Next();
571 // Special case if it is an ASCII letter.
572 // Convert lower case letters to uppercase.
573 uint32_t letter = controlLetter & ~('a' ^ 'A');
574 if (letter < 'A' || 'Z' < letter) {
575 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
576 // This is outside the specification. We match JSC in
577 // reading the backslash as a literal character instead
578 // of as starting an escape.
579 builder->AddCharacter('\\');
580 } else {
581 Advance(2);
582 builder->AddCharacter(controlLetter & 0x1f);
583 }
584 break;
585 }
586 case 'x': {
587 Advance(2);
588 uint32_t value;
589 if (ParseHexEscape(2, &value)) {
590 builder->AddCharacter(value);
591 } else {
592 builder->AddCharacter('x');
593 }
594 break;
595 }
596 case 'u': {
597 Advance(2);
598 uint32_t value;
599 if (ParseHexEscape(4, &value)) {
600 builder->AddCharacter(value);
601 } else {
602 builder->AddCharacter('u');
603 }
604 break;
605 }
606 default:
607 // Identity escape.
608 builder->AddCharacter(Next());
609 Advance(2);
610 break;
611 }
612 break;
613 case '{': {
614 intptr_t dummy;
615 if (ParseIntervalQuantifier(&dummy, &dummy)) {
616 ReportError("Nothing to repeat");
617 UNREACHABLE();
618 }
619 // fallthrough
620 }
621 default:
622 builder->AddCharacter(current());
623 Advance();
624 break;
625 } // end switch(current())
626
627 intptr_t min;
628 intptr_t max;
629 switch (current()) {
630 // QuantifierPrefix ::
631 // *
632 // +
633 // ?
634 // {
635 case '*':
636 min = 0;
637 max = RegExpTree::kInfinity;
638 Advance();
639 break;
640 case '+':
641 min = 1;
642 max = RegExpTree::kInfinity;
643 Advance();
644 break;
645 case '?':
646 min = 0;
647 max = 1;
648 Advance();
649 break;
650 case '{':
651 if (ParseIntervalQuantifier(&min, &max)) {
652 if (max < min) {
653 ReportError("numbers out of order in {} quantifier.");
654 UNREACHABLE();
655 }
656 break;
657 } else {
658 continue;
659 }
660 default:
661 continue;
662 }
663 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
664 if (current() == '?') {
665 quantifier_type = RegExpQuantifier::NON_GREEDY;
666 Advance();
667 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
668 // FLAG_regexp_possessive_quantifier is a debug-only flag.
669 quantifier_type = RegExpQuantifier::POSSESSIVE;
670 Advance();
671 }
672 builder->AddQuantifierToAtom(min, max, quantifier_type);
673 }
674 }
675
676
677 static const uint16_t kNoCharClass = 0;
678
679 // Adds range or pre-defined character class to character ranges.
680 // If char_class is not kInvalidClass, it's interpreted as a class
681 // escape (i.e., 's' means whitespace, from '\s').
682 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,
683 uint16_t char_class,
684 CharacterRange range) {
685 if (char_class != kNoCharClass) {
686 CharacterRange::AddClassEscape(char_class, ranges);
687 } else {
688 ranges->Add(range);
689 }
690 }
691
692
693 RegExpTree* RegExpParser::ParseCharacterClass() {
694 static const char* kUnterminated = "Unterminated character class";
695 static const char* kRangeOutOfOrder = "Range out of order in character class";
696
697 ASSERT(current() == '[');
698 Advance();
699 bool is_negated = false;
700 if (current() == '^') {
701 is_negated = true;
702 Advance();
703 }
704 ZoneGrowableArray<CharacterRange>* ranges =
705 new(I) ZoneGrowableArray<CharacterRange>(2);
706 while (has_more() && current() != ']') {
707 uint16_t char_class = kNoCharClass;
708 CharacterRange first = ParseClassAtom(&char_class);
709 if (current() == '-') {
710 Advance();
711 if (current() == kEndMarker) {
712 // If we reach the end we break out of the loop and let the
713 // following code report an error.
714 break;
715 } else if (current() == ']') {
716 AddRangeOrEscape(ranges, char_class, first);
717 ranges->Add(CharacterRange::Singleton('-'));
718 break;
719 }
720 uint16_t char_class_2 = kNoCharClass;
721 CharacterRange next = ParseClassAtom(&char_class_2);
722 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
723 // Either end is an escaped character class. Treat the '-' verbatim.
724 AddRangeOrEscape(ranges, char_class, first);
725 ranges->Add(CharacterRange::Singleton('-'));
726 AddRangeOrEscape(ranges, char_class_2, next);
727 continue;
728 }
729 if (first.from() > next.to()) {
730 ReportError(kRangeOutOfOrder);
731 UNREACHABLE();
732 }
733 ranges->Add(CharacterRange::Range(first.from(), next.to()));
734 } else {
735 AddRangeOrEscape(ranges, char_class, first);
736 }
737 }
738 if (!has_more()) {
739 ReportError(kUnterminated);
740 UNREACHABLE();
741 }
742 Advance();
743 if (ranges->length() == 0) {
744 ranges->Add(CharacterRange::Everything());
745 is_negated = !is_negated;
746 }
747 return new(I) RegExpCharacterClass(ranges, is_negated);
748 }
749
750
751 #ifdef DEBUG
752 // Currently only used in an ASSERT.
753 static bool IsSpecialClassEscape(uint32_t c) {
754 switch (c) {
755 case 'd': case 'D':
756 case 's': case 'S':
757 case 'w': case 'W':
758 return true;
759 default:
760 return false;
761 }
762 }
763 #endif
764
765
766 // In order to know whether an escape is a backreference or not we have to scan
767 // the entire regexp and find the number of capturing parentheses. However we
768 // don't want to scan the regexp twice unless it is necessary. This mini-parser
769 // is called when needed. It can see the difference between capturing and
770 // noncapturing parentheses and can skip character classes and backslash-escaped
771 // characters.
772 void RegExpParser::ScanForCaptures() {
773 // Start with captures started previous to current position
774 intptr_t capture_count = captures_started();
775 // Add count of captures after this position.
776 intptr_t n;
777 while ((n = current()) != kEndMarker) {
778 Advance();
779 switch (n) {
780 case '\\':
781 Advance();
782 break;
783 case '[': {
784 intptr_t c;
785 while ((c = current()) != kEndMarker) {
786 Advance();
787 if (c == '\\') {
788 Advance();
789 } else {
790 if (c == ']') break;
791 }
792 }
793 break;
794 }
795 case '(':
796 if (current() != '?') capture_count++;
797 break;
798 }
799 }
800 capture_count_ = capture_count;
801 is_scanned_for_captures_ = true;
802 }
803
804
805 static inline bool IsDecimalDigit(int32_t c) {
806 return '0' <= c && c <= '9';
807 }
808
809
810 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
811 ASSERT('\\' == current());
812 ASSERT('1' <= Next() && Next() <= '9');
813 // Try to parse a decimal literal that is no greater than the total number
814 // of left capturing parentheses in the input.
815 intptr_t start = position();
816 intptr_t value = Next() - '0';
817 Advance(2);
818 while (true) {
819 uint32_t c = current();
820 if (IsDecimalDigit(c)) {
821 value = 10 * value + (c - '0');
822 if (value > kMaxCaptures) {
823 Reset(start);
824 return false;
825 }
826 Advance();
827 } else {
828 break;
829 }
830 }
831 if (value > captures_started()) {
832 if (!is_scanned_for_captures_) {
833 intptr_t saved_position = position();
834 ScanForCaptures();
835 Reset(saved_position);
836 }
837 if (value > capture_count_) {
838 Reset(start);
839 return false;
840 }
841 }
842 *index_out = value;
843 return true;
844 }
845
846
847 // QuantifierPrefix ::
848 // { DecimalDigits }
849 // { DecimalDigits , }
850 // { DecimalDigits , DecimalDigits }
851 //
852 // Returns true if parsing succeeds, and set the min_out and max_out
853 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
854 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
855 intptr_t* max_out) {
856 ASSERT(current() == '{');
857 intptr_t start = position();
858 Advance();
859 intptr_t min = 0;
860 if (!IsDecimalDigit(current())) {
861 Reset(start);
862 return false;
863 }
864 while (IsDecimalDigit(current())) {
865 intptr_t next = current() - '0';
866 if (min > (RegExpTree::kInfinity - next) / 10) {
867 // Overflow. Skip past remaining decimal digits and return -1.
868 do {
869 Advance();
870 } while (IsDecimalDigit(current()));
871 min = RegExpTree::kInfinity;
872 break;
873 }
874 min = 10 * min + next;
875 Advance();
876 }
877 intptr_t max = 0;
878 if (current() == '}') {
879 max = min;
880 Advance();
881 } else if (current() == ',') {
882 Advance();
883 if (current() == '}') {
884 max = RegExpTree::kInfinity;
885 Advance();
886 } else {
887 while (IsDecimalDigit(current())) {
888 intptr_t next = current() - '0';
889 if (max > (RegExpTree::kInfinity - next) / 10) {
890 do {
891 Advance();
892 } while (IsDecimalDigit(current()));
893 max = RegExpTree::kInfinity;
894 break;
895 }
896 max = 10 * max + next;
897 Advance();
898 }
899 if (current() != '}') {
900 Reset(start);
901 return false;
902 }
903 Advance();
904 }
905 } else {
906 Reset(start);
907 return false;
908 }
909 *min_out = min;
910 *max_out = max;
911 return true;
912 }
913
914
915 uint32_t RegExpParser::ParseOctalLiteral() {
916 ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
917 // For compatibility with some other browsers (not all), we parse
918 // up to three octal digits with a value below 256.
919 uint32_t value = current() - '0';
920 Advance();
921 if ('0' <= current() && current() <= '7') {
922 value = value * 8 + current() - '0';
923 Advance();
924 if (value < 32 && '0' <= current() && current() <= '7') {
925 value = value * 8 + current() - '0';
926 Advance();
927 }
928 }
929 return value;
930 }
931
932
933 // Returns the value (0 .. 15) of a hexadecimal character c.
934 // If c is not a legal hexadecimal character, returns a value < 0.
935 static inline intptr_t HexValue(uint32_t c) {
936 c -= '0';
937 if (static_cast<unsigned>(c) <= 9) return c;
938 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
939 if (static_cast<unsigned>(c) <= 5) return c + 10;
940 return -1;
941 }
942
943
944 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {
945 intptr_t start = position();
946 uint32_t val = 0;
947 bool done = false;
948 for (intptr_t i = 0; !done; i++) {
949 uint32_t c = current();
950 intptr_t d = HexValue(c);
951 if (d < 0) {
952 Reset(start);
953 return false;
954 }
955 val = val * 16 + d;
956 Advance();
957 if (i == length - 1) {
958 done = true;
959 }
960 }
961 *value = val;
962 return true;
963 }
964
965
966 uint32_t RegExpParser::ParseClassCharacterEscape() {
967 ASSERT(current() == '\\');
968 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
969 Advance();
970 switch (current()) {
971 case 'b':
972 Advance();
973 return '\b';
974 // ControlEscape :: one of
975 // f n r t v
976 case 'f':
977 Advance();
978 return '\f';
979 case 'n':
980 Advance();
981 return '\n';
982 case 'r':
983 Advance();
984 return '\r';
985 case 't':
986 Advance();
987 return '\t';
988 case 'v':
989 Advance();
990 return '\v';
991 case 'c': {
992 uint32_t controlLetter = Next();
993 uint32_t letter = controlLetter & ~('A' ^ 'a');
994 // For compatibility with JSC, inside a character class
995 // we also accept digits and underscore as control characters.
996 if ((controlLetter >= '0' && controlLetter <= '9') ||
997 controlLetter == '_' ||
998 (letter >= 'A' && letter <= 'Z')) {
999 Advance(2);
1000 // Control letters mapped to ASCII control characters in the range
1001 // 0x00-0x1f.
1002 return controlLetter & 0x1f;
1003 }
1004 // We match JSC in reading the backslash as a literal
1005 // character instead of as starting an escape.
1006 return '\\';
1007 }
1008 case '0': case '1': case '2': case '3': case '4': case '5':
1009 case '6': case '7':
1010 // For compatibility, we interpret a decimal escape that isn't
1011 // a back reference (and therefore either \0 or not valid according
1012 // to the specification) as a 1..3 digit octal character code.
1013 return ParseOctalLiteral();
1014 case 'x': {
1015 Advance();
1016 uint32_t value;
1017 if (ParseHexEscape(2, &value)) {
1018 return value;
1019 }
1020 // If \x is not followed by a two-digit hexadecimal, treat it
1021 // as an identity escape.
1022 return 'x';
1023 }
1024 case 'u': {
1025 Advance();
1026 uint32_t value;
1027 if (ParseHexEscape(4, &value)) {
1028 return value;
1029 }
1030 // If \u is not followed by a four-digit hexadecimal, treat it
1031 // as an identity escape.
1032 return 'u';
1033 }
1034 default: {
1035 // Extended identity escape. We accept any character that hasn't
1036 // been matched by a more specific case, not just the subset required
1037 // by the ECMAScript specification.
1038 uint32_t result = current();
1039 Advance();
1040 return result;
1041 }
1042 }
1043 return 0;
1044 }
1045
1046
1047 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {
1048 ASSERT(0 == *char_class);
1049 uint32_t first = current();
1050 if (first == '\\') {
1051 switch (Next()) {
1052 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
1053 *char_class = Next();
1054 Advance(2);
1055 return CharacterRange::Singleton(0); // Return dummy value.
1056 }
1057 case kEndMarker:
1058 ReportError("\\ at end of pattern");
1059 UNREACHABLE();
1060 default:
1061 uint32_t c = ParseClassCharacterEscape();
1062 return CharacterRange::Singleton(c);
1063 }
1064 } else {
1065 Advance();
1066 return CharacterRange::Singleton(first);
1067 }
1068 }
1069
1070 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698