Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(60)

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: rebase, enable tests, remove *CodeUnitsAt Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/longjump.h"
6 #include "vm/object_store.h"
7 #include "vm/regexp_parser.h"
8
9 namespace dart {
10
11 #define I isolate()
12
13 // Enables possessive quantifier syntax for testing.
14 static const bool FLAG_regexp_possessive_quantifier = false;
15
16 RegExpBuilder::RegExpBuilder()
17 : isolate_(Isolate::Current()),
18 pending_empty_(false),
19 characters_(NULL),
20 terms_(),
21 text_(),
22 alternatives_()
23 #ifdef DEBUG
24 , last_added_(ADD_NONE)
25 #endif
26 {}
27
28
29 void RegExpBuilder::FlushCharacters() {
30 pending_empty_ = false;
31 if (characters_ != NULL) {
32 RegExpTree* atom = new(I) RegExpAtom(characters_);
33 characters_ = NULL;
34 text_.Add(atom);
35 LAST(ADD_ATOM);
36 }
37 }
38
39
40 void RegExpBuilder::FlushText() {
41 FlushCharacters();
42 intptr_t num_text = text_.length();
43 if (num_text == 0) {
44 return;
45 } else if (num_text == 1) {
46 terms_.Add(text_.Last());
47 } else {
48 RegExpText* text = new(I) RegExpText();
49 for (intptr_t i = 0; i < num_text; i++)
50 text_[i]->AppendToText(text);
51 terms_.Add(text);
52 }
53 text_.Clear();
54 }
55
56
57 void RegExpBuilder::AddCharacter(uint16_t c) {
58 pending_empty_ = false;
59 if (characters_ == NULL) {
60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);
61 }
62 characters_->Add(c);
63 LAST(ADD_CHAR);
64 }
65
66
67 void RegExpBuilder::AddEmpty() {
68 pending_empty_ = true;
69 }
70
71
72 void RegExpBuilder::AddAtom(RegExpTree* term) {
73 if (term->IsEmpty()) {
74 AddEmpty();
75 return;
76 }
77 if (term->IsTextElement()) {
78 FlushCharacters();
79 text_.Add(term);
80 } else {
81 FlushText();
82 terms_.Add(term);
83 }
84 LAST(ADD_ATOM);
85 }
86
87
88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
89 FlushText();
90 terms_.Add(assert);
91 LAST(ADD_ASSERT);
92 }
93
94
95 void RegExpBuilder::NewAlternative() {
96 FlushTerms();
97 }
98
99
100 void RegExpBuilder::FlushTerms() {
101 FlushText();
102 intptr_t num_terms = terms_.length();
103 RegExpTree* alternative;
104 if (num_terms == 0) {
105 alternative = RegExpEmpty::GetInstance();
106 } else if (num_terms == 1) {
107 alternative = terms_.Last();
108 } else {
109 ZoneGrowableArray<RegExpTree*>* terms =
110 new(I) ZoneGrowableArray<RegExpTree*>();
111 for (intptr_t i = 0; i < terms_.length(); i++) {
112 terms->Add(terms_[i]);
113 }
114 alternative = new(I) RegExpAlternative(terms);
115 }
116 alternatives_.Add(alternative);
117 terms_.Clear();
118 LAST(ADD_NONE);
119 }
120
121
122 RegExpTree* RegExpBuilder::ToRegExp() {
123 FlushTerms();
124 intptr_t num_alternatives = alternatives_.length();
125 if (num_alternatives == 0) {
126 return RegExpEmpty::GetInstance();
127 }
128 if (num_alternatives == 1) {
129 return alternatives_.Last();
130 }
131 ZoneGrowableArray<RegExpTree*>* alternatives =
132 new(I) ZoneGrowableArray<RegExpTree*>();
133 for (intptr_t i = 0; i < alternatives_.length(); i++) {
134 alternatives->Add(alternatives_[i]);
135 }
136 return new(I) RegExpDisjunction(alternatives);
137 }
138
139
140 void RegExpBuilder::AddQuantifierToAtom(
141 intptr_t min,
142 intptr_t max,
143 RegExpQuantifier::QuantifierType quantifier_type) {
144 if (pending_empty_) {
145 pending_empty_ = false;
146 return;
147 }
148 RegExpTree* atom;
149 if (characters_ != NULL) {
150 DEBUG_ASSERT(last_added_ == ADD_CHAR);
151 // Last atom was character.
152
153 ZoneGrowableArray<uint16_t> *char_vector =
154 new(I) ZoneGrowableArray<uint16_t>();
155 char_vector->AddArray(*characters_);
156 intptr_t num_chars = char_vector->length();
157 if (num_chars > 1) {
158 ZoneGrowableArray<uint16_t> *prefix =
159 new(I) ZoneGrowableArray<uint16_t>();
160 for (intptr_t i = 0; i < num_chars - 1; i++) {
161 prefix->Add(char_vector->At(i));
162 }
163 text_.Add(new(I) RegExpAtom(prefix));
164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();
165 tail->Add(char_vector->At(num_chars - 1));
166 char_vector = tail;
167 }
168 characters_ = NULL;
169 atom = new(I) RegExpAtom(char_vector);
170 FlushText();
171 } else if (text_.length() > 0) {
172 DEBUG_ASSERT(last_added_ == ADD_ATOM);
173 atom = text_.RemoveLast();
174 FlushText();
175 } else if (terms_.length() > 0) {
176 DEBUG_ASSERT(last_added_ == ADD_ATOM);
177 atom = terms_.RemoveLast();
178 if (atom->max_match() == 0) {
179 // Guaranteed to only match an empty string.
180 LAST(ADD_TERM);
181 if (min == 0) {
182 return;
183 }
184 terms_.Add(atom);
185 return;
186 }
187 } else {
188 // Only call immediately after adding an atom or character!
189 UNREACHABLE();
190 return;
191 }
192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));
193 LAST(ADD_TERM);
194 }
195
196 // ----------------------------------------------------------------------------
197 // Implementation of Parser
198
199 RegExpParser::RegExpParser(const String& in,
200 String* error,
201 bool multiline)
202 : isolate_(Isolate::Current()),
203 error_(error),
204 captures_(NULL),
205 in_(in),
206 current_(kEndMarker),
207 next_pos_(0),
208 capture_count_(0),
209 has_more_(true),
210 multiline_(multiline),
211 simple_(false),
212 contains_anchor_(false),
213 is_scanned_for_captures_(false),
214 failed_(false) {
215 Advance();
216 }
217
218
219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {
220 Isolate* isolate = parsed_function->isolate();
221 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());
222
223 const String& pattern = String::Handle(regexp.pattern());
224 const bool multiline = regexp.is_multi_line();
225
226 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();
227 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {
228 // Parsing failures are handled in the JSRegExp factory constructor.
229 UNREACHABLE();
230 }
231
232 regexp.set_num_bracket_expressions(compile_data->capture_count);
233 if (compile_data->simple) {
234 regexp.set_is_simple();
235 } else {
236 regexp.set_is_complex();
237 }
238
239 parsed_function->SetRegExpCompileData(compile_data);
240
241 return true;
242 }
243
244
245 uint32_t RegExpParser::Next() {
246 if (has_next()) {
247 return in().CharAt(next_pos_);
248 } else {
249 return kEndMarker;
250 }
251 }
252
253
254 void RegExpParser::Advance() {
255 if (next_pos_ < in().Length()) {
256 current_ = in().CharAt(next_pos_);
257 next_pos_++;
258 } else {
259 current_ = kEndMarker;
260 has_more_ = false;
261 }
262 }
263
264
265 void RegExpParser::Reset(intptr_t pos) {
266 next_pos_ = pos;
267 has_more_ = (pos < in().Length());
268 Advance();
269 }
270
271
272 void RegExpParser::Advance(intptr_t dist) {
273 next_pos_ += dist - 1;
274 Advance();
275 }
276
277
278 bool RegExpParser::simple() {
279 return simple_;
280 }
281
282
283 void RegExpParser::ReportError(const char* message) {
284 failed_ = true;
285 *error_ = String::New(message);
286 // Zip to the end to make sure the no more input is read.
287 current_ = kEndMarker;
288 next_pos_ = in().Length();
289
290 const Error& error = Error::Handle(LanguageError::New(*error_));
291 Report::LongJump(error);
292 UNREACHABLE();
293 }
294
295
296 // Pattern ::
297 // Disjunction
298 RegExpTree* RegExpParser::ParsePattern() {
299 RegExpTree* result = ParseDisjunction();
300 ASSERT(!has_more());
301 // If the result of parsing is a literal string atom, and it has the
302 // same length as the input, then the atom is identical to the input.
303 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
304 simple_ = true;
305 }
306 return result;
307 }
308
309
310 // Disjunction ::
311 // Alternative
312 // Alternative | Disjunction
313 // Alternative ::
314 // [empty]
315 // Term Alternative
316 // Term ::
317 // Assertion
318 // Atom
319 // Atom Quantifier
320 RegExpTree* RegExpParser::ParseDisjunction() {
321 // Used to store current state while parsing subexpressions.
322 RegExpParserState initial_state(NULL, INITIAL, 0, I);
323 RegExpParserState* stored_state = &initial_state;
324 // Cache the builder in a local variable for quick access.
325 RegExpBuilder* builder = initial_state.builder();
326 while (true) {
327 switch (current()) {
328 case kEndMarker:
329 if (stored_state->IsSubexpression()) {
330 // Inside a parenthesized group when hitting end of input.
331 ReportError("Unterminated group");
332 UNREACHABLE();
333 }
334 ASSERT(INITIAL == stored_state->group_type());
335 // Parsing completed successfully.
336 return builder->ToRegExp();
337 case ')': {
338 if (!stored_state->IsSubexpression()) {
339 ReportError("Unmatched ')'");
340 UNREACHABLE();
341 }
342 ASSERT(INITIAL != stored_state->group_type());
343
344 Advance();
345 // End disjunction parsing and convert builder content to new single
346 // regexp atom.
347 RegExpTree* body = builder->ToRegExp();
348
349 intptr_t end_capture_index = captures_started();
350
351 intptr_t capture_index = stored_state->capture_index();
352 SubexpressionType group_type = stored_state->group_type();
353
354 // Restore previous state.
355 stored_state = stored_state->previous_state();
356 builder = stored_state->builder();
357
358 // Build result of subexpression.
359 if (group_type == CAPTURE) {
360 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);
361 (*captures_)[capture_index - 1] = capture;
362 body = capture;
363 } else if (group_type != GROUPING) {
364 ASSERT(group_type == POSITIVE_LOOKAHEAD ||
365 group_type == NEGATIVE_LOOKAHEAD);
366 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
367 body = new(I) RegExpLookahead(body,
368 is_positive,
369 end_capture_index - capture_index,
370 capture_index);
371 }
372 builder->AddAtom(body);
373 // For compatibility with JSC and ES3, we allow quantifiers after
374 // lookaheads, and break in all cases.
375 break;
376 }
377 case '|': {
378 Advance();
379 builder->NewAlternative();
380 continue;
381 }
382 case '*':
383 case '+':
384 case '?':
385 ReportError("Nothing to repeat");
386 UNREACHABLE();
387 case '^': {
388 Advance();
389 if (multiline_) {
390 builder->AddAssertion(
391 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));
392 } else {
393 builder->AddAssertion(
394 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
395 set_contains_anchor();
396 }
397 continue;
398 }
399 case '$': {
400 Advance();
401 RegExpAssertion::AssertionType assertion_type =
402 multiline_ ? RegExpAssertion::END_OF_LINE :
403 RegExpAssertion::END_OF_INPUT;
404 builder->AddAssertion(new RegExpAssertion(assertion_type));
405 continue;
406 }
407 case '.': {
408 Advance();
409 // everything except \x0a, \x0d, \u2028 and \u2029
410 ZoneGrowableArray<CharacterRange>* ranges =
411 new ZoneGrowableArray<CharacterRange>(2);
412 CharacterRange::AddClassEscape('.', ranges);
413 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
414 builder->AddAtom(atom);
415 break;
416 }
417 case '(': {
418 SubexpressionType subexpr_type = CAPTURE;
419 Advance();
420 if (current() == '?') {
421 switch (Next()) {
422 case ':':
423 subexpr_type = GROUPING;
424 break;
425 case '=':
426 subexpr_type = POSITIVE_LOOKAHEAD;
427 break;
428 case '!':
429 subexpr_type = NEGATIVE_LOOKAHEAD;
430 break;
431 default:
432 ReportError("Invalid group");
433 UNREACHABLE();
434 }
435 Advance(2);
436 } else {
437 if (captures_ == NULL) {
438 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
439 }
440 if (captures_started() >= kMaxCaptures) {
441 ReportError("Too many captures");
442 UNREACHABLE();
443 }
444 captures_->Add(NULL);
445 }
446 // Store current state and begin new disjunction parsing.
447 stored_state = new RegExpParserState(stored_state, subexpr_type,
448 captures_started(), I);
449 builder = stored_state->builder();
450 continue;
451 }
452 case '[': {
453 RegExpTree* atom = ParseCharacterClass();
454 builder->AddAtom(atom);
455 break;
456 }
457 // Atom ::
458 // \ AtomEscape
459 case '\\':
460 switch (Next()) {
461 case kEndMarker:
462 ReportError("\\ at end of pattern");
463 UNREACHABLE();
464 case 'b':
465 Advance(2);
466 builder->AddAssertion(
467 new RegExpAssertion(RegExpAssertion::BOUNDARY));
468 continue;
469 case 'B':
470 Advance(2);
471 builder->AddAssertion(
472 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
473 continue;
474 // AtomEscape ::
475 // CharacterClassEscape
476 //
477 // CharacterClassEscape :: one of
478 // d D s S w W
479 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
480 uint32_t c = Next();
481 Advance(2);
482 ZoneGrowableArray<CharacterRange>* ranges =
483 new ZoneGrowableArray<CharacterRange>(2);
484 CharacterRange::AddClassEscape(c, ranges);
485 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
486 builder->AddAtom(atom);
487 break;
488 }
489 case '1': case '2': case '3': case '4': case '5': case '6':
490 case '7': case '8': case '9': {
491 intptr_t index = 0;
492 if (ParseBackReferenceIndex(&index)) {
493 RegExpCapture* capture = NULL;
494 if (captures_ != NULL && index <= captures_->length()) {
495 capture = captures_->At(index - 1);
496 }
497 if (capture == NULL) {
498 builder->AddEmpty();
499 break;
500 }
501 RegExpTree* atom = new RegExpBackReference(capture);
502 builder->AddAtom(atom);
503 break;
504 }
505 uint32_t first_digit = Next();
506 if (first_digit == '8' || first_digit == '9') {
507 // Treat as identity escape
508 builder->AddCharacter(first_digit);
509 Advance(2);
510 break;
511 }
512 }
513 // FALLTHROUGH
514 case '0': {
515 Advance();
516 uint32_t octal = ParseOctalLiteral();
517 builder->AddCharacter(octal);
518 break;
519 }
520 // ControlEscape :: one of
521 // f n r t v
522 case 'f':
523 Advance(2);
524 builder->AddCharacter('\f');
525 break;
526 case 'n':
527 Advance(2);
528 builder->AddCharacter('\n');
529 break;
530 case 'r':
531 Advance(2);
532 builder->AddCharacter('\r');
533 break;
534 case 't':
535 Advance(2);
536 builder->AddCharacter('\t');
537 break;
538 case 'v':
539 Advance(2);
540 builder->AddCharacter('\v');
541 break;
542 case 'c': {
543 Advance();
544 uint32_t controlLetter = Next();
545 // Special case if it is an ASCII letter.
546 // Convert lower case letters to uppercase.
547 uint32_t letter = controlLetter & ~('a' ^ 'A');
548 if (letter < 'A' || 'Z' < letter) {
549 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
550 // This is outside the specification. We match JSC in
551 // reading the backslash as a literal character instead
552 // of as starting an escape.
553 builder->AddCharacter('\\');
554 } else {
555 Advance(2);
556 builder->AddCharacter(controlLetter & 0x1f);
557 }
558 break;
559 }
560 case 'x': {
561 Advance(2);
562 uint32_t value;
563 if (ParseHexEscape(2, &value)) {
564 builder->AddCharacter(value);
565 } else {
566 builder->AddCharacter('x');
567 }
568 break;
569 }
570 case 'u': {
571 Advance(2);
572 uint32_t value;
573 if (ParseHexEscape(4, &value)) {
574 builder->AddCharacter(value);
575 } else {
576 builder->AddCharacter('u');
577 }
578 break;
579 }
580 default:
581 // Identity escape.
582 builder->AddCharacter(Next());
583 Advance(2);
584 break;
585 }
586 break;
587 case '{': {
588 intptr_t dummy;
589 if (ParseIntervalQuantifier(&dummy, &dummy)) {
590 ReportError("Nothing to repeat");
591 UNREACHABLE();
592 }
593 // fallthrough
594 }
595 default:
596 builder->AddCharacter(current());
597 Advance();
598 break;
599 } // end switch(current())
600
601 intptr_t min;
602 intptr_t max;
603 switch (current()) {
604 // QuantifierPrefix ::
605 // *
606 // +
607 // ?
608 // {
609 case '*':
610 min = 0;
611 max = RegExpTree::kInfinity;
612 Advance();
613 break;
614 case '+':
615 min = 1;
616 max = RegExpTree::kInfinity;
617 Advance();
618 break;
619 case '?':
620 min = 0;
621 max = 1;
622 Advance();
623 break;
624 case '{':
625 if (ParseIntervalQuantifier(&min, &max)) {
626 if (max < min) {
627 ReportError("numbers out of order in {} quantifier.");
628 UNREACHABLE();
629 }
630 break;
631 } else {
632 continue;
633 }
634 default:
635 continue;
636 }
637 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
638 if (current() == '?') {
639 quantifier_type = RegExpQuantifier::NON_GREEDY;
640 Advance();
641 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
642 // FLAG_regexp_possessive_quantifier is a debug-only flag.
643 quantifier_type = RegExpQuantifier::POSSESSIVE;
644 Advance();
645 }
646 builder->AddQuantifierToAtom(min, max, quantifier_type);
647 }
648 }
649
650
651 #ifdef DEBUG
652 // Currently only used in an ASSERT.
653 static bool IsSpecialClassEscape(uint32_t c) {
654 switch (c) {
655 case 'd': case 'D':
656 case 's': case 'S':
657 case 'w': case 'W':
658 return true;
659 default:
660 return false;
661 }
662 }
663 #endif
664
665
666 // In order to know whether an escape is a backreference or not we have to scan
667 // the entire regexp and find the number of capturing parentheses. However we
668 // don't want to scan the regexp twice unless it is necessary. This mini-parser
669 // is called when needed. It can see the difference between capturing and
670 // noncapturing parentheses and can skip character classes and backslash-escaped
671 // characters.
672 void RegExpParser::ScanForCaptures() {
673 // Start with captures started previous to current position
674 intptr_t capture_count = captures_started();
675 // Add count of captures after this position.
676 intptr_t n;
677 while ((n = current()) != kEndMarker) {
678 Advance();
679 switch (n) {
680 case '\\':
681 Advance();
682 break;
683 case '[': {
684 intptr_t c;
685 while ((c = current()) != kEndMarker) {
686 Advance();
687 if (c == '\\') {
688 Advance();
689 } else {
690 if (c == ']') break;
691 }
692 }
693 break;
694 }
695 case '(':
696 if (current() != '?') capture_count++;
697 break;
698 }
699 }
700 capture_count_ = capture_count;
701 is_scanned_for_captures_ = true;
702 }
703
704
705 static inline bool IsDecimalDigit(int32_t c) {
706 return '0' <= c && c <= '9';
707 }
708
709
710 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
711 ASSERT('\\' == current());
712 ASSERT('1' <= Next() && Next() <= '9');
713 // Try to parse a decimal literal that is no greater than the total number
714 // of left capturing parentheses in the input.
715 intptr_t start = position();
716 intptr_t value = Next() - '0';
717 Advance(2);
718 while (true) {
719 uint32_t c = current();
720 if (IsDecimalDigit(c)) {
721 value = 10 * value + (c - '0');
722 if (value > kMaxCaptures) {
723 Reset(start);
724 return false;
725 }
726 Advance();
727 } else {
728 break;
729 }
730 }
731 if (value > captures_started()) {
732 if (!is_scanned_for_captures_) {
733 intptr_t saved_position = position();
734 ScanForCaptures();
735 Reset(saved_position);
736 }
737 if (value > capture_count_) {
738 Reset(start);
739 return false;
740 }
741 }
742 *index_out = value;
743 return true;
744 }
745
746
747 // QuantifierPrefix ::
748 // { DecimalDigits }
749 // { DecimalDigits , }
750 // { DecimalDigits , DecimalDigits }
751 //
752 // Returns true if parsing succeeds, and set the min_out and max_out
753 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
754 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
755 intptr_t* max_out) {
756 ASSERT(current() == '{');
757 intptr_t start = position();
758 Advance();
759 intptr_t min = 0;
760 if (!IsDecimalDigit(current())) {
761 Reset(start);
762 return false;
763 }
764 while (IsDecimalDigit(current())) {
765 intptr_t next = current() - '0';
766 if (min > (RegExpTree::kInfinity - next) / 10) {
767 // Overflow. Skip past remaining decimal digits and return -1.
768 do {
769 Advance();
770 } while (IsDecimalDigit(current()));
771 min = RegExpTree::kInfinity;
772 break;
773 }
774 min = 10 * min + next;
775 Advance();
776 }
777 intptr_t max = 0;
778 if (current() == '}') {
779 max = min;
780 Advance();
781 } else if (current() == ',') {
782 Advance();
783 if (current() == '}') {
784 max = RegExpTree::kInfinity;
785 Advance();
786 } else {
787 while (IsDecimalDigit(current())) {
788 intptr_t next = current() - '0';
789 if (max > (RegExpTree::kInfinity - next) / 10) {
790 do {
791 Advance();
792 } while (IsDecimalDigit(current()));
793 max = RegExpTree::kInfinity;
794 break;
795 }
796 max = 10 * max + next;
797 Advance();
798 }
799 if (current() != '}') {
800 Reset(start);
801 return false;
802 }
803 Advance();
804 }
805 } else {
806 Reset(start);
807 return false;
808 }
809 *min_out = min;
810 *max_out = max;
811 return true;
812 }
813
814
815 uint32_t RegExpParser::ParseOctalLiteral() {
816 ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
817 // For compatibility with some other browsers (not all), we parse
818 // up to three octal digits with a value below 256.
819 uint32_t value = current() - '0';
820 Advance();
821 if ('0' <= current() && current() <= '7') {
822 value = value * 8 + current() - '0';
823 Advance();
824 if (value < 32 && '0' <= current() && current() <= '7') {
825 value = value * 8 + current() - '0';
826 Advance();
827 }
828 }
829 return value;
830 }
831
832
833 // Returns the value (0 .. 15) of a hexadecimal character c.
834 // If c is not a legal hexadecimal character, returns a value < 0.
835 static inline intptr_t HexValue(uint32_t c) {
836 c -= '0';
837 if (static_cast<unsigned>(c) <= 9) return c;
838 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
839 if (static_cast<unsigned>(c) <= 5) return c + 10;
840 return -1;
841 }
842
843
844 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {
845 intptr_t start = position();
846 uint32_t val = 0;
847 bool done = false;
848 for (intptr_t i = 0; !done; i++) {
849 uint32_t c = current();
850 intptr_t d = HexValue(c);
851 if (d < 0) {
852 Reset(start);
853 return false;
854 }
855 val = val * 16 + d;
856 Advance();
857 if (i == length - 1) {
858 done = true;
859 }
860 }
861 *value = val;
862 return true;
863 }
864
865
866 uint32_t RegExpParser::ParseClassCharacterEscape() {
867 ASSERT(current() == '\\');
868 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
869 Advance();
870 switch (current()) {
871 case 'b':
872 Advance();
873 return '\b';
874 // ControlEscape :: one of
875 // f n r t v
876 case 'f':
877 Advance();
878 return '\f';
879 case 'n':
880 Advance();
881 return '\n';
882 case 'r':
883 Advance();
884 return '\r';
885 case 't':
886 Advance();
887 return '\t';
888 case 'v':
889 Advance();
890 return '\v';
891 case 'c': {
892 uint32_t controlLetter = Next();
893 uint32_t letter = controlLetter & ~('A' ^ 'a');
894 // For compatibility with JSC, inside a character class
895 // we also accept digits and underscore as control characters.
896 if ((controlLetter >= '0' && controlLetter <= '9') ||
897 controlLetter == '_' ||
898 (letter >= 'A' && letter <= 'Z')) {
899 Advance(2);
900 // Control letters mapped to ASCII control characters in the range
901 // 0x00-0x1f.
902 return controlLetter & 0x1f;
903 }
904 // We match JSC in reading the backslash as a literal
905 // character instead of as starting an escape.
906 return '\\';
907 }
908 case '0': case '1': case '2': case '3': case '4': case '5':
909 case '6': case '7':
910 // For compatibility, we interpret a decimal escape that isn't
911 // a back reference (and therefore either \0 or not valid according
912 // to the specification) as a 1..3 digit octal character code.
913 return ParseOctalLiteral();
914 case 'x': {
915 Advance();
916 uint32_t value;
917 if (ParseHexEscape(2, &value)) {
918 return value;
919 }
920 // If \x is not followed by a two-digit hexadecimal, treat it
921 // as an identity escape.
922 return 'x';
923 }
924 case 'u': {
925 Advance();
926 uint32_t value;
927 if (ParseHexEscape(4, &value)) {
928 return value;
929 }
930 // If \u is not followed by a four-digit hexadecimal, treat it
931 // as an identity escape.
932 return 'u';
933 }
934 default: {
935 // Extended identity escape. We accept any character that hasn't
936 // been matched by a more specific case, not just the subset required
937 // by the ECMAScript specification.
938 uint32_t result = current();
939 Advance();
940 return result;
941 }
942 }
943 return 0;
944 }
945
946
947 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {
948 ASSERT(0 == *char_class);
949 uint32_t first = current();
950 if (first == '\\') {
951 switch (Next()) {
952 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
953 *char_class = Next();
954 Advance(2);
955 return CharacterRange::Singleton(0); // Return dummy value.
956 }
957 case kEndMarker:
958 ReportError("\\ at end of pattern");
959 UNREACHABLE();
960 default:
961 uint32_t c = ParseClassCharacterEscape();
962 return CharacterRange::Singleton(c);
963 }
964 } else {
965 Advance();
966 return CharacterRange::Singleton(first);
967 }
968 }
969
970
971 static const uint16_t kNoCharClass = 0;
972
973 // Adds range or pre-defined character class to character ranges.
974 // If char_class is not kInvalidClass, it's interpreted as a class
975 // escape (i.e., 's' means whitespace, from '\s').
976 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,
977 uint16_t char_class,
978 CharacterRange range) {
979 if (char_class != kNoCharClass) {
980 CharacterRange::AddClassEscape(char_class, ranges);
981 } else {
982 ranges->Add(range);
983 }
984 }
985
986
987 RegExpTree* RegExpParser::ParseCharacterClass() {
988 static const char* kUnterminated = "Unterminated character class";
989 static const char* kRangeOutOfOrder = "Range out of order in character class";
990
991 ASSERT(current() == '[');
992 Advance();
993 bool is_negated = false;
994 if (current() == '^') {
995 is_negated = true;
996 Advance();
997 }
998 ZoneGrowableArray<CharacterRange>* ranges =
999 new(I) ZoneGrowableArray<CharacterRange>(2);
1000 while (has_more() && current() != ']') {
1001 uint16_t char_class = kNoCharClass;
1002 CharacterRange first = ParseClassAtom(&char_class);
1003 if (current() == '-') {
1004 Advance();
1005 if (current() == kEndMarker) {
1006 // If we reach the end we break out of the loop and let the
1007 // following code report an error.
1008 break;
1009 } else if (current() == ']') {
1010 AddRangeOrEscape(ranges, char_class, first);
1011 ranges->Add(CharacterRange::Singleton('-'));
1012 break;
1013 }
1014 uint16_t char_class_2 = kNoCharClass;
1015 CharacterRange next = ParseClassAtom(&char_class_2);
1016 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1017 // Either end is an escaped character class. Treat the '-' verbatim.
1018 AddRangeOrEscape(ranges, char_class, first);
1019 ranges->Add(CharacterRange::Singleton('-'));
1020 AddRangeOrEscape(ranges, char_class_2, next);
1021 continue;
1022 }
1023 if (first.from() > next.to()) {
1024 ReportError(kRangeOutOfOrder);
1025 UNREACHABLE();
1026 }
1027 ranges->Add(CharacterRange::Range(first.from(), next.to()));
1028 } else {
1029 AddRangeOrEscape(ranges, char_class, first);
1030 }
1031 }
1032 if (!has_more()) {
1033 ReportError(kUnterminated);
1034 UNREACHABLE();
1035 }
1036 Advance();
1037 if (ranges->length() == 0) {
1038 ranges->Add(CharacterRange::Everything());
1039 is_negated = !is_negated;
1040 }
1041 return new(I) RegExpCharacterClass(ranges, is_negated);
1042 }
1043
1044
1045 // ----------------------------------------------------------------------------
1046 // The Parser interface.
1047
1048 bool RegExpParser::ParseRegExp(const String& input,
1049 bool multiline,
1050 RegExpCompileData* result) {
1051 ASSERT(result != NULL);
1052 LongJumpScope jump;
1053 RegExpParser parser(input, &result->error, multiline);
1054 if (setjmp(*jump.Set()) == 0) {
1055 RegExpTree* tree = parser.ParsePattern();
1056 ASSERT(tree != NULL);
1057 ASSERT(result->error.IsNull());
1058 result->tree = tree;
1059 intptr_t capture_count = parser.captures_started();
1060 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1061 result->contains_anchor = parser.contains_anchor();
1062 result->capture_count = capture_count;
1063 } else {
1064 ASSERT(!result->error.IsNull());
1065 Isolate::Current()->object_store()->clear_sticky_error();
1066
1067 // Throw a FormatException on parsing failures.
1068 const String& message = String::Handle(
1069 String::Concat(result->error, input));
1070 const Array& args = Array::Handle(Array::New(1));
1071 args.SetAt(0, message);
1072
1073 Exceptions::ThrowByType(Exceptions::kFormat, args);
1074 }
1075 return !parser.failed();
1076 }
1077
1078 } // namespace dart
OLDNEW
« runtime/vm/object.cc ('K') | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698