Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: src/parser.cc

Issue 11350: Add back references. Since "back references" is two words... (Closed) Base URL: http://v8.googlecode.com/svn/branches/experimental/regexp2000/
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 497 matching lines...) Expand 10 before | Expand all | Expand 10 after
508 // must not be 'b' or 'B' since they are usually handle specially. 508 // must not be 'b' or 'B' since they are usually handle specially.
509 uc32 ParseClassCharacterEscape(bool* ok); 509 uc32 ParseClassCharacterEscape(bool* ok);
510 510
511 // Checks whether the following is a length-digit hexadecimal number, 511 // Checks whether the following is a length-digit hexadecimal number,
512 // and sets the value if it is. 512 // and sets the value if it is.
513 bool ParseHexEscape(int length, uc32* value); 513 bool ParseHexEscape(int length, uc32* value);
514 514
515 uc32 ParseControlLetterEscape(bool* ok); 515 uc32 ParseControlLetterEscape(bool* ok);
516 uc32 ParseOctalLiteral(); 516 uc32 ParseOctalLiteral();
517 517
518 // Tries to parse the input as a backreference. If successful it 518 // Tries to parse the input as a back reference. If successful it
519 // stores the result in the output parameter and returns true. If 519 // stores the result in the output parameter and returns true. If
520 // it fails it will push back the characters read so the same characters 520 // it fails it will push back the characters read so the same characters
521 // can be reparsed. 521 // can be reparsed.
522 bool ParseBackreferenceIndex(int* index_out); 522 bool ParseBackReferenceIndex(int* index_out);
523 523
524 CharacterRange ParseClassAtom(bool* is_char_class, 524 CharacterRange ParseClassAtom(bool* is_char_class,
525 ZoneList<CharacterRange>* ranges, 525 ZoneList<CharacterRange>* ranges,
526 bool* ok); 526 bool* ok);
527 RegExpTree* ReportError(Vector<const char> message, bool* ok); 527 RegExpTree* ReportError(Vector<const char> message, bool* ok);
528 void Advance(); 528 void Advance();
529 void Advance(int dist); 529 void Advance(int dist);
530 void Reset(int pos); 530 void Reset(int pos);
531 531
532 bool HasCharacterEscapes(); 532 bool HasCharacterEscapes();
533 533
534 int captures_started() { return captures_ == NULL ? 0 : captures_->length(); } 534 int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
535 int position() { return next_pos_ - 1; } 535 int position() { return next_pos_ - 1; }
536 536
537 static const uc32 kEndMarker = (1 << 21); 537 static const uc32 kEndMarker = (1 << 21);
538 private: 538 private:
539 uc32 current() { return current_; } 539 uc32 current() { return current_; }
540 bool has_more() { return has_more_; } 540 bool has_more() { return has_more_; }
541 bool has_next() { return next_pos_ < in()->length(); } 541 bool has_next() { return next_pos_ < in()->length(); }
542 uc32 Next(); 542 uc32 Next();
543 FlatStringReader* in() { return in_; } 543 FlatStringReader* in() { return in_; }
544 void ScanForCaptures();
545 bool CaptureAvailable(int index);
544 uc32 current_; 546 uc32 current_;
545 bool has_more_; 547 bool has_more_;
546 bool multiline_mode_; 548 bool multiline_mode_;
547 int next_pos_; 549 int next_pos_;
548 FlatStringReader* in_; 550 FlatStringReader* in_;
549 Handle<String>* error_; 551 Handle<String>* error_;
550 bool has_character_escapes_; 552 bool has_character_escapes_;
553 bool is_scanned_for_captures_;
551 ZoneList<RegExpCapture*>* captures_; 554 ZoneList<RegExpCapture*>* captures_;
555 int capture_count_;
Christian Plesner Hansen 2008/11/21 11:16:39 This name could be misleading but I can't think of
552 }; 556 };
553 557
554 558
555 // A temporary scope stores information during parsing, just like 559 // A temporary scope stores information during parsing, just like
556 // a plain scope. However, temporary scopes are not kept around 560 // a plain scope. However, temporary scopes are not kept around
557 // after parsing or referenced by syntax trees so they can be stack- 561 // after parsing or referenced by syntax trees so they can be stack-
558 // allocated and hence used by the pre-parser. 562 // allocated and hence used by the pre-parser.
559 class TemporaryScope BASE_EMBEDDED { 563 class TemporaryScope BASE_EMBEDDED {
560 public: 564 public:
561 explicit TemporaryScope(Parser* parser); 565 explicit TemporaryScope(Parser* parser);
(...skipping 2933 matching lines...) Expand 10 before | Expand all | Expand 10 after
3495 RegExpParser::RegExpParser(FlatStringReader* in, 3499 RegExpParser::RegExpParser(FlatStringReader* in,
3496 Handle<String>* error, 3500 Handle<String>* error,
3497 bool multiline_mode) 3501 bool multiline_mode)
3498 : current_(kEndMarker), 3502 : current_(kEndMarker),
3499 has_more_(true), 3503 has_more_(true),
3500 multiline_mode_(multiline_mode), 3504 multiline_mode_(multiline_mode),
3501 next_pos_(0), 3505 next_pos_(0),
3502 in_(in), 3506 in_(in),
3503 error_(error), 3507 error_(error),
3504 has_character_escapes_(false), 3508 has_character_escapes_(false),
3505 captures_(NULL) { 3509 is_scanned_for_captures_(false),
3510 captures_(NULL),
3511 capture_count_(0) {
3506 Advance(1); 3512 Advance(1);
3507 } 3513 }
3508 3514
3509 3515
3510 uc32 RegExpParser::Next() { 3516 uc32 RegExpParser::Next() {
3511 if (has_next()) { 3517 if (has_next()) {
3512 return in()->Get(next_pos_); 3518 return in()->Get(next_pos_);
3513 } else { 3519 } else {
3514 return kEndMarker; 3520 return kEndMarker;
3515 } 3521 }
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
3557 // Disjunction 3563 // Disjunction
3558 RegExpTree* RegExpParser::ParsePattern(bool* ok) { 3564 RegExpTree* RegExpParser::ParsePattern(bool* ok) {
3559 RegExpTree* result = ParseDisjunction(CHECK_OK); 3565 RegExpTree* result = ParseDisjunction(CHECK_OK);
3560 if (has_more()) { 3566 if (has_more()) {
3561 ReportError(CStrVector("Unmatched ')'"), CHECK_OK); 3567 ReportError(CStrVector("Unmatched ')'"), CHECK_OK);
3562 } 3568 }
3563 return result; 3569 return result;
3564 } 3570 }
3565 3571
3566 3572
3573 bool RegExpParser::CaptureAvailable(int index) {
3574 if (captures_ == NULL) return false;
3575 if (index >= captures_->length()) return false;
3576 RegExpCapture* capture = captures_->at(index);
3577 return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
3578 }
3579
3580
3567 // Disjunction :: 3581 // Disjunction ::
3568 // Alternative 3582 // Alternative
3569 // Alternative | Disjunction 3583 // Alternative | Disjunction
3570 // Alternative :: 3584 // Alternative ::
3571 // [empty] 3585 // [empty]
3572 // Term Alternative 3586 // Term Alternative
3573 // Term :: 3587 // Term ::
3574 // Assertion 3588 // Assertion
3575 // Atom 3589 // Atom
3576 // Atom Quantifier 3590 // Atom Quantifier
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
3660 Advance(2); 3674 Advance(2);
3661 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); 3675 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3662 CharacterRange::AddClassEscape(c, ranges); 3676 CharacterRange::AddClassEscape(c, ranges);
3663 RegExpTree* atom = new RegExpCharacterClass(ranges, false); 3677 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
3664 builder.AddAtom(atom); 3678 builder.AddAtom(atom);
3665 goto has_read_atom; // Avoid setting has_character_escapes_. 3679 goto has_read_atom; // Avoid setting has_character_escapes_.
3666 } 3680 }
3667 case '1': case '2': case '3': case '4': case '5': case '6': 3681 case '1': case '2': case '3': case '4': case '5': case '6':
3668 case '7': case '8': case '9': { 3682 case '7': case '8': case '9': {
3669 int index = 0; 3683 int index = 0;
3670 if (ParseBackreferenceIndex(&index)) { 3684 if (ParseBackReferenceIndex(&index)) {
3671 RegExpCapture* capture = captures_->at(index - 1); 3685 if (!CaptureAvailable(index - 1)) {
3672 if (capture == NULL || capture->available() != CAPTURE_AVAILABLE) {
3673 // Prepare to ignore a following quantifier 3686 // Prepare to ignore a following quantifier
3674 builder.AddEmpty(); 3687 builder.AddEmpty();
3675 goto has_read_atom; 3688 goto has_read_atom;
3676 } 3689 }
3677 RegExpTree* atom = new RegExpBackreference(capture); 3690 RegExpCapture* capture = captures_->at(index - 1);
3691 RegExpTree* atom = new RegExpBackReference(capture);
3678 builder.AddAtom(atom); 3692 builder.AddAtom(atom);
3679 goto has_read_atom; // Avoid setting has_character_escapes_. 3693 goto has_read_atom; // Avoid setting has_character_escapes_.
3680 } 3694 }
3681 uc32 first_digit = Next(); 3695 uc32 first_digit = Next();
3682 if (first_digit == '8' || first_digit == '9') { 3696 if (first_digit == '8' || first_digit == '9') {
3683 // Treat as identity escape 3697 // Treat as identity escape
3684 builder.AddCharacter(first_digit); 3698 builder.AddCharacter(first_digit);
3685 Advance(2); 3699 Advance(2);
3686 break; 3700 break;
3687 } 3701 }
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after
3837 case 's': case 'S': 3851 case 's': case 'S':
3838 case 'w': case 'W': 3852 case 'w': case 'W':
3839 return true; 3853 return true;
3840 default: 3854 default:
3841 return false; 3855 return false;
3842 } 3856 }
3843 } 3857 }
3844 #endif 3858 #endif
3845 3859
3846 3860
3847 bool RegExpParser::ParseBackreferenceIndex(int* index_out) { 3861 // In order to know whether an escape is a backreference or not we have to scan
3862 // the entire regexp and find the number of capturing parentheses. However we
3863 // don't want to scan the regexp twice unless it is necessary. This mini-parser
3864 // is called when needed. It can see the difference between capturing and
3865 // noncapturing parentheses and can skip character classes and backslash-escaped
3866 // characters.
3867 void RegExpParser::ScanForCaptures() {
3868 int n;
3869 while ((n = current()) != kEndMarker) {
3870 Advance();
3871 switch (n) {
3872 case '\\':
3873 Advance();
3874 break;
3875 case '[': {
3876 int c;
3877 while ((c = current()) != kEndMarker) {
3878 Advance();
3879 if (c == '\\') {
3880 Advance();
3881 } else {
3882 if (c == ']') break;
3883 }
3884 }
3885 break;
3886 }
3887 case '(':
3888 if (current() != '?') capture_count_++;
3889 break;
3890 }
3891 }
3892 is_scanned_for_captures_ = true;
3893 }
3894
3895
3896 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
3848 ASSERT_EQ('\\', current()); 3897 ASSERT_EQ('\\', current());
3849 ASSERT('1' <= Next() && Next() <= '9'); 3898 ASSERT('1' <= Next() && Next() <= '9');
3850 // Try to parse a decimal literal that is no greater than the number 3899 // Try to parse a decimal literal that is no greater than the number
3851 // of previously encountered left capturing parentheses. 3900 // of previously encountered left capturing parentheses.
3852 // This is a not according the the ECMAScript specification. According to 3901 // This is a not according the the ECMAScript specification. According to
3853 // that, one must accept values up to the total number of left capturing 3902 // that, one must accept values up to the total number of left capturing
3854 // parentheses in the entire input, even if they are meaningless. 3903 // parentheses in the entire input, even if they are meaningless.
3855 if (captures_ == NULL) 3904 if (!is_scanned_for_captures_) {
3856 return false; 3905 int saved_position = position();
3906 ScanForCaptures();
3907 Reset(saved_position);
3908 }
3909 if (capture_count_ == 0) return false;
3857 int start = position(); 3910 int start = position();
3858 int value = Next() - '0'; 3911 int value = Next() - '0';
3859 if (value > captures_->length()) 3912 if (value > capture_count_) return false;
3860 return false;
3861 Advance(2); 3913 Advance(2);
3862 while (true) { 3914 while (true) {
3863 uc32 c = current(); 3915 uc32 c = current();
3864 if (IsDecimalDigit(c)) { 3916 if (IsDecimalDigit(c)) {
3865 value = 10 * value + (c - '0'); 3917 value = 10 * value + (c - '0');
3866 if (value > captures_->length()) { 3918 if (value > capture_count_) {
3867 Reset(start); 3919 Reset(start);
3868 return false; 3920 return false;
3869 } 3921 }
3870 Advance(); 3922 Advance();
3871 } else { 3923 } else {
3872 break; 3924 break;
3873 } 3925 }
3874 } 3926 }
3875 *index_out = value; 3927 *index_out = value;
3876 return true; 3928 return true;
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after
4061 break; 4113 break;
4062 default: 4114 default:
4063 ReportError(CStrVector("Invalid group"), CHECK_OK); 4115 ReportError(CStrVector("Invalid group"), CHECK_OK);
4064 break; 4116 break;
4065 } 4117 }
4066 } else { 4118 } else {
4067 if (captures_ == NULL) { 4119 if (captures_ == NULL) {
4068 captures_ = new ZoneList<RegExpCapture*>(2); 4120 captures_ = new ZoneList<RegExpCapture*>(2);
4069 } 4121 }
4070 captures_->Add(NULL); 4122 captures_->Add(NULL);
4123 if (!is_scanned_for_captures_) capture_count_++;
4071 } 4124 }
4072 int capture_index = captures_started(); 4125 int capture_index = captures_started();
4073 RegExpTree* body = ParseDisjunction(CHECK_OK); 4126 RegExpTree* body = ParseDisjunction(CHECK_OK);
4074 if (current() != ')') { 4127 if (current() != ')') {
4075 ReportError(CStrVector("Unterminated group"), CHECK_OK); 4128 ReportError(CStrVector("Unterminated group"), CHECK_OK);
4076 } 4129 }
4077 Advance(); 4130 Advance();
4078 4131
4079 int end_capture_index = captures_started(); 4132 int end_capture_index = captures_started();
4080 if (type == '!') { 4133 if (type == '!') {
(...skipping 222 matching lines...) Expand 10 before | Expand all | Expand 10 after
4303 start_position, 4356 start_position,
4304 is_expression); 4357 is_expression);
4305 return result; 4358 return result;
4306 } 4359 }
4307 4360
4308 4361
4309 #undef NEW 4362 #undef NEW
4310 4363
4311 4364
4312 } } // namespace v8::internal 4365 } } // namespace v8::internal
OLDNEW
« src/jsregexp.cc ('K') | « src/jsregexp.cc ('k') | src/regexp-macro-assembler.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698