Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(319)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns (Closed)
Patch Set: Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 22 matching lines...) Expand all
33 ignore_case_(flags & JSRegExp::kIgnoreCase), 33 ignore_case_(flags & JSRegExp::kIgnoreCase),
34 multiline_(flags & JSRegExp::kMultiline), 34 multiline_(flags & JSRegExp::kMultiline),
35 unicode_(flags & JSRegExp::kUnicode), 35 unicode_(flags & JSRegExp::kUnicode),
36 next_pos_(0), 36 next_pos_(0),
37 captures_started_(0), 37 captures_started_(0),
38 capture_count_(0), 38 capture_count_(0),
39 has_more_(true), 39 has_more_(true),
40 simple_(false), 40 simple_(false),
41 contains_anchor_(false), 41 contains_anchor_(false),
42 is_scanned_for_captures_(false), 42 is_scanned_for_captures_(false),
43 has_named_captures_(false),
43 failed_(false) { 44 failed_(false) {
44 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
45 Advance(); 46 Advance();
46 } 47 }
47 48
48 template <bool update_position> 49 template <bool update_position>
49 inline uc32 RegExpParser::ReadNext() { 50 inline uc32 RegExpParser::ReadNext() {
50 int position = next_pos_; 51 int position = next_pos_;
51 uc32 c0 = in()->Get(position); 52 uc32 c0 = in()->Get(position);
52 position++; 53 position++;
(...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after
318 lookaround_type = RegExpLookaround::LOOKBEHIND; 319 lookaround_type = RegExpLookaround::LOOKBEHIND;
319 Advance(2); 320 Advance(2);
320 break; 321 break;
321 } else if (Next() == '!') { 322 } else if (Next() == '!') {
322 subexpr_type = NEGATIVE_LOOKAROUND; 323 subexpr_type = NEGATIVE_LOOKAROUND;
323 lookaround_type = RegExpLookaround::LOOKBEHIND; 324 lookaround_type = RegExpLookaround::LOOKBEHIND;
324 Advance(2); 325 Advance(2);
325 break; 326 break;
326 } 327 }
327 } 328 }
328 if (FLAG_harmony_regexp_named_captures && unicode()) { 329 if (FLAG_harmony_regexp_named_captures) {
330 has_named_captures_ = true;
329 is_named_capture = true; 331 is_named_capture = true;
330 Advance(); 332 Advance();
331 break; 333 break;
332 } 334 }
333 // Fall through. 335 // Fall through.
334 default: 336 default:
335 return ReportError(CStrVector("Invalid group")); 337 return ReportError(CStrVector("Invalid group"));
336 } 338 }
337 } 339 }
338 340
(...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after
534 builder->AddEscapedUnicodeCharacter(value); 536 builder->AddEscapedUnicodeCharacter(value);
535 } else if (!unicode()) { 537 } else if (!unicode()) {
536 builder->AddCharacter('u'); 538 builder->AddCharacter('u');
537 } else { 539 } else {
538 // With /u, invalid escapes are not treated as identity escapes. 540 // With /u, invalid escapes are not treated as identity escapes.
539 return ReportError(CStrVector("Invalid unicode escape")); 541 return ReportError(CStrVector("Invalid unicode escape"));
540 } 542 }
541 break; 543 break;
542 } 544 }
543 case 'k': 545 case 'k':
544 if (FLAG_harmony_regexp_named_captures && unicode()) { 546 // Either an identity escape or a named back-reference. The two
547 // interpretations are mutually exclusive: '\k' is interpreted as
548 // an identity escape for non-unicode patterns without named
549 // capture groups, and as the beginning of a named back-reference
550 // in all other cases.
551 if (FLAG_harmony_regexp_named_captures &&
552 (unicode() || HasNamedCaptures())) {
545 Advance(2); 553 Advance(2);
546 ParseNamedBackReference(builder, state CHECK_FAILED); 554 ParseNamedBackReference(builder, state CHECK_FAILED);
547 break; 555 break;
548 } 556 }
549 // Fall through. 557 // Fall through.
550 default: 558 default:
551 Advance(); 559 Advance();
552 // With /u, no identity escapes except for syntax characters 560 // With /u, no identity escapes except for syntax characters
553 // are allowed. Otherwise, all identity escapes are allowed. 561 // are allowed. Otherwise, all identity escapes are allowed.
554 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { 562 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
650 #endif 658 #endif
651 659
652 660
653 // In order to know whether an escape is a backreference or not we have to scan 661 // In order to know whether an escape is a backreference or not we have to scan
654 // the entire regexp and find the number of capturing parentheses. However we 662 // the entire regexp and find the number of capturing parentheses. However we
655 // don't want to scan the regexp twice unless it is necessary. This mini-parser 663 // don't want to scan the regexp twice unless it is necessary. This mini-parser
656 // is called when needed. It can see the difference between capturing and 664 // is called when needed. It can see the difference between capturing and
657 // noncapturing parentheses and can skip character classes and backslash-escaped 665 // noncapturing parentheses and can skip character classes and backslash-escaped
658 // characters. 666 // characters.
659 void RegExpParser::ScanForCaptures() { 667 void RegExpParser::ScanForCaptures() {
668 DCHECK(!is_scanned_for_captures_);
669 const int saved_position = position();
660 // Start with captures started previous to current position 670 // Start with captures started previous to current position
661 int capture_count = captures_started(); 671 int capture_count = captures_started();
662 // Add count of captures after this position. 672 // Add count of captures after this position.
663 int n; 673 int n;
664 while ((n = current()) != kEndMarker) { 674 while ((n = current()) != kEndMarker) {
665 Advance(); 675 Advance();
666 switch (n) { 676 switch (n) {
667 case '\\': 677 case '\\':
668 Advance(); 678 Advance();
669 break; 679 break;
(...skipping 15 matching lines...) Expand all
685 // * a non-capturing group '(:', 695 // * a non-capturing group '(:',
686 // * a lookbehind assertion '(?<=' '(?<!' 696 // * a lookbehind assertion '(?<=' '(?<!'
687 // * or a named capture '(?<'. 697 // * or a named capture '(?<'.
688 // 698 //
689 // Of these, only named captures are capturing groups. 699 // Of these, only named captures are capturing groups.
690 if (!FLAG_harmony_regexp_named_captures) break; 700 if (!FLAG_harmony_regexp_named_captures) break;
691 701
692 Advance(); 702 Advance();
693 if (current() != '<') break; 703 if (current() != '<') break;
694 704
695 // TODO(jgruber): To be more future-proof we could test for 705 if (FLAG_harmony_regexp_lookbehind) {
696 // IdentifierStart here once it becomes clear whether group names 706 // TODO(jgruber): To be more future-proof we could test for
697 // allow unicode escapes. 707 // IdentifierStart here once it becomes clear whether group names
698 Advance(); 708 // allow unicode escapes.
699 if (current() == '=' || current() == '!') break; 709 // https://github.com/tc39/proposal-regexp-named-groups/issues/23
710 Advance();
711 if (current() == '=' || current() == '!') break;
712 }
713
714 // Found a possible named capture. It could turn out to be a syntax
715 // error (e.g. an unterminated or invalid name), but that distinction
716 // does not matter for our purposes.
717 has_named_captures_ = true;
700 } 718 }
701 capture_count++; 719 capture_count++;
702 break; 720 break;
703 } 721 }
704 } 722 }
705 capture_count_ = capture_count; 723 capture_count_ = capture_count;
706 is_scanned_for_captures_ = true; 724 is_scanned_for_captures_ = true;
725 Reset(saved_position);
707 } 726 }
708 727
709 728
710 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { 729 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
711 DCHECK_EQ('\\', current()); 730 DCHECK_EQ('\\', current());
712 DCHECK('1' <= Next() && Next() <= '9'); 731 DCHECK('1' <= Next() && Next() <= '9');
713 // Try to parse a decimal literal that is no greater than the total number 732 // Try to parse a decimal literal that is no greater than the total number
714 // of left capturing parentheses in the input. 733 // of left capturing parentheses in the input.
715 int start = position(); 734 int start = position();
716 int value = Next() - '0'; 735 int value = Next() - '0';
717 Advance(2); 736 Advance(2);
718 while (true) { 737 while (true) {
719 uc32 c = current(); 738 uc32 c = current();
720 if (IsDecimalDigit(c)) { 739 if (IsDecimalDigit(c)) {
721 value = 10 * value + (c - '0'); 740 value = 10 * value + (c - '0');
722 if (value > kMaxCaptures) { 741 if (value > kMaxCaptures) {
723 Reset(start); 742 Reset(start);
724 return false; 743 return false;
725 } 744 }
726 Advance(); 745 Advance();
727 } else { 746 } else {
728 break; 747 break;
729 } 748 }
730 } 749 }
731 if (value > captures_started()) { 750 if (value > captures_started()) {
732 if (!is_scanned_for_captures_) { 751 if (!is_scanned_for_captures_) ScanForCaptures();
733 int saved_position = position();
734 ScanForCaptures();
735 Reset(saved_position);
736 }
737 if (value > capture_count_) { 752 if (value > capture_count_) {
738 Reset(start); 753 Reset(start);
739 return false; 754 return false;
740 } 755 }
741 } 756 }
742 *index_out = value; 757 *index_out = value;
743 return true; 758 return true;
744 } 759 }
745 760
746 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { 761 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
747 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
748 v->push_back(code_unit); 763 v->push_back(code_unit);
749 } else { 764 } else {
750 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
751 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
752 } 767 }
753 } 768 }
754 769
755 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
756 DCHECK(FLAG_harmony_regexp_named_captures); 771 DCHECK(FLAG_harmony_regexp_named_captures);
757 DCHECK(unicode());
758 772
759 ZoneVector<uc16>* name = 773 ZoneVector<uc16>* name =
760 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
761 775
762 bool at_start = true; 776 bool at_start = true;
763 while (true) { 777 while (true) {
764 uc32 c = current(); 778 uc32 c = current();
765 Advance(); 779 Advance();
766 780
767 // Convert unicode escapes. 781 // Convert unicode escapes.
768 if (c == '\\' && current() == 'u') { 782 if (c == '\\' && current() == 'u') {
783 // TODO(jgruber): Reconsider this once the spec has settled.
784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23
769 Advance(); 785 Advance();
770 if (!ParseUnicodeEscape(&c)) { 786 if (!ParseUnicodeEscape(&c)) {
771 ReportError(CStrVector("Invalid Unicode escape sequence")); 787 ReportError(CStrVector("Invalid Unicode escape sequence"));
772 return nullptr; 788 return nullptr;
773 } 789 }
774 } 790 }
775 791
776 if (at_start) { 792 if (at_start) {
777 if (!IdentifierStart::Is(c)) { 793 if (!IdentifierStart::Is(c)) {
778 ReportError(CStrVector("Invalid capture group name")); 794 ReportError(CStrVector("Invalid capture group name"));
(...skipping 12 matching lines...) Expand all
791 } 807 }
792 } 808 }
793 } 809 }
794 810
795 return name; 811 return name;
796 } 812 }
797 813
798 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, 814 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
799 int index) { 815 int index) {
800 DCHECK(FLAG_harmony_regexp_named_captures); 816 DCHECK(FLAG_harmony_regexp_named_captures);
801 DCHECK(unicode());
802 DCHECK(0 < index && index <= captures_started_); 817 DCHECK(0 < index && index <= captures_started_);
803 DCHECK_NOT_NULL(name); 818 DCHECK_NOT_NULL(name);
804 819
805 if (named_captures_ == nullptr) { 820 if (named_captures_ == nullptr) {
806 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); 821 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
807 } else { 822 } else {
808 // Check for duplicates and bail if we find any. 823 // Check for duplicates and bail if we find any.
824 // TODO(jgruber): O(n^2).
809 for (const auto& named_capture : *named_captures_) { 825 for (const auto& named_capture : *named_captures_) {
810 if (*named_capture->name() == *name) { 826 if (*named_capture->name() == *name) {
811 ReportError(CStrVector("Duplicate capture group name")); 827 ReportError(CStrVector("Duplicate capture group name"));
812 return false; 828 return false;
813 } 829 }
814 } 830 }
815 } 831 }
816 832
817 RegExpCapture* capture = GetCapture(index); 833 RegExpCapture* capture = GetCapture(index);
818 DCHECK(capture->name() == nullptr); 834 DCHECK(capture->name() == nullptr);
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
913 for (int i = 0; i < named_captures_->length(); i++) { 929 for (int i = 0; i < named_captures_->length(); i++) {
914 RegExpCapture* capture = named_captures_->at(i); 930 RegExpCapture* capture = named_captures_->at(i);
915 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); 931 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
916 array->set(i * 2, *name.ToHandleChecked()); 932 array->set(i * 2, *name.ToHandleChecked());
917 array->set(i * 2 + 1, Smi::FromInt(capture->index())); 933 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
918 } 934 }
919 935
920 return array; 936 return array;
921 } 937 }
922 938
939 bool RegExpParser::HasNamedCaptures() {
940 if (has_named_captures_ || is_scanned_for_captures_) {
941 return has_named_captures_;
942 }
943
944 ScanForCaptures();
945 DCHECK(is_scanned_for_captures_);
946 return has_named_captures_;
947 }
948
923 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { 949 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
924 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 950 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
925 if (s->group_type() != CAPTURE) continue; 951 if (s->group_type() != CAPTURE) continue;
926 // Return true if we found the matching capture index. 952 // Return true if we found the matching capture index.
927 if (index == s->capture_index()) return true; 953 if (index == s->capture_index()) return true;
928 // Abort if index is larger than what has been parsed up till this state. 954 // Abort if index is larger than what has been parsed up till this state.
929 if (index > s->capture_index()) return false; 955 if (index > s->capture_index()) return false;
930 } 956 }
931 return false; 957 return false;
932 } 958 }
(...skipping 905 matching lines...) Expand 10 before | Expand all | Expand 10 after
1838 return false; 1864 return false;
1839 } 1865 }
1840 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1841 zone()); 1867 zone());
1842 LAST(ADD_TERM); 1868 LAST(ADD_TERM);
1843 return true; 1869 return true;
1844 } 1870 }
1845 1871
1846 } // namespace internal 1872 } // namespace internal
1847 } // namespace v8 1873 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698