| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| (...skipping 22 matching lines...) Expand all Loading... |
| 33 ignore_case_(flags & JSRegExp::kIgnoreCase), | 33 ignore_case_(flags & JSRegExp::kIgnoreCase), |
| 34 multiline_(flags & JSRegExp::kMultiline), | 34 multiline_(flags & JSRegExp::kMultiline), |
| 35 unicode_(flags & JSRegExp::kUnicode), | 35 unicode_(flags & JSRegExp::kUnicode), |
| 36 next_pos_(0), | 36 next_pos_(0), |
| 37 captures_started_(0), | 37 captures_started_(0), |
| 38 capture_count_(0), | 38 capture_count_(0), |
| 39 has_more_(true), | 39 has_more_(true), |
| 40 simple_(false), | 40 simple_(false), |
| 41 contains_anchor_(false), | 41 contains_anchor_(false), |
| 42 is_scanned_for_captures_(false), | 42 is_scanned_for_captures_(false), |
| 43 has_named_captures_(false), |
| 43 failed_(false) { | 44 failed_(false) { |
| 44 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); | 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); |
| 45 Advance(); | 46 Advance(); |
| 46 } | 47 } |
| 47 | 48 |
| 48 template <bool update_position> | 49 template <bool update_position> |
| 49 inline uc32 RegExpParser::ReadNext() { | 50 inline uc32 RegExpParser::ReadNext() { |
| 50 int position = next_pos_; | 51 int position = next_pos_; |
| 51 uc32 c0 = in()->Get(position); | 52 uc32 c0 = in()->Get(position); |
| 52 position++; | 53 position++; |
| (...skipping 265 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 318 lookaround_type = RegExpLookaround::LOOKBEHIND; | 319 lookaround_type = RegExpLookaround::LOOKBEHIND; |
| 319 Advance(2); | 320 Advance(2); |
| 320 break; | 321 break; |
| 321 } else if (Next() == '!') { | 322 } else if (Next() == '!') { |
| 322 subexpr_type = NEGATIVE_LOOKAROUND; | 323 subexpr_type = NEGATIVE_LOOKAROUND; |
| 323 lookaround_type = RegExpLookaround::LOOKBEHIND; | 324 lookaround_type = RegExpLookaround::LOOKBEHIND; |
| 324 Advance(2); | 325 Advance(2); |
| 325 break; | 326 break; |
| 326 } | 327 } |
| 327 } | 328 } |
| 328 if (FLAG_harmony_regexp_named_captures && unicode()) { | 329 if (FLAG_harmony_regexp_named_captures) { |
| 330 has_named_captures_ = true; |
| 329 is_named_capture = true; | 331 is_named_capture = true; |
| 330 Advance(); | 332 Advance(); |
| 331 break; | 333 break; |
| 332 } | 334 } |
| 333 // Fall through. | 335 // Fall through. |
| 334 default: | 336 default: |
| 335 return ReportError(CStrVector("Invalid group")); | 337 return ReportError(CStrVector("Invalid group")); |
| 336 } | 338 } |
| 337 } | 339 } |
| 338 | 340 |
| (...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 534 builder->AddEscapedUnicodeCharacter(value); | 536 builder->AddEscapedUnicodeCharacter(value); |
| 535 } else if (!unicode()) { | 537 } else if (!unicode()) { |
| 536 builder->AddCharacter('u'); | 538 builder->AddCharacter('u'); |
| 537 } else { | 539 } else { |
| 538 // With /u, invalid escapes are not treated as identity escapes. | 540 // With /u, invalid escapes are not treated as identity escapes. |
| 539 return ReportError(CStrVector("Invalid unicode escape")); | 541 return ReportError(CStrVector("Invalid unicode escape")); |
| 540 } | 542 } |
| 541 break; | 543 break; |
| 542 } | 544 } |
| 543 case 'k': | 545 case 'k': |
| 544 if (FLAG_harmony_regexp_named_captures && unicode()) { | 546 // Either an identity escape or a named back-reference. The two |
| 547 // interpretations are mutually exclusive: '\k' is interpreted as |
| 548 // an identity escape for non-unicode patterns without named |
| 549 // capture groups, and as the beginning of a named back-reference |
| 550 // in all other cases. |
| 551 if (FLAG_harmony_regexp_named_captures && |
| 552 (unicode() || HasNamedCaptures())) { |
| 545 Advance(2); | 553 Advance(2); |
| 546 ParseNamedBackReference(builder, state CHECK_FAILED); | 554 ParseNamedBackReference(builder, state CHECK_FAILED); |
| 547 break; | 555 break; |
| 548 } | 556 } |
| 549 // Fall through. | 557 // Fall through. |
| 550 default: | 558 default: |
| 551 Advance(); | 559 Advance(); |
| 552 // With /u, no identity escapes except for syntax characters | 560 // With /u, no identity escapes except for syntax characters |
| 553 // are allowed. Otherwise, all identity escapes are allowed. | 561 // are allowed. Otherwise, all identity escapes are allowed. |
| 554 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { | 562 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
| (...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 650 #endif | 658 #endif |
| 651 | 659 |
| 652 | 660 |
| 653 // In order to know whether an escape is a backreference or not we have to scan | 661 // In order to know whether an escape is a backreference or not we have to scan |
| 654 // the entire regexp and find the number of capturing parentheses. However we | 662 // the entire regexp and find the number of capturing parentheses. However we |
| 655 // don't want to scan the regexp twice unless it is necessary. This mini-parser | 663 // don't want to scan the regexp twice unless it is necessary. This mini-parser |
| 656 // is called when needed. It can see the difference between capturing and | 664 // is called when needed. It can see the difference between capturing and |
| 657 // noncapturing parentheses and can skip character classes and backslash-escaped | 665 // noncapturing parentheses and can skip character classes and backslash-escaped |
| 658 // characters. | 666 // characters. |
| 659 void RegExpParser::ScanForCaptures() { | 667 void RegExpParser::ScanForCaptures() { |
| 668 DCHECK(!is_scanned_for_captures_); |
| 669 const int saved_position = position(); |
| 660 // Start with captures started previous to current position | 670 // Start with captures started previous to current position |
| 661 int capture_count = captures_started(); | 671 int capture_count = captures_started(); |
| 662 // Add count of captures after this position. | 672 // Add count of captures after this position. |
| 663 int n; | 673 int n; |
| 664 while ((n = current()) != kEndMarker) { | 674 while ((n = current()) != kEndMarker) { |
| 665 Advance(); | 675 Advance(); |
| 666 switch (n) { | 676 switch (n) { |
| 667 case '\\': | 677 case '\\': |
| 668 Advance(); | 678 Advance(); |
| 669 break; | 679 break; |
| (...skipping 15 matching lines...) Expand all Loading... |
| 685 // * a non-capturing group '(:', | 695 // * a non-capturing group '(:', |
| 686 // * a lookbehind assertion '(?<=' '(?<!' | 696 // * a lookbehind assertion '(?<=' '(?<!' |
| 687 // * or a named capture '(?<'. | 697 // * or a named capture '(?<'. |
| 688 // | 698 // |
| 689 // Of these, only named captures are capturing groups. | 699 // Of these, only named captures are capturing groups. |
| 690 if (!FLAG_harmony_regexp_named_captures) break; | 700 if (!FLAG_harmony_regexp_named_captures) break; |
| 691 | 701 |
| 692 Advance(); | 702 Advance(); |
| 693 if (current() != '<') break; | 703 if (current() != '<') break; |
| 694 | 704 |
| 695 // TODO(jgruber): To be more future-proof we could test for | 705 if (FLAG_harmony_regexp_lookbehind) { |
| 696 // IdentifierStart here once it becomes clear whether group names | 706 // TODO(jgruber): To be more future-proof we could test for |
| 697 // allow unicode escapes. | 707 // IdentifierStart here once it becomes clear whether group names |
| 698 Advance(); | 708 // allow unicode escapes. |
| 699 if (current() == '=' || current() == '!') break; | 709 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 |
| 710 Advance(); |
| 711 if (current() == '=' || current() == '!') break; |
| 712 } |
| 713 |
| 714 // Found a possible named capture. It could turn out to be a syntax |
| 715 // error (e.g. an unterminated or invalid name), but that distinction |
| 716 // does not matter for our purposes. |
| 717 has_named_captures_ = true; |
| 700 } | 718 } |
| 701 capture_count++; | 719 capture_count++; |
| 702 break; | 720 break; |
| 703 } | 721 } |
| 704 } | 722 } |
| 705 capture_count_ = capture_count; | 723 capture_count_ = capture_count; |
| 706 is_scanned_for_captures_ = true; | 724 is_scanned_for_captures_ = true; |
| 725 Reset(saved_position); |
| 707 } | 726 } |
| 708 | 727 |
| 709 | 728 |
| 710 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { | 729 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { |
| 711 DCHECK_EQ('\\', current()); | 730 DCHECK_EQ('\\', current()); |
| 712 DCHECK('1' <= Next() && Next() <= '9'); | 731 DCHECK('1' <= Next() && Next() <= '9'); |
| 713 // Try to parse a decimal literal that is no greater than the total number | 732 // Try to parse a decimal literal that is no greater than the total number |
| 714 // of left capturing parentheses in the input. | 733 // of left capturing parentheses in the input. |
| 715 int start = position(); | 734 int start = position(); |
| 716 int value = Next() - '0'; | 735 int value = Next() - '0'; |
| 717 Advance(2); | 736 Advance(2); |
| 718 while (true) { | 737 while (true) { |
| 719 uc32 c = current(); | 738 uc32 c = current(); |
| 720 if (IsDecimalDigit(c)) { | 739 if (IsDecimalDigit(c)) { |
| 721 value = 10 * value + (c - '0'); | 740 value = 10 * value + (c - '0'); |
| 722 if (value > kMaxCaptures) { | 741 if (value > kMaxCaptures) { |
| 723 Reset(start); | 742 Reset(start); |
| 724 return false; | 743 return false; |
| 725 } | 744 } |
| 726 Advance(); | 745 Advance(); |
| 727 } else { | 746 } else { |
| 728 break; | 747 break; |
| 729 } | 748 } |
| 730 } | 749 } |
| 731 if (value > captures_started()) { | 750 if (value > captures_started()) { |
| 732 if (!is_scanned_for_captures_) { | 751 if (!is_scanned_for_captures_) ScanForCaptures(); |
| 733 int saved_position = position(); | |
| 734 ScanForCaptures(); | |
| 735 Reset(saved_position); | |
| 736 } | |
| 737 if (value > capture_count_) { | 752 if (value > capture_count_) { |
| 738 Reset(start); | 753 Reset(start); |
| 739 return false; | 754 return false; |
| 740 } | 755 } |
| 741 } | 756 } |
| 742 *index_out = value; | 757 *index_out = value; |
| 743 return true; | 758 return true; |
| 744 } | 759 } |
| 745 | 760 |
| 746 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { | 761 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { |
| 747 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { | 762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| 748 v->push_back(code_unit); | 763 v->push_back(code_unit); |
| 749 } else { | 764 } else { |
| 750 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); | 765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| 751 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); | 766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| 752 } | 767 } |
| 753 } | 768 } |
| 754 | 769 |
| 755 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { | 770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
| 756 DCHECK(FLAG_harmony_regexp_named_captures); | 771 DCHECK(FLAG_harmony_regexp_named_captures); |
| 757 DCHECK(unicode()); | |
| 758 | 772 |
| 759 ZoneVector<uc16>* name = | 773 ZoneVector<uc16>* name = |
| 760 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); | 774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
| 761 | 775 |
| 762 bool at_start = true; | 776 bool at_start = true; |
| 763 while (true) { | 777 while (true) { |
| 764 uc32 c = current(); | 778 uc32 c = current(); |
| 765 Advance(); | 779 Advance(); |
| 766 | 780 |
| 767 // Convert unicode escapes. | 781 // Convert unicode escapes. |
| 768 if (c == '\\' && current() == 'u') { | 782 if (c == '\\' && current() == 'u') { |
| 783 // TODO(jgruber): Reconsider this once the spec has settled. |
| 784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 |
| 769 Advance(); | 785 Advance(); |
| 770 if (!ParseUnicodeEscape(&c)) { | 786 if (!ParseUnicodeEscape(&c)) { |
| 771 ReportError(CStrVector("Invalid Unicode escape sequence")); | 787 ReportError(CStrVector("Invalid Unicode escape sequence")); |
| 772 return nullptr; | 788 return nullptr; |
| 773 } | 789 } |
| 774 } | 790 } |
| 775 | 791 |
| 776 if (at_start) { | 792 if (at_start) { |
| 777 if (!IdentifierStart::Is(c)) { | 793 if (!IdentifierStart::Is(c)) { |
| 778 ReportError(CStrVector("Invalid capture group name")); | 794 ReportError(CStrVector("Invalid capture group name")); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 791 } | 807 } |
| 792 } | 808 } |
| 793 } | 809 } |
| 794 | 810 |
| 795 return name; | 811 return name; |
| 796 } | 812 } |
| 797 | 813 |
| 798 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, | 814 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, |
| 799 int index) { | 815 int index) { |
| 800 DCHECK(FLAG_harmony_regexp_named_captures); | 816 DCHECK(FLAG_harmony_regexp_named_captures); |
| 801 DCHECK(unicode()); | |
| 802 DCHECK(0 < index && index <= captures_started_); | 817 DCHECK(0 < index && index <= captures_started_); |
| 803 DCHECK_NOT_NULL(name); | 818 DCHECK_NOT_NULL(name); |
| 804 | 819 |
| 805 if (named_captures_ == nullptr) { | 820 if (named_captures_ == nullptr) { |
| 806 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); | 821 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); |
| 807 } else { | 822 } else { |
| 808 // Check for duplicates and bail if we find any. | 823 // Check for duplicates and bail if we find any. |
| 824 // TODO(jgruber): O(n^2). |
| 809 for (const auto& named_capture : *named_captures_) { | 825 for (const auto& named_capture : *named_captures_) { |
| 810 if (*named_capture->name() == *name) { | 826 if (*named_capture->name() == *name) { |
| 811 ReportError(CStrVector("Duplicate capture group name")); | 827 ReportError(CStrVector("Duplicate capture group name")); |
| 812 return false; | 828 return false; |
| 813 } | 829 } |
| 814 } | 830 } |
| 815 } | 831 } |
| 816 | 832 |
| 817 RegExpCapture* capture = GetCapture(index); | 833 RegExpCapture* capture = GetCapture(index); |
| 818 DCHECK(capture->name() == nullptr); | 834 DCHECK(capture->name() == nullptr); |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 913 for (int i = 0; i < named_captures_->length(); i++) { | 929 for (int i = 0; i < named_captures_->length(); i++) { |
| 914 RegExpCapture* capture = named_captures_->at(i); | 930 RegExpCapture* capture = named_captures_->at(i); |
| 915 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); | 931 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); |
| 916 array->set(i * 2, *name.ToHandleChecked()); | 932 array->set(i * 2, *name.ToHandleChecked()); |
| 917 array->set(i * 2 + 1, Smi::FromInt(capture->index())); | 933 array->set(i * 2 + 1, Smi::FromInt(capture->index())); |
| 918 } | 934 } |
| 919 | 935 |
| 920 return array; | 936 return array; |
| 921 } | 937 } |
| 922 | 938 |
| 939 bool RegExpParser::HasNamedCaptures() { |
| 940 if (has_named_captures_ || is_scanned_for_captures_) { |
| 941 return has_named_captures_; |
| 942 } |
| 943 |
| 944 ScanForCaptures(); |
| 945 DCHECK(is_scanned_for_captures_); |
| 946 return has_named_captures_; |
| 947 } |
| 948 |
| 923 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { | 949 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { |
| 924 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { | 950 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { |
| 925 if (s->group_type() != CAPTURE) continue; | 951 if (s->group_type() != CAPTURE) continue; |
| 926 // Return true if we found the matching capture index. | 952 // Return true if we found the matching capture index. |
| 927 if (index == s->capture_index()) return true; | 953 if (index == s->capture_index()) return true; |
| 928 // Abort if index is larger than what has been parsed up till this state. | 954 // Abort if index is larger than what has been parsed up till this state. |
| 929 if (index > s->capture_index()) return false; | 955 if (index > s->capture_index()) return false; |
| 930 } | 956 } |
| 931 return false; | 957 return false; |
| 932 } | 958 } |
| (...skipping 905 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1838 return false; | 1864 return false; |
| 1839 } | 1865 } |
| 1840 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1841 zone()); | 1867 zone()); |
| 1842 LAST(ADD_TERM); | 1868 LAST(ADD_TERM); |
| 1843 return true; | 1869 return true; |
| 1844 } | 1870 } |
| 1845 | 1871 |
| 1846 } // namespace internal | 1872 } // namespace internal |
| 1847 } // namespace v8 | 1873 } // namespace v8 |
| OLD | NEW |