| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| (...skipping 28 matching lines...) Expand all Loading... |
| 39 has_more_(true), | 39 has_more_(true), |
| 40 simple_(false), | 40 simple_(false), |
| 41 contains_anchor_(false), | 41 contains_anchor_(false), |
| 42 is_scanned_for_captures_(false), | 42 is_scanned_for_captures_(false), |
| 43 has_named_captures_(false), | 43 has_named_captures_(false), |
| 44 failed_(false) { | 44 failed_(false) { |
| 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); | 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); |
| 46 Advance(); | 46 Advance(); |
| 47 } | 47 } |
| 48 | 48 |
| 49 template <bool update_position> | 49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) { |
| 50 inline uc32 RegExpParser::ReadNext() { | |
| 51 int position = next_pos_; | 50 int position = next_pos_; |
| 52 uc32 c0 = in()->Get(position); | 51 uc32 c0 = in()->Get(position); |
| 53 position++; | 52 position++; |
| 54 // Read the whole surrogate pair in case of unicode flag, if possible. | 53 const bool try_combine_surrogate_pairs = |
| 55 if (unicode() && position < in()->length() && | 54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS); |
| 55 if (try_combine_surrogate_pairs && position < in()->length() && |
| 56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | 56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
| 57 uc16 c1 = in()->Get(position); | 57 uc16 c1 = in()->Get(position); |
| 58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { | 58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| 59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | 59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
| 60 position++; | 60 position++; |
| 61 } | 61 } |
| 62 } | 62 } |
| 63 if (update_position) next_pos_ = position; | 63 if (update_position) next_pos_ = position; |
| 64 return c0; | 64 return c0; |
| 65 } | 65 } |
| 66 | 66 |
| 67 | 67 |
| 68 uc32 RegExpParser::Next() { | 68 uc32 RegExpParser::Next() { |
| 69 if (has_next()) { | 69 if (has_next()) { |
| 70 return ReadNext<false>(); | 70 return ReadNext(false, ScanMode::DEFAULT); |
| 71 } else { | 71 } else { |
| 72 return kEndMarker; | 72 return kEndMarker; |
| 73 } | 73 } |
| 74 } | 74 } |
| 75 | 75 |
| 76 | 76 void RegExpParser::Advance(ScanMode mode) { |
| 77 void RegExpParser::Advance() { | |
| 78 if (has_next()) { | 77 if (has_next()) { |
| 79 StackLimitCheck check(isolate()); | 78 StackLimitCheck check(isolate()); |
| 80 if (check.HasOverflowed()) { | 79 if (check.HasOverflowed()) { |
| 81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); | 80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); |
| 82 ReportError(CStrVector( | 81 ReportError(CStrVector( |
| 83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); | 82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); |
| 84 } else if (zone()->excess_allocation()) { | 83 } else if (zone()->excess_allocation()) { |
| 85 ReportError(CStrVector("Regular expression too large")); | 84 ReportError(CStrVector("Regular expression too large")); |
| 86 } else { | 85 } else { |
| 87 current_ = ReadNext<true>(); | 86 current_ = ReadNext(true, mode); |
| 88 } | 87 } |
| 89 } else { | 88 } else { |
| 90 current_ = kEndMarker; | 89 current_ = kEndMarker; |
| 91 // Advance so that position() points to 1-after-the-last-character. This is | 90 // Advance so that position() points to 1-after-the-last-character. This is |
| 92 // important so that Reset() to this position works correctly. | 91 // important so that Reset() to this position works correctly. |
| 93 next_pos_ = in()->length() + 1; | 92 next_pos_ = in()->length() + 1; |
| 94 has_more_ = false; | 93 has_more_ = false; |
| 95 } | 94 } |
| 96 } | 95 } |
| 97 | 96 |
| 98 | 97 |
| 99 void RegExpParser::Reset(int pos) { | 98 void RegExpParser::Reset(int pos) { |
| 100 next_pos_ = pos; | 99 next_pos_ = pos; |
| 101 has_more_ = (pos < in()->length()); | 100 has_more_ = (pos < in()->length()); |
| 102 Advance(); | 101 Advance(); |
| 103 } | 102 } |
| 104 | 103 |
| 105 | 104 void RegExpParser::Advance(int dist, ScanMode mode) { |
| 106 void RegExpParser::Advance(int dist) { | |
| 107 next_pos_ += dist - 1; | 105 next_pos_ += dist - 1; |
| 108 Advance(); | 106 Advance(mode); |
| 109 } | 107 } |
| 110 | 108 |
| 111 | 109 |
| 112 bool RegExpParser::simple() { return simple_; } | 110 bool RegExpParser::simple() { return simple_; } |
| 113 | 111 |
| 114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { | 112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
| 115 switch (c) { | 113 switch (c) { |
| 116 case '^': | 114 case '^': |
| 117 case '$': | 115 case '$': |
| 118 case '\\': | 116 case '\\': |
| (...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 322 } else if (Next() == '!') { | 320 } else if (Next() == '!') { |
| 323 subexpr_type = NEGATIVE_LOOKAROUND; | 321 subexpr_type = NEGATIVE_LOOKAROUND; |
| 324 lookaround_type = RegExpLookaround::LOOKBEHIND; | 322 lookaround_type = RegExpLookaround::LOOKBEHIND; |
| 325 Advance(2); | 323 Advance(2); |
| 326 break; | 324 break; |
| 327 } | 325 } |
| 328 } | 326 } |
| 329 if (FLAG_harmony_regexp_named_captures) { | 327 if (FLAG_harmony_regexp_named_captures) { |
| 330 has_named_captures_ = true; | 328 has_named_captures_ = true; |
| 331 is_named_capture = true; | 329 is_named_capture = true; |
| 332 Advance(); | |
| 333 break; | 330 break; |
| 334 } | 331 } |
| 335 // Fall through. | 332 // Fall through. |
| 336 default: | 333 default: |
| 337 return ReportError(CStrVector("Invalid group")); | 334 return ReportError(CStrVector("Invalid group")); |
| 338 } | 335 } |
| 339 } | 336 } |
| 340 | 337 |
| 341 const ZoneVector<uc16>* capture_name = nullptr; | 338 const ZoneVector<uc16>* capture_name = nullptr; |
| 342 if (subexpr_type == CAPTURE) { | 339 if (subexpr_type == CAPTURE) { |
| (...skipping 419 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { | 759 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| 763 v->push_back(code_unit); | 760 v->push_back(code_unit); |
| 764 } else { | 761 } else { |
| 765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); | 762 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| 766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); | 763 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| 767 } | 764 } |
| 768 } | 765 } |
| 769 | 766 |
| 770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { | 767 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
| 771 DCHECK(FLAG_harmony_regexp_named_captures); | 768 DCHECK(FLAG_harmony_regexp_named_captures); |
| 769 DCHECK_EQ(current(), '<'); |
| 772 | 770 |
| 773 ZoneVector<uc16>* name = | 771 ZoneVector<uc16>* name = |
| 774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); | 772 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
| 775 | 773 |
| 774 // Capture names can always contain surrogate pairs, and we need to scan |
| 775 // accordingly. |
| 776 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS; |
| 777 Advance(scan_mode); |
| 778 |
| 776 bool at_start = true; | 779 bool at_start = true; |
| 777 while (true) { | 780 while (true) { |
| 778 uc32 c = current(); | 781 uc32 c = current(); |
| 779 Advance(); | 782 Advance(scan_mode); |
| 780 | 783 |
| 781 // Convert unicode escapes. | 784 // Convert unicode escapes. |
| 782 if (c == '\\' && current() == 'u') { | 785 if (c == '\\' && current() == 'u') { |
| 783 // TODO(jgruber): Reconsider this once the spec has settled. | 786 // TODO(jgruber): Reconsider this once the spec has settled. |
| 784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 | 787 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 |
| 785 Advance(); | 788 Advance(scan_mode); |
| 786 if (!ParseUnicodeEscape(&c)) { | 789 if (!ParseUnicodeEscape(&c)) { |
| 787 ReportError(CStrVector("Invalid Unicode escape sequence")); | 790 ReportError(CStrVector("Invalid Unicode escape sequence")); |
| 788 return nullptr; | 791 return nullptr; |
| 789 } | 792 } |
| 790 } | 793 } |
| 791 | 794 |
| 792 // The backslash char is misclassified as both ID_Start and ID_Continue. | 795 // The backslash char is misclassified as both ID_Start and ID_Continue. |
| 793 if (c == '\\') { | 796 if (c == '\\') { |
| 794 ReportError(CStrVector("Invalid capture group name")); | 797 ReportError(CStrVector("Invalid capture group name")); |
| 795 return nullptr; | 798 return nullptr; |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 846 } | 849 } |
| 847 | 850 |
| 848 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, | 851 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
| 849 RegExpParserState* state) { | 852 RegExpParserState* state) { |
| 850 // The parser is assumed to be on the '<' in \k<name>. | 853 // The parser is assumed to be on the '<' in \k<name>. |
| 851 if (current() != '<') { | 854 if (current() != '<') { |
| 852 ReportError(CStrVector("Invalid named reference")); | 855 ReportError(CStrVector("Invalid named reference")); |
| 853 return false; | 856 return false; |
| 854 } | 857 } |
| 855 | 858 |
| 856 Advance(); | |
| 857 const ZoneVector<uc16>* name = ParseCaptureGroupName(); | 859 const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
| 858 if (name == nullptr) { | 860 if (name == nullptr) { |
| 859 return false; | 861 return false; |
| 860 } | 862 } |
| 861 | 863 |
| 862 if (state->IsInsideCaptureGroup(name)) { | 864 if (state->IsInsideCaptureGroup(name)) { |
| 863 builder->AddEmpty(); | 865 builder->AddEmpty(); |
| 864 } else { | 866 } else { |
| 865 RegExpBackReference* atom = new (zone()) RegExpBackReference(); | 867 RegExpBackReference* atom = new (zone()) RegExpBackReference(); |
| 866 atom->set_name(name); | 868 atom->set_name(name); |
| (...skipping 1003 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1870 return false; | 1872 return false; |
| 1871 } | 1873 } |
| 1872 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1874 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1873 zone()); | 1875 zone()); |
| 1874 LAST(ADD_TERM); | 1876 LAST(ADD_TERM); |
| 1875 return true; | 1877 return true; |
| 1876 } | 1878 } |
| 1877 | 1879 |
| 1878 } // namespace internal | 1880 } // namespace internal |
| 1879 } // namespace v8 | 1881 } // namespace v8 |
| OLD | NEW |