OLD | NEW |
---|---|
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 28 matching lines...) Expand all Loading... | |
39 has_more_(true), | 39 has_more_(true), |
40 simple_(false), | 40 simple_(false), |
41 contains_anchor_(false), | 41 contains_anchor_(false), |
42 is_scanned_for_captures_(false), | 42 is_scanned_for_captures_(false), |
43 has_named_captures_(false), | 43 has_named_captures_(false), |
44 failed_(false) { | 44 failed_(false) { |
45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); | 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); |
46 Advance(); | 46 Advance(); |
47 } | 47 } |
48 | 48 |
49 template <bool update_position> | 49 template <bool update_position> |
Yang
2017/04/05 12:52:38
I wonder whether it makes sense to make the scan m…
jgruber
2017/04/05 12:57:30
Agreed, will do that.
jgruber
2017/04/07 07:10:56
Done.
| |
50 inline uc32 RegExpParser::ReadNext() { | 50 inline uc32 RegExpParser::ReadNext(ScanMode mode) { |
51 int position = next_pos_; | 51 int position = next_pos_; |
52 uc32 c0 = in()->Get(position); | 52 uc32 c0 = in()->Get(position); |
53 position++; | 53 position++; |
54 // Read the whole surrogate pair in case of unicode flag, if possible. | 54 const bool try_combine_surrogate_pairs = |
55 if (unicode() && position < in()->length() && | 55 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS); |
56 if (try_combine_surrogate_pairs && position < in()->length() && | |
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | 57 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
57 uc16 c1 = in()->Get(position); | 58 uc16 c1 = in()->Get(position); |
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { | 59 if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | 60 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
60 position++; | 61 position++; |
61 } | 62 } |
62 } | 63 } |
63 if (update_position) next_pos_ = position; | 64 if (update_position) next_pos_ = position; |
64 return c0; | 65 return c0; |
65 } | 66 } |
66 | 67 |
67 | 68 |
68 uc32 RegExpParser::Next() { | 69 uc32 RegExpParser::Next() { |
69 if (has_next()) { | 70 if (has_next()) { |
70 return ReadNext<false>(); | 71 return ReadNext<false>(ScanMode::DEFAULT); |
71 } else { | 72 } else { |
72 return kEndMarker; | 73 return kEndMarker; |
73 } | 74 } |
74 } | 75 } |
75 | 76 |
76 | 77 void RegExpParser::Advance(ScanMode mode) { |
77 void RegExpParser::Advance() { | |
78 if (has_next()) { | 78 if (has_next()) { |
79 StackLimitCheck check(isolate()); | 79 StackLimitCheck check(isolate()); |
80 if (check.HasOverflowed()) { | 80 if (check.HasOverflowed()) { |
81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); | 81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); |
82 ReportError(CStrVector( | 82 ReportError(CStrVector( |
83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); | 83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); |
84 } else if (zone()->excess_allocation()) { | 84 } else if (zone()->excess_allocation()) { |
85 ReportError(CStrVector("Regular expression too large")); | 85 ReportError(CStrVector("Regular expression too large")); |
86 } else { | 86 } else { |
87 current_ = ReadNext<true>(); | 87 current_ = ReadNext<true>(mode); |
88 } | 88 } |
89 } else { | 89 } else { |
90 current_ = kEndMarker; | 90 current_ = kEndMarker; |
91 // Advance so that position() points to 1-after-the-last-character. This is | 91 // Advance so that position() points to 1-after-the-last-character. This is |
92 // important so that Reset() to this position works correctly. | 92 // important so that Reset() to this position works correctly. |
93 next_pos_ = in()->length() + 1; | 93 next_pos_ = in()->length() + 1; |
94 has_more_ = false; | 94 has_more_ = false; |
95 } | 95 } |
96 } | 96 } |
97 | 97 |
98 | 98 |
99 void RegExpParser::Reset(int pos) { | 99 void RegExpParser::Reset(int pos) { |
100 next_pos_ = pos; | 100 next_pos_ = pos; |
101 has_more_ = (pos < in()->length()); | 101 has_more_ = (pos < in()->length()); |
102 Advance(); | 102 Advance(); |
103 } | 103 } |
104 | 104 |
105 | 105 void RegExpParser::Advance(int dist, ScanMode mode) { |
106 void RegExpParser::Advance(int dist) { | |
107 next_pos_ += dist - 1; | 106 next_pos_ += dist - 1; |
108 Advance(); | 107 Advance(mode); |
109 } | 108 } |
110 | 109 |
111 | 110 |
112 bool RegExpParser::simple() { return simple_; } | 111 bool RegExpParser::simple() { return simple_; } |
113 | 112 |
114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { | 113 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
115 switch (c) { | 114 switch (c) { |
116 case '^': | 115 case '^': |
117 case '$': | 116 case '$': |
118 case '\\': | 117 case '\\': |
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
322 } else if (Next() == '!') { | 321 } else if (Next() == '!') { |
323 subexpr_type = NEGATIVE_LOOKAROUND; | 322 subexpr_type = NEGATIVE_LOOKAROUND; |
324 lookaround_type = RegExpLookaround::LOOKBEHIND; | 323 lookaround_type = RegExpLookaround::LOOKBEHIND; |
325 Advance(2); | 324 Advance(2); |
326 break; | 325 break; |
327 } | 326 } |
328 } | 327 } |
329 if (FLAG_harmony_regexp_named_captures) { | 328 if (FLAG_harmony_regexp_named_captures) { |
330 has_named_captures_ = true; | 329 has_named_captures_ = true; |
331 is_named_capture = true; | 330 is_named_capture = true; |
332 Advance(); | |
Yang
2017/04/05 12:52:38
How come we can remove this Advance here and below?
jgruber
2017/04/05 12:57:30
The advance is now done in ParseCaptureGroupName.
| |
333 break; | 331 break; |
334 } | 332 } |
335 // Fall through. | 333 // Fall through. |
336 default: | 334 default: |
337 return ReportError(CStrVector("Invalid group")); | 335 return ReportError(CStrVector("Invalid group")); |
338 } | 336 } |
339 } | 337 } |
340 | 338 |
341 const ZoneVector<uc16>* capture_name = nullptr; | 339 const ZoneVector<uc16>* capture_name = nullptr; |
342 if (subexpr_type == CAPTURE) { | 340 if (subexpr_type == CAPTURE) { |
(...skipping 419 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { | 760 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
763 v->push_back(code_unit); | 761 v->push_back(code_unit); |
764 } else { | 762 } else { |
765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); | 763 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); | 764 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
767 } | 765 } |
768 } | 766 } |
769 | 767 |
770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { | 768 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
771 DCHECK(FLAG_harmony_regexp_named_captures); | 769 DCHECK(FLAG_harmony_regexp_named_captures); |
770 DCHECK_EQ(current(), '<'); | |
772 | 771 |
773 ZoneVector<uc16>* name = | 772 ZoneVector<uc16>* name = |
774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); | 773 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
775 | 774 |
775 // Capture names can always contain surrogate pairs, and we need to scan | |
776 // accordingly. | |
777 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS; | |
778 Advance(scan_mode); | |
779 | |
776 bool at_start = true; | 780 bool at_start = true; |
777 while (true) { | 781 while (true) { |
778 uc32 c = current(); | 782 uc32 c = current(); |
779 Advance(); | 783 Advance(scan_mode); |
780 | 784 |
781 // Convert unicode escapes. | 785 // Convert unicode escapes. |
782 if (c == '\\' && current() == 'u') { | 786 if (c == '\\' && current() == 'u') { |
783 // TODO(jgruber): Reconsider this once the spec has settled. | 787 // TODO(jgruber): Reconsider this once the spec has settled. |
784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 | 788 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 |
785 Advance(); | 789 Advance(scan_mode); |
786 if (!ParseUnicodeEscape(&c)) { | 790 if (!ParseUnicodeEscape(&c)) { |
787 ReportError(CStrVector("Invalid Unicode escape sequence")); | 791 ReportError(CStrVector("Invalid Unicode escape sequence")); |
788 return nullptr; | 792 return nullptr; |
789 } | 793 } |
790 } | 794 } |
791 | 795 |
792 if (at_start) { | 796 if (at_start) { |
793 if (!IdentifierStart::Is(c)) { | 797 if (!IdentifierStart::Is(c)) { |
794 ReportError(CStrVector("Invalid capture group name")); | 798 ReportError(CStrVector("Invalid capture group name")); |
795 return nullptr; | 799 return nullptr; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
840 } | 844 } |
841 | 845 |
842 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, | 846 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
843 RegExpParserState* state) { | 847 RegExpParserState* state) { |
844 // The parser is assumed to be on the '<' in \k<name>. | 848 // The parser is assumed to be on the '<' in \k<name>. |
845 if (current() != '<') { | 849 if (current() != '<') { |
846 ReportError(CStrVector("Invalid named reference")); | 850 ReportError(CStrVector("Invalid named reference")); |
847 return false; | 851 return false; |
848 } | 852 } |
849 | 853 |
850 Advance(); | |
851 const ZoneVector<uc16>* name = ParseCaptureGroupName(); | 854 const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
852 if (name == nullptr) { | 855 if (name == nullptr) { |
853 return false; | 856 return false; |
854 } | 857 } |
855 | 858 |
856 if (state->IsInsideCaptureGroup(name)) { | 859 if (state->IsInsideCaptureGroup(name)) { |
857 builder->AddEmpty(); | 860 builder->AddEmpty(); |
858 } else { | 861 } else { |
859 RegExpBackReference* atom = new (zone()) RegExpBackReference(); | 862 RegExpBackReference* atom = new (zone()) RegExpBackReference(); |
860 atom->set_name(name); | 863 atom->set_name(name); |
(...skipping 1003 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1864 return false; | 1867 return false; |
1865 } | 1868 } |
1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1869 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1867 zone()); | 1870 zone()); |
1868 LAST(ADD_TERM); | 1871 LAST(ADD_TERM); |
1869 return true; | 1872 return true; |
1870 } | 1873 } |
1871 | 1874 |
1872 } // namespace internal | 1875 } // namespace internal |
1873 } // namespace v8 | 1876 } // namespace v8 |
OLD | NEW |