OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 28 matching lines...) Expand all Loading... |
39 has_more_(true), | 39 has_more_(true), |
40 simple_(false), | 40 simple_(false), |
41 contains_anchor_(false), | 41 contains_anchor_(false), |
42 is_scanned_for_captures_(false), | 42 is_scanned_for_captures_(false), |
43 has_named_captures_(false), | 43 has_named_captures_(false), |
44 failed_(false) { | 44 failed_(false) { |
45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); | 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); |
46 Advance(); | 46 Advance(); |
47 } | 47 } |
48 | 48 |
49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) { | 49 template <bool update_position> |
| 50 inline uc32 RegExpParser::ReadNext() { |
50 int position = next_pos_; | 51 int position = next_pos_; |
51 uc32 c0 = in()->Get(position); | 52 uc32 c0 = in()->Get(position); |
52 position++; | 53 position++; |
53 const bool try_combine_surrogate_pairs = | 54 // Read the whole surrogate pair in case of unicode flag, if possible. |
54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS); | 55 if (unicode() && position < in()->length() && |
55 if (try_combine_surrogate_pairs && position < in()->length() && | |
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | 56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
57 uc16 c1 = in()->Get(position); | 57 uc16 c1 = in()->Get(position); |
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { | 58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | 59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
60 position++; | 60 position++; |
61 } | 61 } |
62 } | 62 } |
63 if (update_position) next_pos_ = position; | 63 if (update_position) next_pos_ = position; |
64 return c0; | 64 return c0; |
65 } | 65 } |
66 | 66 |
67 | 67 |
68 uc32 RegExpParser::Next() { | 68 uc32 RegExpParser::Next() { |
69 if (has_next()) { | 69 if (has_next()) { |
70 return ReadNext(false, ScanMode::DEFAULT); | 70 return ReadNext<false>(); |
71 } else { | 71 } else { |
72 return kEndMarker; | 72 return kEndMarker; |
73 } | 73 } |
74 } | 74 } |
75 | 75 |
76 void RegExpParser::Advance(ScanMode mode) { | 76 void RegExpParser::Advance() { |
77 if (has_next()) { | 77 if (has_next()) { |
78 StackLimitCheck check(isolate()); | 78 StackLimitCheck check(isolate()); |
79 if (check.HasOverflowed()) { | 79 if (check.HasOverflowed()) { |
80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); | 80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); |
81 ReportError(CStrVector( | 81 ReportError(CStrVector( |
82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); | 82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); |
83 } else if (zone()->excess_allocation()) { | 83 } else if (zone()->excess_allocation()) { |
84 ReportError(CStrVector("Regular expression too large")); | 84 ReportError(CStrVector("Regular expression too large")); |
85 } else { | 85 } else { |
86 current_ = ReadNext(true, mode); | 86 current_ = ReadNext<true>(); |
87 } | 87 } |
88 } else { | 88 } else { |
89 current_ = kEndMarker; | 89 current_ = kEndMarker; |
90 // Advance so that position() points to 1-after-the-last-character. This is | 90 // Advance so that position() points to 1-after-the-last-character. This is |
91 // important so that Reset() to this position works correctly. | 91 // important so that Reset() to this position works correctly. |
92 next_pos_ = in()->length() + 1; | 92 next_pos_ = in()->length() + 1; |
93 has_more_ = false; | 93 has_more_ = false; |
94 } | 94 } |
95 } | 95 } |
96 | 96 |
97 | 97 |
98 void RegExpParser::Reset(int pos) { | 98 void RegExpParser::Reset(int pos) { |
99 next_pos_ = pos; | 99 next_pos_ = pos; |
100 has_more_ = (pos < in()->length()); | 100 has_more_ = (pos < in()->length()); |
101 Advance(); | 101 Advance(); |
102 } | 102 } |
103 | 103 |
104 void RegExpParser::Advance(int dist, ScanMode mode) { | 104 void RegExpParser::Advance(int dist) { |
105 next_pos_ += dist - 1; | 105 next_pos_ += dist - 1; |
106 Advance(mode); | 106 Advance(); |
107 } | 107 } |
108 | 108 |
109 | 109 |
110 bool RegExpParser::simple() { return simple_; } | 110 bool RegExpParser::simple() { return simple_; } |
111 | 111 |
112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { | 112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
113 switch (c) { | 113 switch (c) { |
114 case '^': | 114 case '^': |
115 case '$': | 115 case '$': |
116 case '\\': | 116 case '\\': |
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
319 } else if (Next() == '!') { | 319 } else if (Next() == '!') { |
320 subexpr_type = NEGATIVE_LOOKAROUND; | 320 subexpr_type = NEGATIVE_LOOKAROUND; |
321 lookaround_type = RegExpLookaround::LOOKBEHIND; | 321 lookaround_type = RegExpLookaround::LOOKBEHIND; |
322 Advance(2); | 322 Advance(2); |
323 break; | 323 break; |
324 } | 324 } |
325 } | 325 } |
326 if (FLAG_harmony_regexp_named_captures) { | 326 if (FLAG_harmony_regexp_named_captures) { |
327 has_named_captures_ = true; | 327 has_named_captures_ = true; |
328 is_named_capture = true; | 328 is_named_capture = true; |
| 329 Advance(); |
329 break; | 330 break; |
330 } | 331 } |
331 // Fall through. | 332 // Fall through. |
332 default: | 333 default: |
333 return ReportError(CStrVector("Invalid group")); | 334 return ReportError(CStrVector("Invalid group")); |
334 } | 335 } |
335 } | 336 } |
336 | 337 |
337 const ZoneVector<uc16>* capture_name = nullptr; | 338 const ZoneVector<uc16>* capture_name = nullptr; |
338 if (subexpr_type == CAPTURE) { | 339 if (subexpr_type == CAPTURE) { |
(...skipping 415 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
754 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { | 755 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
755 v->push_back(code_unit); | 756 v->push_back(code_unit); |
756 } else { | 757 } else { |
757 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); | 758 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
758 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); | 759 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
759 } | 760 } |
760 } | 761 } |
761 | 762 |
762 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { | 763 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
763 DCHECK(FLAG_harmony_regexp_named_captures); | 764 DCHECK(FLAG_harmony_regexp_named_captures); |
764 DCHECK_EQ(current(), '<'); | |
765 | 765 |
766 ZoneVector<uc16>* name = | 766 ZoneVector<uc16>* name = |
767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); | 767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
768 | 768 |
769 // Capture names can always contain surrogate pairs, and we need to scan | |
770 // accordingly. | |
771 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS; | |
772 Advance(scan_mode); | |
773 | |
774 bool at_start = true; | 769 bool at_start = true; |
775 while (true) { | 770 while (true) { |
776 uc32 c = current(); | 771 uc32 c = current(); |
777 Advance(scan_mode); | 772 Advance(); |
778 | 773 |
779 // Convert unicode escapes. | 774 // Convert unicode escapes. |
780 if (c == '\\' && current() == 'u') { | 775 if (c == '\\' && current() == 'u') { |
781 Advance(scan_mode); | 776 Advance(); |
782 if (!ParseUnicodeEscape(&c)) { | 777 if (!ParseUnicodeEscape(&c)) { |
783 ReportError(CStrVector("Invalid Unicode escape sequence")); | 778 ReportError(CStrVector("Invalid Unicode escape sequence")); |
784 return nullptr; | 779 return nullptr; |
785 } | 780 } |
786 } | 781 } |
787 | 782 |
788 // The backslash char is misclassified as both ID_Start and ID_Continue. | 783 // The backslash char is misclassified as both ID_Start and ID_Continue. |
789 if (c == '\\') { | 784 if (c == '\\') { |
790 ReportError(CStrVector("Invalid capture group name")); | 785 ReportError(CStrVector("Invalid capture group name")); |
791 return nullptr; | 786 return nullptr; |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
842 } | 837 } |
843 | 838 |
844 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, | 839 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
845 RegExpParserState* state) { | 840 RegExpParserState* state) { |
846 // The parser is assumed to be on the '<' in \k<name>. | 841 // The parser is assumed to be on the '<' in \k<name>. |
847 if (current() != '<') { | 842 if (current() != '<') { |
848 ReportError(CStrVector("Invalid named reference")); | 843 ReportError(CStrVector("Invalid named reference")); |
849 return false; | 844 return false; |
850 } | 845 } |
851 | 846 |
| 847 Advance(); |
852 const ZoneVector<uc16>* name = ParseCaptureGroupName(); | 848 const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
853 if (name == nullptr) { | 849 if (name == nullptr) { |
854 return false; | 850 return false; |
855 } | 851 } |
856 | 852 |
857 if (state->IsInsideCaptureGroup(name)) { | 853 if (state->IsInsideCaptureGroup(name)) { |
858 builder->AddEmpty(); | 854 builder->AddEmpty(); |
859 } else { | 855 } else { |
860 RegExpBackReference* atom = new (zone()) RegExpBackReference(); | 856 RegExpBackReference* atom = new (zone()) RegExpBackReference(); |
861 atom->set_name(name); | 857 atom->set_name(name); |
(...skipping 1072 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1934 return false; | 1930 return false; |
1935 } | 1931 } |
1936 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1932 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1937 zone()); | 1933 zone()); |
1938 LAST(ADD_TERM); | 1934 LAST(ADD_TERM); |
1939 return true; | 1935 return true; |
1940 } | 1936 } |
1941 | 1937 |
1942 } // namespace internal | 1938 } // namespace internal |
1943 } // namespace v8 | 1939 } // namespace v8 |
OLD | NEW |