Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(26)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns (Closed)
Patch Set: Update test Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 28 matching lines...) Expand all
39 has_more_(true), 39 has_more_(true),
40 simple_(false), 40 simple_(false),
41 contains_anchor_(false), 41 contains_anchor_(false),
42 is_scanned_for_captures_(false), 42 is_scanned_for_captures_(false),
43 has_named_captures_(false), 43 has_named_captures_(false),
44 failed_(false) { 44 failed_(false) {
45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
46 Advance(); 46 Advance();
47 } 47 }
48 48
49 template <bool update_position> 49 template <bool update_position>
Yang 2017/04/05 12:52:38 I wonder whether it makes sense to make the scan m[…]
jgruber 2017/04/05 12:57:30 Agreed, will do that.
jgruber 2017/04/07 07:10:56 Done.
50 inline uc32 RegExpParser::ReadNext() { 50 inline uc32 RegExpParser::ReadNext(ScanMode mode) {
51 int position = next_pos_; 51 int position = next_pos_;
52 uc32 c0 = in()->Get(position); 52 uc32 c0 = in()->Get(position);
53 position++; 53 position++;
54 // Read the whole surrogate pair in case of unicode flag, if possible. 54 const bool try_combine_surrogate_pairs =
55 if (unicode() && position < in()->length() && 55 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
56 if (try_combine_surrogate_pairs && position < in()->length() &&
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { 57 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 uc16 c1 = in()->Get(position); 58 uc16 c1 = in()->Get(position);
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 59 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); 60 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 position++; 61 position++;
61 } 62 }
62 } 63 }
63 if (update_position) next_pos_ = position; 64 if (update_position) next_pos_ = position;
64 return c0; 65 return c0;
65 } 66 }
66 67
67 68
68 uc32 RegExpParser::Next() { 69 uc32 RegExpParser::Next() {
69 if (has_next()) { 70 if (has_next()) {
70 return ReadNext<false>(); 71 return ReadNext<false>(ScanMode::DEFAULT);
71 } else { 72 } else {
72 return kEndMarker; 73 return kEndMarker;
73 } 74 }
74 } 75 }
75 76
76 77 void RegExpParser::Advance(ScanMode mode) {
77 void RegExpParser::Advance() {
78 if (has_next()) { 78 if (has_next()) {
79 StackLimitCheck check(isolate()); 79 StackLimitCheck check(isolate());
80 if (check.HasOverflowed()) { 80 if (check.HasOverflowed()) {
81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); 81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
82 ReportError(CStrVector( 82 ReportError(CStrVector(
83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); 83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
84 } else if (zone()->excess_allocation()) { 84 } else if (zone()->excess_allocation()) {
85 ReportError(CStrVector("Regular expression too large")); 85 ReportError(CStrVector("Regular expression too large"));
86 } else { 86 } else {
87 current_ = ReadNext<true>(); 87 current_ = ReadNext<true>(mode);
88 } 88 }
89 } else { 89 } else {
90 current_ = kEndMarker; 90 current_ = kEndMarker;
91 // Advance so that position() points to 1-after-the-last-character. This is 91 // Advance so that position() points to 1-after-the-last-character. This is
92 // important so that Reset() to this position works correctly. 92 // important so that Reset() to this position works correctly.
93 next_pos_ = in()->length() + 1; 93 next_pos_ = in()->length() + 1;
94 has_more_ = false; 94 has_more_ = false;
95 } 95 }
96 } 96 }
97 97
98 98
99 void RegExpParser::Reset(int pos) { 99 void RegExpParser::Reset(int pos) {
100 next_pos_ = pos; 100 next_pos_ = pos;
101 has_more_ = (pos < in()->length()); 101 has_more_ = (pos < in()->length());
102 Advance(); 102 Advance();
103 } 103 }
104 104
105 105 void RegExpParser::Advance(int dist, ScanMode mode) {
106 void RegExpParser::Advance(int dist) {
107 next_pos_ += dist - 1; 106 next_pos_ += dist - 1;
108 Advance(); 107 Advance(mode);
109 } 108 }
110 109
111 110
112 bool RegExpParser::simple() { return simple_; } 111 bool RegExpParser::simple() { return simple_; }
113 112
114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 113 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
115 switch (c) { 114 switch (c) {
116 case '^': 115 case '^':
117 case '$': 116 case '$':
118 case '\\': 117 case '\\':
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 } else if (Next() == '!') { 321 } else if (Next() == '!') {
323 subexpr_type = NEGATIVE_LOOKAROUND; 322 subexpr_type = NEGATIVE_LOOKAROUND;
324 lookaround_type = RegExpLookaround::LOOKBEHIND; 323 lookaround_type = RegExpLookaround::LOOKBEHIND;
325 Advance(2); 324 Advance(2);
326 break; 325 break;
327 } 326 }
328 } 327 }
329 if (FLAG_harmony_regexp_named_captures) { 328 if (FLAG_harmony_regexp_named_captures) {
330 has_named_captures_ = true; 329 has_named_captures_ = true;
331 is_named_capture = true; 330 is_named_capture = true;
332 Advance();
Yang 2017/04/05 12:52:38 How come we can remove this Advance here and below[…]
jgruber 2017/04/05 12:57:30 The advance is now done in ParseCaptureGroupName.
333 break; 331 break;
334 } 332 }
335 // Fall through. 333 // Fall through.
336 default: 334 default:
337 return ReportError(CStrVector("Invalid group")); 335 return ReportError(CStrVector("Invalid group"));
338 } 336 }
339 } 337 }
340 338
341 const ZoneVector<uc16>* capture_name = nullptr; 339 const ZoneVector<uc16>* capture_name = nullptr;
342 if (subexpr_type == CAPTURE) { 340 if (subexpr_type == CAPTURE) {
(...skipping 419 matching lines...) Expand 10 before | Expand all | Expand 10 after
762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 760 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
763 v->push_back(code_unit); 761 v->push_back(code_unit);
764 } else { 762 } else {
765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 763 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 764 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
767 } 765 }
768 } 766 }
769 767
770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 768 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
771 DCHECK(FLAG_harmony_regexp_named_captures); 769 DCHECK(FLAG_harmony_regexp_named_captures);
770 DCHECK_EQ(current(), '<');
772 771
773 ZoneVector<uc16>* name = 772 ZoneVector<uc16>* name =
774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 773 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
775 774
775 // Capture names can always contain surrogate pairs, and we need to scan
776 // accordingly.
777 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
778 Advance(scan_mode);
779
776 bool at_start = true; 780 bool at_start = true;
777 while (true) { 781 while (true) {
778 uc32 c = current(); 782 uc32 c = current();
779 Advance(); 783 Advance(scan_mode);
780 784
781 // Convert unicode escapes. 785 // Convert unicode escapes.
782 if (c == '\\' && current() == 'u') { 786 if (c == '\\' && current() == 'u') {
783 // TODO(jgruber): Reconsider this once the spec has settled. 787 // TODO(jgruber): Reconsider this once the spec has settled.
784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 788 // https://github.com/tc39/proposal-regexp-named-groups/issues/23
785 Advance(); 789 Advance(scan_mode);
786 if (!ParseUnicodeEscape(&c)) { 790 if (!ParseUnicodeEscape(&c)) {
787 ReportError(CStrVector("Invalid Unicode escape sequence")); 791 ReportError(CStrVector("Invalid Unicode escape sequence"));
788 return nullptr; 792 return nullptr;
789 } 793 }
790 } 794 }
791 795
792 if (at_start) { 796 if (at_start) {
793 if (!IdentifierStart::Is(c)) { 797 if (!IdentifierStart::Is(c)) {
794 ReportError(CStrVector("Invalid capture group name")); 798 ReportError(CStrVector("Invalid capture group name"));
795 return nullptr; 799 return nullptr;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
840 } 844 }
841 845
842 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, 846 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
843 RegExpParserState* state) { 847 RegExpParserState* state) {
844 // The parser is assumed to be on the '<' in \k<name>. 848 // The parser is assumed to be on the '<' in \k<name>.
845 if (current() != '<') { 849 if (current() != '<') {
846 ReportError(CStrVector("Invalid named reference")); 850 ReportError(CStrVector("Invalid named reference"));
847 return false; 851 return false;
848 } 852 }
849 853
850 Advance();
851 const ZoneVector<uc16>* name = ParseCaptureGroupName(); 854 const ZoneVector<uc16>* name = ParseCaptureGroupName();
852 if (name == nullptr) { 855 if (name == nullptr) {
853 return false; 856 return false;
854 } 857 }
855 858
856 if (state->IsInsideCaptureGroup(name)) { 859 if (state->IsInsideCaptureGroup(name)) {
857 builder->AddEmpty(); 860 builder->AddEmpty();
858 } else { 861 } else {
859 RegExpBackReference* atom = new (zone()) RegExpBackReference(); 862 RegExpBackReference* atom = new (zone()) RegExpBackReference();
860 atom->set_name(name); 863 atom->set_name(name);
(...skipping 1003 matching lines...) Expand 10 before | Expand all | Expand 10 after
1864 return false; 1867 return false;
1865 } 1868 }
1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1869 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1867 zone()); 1870 zone());
1868 LAST(ADD_TERM); 1871 LAST(ADD_TERM);
1869 return true; 1872 return true;
1870 } 1873 }
1871 1874
1872 } // namespace internal 1875 } // namespace internal
1873 } // namespace v8 1876 } // namespace v8
OLD | NEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698