Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(257)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns (Closed)
Patch Set: Remove template parameter from ReadNext (created 3 years, 8 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 28 matching lines...) Expand all
39 has_more_(true), 39 has_more_(true),
40 simple_(false), 40 simple_(false),
41 contains_anchor_(false), 41 contains_anchor_(false),
42 is_scanned_for_captures_(false), 42 is_scanned_for_captures_(false),
43 has_named_captures_(false), 43 has_named_captures_(false),
44 failed_(false) { 44 failed_(false) {
45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
46 Advance(); 46 Advance();
47 } 47 }
48 48
49 template <bool update_position> 49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
50 inline uc32 RegExpParser::ReadNext() {
51 int position = next_pos_; 50 int position = next_pos_;
52 uc32 c0 = in()->Get(position); 51 uc32 c0 = in()->Get(position);
53 position++; 52 position++;
54 // Read the whole surrogate pair in case of unicode flag, if possible. 53 const bool try_combine_surrogate_pairs =
55 if (unicode() && position < in()->length() && 54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
55 if (try_combine_surrogate_pairs && position < in()->length() &&
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { 56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 uc16 c1 = in()->Get(position); 57 uc16 c1 = in()->Get(position);
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); 59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 position++; 60 position++;
61 } 61 }
62 } 62 }
63 if (update_position) next_pos_ = position; 63 if (update_position) next_pos_ = position;
64 return c0; 64 return c0;
65 } 65 }
66 66
67 67
68 uc32 RegExpParser::Next() { 68 uc32 RegExpParser::Next() {
69 if (has_next()) { 69 if (has_next()) {
70 return ReadNext<false>(); 70 return ReadNext(false, ScanMode::DEFAULT);
71 } else { 71 } else {
72 return kEndMarker; 72 return kEndMarker;
73 } 73 }
74 } 74 }
75 75
76 76 void RegExpParser::Advance(ScanMode mode) {
77 void RegExpParser::Advance() {
78 if (has_next()) { 77 if (has_next()) {
79 StackLimitCheck check(isolate()); 78 StackLimitCheck check(isolate());
80 if (check.HasOverflowed()) { 79 if (check.HasOverflowed()) {
81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); 80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
82 ReportError(CStrVector( 81 ReportError(CStrVector(
83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); 82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
84 } else if (zone()->excess_allocation()) { 83 } else if (zone()->excess_allocation()) {
85 ReportError(CStrVector("Regular expression too large")); 84 ReportError(CStrVector("Regular expression too large"));
86 } else { 85 } else {
87 current_ = ReadNext<true>(); 86 current_ = ReadNext(true, mode);
88 } 87 }
89 } else { 88 } else {
90 current_ = kEndMarker; 89 current_ = kEndMarker;
91 // Advance so that position() points to 1-after-the-last-character. This is 90 // Advance so that position() points to 1-after-the-last-character. This is
92 // important so that Reset() to this position works correctly. 91 // important so that Reset() to this position works correctly.
93 next_pos_ = in()->length() + 1; 92 next_pos_ = in()->length() + 1;
94 has_more_ = false; 93 has_more_ = false;
95 } 94 }
96 } 95 }
97 96
98 97
99 void RegExpParser::Reset(int pos) { 98 void RegExpParser::Reset(int pos) {
100 next_pos_ = pos; 99 next_pos_ = pos;
101 has_more_ = (pos < in()->length()); 100 has_more_ = (pos < in()->length());
102 Advance(); 101 Advance();
103 } 102 }
104 103
105 104 void RegExpParser::Advance(int dist, ScanMode mode) {
106 void RegExpParser::Advance(int dist) {
107 next_pos_ += dist - 1; 105 next_pos_ += dist - 1;
108 Advance(); 106 Advance(mode);
109 } 107 }
110 108
111 109
112 bool RegExpParser::simple() { return simple_; } 110 bool RegExpParser::simple() { return simple_; }
113 111
114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
115 switch (c) { 113 switch (c) {
116 case '^': 114 case '^':
117 case '$': 115 case '$':
118 case '\\': 116 case '\\':
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
322 } else if (Next() == '!') { 320 } else if (Next() == '!') {
323 subexpr_type = NEGATIVE_LOOKAROUND; 321 subexpr_type = NEGATIVE_LOOKAROUND;
324 lookaround_type = RegExpLookaround::LOOKBEHIND; 322 lookaround_type = RegExpLookaround::LOOKBEHIND;
325 Advance(2); 323 Advance(2);
326 break; 324 break;
327 } 325 }
328 } 326 }
329 if (FLAG_harmony_regexp_named_captures) { 327 if (FLAG_harmony_regexp_named_captures) {
330 has_named_captures_ = true; 328 has_named_captures_ = true;
331 is_named_capture = true; 329 is_named_capture = true;
332 Advance();
333 break; 330 break;
334 } 331 }
335 // Fall through. 332 // Fall through.
336 default: 333 default:
337 return ReportError(CStrVector("Invalid group")); 334 return ReportError(CStrVector("Invalid group"));
338 } 335 }
339 } 336 }
340 337
341 const ZoneVector<uc16>* capture_name = nullptr; 338 const ZoneVector<uc16>* capture_name = nullptr;
342 if (subexpr_type == CAPTURE) { 339 if (subexpr_type == CAPTURE) {
(...skipping 419 matching lines...) Expand 10 before | Expand all | Expand 10 after
762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 759 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
763 v->push_back(code_unit); 760 v->push_back(code_unit);
764 } else { 761 } else {
765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 762 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 763 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
767 } 764 }
768 } 765 }
769 766
770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 767 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
771 DCHECK(FLAG_harmony_regexp_named_captures); 768 DCHECK(FLAG_harmony_regexp_named_captures);
769 DCHECK_EQ(current(), '<');
772 770
773 ZoneVector<uc16>* name = 771 ZoneVector<uc16>* name =
774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 772 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
775 773
774 // Capture names can always contain surrogate pairs, and we need to scan
775 // accordingly.
776 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
777 Advance(scan_mode);
778
776 bool at_start = true; 779 bool at_start = true;
777 while (true) { 780 while (true) {
778 uc32 c = current(); 781 uc32 c = current();
779 Advance(); 782 Advance(scan_mode);
780 783
781 // Convert unicode escapes. 784 // Convert unicode escapes.
782 if (c == '\\' && current() == 'u') { 785 if (c == '\\' && current() == 'u') {
783 // TODO(jgruber): Reconsider this once the spec has settled. 786 // TODO(jgruber): Reconsider this once the spec has settled.
784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23 787 // https://github.com/tc39/proposal-regexp-named-groups/issues/23
785 Advance(); 788 Advance(scan_mode);
786 if (!ParseUnicodeEscape(&c)) { 789 if (!ParseUnicodeEscape(&c)) {
787 ReportError(CStrVector("Invalid Unicode escape sequence")); 790 ReportError(CStrVector("Invalid Unicode escape sequence"));
788 return nullptr; 791 return nullptr;
789 } 792 }
790 } 793 }
791 794
792 // The backslash char is misclassified as both ID_Start and ID_Continue. 795 // The backslash char is misclassified as both ID_Start and ID_Continue.
793 if (c == '\\') { 796 if (c == '\\') {
794 ReportError(CStrVector("Invalid capture group name")); 797 ReportError(CStrVector("Invalid capture group name"));
795 return nullptr; 798 return nullptr;
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
846 } 849 }
847 850
848 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, 851 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
849 RegExpParserState* state) { 852 RegExpParserState* state) {
850 // The parser is assumed to be on the '<' in \k<name>. 853 // The parser is assumed to be on the '<' in \k<name>.
851 if (current() != '<') { 854 if (current() != '<') {
852 ReportError(CStrVector("Invalid named reference")); 855 ReportError(CStrVector("Invalid named reference"));
853 return false; 856 return false;
854 } 857 }
855 858
856 Advance();
857 const ZoneVector<uc16>* name = ParseCaptureGroupName(); 859 const ZoneVector<uc16>* name = ParseCaptureGroupName();
858 if (name == nullptr) { 860 if (name == nullptr) {
859 return false; 861 return false;
860 } 862 }
861 863
862 if (state->IsInsideCaptureGroup(name)) { 864 if (state->IsInsideCaptureGroup(name)) {
863 builder->AddEmpty(); 865 builder->AddEmpty();
864 } else { 866 } else {
865 RegExpBackReference* atom = new (zone()) RegExpBackReference(); 867 RegExpBackReference* atom = new (zone()) RegExpBackReference();
866 atom->set_name(name); 868 atom->set_name(name);
(...skipping 1003 matching lines...) Expand 10 before | Expand all | Expand 10 after
1870 return false; 1872 return false;
1871 } 1873 }
1872 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1874 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1873 zone()); 1875 zone());
1874 LAST(ADD_TERM); 1876 LAST(ADD_TERM);
1875 return true; 1877 return true;
1876 } 1878 }
1877 1879
1878 } // namespace internal 1880 } // namespace internal
1879 } // namespace v8 1881 } // namespace v8
OLD | NEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698