Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(5)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2859933003: Revert of [regexp] Support unicode capture names in non-unicode patterns (Closed)
Patch Set: Fixing test (created 3 years, 7 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 28 matching lines...) Expand all
39 has_more_(true), 39 has_more_(true),
40 simple_(false), 40 simple_(false),
41 contains_anchor_(false), 41 contains_anchor_(false),
42 is_scanned_for_captures_(false), 42 is_scanned_for_captures_(false),
43 has_named_captures_(false), 43 has_named_captures_(false),
44 failed_(false) { 44 failed_(false) {
45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); 45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
46 Advance(); 46 Advance();
47 } 47 }
48 48
49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) { 49 template <bool update_position>
50 inline uc32 RegExpParser::ReadNext() {
50 int position = next_pos_; 51 int position = next_pos_;
51 uc32 c0 = in()->Get(position); 52 uc32 c0 = in()->Get(position);
52 position++; 53 position++;
53 const bool try_combine_surrogate_pairs = 54 // Read the whole surrogate pair in case of unicode flag, if possible.
54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS); 55 if (unicode() && position < in()->length() &&
55 if (try_combine_surrogate_pairs && position < in()->length() &&
56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { 56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 uc16 c1 = in()->Get(position); 57 uc16 c1 = in()->Get(position);
58 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); 59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 position++; 60 position++;
61 } 61 }
62 } 62 }
63 if (update_position) next_pos_ = position; 63 if (update_position) next_pos_ = position;
64 return c0; 64 return c0;
65 } 65 }
66 66
67 67
68 uc32 RegExpParser::Next() { 68 uc32 RegExpParser::Next() {
69 if (has_next()) { 69 if (has_next()) {
70 return ReadNext(false, ScanMode::DEFAULT); 70 return ReadNext<false>();
71 } else { 71 } else {
72 return kEndMarker; 72 return kEndMarker;
73 } 73 }
74 } 74 }
75 75
76 void RegExpParser::Advance(ScanMode mode) { 76 void RegExpParser::Advance() {
77 if (has_next()) { 77 if (has_next()) {
78 StackLimitCheck check(isolate()); 78 StackLimitCheck check(isolate());
79 if (check.HasOverflowed()) { 79 if (check.HasOverflowed()) {
80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow"); 80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
81 ReportError(CStrVector( 81 ReportError(CStrVector(
82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); 82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
83 } else if (zone()->excess_allocation()) { 83 } else if (zone()->excess_allocation()) {
84 ReportError(CStrVector("Regular expression too large")); 84 ReportError(CStrVector("Regular expression too large"));
85 } else { 85 } else {
86 current_ = ReadNext(true, mode); 86 current_ = ReadNext<true>();
87 } 87 }
88 } else { 88 } else {
89 current_ = kEndMarker; 89 current_ = kEndMarker;
90 // Advance so that position() points to 1-after-the-last-character. This is 90 // Advance so that position() points to 1-after-the-last-character. This is
91 // important so that Reset() to this position works correctly. 91 // important so that Reset() to this position works correctly.
92 next_pos_ = in()->length() + 1; 92 next_pos_ = in()->length() + 1;
93 has_more_ = false; 93 has_more_ = false;
94 } 94 }
95 } 95 }
96 96
97 97
98 void RegExpParser::Reset(int pos) { 98 void RegExpParser::Reset(int pos) {
99 next_pos_ = pos; 99 next_pos_ = pos;
100 has_more_ = (pos < in()->length()); 100 has_more_ = (pos < in()->length());
101 Advance(); 101 Advance();
102 } 102 }
103 103
104 void RegExpParser::Advance(int dist, ScanMode mode) { 104 void RegExpParser::Advance(int dist) {
105 next_pos_ += dist - 1; 105 next_pos_ += dist - 1;
106 Advance(mode); 106 Advance();
107 } 107 }
108 108
109 109
110 bool RegExpParser::simple() { return simple_; } 110 bool RegExpParser::simple() { return simple_; }
111 111
112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
113 switch (c) { 113 switch (c) {
114 case '^': 114 case '^':
115 case '$': 115 case '$':
116 case '\\': 116 case '\\':
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after
319 } else if (Next() == '!') { 319 } else if (Next() == '!') {
320 subexpr_type = NEGATIVE_LOOKAROUND; 320 subexpr_type = NEGATIVE_LOOKAROUND;
321 lookaround_type = RegExpLookaround::LOOKBEHIND; 321 lookaround_type = RegExpLookaround::LOOKBEHIND;
322 Advance(2); 322 Advance(2);
323 break; 323 break;
324 } 324 }
325 } 325 }
326 if (FLAG_harmony_regexp_named_captures) { 326 if (FLAG_harmony_regexp_named_captures) {
327 has_named_captures_ = true; 327 has_named_captures_ = true;
328 is_named_capture = true; 328 is_named_capture = true;
329 Advance();
329 break; 330 break;
330 } 331 }
331 // Fall through. 332 // Fall through.
332 default: 333 default:
333 return ReportError(CStrVector("Invalid group")); 334 return ReportError(CStrVector("Invalid group"));
334 } 335 }
335 } 336 }
336 337
337 const ZoneVector<uc16>* capture_name = nullptr; 338 const ZoneVector<uc16>* capture_name = nullptr;
338 if (subexpr_type == CAPTURE) { 339 if (subexpr_type == CAPTURE) {
(...skipping 415 matching lines...) Expand 10 before | Expand all | Expand 10 after
754 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 755 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
755 v->push_back(code_unit); 756 v->push_back(code_unit);
756 } else { 757 } else {
757 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 758 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
758 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 759 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
759 } 760 }
760 } 761 }
761 762
762 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { 763 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
763 DCHECK(FLAG_harmony_regexp_named_captures); 764 DCHECK(FLAG_harmony_regexp_named_captures);
764 DCHECK_EQ(current(), '<');
765 765
766 ZoneVector<uc16>* name = 766 ZoneVector<uc16>* name =
767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); 767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
768 768
769 // Capture names can always contain surrogate pairs, and we need to scan
770 // accordingly.
771 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
772 Advance(scan_mode);
773
774 bool at_start = true; 769 bool at_start = true;
775 while (true) { 770 while (true) {
776 uc32 c = current(); 771 uc32 c = current();
777 Advance(scan_mode); 772 Advance();
778 773
779 // Convert unicode escapes. 774 // Convert unicode escapes.
780 if (c == '\\' && current() == 'u') { 775 if (c == '\\' && current() == 'u') {
781 Advance(scan_mode); 776 Advance();
782 if (!ParseUnicodeEscape(&c)) { 777 if (!ParseUnicodeEscape(&c)) {
783 ReportError(CStrVector("Invalid Unicode escape sequence")); 778 ReportError(CStrVector("Invalid Unicode escape sequence"));
784 return nullptr; 779 return nullptr;
785 } 780 }
786 } 781 }
787 782
788 // The backslash char is misclassified as both ID_Start and ID_Continue. 783 // The backslash char is misclassified as both ID_Start and ID_Continue.
789 if (c == '\\') { 784 if (c == '\\') {
790 ReportError(CStrVector("Invalid capture group name")); 785 ReportError(CStrVector("Invalid capture group name"));
791 return nullptr; 786 return nullptr;
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
842 } 837 }
843 838
844 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, 839 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
845 RegExpParserState* state) { 840 RegExpParserState* state) {
846 // The parser is assumed to be on the '<' in \k<name>. 841 // The parser is assumed to be on the '<' in \k<name>.
847 if (current() != '<') { 842 if (current() != '<') {
848 ReportError(CStrVector("Invalid named reference")); 843 ReportError(CStrVector("Invalid named reference"));
849 return false; 844 return false;
850 } 845 }
851 846
847 Advance();
852 const ZoneVector<uc16>* name = ParseCaptureGroupName(); 848 const ZoneVector<uc16>* name = ParseCaptureGroupName();
853 if (name == nullptr) { 849 if (name == nullptr) {
854 return false; 850 return false;
855 } 851 }
856 852
857 if (state->IsInsideCaptureGroup(name)) { 853 if (state->IsInsideCaptureGroup(name)) {
858 builder->AddEmpty(); 854 builder->AddEmpty();
859 } else { 855 } else {
860 RegExpBackReference* atom = new (zone()) RegExpBackReference(); 856 RegExpBackReference* atom = new (zone()) RegExpBackReference();
861 atom->set_name(name); 857 atom->set_name(name);
(...skipping 1072 matching lines...) Expand 10 before | Expand all | Expand 10 after
1934 return false; 1930 return false;
1935 } 1931 }
1936 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1932 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1937 zone()); 1933 zone());
1938 LAST(ADD_TERM); 1934 LAST(ADD_TERM);
1939 return true; 1935 return true;
1940 } 1936 }
1941 1937
1942 } // namespace internal 1938 } // namespace internal
1943 } // namespace v8 1939 } // namespace v8
OLD | NEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698