Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2050343002: [regexp] Experimental support for regexp named captures (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Proper fixed array cast Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/runtime/runtime.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
11 #include "src/ostreams.h" 11 #include "src/ostreams.h"
12 #include "src/regexp/jsregexp.h" 12 #include "src/regexp/jsregexp.h"
13 #include "src/utils.h" 13 #include "src/utils.h"
14 14
15 #ifdef V8_I18N_SUPPORT 15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uset.h" 16 #include "unicode/uset.h"
17 #endif // V8_I18N_SUPPORT 17 #endif // V8_I18N_SUPPORT
18 18
19 namespace v8 { 19 namespace v8 {
20 namespace internal { 20 namespace internal {
21 21
22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) 23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24 : isolate_(isolate), 24 : isolate_(isolate),
25 zone_(zone), 25 zone_(zone),
26 error_(error), 26 error_(error),
27 captures_(NULL), 27 captures_(NULL),
28 named_captures_(NULL),
29 named_back_references_(NULL),
30 capture_strings_(0, zone),
28 in_(in), 31 in_(in),
29 current_(kEndMarker), 32 current_(kEndMarker),
30 ignore_case_(flags & JSRegExp::kIgnoreCase), 33 ignore_case_(flags & JSRegExp::kIgnoreCase),
31 multiline_(flags & JSRegExp::kMultiline), 34 multiline_(flags & JSRegExp::kMultiline),
32 unicode_(flags & JSRegExp::kUnicode), 35 unicode_(flags & JSRegExp::kUnicode),
33 next_pos_(0), 36 next_pos_(0),
34 captures_started_(0), 37 captures_started_(0),
35 capture_count_(0), 38 capture_count_(0),
36 has_more_(true), 39 has_more_(true),
37 simple_(false), 40 simple_(false),
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after
142 145
// CHECK_FAILED is written in the last-argument position of a parser call,
// e.g. ParseDisjunction(CHECK_FAILED). Its expansion closes that call, returns
// NULL from the enclosing function when failed_ is set, and then opens a
// dangling ((void)0 so that the call's own closing parenthesis and semicolon
// still form a valid statement.
143 #define CHECK_FAILED /**/); \ 146 #define CHECK_FAILED /**/); \
144 if (failed_) return NULL; \ 147 if (failed_) return NULL; \
145 ((void)0 148 ((void)0
146 149
147 150
148 // Pattern :: 151 // Pattern ::
149 // Disjunction 152 // Disjunction
150 RegExpTree* RegExpParser::ParsePattern() { 153 RegExpTree* RegExpParser::ParsePattern() {
151 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 154 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
// Named back references are queued without a capture while parsing; bind each
// one to its capture now that the whole pattern has been read. CHECK_FAILED
// returns NULL here if failed_ is set after the call.
155 PatchNamedBackReferences(CHECK_FAILED);
152 DCHECK(!has_more()); 156 DCHECK(!has_more());
153 // If the result of parsing is a literal string atom, and it has the 157 // If the result of parsing is a literal string atom, and it has the
154 // same length as the input, then the atom is identical to the input. 158 // same length as the input, then the atom is identical to the input.
155 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { 159 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
156 simple_ = true; 160 simple_ = true;
157 } 161 }
158 return result; 162 return result;
159 } 163 }
160 164
161 165
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
261 new (zone()) ZoneList<CharacterRange>(2, zone()); 265 new (zone()) ZoneList<CharacterRange>(2, zone());
262 CharacterRange::AddClassEscape('.', ranges, zone()); 266 CharacterRange::AddClassEscape('.', ranges, zone());
263 RegExpCharacterClass* cc = 267 RegExpCharacterClass* cc =
264 new (zone()) RegExpCharacterClass(ranges, false); 268 new (zone()) RegExpCharacterClass(ranges, false);
265 builder->AddCharacterClass(cc); 269 builder->AddCharacterClass(cc);
266 break; 270 break;
267 } 271 }
268 case '(': { 272 case '(': {
269 SubexpressionType subexpr_type = CAPTURE; 273 SubexpressionType subexpr_type = CAPTURE;
270 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 274 RegExpLookaround::Type lookaround_type = state->lookaround_type();
275 bool is_named_capture = false;
271 Advance(); 276 Advance();
272 if (current() == '?') { 277 if (current() == '?') {
273 switch (Next()) { 278 switch (Next()) {
274 case ':': 279 case ':':
275 subexpr_type = GROUPING; 280 subexpr_type = GROUPING;
281 Advance(2);
276 break; 282 break;
277 case '=': 283 case '=':
278 lookaround_type = RegExpLookaround::LOOKAHEAD; 284 lookaround_type = RegExpLookaround::LOOKAHEAD;
279 subexpr_type = POSITIVE_LOOKAROUND; 285 subexpr_type = POSITIVE_LOOKAROUND;
286 Advance(2);
280 break; 287 break;
281 case '!': 288 case '!':
282 lookaround_type = RegExpLookaround::LOOKAHEAD; 289 lookaround_type = RegExpLookaround::LOOKAHEAD;
283 subexpr_type = NEGATIVE_LOOKAROUND; 290 subexpr_type = NEGATIVE_LOOKAROUND;
291 Advance(2);
284 break; 292 break;
285 case '<': 293 case '<':
286 if (FLAG_harmony_regexp_lookbehind) { 294 if (FLAG_harmony_regexp_lookbehind ||
295 FLAG_harmony_regexp_named_captures) {
287 Advance(); 296 Advance();
288 lookaround_type = RegExpLookaround::LOOKBEHIND; 297 if (FLAG_harmony_regexp_lookbehind) {
289 if (Next() == '=') { 298 if (Next() == '=') {
290 subexpr_type = POSITIVE_LOOKAROUND; 299 subexpr_type = POSITIVE_LOOKAROUND;
291 break; 300 lookaround_type = RegExpLookaround::LOOKBEHIND;
292 } else if (Next() == '!') { 301 Advance(2);
293 subexpr_type = NEGATIVE_LOOKAROUND; 302 break;
303 } else if (Next() == '!') {
304 subexpr_type = NEGATIVE_LOOKAROUND;
305 lookaround_type = RegExpLookaround::LOOKBEHIND;
306 Advance(2);
307 break;
308 }
309 }
310 if (FLAG_harmony_regexp_named_captures && unicode()) {
311 is_named_capture = true;
312 Advance();
294 break; 313 break;
295 } 314 }
296 } 315 }
297 // Fall through. 316 // Fall through.
298 default: 317 default:
299 return ReportError(CStrVector("Invalid group")); 318 return ReportError(CStrVector("Invalid group"));
300 } 319 }
301 Advance(2); 320 }
302 } else { 321
322 if (subexpr_type == CAPTURE) {
303 if (captures_started_ >= kMaxCaptures) { 323 if (captures_started_ >= kMaxCaptures) {
304 return ReportError(CStrVector("Too many captures")); 324 return ReportError(CStrVector("Too many captures"));
305 } 325 }
306 captures_started_++; 326 captures_started_++;
327
328 if (is_named_capture) {
329 Vector<const uc16> name = ParseCaptureGroupName(CHECK_FAILED);
330 CreateNamedCaptureAtIndex(name, captures_started_ CHECK_FAILED);
331 }
307 } 332 }
308 // Store current state and begin new disjunction parsing. 333 // Store current state and begin new disjunction parsing.
309 state = new (zone()) RegExpParserState( 334 state = new (zone()) RegExpParserState(
310 state, subexpr_type, lookaround_type, captures_started_, 335 state, subexpr_type, lookaround_type, captures_started_,
311 ignore_case(), unicode(), zone()); 336 ignore_case(), unicode(), zone());
312 builder = state->builder(); 337 builder = state->builder();
313 continue; 338 continue;
314 } 339 }
315 case '[': { 340 case '[': {
316 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); 341 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after
490 if (ParseUnicodeEscape(&value)) { 515 if (ParseUnicodeEscape(&value)) {
491 builder->AddEscapedUnicodeCharacter(value); 516 builder->AddEscapedUnicodeCharacter(value);
492 } else if (!unicode()) { 517 } else if (!unicode()) {
493 builder->AddCharacter('u'); 518 builder->AddCharacter('u');
494 } else { 519 } else {
495 // With /u, invalid escapes are not treated as identity escapes. 520 // With /u, invalid escapes are not treated as identity escapes.
496 return ReportError(CStrVector("Invalid unicode escape")); 521 return ReportError(CStrVector("Invalid unicode escape"));
497 } 522 }
498 break; 523 break;
499 } 524 }
525 case 'k':
526 if (FLAG_harmony_regexp_named_captures && unicode()) {
527 Advance(2);
528 ParseNamedBackReference(builder, state CHECK_FAILED);
529 break;
530 }
531 // FALLTHROUGH
500 default: 532 default:
501 Advance(); 533 Advance();
502 // With /u, no identity escapes except for syntax characters 534 // With /u, no identity escapes except for syntax characters
503 // are allowed. Otherwise, all identity escapes are allowed. 535 // are allowed. Otherwise, all identity escapes are allowed.
504 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { 536 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
505 builder->AddCharacter(current()); 537 builder->AddCharacter(current());
506 Advance(); 538 Advance();
507 } else { 539 } else {
508 return ReportError(CStrVector("Invalid escape")); 540 return ReportError(CStrVector("Invalid escape"));
509 } 541 }
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
668 } 700 }
669 if (value > capture_count_) { 701 if (value > capture_count_) {
670 Reset(start); 702 Reset(start);
671 return false; 703 return false;
672 } 704 }
673 } 705 }
674 *index_out = value; 706 *index_out = value;
675 return true; 707 return true;
676 } 708 }
677 709
// Growable byte buffer that accumulates a capture group name as UTF-16 code
// units. No destructor is declared in this class, so the storage is only
// released when Dispose() is called, either on the buffer itself or later on
// the vector handed out by two_byte_literal().
710 class CaptureNameBuffer {
711 public:
712 CaptureNameBuffer() : position_(0), backing_store_() {}
713
// Appends one code point. Values above kMaxNonSurrogateCharCode are stored as
// a lead/trail surrogate pair; all other values are stored as a single unit.
// position_ advances by kUC16Size for every unit written.
714 INLINE(void AddChar(uint32_t code_unit)) {
715 if (position_ >= backing_store_.length()) ExpandBuffer();
716 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
717 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
718 position_ += kUC16Size;
719 } else {
720 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
721 unibrow::Utf16::LeadSurrogate(code_unit);
722 position_ += kUC16Size;
// The lead surrogate may have filled the buffer; grow again before writing
// the trail surrogate.
723 if (position_ >= backing_store_.length()) ExpandBuffer();
724 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
725 unibrow::Utf16::TrailSurrogate(code_unit);
726 position_ += kUC16Size;
727 }
728 }
729
// Returns a view over the buffered code units; the data is not copied. The
// reported length is position_ >> 1, i.e. position_ is treated as a byte
// count of 16-bit units.
730 Vector<const uc16> two_byte_literal() const {
731 DCHECK((position_ & 0x1) == 0);
732 return Vector<const uc16>(reinterpret_cast<uc16*>(backing_store_.start()),
733 position_ >> 1);
734 }
735
// Releases the backing store.
736 void Dispose() { backing_store_.Dispose(); }
737
738 private:
739 static const int kInitialCapacity = 16;
740 static const int kGrowthFactor = 4;
741 static const int kMaxGrowth = 1 * MB;
742
// Computes the next capacity: kGrowthFactor times the larger of min_capacity
// and the current length, but never more than kMaxGrowth above that value.
743 inline int NewCapacity(int min_capacity) {
744 int capacity = Max(min_capacity, backing_store_.length());
745 int new_capacity = Min(capacity * kGrowthFactor, capacity + kMaxGrowth);
746 return new_capacity;
747 }
748
// Allocates a larger store, copies the position_ bytes written so far into
// it, then disposes of the old store.
749 void ExpandBuffer() {
750 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
751 MemCopy(new_store.start(), backing_store_.start(), position_);
752 backing_store_.Dispose();
753 backing_store_ = new_store;
754 }
755
// Write offset into backing_store_.
756 int position_;
757 Vector<byte> backing_store_;
Yang 2016/06/10 15:43:27 Can't we simply use a ZoneList here? It already ha
jgruber 2016/06/13 08:26:00 Right. I switched the CL to my ZoneVector version.
758
759 DISALLOW_COPY_AND_ASSIGN(CaptureNameBuffer);
760 };
761
// Reads a capture group name starting at current() and consumes input up to
// and including the terminating '>'. The first character must satisfy
// IdentifierStart and every following character IdentifierPart.
//
// Returns the name as UTF-16 code units. On any error the buffer is disposed,
// an error is reported via ReportError, and an empty vector is returned.
// On success the returned vector is also recorded in capture_strings_, which
// FreeCaptureStrings() later disposes.
762 Vector<const uc16> RegExpParser::ParseCaptureGroupName() {
763 DCHECK(FLAG_harmony_regexp_named_captures);
764 DCHECK(unicode());
765
766 CaptureNameBuffer buf;
767 bool at_start = true;
768 while (true) {
769 uc32 c = current();
770 Advance();
771
772 // Convert unicode escapes.
773 if (c == '\\' && current() == 'u') {
774 Advance();
775 if (!ParseUnicodeEscape(&c)) {
776 buf.Dispose();
777 ReportError(CStrVector("Invalid Unicode escape sequence"));
778 return Vector<const uc16>::empty();
779 }
780 }
781
// From here on c holds the decoded value when an escape was present.
782 if (at_start) {
783 if (!IdentifierStart::Is(c)) {
784 buf.Dispose();
785 ReportError(CStrVector("Invalid capture group name"));
786 return Vector<const uc16>::empty();
787 }
788 buf.AddChar(c);
789 at_start = false;
790 } else {
// NOTE(review): this comparison runs after escape decoding, so an escape
// that decodes to '>' also terminates the name - confirm this is intended.
791 if (c == '>') {
792 break;
793 } else if (IdentifierPart::Is(c)) {
794 buf.AddChar(c);
795 } else {
796 buf.Dispose();
797 ReportError(CStrVector("Invalid capture group name"));
798 return Vector<const uc16>::empty();
799 }
800 }
801 }
802
// Hand the storage over to capture_strings_ rather than disposing of buf here.
803 Vector<const uc16> name = buf.two_byte_literal();
804 capture_strings_.Add(name, zone());
805 return name;
806 }
807
// Gives the capture at the one-based |index| the given |name| and registers it
// in named_captures_, allocating that list on first use.
//
// Returns true on success. If another named capture already has the same
// name, reports "Duplicate capture group name" and returns false without
// modifying the capture.
808 bool RegExpParser::CreateNamedCaptureAtIndex(Vector<const uc16> name,
809 int index) {
810 DCHECK(FLAG_harmony_regexp_named_captures);
811 DCHECK(unicode());
812 DCHECK(!name.is_empty());
813 DCHECK(0 < index && index <= captures_started_);
814
815 if (named_captures_ == nullptr) {
816 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
817 } else {
818 // Check for duplicates and bail if we find any.
819 for (int i = 0; i < named_captures_->length(); i++) {
820 if (named_captures_->at(i)->name() == name) {
821 ReportError(CStrVector("Duplicate capture group name"));
822 return false;
823 }
824 }
825 }
826
827 RegExpCapture* capture = GetCapture(index);
828 DCHECK(capture->name().is_empty());
829
830 capture->set_name(name);
831 named_captures_->Add(capture, zone());
832
833 return true;
834 }
835
// Parses the <name> part of a \k<name> back reference and adds the resulting
// term to |builder|.
//
// Returns false if current() is not '<' (after reporting "Invalid named
// reference") or if ParseCaptureGroupName() returns an empty name; returns
// true otherwise.
836 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
837 RegExpParserState* state) {
838 // The parser is assumed to be on the '<' in \k<name>.
839 if (current() != '<') {
840 ReportError(CStrVector("Invalid named reference"));
841 return false;
842 }
843
844 Advance();
845 Vector<const uc16> name = ParseCaptureGroupName();
846 if (name.is_empty()) {
847 return false;
848 }
849
// If the name already resolves to a group that |state| reports we are
// currently inside, add an empty term instead of a back reference.
850 const int index = LookupCaptureGroupIndex(name);
851 if (index != -1 && state->IsInsideCaptureGroup(index)) {
852 builder->AddEmpty();
853 } else {
// Only the name is set here; the capture is filled in later by
// PatchNamedBackReferences(), which walks named_back_references_.
854 RegExpBackReference* atom = new (zone()) RegExpBackReference();
855 atom->set_name(name);
856
857 builder->AddAtom(atom);
858
859 if (named_back_references_ == nullptr) {
860 named_back_references_ =
861 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
862 }
863 named_back_references_->Add(atom, zone());
864 }
865
866 return true;
867 }
868
// Binds every queued named back reference to the capture that carries the
// same name. Does nothing when no named back references were recorded.
// Reports "Invalid named capture referenced" and stops at the first reference
// whose name has no matching named capture.
869 void RegExpParser::PatchNamedBackReferences() {
870 if (named_back_references_ == nullptr) return;
871
// There are references but no named groups at all, so none can resolve.
872 if (named_captures_ == nullptr) {
873 ReportError(CStrVector("Invalid named capture referenced"));
874 return;
875 }
876
877 // Look up and patch the actual capture for each named back reference.
878 // TODO(jgruber): O(n^2), optimize if necessary.
879
880 for (int i = 0; i < named_back_references_->length(); i++) {
881 RegExpBackReference* ref = named_back_references_->at(i);
882 int index = LookupCaptureGroupIndex(ref->name());
883 if (index == -1) {
884 ReportError(CStrVector("Invalid named capture referenced"));
885 return;
886 }
887 ref->set_capture(GetCapture(index));
888 }
889 }
890
// Returns the index() of the named capture whose name equals |name|, or -1
// if named_captures_ has not been allocated or holds no capture with that
// name. The search is a linear scan over named_captures_.
891 int RegExpParser::LookupCaptureGroupIndex(Vector<const uc16> name) {
892 DCHECK(FLAG_harmony_regexp_named_captures);
893 DCHECK(unicode());
894 DCHECK(!name.is_empty());
895
896 // Attempt an initial lookup.
897 if (named_captures_ == nullptr) {
898 return -1;
899 }
900
901 for (int i = 0; i < named_captures_->length(); i++) {
902 RegExpCapture* capture = named_captures_->at(i);
903 if (capture->name() == name) {
904 return capture->index();
905 }
906 }
907
908 return -1;
909 }
678 910
// Returns the RegExpCapture for the one-based |index|. The captures_ list is
// created on first use and extended with new RegExpCapture objects until it
// holds as many entries as there are known captures: capture_count_ when
// is_scanned_for_captures_ is set, otherwise captures_started_.
679 RegExpCapture* RegExpParser::GetCapture(int index) { 911 RegExpCapture* RegExpParser::GetCapture(int index) {
680 // The index for the capture groups are one-based. Its index in the list is 912 // The index for the capture groups are one-based. Its index in the list is
681 // zero-based. 913 // zero-based.
682 int know_captures = 914 int know_captures =
683 is_scanned_for_captures_ ? capture_count_ : captures_started_; 915 is_scanned_for_captures_ ? capture_count_ : captures_started_;
684 DCHECK(index <= know_captures); 916 DCHECK(index <= know_captures);
685 if (captures_ == NULL) { 917 if (captures_ == NULL) {
686 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); 918 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
687 } 919 }
// Each new entry is constructed with its own one-based index (length + 1).
688 while (captures_->length() < know_captures) { 920 while (captures_->length() < know_captures) {
689 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); 921 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
690 } 922 }
691 return captures_->at(index - 1); 923 return captures_->at(index - 1);
692 } 924 }
693 925
// Builds a FixedArray describing the named captures, laid out as consecutive
// pairs: element 2*i is the name of the i-th named capture as a String, and
// element 2*i + 1 is that capture's index as a Smi.
// Returns a default-constructed Handle<FixedArray> when there are no named
// captures.
926 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
927 if (named_captures_ == nullptr || named_captures_->is_empty())
928 return Handle<FixedArray>();
929
930 int len = named_captures_->length() * 2;
931 Handle<FixedArray> array = isolate()->factory()->NewFixedArray(len);
932
933 for (int i = 0; i < named_captures_->length(); i++) {
934 RegExpCapture* capture = named_captures_->at(i);
// The name is copied into a new String created from the two-byte vector.
935 MaybeHandle<String> name =
936 isolate()->factory()->NewStringFromTwoByte(capture->name());
937 array->set(i * 2, *name.ToHandleChecked());
938 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
939 }
940
941 return array;
942 }
943
// Disposes of every name vector recorded in capture_strings_ by
// ParseCaptureGroupName() and then empties the list.
// NOTE(review): the same vectors are passed to set_name() on RegExpCapture and
// RegExpBackReference objects; presumably Vector copies share storage, in
// which case those names must not be read after this call - confirm.
944 void RegExpParser::FreeCaptureStrings() {
945 for (int i = 0; i < capture_strings_.length(); i++) {
946 capture_strings_[i].Dispose();
947 }
948 capture_strings_.Clear();
949 }
694 950
695 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { 951 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
696 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 952 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
697 if (s->group_type() != CAPTURE) continue; 953 if (s->group_type() != CAPTURE) continue;
698 // Return true if we found the matching capture index. 954 // Return true if we found the matching capture index.
699 if (index == s->capture_index()) return true; 955 if (index == s->capture_index()) return true;
700 // Abort if index is larger than what has been parsed up till this state. 956 // Abort if index is larger than what has been parsed up till this state.
701 if (index > s->capture_index()) return false; 957 if (index > s->capture_index()) return false;
702 } 958 }
703 return false; 959 return false;
(...skipping 557 matching lines...) Expand 10 before | Expand all | Expand 10 after
1261 DCHECK(result->error.is_null()); 1517 DCHECK(result->error.is_null());
1262 if (FLAG_trace_regexp_parser) { 1518 if (FLAG_trace_regexp_parser) {
1263 OFStream os(stdout); 1519 OFStream os(stdout);
1264 tree->Print(os, zone); 1520 tree->Print(os, zone);
1265 os << "\n"; 1521 os << "\n";
1266 } 1522 }
1267 result->tree = tree; 1523 result->tree = tree;
1268 int capture_count = parser.captures_started(); 1524 int capture_count = parser.captures_started();
1269 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1525 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1270 result->contains_anchor = parser.contains_anchor(); 1526 result->contains_anchor = parser.contains_anchor();
1527 result->capture_name_map = parser.CreateCaptureNameMap();
1271 result->capture_count = capture_count; 1528 result->capture_count = capture_count;
1272 } 1529 }
1530 parser.FreeCaptureStrings();
1273 return !parser.failed(); 1531 return !parser.failed();
1274 } 1532 }
1275 1533
1276 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) 1534 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1277 : zone_(zone), 1535 : zone_(zone),
1278 pending_empty_(false), 1536 pending_empty_(false),
1279 ignore_case_(ignore_case), 1537 ignore_case_(ignore_case),
1280 unicode_(unicode), 1538 unicode_(unicode),
1281 characters_(NULL), 1539 characters_(NULL),
1282 pending_surrogate_(kNoPendingSurrogate), 1540 pending_surrogate_(kNoPendingSurrogate),
(...skipping 274 matching lines...) Expand 10 before | Expand all | Expand 10 after
1557 return false; 1815 return false;
1558 } 1816 }
1559 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1817 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1560 zone()); 1818 zone());
1561 LAST(ADD_TERM); 1819 LAST(ADD_TERM);
1562 return true; 1820 return true;
1563 } 1821 }
1564 1822
1565 } // namespace internal 1823 } // namespace internal
1566 } // namespace v8 1824 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/runtime/runtime.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698