src/regexp/regexp-parser.cc - Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns (Closed)

Patch Set: Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 22 matching lines...) Expand all Loading...
33 ignore_case_(flags & JSRegExp::kIgnoreCase),	33 ignore_case_(flags & JSRegExp::kIgnoreCase),

34 multiline_(flags & JSRegExp::kMultiline),	34 multiline_(flags & JSRegExp::kMultiline),

35 unicode_(flags & JSRegExp::kUnicode),	35 unicode_(flags & JSRegExp::kUnicode),

36 next_pos_(0),	36 next_pos_(0),

37 captures_started_(0),	37 captures_started_(0),

38 capture_count_(0),	38 capture_count_(0),

39 has_more_(true),	39 has_more_(true),

40 simple_(false),	40 simple_(false),

41 contains_anchor_(false),	41 contains_anchor_(false),

42 is_scanned_for_captures_(false),	42 is_scanned_for_captures_(false),

	43 has_named_captures_(false),

43 failed_(false) {	44 failed_(false) {

44 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);	45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);

45 Advance();	46 Advance();

46 }	47 }

47	48

48 template <bool update_position>	49 template <bool update_position>

49 inline uc32 RegExpParser::ReadNext() {	50 inline uc32 RegExpParser::ReadNext() {

50 int position = next_pos_;	51 int position = next_pos_;

51 uc32 c0 = in()->Get(position);	52 uc32 c0 = in()->Get(position);

52 position++;	53 position++;

(...skipping 265 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
318 lookaround_type = RegExpLookaround::LOOKBEHIND;	319 lookaround_type = RegExpLookaround::LOOKBEHIND;

319 Advance(2);	320 Advance(2);

320 break;	321 break;

321 } else if (Next() == '!') {	322 } else if (Next() == '!') {

322 subexpr_type = NEGATIVE_LOOKAROUND;	323 subexpr_type = NEGATIVE_LOOKAROUND;

323 lookaround_type = RegExpLookaround::LOOKBEHIND;	324 lookaround_type = RegExpLookaround::LOOKBEHIND;

324 Advance(2);	325 Advance(2);

325 break;	326 break;

326 }	327 }

327 }	328 }

328 if (FLAG_harmony_regexp_named_captures && unicode()) {	329 if (FLAG_harmony_regexp_named_captures) {

	330 has_named_captures_ = true;

329 is_named_capture = true;	331 is_named_capture = true;

330 Advance();	332 Advance();

331 break;	333 break;

332 }	334 }

333 // Fall through.	335 // Fall through.

334 default:	336 default:

335 return ReportError(CStrVector("Invalid group"));	337 return ReportError(CStrVector("Invalid group"));

336 }	338 }

337 }	339 }

338	340

(...skipping 195 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
534 builder->AddEscapedUnicodeCharacter(value);	536 builder->AddEscapedUnicodeCharacter(value);

535 } else if (!unicode()) {	537 } else if (!unicode()) {

536 builder->AddCharacter('u');	538 builder->AddCharacter('u');

537 } else {	539 } else {

538 // With /u, invalid escapes are not treated as identity escapes.	540 // With /u, invalid escapes are not treated as identity escapes.

539 return ReportError(CStrVector("Invalid unicode escape"));	541 return ReportError(CStrVector("Invalid unicode escape"));

540 }	542 }

541 break;	543 break;

542 }	544 }

543 case 'k':	545 case 'k':

544 if (FLAG_harmony_regexp_named_captures && unicode()) {	546 // Either an identity escape or a named back-reference. The two

	547 // interpretations are mutually exclusive: '\k' is interpreted as

	548 // an identity escape for non-unicode patterns without named

	549 // capture groups, and as the beginning of a named back-reference

	550 // in all other cases.

	551 if (FLAG_harmony_regexp_named_captures &&

	552 (unicode() \|\| HasNamedCaptures())) {

545 Advance(2);	553 Advance(2);

546 ParseNamedBackReference(builder, state CHECK_FAILED);	554 ParseNamedBackReference(builder, state CHECK_FAILED);

547 break;	555 break;

548 }	556 }

549 // Fall through.	557 // Fall through.

550 default:	558 default:

551 Advance();	559 Advance();

552 // With /u, no identity escapes except for syntax characters	560 // With /u, no identity escapes except for syntax characters

553 // are allowed. Otherwise, all identity escapes are allowed.	561 // are allowed. Otherwise, all identity escapes are allowed.

554 if (!unicode() \|\| IsSyntaxCharacterOrSlash(current())) {	562 if (!unicode() \|\| IsSyntaxCharacterOrSlash(current())) {

(...skipping 95 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
650 #endif	658 #endif

651	659

652	660

653 // In order to know whether an escape is a backreference or not we have to scan	661 // In order to know whether an escape is a backreference or not we have to scan

654 // the entire regexp and find the number of capturing parentheses. However we	662 // the entire regexp and find the number of capturing parentheses. However we

655 // don't want to scan the regexp twice unless it is necessary. This mini-parser	663 // don't want to scan the regexp twice unless it is necessary. This mini-parser

656 // is called when needed. It can see the difference between capturing and	664 // is called when needed. It can see the difference between capturing and

657 // noncapturing parentheses and can skip character classes and backslash-escaped	665 // noncapturing parentheses and can skip character classes and backslash-escaped

658 // characters.	666 // characters.

659 void RegExpParser::ScanForCaptures() {	667 void RegExpParser::ScanForCaptures() {

	668 DCHECK(!is_scanned_for_captures_);

	669 const int saved_position = position();

660 // Start with captures started previous to current position	670 // Start with captures started previous to current position

661 int capture_count = captures_started();	671 int capture_count = captures_started();

662 // Add count of captures after this position.	672 // Add count of captures after this position.

663 int n;	673 int n;

664 while ((n = current()) != kEndMarker) {	674 while ((n = current()) != kEndMarker) {

665 Advance();	675 Advance();

666 switch (n) {	676 switch (n) {

667 case '\\':	677 case '\\':

668 Advance();	678 Advance();

669 break;	679 break;

(...skipping 15 matching lines...) Expand all Loading...
685 // * a non-capturing group '(:',	695 // * a non-capturing group '(:',

686 // * a lookbehind assertion '(?<=' '(?<!'	696 // * a lookbehind assertion '(?<=' '(?<!'

687 // * or a named capture '(?<'.	697 // * or a named capture '(?<'.

688 //	698 //

689 // Of these, only named captures are capturing groups.	699 // Of these, only named captures are capturing groups.

690 if (!FLAG_harmony_regexp_named_captures) break;	700 if (!FLAG_harmony_regexp_named_captures) break;

691	701

692 Advance();	702 Advance();

693 if (current() != '<') break;	703 if (current() != '<') break;

694	704

695 // TODO(jgruber): To be more future-proof we could test for	705 if (FLAG_harmony_regexp_lookbehind) {

696 // IdentifierStart here once it becomes clear whether group names	706 // TODO(jgruber): To be more future-proof we could test for

697 // allow unicode escapes.	707 // IdentifierStart here once it becomes clear whether group names

698 Advance();	708 // allow unicode escapes.

699 if (current() == '=' \|\| current() == '!') break;	709 // https://github.com/tc39/proposal-regexp-named-groups/issues/23

	710 Advance();

	711 if (current() == '=' \|\| current() == '!') break;

	712 }

	713

	714 // Found a possible named capture. It could turn out to be a syntax

	715 // error (e.g. an unterminated or invalid name), but that distinction

	716 // does not matter for our purposes.

	717 has_named_captures_ = true;

700 }	718 }

701 capture_count++;	719 capture_count++;

702 break;	720 break;

703 }	721 }

704 }	722 }

705 capture_count_ = capture_count;	723 capture_count_ = capture_count;

706 is_scanned_for_captures_ = true;	724 is_scanned_for_captures_ = true;

	725 Reset(saved_position);

707 }	726 }

708	727

709	728

710 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {	729 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

711 DCHECK_EQ('\\', current());	730 DCHECK_EQ('\\', current());

712 DCHECK('1' <= Next() && Next() <= '9');	731 DCHECK('1' <= Next() && Next() <= '9');

713 // Try to parse a decimal literal that is no greater than the total number	732 // Try to parse a decimal literal that is no greater than the total number

714 // of left capturing parentheses in the input.	733 // of left capturing parentheses in the input.

715 int start = position();	734 int start = position();

716 int value = Next() - '0';	735 int value = Next() - '0';

717 Advance(2);	736 Advance(2);

718 while (true) {	737 while (true) {

719 uc32 c = current();	738 uc32 c = current();

720 if (IsDecimalDigit(c)) {	739 if (IsDecimalDigit(c)) {

721 value = 10 * value + (c - '0');	740 value = 10 * value + (c - '0');

722 if (value > kMaxCaptures) {	741 if (value > kMaxCaptures) {

723 Reset(start);	742 Reset(start);

724 return false;	743 return false;

725 }	744 }

726 Advance();	745 Advance();

727 } else {	746 } else {

728 break;	747 break;

729 }	748 }

730 }	749 }

731 if (value > captures_started()) {	750 if (value > captures_started()) {

732 if (!is_scanned_for_captures_) {	751 if (!is_scanned_for_captures_) ScanForCaptures();

733 int saved_position = position();

734 ScanForCaptures();

735 Reset(saved_position);

736 }

737 if (value > capture_count_) {	752 if (value > capture_count_) {

738 Reset(start);	753 Reset(start);

739 return false;	754 return false;

740 }	755 }

741 }	756 }

742 *index_out = value;	757 *index_out = value;

743 return true;	758 return true;

744 }	759 }

745	760

746 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {	761 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {

747 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {	762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

748 v->push_back(code_unit);	763 v->push_back(code_unit);

749 } else {	764 } else {

750 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));	765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

751 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));	766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

752 }	767 }

753 }	768 }

754	769

755 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {	770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

756 DCHECK(FLAG_harmony_regexp_named_captures);	771 DCHECK(FLAG_harmony_regexp_named_captures);

757 DCHECK(unicode());

758	772

759 ZoneVector<uc16>* name =	773 ZoneVector<uc16>* name =

760 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());	774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

761	775

762 bool at_start = true;	776 bool at_start = true;

763 while (true) {	777 while (true) {

764 uc32 c = current();	778 uc32 c = current();

765 Advance();	779 Advance();

766	780

767 // Convert unicode escapes.	781 // Convert unicode escapes.

768 if (c == '\\' && current() == 'u') {	782 if (c == '\\' && current() == 'u') {

	783 // TODO(jgruber): Reconsider this once the spec has settled.

	784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23

769 Advance();	785 Advance();

770 if (!ParseUnicodeEscape(&c)) {	786 if (!ParseUnicodeEscape(&c)) {

771 ReportError(CStrVector("Invalid Unicode escape sequence"));	787 ReportError(CStrVector("Invalid Unicode escape sequence"));

772 return nullptr;	788 return nullptr;

773 }	789 }

774 }	790 }

775	791

776 if (at_start) {	792 if (at_start) {

777 if (!IdentifierStart::Is(c)) {	793 if (!IdentifierStart::Is(c)) {

778 ReportError(CStrVector("Invalid capture group name"));	794 ReportError(CStrVector("Invalid capture group name"));

(...skipping 12 matching lines...) Expand all Loading...
791 }	807 }

792 }	808 }

793 }	809 }

794	810

795 return name;	811 return name;

796 }	812 }

797	813

798 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,	814 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,

799 int index) {	815 int index) {

800 DCHECK(FLAG_harmony_regexp_named_captures);	816 DCHECK(FLAG_harmony_regexp_named_captures);

801 DCHECK(unicode());

802 DCHECK(0 < index && index <= captures_started_);	817 DCHECK(0 < index && index <= captures_started_);

803 DCHECK_NOT_NULL(name);	818 DCHECK_NOT_NULL(name);

804	819

805 if (named_captures_ == nullptr) {	820 if (named_captures_ == nullptr) {

806 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());	821 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());

807 } else {	822 } else {

808 // Check for duplicates and bail if we find any.	823 // Check for duplicates and bail if we find any.

	824 // TODO(jgruber): O(n^2).

809 for (const auto& named_capture : *named_captures_) {	825 for (const auto& named_capture : *named_captures_) {

810 if (named_capture->name() == name) {	826 if (named_capture->name() == name) {

811 ReportError(CStrVector("Duplicate capture group name"));	827 ReportError(CStrVector("Duplicate capture group name"));

812 return false;	828 return false;

813 }	829 }

814 }	830 }

815 }	831 }

816	832

817 RegExpCapture* capture = GetCapture(index);	833 RegExpCapture* capture = GetCapture(index);

818 DCHECK(capture->name() == nullptr);	834 DCHECK(capture->name() == nullptr);

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
913 for (int i = 0; i < named_captures_->length(); i++) {	929 for (int i = 0; i < named_captures_->length(); i++) {

914 RegExpCapture* capture = named_captures_->at(i);	930 RegExpCapture* capture = named_captures_->at(i);

915 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());	931 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());

916 array->set(i * 2, *name.ToHandleChecked());	932 array->set(i * 2, *name.ToHandleChecked());

917 array->set(i * 2 + 1, Smi::FromInt(capture->index()));	933 array->set(i * 2 + 1, Smi::FromInt(capture->index()));

918 }	934 }

919	935

920 return array;	936 return array;

921 }	937 }

922	938

	939 bool RegExpParser::HasNamedCaptures() {

	940 if (has_named_captures_ \|\| is_scanned_for_captures_) {

	941 return has_named_captures_;

	942 }

	943

	944 ScanForCaptures();

	945 DCHECK(is_scanned_for_captures_);

	946 return has_named_captures_;

	947 }

	948

923 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {	949 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

924 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {	950 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

925 if (s->group_type() != CAPTURE) continue;	951 if (s->group_type() != CAPTURE) continue;

926 // Return true if we found the matching capture index.	952 // Return true if we found the matching capture index.

927 if (index == s->capture_index()) return true;	953 if (index == s->capture_index()) return true;

928 // Abort if index is larger than what has been parsed up till this state.	954 // Abort if index is larger than what has been parsed up till this state.

929 if (index > s->capture_index()) return false;	955 if (index > s->capture_index()) return false;

930 }	956 }

931 return false;	957 return false;

932 }	958 }

(...skipping 905 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1838 return false;	1864 return false;

1839 }	1865 }

1840 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1841 zone());	1867 zone());

1842 LAST(ADD_TERM);	1868 LAST(ADD_TERM);

1843 return true;	1869 return true;

1844 }	1870 }

1845	1871

1846 } // namespace internal	1872 } // namespace internal

1847 } // namespace v8	1873 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | test/mjsunit/harmony/regexp-named-captures.js » ('J')