src/regexp/regexp-parser.cc - Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns (Closed)

Patch Set: Update test Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 28 matching lines...) Expand all Loading...
39 has_more_(true),	39 has_more_(true),

40 simple_(false),	40 simple_(false),

41 contains_anchor_(false),	41 contains_anchor_(false),

42 is_scanned_for_captures_(false),	42 is_scanned_for_captures_(false),

43 has_named_captures_(false),	43 has_named_captures_(false),

44 failed_(false) {	44 failed_(false) {

45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);	45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);

46 Advance();	46 Advance();

47 }	47 }

48	48

49 template <bool update_position>	49 template <bool update_position>
	Yang (2017/04/05 12:52:38): I wonder whether it makes sense to make the scan mode a template param. Or alternatively make this bool a normal parameter. I sort of doubt that this has any performance implications.
	jgruber (2017/04/05 12:57:30), in reply to Yang: Agreed, will do that.
	jgruber (2017/04/07 07:10:56), in reply to the above: Done.
50 inline uc32 RegExpParser::ReadNext() {	50 inline uc32 RegExpParser::ReadNext(ScanMode mode) {

51 int position = next_pos_;	51 int position = next_pos_;

52 uc32 c0 = in()->Get(position);	52 uc32 c0 = in()->Get(position);

53 position++;	53 position++;

54 // Read the whole surrogate pair in case of unicode flag, if possible.	54 const bool try_combine_surrogate_pairs =

55 if (unicode() && position < in()->length() &&	55 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);

	56 if (try_combine_surrogate_pairs && position < in()->length() &&

56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {	57 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

57 uc16 c1 = in()->Get(position);	58 uc16 c1 = in()->Get(position);

58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {	59 if (unibrow::Utf16::IsTrailSurrogate(c1)) {

59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);	60 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

60 position++;	61 position++;

61 }	62 }

62 }	63 }

63 if (update_position) next_pos_ = position;	64 if (update_position) next_pos_ = position;

64 return c0;	65 return c0;

65 }	66 }

66	67

67	68

68 uc32 RegExpParser::Next() {	69 uc32 RegExpParser::Next() {

69 if (has_next()) {	70 if (has_next()) {

70 return ReadNext<false>();	71 return ReadNext<false>(ScanMode::DEFAULT);

71 } else {	72 } else {

72 return kEndMarker;	73 return kEndMarker;

73 }	74 }

74 }	75 }

75	76

76	77 void RegExpParser::Advance(ScanMode mode) {

77 void RegExpParser::Advance() {

78 if (has_next()) {	78 if (has_next()) {

79 StackLimitCheck check(isolate());	79 StackLimitCheck check(isolate());

80 if (check.HasOverflowed()) {	80 if (check.HasOverflowed()) {

81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");	81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");

82 ReportError(CStrVector(	82 ReportError(CStrVector(

83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));	83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));

84 } else if (zone()->excess_allocation()) {	84 } else if (zone()->excess_allocation()) {

85 ReportError(CStrVector("Regular expression too large"));	85 ReportError(CStrVector("Regular expression too large"));

86 } else {	86 } else {

87 current_ = ReadNext<true>();	87 current_ = ReadNext<true>(mode);

88 }	88 }

89 } else {	89 } else {

90 current_ = kEndMarker;	90 current_ = kEndMarker;

91 // Advance so that position() points to 1-after-the-last-character. This is	91 // Advance so that position() points to 1-after-the-last-character. This is

92 // important so that Reset() to this position works correctly.	92 // important so that Reset() to this position works correctly.

93 next_pos_ = in()->length() + 1;	93 next_pos_ = in()->length() + 1;

94 has_more_ = false;	94 has_more_ = false;

95 }	95 }

96 }	96 }

97	97

98	98

99 void RegExpParser::Reset(int pos) {	99 void RegExpParser::Reset(int pos) {

100 next_pos_ = pos;	100 next_pos_ = pos;

101 has_more_ = (pos < in()->length());	101 has_more_ = (pos < in()->length());

102 Advance();	102 Advance();

103 }	103 }

104	104

105	105 void RegExpParser::Advance(int dist, ScanMode mode) {

106 void RegExpParser::Advance(int dist) {

107 next_pos_ += dist - 1;	106 next_pos_ += dist - 1;

108 Advance();	107 Advance(mode);

109 }	108 }

110	109

111	110

112 bool RegExpParser::simple() { return simple_; }	111 bool RegExpParser::simple() { return simple_; }

113	112

114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {	113 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {

115 switch (c) {	114 switch (c) {

116 case '^':	115 case '^':

117 case '$':	116 case '$':

118 case '\\':	117 case '\\':

(...skipping 203 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
322 } else if (Next() == '!') {	321 } else if (Next() == '!') {

323 subexpr_type = NEGATIVE_LOOKAROUND;	322 subexpr_type = NEGATIVE_LOOKAROUND;

324 lookaround_type = RegExpLookaround::LOOKBEHIND;	323 lookaround_type = RegExpLookaround::LOOKBEHIND;

325 Advance(2);	324 Advance(2);

326 break;	325 break;

327 }	326 }

328 }	327 }

329 if (FLAG_harmony_regexp_named_captures) {	328 if (FLAG_harmony_regexp_named_captures) {

330 has_named_captures_ = true;	329 has_named_captures_ = true;

331 is_named_capture = true;	330 is_named_capture = true;

332 Advance();
Yang (2017/04/05 12:52:38): How come we can remove this Advance here and below?
jgruber (2017/04/05 12:57:30), in reply to Yang: The advance is now done in ParseCaptureGroupName. That was necessary since advancing from the '<' already needs to use the FORCE_COMBINE_SURROGATE_PAIRS mode.
333 break;	331 break;

334 }	332 }

335 // Fall through.	333 // Fall through.

336 default:	334 default:

337 return ReportError(CStrVector("Invalid group"));	335 return ReportError(CStrVector("Invalid group"));

338 }	336 }

339 }	337 }

340	338

341 const ZoneVector<uc16>* capture_name = nullptr;	339 const ZoneVector<uc16>* capture_name = nullptr;

342 if (subexpr_type == CAPTURE) {	340 if (subexpr_type == CAPTURE) {

(...skipping 419 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {	760 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

763 v->push_back(code_unit);	761 v->push_back(code_unit);

764 } else {	762 } else {

765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));	763 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));	764 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

767 }	765 }

768 }	766 }

769	767

770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {	768 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

771 DCHECK(FLAG_harmony_regexp_named_captures);	769 DCHECK(FLAG_harmony_regexp_named_captures);

	770 DCHECK_EQ(current(), '<');

772	771

773 ZoneVector<uc16>* name =	772 ZoneVector<uc16>* name =

774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());	773 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

775	774

	775 // Capture names can always contain surrogate pairs, and we need to scan

	776 // accordingly.

	777 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;

	778 Advance(scan_mode);

	779

776 bool at_start = true;	780 bool at_start = true;

777 while (true) {	781 while (true) {

778 uc32 c = current();	782 uc32 c = current();

779 Advance();	783 Advance(scan_mode);

780	784

781 // Convert unicode escapes.	785 // Convert unicode escapes.

782 if (c == '\\' && current() == 'u') {	786 if (c == '\\' && current() == 'u') {

783 // TODO(jgruber): Reconsider this once the spec has settled.	787 // TODO(jgruber): Reconsider this once the spec has settled.

784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23	788 // https://github.com/tc39/proposal-regexp-named-groups/issues/23

785 Advance();	789 Advance(scan_mode);

786 if (!ParseUnicodeEscape(&c)) {	790 if (!ParseUnicodeEscape(&c)) {

787 ReportError(CStrVector("Invalid Unicode escape sequence"));	791 ReportError(CStrVector("Invalid Unicode escape sequence"));

788 return nullptr;	792 return nullptr;

789 }	793 }

790 }	794 }

791	795

792 if (at_start) {	796 if (at_start) {

793 if (!IdentifierStart::Is(c)) {	797 if (!IdentifierStart::Is(c)) {

794 ReportError(CStrVector("Invalid capture group name"));	798 ReportError(CStrVector("Invalid capture group name"));

795 return nullptr;	799 return nullptr;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
840 }	844 }

841	845

842 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,	846 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,

843 RegExpParserState* state) {	847 RegExpParserState* state) {

844 // The parser is assumed to be on the '<' in \k<name>.	848 // The parser is assumed to be on the '<' in \k<name>.

845 if (current() != '<') {	849 if (current() != '<') {

846 ReportError(CStrVector("Invalid named reference"));	850 ReportError(CStrVector("Invalid named reference"));

847 return false;	851 return false;

848 }	852 }

849	853

850 Advance();

851 const ZoneVector<uc16>* name = ParseCaptureGroupName();	854 const ZoneVector<uc16>* name = ParseCaptureGroupName();

852 if (name == nullptr) {	855 if (name == nullptr) {

853 return false;	856 return false;

854 }	857 }

855	858

856 if (state->IsInsideCaptureGroup(name)) {	859 if (state->IsInsideCaptureGroup(name)) {

857 builder->AddEmpty();	860 builder->AddEmpty();

858 } else {	861 } else {

859 RegExpBackReference* atom = new (zone()) RegExpBackReference();	862 RegExpBackReference* atom = new (zone()) RegExpBackReference();

860 atom->set_name(name);	863 atom->set_name(name);

(...skipping 1003 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1864 return false;	1867 return false;

1865 }	1868 }

1866 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1869 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1867 zone());	1870 zone());

1868 LAST(ADD_TERM);	1871 LAST(ADD_TERM);

1869 return true;	1872 return true;

1870 }	1873 }

1871	1874

1872 } // namespace internal	1875 } // namespace internal

1873 } // namespace v8	1876 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »