src/regexp/regexp-parser.cc - Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2791163003: [regexp] Support unicode capture names in non-unicode patterns (Closed)

Patch Set: Remove template parameter from ReadNext Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 28 matching lines...) Expand all Loading...
39 has_more_(true),	39 has_more_(true),

40 simple_(false),	40 simple_(false),

41 contains_anchor_(false),	41 contains_anchor_(false),

42 is_scanned_for_captures_(false),	42 is_scanned_for_captures_(false),

43 has_named_captures_(false),	43 has_named_captures_(false),

44 failed_(false) {	44 failed_(false) {

45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);	45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);

46 Advance();	46 Advance();

47 }	47 }

48	48

49 template <bool update_position>	49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {

50 inline uc32 RegExpParser::ReadNext() {

51 int position = next_pos_;	50 int position = next_pos_;

52 uc32 c0 = in()->Get(position);	51 uc32 c0 = in()->Get(position);

53 position++;	52 position++;

54 // Read the whole surrogate pair in case of unicode flag, if possible.	53 const bool try_combine_surrogate_pairs =

55 if (unicode() && position < in()->length() &&	54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);

	55 if (try_combine_surrogate_pairs && position < in()->length() &&

56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {	56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

57 uc16 c1 = in()->Get(position);	57 uc16 c1 = in()->Get(position);

58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {	58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {

59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);	59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

60 position++;	60 position++;

61 }	61 }

62 }	62 }

63 if (update_position) next_pos_ = position;	63 if (update_position) next_pos_ = position;

64 return c0;	64 return c0;

65 }	65 }

66	66

67	67

68 uc32 RegExpParser::Next() {	68 uc32 RegExpParser::Next() {

69 if (has_next()) {	69 if (has_next()) {

70 return ReadNext<false>();	70 return ReadNext(false, ScanMode::DEFAULT);

71 } else {	71 } else {

72 return kEndMarker;	72 return kEndMarker;

73 }	73 }

74 }	74 }

75	75

76	76 void RegExpParser::Advance(ScanMode mode) {

77 void RegExpParser::Advance() {

78 if (has_next()) {	77 if (has_next()) {

79 StackLimitCheck check(isolate());	78 StackLimitCheck check(isolate());

80 if (check.HasOverflowed()) {	79 if (check.HasOverflowed()) {

81 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");	80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");

82 ReportError(CStrVector(	81 ReportError(CStrVector(

83 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));	82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));

84 } else if (zone()->excess_allocation()) {	83 } else if (zone()->excess_allocation()) {

85 ReportError(CStrVector("Regular expression too large"));	84 ReportError(CStrVector("Regular expression too large"));

86 } else {	85 } else {

87 current_ = ReadNext<true>();	86 current_ = ReadNext(true, mode);

88 }	87 }

89 } else {	88 } else {

90 current_ = kEndMarker;	89 current_ = kEndMarker;

91 // Advance so that position() points to 1-after-the-last-character. This is	90 // Advance so that position() points to 1-after-the-last-character. This is

92 // important so that Reset() to this position works correctly.	91 // important so that Reset() to this position works correctly.

93 next_pos_ = in()->length() + 1;	92 next_pos_ = in()->length() + 1;

94 has_more_ = false;	93 has_more_ = false;

95 }	94 }

96 }	95 }

97	96

98	97

99 void RegExpParser::Reset(int pos) {	98 void RegExpParser::Reset(int pos) {

100 next_pos_ = pos;	99 next_pos_ = pos;

101 has_more_ = (pos < in()->length());	100 has_more_ = (pos < in()->length());

102 Advance();	101 Advance();

103 }	102 }

104	103

105	104 void RegExpParser::Advance(int dist, ScanMode mode) {

106 void RegExpParser::Advance(int dist) {

107 next_pos_ += dist - 1;	105 next_pos_ += dist - 1;

108 Advance();	106 Advance(mode);

109 }	107 }

110	108

111	109

112 bool RegExpParser::simple() { return simple_; }	110 bool RegExpParser::simple() { return simple_; }

113	111

114 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {	112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {

115 switch (c) {	113 switch (c) {

116 case '^':	114 case '^':

117 case '$':	115 case '$':

118 case '\\':	116 case '\\':

(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
322 } else if (Next() == '!') {	320 } else if (Next() == '!') {

323 subexpr_type = NEGATIVE_LOOKAROUND;	321 subexpr_type = NEGATIVE_LOOKAROUND;

324 lookaround_type = RegExpLookaround::LOOKBEHIND;	322 lookaround_type = RegExpLookaround::LOOKBEHIND;

325 Advance(2);	323 Advance(2);

326 break;	324 break;

327 }	325 }

328 }	326 }

329 if (FLAG_harmony_regexp_named_captures) {	327 if (FLAG_harmony_regexp_named_captures) {

330 has_named_captures_ = true;	328 has_named_captures_ = true;

331 is_named_capture = true;	329 is_named_capture = true;

332 Advance();

333 break;	330 break;

334 }	331 }

335 // Fall through.	332 // Fall through.

336 default:	333 default:

337 return ReportError(CStrVector("Invalid group"));	334 return ReportError(CStrVector("Invalid group"));

338 }	335 }

339 }	336 }

340	337

341 const ZoneVector<uc16>* capture_name = nullptr;	338 const ZoneVector<uc16>* capture_name = nullptr;

342 if (subexpr_type == CAPTURE) {	339 if (subexpr_type == CAPTURE) {

(...skipping 419 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
762 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {	759 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

763 v->push_back(code_unit);	760 v->push_back(code_unit);

764 } else {	761 } else {

765 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));	762 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

766 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));	763 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

767 }	764 }

768 }	765 }

769	766

770 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {	767 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

771 DCHECK(FLAG_harmony_regexp_named_captures);	768 DCHECK(FLAG_harmony_regexp_named_captures);

	769 DCHECK_EQ(current(), '<');

772	770

773 ZoneVector<uc16>* name =	771 ZoneVector<uc16>* name =

774 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());	772 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

775	773

	774 // Capture names can always contain surrogate pairs, and we need to scan

	775 // accordingly.

	776 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;

	777 Advance(scan_mode);

	778

776 bool at_start = true;	779 bool at_start = true;

777 while (true) {	780 while (true) {

778 uc32 c = current();	781 uc32 c = current();

779 Advance();	782 Advance(scan_mode);

780	783

781 // Convert unicode escapes.	784 // Convert unicode escapes.

782 if (c == '\\' && current() == 'u') {	785 if (c == '\\' && current() == 'u') {

783 // TODO(jgruber): Reconsider this once the spec has settled.	786 // TODO(jgruber): Reconsider this once the spec has settled.

784 // https://github.com/tc39/proposal-regexp-named-groups/issues/23	787 // https://github.com/tc39/proposal-regexp-named-groups/issues/23

785 Advance();	788 Advance(scan_mode);

786 if (!ParseUnicodeEscape(&c)) {	789 if (!ParseUnicodeEscape(&c)) {

787 ReportError(CStrVector("Invalid Unicode escape sequence"));	790 ReportError(CStrVector("Invalid Unicode escape sequence"));

788 return nullptr;	791 return nullptr;

789 }	792 }

790 }	793 }

791	794

792 // The backslash char is misclassified as both ID_Start and ID_Continue.	795 // The backslash char is misclassified as both ID_Start and ID_Continue.

793 if (c == '\\') {	796 if (c == '\\') {

794 ReportError(CStrVector("Invalid capture group name"));	797 ReportError(CStrVector("Invalid capture group name"));

795 return nullptr;	798 return nullptr;

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
846 }	849 }

847	850

848 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,	851 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,

849 RegExpParserState* state) {	852 RegExpParserState* state) {

850 // The parser is assumed to be on the '<' in \k<name>.	853 // The parser is assumed to be on the '<' in \k<name>.

851 if (current() != '<') {	854 if (current() != '<') {

852 ReportError(CStrVector("Invalid named reference"));	855 ReportError(CStrVector("Invalid named reference"));

853 return false;	856 return false;

854 }	857 }

855	858

856 Advance();

857 const ZoneVector<uc16>* name = ParseCaptureGroupName();	859 const ZoneVector<uc16>* name = ParseCaptureGroupName();

858 if (name == nullptr) {	860 if (name == nullptr) {

859 return false;	861 return false;

860 }	862 }

861	863

862 if (state->IsInsideCaptureGroup(name)) {	864 if (state->IsInsideCaptureGroup(name)) {

863 builder->AddEmpty();	865 builder->AddEmpty();

864 } else {	866 } else {

865 RegExpBackReference* atom = new (zone()) RegExpBackReference();	867 RegExpBackReference* atom = new (zone()) RegExpBackReference();

866 atom->set_name(name);	868 atom->set_name(name);

(...skipping 1003 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1870 return false;	1872 return false;

1871 }	1873 }

1872 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1874 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1873 zone());	1875 zone());

1874 LAST(ADD_TERM);	1876 LAST(ADD_TERM);

1875 return true;	1877 return true;

1876 }	1878 }

1877	1879

1878 } // namespace internal	1880 } // namespace internal

1879 } // namespace v8	1881 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »