src/regexp/regexp-parser.cc - Issue 2859933003: Revert of [regexp] Support unicode capture names in non-unicode patterns

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2859933003: Revert of [regexp] Support unicode capture names in non-unicode patterns (Closed)

Patch Set: Fixing test Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 28 matching lines...) Expand all Loading...
39 has_more_(true),	39 has_more_(true),

40 simple_(false),	40 simple_(false),

41 contains_anchor_(false),	41 contains_anchor_(false),

42 is_scanned_for_captures_(false),	42 is_scanned_for_captures_(false),

43 has_named_captures_(false),	43 has_named_captures_(false),

44 failed_(false) {	44 failed_(false) {

45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);	45 DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);

46 Advance();	46 Advance();

47 }	47 }

48	48

49 inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {	49 template <bool update_position>

	50 inline uc32 RegExpParser::ReadNext() {

50 int position = next_pos_;	51 int position = next_pos_;

51 uc32 c0 = in()->Get(position);	52 uc32 c0 = in()->Get(position);

52 position++;	53 position++;

53 const bool try_combine_surrogate_pairs =	54 // Read the whole surrogate pair in case of unicode flag, if possible.

54 (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);	55 if (unicode() && position < in()->length() &&

55 if (try_combine_surrogate_pairs && position < in()->length() &&

56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {	56 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

57 uc16 c1 = in()->Get(position);	57 uc16 c1 = in()->Get(position);

58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {	58 if (unibrow::Utf16::IsTrailSurrogate(c1)) {

59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);	59 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

60 position++;	60 position++;

61 }	61 }

62 }	62 }

63 if (update_position) next_pos_ = position;	63 if (update_position) next_pos_ = position;

64 return c0;	64 return c0;

65 }	65 }

66	66

67	67

68 uc32 RegExpParser::Next() {	68 uc32 RegExpParser::Next() {

69 if (has_next()) {	69 if (has_next()) {

70 return ReadNext(false, ScanMode::DEFAULT);	70 return ReadNext<false>();

71 } else {	71 } else {

72 return kEndMarker;	72 return kEndMarker;

73 }	73 }

74 }	74 }

75	75

76 void RegExpParser::Advance(ScanMode mode) {	76 void RegExpParser::Advance() {

77 if (has_next()) {	77 if (has_next()) {

78 StackLimitCheck check(isolate());	78 StackLimitCheck check(isolate());

79 if (check.HasOverflowed()) {	79 if (check.HasOverflowed()) {

80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");	80 if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");

81 ReportError(CStrVector(	81 ReportError(CStrVector(

82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));	82 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));

83 } else if (zone()->excess_allocation()) {	83 } else if (zone()->excess_allocation()) {

84 ReportError(CStrVector("Regular expression too large"));	84 ReportError(CStrVector("Regular expression too large"));

85 } else {	85 } else {

86 current_ = ReadNext(true, mode);	86 current_ = ReadNext<true>();

87 }	87 }

88 } else {	88 } else {

89 current_ = kEndMarker;	89 current_ = kEndMarker;

90 // Advance so that position() points to 1-after-the-last-character. This is	90 // Advance so that position() points to 1-after-the-last-character. This is

91 // important so that Reset() to this position works correctly.	91 // important so that Reset() to this position works correctly.

92 next_pos_ = in()->length() + 1;	92 next_pos_ = in()->length() + 1;

93 has_more_ = false;	93 has_more_ = false;

94 }	94 }

95 }	95 }

96	96

97	97

98 void RegExpParser::Reset(int pos) {	98 void RegExpParser::Reset(int pos) {

99 next_pos_ = pos;	99 next_pos_ = pos;

100 has_more_ = (pos < in()->length());	100 has_more_ = (pos < in()->length());

101 Advance();	101 Advance();

102 }	102 }

103	103

104 void RegExpParser::Advance(int dist, ScanMode mode) {	104 void RegExpParser::Advance(int dist) {

105 next_pos_ += dist - 1;	105 next_pos_ += dist - 1;

106 Advance(mode);	106 Advance();

107 }	107 }

108	108

109	109

110 bool RegExpParser::simple() { return simple_; }	110 bool RegExpParser::simple() { return simple_; }

111	111

112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {	112 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {

113 switch (c) {	113 switch (c) {

114 case '^':	114 case '^':

115 case '$':	115 case '$':

116 case '\\':	116 case '\\':

(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
319 } else if (Next() == '!') {	319 } else if (Next() == '!') {

320 subexpr_type = NEGATIVE_LOOKAROUND;	320 subexpr_type = NEGATIVE_LOOKAROUND;

321 lookaround_type = RegExpLookaround::LOOKBEHIND;	321 lookaround_type = RegExpLookaround::LOOKBEHIND;

322 Advance(2);	322 Advance(2);

323 break;	323 break;

324 }	324 }

325 }	325 }

326 if (FLAG_harmony_regexp_named_captures) {	326 if (FLAG_harmony_regexp_named_captures) {

327 has_named_captures_ = true;	327 has_named_captures_ = true;

328 is_named_capture = true;	328 is_named_capture = true;

	329 Advance();

329 break;	330 break;

330 }	331 }

331 // Fall through.	332 // Fall through.

332 default:	333 default:

333 return ReportError(CStrVector("Invalid group"));	334 return ReportError(CStrVector("Invalid group"));

334 }	335 }

335 }	336 }

336	337

337 const ZoneVector<uc16>* capture_name = nullptr;	338 const ZoneVector<uc16>* capture_name = nullptr;

338 if (subexpr_type == CAPTURE) {	339 if (subexpr_type == CAPTURE) {

(...skipping 415 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
754 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {	755 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

755 v->push_back(code_unit);	756 v->push_back(code_unit);

756 } else {	757 } else {

757 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));	758 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

758 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));	759 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

759 }	760 }

760 }	761 }

761	762

762 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {	763 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

763 DCHECK(FLAG_harmony_regexp_named_captures);	764 DCHECK(FLAG_harmony_regexp_named_captures);

764 DCHECK_EQ(current(), '<');

765	765

766 ZoneVector<uc16>* name =	766 ZoneVector<uc16>* name =

767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());	767 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

768	768

769 // Capture names can always contain surrogate pairs, and we need to scan

770 // accordingly.

771 const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;

772 Advance(scan_mode);

773

774 bool at_start = true;	769 bool at_start = true;

775 while (true) {	770 while (true) {

776 uc32 c = current();	771 uc32 c = current();

777 Advance(scan_mode);	772 Advance();

778	773

779 // Convert unicode escapes.	774 // Convert unicode escapes.

780 if (c == '\\' && current() == 'u') {	775 if (c == '\\' && current() == 'u') {

781 Advance(scan_mode);	776 Advance();

782 if (!ParseUnicodeEscape(&c)) {	777 if (!ParseUnicodeEscape(&c)) {

783 ReportError(CStrVector("Invalid Unicode escape sequence"));	778 ReportError(CStrVector("Invalid Unicode escape sequence"));

784 return nullptr;	779 return nullptr;

785 }	780 }

786 }	781 }

787	782

788 // The backslash char is misclassified as both ID_Start and ID_Continue.	783 // The backslash char is misclassified as both ID_Start and ID_Continue.

789 if (c == '\\') {	784 if (c == '\\') {

790 ReportError(CStrVector("Invalid capture group name"));	785 ReportError(CStrVector("Invalid capture group name"));

791 return nullptr;	786 return nullptr;

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
842 }	837 }

843	838

844 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,	839 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,

845 RegExpParserState* state) {	840 RegExpParserState* state) {

846 // The parser is assumed to be on the '<' in \k<name>.	841 // The parser is assumed to be on the '<' in \k<name>.

847 if (current() != '<') {	842 if (current() != '<') {

848 ReportError(CStrVector("Invalid named reference"));	843 ReportError(CStrVector("Invalid named reference"));

849 return false;	844 return false;

850 }	845 }

851	846

	847 Advance();

852 const ZoneVector<uc16>* name = ParseCaptureGroupName();	848 const ZoneVector<uc16>* name = ParseCaptureGroupName();

853 if (name == nullptr) {	849 if (name == nullptr) {

854 return false;	850 return false;

855 }	851 }

856	852

857 if (state->IsInsideCaptureGroup(name)) {	853 if (state->IsInsideCaptureGroup(name)) {

858 builder->AddEmpty();	854 builder->AddEmpty();

859 } else {	855 } else {

860 RegExpBackReference* atom = new (zone()) RegExpBackReference();	856 RegExpBackReference* atom = new (zone()) RegExpBackReference();

861 atom->set_name(name);	857 atom->set_name(name);

(...skipping 1072 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1934 return false;	1930 return false;

1935 }	1931 }

1936 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1932 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1937 zone());	1933 zone());

1938 LAST(ADD_TERM);	1934 LAST(ADD_TERM);

1939 return true;	1935 return true;

1940 }	1936 }

1941	1937

1942 } // namespace internal	1938 } // namespace internal

1943 } // namespace v8	1939 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | no next file with comments »