Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(830)

Unified Diff: src/regexp/regexp-parser.cc

Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 718ec882e3dbc7d1d081f04319f93e9cabb49e55..68c5a6d2633cd31ee8ee17c0487ec87ea44d7b7d 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -40,6 +40,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
+ has_named_captures_(false),
failed_(false) {
DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
Advance();
@@ -325,7 +326,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break;
}
}
- if (FLAG_harmony_regexp_named_captures && unicode()) {
+ if (FLAG_harmony_regexp_named_captures) {
+ has_named_captures_ = true;
is_named_capture = true;
Advance();
break;
@@ -541,7 +543,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break;
}
case 'k':
- if (FLAG_harmony_regexp_named_captures && unicode()) {
+ // Either an identity escape or a named back-reference. The two
+ // interpretations are mutually exclusive: '\k' is interpreted as
+ // an identity escape for non-unicode patterns without named
+ // capture groups, and as the beginning of a named back-reference
+ // in all other cases.
+ if (FLAG_harmony_regexp_named_captures &&
+ (unicode() || HasNamedCaptures())) {
Advance(2);
ParseNamedBackReference(builder, state CHECK_FAILED);
break;
@@ -657,6 +665,8 @@ static bool IsSpecialClassEscape(uc32 c) {
// noncapturing parentheses and can skip character classes and backslash-escaped
// characters.
void RegExpParser::ScanForCaptures() {
+ DCHECK(!is_scanned_for_captures_);
+ const int saved_position = position();
// Start with captures started previous to current position
int capture_count = captures_started();
// Add count of captures after this position.
@@ -692,11 +702,19 @@ void RegExpParser::ScanForCaptures() {
Advance();
if (current() != '<') break;
- // TODO(jgruber): To be more future-proof we could test for
- // IdentifierStart here once it becomes clear whether group names
- // allow unicode escapes.
- Advance();
- if (current() == '=' || current() == '!') break;
+ if (FLAG_harmony_regexp_lookbehind) {
+ // TODO(jgruber): To be more future-proof we could test for
+ // IdentifierStart here once it becomes clear whether group names
+ // allow unicode escapes.
+ // https://github.com/tc39/proposal-regexp-named-groups/issues/23
+ Advance();
+ if (current() == '=' || current() == '!') break;
+ }
+
+ // Found a possible named capture. It could turn out to be a syntax
+ // error (e.g. an unterminated or invalid name), but that distinction
+ // does not matter for our purposes.
+ has_named_captures_ = true;
}
capture_count++;
break;
@@ -704,6 +722,7 @@ void RegExpParser::ScanForCaptures() {
}
capture_count_ = capture_count;
is_scanned_for_captures_ = true;
+ Reset(saved_position);
}
@@ -729,11 +748,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
}
}
if (value > captures_started()) {
- if (!is_scanned_for_captures_) {
- int saved_position = position();
- ScanForCaptures();
- Reset(saved_position);
- }
+ if (!is_scanned_for_captures_) ScanForCaptures();
if (value > capture_count_) {
Reset(start);
return false;
@@ -754,7 +769,6 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
DCHECK(FLAG_harmony_regexp_named_captures);
- DCHECK(unicode());
ZoneVector<uc16>* name =
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
@@ -766,6 +780,8 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
// Convert unicode escapes.
if (c == '\\' && current() == 'u') {
+ // TODO(jgruber): Reconsider this once the spec has settled.
+ // https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance();
if (!ParseUnicodeEscape(&c)) {
ReportError(CStrVector("Invalid Unicode escape sequence"));
@@ -798,7 +814,6 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
int index) {
DCHECK(FLAG_harmony_regexp_named_captures);
- DCHECK(unicode());
DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name);
@@ -806,6 +821,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
} else {
// Check for duplicates and bail if we find any.
+ // TODO(jgruber): O(n^2).
for (const auto& named_capture : *named_captures_) {
if (*named_capture->name() == *name) {
ReportError(CStrVector("Duplicate capture group name"));
@@ -920,6 +936,16 @@ Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
return array;
}
+bool RegExpParser::HasNamedCaptures() {  // True if the pattern has a possible named capture; scans ahead at most once.
+ if (has_named_captures_ || is_scanned_for_captures_) {  // Flag already set, or the scan already ran: reuse the stored answer.
+ return has_named_captures_;
+ }
+
+ ScanForCaptures();  // Records possible named groups in has_named_captures_, then restores the parse position.
+ DCHECK(is_scanned_for_captures_);  // The scan marks itself as done before returning.
+ return has_named_captures_;
+}
+
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue;

Powered by Google App Engine
This is Rietveld 408576698