src/regexp/regexp-parser.cc - Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns

Unified Diff: src/regexp/regexp-parser.cc

Issue 2788873002: [regexp] Allow named captures and back-references in non-unicode patterns (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 718ec882e3dbc7d1d081f04319f93e9cabb49e55..68c5a6d2633cd31ee8ee17c0487ec87ea44d7b7d 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -40,6 +40,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

simple_(false),

contains_anchor_(false),

is_scanned_for_captures_(false),

+ has_named_captures_(false),

failed_(false) {

DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);

Advance();

@@ -325,7 +326,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {

break;

}

- if (FLAG_harmony_regexp_named_captures && unicode()) {

+ if (FLAG_harmony_regexp_named_captures) {

+ has_named_captures_ = true;

is_named_capture = true;

Advance();

break;

@@ -541,7 +543,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {

break;

}

case 'k':

- if (FLAG_harmony_regexp_named_captures && unicode()) {

+ // Either an identity escape or a named back-reference. The two

+ // interpretations are mutually exclusive: '\k' is interpreted as

+ // an identity escape for non-unicode patterns without named

+ // capture groups, and as the beginning of a named back-reference

+ // in all other cases.

+ if (FLAG_harmony_regexp_named_captures &&

+ (unicode() || HasNamedCaptures())) {

Advance(2);

ParseNamedBackReference(builder, state CHECK_FAILED);

break;

@@ -657,6 +665,8 @@ static bool IsSpecialClassEscape(uc32 c) {

// noncapturing parentheses and can skip character classes and backslash-escaped

// characters.

void RegExpParser::ScanForCaptures() {

+ DCHECK(!is_scanned_for_captures_);

+ const int saved_position = position();

// Start with captures started previous to current position

int capture_count = captures_started();

// Add count of captures after this position.

@@ -692,11 +702,19 @@ void RegExpParser::ScanForCaptures() {

Advance();

if (current() != '<') break;

- // TODO(jgruber): To be more future-proof we could test for

- // IdentifierStart here once it becomes clear whether group names

- // allow unicode escapes.

- Advance();

- if (current() == '=' || current() == '!') break;

+ if (FLAG_harmony_regexp_lookbehind) {

+ // TODO(jgruber): To be more future-proof we could test for

+ // IdentifierStart here once it becomes clear whether group names

+ // allow unicode escapes.

+ // https://github.com/tc39/proposal-regexp-named-groups/issues/23

+ Advance();

+ if (current() == '=' || current() == '!') break;

+ }

+ // Found a possible named capture. It could turn out to be a syntax

+ // error (e.g. an unterminated or invalid name), but that distinction

+ // does not matter for our purposes.

+ has_named_captures_ = true;

}

capture_count++;

break;

@@ -704,6 +722,7 @@ void RegExpParser::ScanForCaptures() {

}

capture_count_ = capture_count;

is_scanned_for_captures_ = true;

+ Reset(saved_position);

}

@@ -729,11 +748,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

}

if (value > captures_started()) {

- if (!is_scanned_for_captures_) {

- int saved_position = position();

- ScanForCaptures();

- Reset(saved_position);

- }

+ if (!is_scanned_for_captures_) ScanForCaptures();

if (value > capture_count_) {

Reset(start);

return false;

@@ -754,7 +769,6 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {

const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

DCHECK(FLAG_harmony_regexp_named_captures);

- DCHECK(unicode());

ZoneVector<uc16>* name =

new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

@@ -766,6 +780,8 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

// Convert unicode escapes.

if (c == '\\' && current() == 'u') {

+ // TODO(jgruber): Reconsider this once the spec has settled.

+ // https://github.com/tc39/proposal-regexp-named-groups/issues/23

Advance();

if (!ParseUnicodeEscape(&c)) {

ReportError(CStrVector("Invalid Unicode escape sequence"));

@@ -798,7 +814,6 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,

int index) {

DCHECK(FLAG_harmony_regexp_named_captures);

- DCHECK(unicode());

DCHECK(0 < index && index <= captures_started_);

DCHECK_NOT_NULL(name);

@@ -806,6 +821,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,

named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());

} else {

// Check for duplicates and bail if we find any.

+ // TODO(jgruber): O(n^2).

for (const auto& named_capture : *named_captures_) {

if (*named_capture->name() == *name) {

ReportError(CStrVector("Duplicate capture group name"));

@@ -920,6 +936,16 @@ Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {

return array;

}

+bool RegExpParser::HasNamedCaptures() {

+ if (has_named_captures_ || is_scanned_for_captures_) {

+ return has_named_captures_;

+ }

+ ScanForCaptures();

+ DCHECK(is_scanned_for_captures_);

+ return has_named_captures_;

+}

bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

if (s->group_type() != CAPTURE) continue;

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-named-captures.js » ('j') | test/mjsunit/harmony/regexp-named-captures.js » ('J')