src/regexp/regexp-parser.cc - Issue 2050343002: [regexp] Experimental support for regexp named captures

Unified Diff: src/regexp/regexp-parser.cc

Issue 2050343002: [regexp] Experimental support for regexp named captures (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: static_cast<int> Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index bdfa13f719e09416f4af414d696100d2e6101690..2bac26020644bb8bc5949930e2d456c06b2f2704 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -25,6 +25,9 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

zone_(zone),

error_(error),

captures_(NULL),

+ named_captures_(NULL),

+ named_back_references_(NULL),

+ capture_strings_(0, zone),

in_(in),

current_(kEndMarker),

ignore_case_(flags & JSRegExp::kIgnoreCase),

@@ -149,6 +152,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

// Disjunction

RegExpTree* RegExpParser::ParsePattern() {

RegExpTree* result = ParseDisjunction(CHECK_FAILED);

+ PatchNamedBackReferences(CHECK_FAILED);

DCHECK(!has_more());

// If the result of parsing is a literal string atom, and it has the

// same length as the input, then the atom is identical to the input.

@@ -268,29 +272,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {

case '(': {

SubexpressionType subexpr_type = CAPTURE;

RegExpLookaround::Type lookaround_type = state->lookaround_type();

+ bool is_named_capture = false;

Advance();

if (current() == '?') {

switch (Next()) {

case ':':

subexpr_type = GROUPING;

+ Advance(2);

break;

case '=':

lookaround_type = RegExpLookaround::LOOKAHEAD;

subexpr_type = POSITIVE_LOOKAROUND;

+ Advance(2);

break;

case '!':

lookaround_type = RegExpLookaround::LOOKAHEAD;

subexpr_type = NEGATIVE_LOOKAROUND;

+ Advance(2);

break;

case '<':

- if (FLAG_harmony_regexp_lookbehind) {

+ if (FLAG_harmony_regexp_lookbehind ||

+ FLAG_harmony_regexp_named_captures) {

Yang 2016/06/13 10:54:52 I don't think this check is still necessary. We ca

jgruber 2016/06/13 13:10:00 Done.

Advance();

- lookaround_type = RegExpLookaround::LOOKBEHIND;

- if (Next() == '=') {

- subexpr_type = POSITIVE_LOOKAROUND;

- break;

- } else if (Next() == '!') {

- subexpr_type = NEGATIVE_LOOKAROUND;

+ if (FLAG_harmony_regexp_lookbehind) {

+ if (Next() == '=') {

+ subexpr_type = POSITIVE_LOOKAROUND;

+ lookaround_type = RegExpLookaround::LOOKBEHIND;

+ Advance(2);

+ break;

+ } else if (Next() == '!') {

+ subexpr_type = NEGATIVE_LOOKAROUND;

+ lookaround_type = RegExpLookaround::LOOKBEHIND;

+ Advance(2);

+ break;

+ }

+ if (FLAG_harmony_regexp_named_captures && unicode()) {

+ is_named_capture = true;

+ Advance();

break;

}

@@ -298,12 +317,18 @@ RegExpTree* RegExpParser::ParseDisjunction() {

default:

return ReportError(CStrVector("Invalid group"));

}

- Advance(2);

- } else {

+ }

+ if (subexpr_type == CAPTURE) {

if (captures_started_ >= kMaxCaptures) {

return ReportError(CStrVector("Too many captures"));

}

captures_started_++;

+ if (is_named_capture) {

+ const ZoneVector<uc16>* name = ParseCaptureGroupName(CHECK_FAILED);

+ CreateNamedCaptureAtIndex(name, captures_started_ CHECK_FAILED);

Yang 2016/06/13 10:54:52 Can we simply attach the name to the parser state

jgruber 2016/06/13 13:10:00 Done.

+ }

}

// Store current state and begin new disjunction parsing.

state = new (zone()) RegExpParserState(

@@ -497,6 +522,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {

}

break;

}

+ case 'k':

+ if (FLAG_harmony_regexp_named_captures && unicode()) {

+ Advance(2);

+ ParseNamedBackReference(builder, state CHECK_FAILED);

+ break;

+ }

+ // FALLTHROUGH

Yang 2016/06/13 10:54:53 I don't think we need all caps here :) Above we ha

jgruber 2016/06/13 13:10:00 Looking at the rest of the file, I think we have e

default:

Advance();

// With /u, no identity escapes except for syntax characters

@@ -675,6 +707,176 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

return true;

}

+class CaptureNameBuffer {

+ public:

+ explicit CaptureNameBuffer(Zone* zone)

+ : backing_store_(nullptr), zone_(zone) {}

+ INLINE(void AddChar(uint32_t code_unit)) {

+ if (backing_store_ == nullptr) {

+ backing_store_ =

+ new (zone_->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone_);

+ }

+ if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

+ backing_store_->push_back(code_unit);

+ } else {

+ backing_store_->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

+ backing_store_->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

+ }

+ const ZoneVector<uc16>* two_byte_literal() { return backing_store_; }

+ private:

+ ZoneVector<uc16>* backing_store_;

Yang 2016/06/13 10:54:52 Let's make this a non-dynamic member, like Bytecod

jgruber 2016/06/13 13:10:00 Done.

+ Zone* zone_;

+ DISALLOW_COPY_AND_ASSIGN(CaptureNameBuffer);

+};

+const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ CaptureNameBuffer buf(zone());

+ bool at_start = true;

+ while (true) {

+ uc32 c = current();

+ Advance();

+ // Convert unicode escapes.

+ if (c == '\\' && current() == 'u') {

+ Advance();

+ if (!ParseUnicodeEscape(&c)) {

+ ReportError(CStrVector("Invalid Unicode escape sequence"));

+ return nullptr;

+ }

+ if (at_start) {

+ if (!IdentifierStart::Is(c)) {

+ ReportError(CStrVector("Invalid capture group name"));

+ return nullptr;

+ }

+ buf.AddChar(c);

+ at_start = false;

+ } else {

+ if (c == '>') {

+ break;

+ } else if (IdentifierPart::Is(c)) {

+ buf.AddChar(c);

+ } else {

+ ReportError(CStrVector("Invalid capture group name"));

+ return nullptr;

+ }

+ return buf.two_byte_literal();

+bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,

+ int index) {

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ DCHECK(0 < index && index <= captures_started_);

+ DCHECK_NOT_NULL(name);

+ if (named_captures_ == nullptr) {

+ named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());

Yang 2016/06/13 10:54:52 Let's make named_captures_ a non-dynamic member of

jgruber 2016/06/13 13:10:00 Do you have an intuition about how much overhead i

Yang 2016/06/13 13:38:00 Not a lot. List takes 3 pointers, so dynamic alloc

jgruber 2016/06/14 07:53:12 Ok. I'll stick with dynamic lists for now, just to

+ } else {

+ // Check for duplicates and bail if we find any.

+ for (int i = 0; i < named_captures_->length(); i++) {

Yang 2016/06/13 10:54:52 You can use C++11 syntax here. for (const auto& n

jgruber 2016/06/13 13:10:00 Done.

+ if (*named_captures_->at(i)->name() == *name) {

+ ReportError(CStrVector("Duplicate capture group name"));

+ return false;

+ }

+ RegExpCapture* capture = GetCapture(index);

+ DCHECK(capture->name() == nullptr);

+ capture->set_name(name);

+ named_captures_->Add(capture, zone());

+ return true;

+bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,

+ RegExpParserState* state) {

+ // The parser is assumed to be on the '<' in \k<name>.

+ if (current() != '<') {

+ ReportError(CStrVector("Invalid named reference"));

+ return false;

+ }

+ Advance();

+ const ZoneVector<uc16>* name = ParseCaptureGroupName();

+ if (name == nullptr) {

+ return false;

+ }

+ const int index = LookupCaptureGroupIndex(name);

Yang 2016/06/13 10:54:53 Let's not do this twice, here and in PatchNamedBac

jgruber 2016/06/13 13:10:00 We needed the index here to determine whether to c

+ if (index != -1 && state->IsInsideCaptureGroup(index)) {

+ builder->AddEmpty();

+ } else {

+ RegExpBackReference* atom = new (zone()) RegExpBackReference();

+ atom->set_name(name);

+ builder->AddAtom(atom);

+ if (named_back_references_ == nullptr) {

+ named_back_references_ =

Yang 2016/06/13 10:54:53 Same here, let's make named_back_references_ a non

jgruber 2016/06/13 13:10:00 See above. This change is trivial, just want to m

+ new (zone()) ZoneList<RegExpBackReference*>(1, zone());

+ }

+ named_back_references_->Add(atom, zone());

+ }

+ return true;

+void RegExpParser::PatchNamedBackReferences() {

+ if (named_back_references_ == nullptr) return;

+ if (named_captures_ == nullptr) {

+ ReportError(CStrVector("Invalid named capture referenced"));

+ return;

+ }

+ // Look up and patch the actual capture for each named back reference.

+ // TODO(jgruber): O(n^2), optimize if necessary.

+ for (int i = 0; i < named_back_references_->length(); i++) {

+ RegExpBackReference* ref = named_back_references_->at(i);

+ int index = LookupCaptureGroupIndex(ref->name());

+ if (index == -1) {

+ ReportError(CStrVector("Invalid named capture referenced"));

+ return;

+ }

+ ref->set_capture(GetCapture(index));

+ }

+int RegExpParser::LookupCaptureGroupIndex(const ZoneVector<uc16>* name) {

Yang 2016/06/13 10:54:52 This can be inlined into PatchNamedBackReferences

jgruber 2016/06/13 13:10:00 Done.

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ DCHECK_NOT_NULL(name);

+ // Attempt an initial lookup.

+ if (named_captures_ == nullptr) {

+ return -1;

+ }

+ for (int i = 0; i < named_captures_->length(); i++) {

+ RegExpCapture* capture = named_captures_->at(i);

+ if (*capture->name() == *name) {

+ return capture->index();

+ }

+ return -1;

RegExpCapture* RegExpParser::GetCapture(int index) {

// The index for the capture groups are one-based. Its index in the list is

@@ -691,6 +893,32 @@ RegExpCapture* RegExpParser::GetCapture(int index) {

return captures_->at(index - 1);

}

+Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {

+ if (named_captures_ == nullptr || named_captures_->is_empty())

+ return Handle<FixedArray>();

+ int len = named_captures_->length() * 2;

+ Handle<FixedArray> array = isolate()->factory()->NewFixedArray(len);

+ for (int i = 0; i < named_captures_->length(); i++) {

+ RegExpCapture* capture = named_captures_->at(i);

+ Vector<const uc16> vector(&(*capture->name())[0],

Yang 2016/06/13 10:54:53 Could we use a ZoneList for capture->name() instea

jgruber 2016/06/13 13:10:00 I used ZoneVector because of mstarzinger's comment

Yang 2016/06/13 13:38:00 I guess adding ToConstVector to ZoneVector also wo

jgruber 2016/06/14 07:53:12 Done.

+ static_cast<int>(capture->name()->size()));

+ MaybeHandle<String> name =

+ isolate()->factory()->NewStringFromTwoByte(vector);

+ array->set(i * 2, *name.ToHandleChecked());

+ array->set(i * 2 + 1, Smi::FromInt(capture->index()));

+ }

+ return array;

+void RegExpParser::FreeCaptureStrings() {

Yang 2016/06/13 10:54:52 Do we still need this and capture_strings_?

jgruber 2016/06/13 13:10:00 No. Thanks, good catch.

+ for (int i = 0; i < capture_strings_.length(); i++) {

+ capture_strings_[i].Dispose();

+ }

+ capture_strings_.Clear();

bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

@@ -1135,7 +1363,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

return CharacterRange::Singleton(first);

}

static const uc16 kNoCharClass = 0;

// Adds range or pre-defined character class to character ranges.

@@ -1268,8 +1495,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

int capture_count = parser.captures_started();

result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

result->contains_anchor = parser.contains_anchor();

+ result->capture_name_map = parser.CreateCaptureNameMap();

result->capture_count = capture_count;

}

+ parser.FreeCaptureStrings();

return !parser.failed();

}

« src/regexp/jsregexp.cc ('K') | « src/regexp/regexp-parser.h ('k') | src/runtime/runtime.h » ('j') | src/runtime/runtime-regexp.cc » ('J')