src/regexp/regexp-parser.cc - Issue 2050343002: [regexp] Experimental support for regexp named captures

Unified Diff: src/regexp/regexp-parser.cc

Issue 2050343002: [regexp] Experimental support for regexp named captures (Closed). Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: "Proper fixed array cast" (created 4 years, 6 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index bdfa13f719e09416f4af414d696100d2e6101690..5c9be75d707ea448d6195b5662347d0ea9c0be7f 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -25,6 +25,9 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

zone_(zone),

error_(error),

captures_(NULL),

+ named_captures_(NULL),

+ named_back_references_(NULL),

+ capture_strings_(0, zone),

in_(in),

current_(kEndMarker),

ignore_case_(flags & JSRegExp::kIgnoreCase),

@@ -149,6 +152,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

// Disjunction

RegExpTree* RegExpParser::ParsePattern() {

RegExpTree* result = ParseDisjunction(CHECK_FAILED);

+ PatchNamedBackReferences(CHECK_FAILED);

DCHECK(!has_more());

// If the result of parsing is a literal string atom, and it has the

// same length as the input, then the atom is identical to the input.

@@ -268,29 +272,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {

case '(': {

SubexpressionType subexpr_type = CAPTURE;

RegExpLookaround::Type lookaround_type = state->lookaround_type();

+ bool is_named_capture = false;

Advance();

if (current() == '?') {

switch (Next()) {

case ':':

subexpr_type = GROUPING;

+ Advance(2);

break;

case '=':

lookaround_type = RegExpLookaround::LOOKAHEAD;

subexpr_type = POSITIVE_LOOKAROUND;

+ Advance(2);

break;

case '!':

lookaround_type = RegExpLookaround::LOOKAHEAD;

subexpr_type = NEGATIVE_LOOKAROUND;

+ Advance(2);

break;

case '<':

- if (FLAG_harmony_regexp_lookbehind) {

+ if (FLAG_harmony_regexp_lookbehind ||

+ FLAG_harmony_regexp_named_captures) {

Advance();

- lookaround_type = RegExpLookaround::LOOKBEHIND;

- if (Next() == '=') {

- subexpr_type = POSITIVE_LOOKAROUND;

- break;

- } else if (Next() == '!') {

- subexpr_type = NEGATIVE_LOOKAROUND;

+ if (FLAG_harmony_regexp_lookbehind) {

+ if (Next() == '=') {

+ subexpr_type = POSITIVE_LOOKAROUND;

+ lookaround_type = RegExpLookaround::LOOKBEHIND;

+ Advance(2);

+ break;

+ } else if (Next() == '!') {

+ subexpr_type = NEGATIVE_LOOKAROUND;

+ lookaround_type = RegExpLookaround::LOOKBEHIND;

+ Advance(2);

+ break;

+ }

+ if (FLAG_harmony_regexp_named_captures && unicode()) {

+ is_named_capture = true;

+ Advance();

break;

}

@@ -298,12 +317,18 @@ RegExpTree* RegExpParser::ParseDisjunction() {

default:

return ReportError(CStrVector("Invalid group"));

}

- Advance(2);

- } else {

+ }

+ if (subexpr_type == CAPTURE) {

if (captures_started_ >= kMaxCaptures) {

return ReportError(CStrVector("Too many captures"));

}

captures_started_++;

+ if (is_named_capture) {

+ Vector<const uc16> name = ParseCaptureGroupName(CHECK_FAILED);

+ CreateNamedCaptureAtIndex(name, captures_started_ CHECK_FAILED);

+ }

}

// Store current state and begin new disjunction parsing.

state = new (zone()) RegExpParserState(

@@ -497,6 +522,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {

}

break;

}

+ case 'k':

+ if (FLAG_harmony_regexp_named_captures && unicode()) {

+ Advance(2);

+ ParseNamedBackReference(builder, state CHECK_FAILED);

+ break;

+ }

+ // FALLTHROUGH

default:

Advance();

// With /u, no identity escapes except for syntax characters

@@ -675,6 +707,206 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

return true;

}

+class CaptureNameBuffer {

+ public:

+ CaptureNameBuffer() : position_(0), backing_store_() {}

+ INLINE(void AddChar(uint32_t code_unit)) {

+ if (position_ >= backing_store_.length()) ExpandBuffer();

+ if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

+ *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;

+ position_ += kUC16Size;

+ } else {

+ *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =

+ unibrow::Utf16::LeadSurrogate(code_unit);

+ position_ += kUC16Size;

+ if (position_ >= backing_store_.length()) ExpandBuffer();

+ *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =

+ unibrow::Utf16::TrailSurrogate(code_unit);

+ position_ += kUC16Size;

+ }

+ Vector<const uc16> two_byte_literal() const {

+ DCHECK((position_ & 0x1) == 0);

+ return Vector<const uc16>(reinterpret_cast<uc16*>(backing_store_.start()),

+ position_ >> 1);

+ }

+ void Dispose() { backing_store_.Dispose(); }

+ private:

+ static const int kInitialCapacity = 16;

+ static const int kGrowthFactor = 4;

+ static const int kMaxGrowth = 1 * MB;

+ inline int NewCapacity(int min_capacity) {

+ int capacity = Max(min_capacity, backing_store_.length());

+ int new_capacity = Min(capacity * kGrowthFactor, capacity + kMaxGrowth);

+ return new_capacity;

+ }

+ void ExpandBuffer() {

+ Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));

+ MemCopy(new_store.start(), backing_store_.start(), position_);

+ backing_store_.Dispose();

+ backing_store_ = new_store;

+ }

+ int position_;

+ Vector<byte> backing_store_;

Yang (2016/06/10 15:43:27): Can't we simply use a ZoneList here? It already ha… [comment truncated in this view]

jgruber (2016/06/13 08:26:00): Right. I switched the CL to my ZoneVector version.

+ DISALLOW_COPY_AND_ASSIGN(CaptureNameBuffer);

+};

+Vector<const uc16> RegExpParser::ParseCaptureGroupName() {

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ CaptureNameBuffer buf;

+ bool at_start = true;

+ while (true) {

+ uc32 c = current();

+ Advance();

+ // Convert unicode escapes.

+ if (c == '\\' && current() == 'u') {

+ Advance();

+ if (!ParseUnicodeEscape(&c)) {

+ buf.Dispose();

+ ReportError(CStrVector("Invalid Unicode escape sequence"));

+ return Vector<const uc16>::empty();

+ }

+ if (at_start) {

+ if (!IdentifierStart::Is(c)) {

+ buf.Dispose();

+ ReportError(CStrVector("Invalid capture group name"));

+ return Vector<const uc16>::empty();

+ }

+ buf.AddChar(c);

+ at_start = false;

+ } else {

+ if (c == '>') {

+ break;

+ } else if (IdentifierPart::Is(c)) {

+ buf.AddChar(c);

+ } else {

+ buf.Dispose();

+ ReportError(CStrVector("Invalid capture group name"));

+ return Vector<const uc16>::empty();

+ }

+ Vector<const uc16> name = buf.two_byte_literal();

+ capture_strings_.Add(name, zone());

+ return name;

+bool RegExpParser::CreateNamedCaptureAtIndex(Vector<const uc16> name,

+ int index) {

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ DCHECK(!name.is_empty());

+ DCHECK(0 < index && index <= captures_started_);

+ if (named_captures_ == nullptr) {

+ named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());

+ } else {

+ // Check for duplicates and bail if we find any.

+ for (int i = 0; i < named_captures_->length(); i++) {

+ if (named_captures_->at(i)->name() == name) {

+ ReportError(CStrVector("Duplicate capture group name"));

+ return false;

+ }

+ RegExpCapture* capture = GetCapture(index);

+ DCHECK(capture->name().is_empty());

+ capture->set_name(name);

+ named_captures_->Add(capture, zone());

+ return true;

+bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,

+ RegExpParserState* state) {

+ // The parser is assumed to be on the '<' in \k<name>.

+ if (current() != '<') {

+ ReportError(CStrVector("Invalid named reference"));

+ return false;

+ }

+ Advance();

+ Vector<const uc16> name = ParseCaptureGroupName();

+ if (name.is_empty()) {

+ return false;

+ }

+ const int index = LookupCaptureGroupIndex(name);

+ if (index != -1 && state->IsInsideCaptureGroup(index)) {

+ builder->AddEmpty();

+ } else {

+ RegExpBackReference* atom = new (zone()) RegExpBackReference();

+ atom->set_name(name);

+ builder->AddAtom(atom);

+ if (named_back_references_ == nullptr) {

+ named_back_references_ =

+ new (zone()) ZoneList<RegExpBackReference*>(1, zone());

+ }

+ named_back_references_->Add(atom, zone());

+ }

+ return true;

+void RegExpParser::PatchNamedBackReferences() {

+ if (named_back_references_ == nullptr) return;

+ if (named_captures_ == nullptr) {

+ ReportError(CStrVector("Invalid named capture referenced"));

+ return;

+ }

+ // Look up and patch the actual capture for each named back reference.

+ // TODO(jgruber): O(n^2), optimize if necessary.

+ for (int i = 0; i < named_back_references_->length(); i++) {

+ RegExpBackReference* ref = named_back_references_->at(i);

+ int index = LookupCaptureGroupIndex(ref->name());

+ if (index == -1) {

+ ReportError(CStrVector("Invalid named capture referenced"));

+ return;

+ }

+ ref->set_capture(GetCapture(index));

+ }

+int RegExpParser::LookupCaptureGroupIndex(Vector<const uc16> name) {

+ DCHECK(FLAG_harmony_regexp_named_captures);

+ DCHECK(unicode());

+ DCHECK(!name.is_empty());

+ // Attempt an initial lookup.

+ if (named_captures_ == nullptr) {

+ return -1;

+ }

+ for (int i = 0; i < named_captures_->length(); i++) {

+ RegExpCapture* capture = named_captures_->at(i);

+ if (capture->name() == name) {

+ return capture->index();

+ }

+ return -1;

RegExpCapture* RegExpParser::GetCapture(int index) {

// The index for the capture groups are one-based. Its index in the list is

@@ -691,6 +923,30 @@ RegExpCapture* RegExpParser::GetCapture(int index) {

return captures_->at(index - 1);

}

+Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {

+ if (named_captures_ == nullptr || named_captures_->is_empty())

+ return Handle<FixedArray>();

+ int len = named_captures_->length() * 2;

+ Handle<FixedArray> array = isolate()->factory()->NewFixedArray(len);

+ for (int i = 0; i < named_captures_->length(); i++) {

+ RegExpCapture* capture = named_captures_->at(i);

+ MaybeHandle<String> name =

+ isolate()->factory()->NewStringFromTwoByte(capture->name());

+ array->set(i * 2, *name.ToHandleChecked());

+ array->set(i * 2 + 1, Smi::FromInt(capture->index()));

+ }

+ return array;

+void RegExpParser::FreeCaptureStrings() {

+ for (int i = 0; i < capture_strings_.length(); i++) {

+ capture_strings_[i].Dispose();

+ }

+ capture_strings_.Clear();

bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

@@ -1268,8 +1524,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

int capture_count = parser.captures_started();

result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

result->contains_anchor = parser.contains_anchor();

+ result->capture_name_map = parser.CreateCaptureNameMap();

result->capture_count = capture_count;

}

+ parser.FreeCaptureStrings();

return !parser.failed();

}

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/runtime/runtime.h » ('j') | no next file with comments »