Chromium Code Reviews| Index: src/regexp/regexp-parser.cc |
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
| index bdfa13f719e09416f4af414d696100d2e6101690..2bac26020644bb8bc5949930e2d456c06b2f2704 100644 |
| --- a/src/regexp/regexp-parser.cc |
| +++ b/src/regexp/regexp-parser.cc |
| @@ -25,6 +25,9 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
| zone_(zone), |
| error_(error), |
| captures_(NULL), |
| + named_captures_(NULL), |
| + named_back_references_(NULL), |
| + capture_strings_(0, zone), |
| in_(in), |
| current_(kEndMarker), |
| ignore_case_(flags & JSRegExp::kIgnoreCase), |
| @@ -149,6 +152,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) { |
| // Disjunction |
| RegExpTree* RegExpParser::ParsePattern() { |
| RegExpTree* result = ParseDisjunction(CHECK_FAILED); |
| + PatchNamedBackReferences(CHECK_FAILED); |
| DCHECK(!has_more()); |
| // If the result of parsing is a literal string atom, and it has the |
| // same length as the input, then the atom is identical to the input. |
| @@ -268,29 +272,44 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| case '(': { |
| SubexpressionType subexpr_type = CAPTURE; |
| RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| + bool is_named_capture = false; |
| Advance(); |
| if (current() == '?') { |
| switch (Next()) { |
| case ':': |
| subexpr_type = GROUPING; |
| + Advance(2); |
| break; |
| case '=': |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| + Advance(2); |
| break; |
| case '!': |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| + Advance(2); |
| break; |
| case '<': |
| - if (FLAG_harmony_regexp_lookbehind) { |
| + if (FLAG_harmony_regexp_lookbehind || |
| + FLAG_harmony_regexp_named_captures) { |
|
Yang
2016/06/13 10:54:52
I don't think this check is still necessary. We ca
jgruber
2016/06/13 13:10:00
Done.
|
| Advance(); |
| - lookaround_type = RegExpLookaround::LOOKBEHIND; |
| - if (Next() == '=') { |
| - subexpr_type = POSITIVE_LOOKAROUND; |
| - break; |
| - } else if (Next() == '!') { |
| - subexpr_type = NEGATIVE_LOOKAROUND; |
| + if (FLAG_harmony_regexp_lookbehind) { |
| + if (Next() == '=') { |
| + subexpr_type = POSITIVE_LOOKAROUND; |
| + lookaround_type = RegExpLookaround::LOOKBEHIND; |
| + Advance(2); |
| + break; |
| + } else if (Next() == '!') { |
| + subexpr_type = NEGATIVE_LOOKAROUND; |
| + lookaround_type = RegExpLookaround::LOOKBEHIND; |
| + Advance(2); |
| + break; |
| + } |
| + } |
| + if (FLAG_harmony_regexp_named_captures && unicode()) { |
| + is_named_capture = true; |
| + Advance(); |
| break; |
| } |
| } |
| @@ -298,12 +317,18 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| default: |
| return ReportError(CStrVector("Invalid group")); |
| } |
| - Advance(2); |
| - } else { |
| + } |
| + |
| + if (subexpr_type == CAPTURE) { |
| if (captures_started_ >= kMaxCaptures) { |
| return ReportError(CStrVector("Too many captures")); |
| } |
| captures_started_++; |
| + |
| + if (is_named_capture) { |
| + const ZoneVector<uc16>* name = ParseCaptureGroupName(CHECK_FAILED); |
| + CreateNamedCaptureAtIndex(name, captures_started_ CHECK_FAILED); |
|
Yang
2016/06/13 10:54:52
Can we simply attach the name to the parser state
jgruber
2016/06/13 13:10:00
Done.
|
| + } |
| } |
| // Store current state and begin new disjunction parsing. |
| state = new (zone()) RegExpParserState( |
| @@ -497,6 +522,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| } |
| break; |
| } |
| + case 'k': |
| + if (FLAG_harmony_regexp_named_captures && unicode()) { |
| + Advance(2); |
| + ParseNamedBackReference(builder, state CHECK_FAILED); |
| + break; |
| + } |
| + // FALLTHROUGH |
|
Yang
2016/06/13 10:54:53
I don't think we need all caps here :)
Above we ha
jgruber
2016/06/13 13:10:00
Looking at the rest of the file, I think we have e
|
| default: |
| Advance(); |
| // With /u, no identity escapes except for syntax characters |
| @@ -675,6 +707,176 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) { |
| return true; |
| } |
| +class CaptureNameBuffer { |
| + public: |
| + explicit CaptureNameBuffer(Zone* zone) |
| + : backing_store_(nullptr), zone_(zone) {} |
| + |
| + INLINE(void AddChar(uint32_t code_unit)) { |
| + if (backing_store_ == nullptr) { |
| + backing_store_ = |
| + new (zone_->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone_); |
| + } |
| + if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| + backing_store_->push_back(code_unit); |
| + } else { |
| + backing_store_->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| + backing_store_->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| + } |
| + } |
| + |
| + const ZoneVector<uc16>* two_byte_literal() { return backing_store_; } |
| + |
| + private: |
| + ZoneVector<uc16>* backing_store_; |
|
Yang
2016/06/13 10:54:52
Let's make this a non-dynamic member, like Bytecod
jgruber
2016/06/13 13:10:00
Done.
|
| + Zone* zone_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(CaptureNameBuffer); |
| +}; |
| + |
| +const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
| + DCHECK(FLAG_harmony_regexp_named_captures); |
| + DCHECK(unicode()); |
| + |
| + CaptureNameBuffer buf(zone()); |
| + bool at_start = true; |
| + while (true) { |
| + uc32 c = current(); |
| + Advance(); |
| + |
| + // Convert unicode escapes. |
| + if (c == '\\' && current() == 'u') { |
| + Advance(); |
| + if (!ParseUnicodeEscape(&c)) { |
| + ReportError(CStrVector("Invalid Unicode escape sequence")); |
| + return nullptr; |
| + } |
| + } |
| + |
| + if (at_start) { |
| + if (!IdentifierStart::Is(c)) { |
| + ReportError(CStrVector("Invalid capture group name")); |
| + return nullptr; |
| + } |
| + buf.AddChar(c); |
| + at_start = false; |
| + } else { |
| + if (c == '>') { |
| + break; |
| + } else if (IdentifierPart::Is(c)) { |
| + buf.AddChar(c); |
| + } else { |
| + ReportError(CStrVector("Invalid capture group name")); |
| + return nullptr; |
| + } |
| + } |
| + } |
| + |
| + return buf.two_byte_literal(); |
| +} |
| + |
| +bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, |
| + int index) { |
| + DCHECK(FLAG_harmony_regexp_named_captures); |
| + DCHECK(unicode()); |
| + DCHECK(0 < index && index <= captures_started_); |
| + DCHECK_NOT_NULL(name); |
| + |
| + if (named_captures_ == nullptr) { |
| + named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); |
|
Yang
2016/06/13 10:54:52
Let's make named_captures_ a non-dynamic member of
jgruber
2016/06/13 13:10:00
Do you have an intuition about how much overhead i
Yang
2016/06/13 13:38:00
Not a lot. List takes 3 pointers, so dynamic alloc
jgruber
2016/06/14 07:53:12
Ok. I'll stick with dynamic lists for now, just to
|
| + } else { |
| + // Check for duplicates and bail if we find any. |
| + for (int i = 0; i < named_captures_->length(); i++) { |
|
Yang
2016/06/13 10:54:52
You can use C++11 syntax here.
for (const auto& n
jgruber
2016/06/13 13:10:00
Done.
|
| + if (*named_captures_->at(i)->name() == *name) { |
| + ReportError(CStrVector("Duplicate capture group name")); |
| + return false; |
| + } |
| + } |
| + } |
| + |
| + RegExpCapture* capture = GetCapture(index); |
| + DCHECK(capture->name() == nullptr); |
| + |
| + capture->set_name(name); |
| + named_captures_->Add(capture, zone()); |
| + |
| + return true; |
| +} |
| + |
| +bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
| + RegExpParserState* state) { |
| + // The parser is assumed to be on the '<' in \k<name>. |
| + if (current() != '<') { |
| + ReportError(CStrVector("Invalid named reference")); |
| + return false; |
| + } |
| + |
| + Advance(); |
| + const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
| + if (name == nullptr) { |
| + return false; |
| + } |
| + |
| + const int index = LookupCaptureGroupIndex(name); |
|
Yang
2016/06/13 10:54:53
Let's not do this twice, here and in PatchNamedBac
jgruber
2016/06/13 13:10:00
We needed the index here to determine whether to c
|
| + if (index != -1 && state->IsInsideCaptureGroup(index)) { |
| + builder->AddEmpty(); |
| + } else { |
| + RegExpBackReference* atom = new (zone()) RegExpBackReference(); |
| + atom->set_name(name); |
| + |
| + builder->AddAtom(atom); |
| + |
| + if (named_back_references_ == nullptr) { |
| + named_back_references_ = |
|
Yang
2016/06/13 10:54:53
Same here, let's make named_back_references_ a non
jgruber
2016/06/13 13:10:00
See above.
This change is trivial, just want to m
|
| + new (zone()) ZoneList<RegExpBackReference*>(1, zone()); |
| + } |
| + named_back_references_->Add(atom, zone()); |
| + } |
| + |
| + return true; |
| +} |
| + |
| +void RegExpParser::PatchNamedBackReferences() { |
| + if (named_back_references_ == nullptr) return; |
| + |
| + if (named_captures_ == nullptr) { |
| + ReportError(CStrVector("Invalid named capture referenced")); |
| + return; |
| + } |
| + |
| + // Look up and patch the actual capture for each named back reference. |
| + // TODO(jgruber): O(n^2), optimize if necessary. |
| + |
| + for (int i = 0; i < named_back_references_->length(); i++) { |
| + RegExpBackReference* ref = named_back_references_->at(i); |
| + int index = LookupCaptureGroupIndex(ref->name()); |
| + if (index == -1) { |
| + ReportError(CStrVector("Invalid named capture referenced")); |
| + return; |
| + } |
| + ref->set_capture(GetCapture(index)); |
| + } |
| +} |
| + |
| +int RegExpParser::LookupCaptureGroupIndex(const ZoneVector<uc16>* name) { |
|
Yang
2016/06/13 10:54:52
This can be inlined into PatchNamedBackReferences
jgruber
2016/06/13 13:10:00
Done.
|
| + DCHECK(FLAG_harmony_regexp_named_captures); |
| + DCHECK(unicode()); |
| + DCHECK_NOT_NULL(name); |
| + |
| + // Attempt an initial lookup. |
| + if (named_captures_ == nullptr) { |
| + return -1; |
| + } |
| + |
| + for (int i = 0; i < named_captures_->length(); i++) { |
| + RegExpCapture* capture = named_captures_->at(i); |
| + if (*capture->name() == *name) { |
| + return capture->index(); |
| + } |
| + } |
| + |
| + return -1; |
| +} |
| RegExpCapture* RegExpParser::GetCapture(int index) { |
| // The index for the capture groups are one-based. Its index in the list is |
| @@ -691,6 +893,32 @@ RegExpCapture* RegExpParser::GetCapture(int index) { |
| return captures_->at(index - 1); |
| } |
| +Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { |
| + if (named_captures_ == nullptr || named_captures_->is_empty()) |
| + return Handle<FixedArray>(); |
| + |
| + int len = named_captures_->length() * 2; |
| + Handle<FixedArray> array = isolate()->factory()->NewFixedArray(len); |
| + |
| + for (int i = 0; i < named_captures_->length(); i++) { |
| + RegExpCapture* capture = named_captures_->at(i); |
| + Vector<const uc16> vector(&(*capture->name())[0], |
|
Yang
2016/06/13 10:54:53
Could we use a ZoneList for capture->name() instea
jgruber
2016/06/13 13:10:00
I used ZoneVector because of mstarzinger's comment
Yang
2016/06/13 13:38:00
I guess adding ToConstVector to ZoneVector also wo
jgruber
2016/06/14 07:53:12
Done.
|
| + static_cast<int>(capture->name()->size())); |
| + MaybeHandle<String> name = |
| + isolate()->factory()->NewStringFromTwoByte(vector); |
| + array->set(i * 2, *name.ToHandleChecked()); |
| + array->set(i * 2 + 1, Smi::FromInt(capture->index())); |
| + } |
| + |
| + return array; |
| +} |
| + |
| +void RegExpParser::FreeCaptureStrings() { |
|
Yang
2016/06/13 10:54:52
Do we still need this and capture_strings_?
jgruber
2016/06/13 13:10:00
No. Thanks, good catch.
|
| + for (int i = 0; i < capture_strings_.length(); i++) { |
| + capture_strings_[i].Dispose(); |
| + } |
| + capture_strings_.Clear(); |
| +} |
| bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { |
| for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { |
| @@ -1135,7 +1363,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
| return CharacterRange::Singleton(first); |
| } |
| - |
| static const uc16 kNoCharClass = 0; |
| // Adds range or pre-defined character class to character ranges. |
| @@ -1268,8 +1495,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
| int capture_count = parser.captures_started(); |
| result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
| result->contains_anchor = parser.contains_anchor(); |
| + result->capture_name_map = parser.CreateCaptureNameMap(); |
| result->capture_count = capture_count; |
| } |
| + parser.FreeCaptureStrings(); |
| return !parser.failed(); |
| } |