Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1112)

Unified Diff: src/regexp/regexp-parser.cc

Issue 2050343002: [regexp] Experimental support for regexp named captures (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: static_cast<int> Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index bdfa13f719e09416f4af414d696100d2e6101690..2bac26020644bb8bc5949930e2d456c06b2f2704 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -25,6 +25,9 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
zone_(zone),
error_(error),
captures_(NULL),
+ named_captures_(NULL),
+ named_back_references_(NULL),
+ capture_strings_(0, zone),
in_(in),
current_(kEndMarker),
ignore_case_(flags & JSRegExp::kIgnoreCase),
@@ -149,6 +152,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
// Disjunction
RegExpTree* RegExpParser::ParsePattern() {
RegExpTree* result = ParseDisjunction(CHECK_FAILED);
+ PatchNamedBackReferences(CHECK_FAILED);
DCHECK(!has_more());
// If the result of parsing is a literal string atom, and it has the
// same length as the input, then the atom is identical to the input.
@@ -268,29 +272,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '(': {
SubexpressionType subexpr_type = CAPTURE;
RegExpLookaround::Type lookaround_type = state->lookaround_type();
+ bool is_named_capture = false;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
subexpr_type = GROUPING;
+ Advance(2);
break;
case '=':
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
+ Advance(2);
break;
case '!':
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
+ Advance(2);
break;
case '<':
- if (FLAG_harmony_regexp_lookbehind) {
+ if (FLAG_harmony_regexp_lookbehind ||
+ FLAG_harmony_regexp_named_captures) {
Yang 2016/06/13 10:54:52 I don't think this check is still necessary. We ca
jgruber 2016/06/13 13:10:00 Done.
Advance();
- lookaround_type = RegExpLookaround::LOOKBEHIND;
- if (Next() == '=') {
- subexpr_type = POSITIVE_LOOKAROUND;
- break;
- } else if (Next() == '!') {
- subexpr_type = NEGATIVE_LOOKAROUND;
+ if (FLAG_harmony_regexp_lookbehind) {
+ if (Next() == '=') {
+ subexpr_type = POSITIVE_LOOKAROUND;
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ Advance(2);
+ break;
+ } else if (Next() == '!') {
+ subexpr_type = NEGATIVE_LOOKAROUND;
+ lookaround_type = RegExpLookaround::LOOKBEHIND;
+ Advance(2);
+ break;
+ }
+ }
+ if (FLAG_harmony_regexp_named_captures && unicode()) {
+ is_named_capture = true;
+ Advance();
break;
}
}
@@ -298,12 +317,18 @@ RegExpTree* RegExpParser::ParseDisjunction() {
default:
return ReportError(CStrVector("Invalid group"));
}
- Advance(2);
- } else {
+ }
+
+ if (subexpr_type == CAPTURE) {
if (captures_started_ >= kMaxCaptures) {
return ReportError(CStrVector("Too many captures"));
}
captures_started_++;
+
+ if (is_named_capture) {
+ const ZoneVector<uc16>* name = ParseCaptureGroupName(CHECK_FAILED);
+ CreateNamedCaptureAtIndex(name, captures_started_ CHECK_FAILED);
Yang 2016/06/13 10:54:52 Can we simply attach the name to the parser state
jgruber 2016/06/13 13:10:00 Done.
+ }
}
// Store current state and begin new disjunction parsing.
state = new (zone()) RegExpParserState(
@@ -497,6 +522,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
break;
}
+ case 'k':
+ if (FLAG_harmony_regexp_named_captures && unicode()) {
+ Advance(2);
+ ParseNamedBackReference(builder, state CHECK_FAILED);
+ break;
+ }
+ // FALLTHROUGH
Yang 2016/06/13 10:54:53 I don't think we need all caps here :) Above we ha
jgruber 2016/06/13 13:10:00 Looking at the rest of the file, I think we have e
default:
Advance();
// With /u, no identity escapes except for syntax characters
@@ -675,6 +707,176 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
return true;
}
+class CaptureNameBuffer {
+ public:
+ explicit CaptureNameBuffer(Zone* zone)
+ : backing_store_(nullptr), zone_(zone) {}
+
+ INLINE(void AddChar(uint32_t code_unit)) {
+ if (backing_store_ == nullptr) {
+ backing_store_ =
+ new (zone_->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone_);
+ }
+ if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ backing_store_->push_back(code_unit);
+ } else {
+ backing_store_->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
+ backing_store_->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
+ }
+ }
+
+ const ZoneVector<uc16>* two_byte_literal() { return backing_store_; }
+
+ private:
+ ZoneVector<uc16>* backing_store_;
Yang 2016/06/13 10:54:52 Let's make this a non-dynamic member, like Bytecod
jgruber 2016/06/13 13:10:00 Done.
+ Zone* zone_;
+
+ DISALLOW_COPY_AND_ASSIGN(CaptureNameBuffer);
+};
+
+const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
+ DCHECK(FLAG_harmony_regexp_named_captures);
+ DCHECK(unicode());
+
+ CaptureNameBuffer buf(zone());
+ bool at_start = true;
+ while (true) {
+ uc32 c = current();
+ Advance();
+
+ // Convert unicode escapes.
+ if (c == '\\' && current() == 'u') {
+ Advance();
+ if (!ParseUnicodeEscape(&c)) {
+ ReportError(CStrVector("Invalid Unicode escape sequence"));
+ return nullptr;
+ }
+ }
+
+ if (at_start) {
+ if (!IdentifierStart::Is(c)) {
+ ReportError(CStrVector("Invalid capture group name"));
+ return nullptr;
+ }
+ buf.AddChar(c);
+ at_start = false;
+ } else {
+ if (c == '>') {
+ break;
+ } else if (IdentifierPart::Is(c)) {
+ buf.AddChar(c);
+ } else {
+ ReportError(CStrVector("Invalid capture group name"));
+ return nullptr;
+ }
+ }
+ }
+
+ return buf.two_byte_literal();
+}
+
+bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
+ int index) {
+ DCHECK(FLAG_harmony_regexp_named_captures);
+ DCHECK(unicode());
+ DCHECK(0 < index && index <= captures_started_);
+ DCHECK_NOT_NULL(name);
+
+ if (named_captures_ == nullptr) {
+ named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
Yang 2016/06/13 10:54:52 Let's make named_captures_ a non-dynamic member of
jgruber 2016/06/13 13:10:00 Do you have an intuition about how much overhead i
Yang 2016/06/13 13:38:00 Not a lot. List takes 3 pointers, so dynamic alloc
jgruber 2016/06/14 07:53:12 Ok. I'll stick with dynamic lists for now, just to
+ } else {
+ // Check for duplicates and bail if we find any.
+ for (int i = 0; i < named_captures_->length(); i++) {
Yang 2016/06/13 10:54:52 You can use C++11 syntax here. for (const auto& n
jgruber 2016/06/13 13:10:00 Done.
+ if (*named_captures_->at(i)->name() == *name) {
+ ReportError(CStrVector("Duplicate capture group name"));
+ return false;
+ }
+ }
+ }
+
+ RegExpCapture* capture = GetCapture(index);
+ DCHECK(capture->name() == nullptr);
+
+ capture->set_name(name);
+ named_captures_->Add(capture, zone());
+
+ return true;
+}
+
+bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state) {
+ // The parser is assumed to be on the '<' in \k<name>.
+ if (current() != '<') {
+ ReportError(CStrVector("Invalid named reference"));
+ return false;
+ }
+
+ Advance();
+ const ZoneVector<uc16>* name = ParseCaptureGroupName();
+ if (name == nullptr) {
+ return false;
+ }
+
+ const int index = LookupCaptureGroupIndex(name);
Yang 2016/06/13 10:54:53 Let's not do this twice, here and in PatchNamedBac
jgruber 2016/06/13 13:10:00 We needed the index here to determine whether to c
+ if (index != -1 && state->IsInsideCaptureGroup(index)) {
+ builder->AddEmpty();
+ } else {
+ RegExpBackReference* atom = new (zone()) RegExpBackReference();
+ atom->set_name(name);
+
+ builder->AddAtom(atom);
+
+ if (named_back_references_ == nullptr) {
+ named_back_references_ =
Yang 2016/06/13 10:54:53 Same here, let's make named_back_references_ a non
jgruber 2016/06/13 13:10:00 See above. This change is trivial, just want to m
+ new (zone()) ZoneList<RegExpBackReference*>(1, zone());
+ }
+ named_back_references_->Add(atom, zone());
+ }
+
+ return true;
+}
+
+void RegExpParser::PatchNamedBackReferences() {
+ if (named_back_references_ == nullptr) return;
+
+ if (named_captures_ == nullptr) {
+ ReportError(CStrVector("Invalid named capture referenced"));
+ return;
+ }
+
+ // Look up and patch the actual capture for each named back reference.
+ // TODO(jgruber): O(n^2), optimize if necessary.
+
+ for (int i = 0; i < named_back_references_->length(); i++) {
+ RegExpBackReference* ref = named_back_references_->at(i);
+ int index = LookupCaptureGroupIndex(ref->name());
+ if (index == -1) {
+ ReportError(CStrVector("Invalid named capture referenced"));
+ return;
+ }
+ ref->set_capture(GetCapture(index));
+ }
+}
+
+int RegExpParser::LookupCaptureGroupIndex(const ZoneVector<uc16>* name) {
Yang 2016/06/13 10:54:52 This can be inlined into PatchNamedBackReferences
jgruber 2016/06/13 13:10:00 Done.
+ DCHECK(FLAG_harmony_regexp_named_captures);
+ DCHECK(unicode());
+ DCHECK_NOT_NULL(name);
+
+ // Attempt an initial lookup.
+ if (named_captures_ == nullptr) {
+ return -1;
+ }
+
+ for (int i = 0; i < named_captures_->length(); i++) {
+ RegExpCapture* capture = named_captures_->at(i);
+ if (*capture->name() == *name) {
+ return capture->index();
+ }
+ }
+
+ return -1;
+}
RegExpCapture* RegExpParser::GetCapture(int index) {
// The index for the capture groups are one-based. Its index in the list is
@@ -691,6 +893,32 @@ RegExpCapture* RegExpParser::GetCapture(int index) {
return captures_->at(index - 1);
}
+Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
+ if (named_captures_ == nullptr || named_captures_->is_empty())
+ return Handle<FixedArray>();
+
+ int len = named_captures_->length() * 2;
+ Handle<FixedArray> array = isolate()->factory()->NewFixedArray(len);
+
+ for (int i = 0; i < named_captures_->length(); i++) {
+ RegExpCapture* capture = named_captures_->at(i);
+ Vector<const uc16> vector(&(*capture->name())[0],
Yang 2016/06/13 10:54:53 Could we use a ZoneList for capture->name() instea
jgruber 2016/06/13 13:10:00 I used ZoneVector because of mstarzinger's comment
Yang 2016/06/13 13:38:00 I guess adding ToConstVector to ZoneVector also wo
jgruber 2016/06/14 07:53:12 Done.
+ static_cast<int>(capture->name()->size()));
+ MaybeHandle<String> name =
+ isolate()->factory()->NewStringFromTwoByte(vector);
+ array->set(i * 2, *name.ToHandleChecked());
+ array->set(i * 2 + 1, Smi::FromInt(capture->index()));
+ }
+
+ return array;
+}
+
+void RegExpParser::FreeCaptureStrings() {
Yang 2016/06/13 10:54:52 Do we still need this and capture_strings_?
jgruber 2016/06/13 13:10:00 No. Thanks, good catch.
+ for (int i = 0; i < capture_strings_.length(); i++) {
+ capture_strings_[i].Dispose();
+ }
+ capture_strings_.Clear();
+}
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
@@ -1135,7 +1363,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
return CharacterRange::Singleton(first);
}
-
static const uc16 kNoCharClass = 0;
// Adds range or pre-defined character class to character ranges.
@@ -1268,8 +1495,10 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
int capture_count = parser.captures_started();
result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
result->contains_anchor = parser.contains_anchor();
+ result->capture_name_map = parser.CreateCaptureNameMap();
result->capture_count = capture_count;
}
+ parser.FreeCaptureStrings();
return !parser.failed();
}

Powered by Google App Engine
This is Rietveld 408576698