| Index: src/regexp/regexp-parser.cc
|
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
|
| index bdfa13f719e09416f4af414d696100d2e6101690..252b36a6f0f99d3e565f0a90c480a942f80bf239 100644
|
| --- a/src/regexp/regexp-parser.cc
|
| +++ b/src/regexp/regexp-parser.cc
|
| @@ -25,6 +25,8 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
|
| zone_(zone),
|
| error_(error),
|
| captures_(NULL),
|
| + named_captures_(NULL),
|
| + named_back_references_(NULL),
|
| in_(in),
|
| current_(kEndMarker),
|
| ignore_case_(flags & JSRegExp::kIgnoreCase),
|
| @@ -149,6 +151,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
|
| // Disjunction
|
| RegExpTree* RegExpParser::ParsePattern() {
|
| RegExpTree* result = ParseDisjunction(CHECK_FAILED);
|
| + PatchNamedBackReferences(CHECK_FAILED);
|
| DCHECK(!has_more());
|
| // If the result of parsing is a literal string atom, and it has the
|
| // same length as the input, then the atom is identical to the input.
|
| @@ -172,7 +175,7 @@ RegExpTree* RegExpParser::ParsePattern() {
|
| RegExpTree* RegExpParser::ParseDisjunction() {
|
| // Used to store current state while parsing subexpressions.
|
| RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
|
| - ignore_case(), unicode(), zone());
|
| + nullptr, ignore_case(), unicode(), zone());
|
| RegExpParserState* state = &initial_state;
|
| // Cache the builder in a local variable for quick access.
|
| RegExpBuilder* builder = initial_state.builder();
|
| @@ -204,6 +207,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
|
|
| // Build result of subexpression.
|
| if (group_type == CAPTURE) {
|
| + if (state->IsNamedCapture()) {
|
| + CreateNamedCaptureAtIndex(state->capture_name(),
|
| + capture_index CHECK_FAILED);
|
| + }
|
| RegExpCapture* capture = GetCapture(capture_index);
|
| capture->set_body(body);
|
| body = capture;
|
| @@ -268,47 +275,65 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| case '(': {
|
| SubexpressionType subexpr_type = CAPTURE;
|
| RegExpLookaround::Type lookaround_type = state->lookaround_type();
|
| + bool is_named_capture = false;
|
| Advance();
|
| if (current() == '?') {
|
| switch (Next()) {
|
| case ':':
|
| subexpr_type = GROUPING;
|
| + Advance(2);
|
| break;
|
| case '=':
|
| lookaround_type = RegExpLookaround::LOOKAHEAD;
|
| subexpr_type = POSITIVE_LOOKAROUND;
|
| + Advance(2);
|
| break;
|
| case '!':
|
| lookaround_type = RegExpLookaround::LOOKAHEAD;
|
| subexpr_type = NEGATIVE_LOOKAROUND;
|
| + Advance(2);
|
| break;
|
| case '<':
|
| + Advance();
|
| if (FLAG_harmony_regexp_lookbehind) {
|
| - Advance();
|
| - lookaround_type = RegExpLookaround::LOOKBEHIND;
|
| if (Next() == '=') {
|
| subexpr_type = POSITIVE_LOOKAROUND;
|
| + lookaround_type = RegExpLookaround::LOOKBEHIND;
|
| + Advance(2);
|
| break;
|
| } else if (Next() == '!') {
|
| subexpr_type = NEGATIVE_LOOKAROUND;
|
| + lookaround_type = RegExpLookaround::LOOKBEHIND;
|
| + Advance(2);
|
| break;
|
| }
|
| }
|
| + if (FLAG_harmony_regexp_named_captures && unicode()) {
|
| + is_named_capture = true;
|
| + Advance();
|
| + break;
|
| + }
|
| // Fall through.
|
| default:
|
| return ReportError(CStrVector("Invalid group"));
|
| }
|
| - Advance(2);
|
| - } else {
|
| + }
|
| +
|
| + const ZoneVector<uc16>* capture_name = nullptr;
|
| + if (subexpr_type == CAPTURE) {
|
| if (captures_started_ >= kMaxCaptures) {
|
| return ReportError(CStrVector("Too many captures"));
|
| }
|
| captures_started_++;
|
| +
|
| + if (is_named_capture) {
|
| + capture_name = ParseCaptureGroupName(CHECK_FAILED);
|
| + }
|
| }
|
| // Store current state and begin new disjunction parsing.
|
| state = new (zone()) RegExpParserState(
|
| state, subexpr_type, lookaround_type, captures_started_,
|
| - ignore_case(), unicode(), zone());
|
| + capture_name, ignore_case(), unicode(), zone());
|
| builder = state->builder();
|
| continue;
|
| }
|
| @@ -416,7 +441,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| break;
|
| }
|
| }
|
| - // FALLTHROUGH
|
| + // Fall through.
|
| case '0': {
|
| Advance();
|
| if (unicode() && Next() >= '0' && Next() <= '9') {
|
| @@ -497,6 +522,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| }
|
| break;
|
| }
|
| + case 'k':
|
| + if (FLAG_harmony_regexp_named_captures && unicode()) {
|
| + Advance(2);
|
| + ParseNamedBackReference(builder, state CHECK_FAILED);
|
| + break;
|
| + }
|
| + // Fall through.
|
| default:
|
| Advance();
|
| // With /u, no identity escapes except for syntax characters
|
| @@ -514,14 +546,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| int dummy;
|
| bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
|
| if (parsed) return ReportError(CStrVector("Nothing to repeat"));
|
| - // fallthrough
|
| + // Fall through.
|
| }
|
| case '}':
|
| case ']':
|
| if (unicode()) {
|
| return ReportError(CStrVector("Lone quantifier brackets"));
|
| }
|
| - // fallthrough
|
| + // Fall through.
|
| default:
|
| builder->AddUnicodeCharacter(current());
|
| Advance();
|
| @@ -675,6 +707,148 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
|
| return true;
|
| }
|
|
|
| +static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {  // Appends code_unit to v as one or two uc16 values.
|
| + if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {  // Small enough to be stored as-is.
|
| + v->push_back(code_unit);
|
| + } else {  // Otherwise store a surrogate pair: lead unit first, then trail unit.
|
| + v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
|
| + v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
|
| + }
|
| +}
|
| + // Reads a group name terminated by '>' and returns it as a uc16 vector; reports an error and returns nullptr if the name is invalid.
|
| +const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
|
| + DCHECK(FLAG_harmony_regexp_named_captures);
|
| + DCHECK(unicode());  // Callers are expected to have checked both the flag and unicode mode.
|
| +
|
| + ZoneVector<uc16>* name =
|
| + new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());  // Placement-new into memory obtained from the zone.
|
| +
|
| + bool at_start = true;  // True until the first character of the name has been accepted.
|
| + while (true) {
|
| + uc32 c = current();
|
| + Advance();  // c is consumed before it is classified.
|
| +
|
| + // Convert unicode escapes.
|
| + if (c == '\\' && current() == 'u') {
|
| + Advance();  // Step past the 'u'.
|
| + if (!ParseUnicodeEscape(&c)) {  // NOTE(review): presumably stores the decoded value in c on success -- confirm.
|
| + ReportError(CStrVector("Invalid Unicode escape sequence"));
|
| + return nullptr;
|
| + }
|
| + }
|
| +
|
| + if (at_start) {  // '>' is not treated as a terminator here, so a returned name is never empty.
|
| + if (!IdentifierStart::Is(c)) {
|
| + ReportError(CStrVector("Invalid capture group name"));
|
| + return nullptr;
|
| + }
|
| + push_code_unit(name, c);
|
| + at_start = false;
|
| + } else {
|
| + if (c == '>') {  // End of the name; the '>' has already been consumed above.
|
| + break;
|
| + } else if (IdentifierPart::Is(c)) {
|
| + push_code_unit(name, c);
|
| + } else {
|
| + ReportError(CStrVector("Invalid capture group name"));
|
| + return nullptr;
|
| + }
|
| + }
|
| + }
|
| +
|
| + return name;
|
| +}
|
| + // Names the capture group at index and records it in named_captures_; reports an error and returns false if an equal name is already recorded.
|
| +bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
|
| + int index) {
|
| + DCHECK(FLAG_harmony_regexp_named_captures);
|
| + DCHECK(unicode());
|
| + DCHECK(0 < index && index <= captures_started_);  // index must refer to a capture that has already been started.
|
| + DCHECK_NOT_NULL(name);
|
| +
|
| + if (named_captures_ == nullptr) {  // First named capture: create the list; there is nothing to compare against.
|
| + named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
|
| + } else {
|
| + // Check for duplicates and bail if we find any.
|
| + for (const auto& named_capture : *named_captures_) {
|
| + if (*named_capture->name() == *name) {  // Compares the dereferenced names, not the pointers.
|
| + ReportError(CStrVector("Duplicate capture group name"));
|
| + return false;
|
| + }
|
| + }
|
| + }
|
| +
|
| + RegExpCapture* capture = GetCapture(index);
|
| + DCHECK(capture->name() == nullptr);  // The capture is expected not to have a name yet.
|
| +
|
| + capture->set_name(name);
|
| + named_captures_->Add(capture, zone());
|
| +
|
| + return true;
|
| +}
|
| + // Handles the <name> part of \k<name>: adds an empty node or a not-yet-resolved back reference to builder. Returns false once an error has been reported.
|
| +bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
|
| + RegExpParserState* state) {
|
| + // The parser is assumed to be on the '<' in \k<name>.
|
| + if (current() != '<') {
|
| + ReportError(CStrVector("Invalid named reference"));
|
| + return false;
|
| + }
|
| +
|
| + Advance();  // Step past the '<'.
|
| + const ZoneVector<uc16>* name = ParseCaptureGroupName();
|
| + if (name == nullptr) {  // ParseCaptureGroupName() has already reported the error.
|
| + return false;
|
| + }
|
| +
|
| + if (state->IsInsideCaptureGroup(name)) {  // The reference appears inside a group carrying this name.
|
| + builder->AddEmpty();
|
| + } else {
|
| + RegExpBackReference* atom = new (zone()) RegExpBackReference();  // Its capture is filled in later by PatchNamedBackReferences().
|
| + atom->set_name(name);
|
| +
|
| + builder->AddAtom(atom);
|
| +
|
| + if (named_back_references_ == nullptr) {  // The list is created on first use.
|
| + named_back_references_ =
|
| + new (zone()) ZoneList<RegExpBackReference*>(1, zone());
|
| + }
|
| + named_back_references_->Add(atom, zone());
|
| + }
|
| +
|
| + return true;
|
| +}
|
| + // Points every recorded named back reference at the capture with an equal name; reports an error if a reference has no matching capture.
|
| +void RegExpParser::PatchNamedBackReferences() {
|
| + if (named_back_references_ == nullptr) return;  // No named back references were recorded.
|
| +
|
| + if (named_captures_ == nullptr) {  // References exist but no capture was ever given a name.
|
| + ReportError(CStrVector("Invalid named capture referenced"));
|
| + return;
|
| + }
|
| +
|
| + // Look up and patch the actual capture for each named back reference.
|
| + // TODO(jgruber): O(n^2), optimize if necessary.
|
| +
|
| + for (int i = 0; i < named_back_references_->length(); i++) {
|
| + RegExpBackReference* ref = named_back_references_->at(i);
|
| +
|
| + int index = -1;  // -1 means no capture with this name has been found.
|
| + for (const auto& capture : *named_captures_) {
|
| + if (*capture->name() == *ref->name()) {  // Compares the dereferenced names, not the pointers.
|
| + index = capture->index();
|
| + break;
|
| + }
|
| + }
|
| +
|
| + if (index == -1) {
|
| + ReportError(CStrVector("Invalid named capture referenced"));
|
| + return;
|
| + }
|
| +
|
| + ref->set_capture(GetCapture(index));
|
| + }
|
| +}
|
|
|
| RegExpCapture* RegExpParser::GetCapture(int index) {
|
| // The index for the capture groups are one-based. Its index in the list is
|
| @@ -691,6 +865,24 @@ RegExpCapture* RegExpParser::GetCapture(int index) {
|
| return captures_->at(index - 1);
|
| }
|
|
|
| +Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {  // Returns a FixedArray of (name, index) pairs, one pair per named capture.
|
| + if (named_captures_ == nullptr || named_captures_->is_empty())
|
| + return Handle<FixedArray>();  // No named captures: return a default-constructed handle.
|
| +
|
| + Factory* factory = isolate()->factory();
|
| +
|
| + int len = named_captures_->length() * 2;  // Two slots per named capture.
|
| + Handle<FixedArray> array = factory->NewFixedArray(len);
|
| +
|
| + for (int i = 0; i < named_captures_->length(); i++) {
|
| + RegExpCapture* capture = named_captures_->at(i);
|
| + MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
|
| + array->set(i * 2, *name.ToHandleChecked());  // Even slot: the capture name as a String.
|
| + array->set(i * 2 + 1, Smi::FromInt(capture->index()));  // Odd slot: the capture index as a Smi.
|
| + }
|
| +
|
| + return array;
|
| +}
|
|
|
| bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
|
| for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
|
| @@ -703,6 +895,15 @@ bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
|
| return false;
|
| }
|
|
|
| +bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
|
| + const ZoneVector<uc16>* name) {  // True if this state or any state reachable via previous_state() has an equal capture name.
|
| + DCHECK_NOT_NULL(name);
|
| + for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {  // Walk from this state along the previous_state() chain.
|
| + if (s->capture_name() == nullptr) continue;  // This state has no capture name.
|
| + if (*s->capture_name() == *name) return true;  // Compares the dereferenced names, not the pointers.
|
| + }
|
| + return false;
|
| +}
|
|
|
| // QuantifierPrefix ::
|
| // { DecimalDigits }
|
| @@ -1135,7 +1336,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
|
| return CharacterRange::Singleton(first);
|
| }
|
|
|
| -
|
| static const uc16 kNoCharClass = 0;
|
|
|
| // Adds range or pre-defined character class to character ranges.
|
| @@ -1268,6 +1468,7 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
|
| int capture_count = parser.captures_started();
|
| result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
|
| result->contains_anchor = parser.contains_anchor();
|
| + result->capture_name_map = parser.CreateCaptureNameMap();
|
| result->capture_count = capture_count;
|
| }
|
| return !parser.failed();
|
|
|