Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(815)

Unified Diff: src/parser.cc

Issue 1418963009: Experimental support for RegExp lookbehind. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fixed test cases Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/parser.cc
diff --git a/src/parser.cc b/src/parser.cc
index 9227272e09ca51ed62dce91704ad6e940b7723e5..89362d082b347e0341e5c5b5bb4cbafb82dd936f 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -94,12 +94,14 @@ ParseInfo::ParseInfo(Zone* zone, Handle<Script> script) : ParseInfo(zone) {
}
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone,
+ RegExpTree::ReadDirection read_direction)
: zone_(zone),
pending_empty_(false),
characters_(NULL),
terms_(),
- alternatives_()
+ alternatives_(),
+ read_direction_(read_direction)
#ifdef DEBUG
, last_added_(ADD_NONE)
#endif
@@ -109,7 +111,8 @@ RegExpBuilder::RegExpBuilder(Zone* zone)
void RegExpBuilder::FlushCharacters() {
pending_empty_ = false;
if (characters_ != NULL) {
- RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());
+ RegExpTree* atom =
+ new (zone()) RegExpAtom(characters_->ToConstVector(), read_direction_);
characters_ = NULL;
text_.Add(atom, zone());
LAST(ADD_ATOM);
@@ -125,7 +128,7 @@ void RegExpBuilder::FlushText() {
} else if (num_text == 1) {
terms_.Add(text_.last(), zone());
} else {
- RegExpText* text = new(zone()) RegExpText(zone());
+ RegExpText* text = new (zone()) RegExpText(zone(), read_direction_);
for (int i = 0; i < num_text; i++)
text_.Get(i)->AppendToText(text, zone());
terms_.Add(text, zone());
@@ -186,7 +189,8 @@ void RegExpBuilder::FlushTerms() {
} else if (num_terms == 1) {
alternative = terms_.last();
} else {
- alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));
+ alternative =
+ new (zone()) RegExpAlternative(terms_.GetList(zone()), read_direction_);
}
alternatives_.Add(alternative, zone());
terms_.Clear();
@@ -199,7 +203,8 @@ RegExpTree* RegExpBuilder::ToRegExp() {
int num_alternatives = alternatives_.length();
if (num_alternatives == 0) return new (zone()) RegExpEmpty();
if (num_alternatives == 1) return alternatives_.last();
- return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));
+ return new (zone())
+ RegExpDisjunction(alternatives_.GetList(zone()), read_direction_);
}
@@ -217,11 +222,11 @@ void RegExpBuilder::AddQuantifierToAtom(
int num_chars = char_vector.length();
if (num_chars > 1) {
Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
- text_.Add(new(zone()) RegExpAtom(prefix), zone());
+ text_.Add(new (zone()) RegExpAtom(prefix, read_direction_), zone());
char_vector = char_vector.SubVector(num_chars - 1, num_chars);
}
characters_ = NULL;
- atom = new(zone()) RegExpAtom(char_vector);
+ atom = new (zone()) RegExpAtom(char_vector, read_direction_);
FlushText();
} else if (text_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);
@@ -244,8 +249,9 @@ void RegExpBuilder::AddQuantifierToAtom(
UNREACHABLE();
return;
}
- terms_.Add(
- new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());
+ terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom,
+ read_direction_),
+ zone());
LAST(ADD_TERM);
}
@@ -5221,6 +5227,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
in_(in),
current_(kEndMarker),
next_pos_(0),
+ captures_started_(0),
capture_count_(0),
has_more_(true),
multiline_(multiline),
@@ -5302,6 +5309,7 @@ RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
// Disjunction
RegExpTree* RegExpParser::ParsePattern() {
RegExpTree* result = ParseDisjunction(CHECK_FAILED);
+
DCHECK(!has_more());
// If the result of parsing is a literal string atom, and it has the
// same length as the input, then the atom is identical to the input.
@@ -5324,54 +5332,56 @@ RegExpTree* RegExpParser::ParsePattern() {
// Atom Quantifier
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
- RegExpParserState initial_state(NULL, INITIAL, 0, zone());
- RegExpParserState* stored_state = &initial_state;
+ RegExpParserState initial_state(NULL, INITIAL, RegExpTree::READ_FORWARD, 0,
+ zone());
+ RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
while (true) {
switch (current()) {
case kEndMarker:
- if (stored_state->IsSubexpression()) {
+ if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
ReportError(CStrVector("Unterminated group") CHECK_FAILED);
}
- DCHECK_EQ(INITIAL, stored_state->group_type());
+ DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
- if (!stored_state->IsSubexpression()) {
+ if (!state->IsSubexpression()) {
ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
}
- DCHECK_NE(INITIAL, stored_state->group_type());
+ DCHECK_NE(INITIAL, state->group_type());
Advance();
// End disjunction parsing and convert builder content to new single
// regexp atom.
RegExpTree* body = builder->ToRegExp();
- int end_capture_index = captures_started();
-
- int capture_index = stored_state->capture_index();
- SubexpressionType group_type = stored_state->group_type();
+ int end_capture_index = captures_started_;
- // Restore previous state.
- stored_state = stored_state->previous_state();
- builder = stored_state->builder();
+ int capture_index = state->capture_index();
+ SubexpressionType group_type = state->group_type();
// Build result of subexpression.
if (group_type == CAPTURE) {
- RegExpCapture* capture = new(zone()) RegExpCapture(body, capture_index);
- captures_->at(capture_index - 1) = capture;
+ RegExpCapture* capture = GetCapture(capture_index);
+ capture->set_body(body);
+ capture->set_read_direction(state->read_direction());
body = capture;
} else if (group_type != GROUPING) {
DCHECK(group_type == POSITIVE_LOOKAHEAD ||
group_type == NEGATIVE_LOOKAHEAD);
bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
- body = new(zone()) RegExpLookahead(body,
- is_positive,
- end_capture_index - capture_index,
- capture_index);
+ body = new (zone()) RegExpLookaround(
+ body, is_positive, end_capture_index - capture_index, capture_index,
+ state->read_direction());
}
+
+ // Restore previous state.
+ state = state->previous_state();
+ builder = state->builder();
+
builder->AddAtom(body);
// For compatibility with JSC and ES3, we allow quantifiers after
// lookaheads, and break in all cases.
@@ -5389,11 +5399,11 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '^': {
Advance();
if (multiline_) {
- builder->AddAssertion(
- new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::START_OF_LINE, state->read_direction()));
} else {
- builder->AddAssertion(
- new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::START_OF_INPUT, state->read_direction()));
set_contains_anchor();
}
continue;
@@ -5403,7 +5413,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
RegExpAssertion::AssertionType assertion_type =
multiline_ ? RegExpAssertion::END_OF_LINE :
RegExpAssertion::END_OF_INPUT;
- builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ assertion_type, state->read_direction()));
continue;
}
case '.': {
@@ -5412,12 +5423,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new(zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone());
- RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
+ RegExpTree* atom = new (zone())
+ RegExpCharacterClass(ranges, false, state->read_direction());
builder->AddAtom(atom);
break;
}
case '(': {
SubexpressionType subexpr_type = CAPTURE;
+ RegExpTree::ReadDirection read_direction = state->read_direction();
Advance();
if (current() == '?') {
switch (Next()) {
@@ -5425,33 +5438,46 @@ RegExpTree* RegExpParser::ParseDisjunction() {
subexpr_type = GROUPING;
break;
case '=':
+ read_direction = RegExpTree::READ_FORWARD;
subexpr_type = POSITIVE_LOOKAHEAD;
break;
case '!':
+ read_direction = RegExpTree::READ_FORWARD;
subexpr_type = NEGATIVE_LOOKAHEAD;
break;
+ case '<':
+ if (FLAG_harmony_regexp_lookbehind) {
+ Advance();
+ read_direction = RegExpTree::READ_BACKWARD;
+ if (Next() == '=') {
+ subexpr_type = POSITIVE_LOOKAHEAD;
+ break;
+ } else if (Next() == '!') {
+ subexpr_type = NEGATIVE_LOOKAHEAD;
+ break;
+ }
+ }
+ // Fall through.
default:
ReportError(CStrVector("Invalid group") CHECK_FAILED);
break;
}
Advance(2);
} else {
- if (captures_ == NULL) {
- captures_ = new(zone()) ZoneList<RegExpCapture*>(2, zone());
- }
- if (captures_started() >= kMaxCaptures) {
+ if (captures_started_ >= kMaxCaptures) {
ReportError(CStrVector("Too many captures") CHECK_FAILED);
}
- captures_->Add(NULL, zone());
+ captures_started_++;
}
// Store current state and begin new disjunction parsing.
- stored_state = new(zone()) RegExpParserState(stored_state, subexpr_type,
- captures_started(), zone());
- builder = stored_state->builder();
+ state = new (zone()) RegExpParserState(
+ state, subexpr_type, read_direction, captures_started_, zone());
+ builder = state->builder();
continue;
}
case '[': {
- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
+ RegExpTree* atom =
+ ParseCharacterClass(state->read_direction() CHECK_FAILED);
builder->AddAtom(atom);
break;
}
@@ -5463,13 +5489,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("\\ at end of pattern"));
case 'b':
Advance(2);
- builder->AddAssertion(
- new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::BOUNDARY, state->read_direction()));
continue;
case 'B':
Advance(2);
- builder->AddAssertion(
- new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
+ builder->AddAssertion(new (zone()) RegExpAssertion(
+ RegExpAssertion::NON_BOUNDARY, state->read_direction()));
continue;
// AtomEscape ::
// CharacterClassEscape
@@ -5482,7 +5508,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new(zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone());
- RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
+ RegExpTree* atom = new (zone())
+ RegExpCharacterClass(ranges, false, state->read_direction());
builder->AddAtom(atom);
break;
}
@@ -5490,15 +5517,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '7': case '8': case '9': {
int index = 0;
if (ParseBackReferenceIndex(&index)) {
- RegExpCapture* capture = NULL;
- if (captures_ != NULL && index <= captures_->length()) {
- capture = captures_->at(index - 1);
- }
- if (capture == NULL) {
- builder->AddEmpty();
- break;
- }
- RegExpTree* atom = new(zone()) RegExpBackReference(capture);
+ RegExpCapture* capture = GetCapture(index);
+ RegExpTree* atom = new (zone())
+ RegExpBackReference(capture, state->read_direction());
builder->AddAtom(atom);
break;
}
@@ -5692,7 +5713,7 @@ static bool IsSpecialClassEscape(uc32 c) {
// characters.
void RegExpParser::ScanForCaptures() {
// Start with captures started previous to current position
- int capture_count = captures_started();
+ int capture_count = captures_started_;
// Add count of captures after this position.
int n;
while ((n = current()) != kEndMarker) {
@@ -5744,7 +5765,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
break;
}
}
- if (value > captures_started()) {
+ if (value > captures_started_) {
if (!is_scanned_for_captures_) {
int saved_position = position();
ScanForCaptures();
@@ -5760,6 +5781,22 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
}
+RegExpCapture* RegExpParser::GetCapture(int index) {
+ // Returns the capture for the one-based |index|, lazily creating entries in
+ // the zero-based captures_ list for every capture known so far.
+ int known_captures =
+ is_scanned_for_captures_ ? capture_count_ : captures_started_;
+ DCHECK(index >= 1 && index <= known_captures);
+ if (captures_ == NULL) {
+ captures_ = new (zone()) ZoneList<RegExpCapture*>(known_captures, zone());
+ }
+ while (captures_->length() < known_captures) {
+ captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
+ }
+ return captures_->at(index - 1);
+}
+
+
// QuantifierPrefix ::
// { DecimalDigits }
// { DecimalDigits , }
@@ -6039,7 +6076,8 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
}
-RegExpTree* RegExpParser::ParseCharacterClass() {
+RegExpTree* RegExpParser::ParseCharacterClass(
+ RegExpTree::ReadDirection read_direction) {
static const char* kUnterminated = "Unterminated character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
@@ -6091,7 +6129,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
ranges->Add(CharacterRange::Everything(), zone());
is_negated = !is_negated;
}
- return new(zone()) RegExpCharacterClass(ranges, is_negated);
+ return new (zone()) RegExpCharacterClass(ranges, is_negated, read_direction);
}
« no previous file with comments | « src/parser.h ('k') | src/regexp/bytecodes-irregexp.h » ('j') | src/regexp/jsregexp.cc » ('J')

Powered by Google App Engine
This is Rietveld 408576698