Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(293)

Unified Diff: src/parser.cc

Issue 149069: Changed RegExp parser to use a recursive data structure instead of stack-based recursion. (Closed)
Patch Set: Removed static nonparticipating capture optimization. Not worth the complexity. Created 11 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/ast.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/parser.cc
diff --git a/src/parser.cc b/src/parser.cc
index 2b4be79a0f3331fe957057025c729e9d8d30ee3d..b1a68da0501620523afaa209c499af3ba30794ad 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -361,7 +361,7 @@ class BufferedZoneList {
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
-class RegExpBuilder {
+class RegExpBuilder: public ZoneObject {
public:
RegExpBuilder();
void AddCharacter(uc16 character);
@@ -392,7 +392,10 @@ class RegExpBuilder {
RegExpBuilder::RegExpBuilder()
- : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
+ : pending_empty_(false),
+ characters_(NULL),
+ terms_(),
+ alternatives_()
#ifdef DEBUG
, last_added_(ADD_NONE)
#endif
@@ -594,6 +597,32 @@ class RegExpParser {
static const int kMaxCaptures = 1 << 16;
static const uc32 kEndMarker = (1 << 21);
private:
+ enum SubexpressionType {
+ INITIAL,
+ CAPTURE, // All positive values represent captures.
+ POSITIVE_LOOKAHEAD,
+ NEGATIVE_LOOKAHEAD,
+ GROUPING
+ };
+
+ struct RegExpParserState : public ZoneObject {
Erik Corry 2009/07/01 11:29:14 Please make this a real object.
+ RegExpParserState(RegExpParserState* previous_state,
+ RegExpBuilder* builder,
+ int disjunction_capture_index,
+ SubexpressionType group_type)
+ : previous_state(previous_state),
+ builder(builder),
+ disjunction_capture_index(disjunction_capture_index),
+ group_type(group_type) {}
+ // Linked list implementation of stack of states.
+ RegExpParserState* previous_state;
+ // Builder for the stored disjunction.
+ RegExpBuilder* builder;
+ // Stored disjunction's capture index (if any).
+ int disjunction_capture_index;
+ // Stored disjunction type (capture, look-ahead or grouping), if any.
+ SubexpressionType group_type;
+ };
uc32 current() { return current_; }
bool has_more() { return has_more_; }
@@ -601,7 +630,6 @@ class RegExpParser {
uc32 Next();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
- bool CaptureAvailable(int index);
uc32 current_;
bool has_more_;
bool multiline_;
@@ -3820,14 +3848,6 @@ RegExpTree* RegExpParser::ParsePattern() {
}
-bool RegExpParser::CaptureAvailable(int index) {
- if (captures_ == NULL) return false;
- if (index >= captures_->length()) return false;
- RegExpCapture* capture = captures_->at(index);
- return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
-}
-
-
// Disjunction ::
// Alternative
// Alternative | Disjunction
@@ -3839,24 +3859,65 @@ bool RegExpParser::CaptureAvailable(int index) {
// Atom
// Atom Quantifier
RegExpTree* RegExpParser::ParseDisjunction() {
- RegExpBuilder builder;
- int capture_start_index = captures_started();
+ // Used to store current state while parsing subexpressions.
+ RegExpParserState* stored_state = NULL;
Erik Corry 2009/07/01 11:29:14 If you initialize this to an initial stored state
+ RegExpBuilder* builder = new RegExpBuilder();
+ SubexpressionType group_type = INITIAL;
+ // Index in captures array of first capture in this sub-expression, if any.
+ // Also the capture index of this sub-expression itself, if group_type
+ // is CAPTURE.
+ int disjunction_capture_index = 0;
while (true) {
switch (current()) {
case kEndMarker:
- case ')':
- return builder.ToRegExp();
- case '|': {
+ if (stored_state != NULL) {
+ // Inside a parenthesized group when hitting end of input.
+ ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+ }
+ ASSERT_EQ(INITIAL, group_type);
+ // Parsing completed successfully.
+ return builder->ToRegExp();
+ case ')': {
+ if (stored_state == NULL) {
+ ReportError(CStrVector("Unexpected ')'") CHECK_FAILED);
+ }
+ ASSERT_NE(INITIAL, group_type);
+
Advance();
- builder.NewAlternative();
- int capture_new_alt_start_index = captures_started();
- for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {
- RegExpCapture* capture = captures_->at(i);
- if (capture->available() == CAPTURE_AVAILABLE) {
- capture->set_available(CAPTURE_UNREACHABLE);
- }
+ // End disjunction parsing and convert builder content to new single
+ // regexp atom.
+ RegExpTree* body = builder->ToRegExp();
+
+ int end_capture_index = captures_started();
+
+ int capture_index = disjunction_capture_index;
+ SubexpressionType type = group_type;
+
+ // Restore previous state.
+ builder = stored_state->builder;
+ group_type = stored_state->group_type;
+ disjunction_capture_index = stored_state->disjunction_capture_index;
+ stored_state = stored_state->previous_state;
+
+ // Build result of subexpression.
+ if (type == CAPTURE) {
+ RegExpCapture* capture = new RegExpCapture(body, capture_index);
+ captures_->at(capture_index - 1) = capture;
+ body = capture;
+ } else if (type != GROUPING) {
+ ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD);
+ bool is_positive = (type == POSITIVE_LOOKAHEAD);
+ body = new RegExpLookahead(body,
+ is_positive,
+ end_capture_index - capture_index,
+ capture_index);
}
- capture_start_index = capture_new_alt_start_index;
+ builder->AddAtom(body);
+ break;
+ }
+ case '|': {
+ Advance();
+ builder->NewAlternative();
continue;
}
case '*':
@@ -3866,10 +3927,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '^': {
Advance();
if (multiline_) {
- builder.AddAssertion(
+ builder->AddAssertion(
new RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
- builder.AddAssertion(
+ builder->AddAssertion(
new RegExpAssertion(RegExpAssertion::START_OF_INPUT));
set_contains_anchor();
}
@@ -3880,7 +3941,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
RegExpAssertion::Type type =
multiline_ ? RegExpAssertion::END_OF_LINE :
RegExpAssertion::END_OF_INPUT;
- builder.AddAssertion(new RegExpAssertion(type));
+ builder->AddAssertion(new RegExpAssertion(type));
continue;
}
case '.': {
@@ -3889,17 +3950,52 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
CharacterRange::AddClassEscape('.', ranges);
RegExpTree* atom = new RegExpCharacterClass(ranges, false);
- builder.AddAtom(atom);
+ builder->AddAtom(atom);
break;
}
case '(': {
- RegExpTree* atom = ParseGroup(CHECK_FAILED);
- builder.AddAtom(atom);
+ SubexpressionType type = CAPTURE;
+ Advance();
+ if (current() == '?') {
+ switch (Next()) {
+ case ':':
+ type = GROUPING;
+ break;
+ case '=':
+ type = POSITIVE_LOOKAHEAD;
+ break;
+ case '!':
+ type = NEGATIVE_LOOKAHEAD;
+ break;
+ default:
+ ReportError(CStrVector("Invalid group") CHECK_FAILED);
+ break;
+ }
+ Advance(2);
+ } else {
+ if (captures_ == NULL) {
+ captures_ = new ZoneList<RegExpCapture*>(2);
+ }
+ if (captures_started() >= kMaxCaptures) {
+ ReportError(CStrVector("Too many captures") CHECK_FAILED);
+ }
+ captures_->Add(NULL);
+ }
+ // Store current state and begin new disjunction parsing.
+ stored_state = new RegExpParserState(stored_state,
+ builder,
+ disjunction_capture_index,
+ group_type);
+ builder = new RegExpBuilder();
+ group_type = type;
+ if (type == CAPTURE) {
+ disjunction_capture_index = captures_started();
+ }
break;
}
case '[': {
RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
- builder.AddAtom(atom);
+ builder->AddAtom(atom);
break;
}
// Atom ::
@@ -3910,12 +4006,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED);
case 'b':
Advance(2);
- builder.AddAssertion(
+ builder->AddAssertion(
new RegExpAssertion(RegExpAssertion::BOUNDARY));
continue;
case 'B':
Advance(2);
- builder.AddAssertion(
+ builder->AddAssertion(
new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
continue;
// AtomEscape ::
@@ -3929,27 +4025,29 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
CharacterRange::AddClassEscape(c, ranges);
RegExpTree* atom = new RegExpCharacterClass(ranges, false);
- builder.AddAtom(atom);
+ builder->AddAtom(atom);
goto has_read_atom; // Avoid setting has_character_escapes_.
}
case '1': case '2': case '3': case '4': case '5': case '6':
case '7': case '8': case '9': {
int index = 0;
if (ParseBackReferenceIndex(&index)) {
- if (!CaptureAvailable(index - 1)) {
- // Prepare to ignore a following quantifier
- builder.AddEmpty();
+ RegExpCapture* capture = NULL;
+ if (captures_ != NULL && index <= captures_->length()) {
+ capture = captures_->at(index - 1);
+ }
+ if (capture == NULL) {
+ builder->AddEmpty();
goto has_read_atom;
}
- RegExpCapture* capture = captures_->at(index - 1);
RegExpTree* atom = new RegExpBackReference(capture);
- builder.AddAtom(atom);
+ builder->AddAtom(atom);
goto has_read_atom; // Avoid setting has_character_escapes_.
}
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
// Treat as identity escape
- builder.AddCharacter(first_digit);
+ builder->AddCharacter(first_digit);
Advance(2);
break;
}
@@ -3958,44 +4056,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '0': {
Advance();
uc32 octal = ParseOctalLiteral();
- builder.AddCharacter(octal);
+ builder->AddCharacter(octal);
break;
}
// ControlEscape :: one of
// f n r t v
case 'f':
Advance(2);
- builder.AddCharacter('\f');
+ builder->AddCharacter('\f');
break;
case 'n':
Advance(2);
- builder.AddCharacter('\n');
+ builder->AddCharacter('\n');
break;
case 'r':
Advance(2);
- builder.AddCharacter('\r');
+ builder->AddCharacter('\r');
break;
case 't':
Advance(2);
- builder.AddCharacter('\t');
+ builder->AddCharacter('\t');
break;
case 'v':
Advance(2);
- builder.AddCharacter('\v');
+ builder->AddCharacter('\v');
break;
case 'c': {
Advance(2);
uc32 control = ParseControlLetterEscape();
- builder.AddCharacter(control);
+ builder->AddCharacter(control);
break;
}
case 'x': {
Advance(2);
uc32 value;
if (ParseHexEscape(2, &value)) {
- builder.AddCharacter(value);
+ builder->AddCharacter(value);
} else {
- builder.AddCharacter('x');
+ builder->AddCharacter('x');
}
break;
}
@@ -4003,15 +4101,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseHexEscape(4, &value)) {
- builder.AddCharacter(value);
+ builder->AddCharacter(value);
} else {
- builder.AddCharacter('u');
+ builder->AddCharacter('u');
}
break;
}
default:
// Identity escape.
- builder.AddCharacter(Next());
+ builder->AddCharacter(Next());
Advance(2);
break;
}
@@ -4024,7 +4122,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// fallthrough
}
default:
- builder.AddCharacter(current());
+ builder->AddCharacter(current());
Advance();
break;
} // end switch(current())
@@ -4071,7 +4169,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
is_greedy = false;
Advance();
}
- builder.AddQuantifierToAtom(min, max, is_greedy);
+ builder->AddQuantifierToAtom(min, max, is_greedy);
}
}
@@ -4382,73 +4480,6 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
-RegExpTree* RegExpParser::ParseGroup() {
- ASSERT_EQ(current(), '(');
- char type = '(';
- Advance();
- if (current() == '?') {
- switch (Next()) {
- case ':': case '=': case '!':
- type = Next();
- Advance(2);
- break;
- default:
- ReportError(CStrVector("Invalid group") CHECK_FAILED);
- break;
- }
- } else {
- if (captures_ == NULL) {
- captures_ = new ZoneList<RegExpCapture*>(2);
- }
- if (captures_started() >= kMaxCaptures) {
- ReportError(CStrVector("Too many captures") CHECK_FAILED);
- }
- captures_->Add(NULL);
- }
- int capture_index = captures_started();
- RegExpTree* body = ParseDisjunction(CHECK_FAILED);
- if (current() != ')') {
- ReportError(CStrVector("Unterminated group") CHECK_FAILED);
- }
- Advance();
-
- int end_capture_index = captures_started();
- if (type == '!') {
- // Captures inside a negative lookahead are never available outside it.
- for (int i = capture_index; i < end_capture_index; i++) {
- RegExpCapture* capture = captures_->at(i);
- ASSERT(capture != NULL);
- capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
- }
- } else {
- // Captures temporarily unavailable because they are in different
- // alternatives are all available after the disjunction.
- for (int i = capture_index; i < end_capture_index; i++) {
- RegExpCapture* capture = captures_->at(i);
- ASSERT(capture != NULL);
- if (capture->available() == CAPTURE_UNREACHABLE) {
- capture->set_available(CAPTURE_AVAILABLE);
- }
- }
- }
-
- if (type == '(') {
- RegExpCapture* capture = new RegExpCapture(body, capture_index);
- captures_->at(capture_index - 1) = capture;
- return capture;
- } else if (type == ':') {
- return body;
- } else {
- ASSERT(type == '=' || type == '!');
- bool is_positive = (type == '=');
- return new RegExpLookahead(body,
- is_positive,
- end_capture_index - capture_index,
- capture_index);
- }
-}
-
-
CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
ASSERT_EQ(0, *char_class);
uc32 first = current();
« no previous file with comments | « src/ast.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698