src/parser.cc - Issue 149069: Changed RegExp parser to use a recursive data structure instead of stack-based recursion.

Unified Diff: src/parser.cc

Issue 149069: Changed RegExp parser to use a recursive data structure instead of stack-based recursion. (Closed)

Patch Set: Removed static nonparticipating capture optimization. Not worth the complexity. Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/parser.cc

diff --git a/src/parser.cc b/src/parser.cc

index 2b4be79a0f3331fe957057025c729e9d8d30ee3d..b1a68da0501620523afaa209c499af3ba30794ad 100644

--- a/src/parser.cc

+++ b/src/parser.cc

@@ -361,7 +361,7 @@ class BufferedZoneList {

};

// Accumulates RegExp atoms and assertions into lists of terms and alternatives.

-class RegExpBuilder {

+class RegExpBuilder: public ZoneObject {

public:

RegExpBuilder();

void AddCharacter(uc16 character);

@@ -392,7 +392,10 @@ class RegExpBuilder {

RegExpBuilder::RegExpBuilder()

- : pending_empty_(false), characters_(NULL), terms_(), alternatives_()

+ : pending_empty_(false),

+ characters_(NULL),

+ terms_(),

+ alternatives_()

#ifdef DEBUG

, last_added_(ADD_NONE)

#endif

@@ -594,6 +597,32 @@ class RegExpParser {

static const int kMaxCaptures = 1 << 16;

static const uc32 kEndMarker = (1 << 21);

private:

+ enum SubexpressionType {

+ INITIAL,

+ CAPTURE, // All positive values represent captures.

+ POSITIVE_LOOKAHEAD,

+ NEGATIVE_LOOKAHEAD,

+ GROUPING

+ };

+ struct RegExpParserState : public ZoneObject {

Erik Corry 2009/07/01 11:29:14 Please make this a real object.

+ RegExpParserState(RegExpParserState* previous_state,

+ RegExpBuilder* builder,

+ int disjunction_capture_index,

+ SubexpressionType group_type)

+ : previous_state(previous_state),

+ builder(builder),

+ disjunction_capture_index(disjunction_capture_index),

+ group_type(group_type) {}

+ // Linked list implementation of stack of states.

+ RegExpParserState* previous_state;

+ // Builder for the stored disjunction.

+ RegExpBuilder* builder;

+ // Stored disjunction's capture index (if any).

+ int disjunction_capture_index;

+ // Stored disjunction type (capture, look-ahead or grouping), if any.

+ SubexpressionType group_type;

+ };

uc32 current() { return current_; }

bool has_more() { return has_more_; }

@@ -601,7 +630,6 @@ class RegExpParser {

uc32 Next();

FlatStringReader* in() { return in_; }

void ScanForCaptures();

- bool CaptureAvailable(int index);

uc32 current_;

bool has_more_;

bool multiline_;

@@ -3820,14 +3848,6 @@ RegExpTree* RegExpParser::ParsePattern() {

}

-bool RegExpParser::CaptureAvailable(int index) {

- if (captures_ == NULL) return false;

- if (index >= captures_->length()) return false;

- RegExpCapture* capture = captures_->at(index);

- return capture != NULL && capture->available() == CAPTURE_AVAILABLE;

// Disjunction ::

// Alternative

// Alternative | Disjunction

@@ -3839,24 +3859,65 @@ bool RegExpParser::CaptureAvailable(int index) {

// Atom

// Atom Quantifier

RegExpTree* RegExpParser::ParseDisjunction() {

- RegExpBuilder builder;

- int capture_start_index = captures_started();

+ // Used to store current state while parsing subexpressions.

+ RegExpParserState* stored_state = NULL;

Erik Corry 2009/07/01 11:29:14 If you initialize this to an initial stored state

+ RegExpBuilder* builder = new RegExpBuilder();

+ SubexpressionType group_type = INITIAL;

+ // Index in captures array of first capture in this sub-expression, if any.

+ // Also the capture index of this sub-expression itself, if group_type

+ // is CAPTURE.

+ int disjunction_capture_index = 0;

while (true) {

switch (current()) {

case kEndMarker:

- case ')':

- return builder.ToRegExp();

- case '|': {

+ if (stored_state != NULL) {

+ // Inside a parenthesized group when hitting end of input.

+ ReportError(CStrVector("Unterminated group") CHECK_FAILED);

+ }

+ ASSERT_EQ(INITIAL, group_type);

+ // Parsing completed successfully.

+ return builder->ToRegExp();

+ case ')': {

+ if (stored_state == NULL) {

+ ReportError(CStrVector("Unexpected ')'") CHECK_FAILED);

+ }

+ ASSERT_NE(INITIAL, group_type);

Advance();

- builder.NewAlternative();

- int capture_new_alt_start_index = captures_started();

- for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {

- RegExpCapture* capture = captures_->at(i);

- if (capture->available() == CAPTURE_AVAILABLE) {

- capture->set_available(CAPTURE_UNREACHABLE);

- }

+ // End disjunction parsing and convert builder content to new single

+ // regexp atom.

+ RegExpTree* body = builder->ToRegExp();

+ int end_capture_index = captures_started();

+ int capture_index = disjunction_capture_index;

+ SubexpressionType type = group_type;

+ // Restore previous state.

+ builder = stored_state->builder;

+ group_type = stored_state->group_type;

+ disjunction_capture_index = stored_state->disjunction_capture_index;

+ stored_state = stored_state->previous_state;

+ // Build result of subexpression.

+ if (type == CAPTURE) {

+ RegExpCapture* capture = new RegExpCapture(body, capture_index);

+ captures_->at(capture_index - 1) = capture;

+ body = capture;

+ } else if (type != GROUPING) {

+ ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD);

+ bool is_positive = (type == POSITIVE_LOOKAHEAD);

+ body = new RegExpLookahead(body,

+ is_positive,

+ end_capture_index - capture_index,

+ capture_index);

}

- capture_start_index = capture_new_alt_start_index;

+ builder->AddAtom(body);

+ break;

+ }

+ case '|': {

+ Advance();

+ builder->NewAlternative();

continue;

}

case '*':

@@ -3866,10 +3927,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {

case '^': {

Advance();

if (multiline_) {

- builder.AddAssertion(

+ builder->AddAssertion(

new RegExpAssertion(RegExpAssertion::START_OF_LINE));

} else {

- builder.AddAssertion(

+ builder->AddAssertion(

new RegExpAssertion(RegExpAssertion::START_OF_INPUT));

set_contains_anchor();

}

@@ -3880,7 +3941,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

RegExpAssertion::Type type =

multiline_ ? RegExpAssertion::END_OF_LINE :

RegExpAssertion::END_OF_INPUT;

- builder.AddAssertion(new RegExpAssertion(type));

+ builder->AddAssertion(new RegExpAssertion(type));

continue;

}

case '.': {

@@ -3889,17 +3950,52 @@ RegExpTree* RegExpParser::ParseDisjunction() {

ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

CharacterRange::AddClassEscape('.', ranges);

RegExpTree* atom = new RegExpCharacterClass(ranges, false);

- builder.AddAtom(atom);

+ builder->AddAtom(atom);

break;

}

case '(': {

- RegExpTree* atom = ParseGroup(CHECK_FAILED);

- builder.AddAtom(atom);

+ SubexpressionType type = CAPTURE;

+ Advance();

+ if (current() == '?') {

+ switch (Next()) {

+ case ':':

+ type = GROUPING;

+ break;

+ case '=':

+ type = POSITIVE_LOOKAHEAD;

+ break;

+ case '!':

+ type = NEGATIVE_LOOKAHEAD;

+ break;

+ default:

+ ReportError(CStrVector("Invalid group") CHECK_FAILED);

+ break;

+ }

+ Advance(2);

+ } else {

+ if (captures_ == NULL) {

+ captures_ = new ZoneList<RegExpCapture*>(2);

+ }

+ if (captures_started() >= kMaxCaptures) {

+ ReportError(CStrVector("Too many captures") CHECK_FAILED);

+ }

+ captures_->Add(NULL);

+ }

+ // Store current state and begin new disjunction parsing.

+ stored_state = new RegExpParserState(stored_state,

+ builder,

+ disjunction_capture_index,

+ group_type);

+ builder = new RegExpBuilder();

+ group_type = type;

+ if (type == CAPTURE) {

+ disjunction_capture_index = captures_started();

+ }

break;

}

case '[': {

RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

- builder.AddAtom(atom);

+ builder->AddAtom(atom);

break;

}

// Atom ::

@@ -3910,12 +4006,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {

ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED);

case 'b':

Advance(2);

- builder.AddAssertion(

+ builder->AddAssertion(

new RegExpAssertion(RegExpAssertion::BOUNDARY));

continue;

case 'B':

Advance(2);

- builder.AddAssertion(

+ builder->AddAssertion(

new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

continue;

// AtomEscape ::

@@ -3929,27 +4025,29 @@ RegExpTree* RegExpParser::ParseDisjunction() {

ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

CharacterRange::AddClassEscape(c, ranges);

RegExpTree* atom = new RegExpCharacterClass(ranges, false);

- builder.AddAtom(atom);

+ builder->AddAtom(atom);

goto has_read_atom; // Avoid setting has_character_escapes_.

}

case '1': case '2': case '3': case '4': case '5': case '6':

case '7': case '8': case '9': {

int index = 0;

if (ParseBackReferenceIndex(&index)) {

- if (!CaptureAvailable(index - 1)) {

- // Prepare to ignore a following quantifier

- builder.AddEmpty();

+ RegExpCapture* capture = NULL;

+ if (captures_ != NULL && index <= captures_->length()) {

+ capture = captures_->at(index - 1);

+ }

+ if (capture == NULL) {

+ builder->AddEmpty();

goto has_read_atom;

}

- RegExpCapture* capture = captures_->at(index - 1);

RegExpTree* atom = new RegExpBackReference(capture);

- builder.AddAtom(atom);

+ builder->AddAtom(atom);

goto has_read_atom; // Avoid setting has_character_escapes_.

}

uc32 first_digit = Next();

if (first_digit == '8' || first_digit == '9') {

// Treat as identity escape

- builder.AddCharacter(first_digit);

+ builder->AddCharacter(first_digit);

Advance(2);

break;

}

@@ -3958,44 +4056,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {

case '0': {

Advance();

uc32 octal = ParseOctalLiteral();

- builder.AddCharacter(octal);

+ builder->AddCharacter(octal);

break;

}

// ControlEscape :: one of

// f n r t v

case 'f':

Advance(2);

- builder.AddCharacter('\f');

+ builder->AddCharacter('\f');

break;

case 'n':

Advance(2);

- builder.AddCharacter('\n');

+ builder->AddCharacter('\n');

break;

case 'r':

Advance(2);

- builder.AddCharacter('\r');

+ builder->AddCharacter('\r');

break;

case 't':

Advance(2);

- builder.AddCharacter('\t');

+ builder->AddCharacter('\t');

break;

case 'v':

Advance(2);

- builder.AddCharacter('\v');

+ builder->AddCharacter('\v');

break;

case 'c': {

Advance(2);

uc32 control = ParseControlLetterEscape();

- builder.AddCharacter(control);

+ builder->AddCharacter(control);

break;

}

case 'x': {

Advance(2);

uc32 value;

if (ParseHexEscape(2, &value)) {

- builder.AddCharacter(value);

+ builder->AddCharacter(value);

} else {

- builder.AddCharacter('x');

+ builder->AddCharacter('x');

}

break;

}

@@ -4003,15 +4101,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {

Advance(2);

uc32 value;

if (ParseHexEscape(4, &value)) {

- builder.AddCharacter(value);

+ builder->AddCharacter(value);

} else {

- builder.AddCharacter('u');

+ builder->AddCharacter('u');

}

break;

}

default:

// Identity escape.

- builder.AddCharacter(Next());

+ builder->AddCharacter(Next());

Advance(2);

break;

}

@@ -4024,7 +4122,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

// fallthrough

}

default:

- builder.AddCharacter(current());

+ builder->AddCharacter(current());

Advance();

break;

} // end switch(current())

@@ -4071,7 +4169,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

is_greedy = false;

Advance();

}

- builder.AddQuantifierToAtom(min, max, is_greedy);

+ builder->AddQuantifierToAtom(min, max, is_greedy);

}

@@ -4382,73 +4480,6 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

}

-RegExpTree* RegExpParser::ParseGroup() {

- ASSERT_EQ(current(), '(');

- char type = '(';

- Advance();

- if (current() == '?') {

- switch (Next()) {

- case ':': case '=': case '!':

- type = Next();

- Advance(2);

- break;

- default:

- ReportError(CStrVector("Invalid group") CHECK_FAILED);

- break;

- }

- } else {

- if (captures_ == NULL) {

- captures_ = new ZoneList<RegExpCapture*>(2);

- }

- if (captures_started() >= kMaxCaptures) {

- ReportError(CStrVector("Too many captures") CHECK_FAILED);

- }

- captures_->Add(NULL);

- }

- int capture_index = captures_started();

- RegExpTree* body = ParseDisjunction(CHECK_FAILED);

- if (current() != ')') {

- ReportError(CStrVector("Unterminated group") CHECK_FAILED);

- }

- Advance();

- int end_capture_index = captures_started();

- if (type == '!') {

- // Captures inside a negative lookahead are never available outside it.

- for (int i = capture_index; i < end_capture_index; i++) {

- RegExpCapture* capture = captures_->at(i);

- ASSERT(capture != NULL);

- capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);

- }

- } else {

- // Captures temporarily unavailable because they are in different

- // alternatives are all available after the disjunction.

- for (int i = capture_index; i < end_capture_index; i++) {

- RegExpCapture* capture = captures_->at(i);

- ASSERT(capture != NULL);

- if (capture->available() == CAPTURE_UNREACHABLE) {

- capture->set_available(CAPTURE_AVAILABLE);

- }

- if (type == '(') {

- RegExpCapture* capture = new RegExpCapture(body, capture_index);

- captures_->at(capture_index - 1) = capture;

- return capture;

- } else if (type == ':') {

- return body;

- } else {

- ASSERT(type == '=' || type == '!');

- bool is_positive = (type == '=');

- return new RegExpLookahead(body,

- is_positive,

- end_capture_index - capture_index,

- capture_index);

- }

CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

ASSERT_EQ(0, *char_class);

uc32 first = current();

« no previous file with comments | « src/ast.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »