Index: src/parser.cc |
diff --git a/src/parser.cc b/src/parser.cc |
index 2b4be79a0f3331fe957057025c729e9d8d30ee3d..b1a68da0501620523afaa209c499af3ba30794ad 100644 |
--- a/src/parser.cc |
+++ b/src/parser.cc |
@@ -361,7 +361,7 @@ class BufferedZoneList { |
}; |
// Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
-class RegExpBuilder { |
+class RegExpBuilder: public ZoneObject { |
public: |
RegExpBuilder(); |
void AddCharacter(uc16 character); |
@@ -392,7 +392,10 @@ class RegExpBuilder { |
RegExpBuilder::RegExpBuilder() |
- : pending_empty_(false), characters_(NULL), terms_(), alternatives_() |
+ : pending_empty_(false), |
+ characters_(NULL), |
+ terms_(), |
+ alternatives_() |
#ifdef DEBUG |
, last_added_(ADD_NONE) |
#endif |
@@ -594,6 +597,32 @@ class RegExpParser { |
static const int kMaxCaptures = 1 << 16; |
static const uc32 kEndMarker = (1 << 21); |
private: |
+ enum SubexpressionType { |
+ INITIAL, |
+ CAPTURE, // All positive values represent captures. |
+ POSITIVE_LOOKAHEAD, |
+ NEGATIVE_LOOKAHEAD, |
+ GROUPING |
+ }; |
+ |
+ struct RegExpParserState : public ZoneObject { |
Erik Corry
2009/07/01 11:29:14
Please make this a real object.
|
+ RegExpParserState(RegExpParserState* previous_state, |
+ RegExpBuilder* builder, |
+ int disjunction_capture_index, |
+ SubexpressionType group_type) |
+ : previous_state(previous_state), |
+ builder(builder), |
+ disjunction_capture_index(disjunction_capture_index), |
+ group_type(group_type) {} |
+ // Linked list implementation of stack of states. |
+ RegExpParserState* previous_state; |
+ // Builder for the stored disjunction. |
+ RegExpBuilder* builder; |
+ // Stored disjunction's capture index (if any). |
+ int disjunction_capture_index; |
+ // Stored disjunction type (capture, look-ahead or grouping), if any. |
+ SubexpressionType group_type; |
+ }; |
uc32 current() { return current_; } |
bool has_more() { return has_more_; } |
@@ -601,7 +630,6 @@ class RegExpParser { |
uc32 Next(); |
FlatStringReader* in() { return in_; } |
void ScanForCaptures(); |
- bool CaptureAvailable(int index); |
uc32 current_; |
bool has_more_; |
bool multiline_; |
@@ -3820,14 +3848,6 @@ RegExpTree* RegExpParser::ParsePattern() { |
} |
-bool RegExpParser::CaptureAvailable(int index) { |
- if (captures_ == NULL) return false; |
- if (index >= captures_->length()) return false; |
- RegExpCapture* capture = captures_->at(index); |
- return capture != NULL && capture->available() == CAPTURE_AVAILABLE; |
-} |
- |
- |
// Disjunction :: |
// Alternative |
// Alternative | Disjunction |
@@ -3839,24 +3859,65 @@ bool RegExpParser::CaptureAvailable(int index) { |
// Atom |
// Atom Quantifier |
RegExpTree* RegExpParser::ParseDisjunction() { |
- RegExpBuilder builder; |
- int capture_start_index = captures_started(); |
+ // Used to store current state while parsing subexpressions. |
+ RegExpParserState* stored_state = NULL; |
Erik Corry
2009/07/01 11:29:14
If you initialize this to an initial stored state
|
+ RegExpBuilder* builder = new RegExpBuilder(); |
+ SubexpressionType group_type = INITIAL; |
+ // Index in captures array of first capture in this sub-expression, if any. |
+ // Also the capture index of this sub-expression itself, if group_type |
+ // is CAPTURE. |
+ int disjunction_capture_index = 0; |
while (true) { |
switch (current()) { |
case kEndMarker: |
- case ')': |
- return builder.ToRegExp(); |
- case '|': { |
+ if (stored_state != NULL) { |
+ // Inside a parenthesized group when hitting end of input. |
+ ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
+ } |
+ ASSERT_EQ(INITIAL, group_type); |
+ // Parsing completed successfully. |
+ return builder->ToRegExp(); |
+ case ')': { |
+ if (stored_state == NULL) { |
+ ReportError(CStrVector("Unexpected ')'") CHECK_FAILED); |
+ } |
+ ASSERT_NE(INITIAL, group_type); |
+ |
Advance(); |
- builder.NewAlternative(); |
- int capture_new_alt_start_index = captures_started(); |
- for (int i = capture_start_index; i < capture_new_alt_start_index; i++) { |
- RegExpCapture* capture = captures_->at(i); |
- if (capture->available() == CAPTURE_AVAILABLE) { |
- capture->set_available(CAPTURE_UNREACHABLE); |
- } |
+ // End disjunction parsing and convert builder content to new single |
+ // regexp atom. |
+ RegExpTree* body = builder->ToRegExp(); |
+ |
+ int end_capture_index = captures_started(); |
+ |
+ int capture_index = disjunction_capture_index; |
+ SubexpressionType type = group_type; |
+ |
+ // Restore previous state. |
+ builder = stored_state->builder; |
+ group_type = stored_state->group_type; |
+ disjunction_capture_index = stored_state->disjunction_capture_index; |
+ stored_state = stored_state->previous_state; |
+ |
+ // Build result of subexpression. |
+ if (type == CAPTURE) { |
+ RegExpCapture* capture = new RegExpCapture(body, capture_index); |
+ captures_->at(capture_index - 1) = capture; |
+ body = capture; |
+ } else if (type != GROUPING) { |
+ ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD); |
+ bool is_positive = (type == POSITIVE_LOOKAHEAD); |
+ body = new RegExpLookahead(body, |
+ is_positive, |
+ end_capture_index - capture_index, |
+ capture_index); |
} |
- capture_start_index = capture_new_alt_start_index; |
+ builder->AddAtom(body); |
+ break; |
+ } |
+ case '|': { |
+ Advance(); |
+ builder->NewAlternative(); |
continue; |
} |
case '*': |
@@ -3866,10 +3927,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case '^': { |
Advance(); |
if (multiline_) { |
- builder.AddAssertion( |
+ builder->AddAssertion( |
new RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
} else { |
- builder.AddAssertion( |
+ builder->AddAssertion( |
new RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
set_contains_anchor(); |
} |
@@ -3880,7 +3941,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
RegExpAssertion::Type type = |
multiline_ ? RegExpAssertion::END_OF_LINE : |
RegExpAssertion::END_OF_INPUT; |
- builder.AddAssertion(new RegExpAssertion(type)); |
+ builder->AddAssertion(new RegExpAssertion(type)); |
continue; |
} |
case '.': { |
@@ -3889,17 +3950,52 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); |
CharacterRange::AddClassEscape('.', ranges); |
RegExpTree* atom = new RegExpCharacterClass(ranges, false); |
- builder.AddAtom(atom); |
+ builder->AddAtom(atom); |
break; |
} |
case '(': { |
- RegExpTree* atom = ParseGroup(CHECK_FAILED); |
- builder.AddAtom(atom); |
+ SubexpressionType type = CAPTURE; |
+ Advance(); |
+ if (current() == '?') { |
+ switch (Next()) { |
+ case ':': |
+ type = GROUPING; |
+ break; |
+ case '=': |
+ type = POSITIVE_LOOKAHEAD; |
+ break; |
+ case '!': |
+ type = NEGATIVE_LOOKAHEAD; |
+ break; |
+ default: |
+ ReportError(CStrVector("Invalid group") CHECK_FAILED); |
+ break; |
+ } |
+ Advance(2); |
+ } else { |
+ if (captures_ == NULL) { |
+ captures_ = new ZoneList<RegExpCapture*>(2); |
+ } |
+ if (captures_started() >= kMaxCaptures) { |
+ ReportError(CStrVector("Too many captures") CHECK_FAILED); |
+ } |
+ captures_->Add(NULL); |
+ } |
+ // Store current state and begin new disjunction parsing. |
+ stored_state = new RegExpParserState(stored_state, |
+ builder, |
+ disjunction_capture_index, |
+ group_type); |
+ builder = new RegExpBuilder(); |
+ group_type = type; |
+ if (type == CAPTURE) { |
+ disjunction_capture_index = captures_started(); |
+ } |
break; |
} |
case '[': { |
RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); |
- builder.AddAtom(atom); |
+ builder->AddAtom(atom); |
break; |
} |
// Atom :: |
@@ -3910,12 +4006,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED); |
case 'b': |
Advance(2); |
- builder.AddAssertion( |
+ builder->AddAssertion( |
new RegExpAssertion(RegExpAssertion::BOUNDARY)); |
continue; |
case 'B': |
Advance(2); |
- builder.AddAssertion( |
+ builder->AddAssertion( |
new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); |
continue; |
// AtomEscape :: |
@@ -3929,27 +4025,29 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); |
CharacterRange::AddClassEscape(c, ranges); |
RegExpTree* atom = new RegExpCharacterClass(ranges, false); |
- builder.AddAtom(atom); |
+ builder->AddAtom(atom); |
goto has_read_atom; // Avoid setting has_character_escapes_. |
} |
case '1': case '2': case '3': case '4': case '5': case '6': |
case '7': case '8': case '9': { |
int index = 0; |
if (ParseBackReferenceIndex(&index)) { |
- if (!CaptureAvailable(index - 1)) { |
- // Prepare to ignore a following quantifier |
- builder.AddEmpty(); |
+ RegExpCapture* capture = NULL; |
+ if (captures_ != NULL && index <= captures_->length()) { |
+ capture = captures_->at(index - 1); |
+ } |
+ if (capture == NULL) { |
+ builder->AddEmpty(); |
goto has_read_atom; |
} |
- RegExpCapture* capture = captures_->at(index - 1); |
RegExpTree* atom = new RegExpBackReference(capture); |
- builder.AddAtom(atom); |
+ builder->AddAtom(atom); |
goto has_read_atom; // Avoid setting has_character_escapes_. |
} |
uc32 first_digit = Next(); |
if (first_digit == '8' || first_digit == '9') { |
// Treat as identity escape |
- builder.AddCharacter(first_digit); |
+ builder->AddCharacter(first_digit); |
Advance(2); |
break; |
} |
@@ -3958,44 +4056,44 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case '0': { |
Advance(); |
uc32 octal = ParseOctalLiteral(); |
- builder.AddCharacter(octal); |
+ builder->AddCharacter(octal); |
break; |
} |
// ControlEscape :: one of |
// f n r t v |
case 'f': |
Advance(2); |
- builder.AddCharacter('\f'); |
+ builder->AddCharacter('\f'); |
break; |
case 'n': |
Advance(2); |
- builder.AddCharacter('\n'); |
+ builder->AddCharacter('\n'); |
break; |
case 'r': |
Advance(2); |
- builder.AddCharacter('\r'); |
+ builder->AddCharacter('\r'); |
break; |
case 't': |
Advance(2); |
- builder.AddCharacter('\t'); |
+ builder->AddCharacter('\t'); |
break; |
case 'v': |
Advance(2); |
- builder.AddCharacter('\v'); |
+ builder->AddCharacter('\v'); |
break; |
case 'c': { |
Advance(2); |
uc32 control = ParseControlLetterEscape(); |
- builder.AddCharacter(control); |
+ builder->AddCharacter(control); |
break; |
} |
case 'x': { |
Advance(2); |
uc32 value; |
if (ParseHexEscape(2, &value)) { |
- builder.AddCharacter(value); |
+ builder->AddCharacter(value); |
} else { |
- builder.AddCharacter('x'); |
+ builder->AddCharacter('x'); |
} |
break; |
} |
@@ -4003,15 +4101,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
Advance(2); |
uc32 value; |
if (ParseHexEscape(4, &value)) { |
- builder.AddCharacter(value); |
+ builder->AddCharacter(value); |
} else { |
- builder.AddCharacter('u'); |
+ builder->AddCharacter('u'); |
} |
break; |
} |
default: |
// Identity escape. |
- builder.AddCharacter(Next()); |
+ builder->AddCharacter(Next()); |
Advance(2); |
break; |
} |
@@ -4024,7 +4122,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
// fallthrough |
} |
default: |
- builder.AddCharacter(current()); |
+ builder->AddCharacter(current()); |
Advance(); |
break; |
} // end switch(current()) |
@@ -4071,7 +4169,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
is_greedy = false; |
Advance(); |
} |
- builder.AddQuantifierToAtom(min, max, is_greedy); |
+ builder->AddQuantifierToAtom(min, max, is_greedy); |
} |
} |
@@ -4382,73 +4480,6 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
} |
-RegExpTree* RegExpParser::ParseGroup() { |
- ASSERT_EQ(current(), '('); |
- char type = '('; |
- Advance(); |
- if (current() == '?') { |
- switch (Next()) { |
- case ':': case '=': case '!': |
- type = Next(); |
- Advance(2); |
- break; |
- default: |
- ReportError(CStrVector("Invalid group") CHECK_FAILED); |
- break; |
- } |
- } else { |
- if (captures_ == NULL) { |
- captures_ = new ZoneList<RegExpCapture*>(2); |
- } |
- if (captures_started() >= kMaxCaptures) { |
- ReportError(CStrVector("Too many captures") CHECK_FAILED); |
- } |
- captures_->Add(NULL); |
- } |
- int capture_index = captures_started(); |
- RegExpTree* body = ParseDisjunction(CHECK_FAILED); |
- if (current() != ')') { |
- ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
- } |
- Advance(); |
- |
- int end_capture_index = captures_started(); |
- if (type == '!') { |
- // Captures inside a negative lookahead are never available outside it. |
- for (int i = capture_index; i < end_capture_index; i++) { |
- RegExpCapture* capture = captures_->at(i); |
- ASSERT(capture != NULL); |
- capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE); |
- } |
- } else { |
- // Captures temporarily unavailable because they are in different |
- // alternatives are all available after the disjunction. |
- for (int i = capture_index; i < end_capture_index; i++) { |
- RegExpCapture* capture = captures_->at(i); |
- ASSERT(capture != NULL); |
- if (capture->available() == CAPTURE_UNREACHABLE) { |
- capture->set_available(CAPTURE_AVAILABLE); |
- } |
- } |
- } |
- |
- if (type == '(') { |
- RegExpCapture* capture = new RegExpCapture(body, capture_index); |
- captures_->at(capture_index - 1) = capture; |
- return capture; |
- } else if (type == ':') { |
- return body; |
- } else { |
- ASSERT(type == '=' || type == '!'); |
- bool is_positive = (type == '='); |
- return new RegExpLookahead(body, |
- is_positive, |
- end_capture_index - capture_index, |
- capture_index); |
- } |
-} |
- |
- |
CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
ASSERT_EQ(0, *char_class); |
uc32 first = current(); |