Chromium Code Reviews| Index: src/parser.cc |
| diff --git a/src/parser.cc b/src/parser.cc |
| index 989e9cb716c51377ca59b8f4aef423bae5f0c344..c71a4bf5ec6b78e314fa26516cfdae9a4d167a56 100644 |
| --- a/src/parser.cc |
| +++ b/src/parser.cc |
| @@ -252,6 +252,12 @@ class RegExpParser { |
| uc32 ParseControlEscape(bool* ok); |
| uc32 ParseOctalLiteral(bool* ok); |
| + // Tries to parse the input as a backreference. If successful it |
| + // stores the result in the output parameter and returns true. If |
| + // it fails it will push back the characters read so the same characters |
| + // can be reparsed. |
| + bool ParseBackreferenceIndex(int* index_out); |
| + |
| CharacterRange ParseClassAtom(bool* ok); |
| RegExpTree* ReportError(Vector<const char> message, bool* ok); |
| void Advance(); |
| @@ -270,6 +276,9 @@ class RegExpParser { |
| int captures_seen_; |
| unibrow::CharacterStream* in_; |
| Handle<String>* error_; |
| + static const int kMaxPushback = 5; |
| + int pushback_count_; |
| + uc32 pushback_buffer_[kMaxPushback]; |
| }; |
| @@ -3208,7 +3217,8 @@ Expression* Parser::NewThrowError(Handle<String> constructor, |
| // ---------------------------------------------------------------------------- |
| -// Regular expressions. |
| +// Regular expressions |
| + |
| RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) |
| : current_(kEndMarker), |
| @@ -3217,14 +3227,20 @@ RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) |
| has_next_(true), |
| captures_seen_(0), |
| in_(in), |
| - error_(error) { |
| + error_(error), |
| + pushback_count_(0) { |
| Advance(2); |
| } |
| + |
| void RegExpParser::Advance() { |
| current_ = next_; |
| has_more_ = has_next_; |
| - if (in()->has_more()) { |
| + if (pushback_count_ > 0) { |
| + pushback_count_--; |
| + next_ = pushback_buffer_[pushback_count_]; |
| + has_next_ = true; |
| + } else if (in()->has_more()) { |
| next_ = in()->GetNext(); |
|
Lasse Reichstein
2008/10/28 10:09:52
the has_next variable is set to true in the push_b
Christian Plesner Hansen
2008/10/28 11:17:45
We know it will work because we advance twice afte
|
| } else { |
| next_ = kEndMarker; |
| @@ -3232,23 +3248,27 @@ void RegExpParser::Advance() { |
| } |
| } |
| + |
| void RegExpParser::Advance(int dist) { |
| for (int i = 0; i < dist; i++) |
| Advance(); |
| } |
| + |
| RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { |
| *ok = false; |
| *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); |
| return NULL; |
| } |
| + |
| // Pattern :: |
| // Disjunction |
| RegExpTree* RegExpParser::ParsePattern(bool* ok) { |
| return ParseDisjunction(ok); |
| } |
| + |
| // Disjunction :: |
| // Alternative |
| // Alternative | Disjunction |
| @@ -3268,10 +3288,12 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { |
| } |
| } |
| + |
| static bool IsAlternativeTerminator(uc32 c) { |
| return c == '|' || c == ')' || c == RegExpParser::kEndMarker; |
| } |
| + |
| // Alternative :: |
| // [empty] |
| // Alternative Term |
| @@ -3332,6 +3354,52 @@ static bool IsSpecialEscape(uc32 c) { |
| } |
| +bool RegExpParser::ParseBackreferenceIndex(int* index_out) { |
| + ASSERT_EQ('\\', current()); |
| + ASSERT('1' <= next() && next() <= '9'); |
| + ASSERT_EQ(0, pushback_count_); |
| + if (captures_seen_ == 0) |
| + return false; |
| + int value = next() - '0'; |
| + if (value > captures_seen_) |
| + return false; |
| + static const int kMaxChars = kMaxPushback - 2; |
| + EmbeddedVector<uc32, kMaxChars> chars_seen; |
|
Lasse Reichstein
2008/10/28 10:09:52
EmbeddedVector is used instead of array to have ra
Christian Plesner Hansen
2008/10/28 11:17:45
Bingo
|
| + chars_seen[0] = next(); |
| + int char_count = 1; |
| + Advance(2); |
| + while (true) { |
| + uc32 c = current(); |
| + if (IsDecimalDigit(c)) { |
| + int next_value = 10 * value + (c - '0'); |
| + // To avoid reading past the end of thw stack-allocated pushback |
|
Lasse Reichstein
2008/10/28 10:09:52
"thw" -> "the"
|
| + // buffers we only read kMaxChars before giving up. |
| + if (next_value > captures_seen_ || char_count > kMaxChars) { |
| + // If we give up we have to push the characters we read back |
| + // onto the pushback buffer in the reverse order. |
| + pushback_buffer_[0] = current(); |
| + for (int i = 0; i < char_count; i++) |
| + pushback_buffer_[i + 1] = chars_seen[char_count - i - 1]; |
| + pushback_buffer_[char_count + 1] = '\\'; |
| + pushback_count_ = char_count + 2; |
| + // Then, once we've filled up the buffer, we read the two |
| + // first characters into the lookahead. This is a roundabout |
| + // way of doing it but makes the code simpler. |
| + Advance(2); |
| + return false; |
| + } else { |
| + value = next_value; |
| + chars_seen[char_count++] = current(); |
| + Advance(); |
| + } |
| + } else { |
| + *index_out = value; |
| + return true; |
| + } |
| + } |
| +} |
| + |
| + |
| // Term :: |
| // Assertion |
| // Atom |
| @@ -3384,15 +3452,27 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) { |
| atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c)); |
| goto has_read_atom; |
| } |
| - // Todo backreferences |
| + case '1': case '2': case '3': case '4': case '5': case '6': |
| + case '7': case '8': case '9': { |
| + int index = 0; |
| + if (ParseBackreferenceIndex(&index)) { |
| + atom = new RegExpBackreference(index); |
| + goto has_read_atom; |
| + } else { |
| + // If this is not a backreference we go to the atom parser |
| + // which will read it as an octal escape. |
| + goto parse_atom; |
| + } |
| + } |
| default: |
| - break; |
| + goto parse_atom; |
| } |
| } |
| // All other escapes fall through to the default case since |
| // they correspond to single characters that can be |
| // represented within atoms. |
| default: { |
| + parse_atom: |
| atom = ParseAtom(CHECK_OK); |
| break; |
| } |