Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(413)

Unified Diff: src/parser.cc

Issue 8635: Backreferences (Closed)
Patch Set: Added test case Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/parser.cc
diff --git a/src/parser.cc b/src/parser.cc
index 989e9cb716c51377ca59b8f4aef423bae5f0c344..c71a4bf5ec6b78e314fa26516cfdae9a4d167a56 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -252,6 +252,12 @@ class RegExpParser {
uc32 ParseControlEscape(bool* ok);
uc32 ParseOctalLiteral(bool* ok);
+ // Tries to parse the input as a backreference. If successful it
+ // stores the result in the output parameter and returns true. If
+ // it fails it will push back the characters read so the same characters
+ // can be reparsed.
+ bool ParseBackreferenceIndex(int* index_out);
+
CharacterRange ParseClassAtom(bool* ok);
RegExpTree* ReportError(Vector<const char> message, bool* ok);
void Advance();
@@ -270,6 +276,9 @@ class RegExpParser {
int captures_seen_;
unibrow::CharacterStream* in_;
Handle<String>* error_;
+ static const int kMaxPushback = 5;
+ int pushback_count_;
+ uc32 pushback_buffer_[kMaxPushback];
};
@@ -3208,7 +3217,8 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
// ----------------------------------------------------------------------------
-// Regular expressions.
+// Regular expressions
+
RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)
: current_(kEndMarker),
@@ -3217,14 +3227,20 @@ RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)
has_next_(true),
captures_seen_(0),
in_(in),
- error_(error) {
+ error_(error),
+ pushback_count_(0) {
Advance(2);
}
+
void RegExpParser::Advance() {
current_ = next_;
has_more_ = has_next_;
- if (in()->has_more()) {
+ if (pushback_count_ > 0) {
+ pushback_count_--;
+ next_ = pushback_buffer_[pushback_count_];
+ has_next_ = true;
+ } else if (in()->has_more()) {
next_ = in()->GetNext();
Lasse Reichstein 2008/10/28 10:09:52 the has_next variable is set to true in the push_b
Christian Plesner Hansen 2008/10/28 11:17:45 We know it will work because we advance twice afte
} else {
next_ = kEndMarker;
@@ -3232,23 +3248,27 @@ void RegExpParser::Advance() {
}
}
+
void RegExpParser::Advance(int dist) {
for (int i = 0; i < dist; i++)
Advance();
}
+
RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
*ok = false;
*error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
return NULL;
}
+
// Pattern ::
// Disjunction
RegExpTree* RegExpParser::ParsePattern(bool* ok) {
return ParseDisjunction(ok);
}
+
// Disjunction ::
// Alternative
// Alternative | Disjunction
@@ -3268,10 +3288,12 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
}
}
+
static bool IsAlternativeTerminator(uc32 c) {
return c == '|' || c == ')' || c == RegExpParser::kEndMarker;
}
+
// Alternative ::
// [empty]
// Alternative Term
@@ -3332,6 +3354,52 @@ static bool IsSpecialEscape(uc32 c) {
}
+bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
+ ASSERT_EQ('\\', current());
+ ASSERT('1' <= next() && next() <= '9');
+ ASSERT_EQ(0, pushback_count_);
+ if (captures_seen_ == 0)
+ return false;
+ int value = next() - '0';
+ if (value > captures_seen_)
+ return false;
+ static const int kMaxChars = kMaxPushback - 2;
+ EmbeddedVector<uc32, kMaxChars> chars_seen;
Lasse Reichstein 2008/10/28 10:09:52 EmbeddedVector is used instead of array to have ra
Christian Plesner Hansen 2008/10/28 11:17:45 Bingo
+ chars_seen[0] = next();
+ int char_count = 1;
+ Advance(2);
+ while (true) {
+ uc32 c = current();
+ if (IsDecimalDigit(c)) {
+ int next_value = 10 * value + (c - '0');
+ // To avoid reading past the end of thw stack-allocated pushback
Lasse Reichstein 2008/10/28 10:09:52 "thw" -> "the"
+ // buffers we only read kMaxChars before giving up.
+ if (next_value > captures_seen_ || char_count > kMaxChars) {
+ // If we give up we have to push the characters we read back
+ // onto the pushback buffer in the reverse order.
+ pushback_buffer_[0] = current();
+ for (int i = 0; i < char_count; i++)
+ pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];
+ pushback_buffer_[char_count + 1] = '\\';
+ pushback_count_ = char_count + 2;
+ // Then, once we've filled up the buffer, we read the two
+ // first characters into the lookahead. This is a roundabout
+ // way of doing it but makes the code simpler.
+ Advance(2);
+ return false;
+ } else {
+ value = next_value;
+ chars_seen[char_count++] = current();
+ Advance();
+ }
+ } else {
+ *index_out = value;
+ return true;
+ }
+ }
+}
+
+
// Term ::
// Assertion
// Atom
@@ -3384,15 +3452,27 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {
atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c));
goto has_read_atom;
}
- // Todo backreferences
+ case '1': case '2': case '3': case '4': case '5': case '6':
+ case '7': case '8': case '9': {
+ int index = 0;
+ if (ParseBackreferenceIndex(&index)) {
+ atom = new RegExpBackreference(index);
+ goto has_read_atom;
+ } else {
+ // If this is not a backreference we go to the atom parser
+ // which will read it as an octal escape.
+ goto parse_atom;
+ }
+ }
default:
- break;
+ goto parse_atom;
}
}
// All other escapes fall through to the default case since
// they correspond to single characters that can be
// represented within atoms.
default: {
+ parse_atom:
atom = ParseAtom(CHECK_OK);
break;
}

Powered by Google App Engine
This is Rietveld 408576698