src/parser.cc - Issue 8635: Backreferences

Unified Diff: src/parser.cc

Issue 8635: Backreferences (Closed)

Patch Set: Added test case Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/parser.cc

diff --git a/src/parser.cc b/src/parser.cc

index 989e9cb716c51377ca59b8f4aef423bae5f0c344..c71a4bf5ec6b78e314fa26516cfdae9a4d167a56 100644

--- a/src/parser.cc

+++ b/src/parser.cc

@@ -252,6 +252,12 @@ class RegExpParser {

uc32 ParseControlEscape(bool* ok);

uc32 ParseOctalLiteral(bool* ok);

+ // Tries to parse the input as a backreference. If successful it

+ // stores the result in the output parameter and returns true. If

+ // it fails it will push back the characters read so the same characters

+ // can be reparsed.

+ bool ParseBackreferenceIndex(int* index_out);

CharacterRange ParseClassAtom(bool* ok);

RegExpTree* ReportError(Vector<const char> message, bool* ok);

void Advance();

@@ -270,6 +276,9 @@ class RegExpParser {

int captures_seen_;

unibrow::CharacterStream* in_;

Handle<String>* error_;

+ static const int kMaxPushback = 5;

+ int pushback_count_;

+ uc32 pushback_buffer_[kMaxPushback];

};

@@ -3208,7 +3217,8 @@ Expression* Parser::NewThrowError(Handle<String> constructor,

// ----------------------------------------------------------------------------

-// Regular expressions.

+// Regular expressions

RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)

: current_(kEndMarker),

@@ -3217,14 +3227,20 @@ RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)

has_next_(true),

captures_seen_(0),

in_(in),

- error_(error) {

+ error_(error),

+ pushback_count_(0) {

Advance(2);

}

void RegExpParser::Advance() {

current_ = next_;

has_more_ = has_next_;

- if (in()->has_more()) {

+ if (pushback_count_ > 0) {

+ pushback_count_--;

+ next_ = pushback_buffer_[pushback_count_];

+ has_next_ = true;

+ } else if (in()->has_more()) {

next_ = in()->GetNext();

Lasse Reichstein 2008/10/28 10:09:52 the has_next variable is set to true in the push_b

Christian Plesner Hansen 2008/10/28 11:17:45 We know it will work because we advance twice afte

} else {

next_ = kEndMarker;

@@ -3232,23 +3248,27 @@ void RegExpParser::Advance() {

}

void RegExpParser::Advance(int dist) {

for (int i = 0; i < dist; i++)

Advance();

}

RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {

*ok = false;

*error_ = Factory::NewStringFromAscii(message, NOT_TENURED);

return NULL;

}

// Pattern ::

// Disjunction

RegExpTree* RegExpParser::ParsePattern(bool* ok) {

return ParseDisjunction(ok);

}

// Disjunction ::

// Alternative

// Alternative | Disjunction

@@ -3268,10 +3288,12 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

}

static bool IsAlternativeTerminator(uc32 c) {

return c == '|' || c == ')' || c == RegExpParser::kEndMarker;

}

// Alternative ::

// [empty]

// Alternative Term

@@ -3332,6 +3354,52 @@ static bool IsSpecialEscape(uc32 c) {

}

+bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

+ ASSERT_EQ('\\', current());

+ ASSERT('1' <= next() && next() <= '9');

+ ASSERT_EQ(0, pushback_count_);

+ if (captures_seen_ == 0)

+ return false;

+ int value = next() - '0';

+ if (value > captures_seen_)

+ return false;

+ static const int kMaxChars = kMaxPushback - 2;

+ EmbeddedVector<uc32, kMaxChars> chars_seen;

Lasse Reichstein 2008/10/28 10:09:52 EmbeddedVector is used instead of array to have ra

Christian Plesner Hansen 2008/10/28 11:17:45 Bingo

+ chars_seen[0] = next();

+ int char_count = 1;

+ Advance(2);

+ while (true) {

+ uc32 c = current();

+ if (IsDecimalDigit(c)) {

+ int next_value = 10 * value + (c - '0');

+ // To avoid reading past the end of thw stack-allocated pushback

Lasse Reichstein 2008/10/28 10:09:52 "thw" -> "the"

+ // buffers we only read kMaxChars before giving up.

+ if (next_value > captures_seen_ || char_count > kMaxChars) {

+ // If we give up we have to push the characters we read back

+ // onto the pushback buffer in the reverse order.

+ pushback_buffer_[0] = current();

+ for (int i = 0; i < char_count; i++)

+ pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];

+ pushback_buffer_[char_count + 1] = '\\';

+ pushback_count_ = char_count + 2;

+ // Then, once we've filled up the buffer, we read the two

+ // first characters into the lookahead. This is a roundabout

+ // way of doing it but makes the code simpler.

+ Advance(2);

+ return false;

+ } else {

+ value = next_value;

+ chars_seen[char_count++] = current();

+ Advance();

+ }

+ } else {

+ *index_out = value;

+ return true;

+ }

// Term ::

// Assertion

// Atom

@@ -3384,15 +3452,27 @@ RegExpTree* RegExpParser::ParseTerm(bool* ok) {

atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c));

goto has_read_atom;

}

- // Todo backreferences

+ case '1': case '2': case '3': case '4': case '5': case '6':

+ case '7': case '8': case '9': {

+ int index = 0;

+ if (ParseBackreferenceIndex(&index)) {

+ atom = new RegExpBackreference(index);

+ goto has_read_atom;

+ } else {

+ // If this is not a backreference we go to the atom parser

+ // which will read it as an octal escape.

+ goto parse_atom;

+ }

default:

- break;

+ goto parse_atom;

}

// All other escapes fall through to the default case since

// they correspond to single characters that can be

// represented within atoms.

default: {

+ parse_atom:

atom = ParseAtom(CHECK_OK);

break;

}

« src/ast.h ('K') | « src/jsregexp.cc ('k') | test/cctest/test-regexp.cc » ('j') | test/cctest/test-regexp.cc » ('J')