src/parser.cc - Issue 11350: Add back references. Since "back references" is two words...

Unified Diff: src/parser.cc

Issue 11350: Add back references. Since "back references" is two words... (Closed) Base URL: http://v8.googlecode.com/svn/branches/experimental/regexp2000/

Patch Set: Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/parser.cc

===================================================================

--- src/parser.cc (revision 799)

+++ src/parser.cc (working copy)

@@ -515,11 +515,11 @@

uc32 ParseControlLetterEscape(bool* ok);

uc32 ParseOctalLiteral();

- // Tries to parse the input as a backreference. If successful it

+ // Tries to parse the input as a back reference. If successful it

// stores the result in the output parameter and returns true. If

// it fails it will push back the characters read so the same characters

// can be reparsed.

- bool ParseBackreferenceIndex(int* index_out);

+ bool ParseBackReferenceIndex(int* index_out);

CharacterRange ParseClassAtom(bool* is_char_class,

ZoneList<CharacterRange>* ranges,

@@ -541,6 +541,8 @@

bool has_next() { return next_pos_ < in()->length(); }

uc32 Next();

FlatStringReader* in() { return in_; }

+ void ScanForCaptures();

+ bool CaptureAvailable(int index);

uc32 current_;

bool has_more_;

bool multiline_mode_;

@@ -548,7 +550,9 @@

FlatStringReader* in_;

Handle<String>* error_;

bool has_character_escapes_;

+ bool is_scanned_for_captures_;

ZoneList<RegExpCapture*>* captures_;

+ int capture_count_;

Christian Plesner Hansen 2008/11/21 11:16:39 This name could be misleading but I can't think of

};

@@ -3502,7 +3506,9 @@

in_(in),

error_(error),

has_character_escapes_(false),

- captures_(NULL) {

+ is_scanned_for_captures_(false),

+ captures_(NULL),

+ capture_count_(0) {

Advance(1);

}

@@ -3564,6 +3570,14 @@

}

+bool RegExpParser::CaptureAvailable(int index) {

+ if (captures_ == NULL) return false;

+ if (index >= captures_->length()) return false;

+ RegExpCapture* capture = captures_->at(index);

+ return capture != NULL && capture->available() == CAPTURE_AVAILABLE;

// Disjunction ::

// Alternative

// Alternative | Disjunction

@@ -3667,14 +3681,14 @@

case '1': case '2': case '3': case '4': case '5': case '6':

case '7': case '8': case '9': {

int index = 0;

- if (ParseBackreferenceIndex(&index)) {

- RegExpCapture* capture = captures_->at(index - 1);

- if (capture == NULL || capture->available() != CAPTURE_AVAILABLE) {

+ if (ParseBackReferenceIndex(&index)) {

+ if (!CaptureAvailable(index - 1)) {

// Prepare to ignore a following quantifier

builder.AddEmpty();

goto has_read_atom;

}

- RegExpTree* atom = new RegExpBackreference(capture);

+ RegExpCapture* capture = captures_->at(index - 1);

+ RegExpTree* atom = new RegExpBackReference(capture);

builder.AddAtom(atom);

goto has_read_atom; // Avoid setting has_character_escapes_.

}

@@ -3844,7 +3858,42 @@

#endif

-bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

+// In order to know whether an escape is a back reference or not we have to scan

+// the entire regexp and find the number of capturing parentheses. However we

+// don't want to scan the regexp twice unless it is necessary. This mini-parser

+// is called when needed. It can see the difference between capturing and

+// noncapturing parentheses and can skip character classes and backslash-escaped

+// characters.

+void RegExpParser::ScanForCaptures() {

+ int n;

+ while ((n = current()) != kEndMarker) {

+ Advance();

+ switch (n) {

+ case '\\':

+ Advance();

+ break;

+ case '[': {

+ int c;

+ while ((c = current()) != kEndMarker) {

+ Advance();

+ if (c == '\\') {

+ Advance();

+ } else {

+ if (c == ']') break;

+ }

+ break;

+ }

+ case '(':

+ if (current() != '?') capture_count_++;

+ break;

+ }

+ is_scanned_for_captures_ = true;

+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

ASSERT_EQ('\\', current());

ASSERT('1' <= Next() && Next() <= '9');

// Try to parse a decimal literal that is no greater than the number

@@ -3852,18 +3901,21 @@

// This is not according to the ECMAScript specification. According to

// that, one must accept values up to the total number of left capturing

// parentheses in the entire input, even if they are meaningless.

- if (captures_ == NULL)

- return false;

+ if (!is_scanned_for_captures_) {

+ int saved_position = position();

+ ScanForCaptures();

+ Reset(saved_position);

+ }

+ if (capture_count_ == 0) return false;

int start = position();

int value = Next() - '0';

- if (value > captures_->length())

- return false;

+ if (value > capture_count_) return false;

Advance(2);

while (true) {

uc32 c = current();

if (IsDecimalDigit(c)) {

value = 10 * value + (c - '0');

- if (value > captures_->length()) {

+ if (value > capture_count_) {

Reset(start);

return false;

}

@@ -4068,6 +4120,7 @@

captures_ = new ZoneList<RegExpCapture*>(2);

}

captures_->Add(NULL);

+ if (!is_scanned_for_captures_) capture_count_++;

}

int capture_index = captures_started();

RegExpTree* body = ParseDisjunction(CHECK_OK);

« src/jsregexp.cc ('K') | « src/jsregexp.cc ('k') | src/regexp-macro-assembler.h » ('j') | no next file with comments »