regexp2000/src/parser.cc - Issue 10750: * Update to RegExp parsing and AST.

Unified Diff: regexp2000/src/parser.cc

Issue 10750: * Update to RegExp parsing and AST. (Closed)

Patch Set: Addressed review comments Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: regexp2000/src/parser.cc

diff --git a/regexp2000/src/parser.cc b/regexp2000/src/parser.cc

index aa923d4e9208d1b712cba82f3abc5982ee4cd766..82d0370a97b9c7e44d3d176b63f9b903db40abca 100644

--- a/regexp2000/src/parser.cc

+++ b/regexp2000/src/parser.cc

@@ -291,6 +291,9 @@ class RegExpBuilder {

public:

RegExpBuilder();

void AddCharacter(uc16 character);

+ // "Adds" an empty expression. Does nothing except consume a

+ // following quantifier

+ void AddEmpty();

void AddAtom(RegExpTree* tree);

void AddAssertion(RegExpTree* tree);

void NewAlternative(); // '|'

@@ -299,6 +302,7 @@ class RegExpBuilder {

private:

void FlushCharacters();

bool FlushTerms();

+ bool pending_empty_;

ZoneList<uc16>* characters_;

BufferedZoneList<RegExpTree, 2> terms_;

BufferedZoneList<RegExpTree, 2> alternatives_;

@@ -311,7 +315,8 @@ class RegExpBuilder {

};

-RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(), alternatives_()

+RegExpBuilder::RegExpBuilder()

+ : pending_empty_(false), characters_(NULL), terms_(), alternatives_()

#ifdef DEBUG

, last_added_(ADD_NONE)

#endif

@@ -319,6 +324,7 @@ RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(), alternatives_()

void RegExpBuilder::FlushCharacters() {

+ pending_empty_ = false;

if (characters_ != NULL) {

RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());

characters_ = NULL;

@@ -329,6 +335,7 @@ void RegExpBuilder::FlushCharacters() {

void RegExpBuilder::AddCharacter(uc16 c) {

+ pending_empty_ = false;

if (characters_ == NULL) {

characters_ = new ZoneList<uc16>(4);

}

@@ -337,6 +344,11 @@ void RegExpBuilder::AddCharacter(uc16 c) {

}

+void RegExpBuilder::AddEmpty() {

+ pending_empty_ = true;

void RegExpBuilder::AddAtom(RegExpTree* atom) {

FlushCharacters();

terms_.Add(atom);

@@ -391,6 +403,10 @@ RegExpTree* RegExpBuilder::ToRegExp() {

void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {

+ if (pending_empty_) {

+ pending_empty_ = false;

+ return;

+ }

RegExpTree* atom;

if (characters_ != NULL) {

ASSERT(last_added_ == ADD_CHAR);

@@ -465,7 +481,7 @@ class RegExpParser {

bool HasCharacterEscapes();

- int captures_started() { return captures_started_; }

+ int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }

static const uc32 kEndMarker = unibrow::Utf8::kBadChar;

private:

@@ -479,13 +495,13 @@ class RegExpParser {

bool has_more_;

bool has_next_;

bool multiline_mode_;

- int captures_started_;

unibrow::CharacterStream* in_;

Handle<String>* error_;

static const int kMaxPushback = 5;

int pushback_count_;

uc32 pushback_buffer_[kMaxPushback];

bool has_character_escapes_;

+ ZoneList<RegExpCapture*>* captures_;

};

@@ -3437,11 +3453,11 @@ RegExpParser::RegExpParser(unibrow::CharacterStream* in,

has_more_(true),

has_next_(true),

multiline_mode_(multiline_mode),

- captures_started_(0),

in_(in),

error_(error),

pushback_count_(0),

- has_character_escapes_(false) {

+ has_character_escapes_(false),

+ captures_(NULL) {

Advance(2);

}

@@ -3523,15 +3539,25 @@ RegExpTree* RegExpParser::ParsePattern(bool* ok) {

// Atom Quantifier

RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

RegExpBuilder builder;

+ int capture_start_index = captures_started();

while (true) {

switch (current()) {

case kEndMarker:

case ')':

return builder.ToRegExp();

- case '|':

+ case '|': {

Advance();

builder.NewAlternative();

+ int capture_new_alt_start_index = captures_started();

+ for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ if (capture->available() == CAPTURE_AVAILABLE) {

+ capture->set_available(CAPTURE_UNREACHABLE);

+ }

+ capture_start_index = capture_new_alt_start_index;

continue;

+ }

case '*':

case '+':

case '?':

@@ -3606,7 +3632,13 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

case '7': case '8': case '9': {

int index = 0;

if (ParseBackreferenceIndex(&index)) {

- RegExpTree* atom = new RegExpBackreference(index);

+ RegExpCapture* capture = captures_->at(index - 1);

+ if (capture == NULL || capture->available() != CAPTURE_AVAILABLE) {

+ // Prepare to ignore a following quantifier

+ builder.AddEmpty();

+ goto has_read_atom;

+ }

+ RegExpTree* atom = new RegExpBackreference(capture);

builder.AddAtom(atom);

goto has_read_atom; // Avoid setting has_character_escapes_.

}

@@ -3775,10 +3807,10 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

// This is a not according the the ECMAScript specification. According to

// that, one must accept values up to the total number of left capturing

// parentheses in the entire input, even if they are meaningless.

- if (captures_started_ == 0)

+ if (captures_ == NULL)

return false;

int value = next() - '0';

- if (value > captures_started_)

+ if (value > captures_->length())

return false;

static const int kMaxChars = kMaxPushback - 2;

EmbeddedVector<uc32, kMaxChars> chars_seen;

@@ -3791,7 +3823,7 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

value = 10 * value + (c - '0');

// To avoid reading past the end of the stack-allocated pushback

// buffers we only read kMaxChars before giving up.

- if (value > captures_started_ || char_count > kMaxChars) {

+ if (value > captures_->length() || char_count > kMaxChars) {

// If we give up we have to push the characters we read back

// onto the pushback buffer in the reverse order.

for (int i = 0; i < char_count; i++) {

@@ -4005,16 +4037,42 @@ RegExpTree* RegExpParser::ParseGroup(bool* ok) {

break;

}

} else {

- captures_started_++;

+ if (captures_ == NULL) {

+ captures_ = new ZoneList<RegExpCapture*>(2);

+ }

+ captures_->Add(NULL);

}

- int capture_index = captures_started_;

+ int capture_index = captures_started();

RegExpTree* body = ParseDisjunction(CHECK_OK);

if (current() != ')') {

ReportError(CStrVector("Unterminated group"), CHECK_OK);

}

Advance();

+ int end_capture_index = captures_started();

+ if (type == '!') {

+ // Captures inside a negative lookahead are never available outside it.

+ for (int i = capture_index; i < end_capture_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ ASSERT(capture != NULL);

+ capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);

+ }

+ } else {

+ // Captures temporarily unavailable because they are in different

+ // alternatives are all available after the disjunction.

+ for (int i = capture_index; i < end_capture_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ ASSERT(capture != NULL);

+ if (capture->available() == CAPTURE_UNREACHABLE) {

+ capture->set_available(CAPTURE_AVAILABLE);

+ }

if (type == '(') {

- return new RegExpCapture(body, capture_index);

+ RegExpCapture* capture = new RegExpCapture(body, capture_index);

+ captures_->at(capture_index - 1) = capture;

+ return capture;

} else if (type == ':') {

return body;

} else {

@@ -4093,10 +4151,10 @@ RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {

}

Advance();

if (ranges->length() == 0) {

- return RegExpEmpty::GetInstance();

- } else {

- return new RegExpCharacterClass(ranges, is_negated);

+ ranges->Add(CharacterRange::Range(0, 0xffff));

+ is_negated = !is_negated;

}

+ return new RegExpCharacterClass(ranges, is_negated);

}

« no previous file with comments | « regexp2000/src/jsregexp.cc ('k') | regexp2000/src/regexp-macro-assembler.h » ('j') | no next file with comments »