Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1535)

Unified Diff: regexp2000/src/parser.cc

Issue 10750: * Update to RegExp parsing and AST. (Closed)
Patch Set: Addressed review comments Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « regexp2000/src/jsregexp.cc ('k') | regexp2000/src/regexp-macro-assembler.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: regexp2000/src/parser.cc
diff --git a/regexp2000/src/parser.cc b/regexp2000/src/parser.cc
index aa923d4e9208d1b712cba82f3abc5982ee4cd766..82d0370a97b9c7e44d3d176b63f9b903db40abca 100644
--- a/regexp2000/src/parser.cc
+++ b/regexp2000/src/parser.cc
@@ -291,6 +291,9 @@ class RegExpBuilder {
public:
RegExpBuilder();
void AddCharacter(uc16 character);
+ // "Adds" an empty expression. Does nothing except consume a
+ // following quantifier.
+ void AddEmpty();
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
@@ -299,6 +302,7 @@ class RegExpBuilder {
private:
void FlushCharacters();
bool FlushTerms();
+ bool pending_empty_;
ZoneList<uc16>* characters_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> alternatives_;
@@ -311,7 +315,8 @@ class RegExpBuilder {
};
-RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(), alternatives_()
+RegExpBuilder::RegExpBuilder()
+ : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
#ifdef DEBUG
, last_added_(ADD_NONE)
#endif
@@ -319,6 +324,7 @@ RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(), alternatives_()
void RegExpBuilder::FlushCharacters() {
+ pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
characters_ = NULL;
@@ -329,6 +335,7 @@ void RegExpBuilder::FlushCharacters() {
void RegExpBuilder::AddCharacter(uc16 c) {
+ pending_empty_ = false;
if (characters_ == NULL) {
characters_ = new ZoneList<uc16>(4);
}
@@ -337,6 +344,11 @@ void RegExpBuilder::AddCharacter(uc16 c) {
}
+void RegExpBuilder::AddEmpty() {
+ pending_empty_ = true;
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* atom) {
FlushCharacters();
terms_.Add(atom);
@@ -391,6 +403,10 @@ RegExpTree* RegExpBuilder::ToRegExp() {
void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
+ if (pending_empty_) {
+ pending_empty_ = false;
+ return;
+ }
RegExpTree* atom;
if (characters_ != NULL) {
ASSERT(last_added_ == ADD_CHAR);
@@ -465,7 +481,7 @@ class RegExpParser {
bool HasCharacterEscapes();
- int captures_started() { return captures_started_; }
+ int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
private:
@@ -479,13 +495,13 @@ class RegExpParser {
bool has_more_;
bool has_next_;
bool multiline_mode_;
- int captures_started_;
unibrow::CharacterStream* in_;
Handle<String>* error_;
static const int kMaxPushback = 5;
int pushback_count_;
uc32 pushback_buffer_[kMaxPushback];
bool has_character_escapes_;
+ ZoneList<RegExpCapture*>* captures_;
};
@@ -3437,11 +3453,11 @@ RegExpParser::RegExpParser(unibrow::CharacterStream* in,
has_more_(true),
has_next_(true),
multiline_mode_(multiline_mode),
- captures_started_(0),
in_(in),
error_(error),
pushback_count_(0),
- has_character_escapes_(false) {
+ has_character_escapes_(false),
+ captures_(NULL) {
Advance(2);
}
@@ -3523,15 +3539,25 @@ RegExpTree* RegExpParser::ParsePattern(bool* ok) {
// Atom Quantifier
RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
RegExpBuilder builder;
+ int capture_start_index = captures_started();
while (true) {
switch (current()) {
case kEndMarker:
case ')':
return builder.ToRegExp();
- case '|':
+ case '|': {
Advance();
builder.NewAlternative();
+ int capture_new_alt_start_index = captures_started();
+ for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ if (capture->available() == CAPTURE_AVAILABLE) {
+ capture->set_available(CAPTURE_UNREACHABLE);
+ }
+ }
+ capture_start_index = capture_new_alt_start_index;
continue;
+ }
case '*':
case '+':
case '?':
@@ -3606,7 +3632,13 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
case '7': case '8': case '9': {
int index = 0;
if (ParseBackreferenceIndex(&index)) {
- RegExpTree* atom = new RegExpBackreference(index);
+ RegExpCapture* capture = captures_->at(index - 1);
+ if (capture == NULL || capture->available() != CAPTURE_AVAILABLE) {
+ // Prepare to ignore a following quantifier.
+ builder.AddEmpty();
+ goto has_read_atom;
+ }
+ RegExpTree* atom = new RegExpBackreference(capture);
builder.AddAtom(atom);
goto has_read_atom; // Avoid setting has_character_escapes_.
}
@@ -3775,10 +3807,10 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
// This is not according to the ECMAScript specification. According to
// that, one must accept values up to the total number of left capturing
// parentheses in the entire input, even if they are meaningless.
- if (captures_started_ == 0)
+ if (captures_ == NULL)
return false;
int value = next() - '0';
- if (value > captures_started_)
+ if (value > captures_->length())
return false;
static const int kMaxChars = kMaxPushback - 2;
EmbeddedVector<uc32, kMaxChars> chars_seen;
@@ -3791,7 +3823,7 @@ bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
value = 10 * value + (c - '0');
// To avoid reading past the end of the stack-allocated pushback
// buffers we only read kMaxChars before giving up.
- if (value > captures_started_ || char_count > kMaxChars) {
+ if (value > captures_->length() || char_count > kMaxChars) {
// If we give up we have to push the characters we read back
// onto the pushback buffer in the reverse order.
for (int i = 0; i < char_count; i++) {
@@ -4005,16 +4037,42 @@ RegExpTree* RegExpParser::ParseGroup(bool* ok) {
break;
}
} else {
- captures_started_++;
+ if (captures_ == NULL) {
+ captures_ = new ZoneList<RegExpCapture*>(2);
+ }
+ captures_->Add(NULL);
}
- int capture_index = captures_started_;
+ int capture_index = captures_started();
RegExpTree* body = ParseDisjunction(CHECK_OK);
if (current() != ')') {
ReportError(CStrVector("Unterminated group"), CHECK_OK);
}
Advance();
+
+ int end_capture_index = captures_started();
+ if (type == '!') {
+ // Captures inside a negative lookahead are never available outside it.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
+ }
+ } else {
+ // Captures temporarily unavailable because they are in different
+ // alternatives are all available after the disjunction.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ if (capture->available() == CAPTURE_UNREACHABLE) {
+ capture->set_available(CAPTURE_AVAILABLE);
+ }
+ }
+ }
+
if (type == '(') {
- return new RegExpCapture(body, capture_index);
+ RegExpCapture* capture = new RegExpCapture(body, capture_index);
+ captures_->at(capture_index - 1) = capture;
+ return capture;
} else if (type == ':') {
return body;
} else {
@@ -4093,10 +4151,10 @@ RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {
}
Advance();
if (ranges->length() == 0) {
- return RegExpEmpty::GetInstance();
- } else {
- return new RegExpCharacterClass(ranges, is_negated);
+ ranges->Add(CharacterRange::Range(0, 0xffff));
+ is_negated = !is_negated;
}
+ return new RegExpCharacterClass(ranges, is_negated);
}
« no previous file with comments | « regexp2000/src/jsregexp.cc ('k') | regexp2000/src/regexp-macro-assembler.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698