Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(332)

Unified Diff: src/parser.cc

Issue 8188: Some new regexp infrastructure. (Closed)
Patch Set: Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« src/jsregexp.cc ('K') | « src/parser.h ('k') | src/string-stream.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/parser.cc
diff --git a/src/parser.cc b/src/parser.cc
index 9090dbb3eea8f5f8ee9626922fdb26528b73615c..f1ba94d90f4ea2c1c93f771ab96eec7beee07967 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -34,6 +34,7 @@
#include "runtime.h"
#include "parser.h"
#include "scopes.h"
+#include "string-stream.h"
namespace v8 { namespace internal {
@@ -227,6 +228,50 @@ class Parser {
};
+class RegExpParser {
+ public:
+ RegExpParser(unibrow::CharacterStream* in, Handle<String>* error);
+ RegExpTree* ParsePattern(bool* ok);
+ RegExpTree* ParseDisjunction(bool* ok);
+ RegExpTree* ParseAlternative(bool* ok);
+ RegExpTree* ParseTerm(bool* ok);
+ RegExpTree* ParseAtom(bool* ok);
+ RegExpTree* ParseGroup(bool* ok);
+ RegExpTree* ParseCharacterClass(bool* ok);
+
+ // Parses a {...,...} quantifier and stores the range in the given
+ // out parameters.
+ void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok);
+
+ // Parses and returns a single escaped character. The character
+ // must not be 'b' or 'B' since they are usually handle specially.
+ uc32 ParseCharacterEscape(bool* ok);
+
+ uc32 ParseHexEscape(int length);
+
+ uc32 ParseControlEscape(bool* ok);
+ uc32 ParseOctalLiteral(bool* ok);
+
+ CharacterRange ParseClassAtom(bool* ok);
+ RegExpTree* ReportError(Vector<const char> message, bool* ok);
+ void Advance();
+ void Advance(int dist);
+ private:
+ uc32 current() { return current_; }
+ uc32 next() { return next_; }
+ bool has_more() { return has_more_; }
+ bool has_next() { return has_next_; }
+ unibrow::CharacterStream* in() { return in_; }
+ uc32 current_;
+ uc32 next_;
+ bool has_more_;
+ bool has_next_;
+ int captures_seen_;
+ unibrow::CharacterStream* in_;
+ Handle<String>* error_;
+};
+
+
// A temporary scope stores information during parsing, just like
// a plain scope. However, temporary scopes are not kept around
// after parsing or referenced by syntax trees so they can be stack-
@@ -3162,6 +3207,511 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
// ----------------------------------------------------------------------------
+// Regular expressions.
+
+RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)
+ : current_('\0'),
+ next_('\0'),
+ has_more_(true),
+ has_next_(true),
+ captures_seen_(0),
+ in_(in),
+ error_(error) {
+ Advance(2);
+}
+
+void RegExpParser::Advance() {
+ current_ = next_;
+ has_more_ = has_next_;
+ if (in()->has_more()) {
+ next_ = in()->GetNext();
+ } else {
+ next_ = '\0';
+ has_next_ = false;
+ }
+}
+
+void RegExpParser::Advance(int dist) {
+ for (int i = 0; i < dist; i++)
+ Advance();
+}
+
+RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
+ *ok = false;
+ *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
+ return NULL;
+}
+
+// Pattern ::
+// Disjunction
+RegExpTree* RegExpParser::ParsePattern(bool* ok) {
+ return ParseDisjunction(ok);
+}
+
+// Disjunction ::
+// Alternative
+// Alternative | Disjunction
+RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
+ RegExpTree* first = ParseAlternative(CHECK_OK);
+ if (current() == '|') {
+ ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2);
+ nodes->Add(first);
+ while (current() == '|') {
+ Advance();
+ RegExpTree* next = ParseAlternative(CHECK_OK);
+ nodes->Add(next);
+ }
+ return new RegExpDisjunction(nodes);
+ } else {
+ return first;
+ }
+}
+
+static bool IsAlternativeTerminator(uc32 c) {
+ return c == '|' || c == ')' || c == '\0';
Erik Corry 2008/10/27 14:58:44 This probably won't work in the long run, can't JS
Christian Plesner Hansen 2008/10/27 18:57:02 Hmm. I've replaced it with a special kEndMarker w
+}
+
+// Alternative ::
+// [empty]
+// Alternative Term
+RegExpTree* RegExpParser::ParseAlternative(bool* ok) {
+ if (!IsAlternativeTerminator(current())) {
+ RegExpTree* first = ParseTerm(CHECK_OK);
+ if (!IsAlternativeTerminator(current())) {
+ ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2);
+ nodes->Add(first);
+ while (!IsAlternativeTerminator(current())) {
+ RegExpTree* next = ParseTerm(CHECK_OK);
+ nodes->Add(next);
+ }
+ return new RegExpAlternative(nodes);
+ } else {
+ return first;
+ }
+ } else {
+ return RegExpEmpty::GetInstance();
+ }
+}
+
+
+class SourceCharacter {
+ public:
+ static bool Is(uc32 c) {
+ switch (c) {
+ // case ']': case '}':
+ // In spidermonkey and jsc these are treated as source characters
+ // so we do too.
+ case '^': case '$': case '\\': case '.': case '*': case '+':
+ case '?': case '(': case ')': case '[': case '{': case '|':
+ return false;
+ default:
+ return true;
+ }
+ }
+};
+
+
+static unibrow::Predicate<SourceCharacter> source_character;
+
+
+static inline bool IsSourceCharacter(uc32 c) {
+ return source_character.get(c);
+}
+
+
+static bool IsSpecialEscape(uc32 c) {
+ switch (c) {
+ case 'b': case 'B': case 'd': case 'D': case 's': case 'S':
+ case 'w': case 'W':
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+// Term ::
+// Assertion
+// Atom
+// Atom Quantifier
+RegExpTree* RegExpParser::ParseTerm(bool* ok) {
+ RegExpTree* atom = NULL;
+ switch (current()) {
+ // Assertion ::
+ // ^
+ // $
+ // \ b
+ // \ B
+ case '^':
+ Advance();
+ return new RegExpAssertion(RegExpAssertion::START);
+ case '$':
+ Advance();
+ return new RegExpAssertion(RegExpAssertion::END);
+ case '.':
+ Advance();
+ atom = new RegExpCharacterClass(CharacterRange::Special('.'));
+ break;
+ case '(':
+ atom = ParseGroup(CHECK_OK);
+ break;
+ case '[':
+ atom = ParseCharacterClass(CHECK_OK);
+ break;
+ // Atom ::
+ // \ AtomEscape
+ case '\\':
+ if (has_next()) {
+ switch (next()) {
+ case 'b':
+ Advance(2);
+ return new RegExpAssertion(RegExpAssertion::BOUNDARY);
+ case 'B':
+ Advance(2);
+ return new RegExpAssertion(RegExpAssertion::NON_BOUNDARY);
+ // AtomEscape ::
+ // CharacterClassEscape
+ //
+ // CharacterClassEscape :: one of
+ // d D s S w W
+ case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
+ uc32 c = next();
+ Advance(2);
+ atom = new RegExpCharacterClass(CharacterRange::Special(c));
+ goto has_read_atom;
+ }
+ // TODO: backreferences
+ default:
+ break;
+ }
+ }
+ // All other escapes fall through to the default case since
+ // they correspond to single characters that can be
+ // represented within atoms.
+ default: {
+ atom = ParseAtom(CHECK_OK);
+ break;
+ }
+ }
+ has_read_atom:
+ int min;
+ int max;
+ switch (current()) {
+ // QuantifierPrefix ::
+ // *
+ // +
+ // ?
+ case '*':
+ min = 0;
+ max = RegExpQuantifier::kInfinity;
+ Advance();
+ break;
+ case '+':
+ min = 1;
+ max = RegExpQuantifier::kInfinity;
+ Advance();
+ break;
+ case '?':
+ min = 0;
+ max = 1;
+ Advance();
+ break;
+ case '{':
+ ParseIntervalQuantifier(&min, &max, CHECK_OK);
+ break;
+ default:
+ return atom;
+ }
+ bool is_greedy = true;
+ if (current() == '?') {
+ is_greedy = false;
+ Advance();
+ }
+ return new RegExpQuantifier(min, max, is_greedy, atom);
+}
+
+
+// QuantifierPrefix ::
+// { DecimalDigits }
+// { DecimalDigits , }
+// { DecimalDigits , DecimalDigits }
+void* RegExpParser::ParseIntervalQuantifier(int* min_out,
+ int* max_out,
+ bool* ok) {
+ ASSERT_EQ(current(), '{');
+ static const char* kInvalidQuantifier = "Invalid quantifier";
+ Advance();
+ int min = 0;
+ if (!IsDecimalDigit(current())) {
+ // JSC allows {} and {,} as quantifiers (and { and } and all
+ // sorts of crazy stuff) but my puny human brain has been unable
+ // to figure out what they mean exactly, if anything. For now
+ // we follow the spec and report a syntax error.
+ ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ }
+ while (IsDecimalDigit(current())) {
+ min = 10 * min + (current() - '0');
+ Advance();
+ }
+ int max = 0;
+ if (current() == '}') {
+ max = min;
+ Advance();
+ } else if (current() == ',') {
+ Advance();
+ if (current() == '}') {
+ Advance();
+ max = RegExpQuantifier::kInfinity;
Lasse Reichstein 2008/10/27 13:12:58 All the other cases has the Advance() call after t
+ } else {
+ while (IsDecimalDigit(current())) {
+ max = 10 * max + (current() - '0');
+ Advance();
+ }
+ if (current() != '}') {
+ ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ }
+ Advance();
+ }
+ } else {
+ ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ }
+ *min_out = min;
+ *max_out = max;
+ return NULL;
+}
+
+
+RegExpTree* RegExpParser::ParseAtom(bool* ok) {
+ ASSERT(current() == '\\' || IsSourceCharacter(current()));
+ ZoneList<uc16>* buf = new ZoneList<uc16>(4);
+ while (true) {
+ if (IsSourceCharacter(current())) {
+ buf->Add(current());
+ Advance();
+ } else if (current() == '\\') {
+ if (!has_next()) {
+ ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);
+ } else if (IsSpecialEscape(next())) {
+ // If the next thing we see is a special escape we stop
+ // reading this atom.
+ break;
+ } else {
+ uc32 escape = ParseCharacterEscape(CHECK_OK);
+ buf->Add(escape);
+ }
+ } else {
+ break;
+ }
+ }
+ return new RegExpAtom(buf->ToConstVector());
+}
+
+
+uc32 RegExpParser::ParseControlEscape(bool* ok) {
+ ASSERT(current() == 'c');
+ Advance();
+ if (!has_more()) {
+ ReportError(CStrVector("\\c at end of pattern"), ok);
+ return '\0';
+ } else {
+ uc32 letter = current();
+ if (!('a' <= letter && letter <= 'z') &&
+ !('A' <= letter && letter <= 'Z')) {
+ ReportError(CStrVector("Illegal control letter"), ok);
+ return '\0';
+ }
+ Advance();
+ return letter & ((1 << 5) - 1);
+ }
+}
+
+
+uc32 RegExpParser::ParseOctalLiteral(bool* ok) {
+ ASSERT('0' <= current() && current() <= '7');
+ // Here we're really supposed to break out after the first digit
+ // if it is '0' but the other implementations don't do that so
+ // neither do we. Is this deviation from the spec error prone?
+ // Yes, it's probably as error prone as it's possible to get. Isn't
+ // JavaScript wonderful?
+ uc32 value = 0;
+ while ('0' <= current() && current() <= '7') {
+ int next = (8 * value) + (current() - '0');
+ if (next >= 256) {
+ break;
+ } else {
+ value = next;
+ Advance();
+ }
+ }
+ return value;
+}
+
+
+uc32 RegExpParser::ParseHexEscape(int length) {
+ uc32 value = 0;
+ for (int i = 0; i < length; i++) {
+ int d = HexValue(current());
+ if (d < 0)
+ return value;
+ value = value * 16 + d;
+ Advance();
+ }
+
+ return value;
+}
+
+
+uc32 RegExpParser::ParseCharacterEscape(bool* ok) {
+ ASSERT(current() == '\\');
+ ASSERT(has_next() && !IsSpecialEscape(next()));
+ Advance();
+ ASSERT(current() != 'b' && current() != 'B');
+ switch (current()) {
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance();
+ return '\f';
+ case 'n':
+ Advance();
+ return '\n';
+ case 'r':
+ Advance();
+ return '\r';
+ case 't':
+ Advance();
+ return '\t';
+ case 'v':
+ Advance();
+ return '\v';
+ case 'c':
+ return ParseControlEscape(ok);
+ case '0': case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7':
+ // We're really supposed to read this as a decimal integer
+ // literal which is base 10 but for whatever reason the other
+ // implementations read base 8. It's hard to believe that the
+ // spec was written by some ofthe same people that wrote the
+ // other implementations...
Lasse Reichstein 2008/10/27 13:12:58 So bitter! ;P Spec says to read as decimal literal
Christian Plesner Hansen 2008/10/27 18:57:02 I think we're free to do whatever we want with \8
+ return ParseOctalLiteral(ok);
+ case 'x':
+ Advance();
+ return ParseHexEscape(2);
+ case 'A': case 'Z': {
+ uc32 result = current();
+ Advance();
+ return result;
+ }
+ default: {
+ ASSERT(!Scanner::kIsIdentifierPart.get(current()));
+ uc32 result = current();
+ Advance();
+ return result;
+ }
+ }
+ return 0;
+}
+
+
+RegExpTree* RegExpParser::ParseGroup(bool* ok) {
+ ASSERT_EQ(current(), '(');
+ char type = '(';
+ Advance();
+ if (current() == '?') {
+ switch (next()) {
+ case ':': case '=': case '!':
+ type = next();
+ Advance(2);
+ break;
+ default:
+ ReportError(CStrVector("Invalid group"), CHECK_OK);
+ break;
+ }
+ }
+ RegExpTree* body = ParseDisjunction(CHECK_OK);
+ if (current() != ')') {
+ ReportError(CStrVector("Unterminated group"), CHECK_OK);
+ }
+ Advance();
+ if (type == '(') {
+ captures_seen_++;
+ return new RegExpCapture(body);
+ } else if (type == ':') {
+ return body;
+ } else {
+ ASSERT(type == '=' || type == '!');
+ bool is_positive = (type == '=');
+ return new RegExpLookahead(body, is_positive);
+ }
+}
+
+
+CharacterRange RegExpParser::ParseClassAtom(bool* ok) {
+ uc32 first = current();
+ if (first == '\\') {
+ switch (next()) {
+ case 'b':
+ Advance(2);
+ return CharacterRange::Singleton('\b');
+ case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
+ uc32 c = next();
+ Advance(2);
+ return CharacterRange::Special(c);
+ }
+ default:
+ uc32 c = ParseCharacterEscape(CHECK_OK);
Lasse Reichstein 2008/10/27 13:12:58 If we add parsing of back-references, ParseCharact
Christian Plesner Hansen 2008/10/27 18:57:02 Right, I expect what we'll do is check for backref
+ return CharacterRange::Singleton(c);
+ }
+ } else {
+ Advance();
+ return CharacterRange::Singleton(first);
+ }
+}
+
+
+RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {
+ static const char* kUnterminated = "Unterminated character class";
+ static const char* kIllegal = "Illegal character class";
+
+ ASSERT_EQ(current(), '[');
+ Advance();
+ bool is_negated = false;
+ if (current() == '^') {
+ is_negated = true;
+ Advance();
+ }
+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
+ while (has_more() && current() != ']') {
+ if (current() == '-') {
+ Advance();
+ ranges->Add(CharacterRange::Singleton('-'));
+ } else {
+ CharacterRange first = ParseClassAtom(CHECK_OK);
+ if (!first.is_special() && current() == '-') {
+ Advance();
+ CharacterRange next = ParseClassAtom(CHECK_OK);
+ if (next.is_special()) {
+ return ReportError(CStrVector(kIllegal), CHECK_OK);
+ }
+ ranges->Add(CharacterRange::Range(first.from(), next.to()));
+ } else {
+ ranges->Add(first);
+ }
+ }
+ }
+ if (!has_more()) {
+ return ReportError(CStrVector(kUnterminated), CHECK_OK);
+ }
+ Advance();
+ if (ranges->length() == 0) {
+ return RegExpEmpty::GetInstance();
+ } else {
+ return new RegExpCharacterClass(ranges, is_negated);
+ }
+}
+
+
+// ----------------------------------------------------------------------------
// The Parser interface.
// MakeAST() is just a wrapper for the corresponding Parser calls
@@ -3209,6 +3759,23 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
}
+RegExpTree* ParseRegExp(unibrow::CharacterStream* stream,
+ Handle<String>* error) {
+ ASSERT(error->is_null());
+ RegExpParser parser(stream, error);
+ bool ok = true;
+ RegExpTree* result = parser.ParsePattern(&ok);
+ if (!ok) {
+ ASSERT(result == NULL);
+ ASSERT(!error->is_null());
+ } else {
+ ASSERT(result != NULL);
+ ASSERT(error->is_null());
+ }
+ return result;
+}
+
+
FunctionLiteral* MakeAST(bool compile_in_global_context,
Handle<Script> script,
v8::Extension* extension,
« src/jsregexp.cc ('K') | « src/parser.h ('k') | src/string-stream.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698