Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(314)

Unified Diff: src/parser.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/parser.cc
===================================================================
--- src/parser.cc (revision 830)
+++ src/parser.cc (working copy)
@@ -34,6 +34,7 @@
#include "runtime.h"
#include "parser.h"
#include "scopes.h"
+#include "string-stream.h"
namespace v8 { namespace internal {
@@ -227,6 +228,334 @@
};
+template <typename T, int initial_size>
+class BufferedZoneList {
+ public:
+
+ BufferedZoneList() :
Mads Ager (chromium) 2008/11/25 21:09:41 All on one line.
+ list_(NULL), last_(NULL) {}
+
+ // Adds element at end of list. This element is buffered and can
+ // be read using last() or removed using RemoveLast until a new Add or until
+ // RemoveLast or GetList has been called.
+ void Add(T* value) {
+ if (last_ != NULL) {
+ if (list_ == NULL) {
+ list_ = new ZoneList<T*>(initial_size);
+ }
+ list_->Add(last_);
+ }
+ last_ = value;
+ }
+
+ T* last() {
+ ASSERT(last_ != NULL);
+ return last_;
+ }
+
+ T* RemoveLast() {
+ ASSERT(last_ != NULL);
+ T* result = last_;
+ if (list_ != NULL && list_->length() > 0)
+ last_ = list_->RemoveLast();
+ else
+ last_ = NULL;
+ return result;
+ }
+
+ T* Get(int i) {
+ ASSERT(0 <= i && i < length());
+ if (list_ == NULL) {
+ ASSERT_EQ(0, i);
+ return last_;
+ } else {
+ if (i == list_->length()) {
+ ASSERT(last_ != NULL);
+ return last_;
+ } else {
+ return list_->at(i);
+ }
+ }
+ }
+
+ void Clear() {
+ list_ = NULL;
+ last_ = NULL;
+ }
+
+ int length() {
+ int length = (list_ == NULL) ? 0 : list_->length();
+ return length + ((last_ == NULL) ? 0 : 1);
+ }
+
+ ZoneList<T*>* GetList() {
+ if (list_ == NULL) {
+ list_ = new ZoneList<T*>(initial_size);
+ }
+ if (last_ != NULL) {
+ list_->Add(last_);
+ last_ = NULL;
+ }
+ return list_;
+ }
+
+ private:
+ ZoneList<T*>* list_;
+ T* last_;
+};
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+class RegExpBuilder {
+ public:
+ RegExpBuilder();
+ void AddCharacter(uc16 character);
+ // "Adds" an empty expression. Does nothing except consume a
+ // following quantifier
+ void AddEmpty();
+ void AddAtom(RegExpTree* tree);
+ void AddAssertion(RegExpTree* tree);
+ void NewAlternative(); // '|'
+ void AddQuantifierToAtom(int min, int max, bool is_greedy);
+ RegExpTree* ToRegExp();
+ private:
+ void FlushCharacters();
+ void FlushText();
+ void FlushTerms();
+ bool pending_empty_;
+ ZoneList<uc16>* characters_;
+ BufferedZoneList<RegExpTree, 2> terms_;
+ BufferedZoneList<RegExpTree, 2> text_;
+ BufferedZoneList<RegExpTree, 2> alternatives_;
+#ifdef DEBUG
+ enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
+#define LAST(x) last_added_ = x;
+#else
+#define LAST(x)
+#endif
+};
+
+
+RegExpBuilder::RegExpBuilder()
+ : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. More occurrences below.
+#ifdef DEBUG
+ , last_added_(ADD_NONE)
+#endif
+ {}
+
+
+void RegExpBuilder::FlushCharacters() {
+ pending_empty_ = false;
+ if (characters_ != NULL) {
+ RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
+ characters_ = NULL;
+ text_.Add(atom);
+ LAST(ADD_ATOM);
+ }
+}
+
+
+void RegExpBuilder::FlushText() {
+ FlushCharacters();
+ int num_text = text_.length();
+ if (num_text == 0) {
+ return;
+ } else if (num_text == 1) {
+ terms_.Add(text_.last());
+ } else {
+ RegExpText* text = new RegExpText();
+ for (int i = 0; i < num_text; i++)
+ text_.Get(i)->AppendToText(text);
+ terms_.Add(text);
+ }
+ text_.Clear();
+}
+
+
+void RegExpBuilder::AddCharacter(uc16 c) {
+ pending_empty_ = false;
+ if (characters_ == NULL) {
+ characters_ = new ZoneList<uc16>(4);
+ }
+ characters_->Add(c);
+ LAST(ADD_CHAR);
+}
+
+
+void RegExpBuilder::AddEmpty() {
+ pending_empty_ = true;
+}
+
+
+void RegExpBuilder::AddAtom(RegExpTree* term) {
+ if (term->IsEmpty()) {
+ AddEmpty();
+ return;
+ }
+ if (term->IsTextElement()) {
+ FlushCharacters();
+ text_.Add(term);
+ } else {
+ FlushText();
+ terms_.Add(term);
+ }
+ LAST(ADD_ATOM);
+}
+
+
+void RegExpBuilder::AddAssertion(RegExpTree* assert) {
+ FlushText();
+ terms_.Add(assert);
+ LAST(ADD_ASSERT);
+}
+
+
+void RegExpBuilder::NewAlternative() {
+ FlushTerms();
+}
+
+
+void RegExpBuilder::FlushTerms() {
+ FlushText();
+ int num_terms = terms_.length();
+ RegExpTree* alternative;
+ if (num_terms == 0) {
+ alternative = RegExpEmpty::GetInstance();
+ } else if (num_terms == 1) {
+ alternative = terms_.last();
+ } else {
+ alternative = new RegExpAlternative(terms_.GetList());
+ }
+ alternatives_.Add(alternative);
+ terms_.Clear();
+ LAST(ADD_NONE);
+}
+
+
+RegExpTree* RegExpBuilder::ToRegExp() {
+ FlushTerms();
+ int num_alternatives = alternatives_.length();
+ if (num_alternatives == 0) {
+ return RegExpEmpty::GetInstance();
+ }
+ if (num_alternatives == 1) {
+ return alternatives_.last();
+ }
+ return new RegExpDisjunction(alternatives_.GetList());
+}
+
+
+void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
+ if (pending_empty_) {
+ pending_empty_ = false;
+ return;
+ }
+ RegExpTree* atom;
+ if (characters_ != NULL) {
+ ASSERT(last_added_ == ADD_CHAR);
+ // Last atom was character.
+ Vector<const uc16> char_vector = characters_->ToConstVector();
+ int num_chars = char_vector.length();
+ if (num_chars > 1) {
+ Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
+ text_.Add(new RegExpAtom(prefix));
+ char_vector = char_vector.SubVector(num_chars - 1, num_chars);
+ }
+ characters_ = NULL;
+ atom = new RegExpAtom(char_vector);
+ FlushText();
+ } else if (text_.length() > 0) {
+ ASSERT(last_added_ == ADD_ATOM);
+ atom = text_.RemoveLast();
+ FlushText();
+ } else if (terms_.length() > 0) {
+ ASSERT(last_added_ == ADD_ATOM);
+ atom = terms_.RemoveLast();
+ if (atom->IsLookahead() || atom->IsAssertion()) {
+ // Guaranteed not to match a non-empty string.
+ // Assertion as an atom can happen as, e.g., (?:\b)
+ LAST(ADD_TERM);
+ if (min == 0) {
+ return;
+ }
+ terms_.Add(atom);
+ return;
+ }
+ } else {
+ // Only call immediately after adding an atom or character!
+ UNREACHABLE();
+ return;
+ }
+ terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));
+ LAST(ADD_TERM);
+}
+
+
+class RegExpParser {
+ public:
+ RegExpParser(FlatStringReader* in,
+ Handle<String>* error,
+ bool multiline_mode);
+ RegExpTree* ParsePattern(bool* ok);
+ RegExpTree* ParseDisjunction(bool* ok);
+ RegExpTree* ParseGroup(bool* ok);
+ RegExpTree* ParseCharacterClass(bool* ok);
+
+ // Parses a {...,...} quantifier and stores the range in the given
+ // out parameters.
+ bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+ // Parses and returns a single escaped character. The character
+ // must not be 'b' or 'B' since they are usually handle specially.
+ uc32 ParseClassCharacterEscape(bool* ok);
+
+ // Checks whether the following is a length-digit hexadecimal number,
+ // and sets the value if it is.
+ bool ParseHexEscape(int length, uc32* value);
+
+ uc32 ParseControlLetterEscape(bool* ok);
+ uc32 ParseOctalLiteral();
+
+ // Tries to parse the input as a back reference. If successful it
+ // stores the result in the output parameter and returns true. If
+ // it fails it will push back the characters read so the same characters
+ // can be reparsed.
+ bool ParseBackReferenceIndex(int* index_out);
+
+ CharacterRange ParseClassAtom(bool* is_char_class,
+ ZoneList<CharacterRange>* ranges,
+ bool* ok);
+ RegExpTree* ReportError(Vector<const char> message, bool* ok);
+ void Advance();
+ void Advance(int dist);
+ void Reset(int pos);
+
+ bool HasCharacterEscapes();
+
+ int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
+ int position() { return next_pos_ - 1; }
+
+ static const uc32 kEndMarker = (1 << 21);
+ private:
+ uc32 current() { return current_; }
+ bool has_more() { return has_more_; }
+ bool has_next() { return next_pos_ < in()->length(); }
+ uc32 Next();
+ FlatStringReader* in() { return in_; }
+ void ScanForCaptures();
+ bool CaptureAvailable(int index);
+ uc32 current_;
+ bool has_more_;
+ bool multiline_mode_;
+ int next_pos_;
+ FlatStringReader* in_;
+ Handle<String>* error_;
+ bool has_character_escapes_;
+ bool is_scanned_for_captures_;
+ ZoneList<RegExpCapture*>* captures_;
+ int capture_count_;
+};
+
+
// A temporary scope stores information during parsing, just like
// a plain scope. However, temporary scopes are not kept around
// after parsing or referenced by syntax trees so they can be stack-
@@ -3164,6 +3493,756 @@
// ----------------------------------------------------------------------------
+// Regular expressions
+
+
+RegExpParser::RegExpParser(FlatStringReader* in,
+ Handle<String>* error,
+ bool multiline_mode)
+ : current_(kEndMarker),
+ has_more_(true),
+ multiline_mode_(multiline_mode),
+ next_pos_(0),
+ in_(in),
+ error_(error),
+ has_character_escapes_(false),
+ is_scanned_for_captures_(false),
+ captures_(NULL),
+ capture_count_(0) {
+ Advance(1);
+}
+
+
+uc32 RegExpParser::Next() {
+ if (has_next()) {
+ return in()->Get(next_pos_);
+ } else {
+ return kEndMarker;
+ }
+}
+
+
+void RegExpParser::Advance() {
+ if (next_pos_ < in()->length()) {
+ current_ = in()->Get(next_pos_);
+ next_pos_++;
+ } else {
+ current_ = kEndMarker;
+ has_more_ = false;
+ }
+}
+
+
+void RegExpParser::Reset(int pos) {
+ next_pos_ = pos;
+ Advance();
+}
+
+
+void RegExpParser::Advance(int dist) {
+ for (int i = 0; i < dist; i++)
+ Advance();
+}
+
+
+// Reports whether the parsed string atoms contain any characters that were
+// escaped in the original pattern. If not, all atoms are proper substrings
+// of the original pattern.
+bool RegExpParser::HasCharacterEscapes() {
+ return has_character_escapes_;
+}
+
+RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
+ *ok = false;
+ *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
+ return NULL;
+}
+
+
+// Pattern ::
+// Disjunction
+RegExpTree* RegExpParser::ParsePattern(bool* ok) {
+ RegExpTree* result = ParseDisjunction(CHECK_OK);
+ if (has_more()) {
+ ReportError(CStrVector("Unmatched ')'"), CHECK_OK);
+ }
+ return result;
+}
+
+
+bool RegExpParser::CaptureAvailable(int index) {
+ if (captures_ == NULL) return false;
+ if (index >= captures_->length()) return false;
+ RegExpCapture* capture = captures_->at(index);
+ return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
+}
+
+
+// Disjunction ::
+// Alternative
+// Alternative | Disjunction
+// Alternative ::
+// [empty]
+// Term Alternative
+// Term ::
+// Assertion
+// Atom
+// Atom Quantifier
+RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
+ RegExpBuilder builder;
+ int capture_start_index = captures_started();
+ while (true) {
+ switch (current()) {
+ case kEndMarker:
+ case ')':
+ return builder.ToRegExp();
+ case '|': {
+ Advance();
+ builder.NewAlternative();
+ int capture_new_alt_start_index = captures_started();
+ for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ if (capture->available() == CAPTURE_AVAILABLE) {
+ capture->set_available(CAPTURE_UNREACHABLE);
+ }
+ }
+ capture_start_index = capture_new_alt_start_index;
+ continue;
+ }
+ case '*':
+ case '+':
+ case '?':
+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
+ case '^': {
+ Advance();
+ RegExpAssertion::Type type =
+ multiline_mode_ ? RegExpAssertion::START_OF_LINE :
+ RegExpAssertion::START_OF_INPUT;
+ builder.AddAssertion(new RegExpAssertion(type));
+ continue;
+ }
+ case '$': {
+ Advance();
+ RegExpAssertion::Type type =
+ multiline_mode_ ? RegExpAssertion::END_OF_LINE :
+ RegExpAssertion::END_OF_INPUT;
+ builder.AddAssertion(new RegExpAssertion(type));
+ continue;
+ }
+ case '.': {
+ Advance();
+ // everything except \x0a, \x0d, \u2028 and \u2029
+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
+ CharacterRange::AddClassEscape('.', ranges);
+ RegExpTree* atom = new RegExpCharacterClass(ranges, false);
+ builder.AddAtom(atom);
+ break;
+ }
+ case '(': {
+ RegExpTree* atom = ParseGroup(CHECK_OK);
+ builder.AddAtom(atom);
+ break;
+ }
+ case '[': {
+ RegExpTree* atom = ParseCharacterClass(CHECK_OK);
+ builder.AddAtom(atom);
+ break;
+ }
+ // Atom ::
+ // \ AtomEscape
+ case '\\':
+ switch (Next()) {
+ case kEndMarker:
+ ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);
+ case 'b':
+ Advance(2);
+ builder.AddAssertion(
+ new RegExpAssertion(RegExpAssertion::BOUNDARY));
+ continue;
+ case 'B':
+ Advance(2);
+ builder.AddAssertion(
+ new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
+ continue;
+ // AtomEscape ::
+ // CharacterClassEscape
+ //
+ // CharacterClassEscape :: one of
+ // d D s S w W
+ case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
+ uc32 c = Next();
+ Advance(2);
+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
+ CharacterRange::AddClassEscape(c, ranges);
+ RegExpTree* atom = new RegExpCharacterClass(ranges, false);
+ builder.AddAtom(atom);
+ goto has_read_atom; // Avoid setting has_character_escapes_.
+ }
+ case '1': case '2': case '3': case '4': case '5': case '6':
+ case '7': case '8': case '9': {
+ int index = 0;
+ if (ParseBackReferenceIndex(&index)) {
+ if (!CaptureAvailable(index - 1)) {
+ // Prepare to ignore a following quantifier
+ builder.AddEmpty();
+ goto has_read_atom;
+ }
+ RegExpCapture* capture = captures_->at(index - 1);
+ RegExpTree* atom = new RegExpBackReference(capture);
+ builder.AddAtom(atom);
+ goto has_read_atom; // Avoid setting has_character_escapes_.
+ }
+ uc32 first_digit = Next();
+ if (first_digit == '8' || first_digit == '9') {
+ // Treat as identity escape
+ builder.AddCharacter(first_digit);
+ Advance(2);
+ break;
+ }
+ }
+ // FALLTHROUGH
+ case '0': {
+ Advance();
+ uc32 octal = ParseOctalLiteral();
+ builder.AddCharacter(octal);
+ break;
+ }
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance(2);
+ builder.AddCharacter('\f');
+ break;
+ case 'n':
+ Advance(2);
+ builder.AddCharacter('\n');
+ break;
+ case 'r':
+ Advance(2);
+ builder.AddCharacter('\r');
+ break;
+ case 't':
+ Advance(2);
+ builder.AddCharacter('\t');
+ break;
+ case 'v':
+ Advance(2);
+ builder.AddCharacter('\v');
+ break;
+ case 'c': {
+ Advance(2);
+ uc32 control = ParseControlLetterEscape(ok);
+ builder.AddCharacter(control);
+ break;
+ }
+ case 'x': {
+ Advance(2);
+ uc32 value;
+ if (ParseHexEscape(2, &value)) {
+ builder.AddCharacter(value);
+ } else {
+ builder.AddCharacter('x');
+ }
+ break;
+ }
+ case 'u': {
+ Advance(2);
+ uc32 value;
+ if (ParseHexEscape(4, &value)) {
+ builder.AddCharacter(value);
+ } else {
+ builder.AddCharacter('u');
+ }
+ break;
+ }
+ default:
+ // Identity escape.
+ builder.AddCharacter(Next());
+ Advance(2);
+ break;
+ }
+ has_character_escapes_ = true;
+ break;
+ case '{': {
+ int dummy;
+ if (ParseIntervalQuantifier(&dummy, &dummy)) {
+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
+ }
+ // fallthrough
+ }
+ default:
+ builder.AddCharacter(current());
+ Advance();
+ break;
+ } // end switch(current())
+
+ has_read_atom:
+ int min;
+ int max;
+ switch (current()) {
+ // QuantifierPrefix ::
+ // *
+ // +
+ // ?
+ // {
+ case '*':
+ min = 0;
+ max = RegExpQuantifier::kInfinity;
+ Advance();
+ break;
+ case '+':
+ min = 1;
+ max = RegExpQuantifier::kInfinity;
+ Advance();
+ break;
+ case '?':
+ min = 0;
+ max = 1;
+ Advance();
+ break;
+ case '{':
+ if (ParseIntervalQuantifier(&min, &max)) {
+ break;
+ } else {
+ continue;
+ }
+ default:
+ continue;
+ }
+ bool is_greedy = true;
+ if (current() == '?') {
+ is_greedy = false;
+ Advance();
+ }
+ builder.AddQuantifierToAtom(min, max, is_greedy);
+ }
+}
+
+class SourceCharacter {
+ public:
+ static bool Is(uc32 c) {
+ switch (c) {
+ // case ']': case '}':
+ // In spidermonkey and jsc these are treated as source characters
+ // so we do too.
+ case '^': case '$': case '\\': case '.': case '*': case '+':
+ case '?': case '(': case ')': case '[': case '{': case '|':
+ case RegExpParser::kEndMarker:
+ return false;
+ default:
+ return true;
+ }
+ }
+};
+
+
+static unibrow::Predicate<SourceCharacter> source_character;
+
+
+static inline bool IsSourceCharacter(uc32 c) {
+ return source_character.get(c);
+}
+
+#ifdef DEBUG
+// Currently only used in an ASSERT.
+static bool IsSpecialClassEscape(uc32 c) {
+ switch (c) {
+ case 'd': case 'D':
+ case 's': case 'S':
+ case 'w': case 'W':
+ return true;
+ default:
+ return false;
+ }
+}
+#endif
+
+
+// In order to know whether an escape is a backreference or not we have to scan
+// the entire regexp and find the number of capturing parentheses. However we
+// don't want to scan the regexp twice unless it is necessary. This mini-parser
+// is called when needed. It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and backslash-escaped
+// characters.
+void RegExpParser::ScanForCaptures() {
+ int n;
+ while ((n = current()) != kEndMarker) {
+ Advance();
+ switch (n) {
+ case '\\':
+ Advance();
+ break;
+ case '[': {
+ int c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();
+ } else {
+ if (c == ']') break;
+ }
+ }
+ break;
+ }
+ case '(':
+ if (current() != '?') capture_count_++;
+ break;
+ }
+ }
+ is_scanned_for_captures_ = true;
+}
+
+
+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
+ ASSERT_EQ('\\', current());
+ ASSERT('1' <= Next() && Next() <= '9');
+ // Try to parse a decimal literal that is no greater than the number
+ // of previously encountered left capturing parentheses.
+ // This is a not according the the ECMAScript specification. According to
+ // that, one must accept values up to the total number of left capturing
+ // parentheses in the entire input, even if they are meaningless.
+ if (!is_scanned_for_captures_) {
+ int saved_position = position();
+ ScanForCaptures();
+ Reset(saved_position);
+ }
+ if (capture_count_ == 0) return false;
+ int start = position();
+ int value = Next() - '0';
+ if (value > capture_count_) return false;
+ Advance(2);
+ while (true) {
+ uc32 c = current();
+ if (IsDecimalDigit(c)) {
+ value = 10 * value + (c - '0');
+ if (value > capture_count_) {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ } else {
+ break;
+ }
+ }
+ *index_out = value;
+ return true;
+}
+
+
+// QuantifierPrefix ::
+// { DecimalDigits }
+// { DecimalDigits , }
+// { DecimalDigits , DecimalDigits }
+bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
+ ASSERT_EQ(current(), '{');
+ int start = position();
+ Advance();
+ int min = 0;
+ if (!IsDecimalDigit(current())) {
+ Reset(start);
+ return false;
+ }
+ while (IsDecimalDigit(current())) {
+ min = 10 * min + (current() - '0');
+ Advance();
+ }
+ int max = 0;
+ if (current() == '}') {
+ max = min;
+ Advance();
+ } else if (current() == ',') {
+ Advance();
+ if (current() == '}') {
+ max = RegExpQuantifier::kInfinity;
+ Advance();
+ } else {
+ while (IsDecimalDigit(current())) {
+ max = 10 * max + (current() - '0');
+ Advance();
+ }
+ if (current() != '}') {
+ Reset(start);
+ return false;
+ }
+ Advance();
+ }
+ } else {
+ Reset(start);
+ return false;
+ }
+ *min_out = min;
+ *max_out = max;
+ return true;
+}
+
+
+// Upper and lower case letters differ by one bit.
+STATIC_CHECK(('a' ^ 'A') == 0x20);
+
+uc32 RegExpParser::ParseControlLetterEscape(bool* ok) {
+ if (!has_more()) {
+ ReportError(CStrVector("\\c at end of pattern"), ok);
+ return '\0';
+ }
+ uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
+ if (letter < 'A' || 'Z' < letter) {
+ // Non-spec error-correction: "\c" followed by non-control letter is
+ // interpreted as an IdentityEscape of 'c'.
+ return 'c';
+ }
+ Advance();
+ return letter & 0x1f; // Remainder modulo 32, per specification.
+}
+
+
+uc32 RegExpParser::ParseOctalLiteral() {
+ ASSERT('0' <= current() && current() <= '7');
+ // For compatibility with some other browsers (not all), we parse
+ // up to three octal digits with a value below 256.
+ uc32 value = current() - '0';
+ Advance();
+ if ('0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ if (value < 32 && '0' <= current() && current() <= '7') {
+ value = value * 8 + current() - '0';
+ Advance();
+ }
+ }
+ return value;
+}
+
+
+bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
+ int start = position();
+ uc32 val = 0;
+ bool done = false;
+ for (int i = 0; !done; i++) {
+ uc32 c = current();
+ int d = HexValue(c);
+ if (d < 0) {
+ Reset(start);
+ return false;
+ }
+ val = val * 16 + d;
+ Advance();
+ if (i == length - 1) {
+ done = true;
+ }
+ }
+ *value = val;
+ return true;
+}
+
+
+uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) {
+ ASSERT(current() == '\\');
+ ASSERT(has_next() && !IsSpecialClassEscape(Next()));
+ Advance();
+ switch (current()) {
+ case 'b':
+ Advance();
+ return '\b';
+ // ControlEscape :: one of
+ // f n r t v
+ case 'f':
+ Advance();
+ return '\f';
+ case 'n':
+ Advance();
+ return '\n';
+ case 'r':
+ Advance();
+ return '\r';
+ case 't':
+ Advance();
+ return '\t';
+ case 'v':
+ Advance();
+ return '\v';
+ case 'c':
+ return ParseControlLetterEscape(ok);
+ case '0': case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7':
+ // For compatibility, we interpret a decimal escape that isn't
+ // a back reference (and therefore either \0 or not valid according
+ // to the specification) as a 1..3 digit octal character code.
+ return ParseOctalLiteral();
+ case 'x': {
+ Advance();
+ uc32 value;
+ if (ParseHexEscape(2, &value)) {
+ return value;
+ }
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ case 'u': {
+ Advance();
+ uc32 value;
+ if (ParseHexEscape(4, &value)) {
+ return value;
+ }
+ // If \u is not followed by a four-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'u';
+ }
+ default: {
+ // Extended identity escape. We accept any character that hasn't
+ // been matched by a more specific case, not just the subset required
+ // by the ECMAScript specification.
+ uc32 result = current();
+ Advance();
+ return result;
+ }
+ }
+ return 0;
+}
+
+
+RegExpTree* RegExpParser::ParseGroup(bool* ok) {
+ ASSERT_EQ(current(), '(');
+ char type = '(';
+ Advance();
+ if (current() == '?') {
+ switch (Next()) {
+ case ':': case '=': case '!':
+ type = Next();
+ Advance(2);
+ break;
+ default:
+ ReportError(CStrVector("Invalid group"), CHECK_OK);
+ break;
+ }
+ } else {
+ if (captures_ == NULL) {
+ captures_ = new ZoneList<RegExpCapture*>(2);
+ }
+ captures_->Add(NULL);
+ if (!is_scanned_for_captures_) capture_count_++;
+ }
+ int capture_index = captures_started();
+ RegExpTree* body = ParseDisjunction(CHECK_OK);
+ if (current() != ')') {
+ ReportError(CStrVector("Unterminated group"), CHECK_OK);
+ }
+ Advance();
+
+ int end_capture_index = captures_started();
+ if (type == '!') {
+ // Captures inside a negative lookahead are never available outside it.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
+ }
+ } else {
+ // Captures temporarily unavailable because they are in different
+ // alternatives are all available after the disjunction.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ if (capture->available() == CAPTURE_UNREACHABLE) {
+ capture->set_available(CAPTURE_AVAILABLE);
+ }
+ }
+ }
+
+ if (type == '(') {
+ RegExpCapture* capture = new RegExpCapture(body, capture_index);
+ captures_->at(capture_index - 1) = capture;
+ return capture;
+ } else if (type == ':') {
+ return body;
+ } else {
+ ASSERT(type == '=' || type == '!');
+ bool is_positive = (type == '=');
+ return new RegExpLookahead(body, is_positive);
+ }
+}
+
+
+CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class,
+ ZoneList<CharacterRange>* ranges,
+ bool* ok) {
+ ASSERT_EQ(false, *is_char_class);
+ uc32 first = current();
+ if (first == '\\') {
+ switch (Next()) {
+ case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
+ *is_char_class = true;
+ uc32 c = Next();
+ CharacterRange::AddClassEscape(c, ranges);
+ Advance(2);
+ return NULL;
+ }
+ default:
+ uc32 c = ParseClassCharacterEscape(CHECK_OK);
+ return CharacterRange::Singleton(c);
+ }
+ } else {
+ Advance();
+ return CharacterRange::Singleton(first);
+ }
+}
+
+
+RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {
+ static const char* kUnterminated = "Unterminated character class";
+ static const char* kIllegal = "Illegal character class";
+ static const char* kRangeOutOfOrder = "Range out of order in character class";
+
+ ASSERT_EQ(current(), '[');
+ Advance();
+ bool is_negated = false;
+ if (current() == '^') {
+ is_negated = true;
+ Advance();
+ }
+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
+ while (has_more() && current() != ']') {
+ bool is_char_class = false;
+ CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK);
+ if (!is_char_class) {
+ if (current() == '-') {
+ Advance();
+ if (current() == kEndMarker) {
+ // If we reach the end we break out of the loop and let the
+ // following code report an error.
+ break;
+ } else if (current() == ']') {
+ ranges->Add(first);
+ ranges->Add(CharacterRange::Singleton('-'));
+ break;
+ }
+ CharacterRange next =
+ ParseClassAtom(&is_char_class, ranges, CHECK_OK);
+ if (is_char_class) {
+ return ReportError(CStrVector(kIllegal), CHECK_OK);
+ }
+ if (first.from() > next.to()) {
+ return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK);
+ }
+ ranges->Add(CharacterRange::Range(first.from(), next.to()));
+ } else {
+ ranges->Add(first);
+ }
+ }
+ }
+ if (!has_more()) {
+ return ReportError(CStrVector(kUnterminated), CHECK_OK);
+ }
+ Advance();
+ if (ranges->length() == 0) {
+ ranges->Add(CharacterRange::Range(0, 0xffff));
+ is_negated = !is_negated;
+ }
+ return new RegExpCharacterClass(ranges, is_negated);
+}
+
+
+// ----------------------------------------------------------------------------
// The Parser interface.
// MakeAST() is just a wrapper for the corresponding Parser calls
@@ -3211,6 +4290,27 @@
}
+bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) {
+ ASSERT(result != NULL);
+ // Get multiline flag somehow
Mads Ager (chromium) 2008/11/25 21:09:41 Is this a TODO or a comment on what the code is do
Erik Corry 2008/11/26 12:18:36 It's a TODO. Fixed.
+ RegExpParser parser(input, &result->error, false);
+ bool ok = true;
+ result->tree = parser.ParsePattern(&ok);
+ if (!ok) {
+ ASSERT(result->tree == NULL);
+ ASSERT(!result->error.is_null());
+ } else {
+ ASSERT(result->tree != NULL);
+ ASSERT(result->error.is_null());
+ }
+ if (ok) {
+ result->has_character_escapes = parser.HasCharacterEscapes();
+ result->capture_count = parser.captures_started();
+ }
+ return ok;
+}
+
+
FunctionLiteral* MakeAST(bool compile_in_global_context,
Handle<Script> script,
v8::Extension* extension,

Powered by Google App Engine
This is Rietveld 408576698