src/parser.cc - Issue 12427: Merge regexp2000 back into bleeding_edge

Unified Diff: src/parser.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/parser.cc

===================================================================

--- src/parser.cc (revision 830)

+++ src/parser.cc (working copy)

@@ -34,6 +34,7 @@

#include "runtime.h"

#include "parser.h"

#include "scopes.h"

+#include "string-stream.h"

namespace v8 { namespace internal {

@@ -227,6 +228,334 @@

};

+template <typename T, int initial_size>

+class BufferedZoneList {

+ public:

+ BufferedZoneList() :

Mads Ager (chromium) 2008/11/25 21:09:41 All on one line.

+ list_(NULL), last_(NULL) {}

+ // Adds element at end of list. This element is buffered and can

+ // be read using last() or removed using RemoveLast until a new Add or until

+ // RemoveLast or GetList has been called.

+ void Add(T* value) {

+ if (last_ != NULL) {

+ if (list_ == NULL) {

+ list_ = new ZoneList<T*>(initial_size);

+ }

+ list_->Add(last_);

+ }

+ last_ = value;

+ }

+ T* last() {

+ ASSERT(last_ != NULL);

+ return last_;

+ }

+ T* RemoveLast() {

+ ASSERT(last_ != NULL);

+ T* result = last_;

+ if (list_ != NULL && list_->length() > 0)

+ last_ = list_->RemoveLast();

+ else

+ last_ = NULL;

+ return result;

+ }

+ T* Get(int i) {

+ ASSERT(0 <= i && i < length());

+ if (list_ == NULL) {

+ ASSERT_EQ(0, i);

+ return last_;

+ } else {

+ if (i == list_->length()) {

+ ASSERT(last_ != NULL);

+ return last_;

+ } else {

+ return list_->at(i);

+ }

+ void Clear() {

+ list_ = NULL;

+ last_ = NULL;

+ }

+ int length() {

+ int length = (list_ == NULL) ? 0 : list_->length();

+ return length + ((last_ == NULL) ? 0 : 1);

+ }

+ ZoneList<T*>* GetList() {

+ if (list_ == NULL) {

+ list_ = new ZoneList<T*>(initial_size);

+ }

+ if (last_ != NULL) {

+ list_->Add(last_);

+ last_ = NULL;

+ }

+ return list_;

+ }

+ private:

+ ZoneList<T*>* list_;

+ T* last_;

+};

+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.

+class RegExpBuilder {

+ public:

+ RegExpBuilder();

+ void AddCharacter(uc16 character);

+ // "Adds" an empty expression. Does nothing except consume a

+ // following quantifier

+ void AddEmpty();

+ void AddAtom(RegExpTree* tree);

+ void AddAssertion(RegExpTree* tree);

+ void NewAlternative(); // '|'

+ void AddQuantifierToAtom(int min, int max, bool is_greedy);

+ RegExpTree* ToRegExp();

+ private:

+ void FlushCharacters();

+ void FlushText();

+ void FlushTerms();

+ bool pending_empty_;

+ ZoneList<uc16>* characters_;

+ BufferedZoneList<RegExpTree, 2> terms_;

+ BufferedZoneList<RegExpTree, 2> text_;

+ BufferedZoneList<RegExpTree, 2> alternatives_;

+#ifdef DEBUG

+ enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;

+#define LAST(x) last_added_ = x;

+#else

+#define LAST(x)

+#endif

+};

+RegExpBuilder::RegExpBuilder()

+ : pending_empty_(false), characters_(NULL), terms_(), alternatives_()

Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. More occurrences below.

+#ifdef DEBUG

+ , last_added_(ADD_NONE)

+#endif

+ {}

+void RegExpBuilder::FlushCharacters() {

+ pending_empty_ = false;

+ if (characters_ != NULL) {

+ RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());

+ characters_ = NULL;

+ text_.Add(atom);

+ LAST(ADD_ATOM);

+ }

+void RegExpBuilder::FlushText() {

+ FlushCharacters();

+ int num_text = text_.length();

+ if (num_text == 0) {

+ return;

+ } else if (num_text == 1) {

+ terms_.Add(text_.last());

+ } else {

+ RegExpText* text = new RegExpText();

+ for (int i = 0; i < num_text; i++)

+ text_.Get(i)->AppendToText(text);

+ terms_.Add(text);

+ }

+ text_.Clear();

+void RegExpBuilder::AddCharacter(uc16 c) {

+ pending_empty_ = false;

+ if (characters_ == NULL) {

+ characters_ = new ZoneList<uc16>(4);

+ }

+ characters_->Add(c);

+ LAST(ADD_CHAR);

+void RegExpBuilder::AddEmpty() {

+ pending_empty_ = true;

+void RegExpBuilder::AddAtom(RegExpTree* term) {

+ if (term->IsEmpty()) {

+ AddEmpty();

+ return;

+ }

+ if (term->IsTextElement()) {

+ FlushCharacters();

+ text_.Add(term);

+ } else {

+ FlushText();

+ terms_.Add(term);

+ }

+ LAST(ADD_ATOM);

+void RegExpBuilder::AddAssertion(RegExpTree* assert) {

+ FlushText();

+ terms_.Add(assert);

+ LAST(ADD_ASSERT);

+void RegExpBuilder::NewAlternative() {

+ FlushTerms();

+void RegExpBuilder::FlushTerms() {

+ FlushText();

+ int num_terms = terms_.length();

+ RegExpTree* alternative;

+ if (num_terms == 0) {

+ alternative = RegExpEmpty::GetInstance();

+ } else if (num_terms == 1) {

+ alternative = terms_.last();

+ } else {

+ alternative = new RegExpAlternative(terms_.GetList());

+ }

+ alternatives_.Add(alternative);

+ terms_.Clear();

+ LAST(ADD_NONE);

+RegExpTree* RegExpBuilder::ToRegExp() {

+ FlushTerms();

+ int num_alternatives = alternatives_.length();

+ if (num_alternatives == 0) {

+ return RegExpEmpty::GetInstance();

+ }

+ if (num_alternatives == 1) {

+ return alternatives_.last();

+ }

+ return new RegExpDisjunction(alternatives_.GetList());

+void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {

+ if (pending_empty_) {

+ pending_empty_ = false;

+ return;

+ }

+ RegExpTree* atom;

+ if (characters_ != NULL) {

+ ASSERT(last_added_ == ADD_CHAR);

+ // Last atom was character.

+ Vector<const uc16> char_vector = characters_->ToConstVector();

+ int num_chars = char_vector.length();

+ if (num_chars > 1) {

+ Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

+ text_.Add(new RegExpAtom(prefix));

+ char_vector = char_vector.SubVector(num_chars - 1, num_chars);

+ }

+ characters_ = NULL;

+ atom = new RegExpAtom(char_vector);

+ FlushText();

+ } else if (text_.length() > 0) {

+ ASSERT(last_added_ == ADD_ATOM);

+ atom = text_.RemoveLast();

+ FlushText();

+ } else if (terms_.length() > 0) {

+ ASSERT(last_added_ == ADD_ATOM);

+ atom = terms_.RemoveLast();

+ if (atom->IsLookahead() || atom->IsAssertion()) {

+ // Guaranteed not to match a non-empty string.

+ // Assertion as an atom can happen as, e.g., (?:\b)

+ LAST(ADD_TERM);

+ if (min == 0) {

+ return;

+ }

+ terms_.Add(atom);

+ return;

+ }

+ } else {

+ // Only call immediately after adding an atom or character!

+ UNREACHABLE();

+ return;

+ }

+ terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));

+ LAST(ADD_TERM);

+class RegExpParser {

+ public:

+ RegExpParser(FlatStringReader* in,

+ Handle<String>* error,

+ bool multiline_mode);

+ RegExpTree* ParsePattern(bool* ok);

+ RegExpTree* ParseDisjunction(bool* ok);

+ RegExpTree* ParseGroup(bool* ok);

+ RegExpTree* ParseCharacterClass(bool* ok);

+ // Parses a {...,...} quantifier and stores the range in the given

+ // out parameters.

+ bool ParseIntervalQuantifier(int* min_out, int* max_out);

+ // Parses and returns a single escaped character. The character

+ // must not be 'b' or 'B' since they are usually handle specially.

+ uc32 ParseClassCharacterEscape(bool* ok);

+ // Checks whether the following is a length-digit hexadecimal number,

+ // and sets the value if it is.

+ bool ParseHexEscape(int length, uc32* value);

+ uc32 ParseControlLetterEscape(bool* ok);

+ uc32 ParseOctalLiteral();

+ // Tries to parse the input as a back reference. If successful it

+ // stores the result in the output parameter and returns true. If

+ // it fails it will push back the characters read so the same characters

+ // can be reparsed.

+ bool ParseBackReferenceIndex(int* index_out);

+ CharacterRange ParseClassAtom(bool* is_char_class,

+ ZoneList<CharacterRange>* ranges,

+ bool* ok);

+ RegExpTree* ReportError(Vector<const char> message, bool* ok);

+ void Advance();

+ void Advance(int dist);

+ void Reset(int pos);

+ bool HasCharacterEscapes();

+ int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }

+ int position() { return next_pos_ - 1; }

+ static const uc32 kEndMarker = (1 << 21);

+ private:

+ uc32 current() { return current_; }

+ bool has_more() { return has_more_; }

+ bool has_next() { return next_pos_ < in()->length(); }

+ uc32 Next();

+ FlatStringReader* in() { return in_; }

+ void ScanForCaptures();

+ bool CaptureAvailable(int index);

+ uc32 current_;

+ bool has_more_;

+ bool multiline_mode_;

+ int next_pos_;

+ FlatStringReader* in_;

+ Handle<String>* error_;

+ bool has_character_escapes_;

+ bool is_scanned_for_captures_;

+ ZoneList<RegExpCapture*>* captures_;

+ int capture_count_;

+};

// A temporary scope stores information during parsing, just like

// a plain scope. However, temporary scopes are not kept around

// after parsing or referenced by syntax trees so they can be stack-

@@ -3164,6 +3493,756 @@

// ----------------------------------------------------------------------------

+// Regular expressions

+RegExpParser::RegExpParser(FlatStringReader* in,

+ Handle<String>* error,

+ bool multiline_mode)

+ : current_(kEndMarker),

+ has_more_(true),

+ multiline_mode_(multiline_mode),

+ next_pos_(0),

+ in_(in),

+ error_(error),

+ has_character_escapes_(false),

+ is_scanned_for_captures_(false),

+ captures_(NULL),

+ capture_count_(0) {

+ Advance(1);

+uc32 RegExpParser::Next() {

+ if (has_next()) {

+ return in()->Get(next_pos_);

+ } else {

+ return kEndMarker;

+ }

+void RegExpParser::Advance() {

+ if (next_pos_ < in()->length()) {

+ current_ = in()->Get(next_pos_);

+ next_pos_++;

+ } else {

+ current_ = kEndMarker;

+ has_more_ = false;

+ }

+void RegExpParser::Reset(int pos) {

+ next_pos_ = pos;

+ Advance();

+void RegExpParser::Advance(int dist) {

+ for (int i = 0; i < dist; i++)

+ Advance();

+// Reports whether the parsed string atoms contain any characters that were

+// escaped in the original pattern. If not, all atoms are proper substrings

+// of the original pattern.

+bool RegExpParser::HasCharacterEscapes() {

+ return has_character_escapes_;

+RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {

+ *ok = false;

+ *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);

+ return NULL;

+// Pattern ::

+// Disjunction

+RegExpTree* RegExpParser::ParsePattern(bool* ok) {

+ RegExpTree* result = ParseDisjunction(CHECK_OK);

+ if (has_more()) {

+ ReportError(CStrVector("Unmatched ')'"), CHECK_OK);

+ }

+ return result;

+bool RegExpParser::CaptureAvailable(int index) {

+ if (captures_ == NULL) return false;

+ if (index >= captures_->length()) return false;

+ RegExpCapture* capture = captures_->at(index);

+ return capture != NULL && capture->available() == CAPTURE_AVAILABLE;

+// Disjunction ::

+// Alternative

+// Alternative | Disjunction

+// Alternative ::

+// [empty]

+// Term Alternative

+// Term ::

+// Assertion

+// Atom

+// Atom Quantifier

+RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

+ RegExpBuilder builder;

+ int capture_start_index = captures_started();

+ while (true) {

+ switch (current()) {

+ case kEndMarker:

+ case ')':

+ return builder.ToRegExp();

+ case '|': {

+ Advance();

+ builder.NewAlternative();

+ int capture_new_alt_start_index = captures_started();

+ for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ if (capture->available() == CAPTURE_AVAILABLE) {

+ capture->set_available(CAPTURE_UNREACHABLE);

+ }

+ capture_start_index = capture_new_alt_start_index;

+ continue;

+ }

+ case '*':

+ case '+':

+ case '?':

+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);

+ case '^': {

+ Advance();

+ RegExpAssertion::Type type =

+ multiline_mode_ ? RegExpAssertion::START_OF_LINE :

+ RegExpAssertion::START_OF_INPUT;

+ builder.AddAssertion(new RegExpAssertion(type));

+ continue;

+ }

+ case '$': {

+ Advance();

+ RegExpAssertion::Type type =

+ multiline_mode_ ? RegExpAssertion::END_OF_LINE :

+ RegExpAssertion::END_OF_INPUT;

+ builder.AddAssertion(new RegExpAssertion(type));

+ continue;

+ }

+ case '.': {

+ Advance();

+ // everything except \x0a, \x0d, \u2028 and \u2029

+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

+ CharacterRange::AddClassEscape('.', ranges);

+ RegExpTree* atom = new RegExpCharacterClass(ranges, false);

+ builder.AddAtom(atom);

+ break;

+ }

+ case '(': {

+ RegExpTree* atom = ParseGroup(CHECK_OK);

+ builder.AddAtom(atom);

+ break;

+ }

+ case '[': {

+ RegExpTree* atom = ParseCharacterClass(CHECK_OK);

+ builder.AddAtom(atom);

+ break;

+ }

+ // Atom ::

+ // \ AtomEscape

+ case '\\':

+ switch (Next()) {

+ case kEndMarker:

+ ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);

+ case 'b':

+ Advance(2);

+ builder.AddAssertion(

+ new RegExpAssertion(RegExpAssertion::BOUNDARY));

+ continue;

+ case 'B':

+ Advance(2);

+ builder.AddAssertion(

+ new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

+ continue;

+ // AtomEscape ::

+ // CharacterClassEscape

+ //

+ // CharacterClassEscape :: one of

+ // d D s S w W

+ case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

+ uc32 c = Next();

+ Advance(2);

+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

+ CharacterRange::AddClassEscape(c, ranges);

+ RegExpTree* atom = new RegExpCharacterClass(ranges, false);

+ builder.AddAtom(atom);

+ goto has_read_atom; // Avoid setting has_character_escapes_.

+ }

+ case '1': case '2': case '3': case '4': case '5': case '6':

+ case '7': case '8': case '9': {

+ int index = 0;

+ if (ParseBackReferenceIndex(&index)) {

+ if (!CaptureAvailable(index - 1)) {

+ // Prepare to ignore a following quantifier

+ builder.AddEmpty();

+ goto has_read_atom;

+ }

+ RegExpCapture* capture = captures_->at(index - 1);

+ RegExpTree* atom = new RegExpBackReference(capture);

+ builder.AddAtom(atom);

+ goto has_read_atom; // Avoid setting has_character_escapes_.

+ }

+ uc32 first_digit = Next();

+ if (first_digit == '8' || first_digit == '9') {

+ // Treat as identity escape

+ builder.AddCharacter(first_digit);

+ Advance(2);

+ break;

+ }

+ // FALLTHROUGH

+ case '0': {

+ Advance();

+ uc32 octal = ParseOctalLiteral();

+ builder.AddCharacter(octal);

+ break;

+ }

+ // ControlEscape :: one of

+ // f n r t v

+ case 'f':

+ Advance(2);

+ builder.AddCharacter('\f');

+ break;

+ case 'n':

+ Advance(2);

+ builder.AddCharacter('\n');

+ break;

+ case 'r':

+ Advance(2);

+ builder.AddCharacter('\r');

+ break;

+ case 't':

+ Advance(2);

+ builder.AddCharacter('\t');

+ break;

+ case 'v':

+ Advance(2);

+ builder.AddCharacter('\v');

+ break;

+ case 'c': {

+ Advance(2);

+ uc32 control = ParseControlLetterEscape(ok);

+ builder.AddCharacter(control);

+ break;

+ }

+ case 'x': {

+ Advance(2);

+ uc32 value;

+ if (ParseHexEscape(2, &value)) {

+ builder.AddCharacter(value);

+ } else {

+ builder.AddCharacter('x');

+ }

+ break;

+ }

+ case 'u': {

+ Advance(2);

+ uc32 value;

+ if (ParseHexEscape(4, &value)) {

+ builder.AddCharacter(value);

+ } else {

+ builder.AddCharacter('u');

+ }

+ break;

+ }

+ default:

+ // Identity escape.

+ builder.AddCharacter(Next());

+ Advance(2);

+ break;

+ }

+ has_character_escapes_ = true;

+ break;

+ case '{': {

+ int dummy;

+ if (ParseIntervalQuantifier(&dummy, &dummy)) {

+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);

+ }

+ // fallthrough

+ }

+ default:

+ builder.AddCharacter(current());

+ Advance();

+ break;

+ } // end switch(current())

+ has_read_atom:

+ int min;

+ int max;

+ switch (current()) {

+ // QuantifierPrefix ::

+ // *

+ // +

+ // ?

+ // {

+ case '*':

+ min = 0;

+ max = RegExpQuantifier::kInfinity;

+ Advance();

+ break;

+ case '+':

+ min = 1;

+ max = RegExpQuantifier::kInfinity;

+ Advance();

+ break;

+ case '?':

+ min = 0;

+ max = 1;

+ Advance();

+ break;

+ case '{':

+ if (ParseIntervalQuantifier(&min, &max)) {

+ break;

+ } else {

+ continue;

+ }

+ default:

+ continue;

+ }

+ bool is_greedy = true;

+ if (current() == '?') {

+ is_greedy = false;

+ Advance();

+ }

+ builder.AddQuantifierToAtom(min, max, is_greedy);

+ }

+class SourceCharacter {

+ public:

+ static bool Is(uc32 c) {

+ switch (c) {

+ // case ']': case '}':

+ // In spidermonkey and jsc these are treated as source characters

+ // so we do too.

+ case '^': case '$': case '\\': case '.': case '*': case '+':

+ case '?': case '(': case ')': case '[': case '{': case '|':

+ case RegExpParser::kEndMarker:

+ return false;

+ default:

+ return true;

+ }

+};

+static unibrow::Predicate<SourceCharacter> source_character;

+static inline bool IsSourceCharacter(uc32 c) {

+ return source_character.get(c);

+#ifdef DEBUG

+// Currently only used in an ASSERT.

+static bool IsSpecialClassEscape(uc32 c) {

+ switch (c) {

+ case 'd': case 'D':

+ case 's': case 'S':

+ case 'w': case 'W':

+ return true;

+ default:

+ return false;

+ }

+#endif

+// In order to know whether an escape is a backreference or not we have to scan

+// the entire regexp and find the number of capturing parentheses. However we

+// don't want to scan the regexp twice unless it is necessary. This mini-parser

+// is called when needed. It can see the difference between capturing and

+// noncapturing parentheses and can skip character classes and backslash-escaped

+// characters.

+void RegExpParser::ScanForCaptures() {

+ int n;

+ while ((n = current()) != kEndMarker) {

+ Advance();

+ switch (n) {

+ case '\\':

+ Advance();

+ break;

+ case '[': {

+ int c;

+ while ((c = current()) != kEndMarker) {

+ Advance();

+ if (c == '\\') {

+ Advance();

+ } else {

+ if (c == ']') break;

+ }

+ break;

+ }

+ case '(':

+ if (current() != '?') capture_count_++;

+ break;

+ }

+ is_scanned_for_captures_ = true;

+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

+ ASSERT_EQ('\\', current());

+ ASSERT('1' <= Next() && Next() <= '9');

+ // Try to parse a decimal literal that is no greater than the number

+ // of previously encountered left capturing parentheses.

+ // This is a not according the the ECMAScript specification. According to

+ // that, one must accept values up to the total number of left capturing

+ // parentheses in the entire input, even if they are meaningless.

+ if (!is_scanned_for_captures_) {

+ int saved_position = position();

+ ScanForCaptures();

+ Reset(saved_position);

+ }

+ if (capture_count_ == 0) return false;

+ int start = position();

+ int value = Next() - '0';

+ if (value > capture_count_) return false;

+ Advance(2);

+ while (true) {

+ uc32 c = current();

+ if (IsDecimalDigit(c)) {

+ value = 10 * value + (c - '0');

+ if (value > capture_count_) {

+ Reset(start);

+ return false;

+ }

+ Advance();

+ } else {

+ break;

+ }

+ *index_out = value;

+ return true;

+// QuantifierPrefix ::

+// { DecimalDigits }

+// { DecimalDigits , }

+// { DecimalDigits , DecimalDigits }

+bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {

+ ASSERT_EQ(current(), '{');

+ int start = position();

+ Advance();

+ int min = 0;

+ if (!IsDecimalDigit(current())) {

+ Reset(start);

+ return false;

+ }

+ while (IsDecimalDigit(current())) {

+ min = 10 * min + (current() - '0');

+ Advance();

+ }

+ int max = 0;

+ if (current() == '}') {

+ max = min;

+ Advance();

+ } else if (current() == ',') {

+ Advance();

+ if (current() == '}') {

+ max = RegExpQuantifier::kInfinity;

+ Advance();

+ } else {

+ while (IsDecimalDigit(current())) {

+ max = 10 * max + (current() - '0');

+ Advance();

+ }

+ if (current() != '}') {

+ Reset(start);

+ return false;

+ }

+ Advance();

+ }

+ } else {

+ Reset(start);

+ return false;

+ }

+ *min_out = min;

+ *max_out = max;

+ return true;

+// Upper and lower case letters differ by one bit.

+STATIC_CHECK(('a' ^ 'A') == 0x20);

+uc32 RegExpParser::ParseControlLetterEscape(bool* ok) {

+ if (!has_more()) {

+ ReportError(CStrVector("\\c at end of pattern"), ok);

+ return '\0';

+ }

+ uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.

+ if (letter < 'A' || 'Z' < letter) {

+ // Non-spec error-correction: "\c" followed by non-control letter is

+ // interpreted as an IdentityEscape of 'c'.

+ return 'c';

+ }

+ Advance();

+ return letter & 0x1f; // Remainder modulo 32, per specification.

+uc32 RegExpParser::ParseOctalLiteral() {

+ ASSERT('0' <= current() && current() <= '7');

+ // For compatibility with some other browsers (not all), we parse

+ // up to three octal digits with a value below 256.

+ uc32 value = current() - '0';

+ Advance();

+ if ('0' <= current() && current() <= '7') {

+ value = value * 8 + current() - '0';

+ Advance();

+ if (value < 32 && '0' <= current() && current() <= '7') {

+ value = value * 8 + current() - '0';

+ Advance();

+ }

+ return value;

+bool RegExpParser::ParseHexEscape(int length, uc32 *value) {

+ int start = position();

+ uc32 val = 0;

+ bool done = false;

+ for (int i = 0; !done; i++) {

+ uc32 c = current();

+ int d = HexValue(c);

+ if (d < 0) {

+ Reset(start);

+ return false;

+ }

+ val = val * 16 + d;

+ Advance();

+ if (i == length - 1) {

+ done = true;

+ }

+ *value = val;

+ return true;

+uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) {

+ ASSERT(current() == '\\');

+ ASSERT(has_next() && !IsSpecialClassEscape(Next()));

+ Advance();

+ switch (current()) {

+ case 'b':

+ Advance();

+ return '\b';

+ // ControlEscape :: one of

+ // f n r t v

+ case 'f':

+ Advance();

+ return '\f';

+ case 'n':

+ Advance();

+ return '\n';

+ case 'r':

+ Advance();

+ return '\r';

+ case 't':

+ Advance();

+ return '\t';

+ case 'v':

+ Advance();

+ return '\v';

+ case 'c':

+ return ParseControlLetterEscape(ok);

+ case '0': case '1': case '2': case '3': case '4': case '5':

+ case '6': case '7':

+ // For compatibility, we interpret a decimal escape that isn't

+ // a back reference (and therefore either \0 or not valid according

+ // to the specification) as a 1..3 digit octal character code.

+ return ParseOctalLiteral();

+ case 'x': {

+ Advance();

+ uc32 value;

+ if (ParseHexEscape(2, &value)) {

+ return value;

+ }

+ // If \x is not followed by a two-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'x';

+ }

+ case 'u': {

+ Advance();

+ uc32 value;

+ if (ParseHexEscape(4, &value)) {

+ return value;

+ }

+ // If \u is not followed by a four-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'u';

+ }

+ default: {

+ // Extended identity escape. We accept any character that hasn't

+ // been matched by a more specific case, not just the subset required

+ // by the ECMAScript specification.

+ uc32 result = current();

+ Advance();

+ return result;

+ }

+ return 0;

+RegExpTree* RegExpParser::ParseGroup(bool* ok) {

+ ASSERT_EQ(current(), '(');

+ char type = '(';

+ Advance();

+ if (current() == '?') {

+ switch (Next()) {

+ case ':': case '=': case '!':

+ type = Next();

+ Advance(2);

+ break;

+ default:

+ ReportError(CStrVector("Invalid group"), CHECK_OK);

+ break;

+ }

+ } else {

+ if (captures_ == NULL) {

+ captures_ = new ZoneList<RegExpCapture*>(2);

+ }

+ captures_->Add(NULL);

+ if (!is_scanned_for_captures_) capture_count_++;

+ }

+ int capture_index = captures_started();

+ RegExpTree* body = ParseDisjunction(CHECK_OK);

+ if (current() != ')') {

+ ReportError(CStrVector("Unterminated group"), CHECK_OK);

+ }

+ Advance();

+ int end_capture_index = captures_started();

+ if (type == '!') {

+ // Captures inside a negative lookahead are never available outside it.

+ for (int i = capture_index; i < end_capture_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ ASSERT(capture != NULL);

+ capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);

+ }

+ } else {

+ // Captures temporarily unavailable because they are in different

+ // alternatives are all available after the disjunction.

+ for (int i = capture_index; i < end_capture_index; i++) {

+ RegExpCapture* capture = captures_->at(i);

+ ASSERT(capture != NULL);

+ if (capture->available() == CAPTURE_UNREACHABLE) {

+ capture->set_available(CAPTURE_AVAILABLE);

+ }

+ if (type == '(') {

+ RegExpCapture* capture = new RegExpCapture(body, capture_index);

+ captures_->at(capture_index - 1) = capture;

+ return capture;

+ } else if (type == ':') {

+ return body;

+ } else {

+ ASSERT(type == '=' || type == '!');

+ bool is_positive = (type == '=');

+ return new RegExpLookahead(body, is_positive);

+ }

+CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class,

+ ZoneList<CharacterRange>* ranges,

+ bool* ok) {

+ ASSERT_EQ(false, *is_char_class);

+ uc32 first = current();

+ if (first == '\\') {

+ switch (Next()) {

+ case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

+ *is_char_class = true;

+ uc32 c = Next();

+ CharacterRange::AddClassEscape(c, ranges);

+ Advance(2);

+ return NULL;

+ }

+ default:

+ uc32 c = ParseClassCharacterEscape(CHECK_OK);

+ return CharacterRange::Singleton(c);

+ }

+ } else {

+ Advance();

+ return CharacterRange::Singleton(first);

+ }

+RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {

+ static const char* kUnterminated = "Unterminated character class";

+ static const char* kIllegal = "Illegal character class";

+ static const char* kRangeOutOfOrder = "Range out of order in character class";

+ ASSERT_EQ(current(), '[');

+ Advance();

+ bool is_negated = false;

+ if (current() == '^') {

+ is_negated = true;

+ Advance();

+ }

+ ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

+ while (has_more() && current() != ']') {

+ bool is_char_class = false;

+ CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK);

+ if (!is_char_class) {

+ if (current() == '-') {

+ Advance();

+ if (current() == kEndMarker) {

+ // If we reach the end we break out of the loop and let the

+ // following code report an error.

+ break;

+ } else if (current() == ']') {

+ ranges->Add(first);

+ ranges->Add(CharacterRange::Singleton('-'));

+ break;

+ }

+ CharacterRange next =

+ ParseClassAtom(&is_char_class, ranges, CHECK_OK);

+ if (is_char_class) {

+ return ReportError(CStrVector(kIllegal), CHECK_OK);

+ }

+ if (first.from() > next.to()) {

+ return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK);

+ }

+ ranges->Add(CharacterRange::Range(first.from(), next.to()));

+ } else {

+ ranges->Add(first);

+ }

+ if (!has_more()) {

+ return ReportError(CStrVector(kUnterminated), CHECK_OK);

+ }

+ Advance();

+ if (ranges->length() == 0) {

+ ranges->Add(CharacterRange::Range(0, 0xffff));

+ is_negated = !is_negated;

+ }

+ return new RegExpCharacterClass(ranges, is_negated);

+// ----------------------------------------------------------------------------

// The Parser interface.

// MakeAST() is just a wrapper for the corresponding Parser calls

@@ -3211,6 +4290,27 @@

}

+bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) {

+ ASSERT(result != NULL);

+ // Get multiline flag somehow

Mads Ager (chromium) 2008/11/25 21:09:41 Is this a TODO or a comment on what the code is do

Erik Corry 2008/11/26 12:18:36 It's a TODO. Fixed.

+ RegExpParser parser(input, &result->error, false);

+ bool ok = true;

+ result->tree = parser.ParsePattern(&ok);

+ if (!ok) {

+ ASSERT(result->tree == NULL);

+ ASSERT(!result->error.is_null());

+ } else {

+ ASSERT(result->tree != NULL);

+ ASSERT(result->error.is_null());

+ }

+ if (ok) {

+ result->has_character_escapes = parser.HasCharacterEscapes();

+ result->capture_count = parser.captures_started();

+ }

+ return ok;

FunctionLiteral* MakeAST(bool compile_in_global_context,

Handle<Script> script,

v8::Extension* extension,

« src/objects.cc ('K') | « src/parser.h ('k') | src/regexp-macro-assembler.h » ('j') | src/regexp-macro-assembler-ia32.h » ('J')