Chromium Code Reviews| Index: src/parser.cc |
| =================================================================== |
| --- src/parser.cc (revision 830) |
| +++ src/parser.cc (working copy) |
| @@ -34,6 +34,7 @@ |
| #include "runtime.h" |
| #include "parser.h" |
| #include "scopes.h" |
| +#include "string-stream.h" |
| namespace v8 { namespace internal { |
| @@ -227,6 +228,334 @@ |
| }; |
| +template <typename T, int initial_size> |
| +class BufferedZoneList { |
| + public: |
| + |
| + BufferedZoneList() : |
|
Mads Ager (chromium)
2008/11/25 21:09:41
All on one line.
|
| + list_(NULL), last_(NULL) {} |
| + |
| + // Adds element at end of list. This element is buffered and can |
| + // be read using last() or removed using RemoveLast until a new Add or until |
| + // RemoveLast or GetList has been called. |
| + void Add(T* value) { |
| + if (last_ != NULL) { |
| + if (list_ == NULL) { |
| + list_ = new ZoneList<T*>(initial_size); |
| + } |
| + list_->Add(last_); |
| + } |
| + last_ = value; |
| + } |
| + |
| + T* last() { |
| + ASSERT(last_ != NULL); |
| + return last_; |
| + } |
| + |
| + T* RemoveLast() { |
| + ASSERT(last_ != NULL); |
| + T* result = last_; |
| + if (list_ != NULL && list_->length() > 0) |
| + last_ = list_->RemoveLast(); |
| + else |
| + last_ = NULL; |
| + return result; |
| + } |
| + |
| + T* Get(int i) { |
| + ASSERT(0 <= i && i < length()); |
| + if (list_ == NULL) { |
| + ASSERT_EQ(0, i); |
| + return last_; |
| + } else { |
| + if (i == list_->length()) { |
| + ASSERT(last_ != NULL); |
| + return last_; |
| + } else { |
| + return list_->at(i); |
| + } |
| + } |
| + } |
| + |
| + void Clear() { |
| + list_ = NULL; |
| + last_ = NULL; |
| + } |
| + |
| + int length() { |
| + int length = (list_ == NULL) ? 0 : list_->length(); |
| + return length + ((last_ == NULL) ? 0 : 1); |
| + } |
| + |
| + ZoneList<T*>* GetList() { |
| + if (list_ == NULL) { |
| + list_ = new ZoneList<T*>(initial_size); |
| + } |
| + if (last_ != NULL) { |
| + list_->Add(last_); |
| + last_ = NULL; |
| + } |
| + return list_; |
| + } |
| + |
| + private: |
| + ZoneList<T*>* list_; |
| + T* last_; |
| +}; |
| + |
| +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
| +class RegExpBuilder { |
| + public: |
| + RegExpBuilder(); |
| + void AddCharacter(uc16 character); |
| + // "Adds" an empty expression. Does nothing except consume a |
| + // following quantifier |
| + void AddEmpty(); |
| + void AddAtom(RegExpTree* tree); |
| + void AddAssertion(RegExpTree* tree); |
| + void NewAlternative(); // '|' |
| + void AddQuantifierToAtom(int min, int max, bool is_greedy); |
| + RegExpTree* ToRegExp(); |
| + private: |
| + void FlushCharacters(); |
| + void FlushText(); |
| + void FlushTerms(); |
| + bool pending_empty_; |
| + ZoneList<uc16>* characters_; |
| + BufferedZoneList<RegExpTree, 2> terms_; |
| + BufferedZoneList<RegExpTree, 2> text_; |
| + BufferedZoneList<RegExpTree, 2> alternatives_; |
| +#ifdef DEBUG |
| + enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; |
| +#define LAST(x) last_added_ = x; |
| +#else |
| +#define LAST(x) |
| +#endif |
| +}; |
| + |
| + |
| +RegExpBuilder::RegExpBuilder() |
| + : pending_empty_(false), characters_(NULL), terms_(), alternatives_() |
|
Mads Ager (chromium)
2008/11/25 21:09:41
Four space indent. More occurrences below.
|
| +#ifdef DEBUG |
| + , last_added_(ADD_NONE) |
| +#endif |
| + {} |
| + |
| + |
| +void RegExpBuilder::FlushCharacters() { |
| + pending_empty_ = false; |
| + if (characters_ != NULL) { |
| + RegExpTree* atom = new RegExpAtom(characters_->ToConstVector()); |
| + characters_ = NULL; |
| + text_.Add(atom); |
| + LAST(ADD_ATOM); |
| + } |
| +} |
| + |
| + |
| +void RegExpBuilder::FlushText() { |
| + FlushCharacters(); |
| + int num_text = text_.length(); |
| + if (num_text == 0) { |
| + return; |
| + } else if (num_text == 1) { |
| + terms_.Add(text_.last()); |
| + } else { |
| + RegExpText* text = new RegExpText(); |
| + for (int i = 0; i < num_text; i++) |
| + text_.Get(i)->AppendToText(text); |
| + terms_.Add(text); |
| + } |
| + text_.Clear(); |
| +} |
| + |
| + |
| +void RegExpBuilder::AddCharacter(uc16 c) { |
| + pending_empty_ = false; |
| + if (characters_ == NULL) { |
| + characters_ = new ZoneList<uc16>(4); |
| + } |
| + characters_->Add(c); |
| + LAST(ADD_CHAR); |
| +} |
| + |
| + |
| +void RegExpBuilder::AddEmpty() { |
| + pending_empty_ = true; |
| +} |
| + |
| + |
| +void RegExpBuilder::AddAtom(RegExpTree* term) { |
| + if (term->IsEmpty()) { |
| + AddEmpty(); |
| + return; |
| + } |
| + if (term->IsTextElement()) { |
| + FlushCharacters(); |
| + text_.Add(term); |
| + } else { |
| + FlushText(); |
| + terms_.Add(term); |
| + } |
| + LAST(ADD_ATOM); |
| +} |
| + |
| + |
| +void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| + FlushText(); |
| + terms_.Add(assert); |
| + LAST(ADD_ASSERT); |
| +} |
| + |
| + |
| +void RegExpBuilder::NewAlternative() { |
| + FlushTerms(); |
| +} |
| + |
| + |
| +void RegExpBuilder::FlushTerms() { |
| + FlushText(); |
| + int num_terms = terms_.length(); |
| + RegExpTree* alternative; |
| + if (num_terms == 0) { |
| + alternative = RegExpEmpty::GetInstance(); |
| + } else if (num_terms == 1) { |
| + alternative = terms_.last(); |
| + } else { |
| + alternative = new RegExpAlternative(terms_.GetList()); |
| + } |
| + alternatives_.Add(alternative); |
| + terms_.Clear(); |
| + LAST(ADD_NONE); |
| +} |
| + |
| + |
| +RegExpTree* RegExpBuilder::ToRegExp() { |
| + FlushTerms(); |
| + int num_alternatives = alternatives_.length(); |
| + if (num_alternatives == 0) { |
| + return RegExpEmpty::GetInstance(); |
| + } |
| + if (num_alternatives == 1) { |
| + return alternatives_.last(); |
| + } |
| + return new RegExpDisjunction(alternatives_.GetList()); |
| +} |
| + |
| + |
| +void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) { |
| + if (pending_empty_) { |
| + pending_empty_ = false; |
| + return; |
| + } |
| + RegExpTree* atom; |
| + if (characters_ != NULL) { |
| + ASSERT(last_added_ == ADD_CHAR); |
| + // Last atom was character. |
| + Vector<const uc16> char_vector = characters_->ToConstVector(); |
| + int num_chars = char_vector.length(); |
| + if (num_chars > 1) { |
| + Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); |
| + text_.Add(new RegExpAtom(prefix)); |
| + char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
| + } |
| + characters_ = NULL; |
| + atom = new RegExpAtom(char_vector); |
| + FlushText(); |
| + } else if (text_.length() > 0) { |
| + ASSERT(last_added_ == ADD_ATOM); |
| + atom = text_.RemoveLast(); |
| + FlushText(); |
| + } else if (terms_.length() > 0) { |
| + ASSERT(last_added_ == ADD_ATOM); |
| + atom = terms_.RemoveLast(); |
| + if (atom->IsLookahead() || atom->IsAssertion()) { |
| + // Guaranteed not to match a non-empty string. |
| + // Assertion as an atom can happen as, e.g., (?:\b) |
| + LAST(ADD_TERM); |
| + if (min == 0) { |
| + return; |
| + } |
| + terms_.Add(atom); |
| + return; |
| + } |
| + } else { |
| + // Only call immediately after adding an atom or character! |
| + UNREACHABLE(); |
| + return; |
| + } |
| + terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom)); |
| + LAST(ADD_TERM); |
| +} |
| + |
| + |
| +class RegExpParser { |
| + public: |
| + RegExpParser(FlatStringReader* in, |
| + Handle<String>* error, |
| + bool multiline_mode); |
| + RegExpTree* ParsePattern(bool* ok); |
| + RegExpTree* ParseDisjunction(bool* ok); |
| + RegExpTree* ParseGroup(bool* ok); |
| + RegExpTree* ParseCharacterClass(bool* ok); |
| + |
| + // Parses a {...,...} quantifier and stores the range in the given |
| + // out parameters. |
| + bool ParseIntervalQuantifier(int* min_out, int* max_out); |
| + |
| + // Parses and returns a single escaped character. The character |
| + // must not be 'b' or 'B' since they are usually handle specially. |
| + uc32 ParseClassCharacterEscape(bool* ok); |
| + |
| + // Checks whether the following is a length-digit hexadecimal number, |
| + // and sets the value if it is. |
| + bool ParseHexEscape(int length, uc32* value); |
| + |
| + uc32 ParseControlLetterEscape(bool* ok); |
| + uc32 ParseOctalLiteral(); |
| + |
| + // Tries to parse the input as a back reference. If successful it |
| + // stores the result in the output parameter and returns true. If |
| + // it fails it will push back the characters read so the same characters |
| + // can be reparsed. |
| + bool ParseBackReferenceIndex(int* index_out); |
| + |
| + CharacterRange ParseClassAtom(bool* is_char_class, |
| + ZoneList<CharacterRange>* ranges, |
| + bool* ok); |
| + RegExpTree* ReportError(Vector<const char> message, bool* ok); |
| + void Advance(); |
| + void Advance(int dist); |
| + void Reset(int pos); |
| + |
| + bool HasCharacterEscapes(); |
| + |
| + int captures_started() { return captures_ == NULL ? 0 : captures_->length(); } |
| + int position() { return next_pos_ - 1; } |
| + |
| + static const uc32 kEndMarker = (1 << 21); |
| + private: |
| + uc32 current() { return current_; } |
| + bool has_more() { return has_more_; } |
| + bool has_next() { return next_pos_ < in()->length(); } |
| + uc32 Next(); |
| + FlatStringReader* in() { return in_; } |
| + void ScanForCaptures(); |
| + bool CaptureAvailable(int index); |
| + uc32 current_; |
| + bool has_more_; |
| + bool multiline_mode_; |
| + int next_pos_; |
| + FlatStringReader* in_; |
| + Handle<String>* error_; |
| + bool has_character_escapes_; |
| + bool is_scanned_for_captures_; |
| + ZoneList<RegExpCapture*>* captures_; |
| + int capture_count_; |
| +}; |
| + |
| + |
| // A temporary scope stores information during parsing, just like |
| // a plain scope. However, temporary scopes are not kept around |
| // after parsing or referenced by syntax trees so they can be stack- |
| @@ -3164,6 +3493,756 @@ |
| // ---------------------------------------------------------------------------- |
| +// Regular expressions |
| + |
| + |
| +RegExpParser::RegExpParser(FlatStringReader* in, |
| + Handle<String>* error, |
| + bool multiline_mode) |
| + : current_(kEndMarker), |
| + has_more_(true), |
| + multiline_mode_(multiline_mode), |
| + next_pos_(0), |
| + in_(in), |
| + error_(error), |
| + has_character_escapes_(false), |
| + is_scanned_for_captures_(false), |
| + captures_(NULL), |
| + capture_count_(0) { |
| + Advance(1); |
| +} |
| + |
| + |
| +uc32 RegExpParser::Next() { |
| + if (has_next()) { |
| + return in()->Get(next_pos_); |
| + } else { |
| + return kEndMarker; |
| + } |
| +} |
| + |
| + |
| +void RegExpParser::Advance() { |
| + if (next_pos_ < in()->length()) { |
| + current_ = in()->Get(next_pos_); |
| + next_pos_++; |
| + } else { |
| + current_ = kEndMarker; |
| + has_more_ = false; |
| + } |
| +} |
| + |
| + |
| +void RegExpParser::Reset(int pos) { |
| + next_pos_ = pos; |
| + Advance(); |
| +} |
| + |
| + |
| +void RegExpParser::Advance(int dist) { |
| + for (int i = 0; i < dist; i++) |
| + Advance(); |
| +} |
| + |
| + |
| +// Reports whether the parsed string atoms contain any characters that were |
| +// escaped in the original pattern. If not, all atoms are proper substrings |
| +// of the original pattern. |
| +bool RegExpParser::HasCharacterEscapes() { |
| + return has_character_escapes_; |
| +} |
| + |
| +RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { |
| + *ok = false; |
| + *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); |
| + return NULL; |
| +} |
| + |
| + |
| +// Pattern :: |
| +// Disjunction |
| +RegExpTree* RegExpParser::ParsePattern(bool* ok) { |
| + RegExpTree* result = ParseDisjunction(CHECK_OK); |
| + if (has_more()) { |
| + ReportError(CStrVector("Unmatched ')'"), CHECK_OK); |
| + } |
| + return result; |
| +} |
| + |
| + |
| +bool RegExpParser::CaptureAvailable(int index) { |
| + if (captures_ == NULL) return false; |
| + if (index >= captures_->length()) return false; |
| + RegExpCapture* capture = captures_->at(index); |
| + return capture != NULL && capture->available() == CAPTURE_AVAILABLE; |
| +} |
| + |
| + |
| +// Disjunction :: |
| +// Alternative |
| +// Alternative | Disjunction |
| +// Alternative :: |
| +// [empty] |
| +// Term Alternative |
| +// Term :: |
| +// Assertion |
| +// Atom |
| +// Atom Quantifier |
| +RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { |
| + RegExpBuilder builder; |
| + int capture_start_index = captures_started(); |
| + while (true) { |
| + switch (current()) { |
| + case kEndMarker: |
| + case ')': |
| + return builder.ToRegExp(); |
| + case '|': { |
| + Advance(); |
| + builder.NewAlternative(); |
| + int capture_new_alt_start_index = captures_started(); |
| + for (int i = capture_start_index; i < capture_new_alt_start_index; i++) { |
| + RegExpCapture* capture = captures_->at(i); |
| + if (capture->available() == CAPTURE_AVAILABLE) { |
| + capture->set_available(CAPTURE_UNREACHABLE); |
| + } |
| + } |
| + capture_start_index = capture_new_alt_start_index; |
| + continue; |
| + } |
| + case '*': |
| + case '+': |
| + case '?': |
| + ReportError(CStrVector("Nothing to repeat"), CHECK_OK); |
| + case '^': { |
| + Advance(); |
| + RegExpAssertion::Type type = |
| + multiline_mode_ ? RegExpAssertion::START_OF_LINE : |
| + RegExpAssertion::START_OF_INPUT; |
| + builder.AddAssertion(new RegExpAssertion(type)); |
| + continue; |
| + } |
| + case '$': { |
| + Advance(); |
| + RegExpAssertion::Type type = |
| + multiline_mode_ ? RegExpAssertion::END_OF_LINE : |
| + RegExpAssertion::END_OF_INPUT; |
| + builder.AddAssertion(new RegExpAssertion(type)); |
| + continue; |
| + } |
| + case '.': { |
| + Advance(); |
| + // everything except \x0a, \x0d, \u2028 and \u2029 |
| + ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); |
| + CharacterRange::AddClassEscape('.', ranges); |
| + RegExpTree* atom = new RegExpCharacterClass(ranges, false); |
| + builder.AddAtom(atom); |
| + break; |
| + } |
| + case '(': { |
| + RegExpTree* atom = ParseGroup(CHECK_OK); |
| + builder.AddAtom(atom); |
| + break; |
| + } |
| + case '[': { |
| + RegExpTree* atom = ParseCharacterClass(CHECK_OK); |
| + builder.AddAtom(atom); |
| + break; |
| + } |
| + // Atom :: |
| + // \ AtomEscape |
| + case '\\': |
| + switch (Next()) { |
| + case kEndMarker: |
| + ReportError(CStrVector("\\ at end of pattern"), CHECK_OK); |
| + case 'b': |
| + Advance(2); |
| + builder.AddAssertion( |
| + new RegExpAssertion(RegExpAssertion::BOUNDARY)); |
| + continue; |
| + case 'B': |
| + Advance(2); |
| + builder.AddAssertion( |
| + new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); |
| + continue; |
| + // AtomEscape :: |
| + // CharacterClassEscape |
| + // |
| + // CharacterClassEscape :: one of |
| + // d D s S w W |
| + case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { |
| + uc32 c = Next(); |
| + Advance(2); |
| + ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); |
| + CharacterRange::AddClassEscape(c, ranges); |
| + RegExpTree* atom = new RegExpCharacterClass(ranges, false); |
| + builder.AddAtom(atom); |
| + goto has_read_atom; // Avoid setting has_character_escapes_. |
| + } |
| + case '1': case '2': case '3': case '4': case '5': case '6': |
| + case '7': case '8': case '9': { |
| + int index = 0; |
| + if (ParseBackReferenceIndex(&index)) { |
| + if (!CaptureAvailable(index - 1)) { |
| + // Prepare to ignore a following quantifier |
| + builder.AddEmpty(); |
| + goto has_read_atom; |
| + } |
| + RegExpCapture* capture = captures_->at(index - 1); |
| + RegExpTree* atom = new RegExpBackReference(capture); |
| + builder.AddAtom(atom); |
| + goto has_read_atom; // Avoid setting has_character_escapes_. |
| + } |
| + uc32 first_digit = Next(); |
| + if (first_digit == '8' || first_digit == '9') { |
| + // Treat as identity escape |
| + builder.AddCharacter(first_digit); |
| + Advance(2); |
| + break; |
| + } |
| + } |
| + // FALLTHROUGH |
| + case '0': { |
| + Advance(); |
| + uc32 octal = ParseOctalLiteral(); |
| + builder.AddCharacter(octal); |
| + break; |
| + } |
| + // ControlEscape :: one of |
| + // f n r t v |
| + case 'f': |
| + Advance(2); |
| + builder.AddCharacter('\f'); |
| + break; |
| + case 'n': |
| + Advance(2); |
| + builder.AddCharacter('\n'); |
| + break; |
| + case 'r': |
| + Advance(2); |
| + builder.AddCharacter('\r'); |
| + break; |
| + case 't': |
| + Advance(2); |
| + builder.AddCharacter('\t'); |
| + break; |
| + case 'v': |
| + Advance(2); |
| + builder.AddCharacter('\v'); |
| + break; |
| + case 'c': { |
| + Advance(2); |
| + uc32 control = ParseControlLetterEscape(ok); |
| + builder.AddCharacter(control); |
| + break; |
| + } |
| + case 'x': { |
| + Advance(2); |
| + uc32 value; |
| + if (ParseHexEscape(2, &value)) { |
| + builder.AddCharacter(value); |
| + } else { |
| + builder.AddCharacter('x'); |
| + } |
| + break; |
| + } |
| + case 'u': { |
| + Advance(2); |
| + uc32 value; |
| + if (ParseHexEscape(4, &value)) { |
| + builder.AddCharacter(value); |
| + } else { |
| + builder.AddCharacter('u'); |
| + } |
| + break; |
| + } |
| + default: |
| + // Identity escape. |
| + builder.AddCharacter(Next()); |
| + Advance(2); |
| + break; |
| + } |
| + has_character_escapes_ = true; |
| + break; |
| + case '{': { |
| + int dummy; |
| + if (ParseIntervalQuantifier(&dummy, &dummy)) { |
| + ReportError(CStrVector("Nothing to repeat"), CHECK_OK); |
| + } |
| + // fallthrough |
| + } |
| + default: |
| + builder.AddCharacter(current()); |
| + Advance(); |
| + break; |
| + } // end switch(current()) |
| + |
| + has_read_atom: |
| + int min; |
| + int max; |
| + switch (current()) { |
| + // QuantifierPrefix :: |
| + // * |
| + // + |
| + // ? |
| + // { |
| + case '*': |
| + min = 0; |
| + max = RegExpQuantifier::kInfinity; |
| + Advance(); |
| + break; |
| + case '+': |
| + min = 1; |
| + max = RegExpQuantifier::kInfinity; |
| + Advance(); |
| + break; |
| + case '?': |
| + min = 0; |
| + max = 1; |
| + Advance(); |
| + break; |
| + case '{': |
| + if (ParseIntervalQuantifier(&min, &max)) { |
| + break; |
| + } else { |
| + continue; |
| + } |
| + default: |
| + continue; |
| + } |
| + bool is_greedy = true; |
| + if (current() == '?') { |
| + is_greedy = false; |
| + Advance(); |
| + } |
| + builder.AddQuantifierToAtom(min, max, is_greedy); |
| + } |
| +} |
| + |
| +class SourceCharacter { |
| + public: |
| + static bool Is(uc32 c) { |
| + switch (c) { |
| + // case ']': case '}': |
| + // In spidermonkey and jsc these are treated as source characters |
| + // so we do too. |
| + case '^': case '$': case '\\': case '.': case '*': case '+': |
| + case '?': case '(': case ')': case '[': case '{': case '|': |
| + case RegExpParser::kEndMarker: |
| + return false; |
| + default: |
| + return true; |
| + } |
| + } |
| +}; |
| + |
| + |
| +static unibrow::Predicate<SourceCharacter> source_character; |
| + |
| + |
| +static inline bool IsSourceCharacter(uc32 c) { |
| + return source_character.get(c); |
| +} |
| + |
| +#ifdef DEBUG |
| +// Currently only used in an ASSERT. |
| +static bool IsSpecialClassEscape(uc32 c) { |
| + switch (c) { |
| + case 'd': case 'D': |
| + case 's': case 'S': |
| + case 'w': case 'W': |
| + return true; |
| + default: |
| + return false; |
| + } |
| +} |
| +#endif |
| + |
| + |
| +// In order to know whether an escape is a backreference or not we have to scan |
| +// the entire regexp and find the number of capturing parentheses. However we |
| +// don't want to scan the regexp twice unless it is necessary. This mini-parser |
| +// is called when needed. It can see the difference between capturing and |
| +// noncapturing parentheses and can skip character classes and backslash-escaped |
| +// characters. |
| +void RegExpParser::ScanForCaptures() { |
| + int n; |
| + while ((n = current()) != kEndMarker) { |
| + Advance(); |
| + switch (n) { |
| + case '\\': |
| + Advance(); |
| + break; |
| + case '[': { |
| + int c; |
| + while ((c = current()) != kEndMarker) { |
| + Advance(); |
| + if (c == '\\') { |
| + Advance(); |
| + } else { |
| + if (c == ']') break; |
| + } |
| + } |
| + break; |
| + } |
| + case '(': |
| + if (current() != '?') capture_count_++; |
| + break; |
| + } |
| + } |
| + is_scanned_for_captures_ = true; |
| +} |
| + |
| + |
| +bool RegExpParser::ParseBackReferenceIndex(int* index_out) { |
| + ASSERT_EQ('\\', current()); |
| + ASSERT('1' <= Next() && Next() <= '9'); |
| + // Try to parse a decimal literal that is no greater than the number |
| + // of previously encountered left capturing parentheses. |
| + // This is a not according the the ECMAScript specification. According to |
| + // that, one must accept values up to the total number of left capturing |
| + // parentheses in the entire input, even if they are meaningless. |
| + if (!is_scanned_for_captures_) { |
| + int saved_position = position(); |
| + ScanForCaptures(); |
| + Reset(saved_position); |
| + } |
| + if (capture_count_ == 0) return false; |
| + int start = position(); |
| + int value = Next() - '0'; |
| + if (value > capture_count_) return false; |
| + Advance(2); |
| + while (true) { |
| + uc32 c = current(); |
| + if (IsDecimalDigit(c)) { |
| + value = 10 * value + (c - '0'); |
| + if (value > capture_count_) { |
| + Reset(start); |
| + return false; |
| + } |
| + Advance(); |
| + } else { |
| + break; |
| + } |
| + } |
| + *index_out = value; |
| + return true; |
| +} |
| + |
| + |
| +// QuantifierPrefix :: |
| +// { DecimalDigits } |
| +// { DecimalDigits , } |
| +// { DecimalDigits , DecimalDigits } |
| +bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { |
| + ASSERT_EQ(current(), '{'); |
| + int start = position(); |
| + Advance(); |
| + int min = 0; |
| + if (!IsDecimalDigit(current())) { |
| + Reset(start); |
| + return false; |
| + } |
| + while (IsDecimalDigit(current())) { |
| + min = 10 * min + (current() - '0'); |
| + Advance(); |
| + } |
| + int max = 0; |
| + if (current() == '}') { |
| + max = min; |
| + Advance(); |
| + } else if (current() == ',') { |
| + Advance(); |
| + if (current() == '}') { |
| + max = RegExpQuantifier::kInfinity; |
| + Advance(); |
| + } else { |
| + while (IsDecimalDigit(current())) { |
| + max = 10 * max + (current() - '0'); |
| + Advance(); |
| + } |
| + if (current() != '}') { |
| + Reset(start); |
| + return false; |
| + } |
| + Advance(); |
| + } |
| + } else { |
| + Reset(start); |
| + return false; |
| + } |
| + *min_out = min; |
| + *max_out = max; |
| + return true; |
| +} |
| + |
| + |
| +// Upper and lower case letters differ by one bit. |
| +STATIC_CHECK(('a' ^ 'A') == 0x20); |
| + |
| +uc32 RegExpParser::ParseControlLetterEscape(bool* ok) { |
| + if (!has_more()) { |
| + ReportError(CStrVector("\\c at end of pattern"), ok); |
| + return '\0'; |
| + } |
| + uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters. |
| + if (letter < 'A' || 'Z' < letter) { |
| + // Non-spec error-correction: "\c" followed by non-control letter is |
| + // interpreted as an IdentityEscape of 'c'. |
| + return 'c'; |
| + } |
| + Advance(); |
| + return letter & 0x1f; // Remainder modulo 32, per specification. |
| +} |
| + |
| + |
| +uc32 RegExpParser::ParseOctalLiteral() { |
| + ASSERT('0' <= current() && current() <= '7'); |
| + // For compatibility with some other browsers (not all), we parse |
| + // up to three octal digits with a value below 256. |
| + uc32 value = current() - '0'; |
| + Advance(); |
| + if ('0' <= current() && current() <= '7') { |
| + value = value * 8 + current() - '0'; |
| + Advance(); |
| + if (value < 32 && '0' <= current() && current() <= '7') { |
| + value = value * 8 + current() - '0'; |
| + Advance(); |
| + } |
| + } |
| + return value; |
| +} |
| + |
| + |
| +bool RegExpParser::ParseHexEscape(int length, uc32 *value) { |
| + int start = position(); |
| + uc32 val = 0; |
| + bool done = false; |
| + for (int i = 0; !done; i++) { |
| + uc32 c = current(); |
| + int d = HexValue(c); |
| + if (d < 0) { |
| + Reset(start); |
| + return false; |
| + } |
| + val = val * 16 + d; |
| + Advance(); |
| + if (i == length - 1) { |
| + done = true; |
| + } |
| + } |
| + *value = val; |
| + return true; |
| +} |
| + |
| + |
| +uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) { |
| + ASSERT(current() == '\\'); |
| + ASSERT(has_next() && !IsSpecialClassEscape(Next())); |
| + Advance(); |
| + switch (current()) { |
| + case 'b': |
| + Advance(); |
| + return '\b'; |
| + // ControlEscape :: one of |
| + // f n r t v |
| + case 'f': |
| + Advance(); |
| + return '\f'; |
| + case 'n': |
| + Advance(); |
| + return '\n'; |
| + case 'r': |
| + Advance(); |
| + return '\r'; |
| + case 't': |
| + Advance(); |
| + return '\t'; |
| + case 'v': |
| + Advance(); |
| + return '\v'; |
| + case 'c': |
| + return ParseControlLetterEscape(ok); |
| + case '0': case '1': case '2': case '3': case '4': case '5': |
| + case '6': case '7': |
| + // For compatibility, we interpret a decimal escape that isn't |
| + // a back reference (and therefore either \0 or not valid according |
| + // to the specification) as a 1..3 digit octal character code. |
| + return ParseOctalLiteral(); |
| + case 'x': { |
| + Advance(); |
| + uc32 value; |
| + if (ParseHexEscape(2, &value)) { |
| + return value; |
| + } |
| + // If \x is not followed by a two-digit hexadecimal, treat it |
| + // as an identity escape. |
| + return 'x'; |
| + } |
| + case 'u': { |
| + Advance(); |
| + uc32 value; |
| + if (ParseHexEscape(4, &value)) { |
| + return value; |
| + } |
| + // If \u is not followed by a four-digit hexadecimal, treat it |
| + // as an identity escape. |
| + return 'u'; |
| + } |
| + default: { |
| + // Extended identity escape. We accept any character that hasn't |
| + // been matched by a more specific case, not just the subset required |
| + // by the ECMAScript specification. |
| + uc32 result = current(); |
| + Advance(); |
| + return result; |
| + } |
| + } |
| + return 0; |
| +} |
| + |
| + |
| +RegExpTree* RegExpParser::ParseGroup(bool* ok) { |
| + ASSERT_EQ(current(), '('); |
| + char type = '('; |
| + Advance(); |
| + if (current() == '?') { |
| + switch (Next()) { |
| + case ':': case '=': case '!': |
| + type = Next(); |
| + Advance(2); |
| + break; |
| + default: |
| + ReportError(CStrVector("Invalid group"), CHECK_OK); |
| + break; |
| + } |
| + } else { |
| + if (captures_ == NULL) { |
| + captures_ = new ZoneList<RegExpCapture*>(2); |
| + } |
| + captures_->Add(NULL); |
| + if (!is_scanned_for_captures_) capture_count_++; |
| + } |
| + int capture_index = captures_started(); |
| + RegExpTree* body = ParseDisjunction(CHECK_OK); |
| + if (current() != ')') { |
| + ReportError(CStrVector("Unterminated group"), CHECK_OK); |
| + } |
| + Advance(); |
| + |
| + int end_capture_index = captures_started(); |
| + if (type == '!') { |
| + // Captures inside a negative lookahead are never available outside it. |
| + for (int i = capture_index; i < end_capture_index; i++) { |
| + RegExpCapture* capture = captures_->at(i); |
| + ASSERT(capture != NULL); |
| + capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE); |
| + } |
| + } else { |
| + // Captures temporarily unavailable because they are in different |
| + // alternatives are all available after the disjunction. |
| + for (int i = capture_index; i < end_capture_index; i++) { |
| + RegExpCapture* capture = captures_->at(i); |
| + ASSERT(capture != NULL); |
| + if (capture->available() == CAPTURE_UNREACHABLE) { |
| + capture->set_available(CAPTURE_AVAILABLE); |
| + } |
| + } |
| + } |
| + |
| + if (type == '(') { |
| + RegExpCapture* capture = new RegExpCapture(body, capture_index); |
| + captures_->at(capture_index - 1) = capture; |
| + return capture; |
| + } else if (type == ':') { |
| + return body; |
| + } else { |
| + ASSERT(type == '=' || type == '!'); |
| + bool is_positive = (type == '='); |
| + return new RegExpLookahead(body, is_positive); |
| + } |
| +} |
| + |
| + |
| +CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class, |
| + ZoneList<CharacterRange>* ranges, |
| + bool* ok) { |
| + ASSERT_EQ(false, *is_char_class); |
| + uc32 first = current(); |
| + if (first == '\\') { |
| + switch (Next()) { |
| + case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { |
| + *is_char_class = true; |
| + uc32 c = Next(); |
| + CharacterRange::AddClassEscape(c, ranges); |
| + Advance(2); |
| + return NULL; |
| + } |
| + default: |
| + uc32 c = ParseClassCharacterEscape(CHECK_OK); |
| + return CharacterRange::Singleton(c); |
| + } |
| + } else { |
| + Advance(); |
| + return CharacterRange::Singleton(first); |
| + } |
| +} |
| + |
| + |
| +RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) { |
| + static const char* kUnterminated = "Unterminated character class"; |
| + static const char* kIllegal = "Illegal character class"; |
| + static const char* kRangeOutOfOrder = "Range out of order in character class"; |
| + |
| + ASSERT_EQ(current(), '['); |
| + Advance(); |
| + bool is_negated = false; |
| + if (current() == '^') { |
| + is_negated = true; |
| + Advance(); |
| + } |
| + ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); |
| + while (has_more() && current() != ']') { |
| + bool is_char_class = false; |
| + CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK); |
| + if (!is_char_class) { |
| + if (current() == '-') { |
| + Advance(); |
| + if (current() == kEndMarker) { |
| + // If we reach the end we break out of the loop and let the |
| + // following code report an error. |
| + break; |
| + } else if (current() == ']') { |
| + ranges->Add(first); |
| + ranges->Add(CharacterRange::Singleton('-')); |
| + break; |
| + } |
| + CharacterRange next = |
| + ParseClassAtom(&is_char_class, ranges, CHECK_OK); |
| + if (is_char_class) { |
| + return ReportError(CStrVector(kIllegal), CHECK_OK); |
| + } |
| + if (first.from() > next.to()) { |
| + return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK); |
| + } |
| + ranges->Add(CharacterRange::Range(first.from(), next.to())); |
| + } else { |
| + ranges->Add(first); |
| + } |
| + } |
| + } |
| + if (!has_more()) { |
| + return ReportError(CStrVector(kUnterminated), CHECK_OK); |
| + } |
| + Advance(); |
| + if (ranges->length() == 0) { |
| + ranges->Add(CharacterRange::Range(0, 0xffff)); |
| + is_negated = !is_negated; |
| + } |
| + return new RegExpCharacterClass(ranges, is_negated); |
| +} |
| + |
| + |
| +// ---------------------------------------------------------------------------- |
| // The Parser interface. |
| // MakeAST() is just a wrapper for the corresponding Parser calls |
| @@ -3211,6 +4290,27 @@ |
| } |
| +bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) { |
| + ASSERT(result != NULL); |
| + // Get multiline flag somehow |
|
Mads Ager (chromium)
2008/11/25 21:09:41
Is this a TODO or a comment on what the code is do
Erik Corry
2008/11/26 12:18:36
It's a TODO. Fixed.
|
| + RegExpParser parser(input, &result->error, false); |
| + bool ok = true; |
| + result->tree = parser.ParsePattern(&ok); |
| + if (!ok) { |
| + ASSERT(result->tree == NULL); |
| + ASSERT(!result->error.is_null()); |
| + } else { |
| + ASSERT(result->tree != NULL); |
| + ASSERT(result->error.is_null()); |
| + } |
| + if (ok) { |
| + result->has_character_escapes = parser.HasCharacterEscapes(); |
| + result->capture_count = parser.captures_started(); |
| + } |
| + return ok; |
| +} |
| + |
| + |
| FunctionLiteral* MakeAST(bool compile_in_global_context, |
| Handle<Script> script, |
| v8::Extension* extension, |