src/parsing/parser.cc - Issue 1565183002: [regexp] move regexp parser into own files.

Unified Diff: src/parsing/parser.cc

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: fix test compile Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/parsing/parser.cc

diff --git a/src/parsing/parser.cc b/src/parsing/parser.cc

index ee2a696d458b2441135e61a88fcb25f748e7db68..74fc00b7e480dbaf493282135560adae8b5698b0 100644

--- a/src/parsing/parser.cc

+++ b/src/parsing/parser.cc

@@ -95,162 +95,6 @@ ParseInfo::ParseInfo(Zone* zone, Handle<Script> script) : ParseInfo(zone) {

}

-RegExpBuilder::RegExpBuilder(Zone* zone)

- : zone_(zone),

- pending_empty_(false),

- characters_(NULL),

- terms_(),

- alternatives_()

-#ifdef DEBUG

- , last_added_(ADD_NONE)

-#endif

- {}

-void RegExpBuilder::FlushCharacters() {

- pending_empty_ = false;

- if (characters_ != NULL) {

- RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());

- characters_ = NULL;

- text_.Add(atom, zone());

- LAST(ADD_ATOM);

- }

-void RegExpBuilder::FlushText() {

- FlushCharacters();

- int num_text = text_.length();

- if (num_text == 0) {

- return;

- } else if (num_text == 1) {

- terms_.Add(text_.last(), zone());

- } else {

- RegExpText* text = new(zone()) RegExpText(zone());

- for (int i = 0; i < num_text; i++)

- text_.Get(i)->AppendToText(text, zone());

- terms_.Add(text, zone());

- }

- text_.Clear();

-void RegExpBuilder::AddCharacter(uc16 c) {

- pending_empty_ = false;

- if (characters_ == NULL) {

- characters_ = new(zone()) ZoneList<uc16>(4, zone());

- }

- characters_->Add(c, zone());

- LAST(ADD_CHAR);

-void RegExpBuilder::AddEmpty() {

- pending_empty_ = true;

-void RegExpBuilder::AddAtom(RegExpTree* term) {

- if (term->IsEmpty()) {

- AddEmpty();

- return;

- }

- if (term->IsTextElement()) {

- FlushCharacters();

- text_.Add(term, zone());

- } else {

- FlushText();

- terms_.Add(term, zone());

- }

- LAST(ADD_ATOM);

-void RegExpBuilder::AddAssertion(RegExpTree* assert) {

- FlushText();

- terms_.Add(assert, zone());

- LAST(ADD_ASSERT);

-void RegExpBuilder::NewAlternative() {

- FlushTerms();

-void RegExpBuilder::FlushTerms() {

- FlushText();

- int num_terms = terms_.length();

- RegExpTree* alternative;

- if (num_terms == 0) {

- alternative = new (zone()) RegExpEmpty();

- } else if (num_terms == 1) {

- alternative = terms_.last();

- } else {

- alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));

- }

- alternatives_.Add(alternative, zone());

- terms_.Clear();

- LAST(ADD_NONE);

-RegExpTree* RegExpBuilder::ToRegExp() {

- FlushTerms();

- int num_alternatives = alternatives_.length();

- if (num_alternatives == 0) return new (zone()) RegExpEmpty();

- if (num_alternatives == 1) return alternatives_.last();

- return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));

-void RegExpBuilder::AddQuantifierToAtom(

- int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

- if (pending_empty_) {

- pending_empty_ = false;

- return;

- }

- RegExpTree* atom;

- if (characters_ != NULL) {

- DCHECK(last_added_ == ADD_CHAR);

- // Last atom was character.

- Vector<const uc16> char_vector = characters_->ToConstVector();

- int num_chars = char_vector.length();

- if (num_chars > 1) {

- Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

- text_.Add(new(zone()) RegExpAtom(prefix), zone());

- char_vector = char_vector.SubVector(num_chars - 1, num_chars);

- }

- characters_ = NULL;

- atom = new(zone()) RegExpAtom(char_vector);

- FlushText();

- } else if (text_.length() > 0) {

- DCHECK(last_added_ == ADD_ATOM);

- atom = text_.RemoveLast();

- FlushText();

- } else if (terms_.length() > 0) {

- DCHECK(last_added_ == ADD_ATOM);

- atom = terms_.RemoveLast();

- if (atom->max_match() == 0) {

- // Guaranteed to only match an empty string.

- LAST(ADD_TERM);

- if (min == 0) {

- return;

- }

- terms_.Add(atom, zone());

- return;

- }

- } else {

- // Only call immediately after adding an atom or character!

- UNREACHABLE();

- return;

- }

- terms_.Add(

- new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());

- LAST(ADD_TERM);

FunctionEntry ParseData::GetFunctionEntry(int start) {

// The current pre-data entry must be a FunctionEntry with the given

// start position.

@@ -5198,962 +5042,8 @@ void Parser::Internalize(Isolate* isolate, Handle<Script> script, bool error) {

// ----------------------------------------------------------------------------

-// Regular expressions

-RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

- bool multiline, bool unicode, Isolate* isolate,

- Zone* zone)

- : isolate_(isolate),

- zone_(zone),

- error_(error),

- captures_(NULL),

- in_(in),

- current_(kEndMarker),

- next_pos_(0),

- captures_started_(0),

- capture_count_(0),

- has_more_(true),

- multiline_(multiline),

- unicode_(unicode),

- simple_(false),

- contains_anchor_(false),

- is_scanned_for_captures_(false),

- failed_(false) {

- Advance();

-uc32 RegExpParser::Next() {

- if (has_next()) {

- return in()->Get(next_pos_);

- } else {

- return kEndMarker;

- }

-void RegExpParser::Advance() {

- if (next_pos_ < in()->length()) {

- StackLimitCheck check(isolate());

- if (check.HasOverflowed()) {

- ReportError(CStrVector(Isolate::kStackOverflowMessage));

- } else if (zone()->excess_allocation()) {

- ReportError(CStrVector("Regular expression too large"));

- } else {

- current_ = in()->Get(next_pos_);

- next_pos_++;

- }

- } else {

- current_ = kEndMarker;

- // Advance so that position() points to 1-after-the-last-character. This is

- // important so that Reset() to this position works correctly.

- next_pos_ = in()->length() + 1;

- has_more_ = false;

- }

-void RegExpParser::Reset(int pos) {

- next_pos_ = pos;

- has_more_ = (pos < in()->length());

- Advance();

-void RegExpParser::Advance(int dist) {

- next_pos_ += dist - 1;

- Advance();

-bool RegExpParser::simple() {

- return simple_;

-bool RegExpParser::IsSyntaxCharacter(uc32 c) {

- return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||

- c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||

- c == '{' || c == '}' || c == '|';

-RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

- failed_ = true;

- *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();

- // Zip to the end to make sure the no more input is read.

- current_ = kEndMarker;

- next_pos_ = in()->length();

- return NULL;

-// Pattern ::

-// Disjunction

-RegExpTree* RegExpParser::ParsePattern() {

- RegExpTree* result = ParseDisjunction(CHECK_FAILED);

- DCHECK(!has_more());

- // If the result of parsing is a literal string atom, and it has the

- // same length as the input, then the atom is identical to the input.

- if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {

- simple_ = true;

- }

- return result;

-// Disjunction ::

-// Alternative

-// Alternative | Disjunction

-// Alternative ::

-// [empty]

-// Term Alternative

-// Term ::

-// Assertion

-// Atom

-// Atom Quantifier

-RegExpTree* RegExpParser::ParseDisjunction() {

- // Used to store current state while parsing subexpressions.

- RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

- zone());

- RegExpParserState* state = &initial_state;

- // Cache the builder in a local variable for quick access.

- RegExpBuilder* builder = initial_state.builder();

- while (true) {

- switch (current()) {

- case kEndMarker:

- if (state->IsSubexpression()) {

- // Inside a parenthesized group when hitting end of input.

- ReportError(CStrVector("Unterminated group") CHECK_FAILED);

- }

- DCHECK_EQ(INITIAL, state->group_type());

- // Parsing completed successfully.

- return builder->ToRegExp();

- case ')': {

- if (!state->IsSubexpression()) {

- ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);

- }

- DCHECK_NE(INITIAL, state->group_type());

- Advance();

- // End disjunction parsing and convert builder content to new single

- // regexp atom.

- RegExpTree* body = builder->ToRegExp();

- int end_capture_index = captures_started();

- int capture_index = state->capture_index();

- SubexpressionType group_type = state->group_type();

- // Build result of subexpression.

- if (group_type == CAPTURE) {

- RegExpCapture* capture = GetCapture(capture_index);

- capture->set_body(body);

- body = capture;

- } else if (group_type != GROUPING) {

- DCHECK(group_type == POSITIVE_LOOKAROUND ||

- group_type == NEGATIVE_LOOKAROUND);

- bool is_positive = (group_type == POSITIVE_LOOKAROUND);

- body = new (zone()) RegExpLookaround(

- body, is_positive, end_capture_index - capture_index, capture_index,

- state->lookaround_type());

- }

- // Restore previous state.

- state = state->previous_state();

- builder = state->builder();

- builder->AddAtom(body);

- // For compatability with JSC and ES3, we allow quantifiers after

- // lookaheads, and break in all cases.

- break;

- }

- case '|': {

- Advance();

- builder->NewAlternative();

- continue;

- }

- case '*':

- case '+':

- case '?':

- return ReportError(CStrVector("Nothing to repeat"));

- case '^': {

- Advance();

- if (multiline_) {

- builder->AddAssertion(

- new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

- } else {

- builder->AddAssertion(

- new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

- set_contains_anchor();

- }

- continue;

- }

- case '$': {

- Advance();

- RegExpAssertion::AssertionType assertion_type =

- multiline_ ? RegExpAssertion::END_OF_LINE :

- RegExpAssertion::END_OF_INPUT;

- builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));

- continue;

- }

- case '.': {

- Advance();

- // everything except \x0a, \x0d, \u2028 and \u2029

- ZoneList<CharacterRange>* ranges =

- new(zone()) ZoneList<CharacterRange>(2, zone());

- CharacterRange::AddClassEscape('.', ranges, zone());

- RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);

- builder->AddAtom(atom);

- break;

- }

- case '(': {

- SubexpressionType subexpr_type = CAPTURE;

- RegExpLookaround::Type lookaround_type = state->lookaround_type();

- Advance();

- if (current() == '?') {

- switch (Next()) {

- case ':':

- subexpr_type = GROUPING;

- break;

- case '=':

- lookaround_type = RegExpLookaround::LOOKAHEAD;

- subexpr_type = POSITIVE_LOOKAROUND;

- break;

- case '!':

- lookaround_type = RegExpLookaround::LOOKAHEAD;

- subexpr_type = NEGATIVE_LOOKAROUND;

- break;

- case '<':

- if (FLAG_harmony_regexp_lookbehind) {

- Advance();

- lookaround_type = RegExpLookaround::LOOKBEHIND;

- if (Next() == '=') {

- subexpr_type = POSITIVE_LOOKAROUND;

- break;

- } else if (Next() == '!') {

- subexpr_type = NEGATIVE_LOOKAROUND;

- break;

- }

- // Fall through.

- default:

- ReportError(CStrVector("Invalid group") CHECK_FAILED);

- break;

- }

- Advance(2);

- } else {

- if (captures_started_ >= kMaxCaptures) {

- ReportError(CStrVector("Too many captures") CHECK_FAILED);

- }

- captures_started_++;

- }

- // Store current state and begin new disjunction parsing.

- state = new (zone()) RegExpParserState(

- state, subexpr_type, lookaround_type, captures_started_, zone());

- builder = state->builder();

- continue;

- }

- case '[': {

- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

- builder->AddAtom(atom);

- break;

- }

- // Atom ::

- // \ AtomEscape

- case '\\':

- switch (Next()) {

- case kEndMarker:

- return ReportError(CStrVector("\\ at end of pattern"));

- case 'b':

- Advance(2);

- builder->AddAssertion(

- new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));

- continue;

- case 'B':

- Advance(2);

- builder->AddAssertion(

- new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

- continue;

- // AtomEscape ::

- // CharacterClassEscape

- //

- // CharacterClassEscape :: one of

- // d D s S w W

- case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

- uc32 c = Next();

- Advance(2);

- ZoneList<CharacterRange>* ranges =

- new(zone()) ZoneList<CharacterRange>(2, zone());

- CharacterRange::AddClassEscape(c, ranges, zone());

- RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);

- builder->AddAtom(atom);

- break;

- }

- case '1': case '2': case '3': case '4': case '5': case '6':

- case '7': case '8': case '9': {

- int index = 0;

- if (ParseBackReferenceIndex(&index)) {

- if (state->IsInsideCaptureGroup(index)) {

- // The back reference is inside the capture group it refers to.

- // Nothing can possibly have been captured yet, so we use empty

- // instead. This ensures that, when checking a back reference,

- // the capture registers of the referenced capture are either

- // both set or both cleared.

- builder->AddEmpty();

- } else {

- RegExpCapture* capture = GetCapture(index);

- RegExpTree* atom = new (zone()) RegExpBackReference(capture);

- builder->AddAtom(atom);

- }

- break;

- }

- uc32 first_digit = Next();

- if (first_digit == '8' || first_digit == '9') {

- // If the 'u' flag is present, only syntax characters can be escaped,

- // no other identity escapes are allowed. If the 'u' flag is not

- // present, all identity escapes are allowed.

- if (!FLAG_harmony_unicode_regexps || !unicode_) {

- builder->AddCharacter(first_digit);

- Advance(2);

- } else {

- return ReportError(CStrVector("Invalid escape"));

- }

- break;

- }

- // FALLTHROUGH

- case '0': {

- Advance();

- uc32 octal = ParseOctalLiteral();

- builder->AddCharacter(octal);

- break;

- }

- // ControlEscape :: one of

- // f n r t v

- case 'f':

- Advance(2);

- builder->AddCharacter('\f');

- break;

- case 'n':

- Advance(2);

- builder->AddCharacter('\n');

- break;

- case 'r':

- Advance(2);

- builder->AddCharacter('\r');

- break;

- case 't':

- Advance(2);

- builder->AddCharacter('\t');

- break;

- case 'v':

- Advance(2);

- builder->AddCharacter('\v');

- break;

- case 'c': {

- Advance();

- uc32 controlLetter = Next();

- // Special case if it is an ASCII letter.

- // Convert lower case letters to uppercase.

- uc32 letter = controlLetter & ~('a' ^ 'A');

- if (letter < 'A' || 'Z' < letter) {

- // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

- // This is outside the specification. We match JSC in

- // reading the backslash as a literal character instead

- // of as starting an escape.

- builder->AddCharacter('\\');

- } else {

- Advance(2);

- builder->AddCharacter(controlLetter & 0x1f);

- }

- break;

- }

- case 'x': {

- Advance(2);

- uc32 value;

- if (ParseHexEscape(2, &value)) {

- builder->AddCharacter(value);

- } else if (!FLAG_harmony_unicode_regexps || !unicode_) {

- builder->AddCharacter('x');

- } else {

- // If the 'u' flag is present, invalid escapes are not treated as

- // identity escapes.

- return ReportError(CStrVector("Invalid escape"));

- }

- break;

- }

- case 'u': {

- Advance(2);

- uc32 value;

- if (ParseUnicodeEscape(&value)) {

- builder->AddCharacter(value);

- } else if (!FLAG_harmony_unicode_regexps || !unicode_) {

- builder->AddCharacter('u');

- } else {

- // If the 'u' flag is present, invalid escapes are not treated as

- // identity escapes.

- return ReportError(CStrVector("Invalid unicode escape"));

- }

- break;

- }

- default:

- Advance();

- // If the 'u' flag is present, only syntax characters can be escaped, no

- // other identity escapes are allowed. If the 'u' flag is not present,

- // all identity escapes are allowed.

- if (!FLAG_harmony_unicode_regexps || !unicode_ ||

- IsSyntaxCharacter(current())) {

- builder->AddCharacter(current());

- Advance();

- } else {

- return ReportError(CStrVector("Invalid escape"));

- }

- break;

- }

- break;

- case '{': {

- int dummy;

- if (ParseIntervalQuantifier(&dummy, &dummy)) {

- ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

- }

- // fallthrough

- }

- default:

- builder->AddCharacter(current());

- Advance();

- break;

- } // end switch(current())

- int min;

- int max;

- switch (current()) {

- // QuantifierPrefix ::

- // *

- // +

- // ?

- // {

- case '*':

- min = 0;

- max = RegExpTree::kInfinity;

- Advance();

- break;

- case '+':

- min = 1;

- max = RegExpTree::kInfinity;

- Advance();

- break;

- case '?':

- min = 0;

- max = 1;

- Advance();

- break;

- case '{':

- if (ParseIntervalQuantifier(&min, &max)) {

- if (max < min) {

- ReportError(CStrVector("numbers out of order in {} quantifier.")

- CHECK_FAILED);

- }

- break;

- } else {

- continue;

- }

- default:

- continue;

- }

- RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

- if (current() == '?') {

- quantifier_type = RegExpQuantifier::NON_GREEDY;

- Advance();

- } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

- // FLAG_regexp_possessive_quantifier is a debug-only flag.

- quantifier_type = RegExpQuantifier::POSSESSIVE;

- Advance();

- }

- builder->AddQuantifierToAtom(min, max, quantifier_type);

- }

-#ifdef DEBUG

-// Currently only used in an DCHECK.

-static bool IsSpecialClassEscape(uc32 c) {

- switch (c) {

- case 'd': case 'D':

- case 's': case 'S':

- case 'w': case 'W':

- return true;

- default:

- return false;

- }

-#endif

-// In order to know whether an escape is a backreference or not we have to scan

-// the entire regexp and find the number of capturing parentheses. However we

-// don't want to scan the regexp twice unless it is necessary. This mini-parser

-// is called when needed. It can see the difference between capturing and

-// noncapturing parentheses and can skip character classes and backslash-escaped

-// characters.

-void RegExpParser::ScanForCaptures() {

- // Start with captures started previous to current position

- int capture_count = captures_started();

- // Add count of captures after this position.

- int n;

- while ((n = current()) != kEndMarker) {

- Advance();

- switch (n) {

- case '\\':

- Advance();

- break;

- case '[': {

- int c;

- while ((c = current()) != kEndMarker) {

- Advance();

- if (c == '\\') {

- Advance();

- } else {

- if (c == ']') break;

- }

- break;

- }

- case '(':

- if (current() != '?') capture_count++;

- break;

- }

- capture_count_ = capture_count;

- is_scanned_for_captures_ = true;

-bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

- DCHECK_EQ('\\', current());

- DCHECK('1' <= Next() && Next() <= '9');

- // Try to parse a decimal literal that is no greater than the total number

- // of left capturing parentheses in the input.

- int start = position();

- int value = Next() - '0';

- Advance(2);

- while (true) {

- uc32 c = current();

- if (IsDecimalDigit(c)) {

- value = 10 * value + (c - '0');

- if (value > kMaxCaptures) {

- Reset(start);

- return false;

- }

- Advance();

- } else {

- break;

- }

- if (value > captures_started()) {

- if (!is_scanned_for_captures_) {

- int saved_position = position();

- ScanForCaptures();

- Reset(saved_position);

- }

- if (value > capture_count_) {

- Reset(start);

- return false;

- }

- *index_out = value;

- return true;

-RegExpCapture* RegExpParser::GetCapture(int index) {

- // The index for the capture groups are one-based. Its index in the list is

- // zero-based.

- int know_captures =

- is_scanned_for_captures_ ? capture_count_ : captures_started_;

- DCHECK(index <= know_captures);

- if (captures_ == NULL) {

- captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());

- }

- while (captures_->length() < know_captures) {

- captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());

- }

- return captures_->at(index - 1);

-bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

- for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

- if (s->group_type() != CAPTURE) continue;

- // Return true if we found the matching capture index.

- if (index == s->capture_index()) return true;

- // Abort if index is larger than what has been parsed up till this state.

- if (index > s->capture_index()) return false;

- }

- return false;

-// QuantifierPrefix ::

-// { DecimalDigits }

-// { DecimalDigits , }

-// { DecimalDigits , DecimalDigits }

-//

-// Returns true if parsing succeeds, and set the min_out and max_out

-// values. Values are truncated to RegExpTree::kInfinity if they overflow.

-bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {

- DCHECK_EQ(current(), '{');

- int start = position();

- Advance();

- int min = 0;

- if (!IsDecimalDigit(current())) {

- Reset(start);

- return false;

- }

- while (IsDecimalDigit(current())) {

- int next = current() - '0';

- if (min > (RegExpTree::kInfinity - next) / 10) {

- // Overflow. Skip past remaining decimal digits and return -1.

- do {

- Advance();

- } while (IsDecimalDigit(current()));

- min = RegExpTree::kInfinity;

- break;

- }

- min = 10 * min + next;

- Advance();

- }

- int max = 0;

- if (current() == '}') {

- max = min;

- Advance();

- } else if (current() == ',') {

- Advance();

- if (current() == '}') {

- max = RegExpTree::kInfinity;

- Advance();

- } else {

- while (IsDecimalDigit(current())) {

- int next = current() - '0';

- if (max > (RegExpTree::kInfinity - next) / 10) {

- do {

- Advance();

- } while (IsDecimalDigit(current()));

- max = RegExpTree::kInfinity;

- break;

- }

- max = 10 * max + next;

- Advance();

- }

- if (current() != '}') {

- Reset(start);

- return false;

- }

- Advance();

- }

- } else {

- Reset(start);

- return false;

- }

- *min_out = min;

- *max_out = max;

- return true;

-uc32 RegExpParser::ParseOctalLiteral() {

- DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);

- // For compatibility with some other browsers (not all), we parse

- // up to three octal digits with a value below 256.

- uc32 value = current() - '0';

- Advance();

- if ('0' <= current() && current() <= '7') {

- value = value * 8 + current() - '0';

- Advance();

- if (value < 32 && '0' <= current() && current() <= '7') {

- value = value * 8 + current() - '0';

- Advance();

- }

- return value;

-bool RegExpParser::ParseHexEscape(int length, uc32* value) {

- int start = position();

- uc32 val = 0;

- for (int i = 0; i < length; ++i) {

- uc32 c = current();

- int d = HexValue(c);

- if (d < 0) {

- Reset(start);

- return false;

- }

- val = val * 16 + d;

- Advance();

- }

- *value = val;

- return true;

-bool RegExpParser::ParseUnicodeEscape(uc32* value) {

- // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

- // allowed). In the latter case, the number of hex digits between { } is

- // arbitrary. \ and u have already been read.

- if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) {

- int start = position();

- Advance();

- if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

- if (current() == '}') {

- Advance();

- return true;

- }

- Reset(start);

- return false;

- }

- // \u but no {, or \u{...} escapes not allowed.

- return ParseHexEscape(4, value);

-bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

- uc32 x = 0;

- int d = HexValue(current());

- if (d < 0) {

- return false;

- }

- while (d >= 0) {

- x = x * 16 + d;

- if (x > max_value) {

- return false;

- }

- Advance();

- d = HexValue(current());

- }

- *value = x;

- return true;

-uc32 RegExpParser::ParseClassCharacterEscape() {

- DCHECK(current() == '\\');

- DCHECK(has_next() && !IsSpecialClassEscape(Next()));

- Advance();

- switch (current()) {

- case 'b':

- Advance();

- return '\b';

- // ControlEscape :: one of

- // f n r t v

- case 'f':

- Advance();

- return '\f';

- case 'n':

- Advance();

- return '\n';

- case 'r':

- Advance();

- return '\r';

- case 't':

- Advance();

- return '\t';

- case 'v':

- Advance();

- return '\v';

- case 'c': {

- uc32 controlLetter = Next();

- uc32 letter = controlLetter & ~('A' ^ 'a');

- // For compatibility with JSC, inside a character class

- // we also accept digits and underscore as control characters.

- if ((controlLetter >= '0' && controlLetter <= '9') ||

- controlLetter == '_' ||

- (letter >= 'A' && letter <= 'Z')) {

- Advance(2);

- // Control letters mapped to ASCII control characters in the range

- // 0x00-0x1f.

- return controlLetter & 0x1f;

- }

- // We match JSC in reading the backslash as a literal

- // character instead of as starting an escape.

- return '\\';

- }

- case '0': case '1': case '2': case '3': case '4': case '5':

- case '6': case '7':

- // For compatibility, we interpret a decimal escape that isn't

- // a back reference (and therefore either \0 or not valid according

- // to the specification) as a 1..3 digit octal character code.

- return ParseOctalLiteral();

- case 'x': {

- Advance();

- uc32 value;

- if (ParseHexEscape(2, &value)) {

- return value;

- }

- if (!FLAG_harmony_unicode_regexps || !unicode_) {

- // If \x is not followed by a two-digit hexadecimal, treat it

- // as an identity escape.

- return 'x';

- }

- // If the 'u' flag is present, invalid escapes are not treated as

- // identity escapes.

- ReportError(CStrVector("Invalid escape"));

- return 0;

- }

- case 'u': {

- Advance();

- uc32 value;

- if (ParseUnicodeEscape(&value)) {

- return value;

- }

- if (!FLAG_harmony_unicode_regexps || !unicode_) {

- return 'u';

- }

- // If the 'u' flag is present, invalid escapes are not treated as

- // identity escapes.

- ReportError(CStrVector("Invalid unicode escape"));

- return 0;

- }

- default: {

- uc32 result = current();

- // If the 'u' flag is present, only syntax characters can be escaped, no

- // other identity escapes are allowed. If the 'u' flag is not present, all

- // identity escapes are allowed.

- if (!FLAG_harmony_unicode_regexps || !unicode_ ||

- IsSyntaxCharacter(result)) {

- Advance();

- return result;

- }

- ReportError(CStrVector("Invalid escape"));

- return 0;

- }

- return 0;

-CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

- DCHECK_EQ(0, *char_class);

- uc32 first = current();

- if (first == '\\') {

- switch (Next()) {

- case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

- *char_class = Next();

- Advance(2);

- return CharacterRange::Singleton(0); // Return dummy value.

- }

- case kEndMarker:

- return ReportError(CStrVector("\\ at end of pattern"));

- default:

- uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

- return CharacterRange::Singleton(c);

- }

- } else {

- Advance();

- return CharacterRange::Singleton(first);

- }

-static const uc16 kNoCharClass = 0;

-// Adds range or pre-defined character class to character ranges.

-// If char_class is not kInvalidClass, it's interpreted as a class

-// escape (i.e., 's' means whitespace, from '\s').

-static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

- uc16 char_class,

- CharacterRange range,

- Zone* zone) {

- if (char_class != kNoCharClass) {

- CharacterRange::AddClassEscape(char_class, ranges, zone);

- } else {

- ranges->Add(range, zone);

- }

-RegExpTree* RegExpParser::ParseCharacterClass() {

- static const char* kUnterminated = "Unterminated character class";

- static const char* kRangeOutOfOrder = "Range out of order in character class";

- DCHECK_EQ(current(), '[');

- Advance();

- bool is_negated = false;

- if (current() == '^') {

- is_negated = true;

- Advance();

- }

- ZoneList<CharacterRange>* ranges =

- new(zone()) ZoneList<CharacterRange>(2, zone());

- while (has_more() && current() != ']') {

- uc16 char_class = kNoCharClass;

- CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);

- if (current() == '-') {

- Advance();

- if (current() == kEndMarker) {

- // If we reach the end we break out of the loop and let the

- // following code report an error.

- break;

- } else if (current() == ']') {

- AddRangeOrEscape(ranges, char_class, first, zone());

- ranges->Add(CharacterRange::Singleton('-'), zone());

- break;

- }

- uc16 char_class_2 = kNoCharClass;

- CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);

- if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {

- // Either end is an escaped character class. Treat the '-' verbatim.

- AddRangeOrEscape(ranges, char_class, first, zone());

- ranges->Add(CharacterRange::Singleton('-'), zone());

- AddRangeOrEscape(ranges, char_class_2, next, zone());

- continue;

- }

- if (first.from() > next.to()) {

- return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);

- }

- ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());

- } else {

- AddRangeOrEscape(ranges, char_class, first, zone());

- }

- if (!has_more()) {

- return ReportError(CStrVector(kUnterminated) CHECK_FAILED);

- }

- Advance();

- if (ranges->length() == 0) {

- ranges->Add(CharacterRange::Everything(), zone());

- is_negated = !is_negated;

- }

- return new (zone()) RegExpCharacterClass(ranges, is_negated);

-// ----------------------------------------------------------------------------

// The Parser interface.

-bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

- FlatStringReader* input, bool multiline,

- bool unicode, RegExpCompileData* result) {

- DCHECK(result != NULL);

- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

- RegExpTree* tree = parser.ParsePattern();

- if (parser.failed()) {

- DCHECK(tree == NULL);

- DCHECK(!result->error.is_null());

- } else {

- DCHECK(tree != NULL);

- DCHECK(result->error.is_null());

- result->tree = tree;

- int capture_count = parser.captures_started();

- result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

- result->contains_anchor = parser.contains_anchor();

- result->capture_count = capture_count;

- }

- return !parser.failed();

bool Parser::ParseStatic(ParseInfo* info) {

Parser parser(info);

« no previous file with comments | « src/parsing/parser.h ('k') | src/parsing/scanner.h » ('j') | no next file with comments »