| Index: src/regexp/regexp-parser.cc
|
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
|
| index 77a741f1e8544a43abc48204bf058d66ba4fcc11..91c14cce497bd285e33bcd4896589b08bbe06437 100644
|
| --- a/src/regexp/regexp-parser.cc
|
| +++ b/src/regexp/regexp-parser.cc
|
| @@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) {
|
|
|
| bool RegExpParser::simple() { return simple_; }
|
|
|
| -
|
| -bool RegExpParser::IsSyntaxCharacter(uc32 c) {
|
| - return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
|
| - c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
|
| - c == '{' || c == '}' || c == '|';
|
| +bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
|
| +  switch (c) {  // True for a syntax character or '/', false otherwise.
|
| +    case '^':
|
| +    case '$':
|
| +    case '\\':
|
| +    case '.':
|
| +    case '*':
|
| +    case '+':
|
| +    case '?':
|
| +    case '(':
|
| +    case ')':
|
| +    case '[':
|
| +    case ']':
|
| +    case '{':
|
| +    case '}':
|
| +    case '|':
|
| +    case '/':  // The "OrSlash" part; the rest are the syntax characters.
|
| +      return true;
|
| +    default:
|
| +      break;
|
| +  }
|
| +  return false;  // Any character not listed above.
|
| }
|
|
|
|
|
| @@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| case kEndMarker:
|
| if (state->IsSubexpression()) {
|
| // Inside a parenthesized group when hitting end of input.
|
| - ReportError(CStrVector("Unterminated group") CHECK_FAILED);
|
| + return ReportError(CStrVector("Unterminated group"));
|
| }
|
| DCHECK_EQ(INITIAL, state->group_type());
|
| // Parsing completed successfully.
|
| return builder->ToRegExp();
|
| case ')': {
|
| if (!state->IsSubexpression()) {
|
| - ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
|
| + return ReportError(CStrVector("Unmatched ')'"));
|
| }
|
| DCHECK_NE(INITIAL, state->group_type());
|
|
|
| @@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| }
|
| // Fall through.
|
| default:
|
| - ReportError(CStrVector("Invalid group") CHECK_FAILED);
|
| - break;
|
| + return ReportError(CStrVector("Invalid group"));
|
| }
|
| Advance(2);
|
| } else {
|
| if (captures_started_ >= kMaxCaptures) {
|
| - ReportError(CStrVector("Too many captures") CHECK_FAILED);
|
| + return ReportError(CStrVector("Too many captures"));
|
| }
|
| captures_started_++;
|
| }
|
| @@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| }
|
| break;
|
| }
|
| + // With /u, no identity escapes except for syntax characters
|
| + // are allowed. Otherwise, all identity escapes are allowed.
|
| + if (unicode()) {
|
| + return ReportError(CStrVector("Invalid escape"));
|
| + }
|
| uc32 first_digit = Next();
|
| if (first_digit == '8' || first_digit == '9') {
|
| - // If the 'u' flag is present, only syntax characters can be
|
| - // escaped,
|
| - // no other identity escapes are allowed. If the 'u' flag is not
|
| - // present, all identity escapes are allowed.
|
| - if (!unicode()) {
|
| - builder->AddCharacter(first_digit);
|
| - Advance(2);
|
| - } else {
|
| - return ReportError(CStrVector("Invalid escape"));
|
| - }
|
| + builder->AddCharacter(first_digit);
|
| + Advance(2);
|
| break;
|
| }
|
| }
|
| // FALLTHROUGH
|
| case '0': {
|
| Advance();
|
| + if (unicode() && Next() >= '0' && Next() <= '9') {
|
| +          // With /u, decimal escapes with a leading 0 are not parsed as octal.
|
| + return ReportError(CStrVector("Invalid decimal escape"));
|
| + }
|
| uc32 octal = ParseOctalLiteral();
|
| builder->AddCharacter(octal);
|
| break;
|
| @@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| // This is outside the specification. We match JSC in
|
| // reading the backslash as a literal character instead
|
| // of as starting an escape.
|
| + if (unicode()) {
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| + return ReportError(CStrVector("Invalid unicode escape"));
|
| + }
|
| builder->AddCharacter('\\');
|
| } else {
|
| Advance(2);
|
| @@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| } else if (!unicode()) {
|
| builder->AddCharacter('x');
|
| } else {
|
| - // If the 'u' flag is present, invalid escapes are not treated as
|
| - // identity escapes.
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| return ReportError(CStrVector("Invalid escape"));
|
| }
|
| break;
|
| @@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| } else if (!unicode()) {
|
| builder->AddCharacter('u');
|
| } else {
|
| - // If the 'u' flag is present, invalid escapes are not treated as
|
| - // identity escapes.
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| return ReportError(CStrVector("Invalid unicode escape"));
|
| }
|
| break;
|
| }
|
| default:
|
| Advance();
|
| - // If the 'u' flag is present, only syntax characters can be
|
| - // escaped, no
|
| - // other identity escapes are allowed. If the 'u' flag is not
|
| - // present,
|
| - // all identity escapes are allowed.
|
| - if (!unicode() || IsSyntaxCharacter(current())) {
|
| + // With /u, no identity escapes except for syntax characters
|
| + // are allowed. Otherwise, all identity escapes are allowed.
|
| + if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
|
| builder->AddCharacter(current());
|
| Advance();
|
| } else {
|
| @@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| case '{': {
|
| int dummy;
|
| if (ParseIntervalQuantifier(&dummy, &dummy)) {
|
| - ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
|
| + return ReportError(CStrVector("Nothing to repeat"));
|
| }
|
| // fallthrough
|
| }
|
| + case '}':
|
| + case ']':
|
| + if (unicode()) {
|
| + return ReportError(CStrVector("Lone quantifier brackets"));
|
| + }
|
| + // fallthrough
|
| default:
|
| builder->AddUnicodeCharacter(current());
|
| Advance();
|
| @@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| case '{':
|
| if (ParseIntervalQuantifier(&min, &max)) {
|
| if (max < min) {
|
| - ReportError(CStrVector("numbers out of order in {} quantifier.")
|
| - CHECK_FAILED);
|
| + return ReportError(
|
| + CStrVector("numbers out of order in {} quantifier"));
|
| }
|
| break;
|
| - } else {
|
| - continue;
|
| + } else if (unicode()) {
|
| + // With /u, incomplete quantifiers are not allowed.
|
| + return ReportError(CStrVector("Incomplete quantifier"));
|
| }
|
| + continue;
|
| default:
|
| continue;
|
| }
|
| @@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| quantifier_type = RegExpQuantifier::POSSESSIVE;
|
| Advance();
|
| }
|
| - builder->AddQuantifierToAtom(min, max, quantifier_type);
|
| + if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
|
| + return ReportError(CStrVector("Invalid quantifier"));
|
| + }
|
| }
|
| }
|
|
|
| @@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
| case 'c': {
|
| uc32 controlLetter = Next();
|
| uc32 letter = controlLetter & ~('A' ^ 'a');
|
| - // For compatibility with JSC, inside a character class
|
| - // we also accept digits and underscore as control characters.
|
| - if ((controlLetter >= '0' && controlLetter <= '9') ||
|
| - controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
|
| +      // For compatibility with JSC, inside a character class we also accept
|
| +      // digits and underscore as control characters, unless with /u.
|
| + if (letter >= 'A' && letter <= 'Z') {
|
| Advance(2);
|
| // Control letters mapped to ASCII control characters in the range
|
| // 0x00-0x1f.
|
| return controlLetter & 0x1f;
|
| }
|
| + if (unicode()) {
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| + ReportError(CStrVector("Invalid class escape"));
|
| + return 0;
|
| + }
|
| + if ((controlLetter >= '0' && controlLetter <= '9') ||
|
| + controlLetter == '_') {
|
| + Advance(2);
|
| + return controlLetter & 0x1f;
|
| + }
|
| // We match JSC in reading the backslash as a literal
|
| // character instead of as starting an escape.
|
| return '\\';
|
| @@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
| // For compatibility, we interpret a decimal escape that isn't
|
| // a back reference (and therefore either \0 or not valid according
|
| // to the specification) as a 1..3 digit octal character code.
|
| + if (unicode()) {
|
| + // With /u, decimal escape is not interpreted as octal character code.
|
| + ReportError(CStrVector("Invalid class escape"));
|
| + return 0;
|
| + }
|
| return ParseOctalLiteral();
|
| case 'x': {
|
| Advance();
|
| uc32 value;
|
| - if (ParseHexEscape(2, &value)) {
|
| - return value;
|
| - }
|
| - if (!unicode()) {
|
| - // If \x is not followed by a two-digit hexadecimal, treat it
|
| - // as an identity escape.
|
| - return 'x';
|
| + if (ParseHexEscape(2, &value)) return value;
|
| + if (unicode()) {
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| + ReportError(CStrVector("Invalid escape"));
|
| + return 0;
|
| }
|
| - // If the 'u' flag is present, invalid escapes are not treated as
|
| - // identity escapes.
|
| - ReportError(CStrVector("Invalid escape"));
|
| - return 0;
|
| + // If \x is not followed by a two-digit hexadecimal, treat it
|
| + // as an identity escape.
|
| + return 'x';
|
| }
|
| case 'u': {
|
| Advance();
|
| uc32 value;
|
| - if (ParseUnicodeEscape(&value)) {
|
| - return value;
|
| - }
|
| - if (!unicode()) {
|
| - return 'u';
|
| + if (ParseUnicodeEscape(&value)) return value;
|
| + if (unicode()) {
|
| + // With /u, invalid escapes are not treated as identity escapes.
|
| + ReportError(CStrVector("Invalid unicode escape"));
|
| + return 0;
|
| }
|
| - // If the 'u' flag is present, invalid escapes are not treated as
|
| - // identity escapes.
|
| - ReportError(CStrVector("Invalid unicode escape"));
|
| - return 0;
|
| +      // If \u is not followed by a valid unicode escape sequence, treat it
|
| +      // as an identity escape.
|
| + return 'u';
|
| }
|
| default: {
|
| uc32 result = current();
|
| - // If the 'u' flag is present, only syntax characters can be escaped, no
|
| - // other identity escapes are allowed. If the 'u' flag is not present, all
|
| - // identity escapes are allowed.
|
| - if (!unicode() || IsSyntaxCharacter(result)) {
|
| + // With /u, no identity escapes except for syntax characters are
|
| + // allowed. Otherwise, all identity escapes are allowed.
|
| + if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
|
| Advance();
|
| return result;
|
| }
|
| @@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
|
|
|
| RegExpTree* RegExpParser::ParseCharacterClass() {
|
| static const char* kUnterminated = "Unterminated character class";
|
| + static const char* kRangeInvalid = "Invalid character class";
|
| static const char* kRangeOutOfOrder = "Range out of order in character class";
|
|
|
| DCHECK_EQ(current(), '[');
|
| @@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
|
| CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
|
| if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
|
| // Either end is an escaped character class. Treat the '-' verbatim.
|
| + if (unicode()) {
|
| + // ES2015 21.2.2.15.1 step 1.
|
| + return ReportError(CStrVector(kRangeInvalid));
|
| + }
|
| AddRangeOrEscape(ranges, char_class, first, zone());
|
| ranges->Add(CharacterRange::Singleton('-'), zone());
|
| AddRangeOrEscape(ranges, char_class_2, next, zone());
|
| continue;
|
| }
|
| + // ES2015 21.2.2.15.1 step 6.
|
| if (first.from() > next.to()) {
|
| - return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
|
| + return ReportError(CStrVector(kRangeOutOfOrder));
|
| }
|
| ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
|
| } else {
|
| @@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
|
| }
|
| }
|
| if (!has_more()) {
|
| - return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
|
| + return ReportError(CStrVector(kUnterminated));
|
| }
|
| Advance();
|
| if (ranges->length() == 0) {
|
| @@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
|
|
|
| void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
|
| if (NeedsDesugaringForUnicode(cc)) {
|
| - // In unicode mode, character class needs to be desugared, so it
|
| + // With /u, character class needs to be desugared, so it
|
| // must be a standalone term instead of being part of a RegExpText.
|
| AddTerm(cc);
|
| } else {
|
| @@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
|
| return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
|
| }
|
|
|
| -
|
| -void RegExpBuilder::AddQuantifierToAtom(
|
| +bool RegExpBuilder::AddQuantifierToAtom(
|
| int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
|
| FlushPendingSurrogate();
|
| if (pending_empty_) {
|
| pending_empty_ = false;
|
| - return;
|
| + return true;
|
| }
|
| RegExpTree* atom;
|
| if (characters_ != NULL) {
|
| @@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom(
|
| } else if (terms_.length() > 0) {
|
| DCHECK(last_added_ == ADD_ATOM);
|
| atom = terms_.RemoveLast();
|
| + // With /u, lookarounds are not quantifiable.
|
| + if (unicode() && atom->IsLookaround()) return false;
|
| if (atom->max_match() == 0) {
|
| // Guaranteed to only match an empty string.
|
| LAST(ADD_TERM);
|
| if (min == 0) {
|
| - return;
|
| + return true;
|
| }
|
| terms_.Add(atom, zone());
|
| - return;
|
| + return true;
|
| }
|
| } else {
|
| // Only call immediately after adding an atom or character!
|
| UNREACHABLE();
|
| - return;
|
| + return false;
|
| }
|
| terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
|
| zone());
|
| LAST(ADD_TERM);
|
| + return true;
|
| }
|
|
|
| } // namespace internal
|
|
|