Chromium Code Reviews| Index: src/regexp/regexp-parser.cc |
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
| index 77a741f1e8544a43abc48204bf058d66ba4fcc11..1061952aac48e1dcd328beb404f7759d779492ce 100644 |
| --- a/src/regexp/regexp-parser.cc |
| +++ b/src/regexp/regexp-parser.cc |
| @@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) { |
| bool RegExpParser::simple() { return simple_; } |
| - |
| -bool RegExpParser::IsSyntaxCharacter(uc32 c) { |
| - return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || |
| - c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || |
| - c == '{' || c == '}' || c == '|'; |
| +bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
| + switch (c) { |
| + case '^': |
| + case '$': |
| + case '\\': |
| + case '.': |
| + case '*': |
| + case '+': |
| + case '?': |
| + case '(': |
| + case ')': |
| + case '[': |
| + case ']': |
| + case '{': |
| + case '}': |
| + case '|': |
| + case '/': |
| + return true; |
| + default: |
| + break; |
| + } |
| + return false; |
| } |
| @@ -360,24 +377,25 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| } |
| break; |
| } |
| + // With /u, no identity escapes except for syntax characters |
| + // are allowed. Otherwise, all identity escapes are allowed. |
| + if (unicode()) { |
| + return ReportError(CStrVector("Invalid escape")); |
| + } |
| uc32 first_digit = Next(); |
| if (first_digit == '8' || first_digit == '9') { |
| - // If the 'u' flag is present, only syntax characters can be |
| - // escaped, |
| - // no other identity escapes are allowed. If the 'u' flag is not |
| - // present, all identity escapes are allowed. |
| - if (!unicode()) { |
| - builder->AddCharacter(first_digit); |
| - Advance(2); |
| - } else { |
| - return ReportError(CStrVector("Invalid escape")); |
| - } |
| + builder->AddCharacter(first_digit); |
| + Advance(2); |
| break; |
| } |
| } |
| // FALLTHROUGH |
| case '0': { |
| Advance(); |
| + if (unicode() && Next() >= '0' && Next() <= '9') { |
| + // With /u, a decimal escape with a leading 0 is not parsed as octal. |
| + return ReportError(CStrVector("Invalid decimal escape")); |
| + } |
| uc32 octal = ParseOctalLiteral(); |
| builder->AddCharacter(octal); |
| break; |
| @@ -415,6 +433,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| // This is outside the specification. We match JSC in |
| // reading the backslash as a literal character instead |
| // of as starting an escape. |
| + if (unicode()) { |
| + // With /u, invalid escapes are not treated as identity escapes. |
| + return ReportError(CStrVector("Invalid unicode escape")); |
| + } |
| builder->AddCharacter('\\'); |
| } else { |
| Advance(2); |
| @@ -430,8 +452,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| } else if (!unicode()) { |
| builder->AddCharacter('x'); |
| } else { |
| - // If the 'u' flag is present, invalid escapes are not treated as |
| - // identity escapes. |
| + // With /u, invalid escapes are not treated as identity escapes. |
| return ReportError(CStrVector("Invalid escape")); |
| } |
| break; |
| @@ -444,20 +465,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| } else if (!unicode()) { |
| builder->AddCharacter('u'); |
| } else { |
| - // If the 'u' flag is present, invalid escapes are not treated as |
| - // identity escapes. |
| + // With /u, invalid escapes are not treated as identity escapes. |
| return ReportError(CStrVector("Invalid unicode escape")); |
| } |
| break; |
| } |
| default: |
| Advance(); |
| - // If the 'u' flag is present, only syntax characters can be |
| - // escaped, no |
| - // other identity escapes are allowed. If the 'u' flag is not |
| - // present, |
| - // all identity escapes are allowed. |
| - if (!unicode() || IsSyntaxCharacter(current())) { |
| + // With /u, no identity escapes except for syntax characters |
| + // are allowed. Otherwise, all identity escapes are allowed. |
| + if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
| builder->AddCharacter(current()); |
| Advance(); |
| } else { |
| @@ -473,6 +490,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| } |
| // fallthrough |
| } |
| + case '}': |
| + case ']': |
| + if (unicode()) { |
| + ReportError(CStrVector("Lone quantifier brackets") CHECK_FAILED); |
|
vogelheim
2016/01/28 13:38:21
I don't get the point of the ReportError(... CHECK
Yang
2016/01/28 14:01:07
Done.
|
| + } |
| + // fallthrough |
| default: |
| builder->AddUnicodeCharacter(current()); |
| Advance(); |
| @@ -505,11 +528,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| case '{': |
| if (ParseIntervalQuantifier(&min, &max)) { |
| if (max < min) { |
| - ReportError(CStrVector("numbers out of order in {} quantifier.") |
| + ReportError(CStrVector("numbers out of order in {} quantifier") |
| CHECK_FAILED); |
| } |
| break; |
| } else { |
| + if (unicode()) { |
| + // With /u, incomplete quantifiers are not allowed. |
| + ReportError(CStrVector("Incomplete quantifier") CHECK_FAILED); |
| + } |
| continue; |
| } |
| default: |
| @@ -524,7 +551,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| quantifier_type = RegExpQuantifier::POSSESSIVE; |
| Advance(); |
| } |
| - builder->AddQuantifierToAtom(min, max, quantifier_type); |
| + if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
| + ReportError(CStrVector("Invalid quantifier") CHECK_FAILED); |
| + } |
| } |
| } |
| @@ -822,15 +851,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
| case 'c': { |
| uc32 controlLetter = Next(); |
| uc32 letter = controlLetter & ~('A' ^ 'a'); |
| - // For compatibility with JSC, inside a character class |
| - // we also accept digits and underscore as control characters. |
| - if ((controlLetter >= '0' && controlLetter <= '9') || |
| - controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { |
| + // For compatibility with JSC, inside a character class we also accept |
| + // digits and underscore as control characters, unless /u is set. |
| + if (letter >= 'A' && letter <= 'Z') { |
| Advance(2); |
| // Control letters mapped to ASCII control characters in the range |
| // 0x00-0x1f. |
| return controlLetter & 0x1f; |
| } |
| + if (unicode()) { |
| + // With /u, invalid escapes are not treated as identity escapes. |
| + ReportError(CStrVector("Invalid class escape")); |
| + return 0; |
| + } |
| + if ((controlLetter >= '0' && controlLetter <= '9') || |
| + controlLetter == '_') { |
| + Advance(2); |
| + return controlLetter & 0x1f; |
| + } |
| // We match JSC in reading the backslash as a literal |
| // character instead of as starting an escape. |
| return '\\'; |
| @@ -846,43 +884,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
| // For compatibility, we interpret a decimal escape that isn't |
| // a back reference (and therefore either \0 or not valid according |
| // to the specification) as a 1..3 digit octal character code. |
| + if (unicode()) { |
| + // With /u, a decimal escape is not interpreted as an octal character code. |
| + ReportError(CStrVector("Invalid class escape")); |
| + return 0; |
| + } |
| return ParseOctalLiteral(); |
| case 'x': { |
| Advance(); |
| uc32 value; |
| - if (ParseHexEscape(2, &value)) { |
| - return value; |
| + if (ParseHexEscape(2, &value)) return value; |
| + if (unicode()) { |
| + // With /u, invalid escapes are not treated as identity escapes. |
| + ReportError(CStrVector("Invalid escape")); |
| + return 0; |
| } |
| - if (!unicode()) { |
| - // If \x is not followed by a two-digit hexadecimal, treat it |
| - // as an identity escape. |
| - return 'x'; |
| - } |
| - // If the 'u' flag is present, invalid escapes are not treated as |
| - // identity escapes. |
| - ReportError(CStrVector("Invalid escape")); |
| - return 0; |
| + // If \x is not followed by a two-digit hexadecimal, treat it |
| + // as an identity escape. |
| + return 'x'; |
| } |
| case 'u': { |
| Advance(); |
| uc32 value; |
| - if (ParseUnicodeEscape(&value)) { |
| - return value; |
| - } |
| - if (!unicode()) { |
| - return 'u'; |
| + if (ParseUnicodeEscape(&value)) return value; |
| + if (unicode()) { |
| + // With /u, invalid escapes are not treated as identity escapes. |
| + ReportError(CStrVector("Invalid unicode escape")); |
| + return 0; |
| } |
| - // If the 'u' flag is present, invalid escapes are not treated as |
| - // identity escapes. |
| - ReportError(CStrVector("Invalid unicode escape")); |
| - return 0; |
| + // If \u is not followed by a valid unicode escape sequence, treat it |
| + // as an identity escape. |
| + return 'u'; |
| } |
| default: { |
| uc32 result = current(); |
| - // If the 'u' flag is present, only syntax characters can be escaped, no |
| - // other identity escapes are allowed. If the 'u' flag is not present, all |
| - // identity escapes are allowed. |
| - if (!unicode() || IsSyntaxCharacter(result)) { |
| + // With /u, no identity escapes except for syntax characters are |
| + // allowed. Otherwise, all identity escapes are allowed. |
| + if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
| Advance(); |
| return result; |
| } |
| @@ -956,6 +994,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| RegExpTree* RegExpParser::ParseCharacterClass() { |
| static const char* kUnterminated = "Unterminated character class"; |
| + static const char* kRangeInvalid = "Invalid character class"; |
| static const char* kRangeOutOfOrder = "Range out of order in character class"; |
| DCHECK_EQ(current(), '['); |
| @@ -985,11 +1024,16 @@ RegExpTree* RegExpParser::ParseCharacterClass() { |
| CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
| if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
| // Either end is an escaped character class. Treat the '-' verbatim. |
| + if (unicode()) { |
| + // ES2015 21.2.2.15.1 step 1. |
| + return ReportError(CStrVector(kRangeInvalid) CHECK_FAILED); |
|
vogelheim
2016/01/28 13:38:21
CHECK_FAILED adds a return, after the unconditiona
Yang
2016/01/28 14:01:07
You are completely right. I simply copied the code
|
| + } |
| AddRangeOrEscape(ranges, char_class, first, zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| AddRangeOrEscape(ranges, char_class_2, next, zone()); |
| continue; |
| } |
| + // ES2015 21.2.2.15.1 step 6. |
| if (first.from() > next.to()) { |
| return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); |
| } |
| @@ -1162,7 +1206,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| if (NeedsDesugaringForUnicode(cc)) { |
| - // In unicode mode, character class needs to be desugared, so it |
| + // With /u, character class needs to be desugared, so it |
| // must be a standalone term instead of being part of a RegExpText. |
| AddTerm(cc); |
| } else { |
| @@ -1275,13 +1319,12 @@ RegExpTree* RegExpBuilder::ToRegExp() { |
| return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
| } |
| - |
| -void RegExpBuilder::AddQuantifierToAtom( |
| +bool RegExpBuilder::AddQuantifierToAtom( |
| int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| FlushPendingSurrogate(); |
| if (pending_empty_) { |
| pending_empty_ = false; |
| - return; |
| + return true; |
| } |
| RegExpTree* atom; |
| if (characters_ != NULL) { |
| @@ -1304,23 +1347,26 @@ void RegExpBuilder::AddQuantifierToAtom( |
| } else if (terms_.length() > 0) { |
| DCHECK(last_added_ == ADD_ATOM); |
| atom = terms_.RemoveLast(); |
| + // With /u, lookarounds are not quantifiable. |
| + if (unicode() && atom->IsLookaround()) return false; |
| if (atom->max_match() == 0) { |
| // Guaranteed to only match an empty string. |
| LAST(ADD_TERM); |
| if (min == 0) { |
| - return; |
| + return true; |
| } |
| terms_.Add(atom, zone()); |
| - return; |
| + return true; |
| } |
| } else { |
| // Only call immediately after adding an atom or character! |
| UNREACHABLE(); |
| - return; |
| + return false; |
| } |
| terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| zone()); |
| LAST(ADD_TERM); |
| + return true; |
| } |
| } // namespace internal |