Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 77a741f1e8544a43abc48204bf058d66ba4fcc11..91c14cce497bd285e33bcd4896589b08bbe06437 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) { |
bool RegExpParser::simple() { return simple_; } |
- |
-bool RegExpParser::IsSyntaxCharacter(uc32 c) { |
- return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || |
- c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || |
- c == '{' || c == '}' || c == '|'; |
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
+ switch (c) { |
+ case '^': |
+ case '$': |
+ case '\\': |
+ case '.': |
+ case '*': |
+ case '+': |
+ case '?': |
+ case '(': |
+ case ')': |
+ case '[': |
+ case ']': |
+ case '{': |
+ case '}': |
+ case '|': |
+ case '/': |
+ return true; |
+ default: |
+ break; |
+ } |
+ return false; |
} |
@@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case kEndMarker: |
if (state->IsSubexpression()) { |
// Inside a parenthesized group when hitting end of input. |
- ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
+ return ReportError(CStrVector("Unterminated group")); |
} |
DCHECK_EQ(INITIAL, state->group_type()); |
// Parsing completed successfully. |
return builder->ToRegExp(); |
case ')': { |
if (!state->IsSubexpression()) { |
- ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); |
+ return ReportError(CStrVector("Unmatched ')'")); |
} |
DCHECK_NE(INITIAL, state->group_type()); |
@@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
} |
// Fall through. |
default: |
- ReportError(CStrVector("Invalid group") CHECK_FAILED); |
- break; |
+ return ReportError(CStrVector("Invalid group")); |
} |
Advance(2); |
} else { |
if (captures_started_ >= kMaxCaptures) { |
- ReportError(CStrVector("Too many captures") CHECK_FAILED); |
+ return ReportError(CStrVector("Too many captures")); |
} |
captures_started_++; |
} |
@@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
} |
break; |
} |
+ // With /u, no identity escapes except for syntax characters |
+ // are allowed. Otherwise, all identity escapes are allowed. |
+ if (unicode()) { |
+ return ReportError(CStrVector("Invalid escape")); |
+ } |
uc32 first_digit = Next(); |
if (first_digit == '8' || first_digit == '9') { |
- // If the 'u' flag is present, only syntax characters can be |
- // escaped, |
- // no other identity escapes are allowed. If the 'u' flag is not |
- // present, all identity escapes are allowed. |
- if (!unicode()) { |
- builder->AddCharacter(first_digit); |
- Advance(2); |
- } else { |
- return ReportError(CStrVector("Invalid escape")); |
- } |
+ builder->AddCharacter(first_digit); |
+ Advance(2); |
break; |
} |
} |
// FALLTHROUGH |
case '0': { |
Advance(); |
+ if (unicode() && Next() >= '0' && Next() <= '9') { |
+ // With /u, decimal escapes with a leading 0 are not parsed as octal. |
+ return ReportError(CStrVector("Invalid decimal escape")); |
+ } |
uc32 octal = ParseOctalLiteral(); |
builder->AddCharacter(octal); |
break; |
@@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
// This is outside the specification. We match JSC in |
// reading the backslash as a literal character instead |
// of as starting an escape. |
+ if (unicode()) { |
+ // With /u, invalid escapes are not treated as identity escapes. |
+ return ReportError(CStrVector("Invalid unicode escape")); |
+ } |
builder->AddCharacter('\\'); |
} else { |
Advance(2); |
@@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
} else if (!unicode()) { |
builder->AddCharacter('x'); |
} else { |
- // If the 'u' flag is present, invalid escapes are not treated as |
- // identity escapes. |
+ // With /u, invalid escapes are not treated as identity escapes. |
return ReportError(CStrVector("Invalid escape")); |
} |
break; |
@@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
} else if (!unicode()) { |
builder->AddCharacter('u'); |
} else { |
- // If the 'u' flag is present, invalid escapes are not treated as |
- // identity escapes. |
+ // With /u, invalid escapes are not treated as identity escapes. |
return ReportError(CStrVector("Invalid unicode escape")); |
} |
break; |
} |
default: |
Advance(); |
- // If the 'u' flag is present, only syntax characters can be |
- // escaped, no |
- // other identity escapes are allowed. If the 'u' flag is not |
- // present, |
- // all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacter(current())) { |
+ // With /u, only syntax characters and '/' may be identity-escaped. |
+ // Otherwise, all identity escapes are allowed. |
+ if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
builder->AddCharacter(current()); |
Advance(); |
} else { |
@@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case '{': { |
int dummy; |
if (ParseIntervalQuantifier(&dummy, &dummy)) { |
- ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); |
+ return ReportError(CStrVector("Nothing to repeat")); |
} |
// fallthrough |
} |
+ case '}': |
+ case ']': |
+ if (unicode()) { |
+ return ReportError(CStrVector("Lone quantifier brackets")); |
+ } |
+ // fallthrough |
default: |
builder->AddUnicodeCharacter(current()); |
Advance(); |
@@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
case '{': |
if (ParseIntervalQuantifier(&min, &max)) { |
if (max < min) { |
- ReportError(CStrVector("numbers out of order in {} quantifier.") |
- CHECK_FAILED); |
+ return ReportError( |
+ CStrVector("numbers out of order in {} quantifier")); |
} |
break; |
- } else { |
- continue; |
+ } else if (unicode()) { |
+ // With /u, incomplete quantifiers are not allowed. |
+ return ReportError(CStrVector("Incomplete quantifier")); |
} |
+ continue; |
default: |
continue; |
} |
@@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
quantifier_type = RegExpQuantifier::POSSESSIVE; |
Advance(); |
} |
- builder->AddQuantifierToAtom(min, max, quantifier_type); |
+ if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
+ return ReportError(CStrVector("Invalid quantifier")); |
+ } |
} |
} |
@@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
case 'c': { |
uc32 controlLetter = Next(); |
uc32 letter = controlLetter & ~('A' ^ 'a'); |
- // For compatibility with JSC, inside a character class |
- // we also accept digits and underscore as control characters. |
- if ((controlLetter >= '0' && controlLetter <= '9') || |
- controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { |
+ // For compatibility with JSC, inside a character class we also accept |
+ // digits and underscore as control characters, except with /u. |
+ if (letter >= 'A' && letter <= 'Z') { |
Advance(2); |
// Control letters mapped to ASCII control characters in the range |
// 0x00-0x1f. |
return controlLetter & 0x1f; |
} |
+ if (unicode()) { |
+ // With /u, invalid escapes are not treated as identity escapes. |
+ ReportError(CStrVector("Invalid class escape")); |
+ return 0; |
+ } |
+ if ((controlLetter >= '0' && controlLetter <= '9') || |
+ controlLetter == '_') { |
+ Advance(2); |
+ return controlLetter & 0x1f; |
+ } |
// We match JSC in reading the backslash as a literal |
// character instead of as starting an escape. |
return '\\'; |
@@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
// For compatibility, we interpret a decimal escape that isn't |
// a back reference (and therefore either \0 or not valid according |
// to the specification) as a 1..3 digit octal character code. |
+ if (unicode()) { |
+ // With /u, a decimal escape is not interpreted as an octal character code. |
+ ReportError(CStrVector("Invalid class escape")); |
+ return 0; |
+ } |
return ParseOctalLiteral(); |
case 'x': { |
Advance(); |
uc32 value; |
- if (ParseHexEscape(2, &value)) { |
- return value; |
- } |
- if (!unicode()) { |
- // If \x is not followed by a two-digit hexadecimal, treat it |
- // as an identity escape. |
- return 'x'; |
+ if (ParseHexEscape(2, &value)) return value; |
+ if (unicode()) { |
+ // With /u, invalid escapes are not treated as identity escapes. |
+ ReportError(CStrVector("Invalid escape")); |
+ return 0; |
} |
- // If the 'u' flag is present, invalid escapes are not treated as |
- // identity escapes. |
- ReportError(CStrVector("Invalid escape")); |
- return 0; |
+ // If \x is not followed by a two-digit hexadecimal, treat it |
+ // as an identity escape. |
+ return 'x'; |
} |
case 'u': { |
Advance(); |
uc32 value; |
- if (ParseUnicodeEscape(&value)) { |
- return value; |
- } |
- if (!unicode()) { |
- return 'u'; |
+ if (ParseUnicodeEscape(&value)) return value; |
+ if (unicode()) { |
+ // With /u, invalid escapes are not treated as identity escapes. |
+ ReportError(CStrVector("Invalid unicode escape")); |
+ return 0; |
} |
- // If the 'u' flag is present, invalid escapes are not treated as |
- // identity escapes. |
- ReportError(CStrVector("Invalid unicode escape")); |
- return 0; |
+ // If \u is not followed by a valid unicode escape sequence, treat it |
+ // as an identity escape. |
+ return 'u'; |
} |
default: { |
uc32 result = current(); |
- // If the 'u' flag is present, only syntax characters can be escaped, no |
- // other identity escapes are allowed. If the 'u' flag is not present, all |
- // identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacter(result)) { |
+ // With /u, only syntax characters and '/' may be identity-escaped. |
+ // Otherwise, all identity escapes are allowed. |
+ if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
Advance(); |
return result; |
} |
@@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
RegExpTree* RegExpParser::ParseCharacterClass() { |
static const char* kUnterminated = "Unterminated character class"; |
+ static const char* kRangeInvalid = "Invalid character class"; |
static const char* kRangeOutOfOrder = "Range out of order in character class"; |
DCHECK_EQ(current(), '['); |
@@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() { |
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
// Either end is an escaped character class. Treat the '-' verbatim. |
+ if (unicode()) { |
+ // ES2015 21.2.2.15.1 step 1. |
+ return ReportError(CStrVector(kRangeInvalid)); |
+ } |
AddRangeOrEscape(ranges, char_class, first, zone()); |
ranges->Add(CharacterRange::Singleton('-'), zone()); |
AddRangeOrEscape(ranges, char_class_2, next, zone()); |
continue; |
} |
+ // ES2015 21.2.2.15.1 step 6. |
if (first.from() > next.to()) { |
- return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); |
+ return ReportError(CStrVector(kRangeOutOfOrder)); |
} |
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); |
} else { |
@@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { |
} |
} |
if (!has_more()) { |
- return ReportError(CStrVector(kUnterminated) CHECK_FAILED); |
+ return ReportError(CStrVector(kUnterminated)); |
} |
Advance(); |
if (ranges->length() == 0) { |
@@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
if (NeedsDesugaringForUnicode(cc)) { |
- // In unicode mode, character class needs to be desugared, so it |
+ // With /u, character class needs to be desugared, so it |
// must be a standalone term instead of being part of a RegExpText. |
AddTerm(cc); |
} else { |
@@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() { |
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
} |
- |
-void RegExpBuilder::AddQuantifierToAtom( |
+bool RegExpBuilder::AddQuantifierToAtom( |
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
FlushPendingSurrogate(); |
if (pending_empty_) { |
pending_empty_ = false; |
- return; |
+ return true; |
} |
RegExpTree* atom; |
if (characters_ != NULL) { |
@@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom( |
} else if (terms_.length() > 0) { |
DCHECK(last_added_ == ADD_ATOM); |
atom = terms_.RemoveLast(); |
+ // With /u, lookarounds are not quantifiable. |
+ if (unicode() && atom->IsLookaround()) return false; |
if (atom->max_match() == 0) { |
// Guaranteed to only match an empty string. |
LAST(ADD_TERM); |
if (min == 0) { |
- return; |
+ return true; |
} |
terms_.Add(atom, zone()); |
- return; |
+ return true; |
} |
} else { |
// Only call immediately after adding an atom or character! |
UNREACHABLE(); |
- return; |
+ return false; |
} |
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
zone()); |
LAST(ADD_TERM); |
+ return true; |
} |
} // namespace internal |