Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 91c14cce497bd285e33bcd4896589b08bbe06437..77a741f1e8544a43abc48204bf058d66ba4fcc11 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -102,28 +102,11 @@ |
bool RegExpParser::simple() { return simple_; } |
-bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
- switch (c) { |
- case '^': |
- case '$': |
- case '\\': |
- case '.': |
- case '*': |
- case '+': |
- case '?': |
- case '(': |
- case ')': |
- case '[': |
- case ']': |
- case '{': |
- case '}': |
- case '|': |
- case '/': |
- return true; |
- default: |
- break; |
- } |
- return false; |
+ |
+bool RegExpParser::IsSyntaxCharacter(uc32 c) { |
+ return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || |
+ c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || |
+ c == '{' || c == '}' || c == '|'; |
} |
@@ -178,14 +161,14 @@ |
case kEndMarker: |
if (state->IsSubexpression()) { |
// Inside a parenthesized group when hitting end of input. |
- return ReportError(CStrVector("Unterminated group")); |
+ ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
} |
DCHECK_EQ(INITIAL, state->group_type()); |
// Parsing completed successfully. |
return builder->ToRegExp(); |
case ')': { |
if (!state->IsSubexpression()) { |
- return ReportError(CStrVector("Unmatched ')'")); |
+ ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); |
} |
DCHECK_NE(INITIAL, state->group_type()); |
@@ -293,12 +276,13 @@ |
} |
// Fall through. |
default: |
- return ReportError(CStrVector("Invalid group")); |
+ ReportError(CStrVector("Invalid group") CHECK_FAILED); |
+ break; |
} |
Advance(2); |
} else { |
if (captures_started_ >= kMaxCaptures) { |
- return ReportError(CStrVector("Too many captures")); |
+ ReportError(CStrVector("Too many captures") CHECK_FAILED); |
} |
captures_started_++; |
} |
@@ -376,25 +360,24 @@ |
} |
break; |
} |
- // With /u, no identity escapes except for syntax characters |
- // are allowed. Otherwise, all identity escapes are allowed. |
- if (unicode()) { |
- return ReportError(CStrVector("Invalid escape")); |
- } |
uc32 first_digit = Next(); |
if (first_digit == '8' || first_digit == '9') { |
- builder->AddCharacter(first_digit); |
- Advance(2); |
+ // If the 'u' flag is present, only syntax characters can be |
+ // escaped; no other identity escapes are allowed. |
+ // If the 'u' flag is not present, all identity escapes are |
+ // allowed. |
+ if (!unicode()) { |
+ builder->AddCharacter(first_digit); |
+ Advance(2); |
+ } else { |
+ return ReportError(CStrVector("Invalid escape")); |
+ } |
break; |
} |
} |
// FALLTHROUGH |
case '0': { |
Advance(); |
- if (unicode() && Next() >= '0' && Next() <= '9') { |
- // With /u, decimal escape with leading 0 are not parsed as octal. |
- return ReportError(CStrVector("Invalid decimal escape")); |
- } |
uc32 octal = ParseOctalLiteral(); |
builder->AddCharacter(octal); |
break; |
@@ -432,10 +415,6 @@ |
// This is outside the specification. We match JSC in |
// reading the backslash as a literal character instead |
// of as starting an escape. |
- if (unicode()) { |
- // With /u, invalid escapes are not treated as identity escapes. |
- return ReportError(CStrVector("Invalid unicode escape")); |
- } |
builder->AddCharacter('\\'); |
} else { |
Advance(2); |
@@ -451,7 +430,8 @@ |
} else if (!unicode()) { |
builder->AddCharacter('x'); |
} else { |
- // With /u, invalid escapes are not treated as identity escapes. |
+ // If the 'u' flag is present, invalid escapes are not treated as |
+ // identity escapes. |
return ReportError(CStrVector("Invalid escape")); |
} |
break; |
@@ -464,16 +444,20 @@ |
} else if (!unicode()) { |
builder->AddCharacter('u'); |
} else { |
- // With /u, invalid escapes are not treated as identity escapes. |
+ // If the 'u' flag is present, invalid escapes are not treated as |
+ // identity escapes. |
return ReportError(CStrVector("Invalid unicode escape")); |
} |
break; |
} |
default: |
Advance(); |
- // With /u, no identity escapes except for syntax characters |
- // are allowed. Otherwise, all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
+ // If the 'u' flag is present, only syntax characters can be |
+ // escaped; no other identity escapes are allowed. |
+ // |
+ // If the 'u' flag is not present, all identity escapes are |
+ // allowed. |
+ if (!unicode() || IsSyntaxCharacter(current())) { |
builder->AddCharacter(current()); |
Advance(); |
} else { |
@@ -485,16 +469,10 @@ |
case '{': { |
int dummy; |
if (ParseIntervalQuantifier(&dummy, &dummy)) { |
- return ReportError(CStrVector("Nothing to repeat")); |
+ ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); |
} |
// fallthrough |
} |
- case '}': |
- case ']': |
- if (unicode()) { |
- return ReportError(CStrVector("Lone quantifier brackets")); |
- } |
- // fallthrough |
default: |
builder->AddUnicodeCharacter(current()); |
Advance(); |
@@ -527,15 +505,13 @@ |
case '{': |
if (ParseIntervalQuantifier(&min, &max)) { |
if (max < min) { |
- return ReportError( |
- CStrVector("numbers out of order in {} quantifier")); |
+ ReportError(CStrVector("numbers out of order in {} quantifier.") |
+ CHECK_FAILED); |
} |
break; |
- } else if (unicode()) { |
- // With /u, incomplete quantifiers are not allowed. |
- return ReportError(CStrVector("Incomplete quantifier")); |
+ } else { |
+ continue; |
} |
- continue; |
default: |
continue; |
} |
@@ -548,9 +524,7 @@ |
quantifier_type = RegExpQuantifier::POSSESSIVE; |
Advance(); |
} |
- if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
- return ReportError(CStrVector("Invalid quantifier")); |
- } |
+ builder->AddQuantifierToAtom(min, max, quantifier_type); |
} |
} |
@@ -848,22 +822,13 @@ |
case 'c': { |
uc32 controlLetter = Next(); |
uc32 letter = controlLetter & ~('A' ^ 'a'); |
- // For compatibility with JSC, inside a character class. We also accept |
- // digits and underscore as control characters, unless with /u. |
- if (letter >= 'A' && letter <= 'Z') { |
+ // For compatibility with JSC, inside a character class |
+ // we also accept digits and underscore as control characters. |
+ if ((controlLetter >= '0' && controlLetter <= '9') || |
+ controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { |
Advance(2); |
// Control letters mapped to ASCII control characters in the range |
// 0x00-0x1f. |
- return controlLetter & 0x1f; |
- } |
- if (unicode()) { |
- // With /u, invalid escapes are not treated as identity escapes. |
- ReportError(CStrVector("Invalid class escape")); |
- return 0; |
- } |
- if ((controlLetter >= '0' && controlLetter <= '9') || |
- controlLetter == '_') { |
- Advance(2); |
return controlLetter & 0x1f; |
} |
// We match JSC in reading the backslash as a literal |
@@ -881,43 +846,43 @@ |
// For compatibility, we interpret a decimal escape that isn't |
// a back reference (and therefore either \0 or not valid according |
// to the specification) as a 1..3 digit octal character code. |
- if (unicode()) { |
- // With /u, decimal escape is not interpreted as octal character code. |
- ReportError(CStrVector("Invalid class escape")); |
- return 0; |
- } |
return ParseOctalLiteral(); |
case 'x': { |
Advance(); |
uc32 value; |
- if (ParseHexEscape(2, &value)) return value; |
- if (unicode()) { |
- // With /u, invalid escapes are not treated as identity escapes. |
- ReportError(CStrVector("Invalid escape")); |
- return 0; |
- } |
- // If \x is not followed by a two-digit hexadecimal, treat it |
- // as an identity escape. |
- return 'x'; |
+ if (ParseHexEscape(2, &value)) { |
+ return value; |
+ } |
+ if (!unicode()) { |
+ // If \x is not followed by a two-digit hexadecimal, treat it |
+ // as an identity escape. |
+ return 'x'; |
+ } |
+ // If the 'u' flag is present, invalid escapes are not treated as |
+ // identity escapes. |
+ ReportError(CStrVector("Invalid escape")); |
+ return 0; |
} |
case 'u': { |
Advance(); |
uc32 value; |
- if (ParseUnicodeEscape(&value)) return value; |
- if (unicode()) { |
- // With /u, invalid escapes are not treated as identity escapes. |
- ReportError(CStrVector("Invalid unicode escape")); |
- return 0; |
- } |
- // If \u is not followed by a two-digit hexadecimal, treat it |
- // as an identity escape. |
- return 'u'; |
+ if (ParseUnicodeEscape(&value)) { |
+ return value; |
+ } |
+ if (!unicode()) { |
+ return 'u'; |
+ } |
+ // If the 'u' flag is present, invalid escapes are not treated as |
+ // identity escapes. |
+ ReportError(CStrVector("Invalid unicode escape")); |
+ return 0; |
} |
default: { |
uc32 result = current(); |
- // With /u, no identity escapes except for syntax characters are |
- // allowed. Otherwise, all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
+ // If the 'u' flag is present, only syntax characters can be escaped, no |
+ // other identity escapes are allowed. If the 'u' flag is not present, all |
+ // identity escapes are allowed. |
+ if (!unicode() || IsSyntaxCharacter(result)) { |
Advance(); |
return result; |
} |
@@ -991,7 +956,6 @@ |
RegExpTree* RegExpParser::ParseCharacterClass() { |
static const char* kUnterminated = "Unterminated character class"; |
- static const char* kRangeInvalid = "Invalid character class"; |
static const char* kRangeOutOfOrder = "Range out of order in character class"; |
DCHECK_EQ(current(), '['); |
@@ -1021,18 +985,13 @@ |
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
// Either end is an escaped character class. Treat the '-' verbatim. |
- if (unicode()) { |
- // ES2015 21.2.2.15.1 step 1. |
- return ReportError(CStrVector(kRangeInvalid)); |
- } |
AddRangeOrEscape(ranges, char_class, first, zone()); |
ranges->Add(CharacterRange::Singleton('-'), zone()); |
AddRangeOrEscape(ranges, char_class_2, next, zone()); |
continue; |
} |
- // ES2015 21.2.2.15.1 step 6. |
if (first.from() > next.to()) { |
- return ReportError(CStrVector(kRangeOutOfOrder)); |
+ return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); |
} |
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); |
} else { |
@@ -1040,7 +999,7 @@ |
} |
} |
if (!has_more()) { |
- return ReportError(CStrVector(kUnterminated)); |
+ return ReportError(CStrVector(kUnterminated) CHECK_FAILED); |
} |
Advance(); |
if (ranges->length() == 0) { |
@@ -1203,7 +1162,7 @@ |
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
if (NeedsDesugaringForUnicode(cc)) { |
- // With /u, character class needs to be desugared, so it |
+ // In unicode mode, character class needs to be desugared, so it |
// must be a standalone term instead of being part of a RegExpText. |
AddTerm(cc); |
} else { |
@@ -1316,12 +1275,13 @@ |
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
} |
-bool RegExpBuilder::AddQuantifierToAtom( |
+ |
+void RegExpBuilder::AddQuantifierToAtom( |
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
FlushPendingSurrogate(); |
if (pending_empty_) { |
pending_empty_ = false; |
- return true; |
+ return; |
} |
RegExpTree* atom; |
if (characters_ != NULL) { |
@@ -1344,26 +1304,23 @@ |
} else if (terms_.length() > 0) { |
DCHECK(last_added_ == ADD_ATOM); |
atom = terms_.RemoveLast(); |
- // With /u, lookarounds are not quantifiable. |
- if (unicode() && atom->IsLookaround()) return false; |
if (atom->max_match() == 0) { |
// Guaranteed to only match an empty string. |
LAST(ADD_TERM); |
if (min == 0) { |
- return true; |
+ return; |
} |
terms_.Add(atom, zone()); |
- return true; |
+ return; |
} |
} else { |
// Only call immediately after adding an atom or character! |
UNREACHABLE(); |
- return false; |
+ return; |
} |
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
zone()); |
LAST(ADD_TERM); |
- return true; |
} |
} // namespace internal |