Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1389)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1648673002: Revert of [regexp] restrict pattern syntax for unicode mode. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@stage
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 91c14cce497bd285e33bcd4896589b08bbe06437..77a741f1e8544a43abc48204bf058d66ba4fcc11 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -102,28 +102,11 @@
bool RegExpParser::simple() { return simple_; }
-bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
- switch (c) {
- case '^':
- case '$':
- case '\\':
- case '.':
- case '*':
- case '+':
- case '?':
- case '(':
- case ')':
- case '[':
- case ']':
- case '{':
- case '}':
- case '|':
- case '/':
- return true;
- default:
- break;
- }
- return false;
+
+bool RegExpParser::IsSyntaxCharacter(uc32 c) {
+ return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
+ c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
+ c == '{' || c == '}' || c == '|';
}
@@ -178,14 +161,14 @@
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
- return ReportError(CStrVector("Unterminated group"));
+ ReportError(CStrVector("Unterminated group") CHECK_FAILED);
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
- return ReportError(CStrVector("Unmatched ')'"));
+ ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
}
DCHECK_NE(INITIAL, state->group_type());
@@ -293,12 +276,13 @@
}
// Fall through.
default:
- return ReportError(CStrVector("Invalid group"));
+ ReportError(CStrVector("Invalid group") CHECK_FAILED);
+ break;
}
Advance(2);
} else {
if (captures_started_ >= kMaxCaptures) {
- return ReportError(CStrVector("Too many captures"));
+ ReportError(CStrVector("Too many captures") CHECK_FAILED);
}
captures_started_++;
}
@@ -376,25 +360,24 @@
}
break;
}
- // With /u, no identity escapes except for syntax characters
- // are allowed. Otherwise, all identity escapes are allowed.
- if (unicode()) {
- return ReportError(CStrVector("Invalid escape"));
- }
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
- builder->AddCharacter(first_digit);
- Advance(2);
+ // If the 'u' flag is present, only syntax characters can be
+ // escaped; no other identity escapes are allowed.
+ // If the 'u' flag is not present, all identity escapes are
+ // allowed.
+ if (!unicode()) {
+ builder->AddCharacter(first_digit);
+ Advance(2);
+ } else {
+ return ReportError(CStrVector("Invalid escape"));
+ }
break;
}
}
// FALLTHROUGH
case '0': {
Advance();
- if (unicode() && Next() >= '0' && Next() <= '9') {
- // With /u, decimal escape with leading 0 are not parsed as octal.
- return ReportError(CStrVector("Invalid decimal escape"));
- }
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
break;
@@ -432,10 +415,6 @@
// This is outside the specification. We match JSC in
// reading the backslash as a literal character instead
// of as starting an escape.
- if (unicode()) {
- // With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid unicode escape"));
- }
builder->AddCharacter('\\');
} else {
Advance(2);
@@ -451,7 +430,8 @@
} else if (!unicode()) {
builder->AddCharacter('x');
} else {
- // With /u, invalid escapes are not treated as identity escapes.
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
return ReportError(CStrVector("Invalid escape"));
}
break;
@@ -464,16 +444,20 @@
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
- // With /u, invalid escapes are not treated as identity escapes.
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
break;
}
default:
Advance();
- // With /u, no identity escapes except for syntax characters
- // are allowed. Otherwise, all identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
+ // If the 'u' flag is present, only syntax characters can be
+ // escaped; no other identity escapes are allowed.
+ //
+ // If the 'u' flag is not present, all identity escapes are
+ // allowed.
+ if (!unicode() || IsSyntaxCharacter(current())) {
builder->AddCharacter(current());
Advance();
} else {
@@ -485,16 +469,10 @@
case '{': {
int dummy;
if (ParseIntervalQuantifier(&dummy, &dummy)) {
- return ReportError(CStrVector("Nothing to repeat"));
+ ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
}
// fallthrough
}
- case '}':
- case ']':
- if (unicode()) {
- return ReportError(CStrVector("Lone quantifier brackets"));
- }
- // fallthrough
default:
builder->AddUnicodeCharacter(current());
Advance();
@@ -527,15 +505,13 @@
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
- return ReportError(
- CStrVector("numbers out of order in {} quantifier"));
+ ReportError(CStrVector("numbers out of order in {} quantifier.")
+ CHECK_FAILED);
}
break;
- } else if (unicode()) {
- // With /u, incomplete quantifiers are not allowed.
- return ReportError(CStrVector("Incomplete quantifier"));
+ } else {
+ continue;
}
- continue;
default:
continue;
}
@@ -548,9 +524,7 @@
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
- if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
- return ReportError(CStrVector("Invalid quantifier"));
- }
+ builder->AddQuantifierToAtom(min, max, quantifier_type);
}
}
@@ -848,22 +822,13 @@
case 'c': {
uc32 controlLetter = Next();
uc32 letter = controlLetter & ~('A' ^ 'a');
- // For compatibility with JSC, inside a character class. We also accept
- // digits and underscore as control characters, unless with /u.
- if (letter >= 'A' && letter <= 'Z') {
+ // For compatibility with JSC, inside a character class
+ // we also accept digits and underscore as control characters.
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1f.
- return controlLetter & 0x1f;
- }
- if (unicode()) {
- // With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid class escape"));
- return 0;
- }
- if ((controlLetter >= '0' && controlLetter <= '9') ||
- controlLetter == '_') {
- Advance(2);
return controlLetter & 0x1f;
}
// We match JSC in reading the backslash as a literal
@@ -881,43 +846,43 @@
// For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code.
- if (unicode()) {
- // With /u, decimal escape is not interpreted as octal character code.
- ReportError(CStrVector("Invalid class escape"));
- return 0;
- }
return ParseOctalLiteral();
case 'x': {
Advance();
uc32 value;
- if (ParseHexEscape(2, &value)) return value;
- if (unicode()) {
- // With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid escape"));
- return 0;
- }
- // If \x is not followed by a two-digit hexadecimal, treat it
- // as an identity escape.
- return 'x';
+ if (ParseHexEscape(2, &value)) {
+ return value;
+ }
+ if (!unicode()) {
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
}
case 'u': {
Advance();
uc32 value;
- if (ParseUnicodeEscape(&value)) return value;
- if (unicode()) {
- // With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid unicode escape"));
- return 0;
- }
- // If \u is not followed by a two-digit hexadecimal, treat it
- // as an identity escape.
- return 'u';
+ if (ParseUnicodeEscape(&value)) {
+ return value;
+ }
+ if (!unicode()) {
+ return 'u';
+ }
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ ReportError(CStrVector("Invalid unicode escape"));
+ return 0;
}
default: {
uc32 result = current();
- // With /u, no identity escapes except for syntax characters are
- // allowed. Otherwise, all identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
+ // If the 'u' flag is present, only syntax characters can be escaped, no
+ // other identity escapes are allowed. If the 'u' flag is not present, all
+ // identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacter(result)) {
Advance();
return result;
}
@@ -991,7 +956,6 @@
RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class";
- static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
DCHECK_EQ(current(), '[');
@@ -1021,18 +985,13 @@
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
// Either end is an escaped character class. Treat the '-' verbatim.
- if (unicode()) {
- // ES2015 21.2.2.15.1 step 1.
- return ReportError(CStrVector(kRangeInvalid));
- }
AddRangeOrEscape(ranges, char_class, first, zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone());
continue;
}
- // ES2015 21.2.2.15.1 step 6.
if (first.from() > next.to()) {
- return ReportError(CStrVector(kRangeOutOfOrder));
+ return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
}
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else {
@@ -1040,7 +999,7 @@
}
}
if (!has_more()) {
- return ReportError(CStrVector(kUnterminated));
+ return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
}
Advance();
if (ranges->length() == 0) {
@@ -1203,7 +1162,7 @@
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (NeedsDesugaringForUnicode(cc)) {
- // With /u, character class needs to be desugared, so it
+ // In unicode mode, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText.
AddTerm(cc);
} else {
@@ -1316,12 +1275,13 @@
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
}
-bool RegExpBuilder::AddQuantifierToAtom(
+
+void RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
- return true;
+ return;
}
RegExpTree* atom;
if (characters_ != NULL) {
@@ -1344,26 +1304,23 @@
} else if (terms_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
- // With /u, lookarounds are not quantifiable.
- if (unicode() && atom->IsLookaround()) return false;
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
- return true;
+ return;
}
terms_.Add(atom, zone());
- return true;
+ return;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
- return false;
+ return;
}
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
zone());
LAST(ADD_TERM);
- return true;
}
} // namespace internal
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698