src/regexp/regexp-parser.cc - Issue 1648673002: Revert of [regexp] restrict pattern syntax for unicode mode.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1648673002: Revert of [regexp] restrict pattern syntax for unicode mode. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@stage

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 91c14cce497bd285e33bcd4896589b08bbe06437..77a741f1e8544a43abc48204bf058d66ba4fcc11 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -102,28 +102,11 @@

bool RegExpParser::simple() { return simple_; }

-bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {

- switch (c) {

- case '^':

- case '$':

- case '\\':

- case '.':

- case '*':

- case '+':

- case '?':

- case '(':

- case ')':

- case '[':

- case ']':

- case '{':

- case '}':

- case '|':

- case '/':

- return true;

- default:

- break;

- }

- return false;

+bool RegExpParser::IsSyntaxCharacter(uc32 c) {

+ return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||

+ c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||

+ c == '{' || c == '}' || c == '|';

}

@@ -178,14 +161,14 @@

case kEndMarker:

if (state->IsSubexpression()) {

// Inside a parenthesized group when hitting end of input.

- return ReportError(CStrVector("Unterminated group"));

+ ReportError(CStrVector("Unterminated group") CHECK_FAILED);

}

DCHECK_EQ(INITIAL, state->group_type());

// Parsing completed successfully.

return builder->ToRegExp();

case ')': {

if (!state->IsSubexpression()) {

- return ReportError(CStrVector("Unmatched ')'"));

+ ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);

}

DCHECK_NE(INITIAL, state->group_type());

@@ -293,12 +276,13 @@

}

// Fall through.

default:

- return ReportError(CStrVector("Invalid group"));

+ ReportError(CStrVector("Invalid group") CHECK_FAILED);

+ break;

}

Advance(2);

} else {

if (captures_started_ >= kMaxCaptures) {

- return ReportError(CStrVector("Too many captures"));

+ ReportError(CStrVector("Too many captures") CHECK_FAILED);

}

captures_started_++;

}

@@ -376,25 +360,24 @@

}

break;

}

- // With /u, no identity escapes except for syntax characters

- // are allowed. Otherwise, all identity escapes are allowed.

- if (unicode()) {

- return ReportError(CStrVector("Invalid escape"));

- }

uc32 first_digit = Next();

if (first_digit == '8' || first_digit == '9') {

- builder->AddCharacter(first_digit);

- Advance(2);

+ // If the 'u' flag is present, only syntax characters can be

+ // escaped,

+ // no other identity escapes are allowed. If the 'u' flag is not

+ // present, all identity escapes are allowed.

+ if (!unicode()) {

+ builder->AddCharacter(first_digit);

+ Advance(2);

+ } else {

+ return ReportError(CStrVector("Invalid escape"));

+ }

break;

}

// FALLTHROUGH

case '0': {

Advance();

- if (unicode() && Next() >= '0' && Next() <= '9') {

- // With /u, decimal escape with leading 0 are not parsed as octal.

- return ReportError(CStrVector("Invalid decimal escape"));

- }

uc32 octal = ParseOctalLiteral();

builder->AddCharacter(octal);

break;

@@ -432,10 +415,6 @@

// This is outside the specification. We match JSC in

// reading the backslash as a literal character instead

// of as starting an escape.

- if (unicode()) {

- // With /u, invalid escapes are not treated as identity escapes.

- return ReportError(CStrVector("Invalid unicode escape"));

- }

builder->AddCharacter('\\');

} else {

Advance(2);

@@ -451,7 +430,8 @@

} else if (!unicode()) {

builder->AddCharacter('x');

} else {

- // With /u, invalid escapes are not treated as identity escapes.

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

return ReportError(CStrVector("Invalid escape"));

}

break;

@@ -464,16 +444,20 @@

} else if (!unicode()) {

builder->AddCharacter('u');

} else {

- // With /u, invalid escapes are not treated as identity escapes.

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

return ReportError(CStrVector("Invalid unicode escape"));

}

break;

}

default:

Advance();

- // With /u, no identity escapes except for syntax characters

- // are allowed. Otherwise, all identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacterOrSlash(current())) {

+ // If the 'u' flag is present, only syntax characters can be

+ // escaped, no

+ // other identity escapes are allowed. If the 'u' flag is not

+ // present,

+ // all identity escapes are allowed.

+ if (!unicode() || IsSyntaxCharacter(current())) {

builder->AddCharacter(current());

Advance();

} else {

@@ -485,16 +469,10 @@

case '{': {

int dummy;

if (ParseIntervalQuantifier(&dummy, &dummy)) {

- return ReportError(CStrVector("Nothing to repeat"));

+ ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

}

// fallthrough

}

- case '}':

- case ']':

- if (unicode()) {

- return ReportError(CStrVector("Lone quantifier brackets"));

- }

- // fallthrough

default:

builder->AddUnicodeCharacter(current());

Advance();

@@ -527,15 +505,13 @@

case '{':

if (ParseIntervalQuantifier(&min, &max)) {

if (max < min) {

- return ReportError(

- CStrVector("numbers out of order in {} quantifier"));

+ ReportError(CStrVector("numbers out of order in {} quantifier.")

+ CHECK_FAILED);

}

break;

- } else if (unicode()) {

- // With /u, incomplete quantifiers are not allowed.

- return ReportError(CStrVector("Incomplete quantifier"));

+ } else {

+ continue;

}

- continue;

default:

continue;

}

@@ -548,9 +524,7 @@

quantifier_type = RegExpQuantifier::POSSESSIVE;

Advance();

}

- if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {

- return ReportError(CStrVector("Invalid quantifier"));

- }

+ builder->AddQuantifierToAtom(min, max, quantifier_type);

}

@@ -848,22 +822,13 @@

case 'c': {

uc32 controlLetter = Next();

uc32 letter = controlLetter & ~('A' ^ 'a');

- // For compatibility with JSC, inside a character class. We also accept

- // digits and underscore as control characters, unless with /u.

- if (letter >= 'A' && letter <= 'Z') {

+ // For compatibility with JSC, inside a character class

+ // we also accept digits and underscore as control characters.

+ if ((controlLetter >= '0' && controlLetter <= '9') ||

+ controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {

Advance(2);

// Control letters mapped to ASCII control characters in the range

// 0x00-0x1f.

- return controlLetter & 0x1f;

- }

- if (unicode()) {

- // With /u, invalid escapes are not treated as identity escapes.

- ReportError(CStrVector("Invalid class escape"));

- return 0;

- }

- if ((controlLetter >= '0' && controlLetter <= '9') ||

- controlLetter == '_') {

- Advance(2);

return controlLetter & 0x1f;

}

// We match JSC in reading the backslash as a literal

@@ -881,43 +846,43 @@

// For compatibility, we interpret a decimal escape that isn't

// a back reference (and therefore either \0 or not valid according

// to the specification) as a 1..3 digit octal character code.

- if (unicode()) {

- // With /u, decimal escape is not interpreted as octal character code.

- ReportError(CStrVector("Invalid class escape"));

- return 0;

- }

return ParseOctalLiteral();

case 'x': {

Advance();

uc32 value;

- if (ParseHexEscape(2, &value)) return value;

- if (unicode()) {

- // With /u, invalid escapes are not treated as identity escapes.

- ReportError(CStrVector("Invalid escape"));

- return 0;

- }

- // If \x is not followed by a two-digit hexadecimal, treat it

- // as an identity escape.

- return 'x';

+ if (ParseHexEscape(2, &value)) {

+ return value;

+ }

+ if (!unicode()) {

+ // If \x is not followed by a two-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'x';

+ }

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ ReportError(CStrVector("Invalid escape"));

+ return 0;

}

case 'u': {

Advance();

uc32 value;

- if (ParseUnicodeEscape(&value)) return value;

- if (unicode()) {

- // With /u, invalid escapes are not treated as identity escapes.

- ReportError(CStrVector("Invalid unicode escape"));

- return 0;

- }

- // If \u is not followed by a two-digit hexadecimal, treat it

- // as an identity escape.

- return 'u';

+ if (ParseUnicodeEscape(&value)) {

+ return value;

+ }

+ if (!unicode()) {

+ return 'u';

+ }

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ ReportError(CStrVector("Invalid unicode escape"));

+ return 0;

}

default: {

uc32 result = current();

- // With /u, no identity escapes except for syntax characters are

- // allowed. Otherwise, all identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {

+ // If the 'u' flag is present, only syntax characters can be escaped, no

+ // other identity escapes are allowed. If the 'u' flag is not present, all

+ // identity escapes are allowed.

+ if (!unicode() || IsSyntaxCharacter(result)) {

Advance();

return result;

}

@@ -991,7 +956,6 @@

RegExpTree* RegExpParser::ParseCharacterClass() {

static const char* kUnterminated = "Unterminated character class";

- static const char* kRangeInvalid = "Invalid character class";

static const char* kRangeOutOfOrder = "Range out of order in character class";

DCHECK_EQ(current(), '[');

@@ -1021,18 +985,13 @@

CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);

if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {

// Either end is an escaped character class. Treat the '-' verbatim.

- if (unicode()) {

- // ES2015 21.2.2.15.1 step 1.

- return ReportError(CStrVector(kRangeInvalid));

- }

AddRangeOrEscape(ranges, char_class, first, zone());

ranges->Add(CharacterRange::Singleton('-'), zone());

AddRangeOrEscape(ranges, char_class_2, next, zone());

continue;

}

- // ES2015 21.2.2.15.1 step 6.

if (first.from() > next.to()) {

- return ReportError(CStrVector(kRangeOutOfOrder));

+ return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);

}

ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());

} else {

@@ -1040,7 +999,7 @@

}

if (!has_more()) {

- return ReportError(CStrVector(kUnterminated));

+ return ReportError(CStrVector(kUnterminated) CHECK_FAILED);

}

Advance();

if (ranges->length() == 0) {

@@ -1203,7 +1162,7 @@

void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

if (NeedsDesugaringForUnicode(cc)) {

- // With /u, character class needs to be desugared, so it

+ // In unicode mode, character class needs to be desugared, so it

// must be a standalone term instead of being part of a RegExpText.

AddTerm(cc);

} else {

@@ -1316,12 +1275,13 @@

return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));

}

-bool RegExpBuilder::AddQuantifierToAtom(

+void RegExpBuilder::AddQuantifierToAtom(

int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

FlushPendingSurrogate();

if (pending_empty_) {

pending_empty_ = false;

- return true;

+ return;

}

RegExpTree* atom;

if (characters_ != NULL) {

@@ -1344,26 +1304,23 @@

} else if (terms_.length() > 0) {

DCHECK(last_added_ == ADD_ATOM);

atom = terms_.RemoveLast();

- // With /u, lookarounds are not quantifiable.

- if (unicode() && atom->IsLookaround()) return false;

if (atom->max_match() == 0) {

// Guaranteed to only match an empty string.

LAST(ADD_TERM);

if (min == 0) {

- return true;

+ return;

}

terms_.Add(atom, zone());

- return true;

+ return;

}

} else {

// Only call immediately after adding an atom or character!

UNREACHABLE();

- return false;

+ return;

}

terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

zone());

LAST(ADD_TERM);

- return true;

}

} // namespace internal

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »