Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(768)

Unified Diff: src/parser.cc

Issue 788043005: ES6 unicode escapes, part 2: Regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: error reporting Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/parser.cc
diff --git a/src/parser.cc b/src/parser.cc
index bfdeaa3276dbd047236ce6eec09470e55650230c..3f7ce4d4a9c23c4042e55d6c6a4637f324bf1aba 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -4278,10 +4278,8 @@ void Parser::Internalize() {
// Regular expressions
-RegExpParser::RegExpParser(FlatStringReader* in,
- Handle<String>* error,
- bool multiline,
- Zone* zone)
+RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
+ bool multiline, bool unicode, Zone* zone)
: isolate_(zone->isolate()),
zone_(zone),
error_(error),
@@ -4292,6 +4290,7 @@ RegExpParser::RegExpParser(FlatStringReader* in,
capture_count_(0),
has_more_(true),
multiline_(multiline),
+ unicode_(unicode),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
@@ -4348,6 +4347,13 @@ bool RegExpParser::simple() {
}
+bool RegExpParser::IsSyntaxCharacter(uc32 c) {  // True iff c is one of the 14 characters compared below.
+ return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
+ c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
+ c == '{' || c == '}' || c == '|';  // Note: '-' is not a member of this set.
+}
mathias 2015/01/08 12:29:07 Should `-` be a “syntax character” as well because
marja 2015/01/08 13:42:18 The spec (draft rev 30) says: SyntaxCharacter ::
rossberg 2015/01/08 14:11:45 It's not a SyntaxCharacter and that's likely inten
mathias 2015/01/21 07:16:54 /[\-]/u is now allowed: https://bugs.ecmascript.or
+
+
RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
failed_ = true;
*error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
@@ -4564,9 +4570,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
- // Treat as identity escape
- builder->AddCharacter(first_digit);
- Advance(2);
+ // If the 'u' flag is present, only syntax characters can be escaped,
+ // no other identity escapes are allowed. If the 'u' flag is not
+ // present, all identity escapes are allowed.
+ if (!FLAG_harmony_unicode || !unicode_) {
+ builder->AddCharacter(first_digit);
+ Advance(2);
+ } else {
+ return ReportError(CStrVector("Invalid escape"));
+ }
break;
}
}
@@ -4622,25 +4634,41 @@ RegExpTree* RegExpParser::ParseDisjunction() {
uc32 value;
if (ParseHexEscape(2, &value)) {
builder->AddCharacter(value);
- } else {
+ } else if (!FLAG_harmony_unicode || !unicode_) {
builder->AddCharacter('x');
+ } else {
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ return ReportError(CStrVector("Invalid escape"));
}
break;
}
case 'u': {
Advance(2);
uc32 value;
- if (ParseHexEscape(4, &value)) {
+ if (ParseUnicodeEscape(&value)) {
builder->AddCharacter(value);
- } else {
+ } else if (!FLAG_harmony_unicode || !unicode_) {
builder->AddCharacter('u');
+ } else {
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ return ReportError(CStrVector("Invalid unicode escape"));
}
break;
}
default:
- // Identity escape.
- builder->AddCharacter(Next());
- Advance(2);
+ Advance();
+ // If the 'u' flag is present, only syntax characters can be escaped, no
+ // other identity escapes are allowed. If the 'u' flag is not present,
+ // all identity escapes are allowed.
+ if (!FLAG_harmony_unicode || !unicode_ ||
+ IsSyntaxCharacter(current())) {
+ builder->AddCharacter(current());
+ Advance();
+ } else {
+ return ReportError(CStrVector("Invalid escape"));
+ }
break;
}
break;
@@ -4883,11 +4911,10 @@ uc32 RegExpParser::ParseOctalLiteral() {
}
-bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
+bool RegExpParser::ParseHexEscape(int length, uc32* value) {
int start = position();
uc32 val = 0;
- bool done = false;
- for (int i = 0; !done; i++) {
+ for (int i = 0; i < length; ++i) {
uc32 c = current();
int d = HexValue(c);
if (d < 0) {
@@ -4896,15 +4923,52 @@ bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
}
val = val * 16 + d;
Advance();
- if (i == length - 1) {
- done = true;
- }
}
*value = val;
return true;
}
+bool RegExpParser::ParseUnicodeEscape(uc32* value) {
+ // Parses what follows a \u escape; the backslash and 'u' are already consumed.
+ // Accepts \uxxxx (four hex digits) and, only when FLAG_harmony_unicode and
+ // unicode_ are both set, \u{x...} with any number of hex digits in the braces.
+ if (current() == '{' && FLAG_harmony_unicode && unicode_) {
+ int start = position();  // Position of the '{', kept for the failure path.
+ Advance();
+ if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {  // Fails once the number exceeds 0x10ffff.
+ if (current() == '}') {
+ Advance();
+ return true;
+ }
+ }
+ Reset(start);  // Presumably rewinds the cursor to the '{' -- confirm against Reset().
+ return false;  // If digits parsed but '}' was missing, *value has already been overwritten.
+ }
+ // No '{', or the braced form is not enabled: fall back to the four-digit form.
+ return ParseHexEscape(4, value);
+}
+
+
+bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {  // Reads 1+ hex digits at the cursor into *value.
+ uc32 x = 0;  // Number accumulated so far.
+ int d = HexValue(current());  // A negative d is treated as "not a hex digit".
+ if (d < 0) {  // No digit at all: fail without advancing.
+ return false;
+ }
+ while (d >= 0) {  // Keep consuming until the current character is not a hex digit.
+ x = x * 16 + d;
+ if (x > max_value) {  // Checked after every digit, so x never exceeds 16 * max_value + 15.
+ return false;  // Digits already consumed are not un-read here; *value is left untouched.
+ }
+ Advance();
+ d = HexValue(current());
+ }
+ *value = x;  // Written only on success.
+ return true;
+}
+
+
uc32 RegExpParser::ParseClassCharacterEscape() {
DCHECK(current() == '\\');
DCHECK(has_next() && !IsSpecialClassEscape(Next()));
@@ -4959,27 +5023,41 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseHexEscape(2, &value)) {
return value;
}
- // If \x is not followed by a two-digit hexadecimal, treat it
- // as an identity escape.
- return 'x';
+ if (!FLAG_harmony_unicode || !unicode_) {
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
+ }
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
}
case 'u': {
Advance();
uc32 value;
- if (ParseHexEscape(4, &value)) {
+ if (ParseUnicodeEscape(&value)) {
return value;
}
- // If \u is not followed by a four-digit hexadecimal, treat it
- // as an identity escape.
- return 'u';
+ if (!FLAG_harmony_unicode || !unicode_) {
+ return 'u';
+ }
+ // If the 'u' flag is present, invalid escapes are not treated as
+ // identity escapes.
+ ReportError(CStrVector("Invalid unicode escape"));
+ return 0;
}
default: {
- // Extended identity escape. We accept any character that hasn't
- // been matched by a more specific case, not just the subset required
- // by the ECMAScript specification.
uc32 result = current();
- Advance();
- return result;
+ // If the 'u' flag is present, only syntax characters can be escaped, no
+ // other identity escapes are allowed. If the 'u' flag is not present, all
+ // identity escapes are allowed.
+ if (!FLAG_harmony_unicode || !unicode_ || IsSyntaxCharacter(result)) {
+ Advance();
+ return result;
+ }
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
}
}
return 0;
@@ -5085,12 +5163,11 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
// ----------------------------------------------------------------------------
// The Parser interface.
-bool RegExpParser::ParseRegExp(FlatStringReader* input,
- bool multiline,
- RegExpCompileData* result,
+bool RegExpParser::ParseRegExp(FlatStringReader* input, bool multiline,
+ bool unicode, RegExpCompileData* result,
Zone* zone) {
DCHECK(result != NULL);
- RegExpParser parser(input, &result->error, multiline, zone);
+ RegExpParser parser(input, &result->error, multiline, unicode, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == NULL);
« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')

Powered by Google App Engine
This is Rietveld 408576698