src/parser.cc - Issue 788043005: ES6 unicode escapes, part 2: Regexps.

Unified Diff: src/parser.cc

Issue 788043005: ES6 unicode escapes, part 2: Regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: error reporting Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/parser.cc

diff --git a/src/parser.cc b/src/parser.cc

index bfdeaa3276dbd047236ce6eec09470e55650230c..3f7ce4d4a9c23c4042e55d6c6a4637f324bf1aba 100644

--- a/src/parser.cc

+++ b/src/parser.cc

@@ -4278,10 +4278,8 @@ void Parser::Internalize() {

// Regular expressions

-RegExpParser::RegExpParser(FlatStringReader* in,

- Handle<String>* error,

- bool multiline,

- Zone* zone)

+RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

+ bool multiline, bool unicode, Zone* zone)

: isolate_(zone->isolate()),

zone_(zone),

error_(error),

@@ -4292,6 +4290,7 @@ RegExpParser::RegExpParser(FlatStringReader* in,

capture_count_(0),

has_more_(true),

multiline_(multiline),

+ unicode_(unicode),

simple_(false),

contains_anchor_(false),

is_scanned_for_captures_(false),

@@ -4348,6 +4347,13 @@ bool RegExpParser::simple() {

}

+bool RegExpParser::IsSyntaxCharacter(uc32 c) {

+ return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||

+ c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||

+ c == '{' || c == '}' || c == '|';

+}

mathias 2015/01/08 12:29:07 Should `-` be a “syntax character” as well because

marja 2015/01/08 13:42:18 The spec (draft rev 30) says: SyntaxCharacter ::

rossberg 2015/01/08 14:11:45 It's not a SyntaxCharacter and that's likely inten

mathias 2015/01/21 07:16:54 /[\-]/u is now allowed: https://bugs.ecmascript.or

RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

failed_ = true;

*error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();

@@ -4564,9 +4570,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {

}

uc32 first_digit = Next();

if (first_digit == '8' || first_digit == '9') {

- // Treat as identity escape

- builder->AddCharacter(first_digit);

- Advance(2);

+ // If the 'u' flag is present, only syntax characters can be escaped,

+ // no other identity escapes are allowed. If the 'u' flag is not

+ // present, all identity escapes are allowed.

+ if (!FLAG_harmony_unicode || !unicode_) {

+ builder->AddCharacter(first_digit);

+ Advance(2);

+ } else {

+ return ReportError(CStrVector("Invalid escape"));

+ }

break;

}

@@ -4622,25 +4634,41 @@ RegExpTree* RegExpParser::ParseDisjunction() {

uc32 value;

if (ParseHexEscape(2, &value)) {

builder->AddCharacter(value);

- } else {

+ } else if (!FLAG_harmony_unicode || !unicode_) {

builder->AddCharacter('x');

+ } else {

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ return ReportError(CStrVector("Invalid escape"));

}

break;

}

case 'u': {

Advance(2);

uc32 value;

- if (ParseHexEscape(4, &value)) {

+ if (ParseUnicodeEscape(&value)) {

builder->AddCharacter(value);

- } else {

+ } else if (!FLAG_harmony_unicode || !unicode_) {

builder->AddCharacter('u');

+ } else {

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ return ReportError(CStrVector("Invalid unicode escape"));

}

break;

}

default:

- // Identity escape.

- builder->AddCharacter(Next());

- Advance(2);

+ Advance();

+ // If the 'u' flag is present, only syntax characters can be escaped, no

+ // other identity escapes are allowed. If the 'u' flag is not present,

+ // all identity escapes are allowed.

+ if (!FLAG_harmony_unicode || !unicode_ ||

+ IsSyntaxCharacter(current())) {

+ builder->AddCharacter(current());

+ Advance();

+ } else {

+ return ReportError(CStrVector("Invalid escape"));

+ }

break;

}

break;

@@ -4883,11 +4911,10 @@ uc32 RegExpParser::ParseOctalLiteral() {

}

-bool RegExpParser::ParseHexEscape(int length, uc32 *value) {

+bool RegExpParser::ParseHexEscape(int length, uc32* value) {

int start = position();

uc32 val = 0;

- bool done = false;

- for (int i = 0; !done; i++) {

+ for (int i = 0; i < length; ++i) {

uc32 c = current();

int d = HexValue(c);

if (d < 0) {

@@ -4896,15 +4923,52 @@ bool RegExpParser::ParseHexEscape(int length, uc32 *value) {

}

val = val * 16 + d;

Advance();

- if (i == length - 1) {

- done = true;

- }

}

*value = val;

return true;

}

+bool RegExpParser::ParseUnicodeEscape(uc32* value) {

+ // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

+ // allowed). In the latter case, the number of hex digits between { } is

+ // arbitrary. \ and u have already been read.

+ if (current() == '{' && FLAG_harmony_unicode && unicode_) {

+ int start = position();

+ Advance();

+ if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

+ if (current() == '}') {

+ Advance();

+ return true;

+ }

+ }

+ Reset(start);

+ return false;

+ }

+ // \u but no {, or \u{...} escapes not allowed.

+ return ParseHexEscape(4, value);

+}

+bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

+ uc32 x = 0;

+ int d = HexValue(current());

+ if (d < 0) {

+ return false;

+ }

+ while (d >= 0) {

+ x = x * 16 + d;

+ if (x > max_value) {

+ return false;

+ }

+ Advance();

+ d = HexValue(current());

+ }

+ *value = x;

+ return true;

+}

uc32 RegExpParser::ParseClassCharacterEscape() {

DCHECK(current() == '\\');

DCHECK(has_next() && !IsSpecialClassEscape(Next()));

@@ -4959,27 +5023,41 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

if (ParseHexEscape(2, &value)) {

return value;

}

- // If \x is not followed by a two-digit hexadecimal, treat it

- // as an identity escape.

- return 'x';

+ if (!FLAG_harmony_unicode || !unicode_) {

+ // If \x is not followed by a two-digit hexadecimal, treat it

+ // as an identity escape.

+ return 'x';

+ }

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ ReportError(CStrVector("Invalid escape"));

+ return 0;

}

case 'u': {

Advance();

uc32 value;

- if (ParseHexEscape(4, &value)) {

+ if (ParseUnicodeEscape(&value)) {

return value;

}

- // If \u is not followed by a four-digit hexadecimal, treat it

- // as an identity escape.

- return 'u';

+ if (!FLAG_harmony_unicode || !unicode_) {

+ return 'u';

+ }

+ // If the 'u' flag is present, invalid escapes are not treated as

+ // identity escapes.

+ ReportError(CStrVector("Invalid unicode escape"));

+ return 0;

}

default: {

- // Extended identity escape. We accept any character that hasn't

- // been matched by a more specific case, not just the subset required

- // by the ECMAScript specification.

uc32 result = current();

- Advance();

- return result;

+ // If the 'u' flag is present, only syntax characters can be escaped, no

+ // other identity escapes are allowed. If the 'u' flag is not present, all

+ // identity escapes are allowed.

+ if (!FLAG_harmony_unicode || !unicode_ || IsSyntaxCharacter(result)) {

+ Advance();

+ return result;

+ }

+ ReportError(CStrVector("Invalid escape"));

+ return 0;

}

return 0;

@@ -5085,12 +5163,11 @@ RegExpTree* RegExpParser::ParseCharacterClass() {

// ----------------------------------------------------------------------------

// The Parser interface.

-bool RegExpParser::ParseRegExp(FlatStringReader* input,

- bool multiline,

- RegExpCompileData* result,

+bool RegExpParser::ParseRegExp(FlatStringReader* input, bool multiline,

+ bool unicode, RegExpCompileData* result,

Zone* zone) {

DCHECK(result != NULL);

- RegExpParser parser(input, &result->error, multiline, zone);

+ RegExpParser parser(input, &result->error, multiline, unicode, zone);

RegExpTree* tree = parser.ParsePattern();

if (parser.failed()) {

DCHECK(tree == NULL);

« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')