src/regexp/regexp-parser.cc - Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: fix: /[\00]/u is not allowed. Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 1e6a0dacca638881cabf0470bf2798c0c53bc0cb..3091b6d49c027937ab67773871884dd1d3ae5864 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -461,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

Advance(2);

uc32 value;

if (ParseUnicodeEscape(&value)) {

- builder->AddUnicodeCharacter(value);

+ builder->AddEscapedUnicodeCharacter(value);

} else if (!unicode()) {

builder->AddCharacter('u');

} else {

@@ -780,7 +780,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {

return true;

}

+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.

bool RegExpParser::ParseUnicodeEscape(uc32* value) {

// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

// allowed). In the latter case, the number of hex digits between { } is

@@ -798,7 +798,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {

return false;

}

// \u but no {, or \u{...} escapes not allowed.

- return ParseHexEscape(4, value);

+ bool result = ParseHexEscape(4, value);

+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

+ current() == '\\') {

+ // Attempt to read trail surrogate.

+ int start = position();

+ if (Next() == 'u') {

+ Advance(2);

+ uc32 trail;

+ if (ParseHexEscape(4, &trail) &&

+ unibrow::Utf16::IsTrailSurrogate(trail)) {

+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),

+ static_cast<uc16>(trail));

+ return true;

+ }

+ }

+ Reset(start);

+ }

+ return result;

}

@@ -879,14 +896,18 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

case '5':

case '6':

case '7':

- // For compatibility, we interpret a decimal escape that isn't

- // a back reference (and therefore either \0 or not valid according

- // to the specification) as a 1..3 digit octal character code.

if (unicode()) {

+ // \0 is interpreted as \u0000 if it is not followed by another digit.

+ if (current() == '0') {

+ Advance();

+ if (current() < '0' || current() > '9') return 0;

Yang 2016/02/09 19:18:31 This now matches how we parse \<digit> outside of

+ }

// With /u, decimal escape is not interpreted as octal character code.

ReportError(CStrVector("Invalid class escape"));

return 0;

}

+ // For backward compatibility, we interpret escaped digit from 0 to 7 as

+ // a 1..3 digit octal character code.

return ParseOctalLiteral();

case 'x': {

Advance();

@@ -916,9 +937,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

}

default: {

uc32 result = current();

- // With /u, no identity escapes except for syntax characters are

+ // With /u, no identity escapes except for syntax characters and '-' are

// allowed. Otherwise, all identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {

+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {

Advance();

return result;

}

@@ -954,22 +975,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

Advance();

}

- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

- // Combine with possibly following trail surrogate.

- int start = position();

- uc32 second = current();

- if (second == '\\') {

- second = ParseClassCharacterEscape(CHECK_FAILED);

- } else {

- Advance();

- }

- if (unibrow::Utf16::IsTrailSurrogate(second)) {

- first = unibrow::Utf16::CombineSurrogatePair(first, second);

- } else {

- Reset(start);

- }

- }

return CharacterRange::Singleton(first);

}

@@ -1198,6 +1203,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

}

+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {

+ // A lead or trail surrogate parsed via escape sequence will not

+ // pair up with any preceding lead or following trail surrogate.

+ FlushPendingSurrogate();

+ AddUnicodeCharacter(character);

+ FlushPendingSurrogate();

+}

void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »