src/regexp/regexp-parser.cc - Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: addressed comments Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 5daea70df1787f4092c08caa4a194c95334779c3..7470e28aed696fa4afce7d774a3164bd593eb891 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {

Advance(2);

uc32 value;

if (ParseUnicodeEscape(&value)) {

- builder->AddUnicodeCharacter(value);

+ builder->AddEscapedUnicodeCharacter(value);

} else if (!unicode()) {

builder->AddCharacter('u');

} else {

@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {

return true;

}

+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.

bool RegExpParser::ParseUnicodeEscape(uc32* value) {

// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

// allowed). In the latter case, the number of hex digits between { } is

@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {

return false;

}

// \u but no {, or \u{...} escapes not allowed.

- return ParseHexEscape(4, value);

+ bool result = ParseHexEscape(4, value);

+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

+ current() == '\\') {

+ // Attempt to read trail surrogate.

+ int start = position();

+ if (Next() == 'u') {

+ Advance(2);

+ uc32 trail;

+ if (ParseHexEscape(4, &trail) &&

+ unibrow::Utf16::IsTrailSurrogate(trail)) {

+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),

+ static_cast<uc16>(trail));

+ return true;

+ }

+ }

+ Reset(start);

+ }

+ return result;

}

ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {

@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

return '\\';

}

case '0':

+ // With /u, \0 is interpreted as NUL if not followed by another digit.

+ if (unicode() && !(Next() >= '0' && Next() <= '9')) {

+ Advance();

+ return 0;

+ }

+ // Fall through.

case '1':

case '2':

case '3':

@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {

}

default: {

uc32 result = current();

- // With /u, no identity escapes except for syntax characters are

+ // With /u, no identity escapes except for syntax characters and '-' are

// allowed. Otherwise, all identity escapes are allowed.

- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {

+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {

Advance();

return result;

}

@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

Advance();

}

- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

- // Combine with possibly following trail surrogate.

- int start = position();

- uc32 second = current();

- if (second == '\\') {

- second = ParseClassCharacterEscape(CHECK_FAILED);

- } else {

- Advance();

- }

- if (unibrow::Utf16::IsTrailSurrogate(second)) {

- first = unibrow::Utf16::CombineSurrogatePair(first, second);

- } else {

- Reset(start);

- }

- }

return CharacterRange::Singleton(first);

}

@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

}

+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {

+ // A lead or trail surrogate parsed via escape sequence will not

+ // pair up with any preceding lead or following trail surrogate.

+ FlushPendingSurrogate();

+ AddUnicodeCharacter(character);

+ FlushPendingSurrogate();

+}

void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »