Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1340)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix: /[\00]/u is not allowed. Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 1e6a0dacca638881cabf0470bf2798c0c53bc0cb..3091b6d49c027937ab67773871884dd1d3ae5864 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -461,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
- builder->AddUnicodeCharacter(value);
+ builder->AddEscapedUnicodeCharacter(value);
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
@@ -780,7 +780,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
return true;
}
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
@@ -798,7 +798,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return false;
}
// \u but no {, or \u{...} escapes not allowed.
- return ParseHexEscape(4, value);
+ bool result = ParseHexEscape(4, value);
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+ current() == '\\') {
+ // Attempt to read trail surrogate.
+ int start = position();
+ if (Next() == 'u') {
+ Advance(2);
+ uc32 trail;
+ if (ParseHexEscape(4, &trail) &&
+ unibrow::Utf16::IsTrailSurrogate(trail)) {
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+ static_cast<uc16>(trail));
+ return true;
+ }
+ }
+ Reset(start);
+ }
+ return result;
}
@@ -879,14 +896,18 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
case '5':
case '6':
case '7':
- // For compatibility, we interpret a decimal escape that isn't
- // a back reference (and therefore either \0 or not valid according
- // to the specification) as a 1..3 digit octal character code.
if (unicode()) {
+ // \0 is interpreted as \u0000 if it is not followed by another digit.
+ if (current() == '0') {
+ Advance();
+ if (current() < '0' || current() > '9') return 0;
Yang 2016/02/09 19:18:31 This now matches how we parse \<digit> outside of
+ }
// With /u, decimal escape is not interpreted as octal character code.
ReportError(CStrVector("Invalid class escape"));
return 0;
}
+ // For backward compatibility, we interpret an escaped digit from 0 to 7
+ // as a 1..3 digit octal character code.
return ParseOctalLiteral();
case 'x': {
Advance();
@@ -916,9 +937,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
default: {
uc32 result = current();
- // With /u, no identity escapes except for syntax characters are
+ // With /u, no identity escapes except for syntax characters and '-' are
// allowed. Otherwise, all identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance();
return result;
}
@@ -954,22 +975,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
Advance();
}
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
- // Combine with possibly following trail surrogate.
- int start = position();
- uc32 second = current();
- if (second == '\\') {
- second = ParseClassCharacterEscape(CHECK_FAILED);
- } else {
- Advance();
- }
- if (unibrow::Utf16::IsTrailSurrogate(second)) {
- first = unibrow::Utf16::CombineSurrogatePair(first, second);
- } else {
- Reset(start);
- }
- }
-
return CharacterRange::Singleton(first);
}
@@ -1198,6 +1203,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
}
}
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+ // A lead or trail surrogate parsed via escape sequence will not
+ // pair up with any preceding lead or following trail surrogate.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698