Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(118)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: addressed comments Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 5daea70df1787f4092c08caa4a194c95334779c3..7470e28aed696fa4afce7d774a3164bd593eb891 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
- builder->AddUnicodeCharacter(value);
+ builder->AddEscapedUnicodeCharacter(value);
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
return true;
}
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return false;
}
// \u but no {, or \u{...} escapes not allowed.
- return ParseHexEscape(4, value);
+ bool result = ParseHexEscape(4, value);
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+ current() == '\\') {
+ // Attempt to read trail surrogate.
+ int start = position();
+ if (Next() == 'u') {
+ Advance(2);
+ uc32 trail;
+ if (ParseHexEscape(4, &trail) &&
+ unibrow::Utf16::IsTrailSurrogate(trail)) {
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+ static_cast<uc16>(trail));
+ return true;
+ }
+ }
+ Reset(start);
+ }
+ return result;
}
ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
return '\\';
}
case '0':
+ // With /u, \0 is interpreted as NUL if not followed by another digit.
+ if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+ Advance();
+ return 0;
+ }
+ // Fall through.
case '1':
case '2':
case '3':
@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
default: {
uc32 result = current();
- // With /u, no identity escapes except for syntax characters are
+ // With /u, no identity escapes except for syntax characters and '-' are
// allowed. Otherwise, all identity escapes are allowed.
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance();
return result;
}
@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
Advance();
}
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
- // Combine with possibly following trail surrogate.
- int start = position();
- uc32 second = current();
- if (second == '\\') {
- second = ParseClassCharacterEscape(CHECK_FAILED);
- } else {
- Advance();
- }
- if (unibrow::Utf16::IsTrailSurrogate(second)) {
- first = unibrow::Utf16::CombineSurrogatePair(first, second);
- } else {
- Reset(start);
- }
- }
-
return CharacterRange::Singleton(first);
}
@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
}
}
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+ // A lead or trail surrogate written as an escape sequence must not pair up
+ // with a preceding lead or a following trail surrogate, so flush both sides.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698