Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 5daea70df1787f4092c08caa4a194c95334779c3..7470e28aed696fa4afce7d774a3164bd593eb891 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
Advance(2); |
uc32 value; |
if (ParseUnicodeEscape(&value)) { |
- builder->AddUnicodeCharacter(value); |
+ builder->AddEscapedUnicodeCharacter(value); |
} else if (!unicode()) { |
builder->AddCharacter('u'); |
} else { |
@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { |
return true; |
} |
- |
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
// allowed). In the latter case, the number of hex digits between { } is |
@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
return false; |
} |
// \u but no {, or \u{...} escapes not allowed. |
- return ParseHexEscape(4, value); |
+ bool result = ParseHexEscape(4, value); |
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
+ current() == '\\') { |
+ // Attempt to read trail surrogate. |
+ int start = position(); |
+ if (Next() == 'u') { |
+ Advance(2); |
+ uc32 trail; |
+ if (ParseHexEscape(4, &trail) && |
+ unibrow::Utf16::IsTrailSurrogate(trail)) { |
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
+ static_cast<uc16>(trail)); |
+ return true; |
+ } |
+ } |
+ Reset(start); |
+ } |
+ return result; |
} |
ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { |
@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
return '\\'; |
} |
case '0': |
+ // With /u, \0 is interpreted as NUL if not followed by another digit. |
+ if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
+ Advance(); |
+ return 0; |
+ } |
+ // Fall through. |
case '1': |
case '2': |
case '3': |
@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
} |
default: { |
uc32 result = current(); |
- // With /u, no identity escapes except for syntax characters are |
+ // With /u, no identity escapes except for syntax characters and '-' are |
// allowed. Otherwise, all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
Advance(); |
return result; |
} |
@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
Advance(); |
} |
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
- // Combine with possibly following trail surrogate. |
- int start = position(); |
- uc32 second = current(); |
- if (second == '\\') { |
- second = ParseClassCharacterEscape(CHECK_FAILED); |
- } else { |
- Advance(); |
- } |
- if (unibrow::Utf16::IsTrailSurrogate(second)) { |
- first = unibrow::Utf16::CombineSurrogatePair(first, second); |
- } else { |
- Reset(start); |
- } |
- } |
- |
return CharacterRange::Singleton(first); |
} |
@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
} |
} |
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
+ // A lead or trail surrogate parsed via escape sequence will not pair up with any preceding lead |
+ // or following trail surrogate: pending surrogates are flushed both before and after adding it. |
+ FlushPendingSurrogate(); |
+ AddUnicodeCharacter(character); |
+ FlushPendingSurrogate(); |
+} |
void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |