Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 1e6a0dacca638881cabf0470bf2798c0c53bc0cb..3091b6d49c027937ab67773871884dd1d3ae5864 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -461,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
Advance(2); |
uc32 value; |
if (ParseUnicodeEscape(&value)) { |
- builder->AddUnicodeCharacter(value); |
+ builder->AddEscapedUnicodeCharacter(value); |
} else if (!unicode()) { |
builder->AddCharacter('u'); |
} else { |
@@ -780,7 +780,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { |
return true; |
} |
- |
+// Parses a RegExpUnicodeEscapeSequence as described in ECMA-262. |
bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
// allowed). In the latter case, the number of hex digits between { } is |
@@ -798,7 +798,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
return false; |
} |
// \u but no {, or \u{...} escapes not allowed. |
- return ParseHexEscape(4, value); |
+ bool result = ParseHexEscape(4, value); |
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
+ current() == '\\') { |
+ // Attempt to read trail surrogate. |
+ int start = position(); |
+ if (Next() == 'u') { |
+ Advance(2); |
+ uc32 trail; |
+ if (ParseHexEscape(4, &trail) && |
+ unibrow::Utf16::IsTrailSurrogate(trail)) { |
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
+ static_cast<uc16>(trail)); |
+ return true; |
+ } |
+ } |
+ Reset(start); |
+ } |
+ return result; |
} |
@@ -879,14 +896,18 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
case '5': |
case '6': |
case '7': |
- // For compatibility, we interpret a decimal escape that isn't |
- // a back reference (and therefore either \0 or not valid according |
- // to the specification) as a 1..3 digit octal character code. |
if (unicode()) { |
+ // \0 is interpreted as \u0000 if it is not followed by another digit. |
+ if (current() == '0') { |
+ Advance(); |
+ if (current() < '0' || current() > '9') return 0; |
Yang
2016/02/09 19:18:31
This now matches how we parse \<digit> outside of [a character class].
|
+ } |
// With /u, decimal escape is not interpreted as octal character code. |
ReportError(CStrVector("Invalid class escape")); |
return 0; |
} |
+ // For backward compatibility, we interpret escaped digit from 0 to 7 as |
+ // a 1..3 digit octal character code. |
return ParseOctalLiteral(); |
case 'x': { |
Advance(); |
@@ -916,9 +937,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
} |
default: { |
uc32 result = current(); |
- // With /u, no identity escapes except for syntax characters are |
+ // With /u, no identity escapes except for syntax characters and '-' are |
// allowed. Otherwise, all identity escapes are allowed. |
- if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
Advance(); |
return result; |
} |
@@ -954,22 +975,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
Advance(); |
} |
- if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
- // Combine with possibly following trail surrogate. |
- int start = position(); |
- uc32 second = current(); |
- if (second == '\\') { |
- second = ParseClassCharacterEscape(CHECK_FAILED); |
- } else { |
- Advance(); |
- } |
- if (unibrow::Utf16::IsTrailSurrogate(second)) { |
- first = unibrow::Utf16::CombineSurrogatePair(first, second); |
- } else { |
- Reset(start); |
- } |
- } |
- |
return CharacterRange::Singleton(first); |
} |
@@ -1198,6 +1203,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
} |
} |
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
+ // An escaped lead or trail surrogate must not pair up with a preceding lead |
+ // or following trail surrogate, so we flush both before and after adding it. |
+ FlushPendingSurrogate(); |
+ AddUnicodeCharacter(character); |
+ FlushPendingSurrogate(); |
+} |
void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |