Chromium Code Reviews| Index: src/regexp/regexp-parser.cc |
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
| index 1e6a0dacca638881cabf0470bf2798c0c53bc0cb..3091b6d49c027937ab67773871884dd1d3ae5864 100644 |
| --- a/src/regexp/regexp-parser.cc |
| +++ b/src/regexp/regexp-parser.cc |
| @@ -461,7 +461,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { |
| Advance(2); |
| uc32 value; |
| if (ParseUnicodeEscape(&value)) { |
| - builder->AddUnicodeCharacter(value); |
| + builder->AddEscapedUnicodeCharacter(value); |
| } else if (!unicode()) { |
| builder->AddCharacter('u'); |
| } else { |
| @@ -780,7 +780,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { |
| return true; |
| } |
| - |
| +// This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| // allowed). In the latter case, the number of hex digits between { } is |
| @@ -798,7 +798,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| return false; |
| } |
| // \u but no {, or \u{...} escapes not allowed. |
| - return ParseHexEscape(4, value); |
| + bool result = ParseHexEscape(4, value); |
| + if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| + current() == '\\') { |
| + // Attempt to read trail surrogate. |
| + int start = position(); |
| + if (Next() == 'u') { |
| + Advance(2); |
| + uc32 trail; |
| + if (ParseHexEscape(4, &trail) && |
| + unibrow::Utf16::IsTrailSurrogate(trail)) { |
| + *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| + static_cast<uc16>(trail)); |
| + return true; |
| + } |
| + } |
| + Reset(start); |
| + } |
| + return result; |
| } |
| @@ -879,14 +896,18 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
| case '5': |
| case '6': |
| case '7': |
| - // For compatibility, we interpret a decimal escape that isn't |
| - // a back reference (and therefore either \0 or not valid according |
| - // to the specification) as a 1..3 digit octal character code. |
| if (unicode()) { |
| + // \0 is interpreted as \u0000 if it is not followed by another digit. |
| + if (current() == '0') { |
| + Advance(); |
| + if (current() < '0' || current() > '9') return 0; |
|
Yang
2016/02/09 19:18:31
This now matches how we parse \<digit> outside of [character classes].
|
| + } |
| // With /u, decimal escape is not interpreted as octal character code. |
| ReportError(CStrVector("Invalid class escape")); |
| return 0; |
| } |
| + // For backward compatibility, we interpret escaped digit from 0 to 7 as |
| + // a 1..3 digit octal character code. |
| return ParseOctalLiteral(); |
| case 'x': { |
| Advance(); |
| @@ -916,9 +937,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() { |
| } |
| default: { |
| uc32 result = current(); |
| - // With /u, no identity escapes except for syntax characters are |
| + // With /u, no identity escapes except for syntax characters and '-' are |
| // allowed. Otherwise, all identity escapes are allowed. |
| - if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
| + if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
| Advance(); |
| return result; |
| } |
| @@ -954,22 +975,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
| Advance(); |
| } |
| - if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
| - // Combine with possibly following trail surrogate. |
| - int start = position(); |
| - uc32 second = current(); |
| - if (second == '\\') { |
| - second = ParseClassCharacterEscape(CHECK_FAILED); |
| - } else { |
| - Advance(); |
| - } |
| - if (unibrow::Utf16::IsTrailSurrogate(second)) { |
| - first = unibrow::Utf16::CombineSurrogatePair(first, second); |
| - } else { |
| - Reset(start); |
| - } |
| - } |
| - |
| return CharacterRange::Singleton(first); |
| } |
| @@ -1198,6 +1203,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
| } |
| } |
| +void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| + // A lead or trail surrogate parsed via escape sequence will not |
| + // pair up with any preceding lead or following trail surrogate. |
| + FlushPendingSurrogate(); |
| + AddUnicodeCharacter(character); |
| + FlushPendingSurrogate(); |
| +} |
| void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |