| Index: src/regexp/regexp-parser.cc
|
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
|
| index 5daea70df1787f4092c08caa4a194c95334779c3..7470e28aed696fa4afce7d774a3164bd593eb891 100644
|
| --- a/src/regexp/regexp-parser.cc
|
| +++ b/src/regexp/regexp-parser.cc
|
| @@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
|
| Advance(2);
|
| uc32 value;
|
| if (ParseUnicodeEscape(&value)) {
|
| - builder->AddUnicodeCharacter(value);
|
| + builder->AddEscapedUnicodeCharacter(value);
|
| } else if (!unicode()) {
|
| builder->AddCharacter('u');
|
| } else {
|
| @@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
|
| return true;
|
| }
|
|
|
| -
|
| +// Parses a RegExpUnicodeEscapeSequence as described in ECMA-262.
|
| bool RegExpParser::ParseUnicodeEscape(uc32* value) {
|
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
|
| // allowed). In the latter case, the number of hex digits between { } is
|
| @@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
|
| return false;
|
| }
|
| // \u but no {, or \u{...} escapes not allowed.
|
| - return ParseHexEscape(4, value);
|
| + bool result = ParseHexEscape(4, value);
|
| + if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
|
| + current() == '\\') {
|
| + // Attempt to read trail surrogate.
|
| + int start = position();
|
| + if (Next() == 'u') {
|
| + Advance(2);
|
| + uc32 trail;
|
| + if (ParseHexEscape(4, &trail) &&
|
| + unibrow::Utf16::IsTrailSurrogate(trail)) {
|
| + *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
|
| + static_cast<uc16>(trail));
|
| + return true;
|
| + }
|
| + }
|
| + Reset(start);
|
| + }
|
| + return result;
|
| }
|
|
|
| ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
|
| @@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
| return '\\';
|
| }
|
| case '0':
|
| + // With /u, \0 is interpreted as NUL if not followed by another digit.
|
| + if (unicode() && !(Next() >= '0' && Next() <= '9')) {
|
| + Advance();
|
| + return 0;
|
| + }
|
| + // Fall through.
|
| case '1':
|
| case '2':
|
| case '3':
|
| @@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
|
| }
|
| default: {
|
| uc32 result = current();
|
| - // With /u, no identity escapes except for syntax characters are
|
| + // With /u, no identity escapes except for syntax characters and '-' are
|
| // allowed. Otherwise, all identity escapes are allowed.
|
| - if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
|
| + if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
|
| Advance();
|
| return result;
|
| }
|
| @@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
|
| Advance();
|
| }
|
|
|
| - if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
|
| - // Combine with possibly following trail surrogate.
|
| - int start = position();
|
| - uc32 second = current();
|
| - if (second == '\\') {
|
| - second = ParseClassCharacterEscape(CHECK_FAILED);
|
| - } else {
|
| - Advance();
|
| - }
|
| - if (unibrow::Utf16::IsTrailSurrogate(second)) {
|
| - first = unibrow::Utf16::CombineSurrogatePair(first, second);
|
| - } else {
|
| - Reset(start);
|
| - }
|
| - }
|
| -
|
| return CharacterRange::Singleton(first);
|
| }
|
|
|
| @@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
|
| }
|
| }
|
|
|
| +void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
|
| + // Flush any pending surrogate before and after adding |character| so that an
|
| + // escaped lead or trail surrogate never pairs with a neighboring surrogate.
|
| + FlushPendingSurrogate();
|
| + AddUnicodeCharacter(character);
|
| + FlushPendingSurrogate();
|
| +}
|
|
|
| void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
|
|
|
|
|