Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| (...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 454 } else { | 454 } else { |
| 455 // With /u, invalid escapes are not treated as identity escapes. | 455 // With /u, invalid escapes are not treated as identity escapes. |
| 456 return ReportError(CStrVector("Invalid escape")); | 456 return ReportError(CStrVector("Invalid escape")); |
| 457 } | 457 } |
| 458 break; | 458 break; |
| 459 } | 459 } |
| 460 case 'u': { | 460 case 'u': { |
| 461 Advance(2); | 461 Advance(2); |
| 462 uc32 value; | 462 uc32 value; |
| 463 if (ParseUnicodeEscape(&value)) { | 463 if (ParseUnicodeEscape(&value)) { |
| 464 builder->AddUnicodeCharacter(value); | 464 builder->AddEscapedUnicodeCharacter(value); |
| 465 } else if (!unicode()) { | 465 } else if (!unicode()) { |
| 466 builder->AddCharacter('u'); | 466 builder->AddCharacter('u'); |
| 467 } else { | 467 } else { |
| 468 // With /u, invalid escapes are not treated as identity escapes. | 468 // With /u, invalid escapes are not treated as identity escapes. |
| 469 return ReportError(CStrVector("Invalid unicode escape")); | 469 return ReportError(CStrVector("Invalid unicode escape")); |
| 470 } | 470 } |
| 471 break; | 471 break; |
| 472 } | 472 } |
| 473 default: | 473 default: |
| 474 Advance(); | 474 Advance(); |
| (...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 773 Reset(start); | 773 Reset(start); |
| 774 return false; | 774 return false; |
| 775 } | 775 } |
| 776 val = val * 16 + d; | 776 val = val * 16 + d; |
| 777 Advance(); | 777 Advance(); |
| 778 } | 778 } |
| 779 *value = val; | 779 *value = val; |
| 780 return true; | 780 return true; |
| 781 } | 781 } |
| 782 | 782 |
| 783 | 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 786 // allowed). In the latter case, the number of hex digits between { } is | 786 // allowed). In the latter case, the number of hex digits between { } is |
| 787 // arbitrary. \ and u have already been read. | 787 // arbitrary. \ and u have already been read. |
| 788 if (current() == '{' && unicode()) { | 788 if (current() == '{' && unicode()) { |
| 789 int start = position(); | 789 int start = position(); |
| 790 Advance(); | 790 Advance(); |
| 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 792 if (current() == '}') { | 792 if (current() == '}') { |
| 793 Advance(); | 793 Advance(); |
| 794 return true; | 794 return true; |
| 795 } | 795 } |
| 796 } | 796 } |
| 797 Reset(start); | 797 Reset(start); |
| 798 return false; | 798 return false; |
| 799 } | 799 } |
| 800 // \u but no {, or \u{...} escapes not allowed. | 800 // \u but no {, or \u{...} escapes not allowed. |
| 801 return ParseHexEscape(4, value); | 801 bool result = ParseHexEscape(4, value); |
| 802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && | |
| 803 current() == '\\') { | |
| 804 // Attempt to read trail surrogate. | |
| 805 int start = position(); | |
| 806 if (Next() == 'u') { | |
| 807 Advance(2); | |
| 808 uc32 trail; | |
| 809 if (ParseHexEscape(4, &trail) && | |
| 810 unibrow::Utf16::IsTrailSurrogate(trail)) { | |
| 811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), | |
| 812 static_cast<uc16>(trail)); | |
| 813 return true; | |
| 814 } | |
| 815 } | |
| 816 Reset(start); | |
| 817 } | |
| 818 return result; | |
| 802 } | 819 } |
| 803 | 820 |
| 804 | 821 |
| 805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { | 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
| 806 uc32 x = 0; | 823 uc32 x = 0; |
| 807 int d = HexValue(current()); | 824 int d = HexValue(current()); |
| 808 if (d < 0) { | 825 if (d < 0) { |
| 809 return false; | 826 return false; |
| 810 } | 827 } |
| 811 while (d >= 0) { | 828 while (d >= 0) { |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 872 return '\\'; | 889 return '\\'; |
| 873 } | 890 } |
| 874 case '0': | 891 case '0': |
| 875 case '1': | 892 case '1': |
| 876 case '2': | 893 case '2': |
| 877 case '3': | 894 case '3': |
| 878 case '4': | 895 case '4': |
| 879 case '5': | 896 case '5': |
| 880 case '6': | 897 case '6': |
| 881 case '7': | 898 case '7': |
| 882 // For compatibility, we interpret a decimal escape that isn't | |
| 883 // a back reference (and therefore either \0 or not valid according | |
| 884 // to the specification) as a 1..3 digit octal character code. | |
| 885 if (unicode()) { | 899 if (unicode()) { |
| 900 // \0 is interpreted as \u0000 if it is not followed by another digit. | |
| 901 if (current() == '0') { | |
| 902 Advance(); | |
| 903 if (current() < '0' || current() > '9') return 0; | |
|
> Yang (2016/02/09 19:18:31): This now matches how we parse \<digit> outside of … [comment truncated in capture]
| |
| 904 } | |
| 886 // With /u, decimal escape is not interpreted as octal character code. | 905 // With /u, decimal escape is not interpreted as octal character code. |
| 887 ReportError(CStrVector("Invalid class escape")); | 906 ReportError(CStrVector("Invalid class escape")); |
| 888 return 0; | 907 return 0; |
| 889 } | 908 } |
| 909 // For backward compatibility, we interpret escaped digit from 0 to 7 as | |
| 910 // a 1..3 digit octal character code. | |
| 890 return ParseOctalLiteral(); | 911 return ParseOctalLiteral(); |
| 891 case 'x': { | 912 case 'x': { |
| 892 Advance(); | 913 Advance(); |
| 893 uc32 value; | 914 uc32 value; |
| 894 if (ParseHexEscape(2, &value)) return value; | 915 if (ParseHexEscape(2, &value)) return value; |
| 895 if (unicode()) { | 916 if (unicode()) { |
| 896 // With /u, invalid escapes are not treated as identity escapes. | 917 // With /u, invalid escapes are not treated as identity escapes. |
| 897 ReportError(CStrVector("Invalid escape")); | 918 ReportError(CStrVector("Invalid escape")); |
| 898 return 0; | 919 return 0; |
| 899 } | 920 } |
| 900 // If \x is not followed by a two-digit hexadecimal, treat it | 921 // If \x is not followed by a two-digit hexadecimal, treat it |
| 901 // as an identity escape. | 922 // as an identity escape. |
| 902 return 'x'; | 923 return 'x'; |
| 903 } | 924 } |
| 904 case 'u': { | 925 case 'u': { |
| 905 Advance(); | 926 Advance(); |
| 906 uc32 value; | 927 uc32 value; |
| 907 if (ParseUnicodeEscape(&value)) return value; | 928 if (ParseUnicodeEscape(&value)) return value; |
| 908 if (unicode()) { | 929 if (unicode()) { |
| 909 // With /u, invalid escapes are not treated as identity escapes. | 930 // With /u, invalid escapes are not treated as identity escapes. |
| 910 ReportError(CStrVector("Invalid unicode escape")); | 931 ReportError(CStrVector("Invalid unicode escape")); |
| 911 return 0; | 932 return 0; |
| 912 } | 933 } |
| 913 // If \u is not followed by a two-digit hexadecimal, treat it | 934 // If \u is not followed by a two-digit hexadecimal, treat it |
| 914 // as an identity escape. | 935 // as an identity escape. |
| 915 return 'u'; | 936 return 'u'; |
| 916 } | 937 } |
| 917 default: { | 938 default: { |
| 918 uc32 result = current(); | 939 uc32 result = current(); |
| 919 // With /u, no identity escapes except for syntax characters are | 940 // With /u, no identity escapes except for syntax characters and '-' are |
| 920 // allowed. Otherwise, all identity escapes are allowed. | 941 // allowed. Otherwise, all identity escapes are allowed. |
| 921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 942 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
| 922 Advance(); | 943 Advance(); |
| 923 return result; | 944 return result; |
| 924 } | 945 } |
| 925 ReportError(CStrVector("Invalid escape")); | 946 ReportError(CStrVector("Invalid escape")); |
| 926 return 0; | 947 return 0; |
| 927 } | 948 } |
| 928 } | 949 } |
| 929 return 0; | 950 return 0; |
| 930 } | 951 } |
| 931 | 952 |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 947 } | 968 } |
| 948 case kEndMarker: | 969 case kEndMarker: |
| 949 return ReportError(CStrVector("\\ at end of pattern")); | 970 return ReportError(CStrVector("\\ at end of pattern")); |
| 950 default: | 971 default: |
| 951 first = ParseClassCharacterEscape(CHECK_FAILED); | 972 first = ParseClassCharacterEscape(CHECK_FAILED); |
| 952 } | 973 } |
| 953 } else { | 974 } else { |
| 954 Advance(); | 975 Advance(); |
| 955 } | 976 } |
| 956 | 977 |
| 957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
| 958 // Combine with possibly following trail surrogate. | |
| 959 int start = position(); | |
| 960 uc32 second = current(); | |
| 961 if (second == '\\') { | |
| 962 second = ParseClassCharacterEscape(CHECK_FAILED); | |
| 963 } else { | |
| 964 Advance(); | |
| 965 } | |
| 966 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
| 967 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
| 968 } else { | |
| 969 Reset(start); | |
| 970 } | |
| 971 } | |
| 972 | |
| 973 return CharacterRange::Singleton(first); | 978 return CharacterRange::Singleton(first); |
| 974 } | 979 } |
| 975 | 980 |
| 976 | 981 |
| 977 static const uc16 kNoCharClass = 0; | 982 static const uc16 kNoCharClass = 0; |
| 978 | 983 |
| 979 // Adds range or pre-defined character class to character ranges. | 984 // Adds range or pre-defined character class to character ranges. |
| 980 // If char_class is not kInvalidClass, it's interpreted as a class | 985 // If char_class is not kInvalidClass, it's interpreted as a class |
| 981 // escape (i.e., 's' means whitespace, from '\s'). | 986 // escape (i.e., 's' means whitespace, from '\s'). |
| 982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 987 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| (...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1196 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| 1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1197 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| 1193 AddLeadSurrogate(c); | 1198 AddLeadSurrogate(c); |
| 1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1199 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1195 AddTrailSurrogate(c); | 1200 AddTrailSurrogate(c); |
| 1196 } else { | 1201 } else { |
| 1197 AddCharacter(static_cast<uc16>(c)); | 1202 AddCharacter(static_cast<uc16>(c)); |
| 1198 } | 1203 } |
| 1199 } | 1204 } |
| 1200 | 1205 |
| 1206 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { | |
| 1207 // A lead or trail surrogate parsed via escape sequence will not | |
| 1208 // pair up with any preceding lead or following trail surrogate. | |
| 1209 FlushPendingSurrogate(); | |
| 1210 AddUnicodeCharacter(character); | |
| 1211 FlushPendingSurrogate(); | |
| 1212 } | |
| 1201 | 1213 |
| 1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1214 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1203 | 1215 |
| 1204 | 1216 |
| 1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1217 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1206 if (NeedsDesugaringForUnicode(cc)) { | 1218 if (NeedsDesugaringForUnicode(cc)) { |
| 1207 // With /u, character class needs to be desugared, so it | 1219 // With /u, character class needs to be desugared, so it |
| 1208 // must be a standalone term instead of being part of a RegExpText. | 1220 // must be a standalone term instead of being part of a RegExpText. |
| 1209 AddTerm(cc); | 1221 AddTerm(cc); |
| 1210 } else { | 1222 } else { |
| (...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1362 return false; | 1374 return false; |
| 1363 } | 1375 } |
| 1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1376 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1365 zone()); | 1377 zone()); |
| 1366 LAST(ADD_TERM); | 1378 LAST(ADD_TERM); |
| 1367 return true; | 1379 return true; |
| 1368 } | 1380 } |
| 1369 | 1381 |
| 1370 } // namespace internal | 1382 } // namespace internal |
| 1371 } // namespace v8 | 1383 } // namespace v8 |
| OLD | NEW |