| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| (...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 454 } else { | 454 } else { |
| 455 // With /u, invalid escapes are not treated as identity escapes. | 455 // With /u, invalid escapes are not treated as identity escapes. |
| 456 return ReportError(CStrVector("Invalid escape")); | 456 return ReportError(CStrVector("Invalid escape")); |
| 457 } | 457 } |
| 458 break; | 458 break; |
| 459 } | 459 } |
| 460 case 'u': { | 460 case 'u': { |
| 461 Advance(2); | 461 Advance(2); |
| 462 uc32 value; | 462 uc32 value; |
| 463 if (ParseUnicodeEscape(&value)) { | 463 if (ParseUnicodeEscape(&value)) { |
| 464 builder->AddUnicodeCharacter(value); | 464 builder->AddEscapedUnicodeCharacter(value); |
| 465 } else if (!unicode()) { | 465 } else if (!unicode()) { |
| 466 builder->AddCharacter('u'); | 466 builder->AddCharacter('u'); |
| 467 } else { | 467 } else { |
| 468 // With /u, invalid escapes are not treated as identity escapes. | 468 // With /u, invalid escapes are not treated as identity escapes. |
| 469 return ReportError(CStrVector("Invalid unicode escape")); | 469 return ReportError(CStrVector("Invalid unicode escape")); |
| 470 } | 470 } |
| 471 break; | 471 break; |
| 472 } | 472 } |
| 473 default: | 473 default: |
| 474 Advance(); | 474 Advance(); |
| (...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 773 Reset(start); | 773 Reset(start); |
| 774 return false; | 774 return false; |
| 775 } | 775 } |
| 776 val = val * 16 + d; | 776 val = val * 16 + d; |
| 777 Advance(); | 777 Advance(); |
| 778 } | 778 } |
| 779 *value = val; | 779 *value = val; |
| 780 return true; | 780 return true; |
| 781 } | 781 } |
| 782 | 782 |
| 783 | 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 786 // allowed). In the latter case, the number of hex digits between { } is | 786 // allowed). In the latter case, the number of hex digits between { } is |
| 787 // arbitrary. \ and u have already been read. | 787 // arbitrary. \ and u have already been read. |
| 788 if (current() == '{' && unicode()) { | 788 if (current() == '{' && unicode()) { |
| 789 int start = position(); | 789 int start = position(); |
| 790 Advance(); | 790 Advance(); |
| 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 792 if (current() == '}') { | 792 if (current() == '}') { |
| 793 Advance(); | 793 Advance(); |
| 794 return true; | 794 return true; |
| 795 } | 795 } |
| 796 } | 796 } |
| 797 Reset(start); | 797 Reset(start); |
| 798 return false; | 798 return false; |
| 799 } | 799 } |
| 800 // \u but no {, or \u{...} escapes not allowed. | 800 // \u but no {, or \u{...} escapes not allowed. |
| 801 return ParseHexEscape(4, value); | 801 bool result = ParseHexEscape(4, value); |
| 802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| 803 current() == '\\') { |
| 804 // Attempt to read trail surrogate. |
| 805 int start = position(); |
| 806 if (Next() == 'u') { |
| 807 Advance(2); |
| 808 uc32 trail; |
| 809 if (ParseHexEscape(4, &trail) && |
| 810 unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| 812 static_cast<uc16>(trail)); |
| 813 return true; |
| 814 } |
| 815 } |
| 816 Reset(start); |
| 817 } |
| 818 return result; |
| 802 } | 819 } |
| 803 | 820 |
| 804 | 821 |
| 805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { | 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
| 806 uc32 x = 0; | 823 uc32 x = 0; |
| 807 int d = HexValue(current()); | 824 int d = HexValue(current()); |
| 808 if (d < 0) { | 825 if (d < 0) { |
| 809 return false; | 826 return false; |
| 810 } | 827 } |
| 811 while (d >= 0) { | 828 while (d >= 0) { |
| (...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 865 if ((controlLetter >= '0' && controlLetter <= '9') || | 882 if ((controlLetter >= '0' && controlLetter <= '9') || |
| 866 controlLetter == '_') { | 883 controlLetter == '_') { |
| 867 Advance(2); | 884 Advance(2); |
| 868 return controlLetter & 0x1f; | 885 return controlLetter & 0x1f; |
| 869 } | 886 } |
| 870 // We match JSC in reading the backslash as a literal | 887 // We match JSC in reading the backslash as a literal |
| 871 // character instead of as starting an escape. | 888 // character instead of as starting an escape. |
| 872 return '\\'; | 889 return '\\'; |
| 873 } | 890 } |
| 874 case '0': | 891 case '0': |
| 892 // With /u, \0 is interpreted as NUL if not followed by another digit. |
| 893 if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| 894 Advance(); |
| 895 return 0; |
| 896 } |
| 897 // Fall through. |
| 875 case '1': | 898 case '1': |
| 876 case '2': | 899 case '2': |
| 877 case '3': | 900 case '3': |
| 878 case '4': | 901 case '4': |
| 879 case '5': | 902 case '5': |
| 880 case '6': | 903 case '6': |
| 881 case '7': | 904 case '7': |
| 882 // For compatibility, we interpret a decimal escape that isn't | 905 // For compatibility, we interpret a decimal escape that isn't |
| 883 // a back reference (and therefore either \0 or not valid according | 906 // a back reference (and therefore either \0 or not valid according |
| 884 // to the specification) as a 1..3 digit octal character code. | 907 // to the specification) as a 1..3 digit octal character code. |
| (...skipping 24 matching lines...) Expand all Loading... |
| 909 // With /u, invalid escapes are not treated as identity escapes. | 932 // With /u, invalid escapes are not treated as identity escapes. |
| 910 ReportError(CStrVector("Invalid unicode escape")); | 933 ReportError(CStrVector("Invalid unicode escape")); |
| 911 return 0; | 934 return 0; |
| 912 } | 935 } |
| 913 // If \u is not followed by a four-digit hexadecimal, treat it | 936 // If \u is not followed by a four-digit hexadecimal, treat it |
| 914 // as an identity escape. | 937 // as an identity escape. |
| 915 return 'u'; | 938 return 'u'; |
| 916 } | 939 } |
| 917 default: { | 940 default: { |
| 918 uc32 result = current(); | 941 uc32 result = current(); |
| 919 // With /u, no identity escapes except for syntax characters are | 942 // With /u, no identity escapes except for syntax characters and '-' are |
| 920 // allowed. Otherwise, all identity escapes are allowed. | 943 // allowed. Otherwise, all identity escapes are allowed. |
| 921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 944 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
| 922 Advance(); | 945 Advance(); |
| 923 return result; | 946 return result; |
| 924 } | 947 } |
| 925 ReportError(CStrVector("Invalid escape")); | 948 ReportError(CStrVector("Invalid escape")); |
| 926 return 0; | 949 return 0; |
| 927 } | 950 } |
| 928 } | 951 } |
| 929 return 0; | 952 return 0; |
| 930 } | 953 } |
| 931 | 954 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 947 } | 970 } |
| 948 case kEndMarker: | 971 case kEndMarker: |
| 949 return ReportError(CStrVector("\\ at end of pattern")); | 972 return ReportError(CStrVector("\\ at end of pattern")); |
| 950 default: | 973 default: |
| 951 first = ParseClassCharacterEscape(CHECK_FAILED); | 974 first = ParseClassCharacterEscape(CHECK_FAILED); |
| 952 } | 975 } |
| 953 } else { | 976 } else { |
| 954 Advance(); | 977 Advance(); |
| 955 } | 978 } |
| 956 | 979 |
| 957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
| 958 // Combine with possibly following trail surrogate. | |
| 959 int start = position(); | |
| 960 uc32 second = current(); | |
| 961 if (second == '\\') { | |
| 962 second = ParseClassCharacterEscape(CHECK_FAILED); | |
| 963 } else { | |
| 964 Advance(); | |
| 965 } | |
| 966 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
| 967 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
| 968 } else { | |
| 969 Reset(start); | |
| 970 } | |
| 971 } | |
| 972 | |
| 973 return CharacterRange::Singleton(first); | 980 return CharacterRange::Singleton(first); |
| 974 } | 981 } |
| 975 | 982 |
| 976 | 983 |
| 977 static const uc16 kNoCharClass = 0; | 984 static const uc16 kNoCharClass = 0; |
| 978 | 985 |
| 979 // Adds range or pre-defined character class to character ranges. | 986 // Adds range or pre-defined character class to character ranges. |
| 980 // If char_class is not kNoCharClass, it's interpreted as a class | 987 // If char_class is not kNoCharClass, it's interpreted as a class |
| 981 // escape (i.e., 's' means whitespace, from '\s'). | 988 // escape (i.e., 's' means whitespace, from '\s'). |
| 982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 989 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| (...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1198 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| 1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1199 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| 1193 AddLeadSurrogate(c); | 1200 AddLeadSurrogate(c); |
| 1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1201 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1195 AddTrailSurrogate(c); | 1202 AddTrailSurrogate(c); |
| 1196 } else { | 1203 } else { |
| 1197 AddCharacter(static_cast<uc16>(c)); | 1204 AddCharacter(static_cast<uc16>(c)); |
| 1198 } | 1205 } |
| 1199 } | 1206 } |
| 1200 | 1207 |
| 1208 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| 1209 // A lead or trail surrogate parsed via escape sequence will not |
| 1210 // pair up with any preceding lead or following trail surrogate. |
| 1211 FlushPendingSurrogate(); |
| 1212 AddUnicodeCharacter(character); |
| 1213 FlushPendingSurrogate(); |
| 1214 } |
| 1201 | 1215 |
| 1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1216 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1203 | 1217 |
| 1204 | 1218 |
| 1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1219 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1206 if (NeedsDesugaringForUnicode(cc)) { | 1220 if (NeedsDesugaringForUnicode(cc)) { |
| 1207 // With /u, character class needs to be desugared, so it | 1221 // With /u, character class needs to be desugared, so it |
| 1208 // must be a standalone term instead of being part of a RegExpText. | 1222 // must be a standalone term instead of being part of a RegExpText. |
| 1209 AddTerm(cc); | 1223 AddTerm(cc); |
| 1210 } else { | 1224 } else { |
| (...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1362 return false; | 1376 return false; |
| 1363 } | 1377 } |
| 1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1378 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1365 zone()); | 1379 zone()); |
| 1366 LAST(ADD_TERM); | 1380 LAST(ADD_TERM); |
| 1367 return true; | 1381 return true; |
| 1368 } | 1382 } |
| 1369 | 1383 |
| 1370 } // namespace internal | 1384 } // namespace internal |
| 1371 } // namespace v8 | 1385 } // namespace v8 |
| OLD | NEW |