OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
454 } else { | 454 } else { |
455 // With /u, invalid escapes are not treated as identity escapes. | 455 // With /u, invalid escapes are not treated as identity escapes. |
456 return ReportError(CStrVector("Invalid escape")); | 456 return ReportError(CStrVector("Invalid escape")); |
457 } | 457 } |
458 break; | 458 break; |
459 } | 459 } |
460 case 'u': { | 460 case 'u': { |
461 Advance(2); | 461 Advance(2); |
462 uc32 value; | 462 uc32 value; |
463 if (ParseUnicodeEscape(&value)) { | 463 if (ParseUnicodeEscape(&value)) { |
464 builder->AddUnicodeCharacter(value); | 464 builder->AddEscapedUnicodeCharacter(value); |
465 } else if (!unicode()) { | 465 } else if (!unicode()) { |
466 builder->AddCharacter('u'); | 466 builder->AddCharacter('u'); |
467 } else { | 467 } else { |
468 // With /u, invalid escapes are not treated as identity escapes. | 468 // With /u, invalid escapes are not treated as identity escapes. |
469 return ReportError(CStrVector("Invalid unicode escape")); | 469 return ReportError(CStrVector("Invalid unicode escape")); |
470 } | 470 } |
471 break; | 471 break; |
472 } | 472 } |
473 default: | 473 default: |
474 Advance(); | 474 Advance(); |
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
773 Reset(start); | 773 Reset(start); |
774 return false; | 774 return false; |
775 } | 775 } |
776 val = val * 16 + d; | 776 val = val * 16 + d; |
777 Advance(); | 777 Advance(); |
778 } | 778 } |
779 *value = val; | 779 *value = val; |
780 return true; | 780 return true; |
781 } | 781 } |
782 | 782 |
783 | 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
786 // allowed). In the latter case, the number of hex digits between { } is | 786 // allowed). In the latter case, the number of hex digits between { } is |
787 // arbitrary. \ and u have already been read. | 787 // arbitrary. \ and u have already been read. |
788 if (current() == '{' && unicode()) { | 788 if (current() == '{' && unicode()) { |
789 int start = position(); | 789 int start = position(); |
790 Advance(); | 790 Advance(); |
791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
792 if (current() == '}') { | 792 if (current() == '}') { |
793 Advance(); | 793 Advance(); |
794 return true; | 794 return true; |
795 } | 795 } |
796 } | 796 } |
797 Reset(start); | 797 Reset(start); |
798 return false; | 798 return false; |
799 } | 799 } |
800 // \u but no {, or \u{...} escapes not allowed. | 800 // \u but no {, or \u{...} escapes not allowed. |
801 return ParseHexEscape(4, value); | 801 bool result = ParseHexEscape(4, value); |
| 802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| 803 current() == '\\') { |
| 804 // Attempt to read trail surrogate. |
| 805 int start = position(); |
| 806 if (Next() == 'u') { |
| 807 Advance(2); |
| 808 uc32 trail; |
| 809 if (ParseHexEscape(4, &trail) && |
| 810 unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| 812 static_cast<uc16>(trail)); |
| 813 return true; |
| 814 } |
| 815 } |
| 816 Reset(start); |
| 817 } |
| 818 return result; |
802 } | 819 } |
803 | 820 |
804 | 821 |
805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { | 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
806 uc32 x = 0; | 823 uc32 x = 0; |
807 int d = HexValue(current()); | 824 int d = HexValue(current()); |
808 if (d < 0) { | 825 if (d < 0) { |
809 return false; | 826 return false; |
810 } | 827 } |
811 while (d >= 0) { | 828 while (d >= 0) { |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
865 if ((controlLetter >= '0' && controlLetter <= '9') || | 882 if ((controlLetter >= '0' && controlLetter <= '9') || |
866 controlLetter == '_') { | 883 controlLetter == '_') { |
867 Advance(2); | 884 Advance(2); |
868 return controlLetter & 0x1f; | 885 return controlLetter & 0x1f; |
869 } | 886 } |
870 // We match JSC in reading the backslash as a literal | 887 // We match JSC in reading the backslash as a literal |
871 // character instead of as starting an escape. | 888 // character instead of as starting an escape. |
872 return '\\'; | 889 return '\\'; |
873 } | 890 } |
874 case '0': | 891 case '0': |
| 892 // With /u, \0 is interpreted as NUL if not followed by another digit. |
| 893 if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| 894 Advance(); |
| 895 return 0; |
| 896 } |
| 897 // Fall through. |
875 case '1': | 898 case '1': |
876 case '2': | 899 case '2': |
877 case '3': | 900 case '3': |
878 case '4': | 901 case '4': |
879 case '5': | 902 case '5': |
880 case '6': | 903 case '6': |
881 case '7': | 904 case '7': |
882 // For compatibility, we interpret a decimal escape that isn't | 905 // For compatibility, we interpret a decimal escape that isn't |
883 // a back reference (and therefore either \0 or not valid according | 906 // a back reference (and therefore either \0 or not valid according |
884 // to the specification) as a 1..3 digit octal character code. | 907 // to the specification) as a 1..3 digit octal character code. |
(...skipping 24 matching lines...) Expand all Loading... |
909 // With /u, invalid escapes are not treated as identity escapes. | 932 // With /u, invalid escapes are not treated as identity escapes. |
910 ReportError(CStrVector("Invalid unicode escape")); | 933 ReportError(CStrVector("Invalid unicode escape")); |
911 return 0; | 934 return 0; |
912 } | 935 } |
913 // If \u is not followed by a four-digit hexadecimal, treat it | 936 // If \u is not followed by a four-digit hexadecimal, treat it |
914 // as an identity escape. | 937 // as an identity escape. |
915 return 'u'; | 938 return 'u'; |
916 } | 939 } |
917 default: { | 940 default: { |
918 uc32 result = current(); | 941 uc32 result = current(); |
919 // With /u, no identity escapes except for syntax characters are | 942 // With /u, no identity escapes except for syntax characters and '-' are |
920 // allowed. Otherwise, all identity escapes are allowed. | 943 // allowed. Otherwise, all identity escapes are allowed. |
921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 944 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
922 Advance(); | 945 Advance(); |
923 return result; | 946 return result; |
924 } | 947 } |
925 ReportError(CStrVector("Invalid escape")); | 948 ReportError(CStrVector("Invalid escape")); |
926 return 0; | 949 return 0; |
927 } | 950 } |
928 } | 951 } |
929 return 0; | 952 return 0; |
930 } | 953 } |
931 | 954 |
(...skipping 15 matching lines...) Expand all Loading... |
947 } | 970 } |
948 case kEndMarker: | 971 case kEndMarker: |
949 return ReportError(CStrVector("\\ at end of pattern")); | 972 return ReportError(CStrVector("\\ at end of pattern")); |
950 default: | 973 default: |
951 first = ParseClassCharacterEscape(CHECK_FAILED); | 974 first = ParseClassCharacterEscape(CHECK_FAILED); |
952 } | 975 } |
953 } else { | 976 } else { |
954 Advance(); | 977 Advance(); |
955 } | 978 } |
956 | 979 |
957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
958 // Combine with possibly following trail surrogate. | |
959 int start = position(); | |
960 uc32 second = current(); | |
961 if (second == '\\') { | |
962 second = ParseClassCharacterEscape(CHECK_FAILED); | |
963 } else { | |
964 Advance(); | |
965 } | |
966 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
967 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
968 } else { | |
969 Reset(start); | |
970 } | |
971 } | |
972 | |
973 return CharacterRange::Singleton(first); | 980 return CharacterRange::Singleton(first); |
974 } | 981 } |
975 | 982 |
976 | 983 |
977 static const uc16 kNoCharClass = 0; | 984 static const uc16 kNoCharClass = 0; |
978 | 985 |
979 // Adds range or pre-defined character class to character ranges. | 986 // Adds range or pre-defined character class to character ranges. |
980 // If char_class is not kNoCharClass, it's interpreted as a class | 987 // If char_class is not kNoCharClass, it's interpreted as a class |
981 // escape (i.e., 's' means whitespace, from '\s'). | 988 // escape (i.e., 's' means whitespace, from '\s'). |
982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 989 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1198 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1199 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1193 AddLeadSurrogate(c); | 1200 AddLeadSurrogate(c); |
1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1201 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
1195 AddTrailSurrogate(c); | 1202 AddTrailSurrogate(c); |
1196 } else { | 1203 } else { |
1197 AddCharacter(static_cast<uc16>(c)); | 1204 AddCharacter(static_cast<uc16>(c)); |
1198 } | 1205 } |
1199 } | 1206 } |
1200 | 1207 |
| 1208 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| 1209 // A lead or trail surrogate parsed via escape sequence will not |
| 1210 // pair up with any preceding lead or following trail surrogate. |
| 1211 FlushPendingSurrogate(); |
| 1212 AddUnicodeCharacter(character); |
| 1213 FlushPendingSurrogate(); |
| 1214 } |
1201 | 1215 |
1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1216 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1203 | 1217 |
1204 | 1218 |
1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1219 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1206 if (NeedsDesugaringForUnicode(cc)) { | 1220 if (NeedsDesugaringForUnicode(cc)) { |
1207 // With /u, character class needs to be desugared, so it | 1221 // With /u, character class needs to be desugared, so it |
1208 // must be a standalone term instead of being part of a RegExpText. | 1222 // must be a standalone term instead of being part of a RegExpText. |
1209 AddTerm(cc); | 1223 AddTerm(cc); |
1210 } else { | 1224 } else { |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1362 return false; | 1376 return false; |
1363 } | 1377 } |
1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1378 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1365 zone()); | 1379 zone()); |
1366 LAST(ADD_TERM); | 1380 LAST(ADD_TERM); |
1367 return true; | 1381 return true; |
1368 } | 1382 } |
1369 | 1383 |
1370 } // namespace internal | 1384 } // namespace internal |
1371 } // namespace v8 | 1385 } // namespace v8 |
OLD | NEW |