OLD | NEW |
---|---|
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
454 } else { | 454 } else { |
455 // With /u, invalid escapes are not treated as identity escapes. | 455 // With /u, invalid escapes are not treated as identity escapes. |
456 return ReportError(CStrVector("Invalid escape")); | 456 return ReportError(CStrVector("Invalid escape")); |
457 } | 457 } |
458 break; | 458 break; |
459 } | 459 } |
460 case 'u': { | 460 case 'u': { |
461 Advance(2); | 461 Advance(2); |
462 uc32 value; | 462 uc32 value; |
463 if (ParseUnicodeEscape(&value)) { | 463 if (ParseUnicodeEscape(&value)) { |
464 builder->AddUnicodeCharacter(value); | 464 builder->AddEscapedUnicodeCharacter(value); |
465 } else if (!unicode()) { | 465 } else if (!unicode()) { |
466 builder->AddCharacter('u'); | 466 builder->AddCharacter('u'); |
467 } else { | 467 } else { |
468 // With /u, invalid escapes are not treated as identity escapes. | 468 // With /u, invalid escapes are not treated as identity escapes. |
469 return ReportError(CStrVector("Invalid unicode escape")); | 469 return ReportError(CStrVector("Invalid unicode escape")); |
470 } | 470 } |
471 break; | 471 break; |
472 } | 472 } |
473 default: | 473 default: |
474 Advance(); | 474 Advance(); |
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
773 Reset(start); | 773 Reset(start); |
774 return false; | 774 return false; |
775 } | 775 } |
776 val = val * 16 + d; | 776 val = val * 16 + d; |
777 Advance(); | 777 Advance(); |
778 } | 778 } |
779 *value = val; | 779 *value = val; |
780 return true; | 780 return true; |
781 } | 781 } |
782 | 782 |
783 | 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
786 // allowed). In the latter case, the number of hex digits between { } is | 786 // allowed). In the latter case, the number of hex digits between { } is |
787 // arbitrary. \ and u have already been read. | 787 // arbitrary. \ and u have already been read. |
788 if (current() == '{' && unicode()) { | 788 if (current() == '{' && unicode()) { |
789 int start = position(); | 789 int start = position(); |
790 Advance(); | 790 Advance(); |
791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
792 if (current() == '}') { | 792 if (current() == '}') { |
793 Advance(); | 793 Advance(); |
794 return true; | 794 return true; |
795 } | 795 } |
796 } | 796 } |
797 Reset(start); | 797 Reset(start); |
798 return false; | 798 return false; |
799 } | 799 } |
800 // \u but no {, or \u{...} escapes not allowed. | 800 // \u but no {, or \u{...} escapes not allowed. |
801 return ParseHexEscape(4, value); | 801 bool result = ParseHexEscape(4, value); |
802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && | |
803 current() == '\\') { | |
804 // Attempt to read trail surrogate. | |
805 int start = position(); | |
806 if (Next() == 'u') { | |
807 Advance(2); | |
808 uc32 trail; | |
809 if (ParseHexEscape(4, &trail) && | |
810 unibrow::Utf16::IsTrailSurrogate(trail)) { | |
811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), | |
812 static_cast<uc16>(trail)); | |
813 return true; | |
814 } | |
815 } | |
816 Reset(start); | |
817 } | |
818 return result; | |
802 } | 819 } |
803 | 820 |
804 | 821 |
805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { | 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
806 uc32 x = 0; | 823 uc32 x = 0; |
807 int d = HexValue(current()); | 824 int d = HexValue(current()); |
808 if (d < 0) { | 825 if (d < 0) { |
809 return false; | 826 return false; |
810 } | 827 } |
811 while (d >= 0) { | 828 while (d >= 0) { |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
872 return '\\'; | 889 return '\\'; |
873 } | 890 } |
874 case '0': | 891 case '0': |
875 case '1': | 892 case '1': |
876 case '2': | 893 case '2': |
877 case '3': | 894 case '3': |
878 case '4': | 895 case '4': |
879 case '5': | 896 case '5': |
880 case '6': | 897 case '6': |
881 case '7': | 898 case '7': |
882 // For compatibility, we interpret a decimal escape that isn't | |
883 // a back reference (and therefore either \0 or not valid according | |
884 // to the specification) as a 1..3 digit octal character code. | |
885 if (unicode()) { | 899 if (unicode()) { |
900 // \0 is interpreted as \u0000 if it is not followed by another digit. | |
901 if (current() == '0') { | |
902 Advance(); | |
903 if (current() < '0' || current() > '9') return 0; | |
Yang (2016/02/09 19:18:31): This now matches how we parse \<digit> outside of [comment truncated in source; presumably "character classes"]
| |
904 } | |
886 // With /u, decimal escape is not interpreted as octal character code. | 905 // With /u, decimal escape is not interpreted as octal character code. |
887 ReportError(CStrVector("Invalid class escape")); | 906 ReportError(CStrVector("Invalid class escape")); |
888 return 0; | 907 return 0; |
889 } | 908 } |
909 // For backward compatibility, we interpret an escaped digit from 0 to 7 | |
910 // as a 1..3 digit octal character code. | |
890 return ParseOctalLiteral(); | 911 return ParseOctalLiteral(); |
891 case 'x': { | 912 case 'x': { |
892 Advance(); | 913 Advance(); |
893 uc32 value; | 914 uc32 value; |
894 if (ParseHexEscape(2, &value)) return value; | 915 if (ParseHexEscape(2, &value)) return value; |
895 if (unicode()) { | 916 if (unicode()) { |
896 // With /u, invalid escapes are not treated as identity escapes. | 917 // With /u, invalid escapes are not treated as identity escapes. |
897 ReportError(CStrVector("Invalid escape")); | 918 ReportError(CStrVector("Invalid escape")); |
898 return 0; | 919 return 0; |
899 } | 920 } |
900 // If \x is not followed by a two-digit hexadecimal, treat it | 921 // If \x is not followed by a two-digit hexadecimal, treat it |
901 // as an identity escape. | 922 // as an identity escape. |
902 return 'x'; | 923 return 'x'; |
903 } | 924 } |
904 case 'u': { | 925 case 'u': { |
905 Advance(); | 926 Advance(); |
906 uc32 value; | 927 uc32 value; |
907 if (ParseUnicodeEscape(&value)) return value; | 928 if (ParseUnicodeEscape(&value)) return value; |
908 if (unicode()) { | 929 if (unicode()) { |
909 // With /u, invalid escapes are not treated as identity escapes. | 930 // With /u, invalid escapes are not treated as identity escapes. |
910 ReportError(CStrVector("Invalid unicode escape")); | 931 ReportError(CStrVector("Invalid unicode escape")); |
911 return 0; | 932 return 0; |
912 } | 933 } |
913 // If \u is not followed by a four-digit hexadecimal, treat it | 934 // If \u is not followed by a four-digit hexadecimal, treat it |
914 // as an identity escape. | 935 // as an identity escape. |
915 return 'u'; | 936 return 'u'; |
916 } | 937 } |
917 default: { | 938 default: { |
918 uc32 result = current(); | 939 uc32 result = current(); |
919 // With /u, no identity escapes except for syntax characters are | 940 // With /u, no identity escapes except for syntax characters and '-' are |
920 // allowed. Otherwise, all identity escapes are allowed. | 941 // allowed. Otherwise, all identity escapes are allowed. |
921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 942 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
922 Advance(); | 943 Advance(); |
923 return result; | 944 return result; |
924 } | 945 } |
925 ReportError(CStrVector("Invalid escape")); | 946 ReportError(CStrVector("Invalid escape")); |
926 return 0; | 947 return 0; |
927 } | 948 } |
928 } | 949 } |
929 return 0; | 950 return 0; |
930 } | 951 } |
931 | 952 |
(...skipping 15 matching lines...) Expand all Loading... | |
947 } | 968 } |
948 case kEndMarker: | 969 case kEndMarker: |
949 return ReportError(CStrVector("\\ at end of pattern")); | 970 return ReportError(CStrVector("\\ at end of pattern")); |
950 default: | 971 default: |
951 first = ParseClassCharacterEscape(CHECK_FAILED); | 972 first = ParseClassCharacterEscape(CHECK_FAILED); |
952 } | 973 } |
953 } else { | 974 } else { |
954 Advance(); | 975 Advance(); |
955 } | 976 } |
956 | 977 |
957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
958 // Combine with possibly following trail surrogate. | |
959 int start = position(); | |
960 uc32 second = current(); | |
961 if (second == '\\') { | |
962 second = ParseClassCharacterEscape(CHECK_FAILED); | |
963 } else { | |
964 Advance(); | |
965 } | |
966 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
967 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
968 } else { | |
969 Reset(start); | |
970 } | |
971 } | |
972 | |
973 return CharacterRange::Singleton(first); | 978 return CharacterRange::Singleton(first); |
974 } | 979 } |
975 | 980 |
976 | 981 |
977 static const uc16 kNoCharClass = 0; | 982 static const uc16 kNoCharClass = 0; |
978 | 983 |
979 // Adds range or pre-defined character class to character ranges. | 984 // Adds range or pre-defined character class to character ranges. |
980 // If char_class is not kInvalidClass, it's interpreted as a class | 985 // If char_class is not kInvalidClass, it's interpreted as a class |
981 // escape (i.e., 's' means whitespace, from '\s'). | 986 // escape (i.e., 's' means whitespace, from '\s'). |
982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 987 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1196 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1197 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1193 AddLeadSurrogate(c); | 1198 AddLeadSurrogate(c); |
1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1199 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
1195 AddTrailSurrogate(c); | 1200 AddTrailSurrogate(c); |
1196 } else { | 1201 } else { |
1197 AddCharacter(static_cast<uc16>(c)); | 1202 AddCharacter(static_cast<uc16>(c)); |
1198 } | 1203 } |
1199 } | 1204 } |
1200 | 1205 |
1206 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { | |
1207 // A lead or trail surrogate parsed via escape sequence will not | |
1208 // pair up with any preceding lead or following trail surrogate. | |
1209 FlushPendingSurrogate(); | |
1210 AddUnicodeCharacter(character); | |
1211 FlushPendingSurrogate(); | |
1212 } | |
1201 | 1213 |
1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1214 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1203 | 1215 |
1204 | 1216 |
1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1217 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1206 if (NeedsDesugaringForUnicode(cc)) { | 1218 if (NeedsDesugaringForUnicode(cc)) { |
1207 // With /u, character class needs to be desugared, so it | 1219 // With /u, character class needs to be desugared, so it |
1208 // must be a standalone term instead of being part of a RegExpText. | 1220 // must be a standalone term instead of being part of a RegExpText. |
1209 AddTerm(cc); | 1221 AddTerm(cc); |
1210 } else { | 1222 } else { |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1362 return false; | 1374 return false; |
1363 } | 1375 } |
1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1376 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1365 zone()); | 1377 zone()); |
1366 LAST(ADD_TERM); | 1378 LAST(ADD_TERM); |
1367 return true; | 1379 return true; |
1368 } | 1380 } |
1369 | 1381 |
1370 } // namespace internal | 1382 } // namespace internal |
1371 } // namespace v8 | 1383 } // namespace v8 |
OLD | NEW |