OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 460 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
471 } else { | 471 } else { |
472 // With /u, invalid escapes are not treated as identity escapes. | 472 // With /u, invalid escapes are not treated as identity escapes. |
473 return ReportError(CStrVector("Invalid escape")); | 473 return ReportError(CStrVector("Invalid escape")); |
474 } | 474 } |
475 break; | 475 break; |
476 } | 476 } |
477 case 'u': { | 477 case 'u': { |
478 Advance(2); | 478 Advance(2); |
479 uc32 value; | 479 uc32 value; |
480 if (ParseUnicodeEscape(&value)) { | 480 if (ParseUnicodeEscape(&value)) { |
481 builder->AddUnicodeCharacter(value); | 481 builder->AddEscapedUnicodeCharacter(value); |
482 } else if (!unicode()) { | 482 } else if (!unicode()) { |
483 builder->AddCharacter('u'); | 483 builder->AddCharacter('u'); |
484 } else { | 484 } else { |
485 // With /u, invalid escapes are not treated as identity escapes. | 485 // With /u, invalid escapes are not treated as identity escapes. |
486 return ReportError(CStrVector("Invalid unicode escape")); | 486 return ReportError(CStrVector("Invalid unicode escape")); |
487 } | 487 } |
488 break; | 488 break; |
489 } | 489 } |
490 default: | 490 default: |
491 Advance(); | 491 Advance(); |
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
790 Reset(start); | 790 Reset(start); |
791 return false; | 791 return false; |
792 } | 792 } |
793 val = val * 16 + d; | 793 val = val * 16 + d; |
794 Advance(); | 794 Advance(); |
795 } | 795 } |
796 *value = val; | 796 *value = val; |
797 return true; | 797 return true; |
798 } | 798 } |
799 | 799 |
800 | 800 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
// ParseUnicodeEscape: reads the body of a \u escape and stores the parsed
// code point in *value. The brace form returns true only when a closing '}'
// follows the hex number; the plain form returns whatever
// ParseHexEscape(4, value) returns.
// The rows below are a two-column (OLD | NEW) diff: the surrogate-pair
// handling after ParseHexEscape(4, value) exists only in the NEW column.
801 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 801 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
803 // allowed). In the latter case, the number of hex digits between { } is | 803 // allowed). In the latter case, the number of hex digits between { } is |
804 // arbitrary. \ and u have already been read. | 804 // arbitrary. \ and u have already been read. |
805 if (current() == '{' && unicode()) { | 805 if (current() == '{' && unicode()) { |
806 int start = position(); | 806 int start = position(); |
807 Advance(); | 807 Advance(); |
// 0x10ffff is passed as the first argument; presumably the maximum accepted
// value (the helper's body is not visible here -- confirm).
808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
809 if (current() == '}') { | 809 if (current() == '}') { |
810 Advance(); | 810 Advance(); |
811 return true; | 811 return true; |
812 } | 812 } |
813 } | 813 } |
// Malformed \u{...}: rewind to where the brace form began and fail.
814 Reset(start); | 814 Reset(start); |
815 return false; | 815 return false; |
816 } | 816 } |
817 // \u but no {, or \u{...} escapes not allowed. | 817 // \u but no {, or \u{...} escapes not allowed. |
818 return ParseHexEscape(4, value); | 818 bool result = ParseHexEscape(4, value); |
// NEW only: with /u, when the first escape yielded a lead surrogate and a
// backslash follows, try to read a second \uXXXX; if it is a trail
// surrogate the two are combined into a single code point in *value.
| 819 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| 820 current() == '\\') { |
| 821 // Attempt to read trail surrogate. |
| 822 int start = position(); |
| 823 if (Next() == 'u') { |
| 824 Advance(2); |
| 825 uc32 trail; |
| 826 if (ParseHexEscape(4, &trail) && |
| 827 unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 828 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| 829 static_cast<uc16>(trail)); |
| 830 return true; |
| 831 } |
| 832 } |
// No trail surrogate followed: rewind to the position saved after the first
// escape and fall through, leaving *value as the lone lead surrogate.
| 833 Reset(start); |
| 834 } |
| 835 return result; |
819 } | 836 } |
820 | 837 |
821 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { | 838 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { |
822 #ifdef V8_I18N_SUPPORT | 839 #ifdef V8_I18N_SUPPORT |
823 char property_name[3]; | 840 char property_name[3]; |
824 memset(property_name, 0, sizeof(property_name)); | 841 memset(property_name, 0, sizeof(property_name)); |
825 if (current() == '{') { | 842 if (current() == '{') { |
826 Advance(); | 843 Advance(); |
827 if (current() < 'A' || current() > 'Z') return nullptr; | 844 if (current() < 'A' || current() > 'Z') return nullptr; |
828 property_name[0] = static_cast<char>(current()); | 845 property_name[0] = static_cast<char>(current()); |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
931 if ((controlLetter >= '0' && controlLetter <= '9') || | 948 if ((controlLetter >= '0' && controlLetter <= '9') || |
932 controlLetter == '_') { | 949 controlLetter == '_') { |
933 Advance(2); | 950 Advance(2); |
934 return controlLetter & 0x1f; | 951 return controlLetter & 0x1f; |
935 } | 952 } |
936 // We match JSC in reading the backslash as a literal | 953 // We match JSC in reading the backslash as a literal |
937 // character instead of as starting an escape. | 954 // character instead of as starting an escape. |
938 return '\\'; | 955 return '\\'; |
939 } | 956 } |
940 case '0': | 957 case '0': |
| 958 // With /u, \0 is interpreted as NUL if not followed by another digit. |
| 959 if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| 960 Advance(); |
| 961 return 0; |
| 962 } |
| 963 // Fall through. |
941 case '1': | 964 case '1': |
942 case '2': | 965 case '2': |
943 case '3': | 966 case '3': |
944 case '4': | 967 case '4': |
945 case '5': | 968 case '5': |
946 case '6': | 969 case '6': |
947 case '7': | 970 case '7': |
948 // For compatibility, we interpret a decimal escape that isn't | 971 // For compatibility, we interpret a decimal escape that isn't |
949 // a back reference (and therefore either \0 or not valid according | 972 // a back reference (and therefore either \0 or not valid according |
950 // to the specification) as a 1..3 digit octal character code. | 973 // to the specification) as a 1..3 digit octal character code. |
(...skipping 24 matching lines...) Expand all Loading... |
975 // With /u, invalid escapes are not treated as identity escapes. | 998 // With /u, invalid escapes are not treated as identity escapes. |
976 ReportError(CStrVector("Invalid unicode escape")); | 999 ReportError(CStrVector("Invalid unicode escape")); |
977 return 0; | 1000 return 0; |
978 } | 1001 } |
979 // If \u is not followed by a four-digit hexadecimal, treat it | 1002 // If \u is not followed by a four-digit hexadecimal, treat it |
980 // as an identity escape. | 1003 // as an identity escape. |
981 return 'u'; | 1004 return 'u'; |
982 } | 1005 } |
983 default: { | 1006 default: { |
984 uc32 result = current(); | 1007 uc32 result = current(); |
985 // With /u, no identity escapes except for syntax characters are | 1008 // With /u, no identity escapes except for syntax characters and '-' are |
986 // allowed. Otherwise, all identity escapes are allowed. | 1009 // allowed. Otherwise, all identity escapes are allowed. |
987 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 1010 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
988 Advance(); | 1011 Advance(); |
989 return result; | 1012 return result; |
990 } | 1013 } |
991 ReportError(CStrVector("Invalid escape")); | 1014 ReportError(CStrVector("Invalid escape")); |
992 return 0; | 1015 return 0; |
993 } | 1016 } |
994 } | 1017 } |
995 return 0; | 1018 return 0; |
996 } | 1019 } |
997 | 1020 |
(...skipping 15 matching lines...) Expand all Loading... |
1013 } | 1036 } |
1014 case kEndMarker: | 1037 case kEndMarker: |
1015 return ReportError(CStrVector("\\ at end of pattern")); | 1038 return ReportError(CStrVector("\\ at end of pattern")); |
1016 default: | 1039 default: |
1017 first = ParseClassCharacterEscape(CHECK_FAILED); | 1040 first = ParseClassCharacterEscape(CHECK_FAILED); |
1018 } | 1041 } |
1019 } else { | 1042 } else { |
1020 Advance(); | 1043 Advance(); |
1021 } | 1044 } |
1022 | 1045 |
1023 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
1024 // Combine with possibly following trail surrogate. | |
1025 int start = position(); | |
1026 uc32 second = current(); | |
1027 if (second == '\\') { | |
1028 second = ParseClassCharacterEscape(CHECK_FAILED); | |
1029 } else { | |
1030 Advance(); | |
1031 } | |
1032 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
1033 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
1034 } else { | |
1035 Reset(start); | |
1036 } | |
1037 } | |
1038 | |
1039 return CharacterRange::Singleton(first); | 1046 return CharacterRange::Singleton(first); |
1040 } | 1047 } |
1041 | 1048 |
1042 | 1049 |
// uc16 constant with value 0.
// NOTE(review): presumably the "no class escape" sentinel for the char_class
// argument of AddRangeOrEscape; the comment on that function calls it
// kInvalidClass -- confirm against the function body, which is not visible here.
1043 static const uc16 kNoCharClass = 0; | 1050 static const uc16 kNoCharClass = 0; |
1044 | 1051 |
1045 // Adds range or pre-defined character class to character ranges. | 1052 // Adds range or pre-defined character class to character ranges. |
1046 // If char_class is not kInvalidClass, it's interpreted as a class | 1053 // If char_class is not kInvalidClass, it's interpreted as a class |
1047 // escape (i.e., 's' means whitespace, from '\s'). | 1054 // escape (i.e., 's' means whitespace, from '\s'). |
1048 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 1055 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1257 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1264 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1258 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1265 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1259 AddLeadSurrogate(c); | 1266 AddLeadSurrogate(c); |
1260 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1267 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
1261 AddTrailSurrogate(c); | 1268 AddTrailSurrogate(c); |
1262 } else { | 1269 } else { |
1263 AddCharacter(static_cast<uc16>(c)); | 1270 AddCharacter(static_cast<uc16>(c)); |
1264 } | 1271 } |
1265 } | 1272 } |
1266 | 1273 |
// AddEscapedUnicodeCharacter (NEW column only): adds a code point that was
// written as an escape sequence. It calls FlushPendingSurrogate() both before
// and after AddUnicodeCharacter(character); the order of the three calls is
// what isolates the escaped character from its neighbours.
| 1274 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| 1275 // A lead or trail surrogate parsed via escape sequence will not |
| 1276 // pair up with any preceding lead or following trail surrogate. |
| 1277 FlushPendingSurrogate(); |
| 1278 AddUnicodeCharacter(character); |
| 1279 FlushPendingSurrogate(); |
| 1280 } |
1267 | 1281 |
// AddEmpty: sets pending_empty_ to true and does nothing else here.
// NOTE(review): how pending_empty_ is consumed is not visible in this excerpt.
1268 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1282 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1269 | 1283 |
1270 | 1284 |
1271 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1285 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1272 if (NeedsDesugaringForUnicode(cc)) { | 1286 if (NeedsDesugaringForUnicode(cc)) { |
1273 // With /u, character class needs to be desugared, so it | 1287 // With /u, character class needs to be desugared, so it |
1274 // must be a standalone term instead of being part of a RegExpText. | 1288 // must be a standalone term instead of being part of a RegExpText. |
1275 AddTerm(cc); | 1289 AddTerm(cc); |
1276 } else { | 1290 } else { |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1428 return false; | 1442 return false; |
1429 } | 1443 } |
1430 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1444 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1431 zone()); | 1445 zone()); |
1432 LAST(ADD_TERM); | 1446 LAST(ADD_TERM); |
1433 return true; | 1447 return true; |
1434 } | 1448 } |
1435 | 1449 |
1436 } // namespace internal | 1450 } // namespace internal |
1437 } // namespace v8 | 1451 } // namespace v8 |
OLD | NEW |