| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| (...skipping 460 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 471 } else { | 471 } else { |
| 472 // With /u, invalid escapes are not treated as identity escapes. | 472 // With /u, invalid escapes are not treated as identity escapes. |
| 473 return ReportError(CStrVector("Invalid escape")); | 473 return ReportError(CStrVector("Invalid escape")); |
| 474 } | 474 } |
| 475 break; | 475 break; |
| 476 } | 476 } |
| 477 case 'u': { | 477 case 'u': { |
| 478 Advance(2); | 478 Advance(2); |
| 479 uc32 value; | 479 uc32 value; |
| 480 if (ParseUnicodeEscape(&value)) { | 480 if (ParseUnicodeEscape(&value)) { |
| 481 builder->AddUnicodeCharacter(value); | 481 builder->AddEscapedUnicodeCharacter(value); |
| 482 } else if (!unicode()) { | 482 } else if (!unicode()) { |
| 483 builder->AddCharacter('u'); | 483 builder->AddCharacter('u'); |
| 484 } else { | 484 } else { |
| 485 // With /u, invalid escapes are not treated as identity escapes. | 485 // With /u, invalid escapes are not treated as identity escapes. |
| 486 return ReportError(CStrVector("Invalid unicode escape")); | 486 return ReportError(CStrVector("Invalid unicode escape")); |
| 487 } | 487 } |
| 488 break; | 488 break; |
| 489 } | 489 } |
| 490 default: | 490 default: |
| 491 Advance(); | 491 Advance(); |
| (...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 790 Reset(start); | 790 Reset(start); |
| 791 return false; | 791 return false; |
| 792 } | 792 } |
| 793 val = val * 16 + d; | 793 val = val * 16 + d; |
| 794 Advance(); | 794 Advance(); |
| 795 } | 795 } |
| 796 *value = val; | 796 *value = val; |
| 797 return true; | 797 return true; |
| 798 } | 798 } |
| 799 | 799 |
| 800 | 800 // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| 801 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 801 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 803 // allowed). In the latter case, the number of hex digits between { } is | 803 // allowed). In the latter case, the number of hex digits between { } is |
| 804 // arbitrary. \ and u have already been read. | 804 // arbitrary. \ and u have already been read. |
| 805 if (current() == '{' && unicode()) { | 805 if (current() == '{' && unicode()) { |
| 806 int start = position(); | 806 int start = position(); |
| 807 Advance(); | 807 Advance(); |
| 808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 809 if (current() == '}') { | 809 if (current() == '}') { |
| 810 Advance(); | 810 Advance(); |
| 811 return true; | 811 return true; |
| 812 } | 812 } |
| 813 } | 813 } |
| 814 Reset(start); | 814 Reset(start); |
| 815 return false; | 815 return false; |
| 816 } | 816 } |
| 817 // \u but no {, or \u{...} escapes not allowed. | 817 // \u but no {, or \u{...} escapes not allowed. |
| 818 return ParseHexEscape(4, value); | 818 bool result = ParseHexEscape(4, value); |
| 819 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| 820 current() == '\\') { |
| 821 // Attempt to read trail surrogate. |
| 822 int start = position(); |
| 823 if (Next() == 'u') { |
| 824 Advance(2); |
| 825 uc32 trail; |
| 826 if (ParseHexEscape(4, &trail) && |
| 827 unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 828 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| 829 static_cast<uc16>(trail)); |
| 830 return true; |
| 831 } |
| 832 } |
| 833 Reset(start); |
| 834 } |
| 835 return result; |
| 819 } | 836 } |
| 820 | 837 |
| 821 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { | 838 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { |
| 822 #ifdef V8_I18N_SUPPORT | 839 #ifdef V8_I18N_SUPPORT |
| 823 char property_name[3]; | 840 char property_name[3]; |
| 824 memset(property_name, 0, sizeof(property_name)); | 841 memset(property_name, 0, sizeof(property_name)); |
| 825 if (current() == '{') { | 842 if (current() == '{') { |
| 826 Advance(); | 843 Advance(); |
| 827 if (current() < 'A' || current() > 'Z') return nullptr; | 844 if (current() < 'A' || current() > 'Z') return nullptr; |
| 828 property_name[0] = static_cast<char>(current()); | 845 property_name[0] = static_cast<char>(current()); |
| (...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 931 if ((controlLetter >= '0' && controlLetter <= '9') || | 948 if ((controlLetter >= '0' && controlLetter <= '9') || |
| 932 controlLetter == '_') { | 949 controlLetter == '_') { |
| 933 Advance(2); | 950 Advance(2); |
| 934 return controlLetter & 0x1f; | 951 return controlLetter & 0x1f; |
| 935 } | 952 } |
| 936 // We match JSC in reading the backslash as a literal | 953 // We match JSC in reading the backslash as a literal |
| 937 // character instead of as starting an escape. | 954 // character instead of as starting an escape. |
| 938 return '\\'; | 955 return '\\'; |
| 939 } | 956 } |
| 940 case '0': | 957 case '0': |
| 958 // With /u, \0 is interpreted as NUL if not followed by another digit. |
| 959 if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| 960 Advance(); |
| 961 return 0; |
| 962 } |
| 963 // Fall through. |
| 941 case '1': | 964 case '1': |
| 942 case '2': | 965 case '2': |
| 943 case '3': | 966 case '3': |
| 944 case '4': | 967 case '4': |
| 945 case '5': | 968 case '5': |
| 946 case '6': | 969 case '6': |
| 947 case '7': | 970 case '7': |
| 948 // For compatibility, we interpret a decimal escape that isn't | 971 // For compatibility, we interpret a decimal escape that isn't |
| 949 // a back reference (and therefore either \0 or not valid according | 972 // a back reference (and therefore either \0 or not valid according |
| 950 // to the specification) as a 1..3 digit octal character code. | 973 // to the specification) as a 1..3 digit octal character code. |
| (...skipping 24 matching lines...) Expand all Loading... |
| 975 // With /u, invalid escapes are not treated as identity escapes. | 998 // With /u, invalid escapes are not treated as identity escapes. |
| 976 ReportError(CStrVector("Invalid unicode escape")); | 999 ReportError(CStrVector("Invalid unicode escape")); |
| 977 return 0; | 1000 return 0; |
| 978 } | 1001 } |
| 979 // If \u is not followed by a two-digit hexadecimal, treat it | 1002 // If \u is not followed by a two-digit hexadecimal, treat it |
| 980 // as an identity escape. | 1003 // as an identity escape. |
| 981 return 'u'; | 1004 return 'u'; |
| 982 } | 1005 } |
| 983 default: { | 1006 default: { |
| 984 uc32 result = current(); | 1007 uc32 result = current(); |
| 985 // With /u, no identity escapes except for syntax characters are | 1008 // With /u, no identity escapes except for syntax characters and '-' are |
| 986 // allowed. Otherwise, all identity escapes are allowed. | 1009 // allowed. Otherwise, all identity escapes are allowed. |
| 987 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 1010 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
| 988 Advance(); | 1011 Advance(); |
| 989 return result; | 1012 return result; |
| 990 } | 1013 } |
| 991 ReportError(CStrVector("Invalid escape")); | 1014 ReportError(CStrVector("Invalid escape")); |
| 992 return 0; | 1015 return 0; |
| 993 } | 1016 } |
| 994 } | 1017 } |
| 995 return 0; | 1018 return 0; |
| 996 } | 1019 } |
| 997 | 1020 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1013 } | 1036 } |
| 1014 case kEndMarker: | 1037 case kEndMarker: |
| 1015 return ReportError(CStrVector("\\ at end of pattern")); | 1038 return ReportError(CStrVector("\\ at end of pattern")); |
| 1016 default: | 1039 default: |
| 1017 first = ParseClassCharacterEscape(CHECK_FAILED); | 1040 first = ParseClassCharacterEscape(CHECK_FAILED); |
| 1018 } | 1041 } |
| 1019 } else { | 1042 } else { |
| 1020 Advance(); | 1043 Advance(); |
| 1021 } | 1044 } |
| 1022 | 1045 |
| 1023 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
| 1024 // Combine with possibly following trail surrogate. | |
| 1025 int start = position(); | |
| 1026 uc32 second = current(); | |
| 1027 if (second == '\\') { | |
| 1028 second = ParseClassCharacterEscape(CHECK_FAILED); | |
| 1029 } else { | |
| 1030 Advance(); | |
| 1031 } | |
| 1032 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
| 1033 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
| 1034 } else { | |
| 1035 Reset(start); | |
| 1036 } | |
| 1037 } | |
| 1038 | |
| 1039 return CharacterRange::Singleton(first); | 1046 return CharacterRange::Singleton(first); |
| 1040 } | 1047 } |
| 1041 | 1048 |
| 1042 | 1049 |
| 1043 static const uc16 kNoCharClass = 0; | 1050 static const uc16 kNoCharClass = 0; |
| 1044 | 1051 |
| 1045 // Adds range or pre-defined character class to character ranges. | 1052 // Adds range or pre-defined character class to character ranges. |
| 1046 // If char_class is not kInvalidClass, it's interpreted as a class | 1053 // If char_class is not kInvalidClass, it's interpreted as a class |
| 1047 // escape (i.e., 's' means whitespace, from '\s'). | 1054 // escape (i.e., 's' means whitespace, from '\s'). |
| 1048 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 1055 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| (...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1257 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1264 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| 1258 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1265 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| 1259 AddLeadSurrogate(c); | 1266 AddLeadSurrogate(c); |
| 1260 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1267 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1261 AddTrailSurrogate(c); | 1268 AddTrailSurrogate(c); |
| 1262 } else { | 1269 } else { |
| 1263 AddCharacter(static_cast<uc16>(c)); | 1270 AddCharacter(static_cast<uc16>(c)); |
| 1264 } | 1271 } |
| 1265 } | 1272 } |
| 1266 | 1273 |
| 1274 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| 1275 // A lead or trail surrogate parsed via escape sequence will not |
| 1276 // pair up with any preceding lead or following trail surrogate. |
| 1277 FlushPendingSurrogate(); |
| 1278 AddUnicodeCharacter(character); |
| 1279 FlushPendingSurrogate(); |
| 1280 } |
| 1267 | 1281 |
| 1268 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1282 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1269 | 1283 |
| 1270 | 1284 |
| 1271 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1285 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1272 if (NeedsDesugaringForUnicode(cc)) { | 1286 if (NeedsDesugaringForUnicode(cc)) { |
| 1273 // With /u, character class needs to be desugared, so it | 1287 // With /u, character class needs to be desugared, so it |
| 1274 // must be a standalone term instead of being part of a RegExpText. | 1288 // must be a standalone term instead of being part of a RegExpText. |
| 1275 AddTerm(cc); | 1289 AddTerm(cc); |
| 1276 } else { | 1290 } else { |
| (...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1428 return false; | 1442 return false; |
| 1429 } | 1443 } |
| 1430 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1444 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1431 zone()); | 1445 zone()); |
| 1432 LAST(ADD_TERM); | 1446 LAST(ADD_TERM); |
| 1433 return true; | 1447 return true; |
| 1434 } | 1448 } |
| 1435 | 1449 |
| 1436 } // namespace internal | 1450 } // namespace internal |
| 1437 } // namespace v8 | 1451 } // namespace v8 |
| OLD | NEW |