Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1774623005: [regexp] support \p in character classes. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: rebase Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-property-char-class.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after
351 RegExpCharacterClass* cc = 351 RegExpCharacterClass* cc =
352 new (zone()) RegExpCharacterClass(ranges, false); 352 new (zone()) RegExpCharacterClass(ranges, false);
353 builder->AddCharacterClass(cc); 353 builder->AddCharacterClass(cc);
354 break; 354 break;
355 } 355 }
356 case 'p': 356 case 'p':
357 case 'P': { 357 case 'P': {
358 uc32 p = Next(); 358 uc32 p = Next();
359 Advance(2); 359 Advance(2);
360 if (unicode()) { 360 if (unicode()) {
361 ZoneList<CharacterRange>* ranges = ParsePropertyClass(); 361 if (FLAG_harmony_regexp_property) {
362 if (ranges == nullptr) { 362 ZoneList<CharacterRange>* ranges =
363 return ReportError(CStrVector("Invalid property name")); 363 new (zone()) ZoneList<CharacterRange>(2, zone());
364 if (!ParsePropertyClass(ranges)) {
365 return ReportError(CStrVector("Invalid property name"));
366 }
367 RegExpCharacterClass* cc =
368 new (zone()) RegExpCharacterClass(ranges, p == 'P');
369 builder->AddCharacterClass(cc);
370 } else {
371 // With /u, no identity escapes except for syntax characters
372 // are allowed. Otherwise, all identity escapes are allowed.
373 return ReportError(CStrVector("Invalid escape"));
364 } 374 }
365 RegExpCharacterClass* cc =
366 new (zone()) RegExpCharacterClass(ranges, p == 'P');
367 builder->AddCharacterClass(cc);
368 } else { 375 } else {
369 builder->AddCharacter(p); 376 builder->AddCharacter(p);
370 } 377 }
371 break; 378 break;
372 } 379 }
373 case '1': 380 case '1':
374 case '2': 381 case '2':
375 case '3': 382 case '3':
376 case '4': 383 case '4':
377 case '5': 384 case '5':
(...skipping 451 matching lines...) Expand 10 before | Expand all | Expand 10 after
829 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), 836 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
830 static_cast<uc16>(trail)); 837 static_cast<uc16>(trail));
831 return true; 838 return true;
832 } 839 }
833 } 840 }
834 Reset(start); 841 Reset(start);
835 } 842 }
836 return result; 843 return result;
837 } 844 }
838 845
839 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { 846 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
840 #ifdef V8_I18N_SUPPORT 847 #ifdef V8_I18N_SUPPORT
841 ZoneList<char> property_name(0, zone()); 848 ZoneList<char> property_name(0, zone());
842 if (current() == '{') { 849 if (current() == '{') {
843 for (Advance(); current() != '}'; Advance()) { 850 for (Advance(); current() != '}'; Advance()) {
844 if (!has_next()) return nullptr; 851 if (!has_next()) return false;
845 property_name.Add(static_cast<char>(current()), zone()); 852 property_name.Add(static_cast<char>(current()), zone());
846 } 853 }
847 } else if (current() != kEndMarker) { 854 } else if (current() != kEndMarker) {
848 property_name.Add(static_cast<char>(current()), zone()); 855 property_name.Add(static_cast<char>(current()), zone());
849 } else { 856 } else {
850 return nullptr; 857 return false;
851 } 858 }
852 Advance(); 859 Advance();
853 property_name.Add(0, zone()); // null-terminate string. 860 property_name.Add(0, zone()); // null-terminate string.
854 861
855 // Property names are defined in unicode database files. For aliases of 862 // Property names are defined in unicode database files. For aliases of
856 // these property names, see PropertyValueAliases.txt. 863 // these property names, see PropertyValueAliases.txt.
857 UProperty kPropertyClasses[] = { 864 UProperty kPropertyClasses[] = {
858 // General_Category (gc) found in PropertyValueAliases.txt 865 // General_Category (gc) found in PropertyValueAliases.txt
859 UCHAR_GENERAL_CATEGORY_MASK, 866 UCHAR_GENERAL_CATEGORY_MASK,
860 // Script (sc) found in Scripts.txt 867 // Script (sc) found in Scripts.txt
861 UCHAR_SCRIPT, 868 UCHAR_SCRIPT,
862 }; 869 };
863 870
864 for (int i = 0; i < arraysize(kPropertyClasses); i++) { 871 for (int i = 0; i < arraysize(kPropertyClasses); i++) {
865 UProperty property_class = kPropertyClasses[i]; 872 UProperty property_class = kPropertyClasses[i];
866 int32_t category = u_getPropertyValueEnum( 873 int32_t category = u_getPropertyValueEnum(
867 property_class, property_name.ToConstVector().start()); 874 property_class, property_name.ToConstVector().start());
868 if (category == UCHAR_INVALID_CODE) continue; 875 if (category == UCHAR_INVALID_CODE) continue;
869 876
870 USet* set = uset_openEmpty(); 877 USet* set = uset_openEmpty();
871 UErrorCode ec = U_ZERO_ERROR; 878 UErrorCode ec = U_ZERO_ERROR;
872 uset_applyIntPropertyValue(set, property_class, category, &ec); 879 uset_applyIntPropertyValue(set, property_class, category, &ec);
873 ZoneList<CharacterRange>* ranges = nullptr;
874 if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) { 880 if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
875 uset_removeAllStrings(set); 881 uset_removeAllStrings(set);
876 int item_count = uset_getItemCount(set); 882 int item_count = uset_getItemCount(set);
877 ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
878 int item_result = 0; 883 int item_result = 0;
879 for (int i = 0; i < item_count; i++) { 884 for (int i = 0; i < item_count; i++) {
880 uc32 start = 0; 885 uc32 start = 0;
881 uc32 end = 0; 886 uc32 end = 0;
882 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 887 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
883 ranges->Add(CharacterRange::Range(start, end), zone()); 888 result->Add(CharacterRange::Range(start, end), zone());
884 } 889 }
885 DCHECK_EQ(U_ZERO_ERROR, ec); 890 DCHECK_EQ(U_ZERO_ERROR, ec);
886 DCHECK_EQ(0, item_result); 891 DCHECK_EQ(0, item_result);
887 } 892 }
888 uset_close(set); 893 uset_close(set);
889 return ranges; 894 return true;
890 } 895 }
891 #endif // V8_I18N_SUPPORT 896 #endif // V8_I18N_SUPPORT
892 897
893 return nullptr; 898 return false;
894 } 899 }
895 900
896 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 901 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
897 uc32 x = 0; 902 uc32 x = 0;
898 int d = HexValue(current()); 903 int d = HexValue(current());
899 if (d < 0) { 904 if (d < 0) {
900 return false; 905 return false;
901 } 906 }
902 while (d >= 0) { 907 while (d >= 0) {
903 x = x * 16 + d; 908 x = x * 16 + d;
(...skipping 159 matching lines...) Expand 10 before | Expand all | Expand 10 after
1063 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 1068 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1064 uc16 char_class, CharacterRange range, 1069 uc16 char_class, CharacterRange range,
1065 Zone* zone) { 1070 Zone* zone) {
1066 if (char_class != kNoCharClass) { 1071 if (char_class != kNoCharClass) {
1067 CharacterRange::AddClassEscape(char_class, ranges, zone); 1072 CharacterRange::AddClassEscape(char_class, ranges, zone);
1068 } else { 1073 } else {
1069 ranges->Add(range, zone); 1074 ranges->Add(range, zone);
1070 } 1075 }
1071 } 1076 }
1072 1077
1078 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1079 if (!FLAG_harmony_regexp_property) return false;
1080 if (!unicode()) return false;
1081 if (current() != '\\') return false;
1082 uc32 next = Next();
1083 bool parse_success = false;
1084 if (next == 'p') {
1085 Advance(2);
1086 parse_success = ParsePropertyClass(ranges);
1087 } else if (next == 'P') {
1088 Advance(2);
1089 ZoneList<CharacterRange>* property_class =
1090 new (zone()) ZoneList<CharacterRange>(2, zone());
1091 parse_success = ParsePropertyClass(property_class);
1092 if (parse_success) {
1093 ZoneList<CharacterRange>* negated =
1094 new (zone()) ZoneList<CharacterRange>(2, zone());
1095 CharacterRange::Negate(property_class, negated, zone());
1096 const Vector<CharacterRange> negated_vector = negated->ToVector();
1097 ranges->AddAll(negated_vector, zone());
1098 }
1099 } else {
1100 return false;
1101 }
1102 if (!parse_success)
1103 ReportError(CStrVector("Invalid property name in character class"));
1104 return parse_success;
1105 }
1073 1106
1074 RegExpTree* RegExpParser::ParseCharacterClass() { 1107 RegExpTree* RegExpParser::ParseCharacterClass() {
1075 static const char* kUnterminated = "Unterminated character class"; 1108 static const char* kUnterminated = "Unterminated character class";
1076 static const char* kRangeInvalid = "Invalid character class"; 1109 static const char* kRangeInvalid = "Invalid character class";
1077 static const char* kRangeOutOfOrder = "Range out of order in character class"; 1110 static const char* kRangeOutOfOrder = "Range out of order in character class";
1078 1111
1079 DCHECK_EQ(current(), '['); 1112 DCHECK_EQ(current(), '[');
1080 Advance(); 1113 Advance();
1081 bool is_negated = false; 1114 bool is_negated = false;
1082 if (current() == '^') { 1115 if (current() == '^') {
1083 is_negated = true; 1116 is_negated = true;
1084 Advance(); 1117 Advance();
1085 } 1118 }
1086 ZoneList<CharacterRange>* ranges = 1119 ZoneList<CharacterRange>* ranges =
1087 new (zone()) ZoneList<CharacterRange>(2, zone()); 1120 new (zone()) ZoneList<CharacterRange>(2, zone());
1088 while (has_more() && current() != ']') { 1121 while (has_more() && current() != ']') {
1122 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1123 if (parsed_property) continue;
1089 uc16 char_class = kNoCharClass; 1124 uc16 char_class = kNoCharClass;
1090 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 1125 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1091 if (current() == '-') { 1126 if (current() == '-') {
1092 Advance(); 1127 Advance();
1093 if (current() == kEndMarker) { 1128 if (current() == kEndMarker) {
1094 // If we reach the end we break out of the loop and let the 1129 // If we reach the end we break out of the loop and let the
1095 // following code report an error. 1130 // following code report an error.
1096 break; 1131 break;
1097 } else if (current() == ']') { 1132 } else if (current() == ']') {
1098 AddRangeOrEscape(ranges, char_class, first, zone()); 1133 AddRangeOrEscape(ranges, char_class, first, zone());
(...skipping 351 matching lines...) Expand 10 before | Expand all | Expand 10 after
1450 return false; 1485 return false;
1451 } 1486 }
1452 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1487 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1453 zone()); 1488 zone());
1454 LAST(ADD_TERM); 1489 LAST(ADD_TERM);
1455 return true; 1490 return true;
1456 } 1491 }
1457 1492
1458 } // namespace internal 1493 } // namespace internal
1459 } // namespace v8 1494 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/regexp-property-char-class.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698