Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(193)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 2059113002: [regexp] implement \p{Any}, \p{Ascii}, and \p{Assigned}. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 344 matching lines...) Expand 10 before | Expand all | Expand 10 after
355 break; 355 break;
356 } 356 }
357 case 'p': 357 case 'p':
358 case 'P': { 358 case 'P': {
359 uc32 p = Next(); 359 uc32 p = Next();
360 Advance(2); 360 Advance(2);
361 if (unicode()) { 361 if (unicode()) {
362 if (FLAG_harmony_regexp_property) { 362 if (FLAG_harmony_regexp_property) {
363 ZoneList<CharacterRange>* ranges = 363 ZoneList<CharacterRange>* ranges =
364 new (zone()) ZoneList<CharacterRange>(2, zone()); 364 new (zone()) ZoneList<CharacterRange>(2, zone());
365 if (!ParsePropertyClass(ranges)) { 365 if (!ParsePropertyClass(ranges, p == 'P')) {
Dan Ehrenberg (2016/06/13 20:23:15): I'm a fan of this refactoring to match the spec be… [comment preview truncated]
mathias (2016/06/13 20:43:57): It wouldn’t match all code points, since non-Lette… [comment preview truncated]
366 return ReportError(CStrVector("Invalid property name")); 366 return ReportError(CStrVector("Invalid property name"));
367 } 367 }
368 RegExpCharacterClass* cc = 368 RegExpCharacterClass* cc =
369 new (zone()) RegExpCharacterClass(ranges, p == 'P'); 369 new (zone()) RegExpCharacterClass(ranges, false);
370 builder->AddCharacterClass(cc); 370 builder->AddCharacterClass(cc);
371 } else { 371 } else {
372 // With /u, no identity escapes except for syntax characters 372 // With /u, no identity escapes except for syntax characters
373 // are allowed. Otherwise, all identity escapes are allowed. 373 // are allowed. Otherwise, all identity escapes are allowed.
374 return ReportError(CStrVector("Invalid escape")); 374 return ReportError(CStrVector("Invalid escape"));
375 } 375 }
376 } else { 376 } else {
377 builder->AddCharacter(p); 377 builder->AddCharacter(p);
378 } 378 }
379 break; 379 break;
(...skipping 458 matching lines...) Expand 10 before | Expand all | Expand 10 after
838 static_cast<uc16>(trail)); 838 static_cast<uc16>(trail));
839 return true; 839 return true;
840 } 840 }
841 } 841 }
842 Reset(start); 842 Reset(start);
843 } 843 }
844 return result; 844 return result;
845 } 845 }
846 846
847 #ifdef V8_I18N_SUPPORT 847 #ifdef V8_I18N_SUPPORT
848
849 namespace {
850
848 bool IsExactPropertyAlias(const char* property_name, UProperty property) { 851 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
849 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 852 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
850 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; 853 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
851 for (int i = 0;; i++) { 854 for (int i = 0;; i++) {
852 const char* long_name = u_getPropertyName( 855 const char* long_name = u_getPropertyName(
853 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 856 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
854 if (long_name == NULL) break; 857 if (long_name == NULL) break;
855 if (strcmp(property_name, long_name) == 0) return true; 858 if (strcmp(property_name, long_name) == 0) return true;
856 } 859 }
857 return false; 860 return false;
(...skipping 10 matching lines...) Expand all
868 const char* long_name = u_getPropertyValueName( 871 const char* long_name = u_getPropertyValueName(
869 property, property_value, 872 property, property_value,
870 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 873 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
871 if (long_name == NULL) break; 874 if (long_name == NULL) break;
872 if (strcmp(property_value_name, long_name) == 0) return true; 875 if (strcmp(property_value_name, long_name) == 0) return true;
873 } 876 }
874 return false; 877 return false;
875 } 878 }
876 879
877 bool LookupPropertyValueName(UProperty property, 880 bool LookupPropertyValueName(UProperty property,
878 const char* property_value_name, 881 const char* property_value_name, bool negate,
879 ZoneList<CharacterRange>* result, Zone* zone) { 882 ZoneList<CharacterRange>* result, Zone* zone) {
880 int32_t property_value = 883 int32_t property_value =
881 u_getPropertyValueEnum(property, property_value_name); 884 u_getPropertyValueEnum(property, property_value_name);
882 if (property_value == UCHAR_INVALID_CODE) return false; 885 if (property_value == UCHAR_INVALID_CODE) return false;
883 886
884 // We require the property name to match exactly to one of the property value 887 // We require the property name to match exactly to one of the property value
885 // aliases. However, u_getPropertyValueEnum uses loose matching. 888 // aliases. However, u_getPropertyValueEnum uses loose matching.
886 if (!IsExactPropertyValueAlias(property_value_name, property, 889 if (!IsExactPropertyValueAlias(property_value_name, property,
887 property_value)) { 890 property_value)) {
888 return false; 891 return false;
889 } 892 }
890 893
891 USet* set = uset_openEmpty(); 894 USet* set = uset_openEmpty();
892 UErrorCode ec = U_ZERO_ERROR; 895 UErrorCode ec = U_ZERO_ERROR;
893 uset_applyIntPropertyValue(set, property, property_value, &ec); 896 uset_applyIntPropertyValue(set, property, property_value, &ec);
894 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set); 897 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
895 898
896 if (success) { 899 if (success) {
897 uset_removeAllStrings(set); 900 uset_removeAllStrings(set);
901 if (negate) uset_complement(set);
898 int item_count = uset_getItemCount(set); 902 int item_count = uset_getItemCount(set);
899 int item_result = 0; 903 int item_result = 0;
900 for (int i = 0; i < item_count; i++) { 904 for (int i = 0; i < item_count; i++) {
901 uc32 start = 0; 905 uc32 start = 0;
902 uc32 end = 0; 906 uc32 end = 0;
903 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 907 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
904 result->Add(CharacterRange::Range(start, end), zone); 908 result->Add(CharacterRange::Range(start, end), zone);
905 } 909 }
906 DCHECK_EQ(U_ZERO_ERROR, ec); 910 DCHECK_EQ(U_ZERO_ERROR, ec);
907 DCHECK_EQ(0, item_result); 911 DCHECK_EQ(0, item_result);
908 } 912 }
909 uset_close(set); 913 uset_close(set);
910 return success; 914 return success;
911 } 915 }
912 916
913 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) { 917 template <size_t N>
918 inline bool NameEquals(const char* name, const char (&literal)[N]) {
919 return strncmp(name, literal, N + 1) == 0;
920 }
921
922 bool LookupSpecialPropertyValueName(const char* name,
923 ZoneList<CharacterRange>* result,
924 bool negate, Zone* zone) {
925 if (NameEquals(name, "Any")) {
926 if (!negate) result->Add(CharacterRange::Everything(), zone);
927 } else if (NameEquals(name, "ASCII")) {
928 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
929 : CharacterRange::Range(0x0, 0x7f),
930 zone);
931 } else if (NameEquals(name, "Assigned")) {
932 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
933 !negate, result, zone);
934 } else {
935 return false;
936 }
937 return true;
938 }
939
940 } // anonymous namespace
941
942 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
943 bool negate) {
914 // Parse the property class as follows: 944 // Parse the property class as follows:
915 // - In \p{name}, 'name' is interpreted 945 // - In \p{name}, 'name' is interpreted
916 // - either as a general category property value name. 946 // - either as a general category property value name.
917 // - or as a binary property name. 947 // - or as a binary property name.
918 // - In \p{name=value}, 'name' is interpreted as an enumerated property name, 948 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
919 // and 'value' is interpreted as one of the available property value names. 949 // and 'value' is interpreted as one of the available property value names.
920 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. 950 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
921 // - Loose matching is not applied. 951 // - Loose matching is not applied.
922 List<char> first_part; 952 List<char> first_part;
923 List<char> second_part; 953 List<char> second_part;
(...skipping 12 matching lines...) Expand all
936 } 966 }
937 } else { 967 } else {
938 return false; 968 return false;
939 } 969 }
940 Advance(); 970 Advance();
941 first_part.Add(0); // null-terminate string. 971 first_part.Add(0); // null-terminate string.
942 972
943 if (second_part.is_empty()) { 973 if (second_part.is_empty()) {
944 // First attempt to interpret as general category property value name. 974 // First attempt to interpret as general category property value name.
945 const char* name = first_part.ToConstVector().start(); 975 const char* name = first_part.ToConstVector().start();
946 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result, 976 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
947 zone())) { 977 result, zone())) {
978 return true;
979 }
980 // Interpret "Any", "ASCII", and "Assigned".
981 if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
948 return true; 982 return true;
949 } 983 }
950 // Then attempt to interpret as binary property name with value name 'Y'. 984 // Then attempt to interpret as binary property name with value name 'Y'.
951 UProperty property = u_getPropertyEnum(name); 985 UProperty property = u_getPropertyEnum(name);
952 if (property < UCHAR_BINARY_START) return false; 986 if (property < UCHAR_BINARY_START) return false;
953 if (property >= UCHAR_BINARY_LIMIT) return false; 987 if (property >= UCHAR_BINARY_LIMIT) return false;
954 if (!IsExactPropertyAlias(name, property)) return false; 988 if (!IsExactPropertyAlias(name, property)) return false;
955 return LookupPropertyValueName(property, "Y", result, zone()); 989 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
990 zone());
956 } else { 991 } else {
957 // Both property name and value name are specified. Attempt to interpret 992 // Both property name and value name are specified. Attempt to interpret
958 // the property name as enumerated property. 993 // the property name as enumerated property.
959 const char* property_name = first_part.ToConstVector().start(); 994 const char* property_name = first_part.ToConstVector().start();
960 const char* value_name = second_part.ToConstVector().start(); 995 const char* value_name = second_part.ToConstVector().start();
961 UProperty property = u_getPropertyEnum(property_name); 996 UProperty property = u_getPropertyEnum(property_name);
962 if (property < UCHAR_INT_START) return false; 997 if (property < UCHAR_INT_START) return false;
963 if (property >= UCHAR_INT_LIMIT) return false; 998 if (property >= UCHAR_INT_LIMIT) return false;
964 if (!IsExactPropertyAlias(property_name, property)) return false; 999 if (!IsExactPropertyAlias(property_name, property)) return false;
965 return LookupPropertyValueName(property, value_name, result, zone()); 1000 return LookupPropertyValueName(property, value_name, negate, result,
1001 zone());
966 } 1002 }
967 } 1003 }
968 1004
969 #else // V8_I18N_SUPPORT 1005 #else // V8_I18N_SUPPORT
970 1006
971 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) { 1007 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
972 return false; 1008 return false;
973 } 1009 }
974 1010
975 #endif // V8_I18N_SUPPORT 1011 #endif // V8_I18N_SUPPORT
(...skipping 176 matching lines...) Expand 10 before | Expand all | Expand 10 after
1152 } 1188 }
1153 1189
1154 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) { 1190 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1155 if (!FLAG_harmony_regexp_property) return false; 1191 if (!FLAG_harmony_regexp_property) return false;
1156 if (!unicode()) return false; 1192 if (!unicode()) return false;
1157 if (current() != '\\') return false; 1193 if (current() != '\\') return false;
1158 uc32 next = Next(); 1194 uc32 next = Next();
1159 bool parse_success = false; 1195 bool parse_success = false;
1160 if (next == 'p') { 1196 if (next == 'p') {
1161 Advance(2); 1197 Advance(2);
1162 parse_success = ParsePropertyClass(ranges); 1198 parse_success = ParsePropertyClass(ranges, false);
1163 } else if (next == 'P') { 1199 } else if (next == 'P') {
1164 Advance(2); 1200 Advance(2);
1165 ZoneList<CharacterRange>* property_class = 1201 parse_success = ParsePropertyClass(ranges, true);
1166 new (zone()) ZoneList<CharacterRange>(2, zone());
1167 parse_success = ParsePropertyClass(property_class);
1168 if (parse_success) {
1169 ZoneList<CharacterRange>* negated =
1170 new (zone()) ZoneList<CharacterRange>(2, zone());
1171 CharacterRange::Negate(property_class, negated, zone());
1172 const Vector<CharacterRange> negated_vector = negated->ToVector();
1173 ranges->AddAll(negated_vector, zone());
1174 }
1175 } else { 1202 } else {
1176 return false; 1203 return false;
1177 } 1204 }
1178 if (!parse_success) 1205 if (!parse_success)
1179 ReportError(CStrVector("Invalid property name in character class")); 1206 ReportError(CStrVector("Invalid property name in character class"));
1180 return parse_success; 1207 return parse_success;
1181 } 1208 }
1182 1209
1183 RegExpTree* RegExpParser::ParseCharacterClass() { 1210 RegExpTree* RegExpParser::ParseCharacterClass() {
1184 static const char* kUnterminated = "Unterminated character class"; 1211 static const char* kUnterminated = "Unterminated character class";
(...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after
1557 return false; 1584 return false;
1558 } 1585 }
1559 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1586 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1560 zone()); 1587 zone());
1561 LAST(ADD_TERM); 1588 LAST(ADD_TERM);
1562 return true; 1589 return true;
1563 } 1590 }
1564 1591
1565 } // namespace internal 1592 } // namespace internal
1566 } // namespace v8 1593 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698