Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(57)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1845243002: [regexp] extend \p syntax to binary and enumerated properties. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 827 matching lines...) Expand 10 before | Expand all | Expand 10 after
838 static_cast<uc16>(trail)); 838 static_cast<uc16>(trail));
839 return true; 839 return true;
840 } 840 }
841 } 841 }
842 Reset(start); 842 Reset(start);
843 } 843 }
844 return result; 844 return result;
845 } 845 }
846 846
847 #ifdef V8_I18N_SUPPORT 847 #ifdef V8_I18N_SUPPORT
848 bool IsExactPropertyValueAlias(const char* property_name, UProperty property, 848 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
849 int32_t property_value) { 849 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
850 const char* short_name =
851 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
852 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; 850 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
853 for (int i = 0;; i++) { 851 for (int i = 0;; i++) {
854 const char* long_name = u_getPropertyValueName( 852 const char* long_name = u_getPropertyName(
855 property, property_value, 853 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
856 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
857 if (long_name == NULL) break; 854 if (long_name == NULL) break;
858 if (strcmp(property_name, long_name) == 0) return true; 855 if (strcmp(property_name, long_name) == 0) return true;
859 } 856 }
860 return false; 857 return false;
861 } 858 }
862 859
863 bool LookupPropertyClass(UProperty property, const char* property_name, 860 bool IsExactPropertyValueAlias(const char* property_value_name,
864 ZoneList<CharacterRange>* result, Zone* zone) { 861 UProperty property, int32_t property_value) {
865 int32_t property_value = u_getPropertyValueEnum(property, property_name); 862 const char* short_name =
863 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
864 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
865 return true;
866 }
867 for (int i = 0;; i++) {
868 const char* long_name = u_getPropertyValueName(
869 property, property_value,
870 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
871 if (long_name == NULL) break;
872 if (strcmp(property_value_name, long_name) == 0) return true;
873 }
874 return false;
875 }
876
877 bool LookupPropertyValueName(UProperty property,
878 const char* property_value_name,
879 ZoneList<CharacterRange>* result, Zone* zone) {
880 int32_t property_value =
881 u_getPropertyValueEnum(property, property_value_name);
866 if (property_value == UCHAR_INVALID_CODE) return false; 882 if (property_value == UCHAR_INVALID_CODE) return false;
867 883
868 // We require the property name to match exactly to one of the property value 884 // We require the property name to match exactly to one of the property value
869 // aliases. However, u_getPropertyValueEnum uses loose matching. 885 // aliases. However, u_getPropertyValueEnum uses loose matching.
870 if (!IsExactPropertyValueAlias(property_name, property, property_value)) { 886 if (!IsExactPropertyValueAlias(property_value_name, property,
887 property_value)) {
871 return false; 888 return false;
872 } 889 }
873 890
874 USet* set = uset_openEmpty(); 891 USet* set = uset_openEmpty();
875 UErrorCode ec = U_ZERO_ERROR; 892 UErrorCode ec = U_ZERO_ERROR;
876 uset_applyIntPropertyValue(set, property, property_value, &ec); 893 uset_applyIntPropertyValue(set, property, property_value, &ec);
877 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set); 894 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
878 895
879 if (success) { 896 if (success) {
880 uset_removeAllStrings(set); 897 uset_removeAllStrings(set);
881 int item_count = uset_getItemCount(set); 898 int item_count = uset_getItemCount(set);
882 int item_result = 0; 899 int item_result = 0;
883 for (int i = 0; i < item_count; i++) { 900 for (int i = 0; i < item_count; i++) {
884 uc32 start = 0; 901 uc32 start = 0;
885 uc32 end = 0; 902 uc32 end = 0;
886 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 903 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
887 result->Add(CharacterRange::Range(start, end), zone); 904 result->Add(CharacterRange::Range(start, end), zone);
888 } 905 }
889 DCHECK_EQ(U_ZERO_ERROR, ec); 906 DCHECK_EQ(U_ZERO_ERROR, ec);
890 DCHECK_EQ(0, item_result); 907 DCHECK_EQ(0, item_result);
891 } 908 }
892 uset_close(set); 909 uset_close(set);
893 return success; 910 return success;
894 } 911 }
895 #endif // V8_I18N_SUPPORT
896 912
897 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) { 913 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
898 #ifdef V8_I18N_SUPPORT 914 // Parse the property class as follows:
899 List<char> property_name_list; 915 // - \pN with a single-character N is equivalent to \p{N}
916 // - In \p{name}, 'name' is interpreted
917 // - either as a general category property value name.
918 // - or as a binary property name.
919 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
920 // and 'value' is interpreted as one of the available property value names.
921 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
922 // - Loose matching is not applied.
923 List<char> first_part;
924 List<char> second_part;
900 if (current() == '{') { 925 if (current() == '{') {
901 for (Advance(); current() != '}'; Advance()) { 926 // Parse \p{[PropertyName=]PropertyNameValue}
927 for (Advance(); current() != '}' && current() != '='; Advance()) {
902 if (!has_next()) return false; 928 if (!has_next()) return false;
903 property_name_list.Add(static_cast<char>(current())); 929 first_part.Add(static_cast<char>(current()));
930 }
931 if (current() == '=') {
932 for (Advance(); current() != '}'; Advance()) {
933 if (!has_next()) return false;
934 second_part.Add(static_cast<char>(current()));
935 }
936 second_part.Add(0); // null-terminate string.
904 } 937 }
905 } else if (current() != kEndMarker) { 938 } else if (current() != kEndMarker) {
906 property_name_list.Add(static_cast<char>(current())); 939 // Parse \pN, where N is a single-character property name value.
940 first_part.Add(static_cast<char>(current()));
907 } else { 941 } else {
908 return false; 942 return false;
909 } 943 }
910 Advance(); 944 Advance();
911 property_name_list.Add(0); // null-terminate string. 945 first_part.Add(0); // null-terminate string.
912 946
913 const char* property_name = property_name_list.ToConstVector().start(); 947 if (second_part.is_empty()) {
948 // First attempt to interpret as general category property value name.
949 const char* name = first_part.ToConstVector().start();
950 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result,
951 zone())) {
952 return true;
953 }
954 // Then attempt to interpret as binary property name with value name 'Y'.
955 UProperty property = u_getPropertyEnum(name);
956 if (property < UCHAR_BINARY_START) return false;
957 if (property >= UCHAR_BINARY_LIMIT) return false;
958 if (!IsExactPropertyAlias(name, property)) return false;
959 return LookupPropertyValueName(property, "Y", result, zone());
960 } else {
961 // Both property name and value name are specified. Attempt to interpret
962 // the property name as enumerated property.
963 const char* property_name = first_part.ToConstVector().start();
964 const char* value_name = second_part.ToConstVector().start();
965 UProperty property = u_getPropertyEnum(property_name);
966 if (property < UCHAR_INT_START) return false;
967 if (property >= UCHAR_INT_LIMIT) return false;
968 if (!IsExactPropertyAlias(property_name, property)) return false;
969 return LookupPropertyValueName(property, value_name, result, zone());
970 }
971 }
914 972
915 #define PROPERTY_NAME_LOOKUP(PROPERTY) \ 973 #else // V8_I18N_SUPPORT
916 do { \
917 if (LookupPropertyClass(PROPERTY, property_name, result, zone())) { \
918 return true; \
919 } \
920 } while (false)
921 974
922 // General_Category (gc) found in PropertyValueAliases.txt 975 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
923 PROPERTY_NAME_LOOKUP(UCHAR_GENERAL_CATEGORY_MASK);
924 // Script (sc) found in Scripts.txt
925 PROPERTY_NAME_LOOKUP(UCHAR_SCRIPT);
926 // To disambiguate from script names, block names have an "In"-prefix.
927 if (property_name_list.length() > 3 && property_name[0] == 'I' &&
928 property_name[1] == 'n') {
929 // Block (blk) found in Blocks.txt
930 property_name += 2;
931 PROPERTY_NAME_LOOKUP(UCHAR_BLOCK);
932 }
933 #undef PROPERTY_NAME_LOOKUP
934 #endif // V8_I18N_SUPPORT
935 return false; 976 return false;
936 } 977 }
937 978
979 #endif // V8_I18N_SUPPORT
980
938 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 981 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
939 uc32 x = 0; 982 uc32 x = 0;
940 int d = HexValue(current()); 983 int d = HexValue(current());
941 if (d < 0) { 984 if (d < 0) {
942 return false; 985 return false;
943 } 986 }
944 while (d >= 0) { 987 while (d >= 0) {
945 x = x * 16 + d; 988 x = x * 16 + d;
946 if (x > max_value) { 989 if (x > max_value) {
947 return false; 990 return false;
(...skipping 574 matching lines...) Expand 10 before | Expand all | Expand 10 after
1522 return false; 1565 return false;
1523 } 1566 }
1524 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1567 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1525 zone()); 1568 zone());
1526 LAST(ADD_TERM); 1569 LAST(ADD_TERM);
1527 return true; 1570 return true;
1528 } 1571 }
1529 1572
1530 } // namespace internal 1573 } // namespace internal
1531 } // namespace v8 1574 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | test/mjsunit/harmony/regexp-property-binary.js » ('j') | test/mjsunit/harmony/regexp-property-exact-match.js » ('J')

Powered by Google App Engine
This is Rietveld 408576698