Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/runtime.cc

Issue 7114: * Changed string search to full-blown Boyer-Moore. (Closed)
Patch Set: Addressed review comments Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | test/mjsunit/string-indexof.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 934 matching lines...) Expand 10 before | Expand all | Expand 10 after
945 uint32_t code; 945 uint32_t code;
946 if (Array::IndexFromObject(args[0], &code)) { 946 if (Array::IndexFromObject(args[0], &code)) {
947 if (code <= 0xffff) { 947 if (code <= 0xffff) {
948 return Heap::LookupSingleCharacterStringFromCode(code); 948 return Heap::LookupSingleCharacterStringFromCode(code);
949 } 949 }
950 } 950 }
951 return Heap::empty_string(); 951 return Heap::empty_string();
952 } 952 }
953 953
954 954
955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
956 // limit, we can fix the size of tables.
957 static const int kBMMaxShift = 0xff;
958 static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.
959
960 // Holds the two buffers used by Boyer-Moore string search's Good Suffix
961 // shift. Only allows the last kBMMaxShift characters of the needle
962 // to be indexed.
963 class BMGoodSuffixBuffers: public AllStatic {
964 public:
965 BMGoodSuffixBuffers() {}
966 inline void init(int needle_length) {
967 ASSERT(needle_length > 1);
968 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
969 int len = needle_length - start;
970 biased_suffixes_ = suffixes_ - start;
971 biased_good_suffix_shift_ = good_suffix_shift_ - start;
972 for (int i = 0; i <= len; i++) {
973 good_suffix_shift_[i] = len;
974 }
975 }
976 inline int& suffix(int index) {
977 ASSERT(biased_suffixes_ + index >= suffixes_);
978 return biased_suffixes_[index];
979 }
980 inline int& shift(int index) {
981 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
982 return biased_good_suffix_shift_[index];
983 }
984 private:
985 int suffixes_[kBMMaxShift + 1];
986 int good_suffix_shift_[kBMMaxShift + 1];
987 int *biased_suffixes_;
988 int *biased_good_suffix_shift_;
989 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
990 };
991
992 // buffers reused by BoyerMoore
993 static int bad_char_occurence[kBMAlphabetSize];
994 static BMGoodSuffixBuffers bmgs_buffers;
995
996 // Restricted Boyer-Moore string matching. Restricts tables to a
997 // suffix of long pattern strings and handles only equivalence classes
998 // of the full alphabet. This allows us to ensure that tables take only
999 // a fixed amount of space.
1000 template <typename schar, typename pchar>
1001 static int BoyerMooreIndexOf(Vector<const schar> subject,
1002 Vector<const pchar> pattern,
1003 int start_index) {
1004 int m = pattern.length();
1005 int n = subject.length();
1006
1007 // Only preprocess at most kBMMaxShift last characters of pattern.
1008 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
1009 int len = m - start;
1010
1011 // Run forwards to populate bad_char_table, so that *last* instance
1012 // of character equivalence class is the one registered.
1013 // Notice: Doesn't include last character.
1014 for (int i = 0; i < kBMAlphabetSize; i++) {
1015 bad_char_occurence[i] = start - 1;
1016 }
1017 for (int i = start; i < m; i++) {
1018 uc32 c = pattern[i];
1019 bad_char_occurence[c % kBMAlphabetSize] = i;
1020 if (sizeof(schar) == 1 &&
1021 sizeof(pchar) > 1 &&
1022 c > String::kMaxAsciiCharCode) {
1023 return -1;
1024 }
1025 }
1026 // End of Bad Char computation.
1027
1028 // Compute Good Suffix shift table.
1029 bmgs_buffers.init(m);
1030
1031 bmgs_buffers.shift(m-1) = 1;
1032 bmgs_buffers.suffix(m) = m + 1;
1033 pchar last_char = pattern[m - 1];
1034 int suffix = m + 1;
1035 for (int i = m; i > start;) {
1036 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
1037 if (bmgs_buffers.shift(suffix) == len) {
1038 bmgs_buffers.shift(suffix) = suffix - i;
1039 }
1040 suffix = bmgs_buffers.suffix(suffix);
1041 }
1042 i--;
1043 suffix--;
1044 bmgs_buffers.suffix(i) = suffix;
1045 if (suffix == m) {
1046 // no suffix to extend, so we check against last_char only.
1047 while (i > start && pattern[i - 1] != last_char) {
1048 if (bmgs_buffers.shift(m) == len) {
1049 bmgs_buffers.shift(m) = m - i;
1050 }
1051 i--;
1052 bmgs_buffers.suffix(i) = m;
1053 }
1054 if (i > start) {
1055 i--;
1056 suffix--;
1057 bmgs_buffers.suffix(i) = suffix;
1058 }
1059 }
1060 }
1061 if (suffix < m) {
1062 for (int i = start; i <= m; i++) {
1063 if (bmgs_buffers.shift(i) == len) {
1064 bmgs_buffers.shift(i) = suffix - start;
1065 }
1066 if (i == suffix) {
1067 suffix = bmgs_buffers.suffix(suffix);
1068 }
1069 }
1070 }
1071 // End of Good Suffix computation.
1072
1073
1074 // Perform search
1075 for (int i = start_index; i <= n - m;) {
1076 int j = m - 1;
1077 schar c;
1078 while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;
1079 if (j < 0) {
1080 return i;
1081 } else if (j < start) {
1082 // we have matched more than our tables allow us to be smart about.
1083 i += 1;
1084 } else {
1085 int gs_shift = bmgs_buffers.shift(j + 1);
1086 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
1087 int bc_shift = j - bc_occ;
1088 i += (gs_shift > bc_shift) ? gs_shift : bc_shift;
1089 }
1090 }
1091 return -1;
1092 }
1093
955 template <typename schar, typename pchar> 1094 template <typename schar, typename pchar>
956 static int SingleCharIndexOf(Vector<const schar> string, 1095 static int SingleCharIndexOf(Vector<const schar> string,
957 pchar pattern_char, 1096 pchar pattern_char,
958 int start_index) { 1097 int start_index) {
959 for (int i = start_index, n = string.length(); i < n; i++) { 1098 for (int i = start_index, n = string.length(); i < n; i++) {
960 if (pattern_char == string[i]) { 1099 if (pattern_char == string[i]) {
961 return i; 1100 return i;
962 } 1101 }
963 } 1102 }
964 return -1; 1103 return -1;
965 } 1104 }
966 1105
967 // Trivial string search for shorter strings. 1106 // Trivial string search for shorter strings.
968 template <typename pchar, typename schar> 1107 template <typename pchar, typename schar>
969 static int SimpleIndexOf(Vector<const schar> subject, 1108 static int SimpleIndexOf(Vector<const schar> subject,
970 Vector<const pchar> pattern, 1109 Vector<const pchar> pattern,
971 int start_index) { 1110 int start_index) {
972 int pattern_length = pattern.length(); 1111 int pattern_length = pattern.length();
973 int subject_length = subject.length(); 1112 int subject_length = subject.length();
974 // We know our pattern is at least 2 characters, we cache the first so 1113 // We know our pattern is at least 2 characters, we cache the first so
975 // the common case of the first character not matching is faster. 1114 // the common case of the first character not matching is faster.
976 pchar pattern_first_char = pattern[0]; 1115 pchar pattern_first_char = pattern[0];
977 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) { 1116 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
978 if (subject[i] != pattern_first_char) continue; 1117 if (subject[i] != pattern_first_char) continue;
979 1118 int j = 1;
980 bool failure = false; 1119 while (pattern[j] == subject[j+i]) {
981 for (int j = 1; j < pattern_length; j++) { 1120 j++;
982 if (pattern[j] != subject[j+i]) { 1121 if (j == pattern_length) {
983 failure = true; 1122 return i;
984 break;
985 } 1123 }
986 } 1124 }
987 if (!failure) {
988 return i;
989 }
990 } 1125 }
991 return -1; 1126 return -1;
992 }
993
994 // Maximal length (+1) of suffix that is indexed. Also the size of the
995 // maximal bad-character skip.
996 static const int kBMHSignificantSuffixLength = 0xff;
997
998 // Significant bits taken from characters to use in bad-character
999 // skips, to reduce size of the table for Unicode letters.
1000 static const int kBMHSignificantBitsMask = 0xff;
1001
1002 // Number of elements in bad-char table.
1003 static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
1004
1005 // Simplified Boyer-Moore string matching. Only uses bad-char skipping,
1006 // and restricts table to a suffix of long strings (also restricting
1007 // the maximum possible skip-length) in order to reduce space.
1008 template <typename schar, typename pchar>
1009 static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
1010 Vector<const pchar> pattern,
1011 int start_index) {
1012 ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.
1013 static byte bad_char_map[kBMHBadCharCount];
1014
1015 int m = pattern.length();
1016 int n = subject.length();
1017 // Cap bad char table to last p chars of pattern. Also max skip value.
1018 int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;
1019
1020 memset(bad_char_map, p, kBMHBadCharCount);
1021
1022 // Run forwards to populate bad_char_table, so that *last* instance
1023 // of character equivalence class is the one registered.
1024 // Notice: Doesn't include last character.
1025 for (int i = p < m ? m - p : 0; i < m - 1; i++) {
1026 uc32 c = pattern[i];
1027 if (sizeof(schar) == 1 &&
1028 sizeof(pchar) > 1 &&
1029 c > String::kMaxAsciiCharCode) {
1030 return -1;
1031 }
1032 bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
1033 }
1034
1035 for (int i = start_index + m - 1, j = m - 1; i < n;) {
1036 schar c = subject[i];
1037 if (c == pattern[j]) {
1038 if (j == 0) {
1039 return i;
1040 }
1041 j--;
1042 i--;
1043 } else {
1044 int skip = bad_char_map[c & kBMHSignificantBitsMask];
1045 if (skip < (m - j)) {
1046 skip = m - j;
1047 }
1048 i += skip;
1049 j = m - 1;
1050 }
1051 }
1052 return -1;
1053 }
1054
1055
1056 // Full KMP pattern match.
1057 template <typename schar, typename pchar> // Pattern & subject char types
1058 static int KMPIndexOf(Vector<const schar> subject,
1059 Vector<const pchar> pattern,
1060 int start_index) {
1061 int subject_length = subject.length();
1062 int pattern_length = pattern.length();
1063 SmartPointer<int> next_table(NewArray<int>(pattern_length));
1064
1065 // Compute KMP "next" table
1066 int i = 0;
1067 int j = -1;
1068 next_table[0] = -1;
1069
1070 pchar p = pattern[0];
1071 while (i < pattern_length - 1) {
1072 while (j > -1 && p != pattern[j]) {
1073 j = next_table[j];
1074 }
1075 i++;
1076 j++;
1077 p = pattern[i];
1078 if (p == pattern[j]) {
1079 next_table[i] = next_table[j];
1080 } else {
1081 next_table[i] = j;
1082 }
1083 }
1084
1085 // Search using the 'next' table.
1086 int pattern_index = 0;
1087 int subject_index = start_index;
1088 while (subject_index < subject_length) {
1089 schar subject_char = subject[subject_index];
1090 while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
1091 pattern_index = next_table[pattern_index];
1092 }
1093 pattern_index++;
1094 subject_index++;
1095 if (pattern_index >= pattern_length) {
1096 return subject_index - pattern_index;
1097 }
1098 }
1099 return -1;
1100 } 1127 }
1101 1128
1102 // Dispatch to different algorithms for different length of pattern/subject 1129 // Dispatch to different algorithms for different length of pattern/subject
1103 template <typename schar, typename pchar> 1130 template <typename schar, typename pchar>
1104 static int StringMatchStrategy(Vector<const schar> sub, 1131 static int StringMatchStrategy(Vector<const schar> sub,
1105 Vector<const pchar> pat, 1132 Vector<const pchar> pat,
1106 int start_index) { 1133 int start_index) {
1107 int pattern_length = pat.length(); 1134 int pattern_length = pat.length();
1108 // Searching for one specific character is common. For one 1135 ASSERT(pattern_length > 1);
1109 // character patterns the KMP algorithm is guaranteed to slow down
1110 // the search, so we just run through the subject string.
1111 if (pattern_length == 1) {
1112 return SingleCharIndexOf(sub, pat[0], start_index);
1113 }
1114 1136
1115 // For small searches, a complex sort is not worth the setup overhead. 1137 // For small searches, a complex sort is not worth the setup overhead.
1116 if (sub.length() - start_index < 25) { 1138 int subject_length = sub.length() - start_index;
1139 if (subject_length < 100 || pattern_length < 4) {
1117 return SimpleIndexOf(sub, pat, start_index); 1140 return SimpleIndexOf(sub, pat, start_index);
1118 } 1141 }
1119 1142
1120 return BoyerMooreHorspoolIndexOf(sub, pat, start_index); 1143 return BoyerMooreIndexOf(sub, pat, start_index);
1121 } 1144 }
1122 1145
1123 // Perform string match of pattern on subject, starting at start index. 1146 // Perform string match of pattern on subject, starting at start index.
1124 // Caller must ensure that 0 <= start_index <= sub->length(), 1147 // Caller must ensure that 0 <= start_index <= sub->length(),
1125 // and should check that pat->length() + start_index <= sub->length() 1148 // and should check that pat->length() + start_index <= sub->length()
1126 int Runtime::StringMatch(Handle<String> sub, 1149 int Runtime::StringMatch(Handle<String> sub,
1127 Handle<String> pat, 1150 Handle<String> pat,
1128 int start_index) { 1151 int start_index) {
1129 ASSERT(0 <= start_index); 1152 ASSERT(0 <= start_index);
1130 ASSERT(start_index <= sub->length()); 1153 ASSERT(start_index <= sub->length());
1131 1154
1132 int pattern_length = pat->length(); 1155 int pattern_length = pat->length();
1133 if (pattern_length == 0) return start_index; 1156 if (pattern_length == 0) return start_index;
1134 1157
1135 int subject_length = sub->length(); 1158 int subject_length = sub->length();
1136 if (start_index + pattern_length > subject_length) return -1; 1159 if (start_index + pattern_length > subject_length) return -1;
1137 1160
1138 FlattenString(sub); 1161 FlattenString(sub);
1162 // Searching for one specific character is common. For one
1163 // character patterns linear search is necessary, so any smart
1164 // algorithm is unnecessary overhead.
1165 if (pattern_length == 1) {
1166 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
1167 if (sub->is_ascii_representation()) {
1168 return SingleCharIndexOf(sub->ToAsciiVector(), pat->Get(0), start_index);
1169 }
1170 return SingleCharIndexOf(sub->ToUC16Vector(), pat->Get(0), start_index);
1171 }
1172
1139 FlattenString(pat); 1173 FlattenString(pat);
1140 1174
1141 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid 1175 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
1142 // dispatch on type of strings 1176 // dispatch on type of strings
1143 if (pat->is_ascii_representation()) { 1177 if (pat->is_ascii_representation()) {
1144 Vector<const char> pat_vector = pat->ToAsciiVector(); 1178 Vector<const char> pat_vector = pat->ToAsciiVector();
1145 if (sub->is_ascii_representation()) { 1179 if (sub->is_ascii_representation()) {
1146 return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index); 1180 return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);
1147 } 1181 }
1148 return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index); 1182 return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);
(...skipping 4073 matching lines...) Expand 10 before | Expand all | Expand 10 after
5222 5256
5223 void Runtime::PerformGC(Object* result) { 5257 void Runtime::PerformGC(Object* result) {
5224 Failure* failure = Failure::cast(result); 5258 Failure* failure = Failure::cast(result);
5225 // Try to do a garbage collection; ignore it if it fails. The C 5259 // Try to do a garbage collection; ignore it if it fails. The C
5226 // entry stub will throw an out-of-memory exception in that case. 5260 // entry stub will throw an out-of-memory exception in that case.
5227 Heap::CollectGarbage(failure->requested(), failure->allocation_space()); 5261 Heap::CollectGarbage(failure->requested(), failure->allocation_space());
5228 } 5262 }
5229 5263
5230 5264
5231 } } // namespace v8::internal 5265 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | test/mjsunit/string-indexof.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698