Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(126)

Side by Side Diff: src/runtime.cc

Issue 7349: Optimized special case of not matching the last char of the needle in BM string match. (Closed)
Patch Set: Now really including all changes. Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 937 matching lines...) Expand 10 before | Expand all | Expand 10 after
948 return Heap::LookupSingleCharacterStringFromCode(code); 948 return Heap::LookupSingleCharacterStringFromCode(code);
949 } 949 }
950 } 950 }
951 return Heap::empty_string(); 951 return Heap::empty_string();
952 } 952 }
953 953
954 954
955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
956 // limit, we can fix the size of tables. 956 // limit, we can fix the size of tables.
957 static const int kBMMaxShift = 0xff; 957 static const int kBMMaxShift = 0xff;
958 static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size. 958 // Reduce alphabet to this size.
959 static const int kBMAlphabetSize = 0x100;
960 // For patterns below this length, the skip length of Boyer-Moore is too short
961 // to compensate for the algorithmic overhead compared to simple brute force.
962 static const int kBMMinPatternLength = 5;
959 963
960 // Holds the two buffers used by Boyer-Moore string search's Good Suffix 964 // Holds the two buffers used by Boyer-Moore string search's Good Suffix
961 // shift. Only allows the last kBMMaxShift characters of the needle 965 // shift. Only allows the last kBMMaxShift characters of the needle
962 // to be indexed. 966 // to be indexed.
963 class BMGoodSuffixBuffers: public AllStatic { 967 class BMGoodSuffixBuffers: public AllStatic {
964 public: 968 public:
965 BMGoodSuffixBuffers() {} 969 BMGoodSuffixBuffers() {}
966 inline void init(int needle_length) { 970 inline void init(int needle_length) {
967 ASSERT(needle_length > 1); 971 ASSERT(needle_length > 1);
968 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; 972 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
(...skipping 18 matching lines...) Expand all
987 int *biased_suffixes_; 991 int *biased_suffixes_;
988 int *biased_good_suffix_shift_; 992 int *biased_good_suffix_shift_;
989 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); 993 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
990 }; 994 };
991 995
992 // buffers reused by BoyerMoore 996 // buffers reused by BoyerMoore
993 static int bad_char_occurence[kBMAlphabetSize]; 997 static int bad_char_occurence[kBMAlphabetSize];
994 static BMGoodSuffixBuffers bmgs_buffers; 998 static BMGoodSuffixBuffers bmgs_buffers;
995 999
996 // Compute the bad-char table for Boyer-Moore in the static buffer. 1000 // Compute the bad-char table for Boyer-Moore in the static buffer.
997 // Return false if the pattern contains non-ASCII characters that cannot be
998 // in the searched string.
999 template <typename pchar> 1001 template <typename pchar>
1000 static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, 1002 static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
1001 int start) { 1003 int start) {
1002 // Run forwards to populate bad_char_table, so that *last* instance 1004 // Run forwards to populate bad_char_table, so that *last* instance
1003 // of character equivalence class is the one registered. 1005 // of character equivalence class is the one registered.
1004 // Notice: Doesn't include the last character. 1006 // Notice: Doesn't include the last character.
1005 for (int i = 0; i < kBMAlphabetSize; i++) { 1007 for (int i = 0; i < kBMAlphabetSize; i++) {
1006 bad_char_occurence[i] = start - 1; 1008 bad_char_occurence[i] = start - 1;
1007 } 1009 }
1008 for (int i = start; i < pattern.length(); i++) { 1010 for (int i = start; i < pattern.length() - 1; i++) {
1009 bad_char_occurence[pattern[i] % kBMAlphabetSize] = i; 1011 pchar c = pattern[i];
1012 int bucket = c % kBMAlphabetSize;
1013 bad_char_occurence[bucket] = i;
1010 } 1014 }
1011 } 1015 }
1012 1016
1013 template <typename pchar> 1017 template <typename pchar>
1014 static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, 1018 static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
1015 int start, 1019 int start) {
1016 int len) {
1017 int m = pattern.length(); 1020 int m = pattern.length();
1021 int len = m - start;
1018 // Compute Good Suffix tables. 1022 // Compute Good Suffix tables.
1019 bmgs_buffers.init(m); 1023 bmgs_buffers.init(m);
1020 1024
1021 bmgs_buffers.shift(m-1) = 1; 1025 bmgs_buffers.shift(m-1) = 1;
1022 bmgs_buffers.suffix(m) = m + 1; 1026 bmgs_buffers.suffix(m) = m + 1;
1023 pchar last_char = pattern[m - 1]; 1027 pchar last_char = pattern[m - 1];
1024 int suffix = m + 1; 1028 int suffix = m + 1;
1025 for (int i = m; i > start;) { 1029 for (int i = m; i > start;) {
1026 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) { 1030 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
1027 if (bmgs_buffers.shift(suffix) == len) { 1031 if (bmgs_buffers.shift(suffix) == len) {
(...skipping 25 matching lines...) Expand all
1053 if (bmgs_buffers.shift(i) == len) { 1057 if (bmgs_buffers.shift(i) == len) {
1054 bmgs_buffers.shift(i) = suffix - start; 1058 bmgs_buffers.shift(i) = suffix - start;
1055 } 1059 }
1056 if (i == suffix) { 1060 if (i == suffix) {
1057 suffix = bmgs_buffers.suffix(suffix); 1061 suffix = bmgs_buffers.suffix(suffix);
1058 } 1062 }
1059 } 1063 }
1060 } 1064 }
1061 } 1065 }
1062 1066
1063 // Restricted Boyer-Moore string matching. Restricts tables to a 1067
1068 // Restricted simplified Boyer-Moore string matching. Restricts tables to a
1064 // suffix of long pattern strings and handles only equivalence classes 1069 // suffix of long pattern strings and handles only equivalence classes
1065 // of the full alphabet. This allows us to ensure that tables take only 1070 // of the full alphabet. This allows us to ensure that tables take only
1066 // a fixed amount of space. 1071 // a fixed amount of space.
1067 template <typename schar, typename pchar> 1072 template <typename schar, typename pchar>
1068 static int BoyerMooreIndexOf(Vector<const schar> subject, 1073 static int BoyerMooreSimplified(Vector<const schar> subject,
1069 Vector<const pchar> pattern, 1074 Vector<const pchar> pattern,
1070 int start_index) { 1075 int start_index,
1076 bool& complete) {
1077 int n = subject.length();
1071 int m = pattern.length(); 1078 int m = pattern.length();
1072 int n = subject.length();
1073
1074 // Only preprocess at most kBMMaxShift last characters of pattern. 1079 // Only preprocess at most kBMMaxShift last characters of pattern.
1075 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; 1080 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
1076 int len = m - start;
1077 1081
1078 BoyerMoorePopulateBadCharTable(pattern, start); 1082 BoyerMoorePopulateBadCharTable(pattern, start);
1079 1083
1080 int badness = 0; // How bad we are doing without a good-suffix table. 1084 int badness = -m; // How bad we are doing without a good-suffix table.
1081 int idx; // No matches found prior to this index. 1085 int idx; // No matches found prior to this index.
1086 pchar last_char = pattern[m - 1];
1082 // Perform search 1087 // Perform search
1083 for (idx = start_index; idx <= n - m;) { 1088 for (idx = start_index; idx <= n - m;) {
1084 int j = m - 1; 1089 int j = m - 1;
1085 schar c; 1090 int c;
1091 while (last_char != (c = subject[idx + j])) {
1092 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
1093 int shift = j - bc_occ;
1094 idx += shift;
1095 badness += 1 - shift; // at most zero, so badness cannot increase.
1096 if (idx > n - m) {
1097 complete = true;
1098 return -1;
1099 }
1100 }
1101 j--;
1086 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; 1102 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
1087 if (j < 0) { 1103 if (j < 0) {
1104 complete = true;
1088 return idx; 1105 return idx;
1089 } else { 1106 } else {
1090 int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; 1107 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
1091 int shift = bc_occ < j ? j - bc_occ : 1; 1108 int shift = bc_occ < j ? j - bc_occ : 1;
1092 idx += shift; 1109 idx += shift;
1093 // Badness increases by the number of characters we have 1110 // Badness increases by the number of characters we have
1094 // checked, and decreases by the number of characters we 1111 // checked, and decreases by the number of characters we
1095 // can skip by shifting. It's a measure of how we are doing 1112 // can skip by shifting. It's a measure of how we are doing
1096 // compared to reading each character exactly once. 1113 // compared to reading each character exactly once.
1097 badness += (m - j) - shift; 1114 badness += (m - j) - shift;
1098 if (badness > m) break; 1115 if (badness > 0) {
1116 complete = false;
1117 return idx;
1118 }
1099 } 1119 }
1100 } 1120 }
1121 complete = true;
1122 return -1;
1123 }
1101 1124
1102 // If we are not done, we got here because we should build the Good Suffix 1125
1103 // table and continue searching. 1126 template <typename schar, typename pchar>
1104 if (idx <= n - m) { 1127 static int BoyerMooreIndexOf(Vector<const schar> subject,
1105 BoyerMoorePopulateGoodSuffixTable(pattern, start, len); 1128 Vector<const pchar> pattern,
1106 // Continue search from i. 1129 int idx) {
1107 do { 1130 int n = subject.length();
1108 int j = m - 1; 1131 int m = pattern.length();
1109 schar c; 1132 // Only preprocess at most kBMMaxShift last characters of pattern.
1110 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; 1133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
1111 if (j < 0) { 1134
1112 return idx; 1135 // Build the Good Suffix table and continue searching.
1113 } else if (j < start) { 1136 BoyerMoorePopulateGoodSuffixTable(pattern, start);
1114 // we have matched more than our tables allow us to be smart about. 1137 pchar last_char = pattern[m - 1];
1115 idx += 1; 1138 // Continue search from i.
1116 } else { 1139 do {
1117 int gs_shift = bmgs_buffers.shift(j + 1); 1140 int j = m - 1;
1118 int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; 1141 schar c;
1119 int bc_shift = j - bc_occ; 1142 while (last_char != (c = subject[idx + j])) {
1120 idx += (gs_shift > bc_shift) ? gs_shift : bc_shift; 1143 int shift = j - bad_char_occurence[c % kBMAlphabetSize];
1144 idx += shift;
1145 if (idx > n - m) {
1146 return -1;
1121 } 1147 }
1122 } while (idx <= n - m); 1148 }
1123 } 1149 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
1150 if (j < 0) {
1151 return idx;
1152 } else if (j < start) {
1153 // we have matched more than our tables allow us to be smart about.
1154 idx += 1;
1155 } else {
1156 int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
1157 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
1158 int shift = j - bc_occ; // Bad-char shift.
1159 shift = (gs_shift > shift) ? gs_shift : shift;
1160 idx += shift;
1161 }
1162 } while (idx <= n - m);
1124 1163
1125 return -1; 1164 return -1;
1126 } 1165 }
1127 1166
1128 template <typename schar, typename pchar> 1167
1168 template <typename schar>
1129 static int SingleCharIndexOf(Vector<const schar> string, 1169 static int SingleCharIndexOf(Vector<const schar> string,
1130 pchar pattern_char, 1170 uc16 pattern_char,
1131 int start_index) { 1171 int start_index) {
1172 if (sizeof(schar) == 1 && pattern_char > String::kMaxAsciiCharCode) {
1173 return -1;
1174 }
1132 for (int i = start_index, n = string.length(); i < n; i++) { 1175 for (int i = start_index, n = string.length(); i < n; i++) {
1133 if (pattern_char == string[i]) { 1176 if (pattern_char == string[i]) {
1134 return i; 1177 return i;
1135 } 1178 }
1136 } 1179 }
1137 return -1; 1180 return -1;
1138 } 1181 }
1139 1182
1140 // Trivial string search for shorter strings. 1183 // Trivial string search for shorter strings.
1141 // On return, if "complete" is set to true, the return value is the 1184 // On return, if "complete" is set to true, the return value is the
1142 // final result of searching for the patter in the subject. 1185 // final result of searching for the patter in the subject.
1143 // If "complete" is set to false, the return value is the index where 1186 // If "complete" is set to false, the return value is the index where
1144 // further checking should start, i.e., it's guaranteed that the pattern 1187 // further checking should start, i.e., it's guaranteed that the pattern
1145 // does not occur at a position prior to the returned index. 1188 // does not occur at a position prior to the returned index.
1146 template <typename pchar, typename schar> 1189 template <typename pchar, typename schar>
1147 static int SimpleIndexOf(Vector<const schar> subject, 1190 static int SimpleIndexOf(Vector<const schar> subject,
1148 Vector<const pchar> pattern, 1191 Vector<const pchar> pattern,
1149 int start_index, 1192 int idx,
1150 bool &complete) { 1193 bool &complete) {
1151 int pattern_length = pattern.length(); 1194 // Badness is a count of how much work we have done. When we have
1152 int subject_length = subject.length(); 1195 // done enough work we decide it's probably worth switching to a better
1153 // Badness is a count of how many extra times the same character 1196 // algorithm.
1154 // is checked. We compare it to the index counter, so we start 1197 int badness = -10 - (pattern.length() << 2);
1155 // it at the start_index, and give it a little discount to avoid
1156 // very early bail-outs.
1157 int badness = start_index - pattern_length;
1158 // We know our pattern is at least 2 characters, we cache the first so 1198 // We know our pattern is at least 2 characters, we cache the first so
1159 // the common case of the first character not matching is faster. 1199 // the common case of the first character not matching is faster.
1160 pchar pattern_first_char = pattern[0]; 1200 pchar pattern_first_char = pattern[0];
1161 1201
1162 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) { 1202 for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {
1203 badness++;
1204 if (badness > 0) {
1205 complete = false;
1206 return (i);
1207 }
1163 if (subject[i] != pattern_first_char) continue; 1208 if (subject[i] != pattern_first_char) continue;
1164 int j = 1; 1209 int j = 1;
1165 do { 1210 do {
1166 if (pattern[j] != subject[i+j]) { 1211 if (pattern[j] != subject[i+j]) {
1167 break; 1212 break;
1168 } 1213 }
1169 j++; 1214 j++;
1170 } while (j < pattern_length); 1215 } while (j < pattern.length());
1171 if (j == pattern_length) { 1216 if (j == pattern.length()) {
1172 complete = true; 1217 complete = true;
1173 return i; 1218 return i;
1174 } 1219 }
1175 badness += j; 1220 badness += j;
1176 if (badness > i) { // More than one extra character on average.
1177 complete = false;
1178 return (i + 1); // No matches up to index i+1.
1179 }
1180 } 1221 }
1181 complete = true; 1222 complete = true;
1182 return -1; 1223 return -1;
1183 } 1224 }
1184 1225
1185 // Dispatch to different algorithms for different length of pattern/subject 1226 // Simple indexOf that never bails out. For short patterns only.
1227 template <typename pchar, typename schar>
1228 static int SimpleIndexOf(Vector<const schar> subject,
1229 Vector<const pchar> pattern,
1230 int idx) {
1231 pchar pattern_first_char = pattern[0];
1232 for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {
1233 if (subject[i] != pattern_first_char) continue;
1234 int j = 1;
1235 do {
1236 if (pattern[j] != subject[i+j]) {
1237 break;
1238 }
1239 j++;
1240 } while (j < pattern.length());
1241 if (j == pattern.length()) {
1242 return i;
1243 }
1244 }
1245 return -1;
1246 }
1247
1248
1249 // Dispatch to different algorithms.
1186 template <typename schar, typename pchar> 1250 template <typename schar, typename pchar>
1187 static int StringMatchStrategy(Vector<const schar> sub, 1251 static int StringMatchStrategy(Vector<const schar> sub,
1188 Vector<const pchar> pat, 1252 Vector<const pchar> pat,
1189 int start_index) { 1253 int start_index) {
1190 ASSERT(pat.length() > 1); 1254 ASSERT(pat.length() > 1);
1191 1255
1192 // We have an ASCII haystack and a non-ASCII needle. Check if there 1256 // We have an ASCII haystack and a non-ASCII needle. Check if there
1193 // really is a non-ASCII character in the needle and bail out if there 1257 // really is a non-ASCII character in the needle and bail out if there
1194 // is. 1258 // is.
1195 if (sizeof(pchar) > 1 && sizeof(schar) == 1) { 1259 if (sizeof(pchar) > 1 && sizeof(schar) == 1) {
1196 for (int i = 0; i < pat.length(); i++) { 1260 for (int i = 0; i < pat.length(); i++) {
1197 uc16 c = pat[i]; 1261 uc16 c = pat[i];
1198 if (c > String::kMaxAsciiCharCode) { 1262 if (c > String::kMaxAsciiCharCode) {
1199 return -1; 1263 return -1;
1200 } 1264 }
1201 } 1265 }
1202 } 1266 }
1203 // For small searches, a complex sort is not worth the setup overhead. 1267 if (pat.length() < kBMMinPatternLength) {
1268 // We don't believe fancy searching can ever be more efficient.
1269 // The max shift of Boyer-Moore on a pattern of this length does
1270 // not compensate for the overhead.
1271 return SimpleIndexOf(sub, pat, start_index);
1272 }
1273 // Try algorithms in order of increasing setup cost and expected performance.
1204 bool complete; 1274 bool complete;
1205 int idx = SimpleIndexOf(sub, pat, start_index, complete); 1275 int idx = SimpleIndexOf(sub, pat, start_index, complete);
1206 if (complete) return idx; 1276 if (complete) return idx;
1277 idx = BoyerMooreSimplified(sub, pat, idx, complete);
1278 if (complete) return idx;
1207 return BoyerMooreIndexOf(sub, pat, idx); 1279 return BoyerMooreIndexOf(sub, pat, idx);
1208 } 1280 }
1209 1281
1210 // Perform string match of pattern on subject, starting at start index. 1282 // Perform string match of pattern on subject, starting at start index.
1211 // Caller must ensure that 0 <= start_index <= sub->length(), 1283 // Caller must ensure that 0 <= start_index <= sub->length(),
1212 // and should check that pat->length() + start_index <= sub->length() 1284 // and should check that pat->length() + start_index <= sub->length()
1213 int Runtime::StringMatch(Handle<String> sub, 1285 int Runtime::StringMatch(Handle<String> sub,
1214 Handle<String> pat, 1286 Handle<String> pat,
1215 int start_index) { 1287 int start_index) {
1216 ASSERT(0 <= start_index); 1288 ASSERT(0 <= start_index);
(...skipping 4113 matching lines...) Expand 10 before | Expand all | Expand 10 after
5330 5402
5331 void Runtime::PerformGC(Object* result) { 5403 void Runtime::PerformGC(Object* result) {
5332 Failure* failure = Failure::cast(result); 5404 Failure* failure = Failure::cast(result);
5333 // Try to do a garbage collection; ignore it if it fails. The C 5405 // Try to do a garbage collection; ignore it if it fails. The C
5334 // entry stub will throw an out-of-memory exception in that case. 5406 // entry stub will throw an out-of-memory exception in that case.
5335 Heap::CollectGarbage(failure->requested(), failure->allocation_space()); 5407 Heap::CollectGarbage(failure->requested(), failure->allocation_space());
5336 } 5408 }
5337 5409
5338 5410
5339 } } // namespace v8::internal 5411 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698