src/runtime.cc - Issue 7349: Optimized special case of not matching the last char of the needle in BM string match.

Side by Side Diff: src/runtime.cc

Issue 7349: Optimized special case of not matching the last char of the needle in BM string match. (Closed)

Patch Set: Now really including all changes. Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 937 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
948 return Heap::LookupSingleCharacterStringFromCode(code);	948 return Heap::LookupSingleCharacterStringFromCode(code);

949 }	949 }

950 }	950 }

951 return Heap::empty_string();	951 return Heap::empty_string();

952 }	952 }

953	953

954	954

955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a	955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a

956 // limit, we can fix the size of tables.	956 // limit, we can fix the size of tables.

957 static const int kBMMaxShift = 0xff;	957 static const int kBMMaxShift = 0xff;

958 static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.	958 // Reduce alphabet to this size.

	959 static const int kBMAlphabetSize = 0x100;

	960 // For patterns below this length, the skip length of Boyer-Moore is too short

	961 // to compensate for the algorithmic overhead compared to simple brute force.

	962 static const int kBMMinPatternLength = 5;

959	963

960 // Holds the two buffers used by Boyer-Moore string search's Good Suffix	964 // Holds the two buffers used by Boyer-Moore string search's Good Suffix

961 // shift. Only allows the last kBMMaxShift characters of the needle	965 // shift. Only allows the last kBMMaxShift characters of the needle

962 // to be indexed.	966 // to be indexed.

963 class BMGoodSuffixBuffers: public AllStatic {	967 class BMGoodSuffixBuffers: public AllStatic {

964 public:	968 public:

965 BMGoodSuffixBuffers() {}	969 BMGoodSuffixBuffers() {}

966 inline void init(int needle_length) {	970 inline void init(int needle_length) {

967 ASSERT(needle_length > 1);	971 ASSERT(needle_length > 1);

968 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;	972 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;

(...skipping 18 matching lines...) Expand all Loading...
987 int *biased_suffixes_;	991 int *biased_suffixes_;

988 int *biased_good_suffix_shift_;	992 int *biased_good_suffix_shift_;

989 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);	993 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);

990 };	994 };

991	995

992 // buffers reused by BoyerMoore	996 // buffers reused by BoyerMoore

993 static int bad_char_occurence[kBMAlphabetSize];	997 static int bad_char_occurence[kBMAlphabetSize];

994 static BMGoodSuffixBuffers bmgs_buffers;	998 static BMGoodSuffixBuffers bmgs_buffers;

995	999

996 // Compute the bad-char table for Boyer-Moore in the static buffer.	1000 // Compute the bad-char table for Boyer-Moore in the static buffer.

997 // Return false if the pattern contains non-ASCII characters that cannot be

998 // in the searched string.

999 template <typename pchar>	1001 template <typename pchar>

1000 static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,	1002 static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,

1001 int start) {	1003 int start) {

1002 // Run forwards to populate bad_char_table, so that last instance	1004 // Run forwards to populate bad_char_table, so that last instance

1003 // of character equivalence class is the one registered.	1005 // of character equivalence class is the one registered.

1004 // Notice: Doesn't include the last character.	1006 // Notice: Doesn't include the last character.

1005 for (int i = 0; i < kBMAlphabetSize; i++) {	1007 for (int i = 0; i < kBMAlphabetSize; i++) {

1006 bad_char_occurence[i] = start - 1;	1008 bad_char_occurence[i] = start - 1;

1007 }	1009 }

1008 for (int i = start; i < pattern.length(); i++) {	1010 for (int i = start; i < pattern.length() - 1; i++) {

1009 bad_char_occurence[pattern[i] % kBMAlphabetSize] = i;	1011 pchar c = pattern[i];

	1012 int bucket = c % kBMAlphabetSize;

	1013 bad_char_occurence[bucket] = i;

1010 }	1014 }

1011 }	1015 }

1012	1016

1013 template <typename pchar>	1017 template <typename pchar>

1014 static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,	1018 static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,

1015 int start,	1019 int start) {

1016 int len) {

1017 int m = pattern.length();	1020 int m = pattern.length();

	1021 int len = m - start;

1018 // Compute Good Suffix tables.	1022 // Compute Good Suffix tables.

1019 bmgs_buffers.init(m);	1023 bmgs_buffers.init(m);

1020	1024

1021 bmgs_buffers.shift(m-1) = 1;	1025 bmgs_buffers.shift(m-1) = 1;

1022 bmgs_buffers.suffix(m) = m + 1;	1026 bmgs_buffers.suffix(m) = m + 1;

1023 pchar last_char = pattern[m - 1];	1027 pchar last_char = pattern[m - 1];

1024 int suffix = m + 1;	1028 int suffix = m + 1;

1025 for (int i = m; i > start;) {	1029 for (int i = m; i > start;) {

1026 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {	1030 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {

1027 if (bmgs_buffers.shift(suffix) == len) {	1031 if (bmgs_buffers.shift(suffix) == len) {

(...skipping 25 matching lines...) Expand all Loading...
1053 if (bmgs_buffers.shift(i) == len) {	1057 if (bmgs_buffers.shift(i) == len) {

1054 bmgs_buffers.shift(i) = suffix - start;	1058 bmgs_buffers.shift(i) = suffix - start;

1055 }	1059 }

1056 if (i == suffix) {	1060 if (i == suffix) {

1057 suffix = bmgs_buffers.suffix(suffix);	1061 suffix = bmgs_buffers.suffix(suffix);

1058 }	1062 }

1059 }	1063 }

1060 }	1064 }

1061 }	1065 }

1062	1066

1063 // Restricted Boyer-Moore string matching. Restricts tables to a	1067

	1068 // Restricted simplified Boyer-Moore string matching. Restricts tables to a

1064 // suffix of long pattern strings and handles only equivalence classes	1069 // suffix of long pattern strings and handles only equivalence classes

1065 // of the full alphabet. This allows us to ensure that tables take only	1070 // of the full alphabet. This allows us to ensure that tables take only

1066 // a fixed amount of space.	1071 // a fixed amount of space.

1067 template <typename schar, typename pchar>	1072 template <typename schar, typename pchar>

1068 static int BoyerMooreIndexOf(Vector<const schar> subject,	1073 static int BoyerMooreSimplified(Vector<const schar> subject,

1069 Vector<const pchar> pattern,	1074 Vector<const pchar> pattern,

1070 int start_index) {	1075 int start_index,

	1076 bool& complete) {

	1077 int n = subject.length();

1071 int m = pattern.length();	1078 int m = pattern.length();

1072 int n = subject.length();

1073

1074 // Only preprocess at most kBMMaxShift last characters of pattern.	1079 // Only preprocess at most kBMMaxShift last characters of pattern.

1075 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;	1080 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

1076 int len = m - start;

1077	1081

1078 BoyerMoorePopulateBadCharTable(pattern, start);	1082 BoyerMoorePopulateBadCharTable(pattern, start);

1079	1083

1080 int badness = 0; // How bad we are doing without a good-suffix table.	1084 int badness = -m; // How bad we are doing without a good-suffix table.

1081 int idx; // No matches found prior to this index.	1085 int idx; // No matches found prior to this index.

	1086 pchar last_char = pattern[m - 1];

1082 // Perform search	1087 // Perform search

1083 for (idx = start_index; idx <= n - m;) {	1088 for (idx = start_index; idx <= n - m;) {

1084 int j = m - 1;	1089 int j = m - 1;

1085 schar c;	1090 int c;

	1091 while (last_char != (c = subject[idx + j])) {

	1092 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

	1093 int shift = j - bc_occ;

	1094 idx += shift;

	1095 badness += 1 - shift; // at most zero, so badness cannot increase.

	1096 if (idx > n - m) {

	1097 complete = true;

	1098 return -1;

	1099 }

	1100 }

	1101 j--;

1086 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;	1102 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;

1087 if (j < 0) {	1103 if (j < 0) {

	1104 complete = true;

1088 return idx;	1105 return idx;

1089 } else {	1106 } else {

1090 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];	1107 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

1091 int shift = bc_occ < j ? j - bc_occ : 1;	1108 int shift = bc_occ < j ? j - bc_occ : 1;

1092 idx += shift;	1109 idx += shift;

1093 // Badness increases by the number of characters we have	1110 // Badness increases by the number of characters we have

1094 // checked, and decreases by the number of characters we	1111 // checked, and decreases by the number of characters we

1095 // can skip by shifting. It's a measure of how we are doing	1112 // can skip by shifting. It's a measure of how we are doing

1096 // compared to reading each character exactly once.	1113 // compared to reading each character exactly once.

1097 badness += (m - j) - shift;	1114 badness += (m - j) - shift;

1098 if (badness > m) break;	1115 if (badness > 0) {

	1116 complete = false;

	1117 return idx;

	1118 }

1099 }	1119 }

1100 }	1120 }

	1121 complete = true;

	1122 return -1;

	1123 }

1101	1124

1102 // If we are not done, we got here because we should build the Good Suffix	1125

1103 // table and continue searching.	1126 template <typename schar, typename pchar>

1104 if (idx <= n - m) {	1127 static int BoyerMooreIndexOf(Vector<const schar> subject,

1105 BoyerMoorePopulateGoodSuffixTable(pattern, start, len);	1128 Vector<const pchar> pattern,

1106 // Continue search from i.	1129 int idx) {

1107 do {	1130 int n = subject.length();

1108 int j = m - 1;	1131 int m = pattern.length();

1109 schar c;	1132 // Only preprocess at most kBMMaxShift last characters of pattern.

1110 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;	1133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

1111 if (j < 0) {	1134

1112 return idx;	1135 // Build the Good Suffix table and continue searching.

1113 } else if (j < start) {	1136 BoyerMoorePopulateGoodSuffixTable(pattern, start);

1114 // we have matched more than our tables allow us to be smart about.	1137 pchar last_char = pattern[m - 1];

1115 idx += 1;	1138 // Continue search from i.

1116 } else {	1139 do {

1117 int gs_shift = bmgs_buffers.shift(j + 1);	1140 int j = m - 1;

1118 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];	1141 schar c;

1119 int bc_shift = j - bc_occ;	1142 while (last_char != (c = subject[idx + j])) {

1120 idx += (gs_shift > bc_shift) ? gs_shift : bc_shift;	1143 int shift = j - bad_char_occurence[c % kBMAlphabetSize];

	1144 idx += shift;

	1145 if (idx > n - m) {

	1146 return -1;

1121 }	1147 }

1122 } while (idx <= n - m);	1148 }

1123 }	1149 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;

	1150 if (j < 0) {

	1151 return idx;

	1152 } else if (j < start) {

	1153 // we have matched more than our tables allow us to be smart about.

	1154 idx += 1;

	1155 } else {

	1156 int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.

	1157 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

	1158 int shift = j - bc_occ; // Bad-char shift.

	1159 shift = (gs_shift > shift) ? gs_shift : shift;

	1160 idx += shift;

	1161 }

	1162 } while (idx <= n - m);

1124	1163

1125 return -1;	1164 return -1;

1126 }	1165 }

1127	1166

1128 template <typename schar, typename pchar>	1167

	1168 template <typename schar>

1129 static int SingleCharIndexOf(Vector<const schar> string,	1169 static int SingleCharIndexOf(Vector<const schar> string,

1130 pchar pattern_char,	1170 uc16 pattern_char,

1131 int start_index) {	1171 int start_index) {

	1172 if (sizeof(schar) == 1 && pattern_char > String::kMaxAsciiCharCode) {

	1173 return -1;

	1174 }

1132 for (int i = start_index, n = string.length(); i < n; i++) {	1175 for (int i = start_index, n = string.length(); i < n; i++) {

1133 if (pattern_char == string[i]) {	1176 if (pattern_char == string[i]) {

1134 return i;	1177 return i;

1135 }	1178 }

1136 }	1179 }

1137 return -1;	1180 return -1;

1138 }	1181 }

1139	1182

1140 // Trivial string search for shorter strings.	1183 // Trivial string search for shorter strings.

1141 // On return, if "complete" is set to true, the return value is the	1184 // On return, if "complete" is set to true, the return value is the

1142 // final result of searching for the patter in the subject.	1185 // final result of searching for the patter in the subject.

1143 // If "complete" is set to false, the return value is the index where	1186 // If "complete" is set to false, the return value is the index where

1144 // further checking should start, i.e., it's guaranteed that the pattern	1187 // further checking should start, i.e., it's guaranteed that the pattern

1145 // does not occur at a position prior to the returned index.	1188 // does not occur at a position prior to the returned index.

1146 template <typename pchar, typename schar>	1189 template <typename pchar, typename schar>

1147 static int SimpleIndexOf(Vector<const schar> subject,	1190 static int SimpleIndexOf(Vector<const schar> subject,

1148 Vector<const pchar> pattern,	1191 Vector<const pchar> pattern,

1149 int start_index,	1192 int idx,

1150 bool &complete) {	1193 bool &complete) {

1151 int pattern_length = pattern.length();	1194 // Badness is a count of how much work we have done. When we have

1152 int subject_length = subject.length();	1195 // done enough work we decide it's probably worth switching to a better

1153 // Badness is a count of how many extra times the same character	1196 // algorithm.

1154 // is checked. We compare it to the index counter, so we start	1197 int badness = -10 - (pattern.length() << 2);

1155 // it at the start_index, and give it a little discount to avoid

1156 // very early bail-outs.

1157 int badness = start_index - pattern_length;

1158 // We know our pattern is at least 2 characters, we cache the first so	1198 // We know our pattern is at least 2 characters, we cache the first so

1159 // the common case of the first character not matching is faster.	1199 // the common case of the first character not matching is faster.

1160 pchar pattern_first_char = pattern[0];	1200 pchar pattern_first_char = pattern[0];

1161	1201

1162 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {	1202 for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {

	1203 badness++;

	1204 if (badness > 0) {

	1205 complete = false;

	1206 return (i);

	1207 }

1163 if (subject[i] != pattern_first_char) continue;	1208 if (subject[i] != pattern_first_char) continue;

1164 int j = 1;	1209 int j = 1;

1165 do {	1210 do {

1166 if (pattern[j] != subject[i+j]) {	1211 if (pattern[j] != subject[i+j]) {

1167 break;	1212 break;

1168 }	1213 }

1169 j++;	1214 j++;

1170 } while (j < pattern_length);	1215 } while (j < pattern.length());

1171 if (j == pattern_length) {	1216 if (j == pattern.length()) {

1172 complete = true;	1217 complete = true;

1173 return i;	1218 return i;

1174 }	1219 }

1175 badness += j;	1220 badness += j;

1176 if (badness > i) { // More than one extra character on average.

1177 complete = false;

1178 return (i + 1); // No matches up to index i+1.

1179 }

1180 }	1221 }

1181 complete = true;	1222 complete = true;

1182 return -1;	1223 return -1;

1183 }	1224 }

1184	1225

1185 // Dispatch to different algorithms for different length of pattern/subject	1226 // Simple indexOf that never bails out. For short patterns only.

	1227 template <typename pchar, typename schar>

	1228 static int SimpleIndexOf(Vector<const schar> subject,

	1229 Vector<const pchar> pattern,

	1230 int idx) {

	1231 pchar pattern_first_char = pattern[0];

	1232 for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {

	1233 if (subject[i] != pattern_first_char) continue;

	1234 int j = 1;

	1235 do {

	1236 if (pattern[j] != subject[i+j]) {

	1237 break;

	1238 }

	1239 j++;

	1240 } while (j < pattern.length());

	1241 if (j == pattern.length()) {

	1242 return i;

	1243 }

	1244 }

	1245 return -1;

	1246 }

	1247

	1248

	1249 // Dispatch to different algorithms.

1186 template <typename schar, typename pchar>	1250 template <typename schar, typename pchar>

1187 static int StringMatchStrategy(Vector<const schar> sub,	1251 static int StringMatchStrategy(Vector<const schar> sub,

1188 Vector<const pchar> pat,	1252 Vector<const pchar> pat,

1189 int start_index) {	1253 int start_index) {

1190 ASSERT(pat.length() > 1);	1254 ASSERT(pat.length() > 1);

1191	1255

1192 // We have an ASCII haystack and a non-ASCII needle. Check if there	1256 // We have an ASCII haystack and a non-ASCII needle. Check if there

1193 // really is a non-ASCII character in the needle and bail out if there	1257 // really is a non-ASCII character in the needle and bail out if there

1194 // is.	1258 // is.

1195 if (sizeof(pchar) > 1 && sizeof(schar) == 1) {	1259 if (sizeof(pchar) > 1 && sizeof(schar) == 1) {

1196 for (int i = 0; i < pat.length(); i++) {	1260 for (int i = 0; i < pat.length(); i++) {

1197 uc16 c = pat[i];	1261 uc16 c = pat[i];

1198 if (c > String::kMaxAsciiCharCode) {	1262 if (c > String::kMaxAsciiCharCode) {

1199 return -1;	1263 return -1;

1200 }	1264 }

1201 }	1265 }

1202 }	1266 }

1203 // For small searches, a complex sort is not worth the setup overhead.	1267 if (pat.length() < kBMMinPatternLength) {

	1268 // We don't believe fancy searching can ever be more efficient.

	1269 // The max shift of Boyer-Moore on a pattern of this length does

	1270 // not compensate for the overhead.

	1271 return SimpleIndexOf(sub, pat, start_index);

	1272 }

	1273 // Try algorithms in order of increasing setup cost and expected performance.

1204 bool complete;	1274 bool complete;

1205 int idx = SimpleIndexOf(sub, pat, start_index, complete);	1275 int idx = SimpleIndexOf(sub, pat, start_index, complete);

1206 if (complete) return idx;	1276 if (complete) return idx;

	1277 idx = BoyerMooreSimplified(sub, pat, idx, complete);

	1278 if (complete) return idx;

1207 return BoyerMooreIndexOf(sub, pat, idx);	1279 return BoyerMooreIndexOf(sub, pat, idx);

1208 }	1280 }

1209	1281

1210 // Perform string match of pattern on subject, starting at start index.	1282 // Perform string match of pattern on subject, starting at start index.

1211 // Caller must ensure that 0 <= start_index <= sub->length(),	1283 // Caller must ensure that 0 <= start_index <= sub->length(),

1212 // and should check that pat->length() + start_index <= sub->length()	1284 // and should check that pat->length() + start_index <= sub->length()

1213 int Runtime::StringMatch(Handle<String> sub,	1285 int Runtime::StringMatch(Handle<String> sub,

1214 Handle<String> pat,	1286 Handle<String> pat,

1215 int start_index) {	1287 int start_index) {

1216 ASSERT(0 <= start_index);	1288 ASSERT(0 <= start_index);

(...skipping 4113 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5330	5402

5331 void Runtime::PerformGC(Object* result) {	5403 void Runtime::PerformGC(Object* result) {

5332 Failure* failure = Failure::cast(result);	5404 Failure* failure = Failure::cast(result);

5333 // Try to do a garbage collection; ignore it if it fails. The C	5405 // Try to do a garbage collection; ignore it if it fails. The C

5334 // entry stub will throw an out-of-memory exception in that case.	5406 // entry stub will throw an out-of-memory exception in that case.

5335 Heap::CollectGarbage(failure->requested(), failure->allocation_space());	5407 Heap::CollectGarbage(failure->requested(), failure->allocation_space());

5336 }	5408 }

5337	5409

5338	5410

5339 } } // namespace v8::internal	5411 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »