Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(420)

Side by Side Diff: src/string-search.h

Issue 3291021: Move string-search functions to separate file. (Closed)
Patch Set: Address (some)co Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/runtime.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #ifndef V8_STRING_SEARCH_H_
29 #define V8_STRING_SEARCH_H_
30
31 namespace v8 {
32 namespace internal {
33
34
35 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
36 // limit, we can fix the size of tables. For a needle longer than this limit,
37 // search will not be optimal, since we only build tables for a smaller suffix
38 // of the string, which is a safe approximation.
39 static const int kBMMaxShift = 250;
40 // Reduce alphabet to this size.
41 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
42 // proportional to the input alphabet. We reduce the alphabet size by
43 // equating input characters modulo a smaller alphabet size. This gives
44 // a potentially less efficient searching, but is a safe approximation.
45 // For needles using only characters in the same Unicode 256-code point page,
46 // there is no search speed degradation.
47 static const int kBMAlphabetSize = 256;
48 // For patterns below this length, the skip length of Boyer-Moore is too short
49 // to compensate for the algorithmic overhead compared to simple brute force.
50 static const int kBMMinPatternLength = 7;
51
52 // Holds the two buffers used by Boyer-Moore string search's Good Suffix
53 // shift. Only allows the last kBMMaxShift characters of the needle
54 // to be indexed.
55 class BMGoodSuffixBuffers {
56 public:
57 BMGoodSuffixBuffers() {}
58 inline void Initialize(int needle_length) {
59 ASSERT(needle_length > 1);
60 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
61 int len = needle_length - start;
62 biased_suffixes_ = suffixes_ - start;
63 biased_good_suffix_shift_ = good_suffix_shift_ - start;
64 for (int i = 0; i <= len; i++) {
65 good_suffix_shift_[i] = len;
66 }
67 }
68 inline int& suffix(int index) {
69 ASSERT(biased_suffixes_ + index >= suffixes_);
70 return biased_suffixes_[index];
71 }
72 inline int& shift(int index) {
73 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
74 return biased_good_suffix_shift_[index];
75 }
76 private:
77 int suffixes_[kBMMaxShift + 1];
78 int good_suffix_shift_[kBMMaxShift + 1];
79 int* biased_suffixes_;
80 int* biased_good_suffix_shift_;
81 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
82 };
83
84 // buffers reused by BoyerMoore
85 struct BMBuffers {
86 public:
87 static int bad_char_occurrence[kBMAlphabetSize];
88 static BMGoodSuffixBuffers bmgs_buffers;
89 };
90
91 // State of the string match tables.
92 // SIMPLE: No usable content in the buffers.
93 // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.
94 // BOYER_MOORE: The bmgs_buffers tables have also been populated.
95 // Whenever starting with a new needle, one should call InitializeStringSearch
96 // to determine which search strategy to use, and in the case of a long-needle
97 // strategy, the call also initializes the algorithm to SIMPLE.
98 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
99 static StringSearchAlgorithm algorithm;
100
101
102 // Compute the bad-char table for Boyer-Moore in the static buffer.
103 template <typename PatternChar>
104 static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {
105 // Only preprocess at most kBMMaxShift last characters of pattern.
106 int start = Max(pattern.length() - kBMMaxShift, 0);
107 // Run forwards to populate bad_char_table, so that *last* instance
108 // of character equivalence class is the one registered.
109 // Notice: Doesn't include the last character.
110 int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1
111 : kBMAlphabetSize;
112 if (start == 0) { // All patterns less than kBMMaxShift in length.
113 memset(BMBuffers::bad_char_occurrence,
114 -1,
115 table_size * sizeof(*BMBuffers::bad_char_occurrence));
116 } else {
117 for (int i = 0; i < table_size; i++) {
118 BMBuffers::bad_char_occurrence[i] = start - 1;
119 }
120 }
121 for (int i = start; i < pattern.length() - 1; i++) {
122 PatternChar c = pattern[i];
123 int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;
124 BMBuffers::bad_char_occurrence[bucket] = i;
125 }
126 }
127
128
129 template <typename PatternChar>
130 static void BoyerMoorePopulateGoodSuffixTable(
131 Vector<const PatternChar> pattern) {
132 int m = pattern.length();
133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
134 int len = m - start;
135 // Compute Good Suffix tables.
136 BMBuffers::bmgs_buffers.Initialize(m);
137
138 BMBuffers::bmgs_buffers.shift(m-1) = 1;
139 BMBuffers::bmgs_buffers.suffix(m) = m + 1;
140 PatternChar last_char = pattern[m - 1];
141 int suffix = m + 1;
142 {
143 int i = m;
144 while (i > start) {
145 PatternChar c = pattern[i - 1];
146 while (suffix <= m && c != pattern[suffix - 1]) {
147 if (BMBuffers::bmgs_buffers.shift(suffix) == len) {
148 BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;
149 }
150 suffix = BMBuffers::bmgs_buffers.suffix(suffix);
151 }
152 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
153 if (suffix == m) {
154 // No suffix to extend, so we check against last_char only.
155 while ((i > start) && (pattern[i - 1] != last_char)) {
156 if (BMBuffers::bmgs_buffers.shift(m) == len) {
157 BMBuffers::bmgs_buffers.shift(m) = m - i;
158 }
159 BMBuffers::bmgs_buffers.suffix(--i) = m;
160 }
161 if (i > start) {
162 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
163 }
164 }
165 }
166 }
167 if (suffix < m) {
168 for (int i = start; i <= m; i++) {
169 if (BMBuffers::bmgs_buffers.shift(i) == len) {
170 BMBuffers::bmgs_buffers.shift(i) = suffix - start;
171 }
172 if (i == suffix) {
173 suffix = BMBuffers::bmgs_buffers.suffix(suffix);
174 }
175 }
176 }
177 }
178
179
180 template <typename SubjectChar, typename PatternChar>
181 static inline int CharOccurrence(int char_code) {
182 if (sizeof(SubjectChar) == 1) {
183 return BMBuffers::bad_char_occurrence[char_code];
184 }
185 if (sizeof(PatternChar) == 1) {
186 if (char_code > String::kMaxAsciiCharCode) {
187 return -1;
188 }
189 return BMBuffers::bad_char_occurrence[char_code];
190 }
191 return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];
192 }
193
194
195 // Restricted simplified Boyer-Moore string matching.
196 // Uses only the bad-shift table of Boyer-Moore and only uses it
197 // for the character compared to the last character of the needle.
198 template <typename SubjectChar, typename PatternChar>
199 static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
200 Vector<const PatternChar> pattern,
201 int start_index,
202 bool* complete) {
203 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
204 int n = subject.length();
205 int m = pattern.length();
206
207 int badness = -m;
208
209 // How bad we are doing without a good-suffix table.
210 int idx; // No matches found prior to this index.
211 PatternChar last_char = pattern[m - 1];
212 int last_char_shift =
213 m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
214 // Perform search
215 for (idx = start_index; idx <= n - m;) {
216 int j = m - 1;
217 int c;
218 while (last_char != (c = subject[idx + j])) {
219 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
220 int shift = j - bc_occ;
221 idx += shift;
222 badness += 1 - shift; // at most zero, so badness cannot increase.
223 if (idx > n - m) {
224 *complete = true;
225 return -1;
226 }
227 }
228 j--;
229 while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
230 if (j < 0) {
231 *complete = true;
232 return idx;
233 } else {
234 idx += last_char_shift;
235 // Badness increases by the number of characters we have
236 // checked, and decreases by the number of characters we
237 // can skip by shifting. It's a measure of how we are doing
238 // compared to reading each character exactly once.
239 badness += (m - j) - last_char_shift;
240 if (badness > 0) {
241 *complete = false;
242 return idx;
243 }
244 }
245 }
246 *complete = true;
247 return -1;
248 }
249
250
251 template <typename SubjectChar, typename PatternChar>
252 static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,
253 Vector<const PatternChar> pattern,
254 int idx) {
255 ASSERT(algorithm <= BOYER_MOORE);
256 int n = subject.length();
257 int m = pattern.length();
258 // Only preprocess at most kBMMaxShift last characters of pattern.
259 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
260
261 PatternChar last_char = pattern[m - 1];
262 // Continue search from i.
263 while (idx <= n - m) {
264 int j = m - 1;
265 SubjectChar c;
266 while (last_char != (c = subject[idx + j])) {
267 int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);
268 idx += shift;
269 if (idx > n - m) {
270 return -1;
271 }
272 }
273 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
274 if (j < 0) {
275 return idx;
276 } else if (j < start) {
277 // we have matched more than our tables allow us to be smart about.
278 // Fall back on BMH shift.
279 idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
280 } else {
281 int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); // Good suffix shift.
282 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
283 int shift = j - bc_occ; // Bad-char shift.
284 if (gs_shift > shift) {
285 shift = gs_shift;
286 }
287 idx += shift;
288 }
289 }
290
291 return -1;
292 }
293
294
295 // Trivial string search for shorter strings.
296 // On return, if "complete" is set to true, the return value is the
297 // final result of searching for the patter in the subject.
298 // If "complete" is set to false, the return value is the index where
299 // further checking should start, i.e., it's guaranteed that the pattern
300 // does not occur at a position prior to the returned index.
301 template <typename PatternChar, typename SubjectChar>
302 static int SimpleIndexOf(Vector<const SubjectChar> subject,
303 Vector<const PatternChar> pattern,
304 int idx,
305 bool* complete) {
306 ASSERT(pattern.length() > 1);
307 int pattern_length = pattern.length();
308 // Badness is a count of how much work we have done. When we have
309 // done enough work we decide it's probably worth switching to a better
310 // algorithm.
311 int badness = -10 - (pattern_length << 2);
312
313 // We know our pattern is at least 2 characters, we cache the first so
314 // the common case of the first character not matching is faster.
315 PatternChar pattern_first_char = pattern[0];
316 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
317 badness++;
318 if (badness > 0) {
319 *complete = false;
320 return i;
321 }
322 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
323 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
324 memchr(subject.start() + i,
325 pattern_first_char,
326 n - i + 1));
327 if (pos == NULL) {
328 *complete = true;
329 return -1;
330 }
331 i = static_cast<int>(pos - subject.start());
332 } else {
333 if (subject[i] != pattern_first_char) continue;
334 }
335 int j = 1;
336 do {
337 if (pattern[j] != subject[i+j]) {
338 break;
339 }
340 j++;
341 } while (j < pattern_length);
342 if (j == pattern_length) {
343 *complete = true;
344 return i;
345 }
346 badness += j;
347 }
348 *complete = true;
349 return -1;
350 }
351
352 // Simple indexOf that never bails out. For short patterns only.
353 template <typename PatternChar, typename SubjectChar>
354 static int SimpleIndexOf(Vector<const SubjectChar> subject,
355 Vector<const PatternChar> pattern,
356 int idx) {
357 int pattern_length = pattern.length();
358 PatternChar pattern_first_char = pattern[0];
359 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
360 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
361 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
362 memchr(subject.start() + i,
363 pattern_first_char,
364 n - i + 1));
365 if (pos == NULL) return -1;
366 i = static_cast<int>(pos - subject.start());
367 } else {
368 if (subject[i] != pattern_first_char) continue;
369 }
370 int j = 1;
371 while (j < pattern_length) {
372 if (pattern[j] != subject[i+j]) {
373 break;
374 }
375 j++;
376 }
377 if (j == pattern_length) {
378 return i;
379 }
380 }
381 return -1;
382 }
383
384
385 // Strategy for searching for a string in another string.
386 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
387
388
389 template <typename PatternChar>
390 static inline StringSearchStrategy InitializeStringSearch(
391 Vector<const PatternChar> pat, bool ascii_subject) {
392 // We have an ASCII haystack and a non-ASCII needle. Check if there
393 // really is a non-ASCII character in the needle and bail out if there
394 // is.
395 if (ascii_subject && sizeof(PatternChar) > 1) {
396 for (int i = 0; i < pat.length(); i++) {
397 uc16 c = pat[i];
398 if (c > String::kMaxAsciiCharCode) {
399 return SEARCH_FAIL;
400 }
401 }
402 }
403 if (pat.length() < kBMMinPatternLength) {
404 return SEARCH_SHORT;
405 }
406 algorithm = SIMPLE_SEARCH;
407 return SEARCH_LONG;
408 }
409
410
411 // Dispatch long needle searches to different algorithms.
412 template <typename SubjectChar, typename PatternChar>
413 static int ComplexIndexOf(Vector<const SubjectChar> sub,
414 Vector<const PatternChar> pat,
415 int start_index) {
416 ASSERT(pat.length() >= kBMMinPatternLength);
417 // Try algorithms in order of increasing setup cost and expected performance.
418 bool complete;
419 int idx = start_index;
420 switch (algorithm) {
421 case SIMPLE_SEARCH:
422 idx = SimpleIndexOf(sub, pat, idx, &complete);
423 if (complete) return idx;
424 BoyerMoorePopulateBadCharTable(pat);
425 algorithm = BOYER_MOORE_HORSPOOL;
426 // FALLTHROUGH.
427 case BOYER_MOORE_HORSPOOL:
428 idx = BoyerMooreHorspool(sub, pat, idx, &complete);
429 if (complete) return idx;
430 // Build the Good Suffix table and continue searching.
431 BoyerMoorePopulateGoodSuffixTable(pat);
432 algorithm = BOYER_MOORE;
433 // FALLTHROUGH.
434 case BOYER_MOORE:
435 return BoyerMooreIndexOf(sub, pat, idx);
436 }
437 UNREACHABLE();
438 return -1;
439 }
440
441
442 // Dispatch to different search strategies for a single search.
443 // If searching multiple times on the same needle, the search
444 // strategy should only be computed once and then dispatch to different
445 // loops.
446 template <typename SubjectChar, typename PatternChar>
447 static int StringSearch(Vector<const SubjectChar> sub,
448 Vector<const PatternChar> pat,
449 int start_index) {
450 bool ascii_subject = (sizeof(SubjectChar) == 1);
451 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
452 switch (strategy) {
453 case SEARCH_FAIL: return -1;
454 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
455 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
456 }
457 UNREACHABLE();
458 return -1;
459 }
460
461 }} // namespace v8::internal
462
463 #endif // V8_STRING_SEARCH_H_
OLDNEW
« no previous file with comments | « src/runtime.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698