Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(407)

Side by Side Diff: src/string-search.h

Issue 3467010: Refactored string search code. (Closed)
Patch Set: Added to xcode project too. Created 10 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/runtime.cc ('k') | tools/gyp/v8.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 14 matching lines...) Expand all
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #ifndef V8_STRING_SEARCH_H_ 28 #ifndef V8_STRING_SEARCH_H_
29 #define V8_STRING_SEARCH_H_ 29 #define V8_STRING_SEARCH_H_
30 30
31 namespace v8 { 31 namespace v8 {
32 namespace internal { 32 namespace internal {
33 33
34 34
35 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 35 //---------------------------------------------------------------------
36 // limit, we can fix the size of tables. For a needle longer than this limit, 36 // String Search object.
37 // search will not be optimal, since we only build tables for a smaller suffix 37 //---------------------------------------------------------------------
38 // of the string, which is a safe approximation. 38
39 static const int kBMMaxShift = 250; 39 // Class holding constants and methods that apply to all string search variants,
40 // Reduce alphabet to this size. 40 // independently of subject and pattern char size.
41 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 41 class StringSearchBase {
42 // proportional to the input alphabet. We reduce the alphabet size by 42 protected:
43 // equating input characters modulo a smaller alphabet size. This gives 43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44 // a potentially less efficient searching, but is a safe approximation. 44 // limit, we can fix the size of tables. For a needle longer than this limit,
45 // For needles using only characters in the same Unicode 256-code point page, 45 // search will not be optimal, since we only build tables for a suffix
46 // there is no search speed degradation. 46 // of the string, but it is a safe approximation.
47 static const int kBMAlphabetSize = 256; 47 static const int kBMMaxShift = 250;
48 // For patterns below this length, the skip length of Boyer-Moore is too short 48
49 // to compensate for the algorithmic overhead compared to simple brute force. 49 // Reduce alphabet to this size.
50 static const int kBMMinPatternLength = 7; 50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51 51 // proportional to the input alphabet. We reduce the alphabet size by
52 // Holds the two buffers used by Boyer-Moore string search's Good Suffix 52 // equating input characters modulo a smaller alphabet size. This gives
53 // shift. Only allows the last kBMMaxShift characters of the needle 53 // a potentially less efficient searching, but is a safe approximation.
54 // to be indexed. 54 // For needles using only characters in the same Unicode 256-code point page,
55 class BMGoodSuffixBuffers { 55 // there is no search speed degradation.
56 static const int kAsciiAlphabetSize = 128;
57 static const int kUC16AlphabetSize = 256;
58
59 // Bad-char shift table stored in the state. It's length is the alphabet size.
60 // For patterns below this length, the skip length of Boyer-Moore is too short
61 // to compensate for the algorithmic overhead compared to simple brute force.
62 static const int kBMMinPatternLength = 7;
63
64 static inline bool IsAsciiString(Vector<const char>) {
65 return true;
66 }
67
68 static inline bool IsAsciiString(Vector<const uc16> string) {
69 for (int i = 0, n = string.length(); i < n; i++) {
70 if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) {
71 return false;
72 }
73 }
74 return true;
75 }
76
77 // The following tables are shared by all searches.
78 // TODO(lrn): Introduce a way for a pattern to keep its tables
79 // between searches (e.g., for an Atom RegExp).
80
81 // Store for the BoyerMoore(Horspool) bad char shift table.
82 static int kBadCharShiftTable[kUC16AlphabetSize];
83 // Store for the BoyerMoore good suffix shift table.
84 static int kGoodSuffixShiftTable[kBMMaxShift + 1];
85 // Table used temporarily while building the BoyerMoore good suffix
86 // shift table.
87 static int kSuffixTable[kBMMaxShift + 1];
88 };
89
90
91 template <typename PatternChar, typename SubjectChar>
92 class StringSearch : private StringSearchBase {
56 public: 93 public:
57 BMGoodSuffixBuffers() {} 94 explicit StringSearch(Vector<const PatternChar> pattern)
58 inline void Initialize(int needle_length) { 95 : pattern_(pattern),
59 ASSERT(needle_length > 1); 96 start_(Max(0, pattern.length() - kBMMaxShift)) {
60 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; 97 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61 int len = needle_length - start; 98 if (!IsAsciiString(pattern_)) {
62 biased_suffixes_ = suffixes_ - start; 99 strategy_ = &FailSearch;
63 biased_good_suffix_shift_ = good_suffix_shift_ - start; 100 return;
64 for (int i = 0; i <= len; i++) { 101 }
65 good_suffix_shift_[i] = len; 102 }
66 } 103 int pattern_length = pattern_.length();
67 } 104 if (pattern_length < kBMMinPatternLength) {
68 inline int& suffix(int index) { 105 if (pattern_length == 1) {
69 ASSERT(biased_suffixes_ + index >= suffixes_); 106 strategy_ = &SingleCharSearch;
70 return biased_suffixes_[index]; 107 return;
71 } 108 }
72 inline int& shift(int index) { 109 strategy_ = &LinearSearch;
73 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_); 110 return;
74 return biased_good_suffix_shift_[index]; 111 }
75 } 112 strategy_ = &InitialSearch;
113 }
114
115 int Search(Vector<const SubjectChar> subject, int index) {
116 return strategy_(this, subject, index);
117 }
118
119 static inline int AlphabetSize() {
120 if (sizeof(PatternChar) == 1) {
121 // ASCII needle.
122 return kAsciiAlphabetSize;
123 } else {
124 ASSERT(sizeof(PatternChar) == 2);
125 // UC16 needle.
126 return kUC16AlphabetSize;
127 }
128 }
129
76 private: 130 private:
77 int suffixes_[kBMMaxShift + 1]; 131 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
78 int good_suffix_shift_[kBMMaxShift + 1]; 132 StringSearch<PatternChar, SubjectChar>*,
79 int* biased_suffixes_; 133 Vector<const SubjectChar>,
80 int* biased_good_suffix_shift_; 134 int);
81 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); 135
136 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
137 Vector<const SubjectChar>,
138 int) {
139 return -1;
140 }
141
142 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
143 Vector<const SubjectChar> subject,
144 int start_index);
145
146 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
147 Vector<const SubjectChar> subject,
148 int start_index);
149
150 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
151 Vector<const SubjectChar> subject,
152 int start_index);
153
154 static int BoyerMooreHorspoolSearch(
155 StringSearch<PatternChar, SubjectChar>* search,
156 Vector<const SubjectChar> subject,
157 int start_index);
158
159 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
160 Vector<const SubjectChar> subject,
161 int start_index);
162
163 void PopulateBoyerMooreHorspoolTable();
164
165 void PopulateBoyerMooreTable();
166
167 static inline int CharOccurrence(int* bad_char_occurrence,
168 SubjectChar char_code) {
169 if (sizeof(SubjectChar) == 1) {
170 return bad_char_occurrence[static_cast<int>(char_code)];
171 }
172 if (sizeof(PatternChar) == 1) {
173 if (static_cast<unsigned char>(char_code) > String::kMaxAsciiCharCode) {
174 return -1;
175 }
176 return bad_char_occurrence[static_cast<int>(char_code)];
177 }
178 // Reduce to equivalence class.
179 int equiv_class = char_code % kUC16AlphabetSize;
180 return bad_char_occurrence[equiv_class];
181 }
182
183 // Return a table covering the last kBMMaxShift+1 positions of
184 // pattern.
185 int* bad_char_table() {
186 return kBadCharShiftTable;
187 }
188
189 int* good_suffix_shift_table() {
190 // Return biased pointer that maps the range [start_..pattern_.length()
191 // to the kGoodSuffixShiftTable array.
192 return kGoodSuffixShiftTable - start_;
193 }
194
195 int* suffix_table() {
196 // Return biased pointer that maps the range [start_..pattern_.length()
197 // to the kSuffixTable array.
198 return kSuffixTable - start_;
199 }
200
201 // The pattern to search for.
202 Vector<const PatternChar> pattern_;
203 // Pointer to implementation of the search.
204 SearchFunction strategy_;
205 // Cache value of Max(0, pattern_length() - kBMMaxShift)
206 int start_;
82 }; 207 };
83 208
84 // buffers reused by BoyerMoore 209
85 struct BMBuffers { 210 //---------------------------------------------------------------------
86 public: 211 // Single Character Pattern Search Strategy
87 static int bad_char_occurrence[kBMAlphabetSize]; 212 //---------------------------------------------------------------------
88 static BMGoodSuffixBuffers bmgs_buffers; 213
89 }; 214 template <typename PatternChar, typename SubjectChar>
90 215 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
91 // State of the string match tables. 216 StringSearch<PatternChar, SubjectChar>* search,
92 // SIMPLE: No usable content in the buffers. 217 Vector<const SubjectChar> subject,
93 // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated. 218 int index) {
94 // BOYER_MOORE: The bmgs_buffers tables have also been populated. 219 ASSERT_EQ(1, search->pattern_.length());
95 // Whenever starting with a new needle, one should call InitializeStringSearch 220 PatternChar pattern_first_char = search->pattern_[0];
96 // to determine which search strategy to use, and in the case of a long-needle 221 int i = index;
97 // strategy, the call also initializes the algorithm to SIMPLE. 222 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
98 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; 223 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
99 static StringSearchAlgorithm algorithm; 224 memchr(subject.start() + i,
100 225 pattern_first_char,
101 226 subject.length() - i));
102 // Compute the bad-char table for Boyer-Moore in the static buffer. 227 if (pos == NULL) return -1;
103 template <typename PatternChar> 228 return static_cast<int>(pos - subject.start());
104 static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) { 229 } else {
230 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
231 if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
232 return -1;
233 }
234 }
235 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
236 int n = subject.length();
237 while (i < n) {
238 if (subject[i++] == search_char) return i - 1;
239 }
240 return -1;
241 }
242 }
243
244 //---------------------------------------------------------------------
245 // Linear Search Strategy
246 //---------------------------------------------------------------------
247
248
249 template <typename PatternChar, typename SubjectChar>
250 static inline bool CharCompare(const PatternChar* pattern,
251 const SubjectChar* subject,
252 int length) {
253 ASSERT(length > 0);
254 int pos = 0;
255 do {
256 if (pattern[pos] != subject[pos]) {
257 return false;
258 }
259 pos++;
260 } while (pos < length);
261 return true;
262 }
263
264
265 // Simple linear search for short patterns. Never bails out.
266 template <typename PatternChar, typename SubjectChar>
267 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
268 StringSearch<PatternChar, SubjectChar>* search,
269 Vector<const SubjectChar> subject,
270 int index) {
271 Vector<const PatternChar> pattern = search->pattern_;
272 ASSERT(pattern.length() > 1);
273 int pattern_length = pattern.length();
274 PatternChar pattern_first_char = pattern[0];
275 int i = index;
276 int n = subject.length() - pattern_length;
277 while (i <= n) {
278 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
279 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
280 memchr(subject.start() + i,
281 pattern_first_char,
282 n - i + 1));
283 if (pos == NULL) return -1;
284 i = static_cast<int>(pos - subject.start()) + 1;
285 } else {
286 if (subject[i++] != pattern_first_char) continue;
287 }
288 // Loop extracted to separate function to allow using return to do
289 // a deeper break.
290 if (CharCompare(pattern.start() + 1,
291 subject.start() + i,
292 pattern_length - 1)) {
293 return i - 1;
294 }
295 }
296 return -1;
297 }
298
299 //---------------------------------------------------------------------
300 // Boyer-Moore string search
301 //---------------------------------------------------------------------
302
303 template <typename PatternChar, typename SubjectChar>
304 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
305 StringSearch<PatternChar, SubjectChar>* search,
306 Vector<const SubjectChar> subject,
307 int start_index) {
308 Vector<const PatternChar> pattern = search->pattern_;
309 int subject_length = subject.length();
310 int pattern_length = pattern.length();
105 // Only preprocess at most kBMMaxShift last characters of pattern. 311 // Only preprocess at most kBMMaxShift last characters of pattern.
106 int start = Max(pattern.length() - kBMMaxShift, 0); 312 int start = search->start_;
107 // Run forwards to populate bad_char_table, so that *last* instance 313
108 // of character equivalence class is the one registered. 314 int* bad_char_occurence = search->bad_char_table();
109 // Notice: Doesn't include the last character. 315 int* good_suffix_shift = search->good_suffix_shift_table();
110 int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1 316
111 : kBMAlphabetSize; 317 PatternChar last_char = pattern[pattern_length - 1];
112 if (start == 0) { // All patterns less than kBMMaxShift in length. 318 int index = start_index;
113 memset(BMBuffers::bad_char_occurrence, 319 // Continue search from i.
114 -1, 320 while (index <= subject_length - pattern_length) {
115 table_size * sizeof(*BMBuffers::bad_char_occurrence)); 321 int j = pattern_length - 1;
116 } else { 322 int c;
117 for (int i = 0; i < table_size; i++) { 323 while (last_char != (c = subject[index + j])) {
118 BMBuffers::bad_char_occurrence[i] = start - 1; 324 int shift =
119 } 325 j - CharOccurrence(bad_char_occurence, c);
120 } 326 index += shift;
121 for (int i = start; i < pattern.length() - 1; i++) { 327 if (index > subject_length - pattern_length) {
122 PatternChar c = pattern[i]; 328 return -1;
123 int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize; 329 }
124 BMBuffers::bad_char_occurrence[bucket] = i; 330 }
125 } 331 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
126 } 332 if (j < 0) {
127 333 return index;
128 334 } else if (j < start) {
129 template <typename PatternChar> 335 // we have matched more than our tables allow us to be smart about.
130 static void BoyerMoorePopulateGoodSuffixTable( 336 // Fall back on BMH shift.
131 Vector<const PatternChar> pattern) { 337 index +=
132 int m = pattern.length(); 338 pattern_length - 1 - CharOccurrence(bad_char_occurence, last_char);
133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; 339 } else {
134 int len = m - start; 340 int gs_shift = good_suffix_shift[j + 1];
135 // Compute Good Suffix tables. 341 int bc_occ =
136 BMBuffers::bmgs_buffers.Initialize(m); 342 CharOccurrence(bad_char_occurence, c);
137 343 int shift = j - bc_occ;
138 BMBuffers::bmgs_buffers.shift(m-1) = 1; 344 if (gs_shift > shift) {
139 BMBuffers::bmgs_buffers.suffix(m) = m + 1; 345 shift = gs_shift;
140 PatternChar last_char = pattern[m - 1]; 346 }
141 int suffix = m + 1; 347 index += shift;
348 }
349 }
350
351 return -1;
352 }
353
354
355 template <typename PatternChar, typename SubjectChar>
356 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
357 int pattern_length = pattern_.length();
358 const PatternChar* pattern = pattern_.start();
359 // Only look at the last kBMMaxShift characters of pattern (from start_
360 // to pattern_length).
361 int start = start_;
362 int length = pattern_length - start;
363
364 // Biased tables so that we can use pattern indices as table indices,
365 // even if we only cover the part of the pattern from offset start.
366 int* shift_table = good_suffix_shift_table();
367 int* suffix_table = this->suffix_table();
368
369 // Initialize table.
370 for (int i = start; i < pattern_length; i++) {
371 shift_table[i] = length;
372 }
373 shift_table[pattern_length] = 1;
374 suffix_table[pattern_length] = pattern_length + 1;
375
376 // Find suffixes.
377 PatternChar last_char = pattern[pattern_length - 1];
378 int suffix = pattern_length + 1;
142 { 379 {
143 int i = m; 380 int i = pattern_length;
144 while (i > start) { 381 while (i > start) {
145 PatternChar c = pattern[i - 1]; 382 PatternChar c = pattern[i - 1];
146 while (suffix <= m && c != pattern[suffix - 1]) { 383 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
147 if (BMBuffers::bmgs_buffers.shift(suffix) == len) { 384 if (shift_table[suffix] == length) {
148 BMBuffers::bmgs_buffers.shift(suffix) = suffix - i; 385 shift_table[suffix] = suffix - i;
149 } 386 }
150 suffix = BMBuffers::bmgs_buffers.suffix(suffix); 387 suffix = suffix_table[suffix];
151 } 388 }
152 BMBuffers::bmgs_buffers.suffix(--i) = --suffix; 389 suffix_table[--i] = --suffix;
153 if (suffix == m) { 390 if (suffix == pattern_length) {
154 // No suffix to extend, so we check against last_char only. 391 // No suffix to extend, so we check against last_char only.
155 while ((i > start) && (pattern[i - 1] != last_char)) { 392 while ((i > start) && (pattern[i - 1] != last_char)) {
156 if (BMBuffers::bmgs_buffers.shift(m) == len) { 393 if (shift_table[pattern_length] == length) {
157 BMBuffers::bmgs_buffers.shift(m) = m - i; 394 shift_table[pattern_length] = pattern_length - i;
158 } 395 }
159 BMBuffers::bmgs_buffers.suffix(--i) = m; 396 suffix_table[--i] = pattern_length;
160 } 397 }
161 if (i > start) { 398 if (i > start) {
162 BMBuffers::bmgs_buffers.suffix(--i) = --suffix; 399 suffix_table[--i] = --suffix;
163 } 400 }
164 } 401 }
165 } 402 }
166 } 403 }
167 if (suffix < m) { 404 // Build shift table using suffixes.
168 for (int i = start; i <= m; i++) { 405 if (suffix < pattern_length) {
169 if (BMBuffers::bmgs_buffers.shift(i) == len) { 406 for (int i = start; i <= pattern_length; i++) {
170 BMBuffers::bmgs_buffers.shift(i) = suffix - start; 407 if (shift_table[i] == length) {
408 shift_table[i] = suffix - start;
171 } 409 }
172 if (i == suffix) { 410 if (i == suffix) {
173 suffix = BMBuffers::bmgs_buffers.suffix(suffix); 411 suffix = suffix_table[suffix];
174 } 412 }
175 } 413 }
176 } 414 }
177 } 415 }
178 416
179 417 //---------------------------------------------------------------------
180 template <typename SubjectChar, typename PatternChar> 418 // Boyer-Moore-Horspool string search.
181 static inline int CharOccurrence(int char_code) { 419 //---------------------------------------------------------------------
182 if (sizeof(SubjectChar) == 1) { 420
183 return BMBuffers::bad_char_occurrence[char_code]; 421 template <typename PatternChar, typename SubjectChar>
184 } 422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
185 if (sizeof(PatternChar) == 1) { 423 StringSearch<PatternChar, SubjectChar>* search,
186 if (char_code > String::kMaxAsciiCharCode) { 424 Vector<const SubjectChar> subject,
187 return -1; 425 int start_index) {
188 } 426 Vector<const PatternChar> pattern = search->pattern_;
189 return BMBuffers::bad_char_occurrence[char_code]; 427 int subject_length = subject.length();
190 } 428 int pattern_length = pattern.length();
191 return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize]; 429 int* char_occurrences = search->bad_char_table();
192 } 430 int badness = -pattern_length;
193
194
195 // Restricted simplified Boyer-Moore string matching.
196 // Uses only the bad-shift table of Boyer-Moore and only uses it
197 // for the character compared to the last character of the needle.
198 template <typename SubjectChar, typename PatternChar>
199 static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
200 Vector<const PatternChar> pattern,
201 int start_index,
202 bool* complete) {
203 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
204 int n = subject.length();
205 int m = pattern.length();
206
207 int badness = -m;
208 431
209 // How bad we are doing without a good-suffix table. 432 // How bad we are doing without a good-suffix table.
210 int idx; // No matches found prior to this index. 433 PatternChar last_char = pattern[pattern_length - 1];
211 PatternChar last_char = pattern[m - 1]; 434 int last_char_shift = pattern_length - 1 -
212 int last_char_shift = 435 CharOccurrence(char_occurrences, last_char);
213 m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
214 // Perform search 436 // Perform search
215 for (idx = start_index; idx <= n - m;) { 437 int index = start_index; // No matches found prior to this index.
216 int j = m - 1; 438 while (index <= subject_length - pattern_length) {
217 int c; 439 int j = pattern_length - 1;
218 while (last_char != (c = subject[idx + j])) { 440 int subject_char;
219 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); 441 while (last_char != (subject_char = subject[index + j])) {
442 int bc_occ = CharOccurrence(char_occurrences, subject_char);
220 int shift = j - bc_occ; 443 int shift = j - bc_occ;
221 idx += shift; 444 index += shift;
222 badness += 1 - shift; // at most zero, so badness cannot increase. 445 badness += 1 - shift; // at most zero, so badness cannot increase.
223 if (idx > n - m) { 446 if (index > subject_length - pattern_length) {
224 *complete = true;
225 return -1; 447 return -1;
226 } 448 }
227 } 449 }
228 j--; 450 j--;
229 while (j >= 0 && pattern[j] == (subject[idx + j])) j--; 451 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
230 if (j < 0) { 452 if (j < 0) {
231 *complete = true; 453 return index;
232 return idx;
233 } else { 454 } else {
234 idx += last_char_shift; 455 index += last_char_shift;
235 // Badness increases by the number of characters we have 456 // Badness increases by the number of characters we have
236 // checked, and decreases by the number of characters we 457 // checked, and decreases by the number of characters we
237 // can skip by shifting. It's a measure of how we are doing 458 // can skip by shifting. It's a measure of how we are doing
238 // compared to reading each character exactly once. 459 // compared to reading each character exactly once.
239 badness += (m - j) - last_char_shift; 460 badness += (pattern_length - j) - last_char_shift;
240 if (badness > 0) { 461 if (badness > 0) {
241 *complete = false; 462 search->PopulateBoyerMooreTable();
242 return idx; 463 search->strategy_ = &BoyerMooreSearch;
243 } 464 return BoyerMooreSearch(search, subject, index);
244 } 465 }
245 } 466 }
246 *complete = true; 467 }
247 return -1; 468 return -1;
248 } 469 }
249 470
250 471
251 template <typename SubjectChar, typename PatternChar> 472 template <typename PatternChar, typename SubjectChar>
252 static int BoyerMooreIndexOf(Vector<const SubjectChar> subject, 473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
253 Vector<const PatternChar> pattern, 474 int pattern_length = pattern_.length();
254 int idx) { 475
255 ASSERT(algorithm <= BOYER_MOORE); 476 int* bad_char_occurrence = bad_char_table();
256 int n = subject.length(); 477
257 int m = pattern.length();
258 // Only preprocess at most kBMMaxShift last characters of pattern. 478 // Only preprocess at most kBMMaxShift last characters of pattern.
259 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; 479 int start = start_;
260 480 // Run forwards to populate bad_char_table, so that *last* instance
261 PatternChar last_char = pattern[m - 1]; 481 // of character equivalence class is the one registered.
262 // Continue search from i. 482 // Notice: Doesn't include the last character.
263 while (idx <= n - m) { 483 int table_size = AlphabetSize();
264 int j = m - 1; 484 if (start == 0) { // All patterns less than kBMMaxShift in length.
265 SubjectChar c; 485 memset(bad_char_occurrence,
266 while (last_char != (c = subject[idx + j])) { 486 -1,
267 int shift = j - CharOccurrence<SubjectChar, PatternChar>(c); 487 table_size * sizeof(*bad_char_occurrence));
268 idx += shift; 488 } else {
269 if (idx > n - m) { 489 for (int i = 0; i < table_size; i++) {
270 return -1; 490 bad_char_occurrence[i] = start - 1;
271 } 491 }
272 } 492 }
273 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; 493 for (int i = start; i < pattern_length - 1; i++) {
274 if (j < 0) { 494 PatternChar c = pattern_[i];
275 return idx; 495 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
276 } else if (j < start) { 496 bad_char_occurrence[bucket] = i;
277 // we have matched more than our tables allow us to be smart about. 497 }
278 // Fall back on BMH shift. 498 }
279 idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); 499
280 } else { 500 //---------------------------------------------------------------------
281 int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); 501 // Linear string search with bailout to BMH.
282 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); 502 //---------------------------------------------------------------------
283 int shift = j - bc_occ; 503
284 if (gs_shift > shift) { 504 // Simple linear search for short patterns, which bails out if the string
285 shift = gs_shift; 505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
286 } 506 template <typename PatternChar, typename SubjectChar>
287 idx += shift; 507 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
288 } 508 StringSearch<PatternChar, SubjectChar>* search,
289 } 509 Vector<const SubjectChar> subject,
290 510 int index) {
291 return -1; 511 Vector<const PatternChar> pattern = search->pattern_;
292 }
293
294
295 // Trivial string search for shorter strings.
296 // On return, if "complete" is set to true, the return value is the
297 // final result of searching for the patter in the subject.
298 // If "complete" is set to false, the return value is the index where
299 // further checking should start, i.e., it's guaranteed that the pattern
300 // does not occur at a position prior to the returned index.
301 template <typename PatternChar, typename SubjectChar>
302 static int SimpleIndexOf(Vector<const SubjectChar> subject,
303 Vector<const PatternChar> pattern,
304 int idx,
305 bool* complete) {
306 ASSERT(pattern.length() > 1);
307 int pattern_length = pattern.length(); 512 int pattern_length = pattern.length();
308 // Badness is a count of how much work we have done. When we have 513 // Badness is a count of how much work we have done. When we have
309 // done enough work we decide it's probably worth switching to a better 514 // done enough work we decide it's probably worth switching to a better
310 // algorithm. 515 // algorithm.
311 int badness = -10 - (pattern_length << 2); 516 int badness = -10 - (pattern_length << 2);
312 517
313 // We know our pattern is at least 2 characters, we cache the first so 518 // We know our pattern is at least 2 characters, we cache the first so
314 // the common case of the first character not matching is faster. 519 // the common case of the first character not matching is faster.
315 PatternChar pattern_first_char = pattern[0]; 520 PatternChar pattern_first_char = pattern[0];
316 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { 521 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
317 badness++; 522 badness++;
318 if (badness > 0) { 523 if (badness <= 0) {
319 *complete = false; 524 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
320 return i; 525 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
321 } 526 memchr(subject.start() + i,
322 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 527 pattern_first_char,
323 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 528 n - i + 1));
324 memchr(subject.start() + i, 529 if (pos == NULL) {
325 pattern_first_char, 530 return -1;
326 n - i + 1)); 531 }
327 if (pos == NULL) { 532 i = static_cast<int>(pos - subject.start());
328 *complete = true; 533 } else {
329 return -1; 534 if (subject[i] != pattern_first_char) continue;
330 } 535 }
331 i = static_cast<int>(pos - subject.start()); 536 int j = 1;
537 do {
538 if (pattern[j] != subject[i + j]) {
539 break;
540 }
541 j++;
542 } while (j < pattern_length);
543 if (j == pattern_length) {
544 return i;
545 }
546 badness += j;
332 } else { 547 } else {
333 if (subject[i] != pattern_first_char) continue; 548 search->PopulateBoyerMooreHorspoolTable();
334 } 549 search->strategy_ = &BoyerMooreHorspoolSearch;
335 int j = 1; 550 return BoyerMooreHorspoolSearch(search, subject, i);
336 do {
337 if (pattern[j] != subject[i+j]) {
338 break;
339 }
340 j++;
341 } while (j < pattern_length);
342 if (j == pattern_length) {
343 *complete = true;
344 return i;
345 }
346 badness += j;
347 }
348 *complete = true;
349 return -1;
350 }
351
352 // Simple indexOf that never bails out. For short patterns only.
353 template <typename PatternChar, typename SubjectChar>
354 static int SimpleIndexOf(Vector<const SubjectChar> subject,
355 Vector<const PatternChar> pattern,
356 int idx) {
357 int pattern_length = pattern.length();
358 PatternChar pattern_first_char = pattern[0];
359 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
360 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
361 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
362 memchr(subject.start() + i,
363 pattern_first_char,
364 n - i + 1));
365 if (pos == NULL) return -1;
366 i = static_cast<int>(pos - subject.start());
367 } else {
368 if (subject[i] != pattern_first_char) continue;
369 }
370 int j = 1;
371 while (j < pattern_length) {
372 if (pattern[j] != subject[i+j]) {
373 break;
374 }
375 j++;
376 }
377 if (j == pattern_length) {
378 return i;
379 } 551 }
380 } 552 }
381 return -1; 553 return -1;
382 } 554 }
383 555
384 556
385 // Strategy for searching for a string in another string. 557 // Perform a a single stand-alone search.
386 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; 558 // If searching multiple times for the same pattern, a search
387 559 // object should be constructed once and the Search function then called
388 560 // for each search.
389 template <typename PatternChar>
390 static inline StringSearchStrategy InitializeStringSearch(
391 Vector<const PatternChar> pat, bool ascii_subject) {
392 // We have an ASCII haystack and a non-ASCII needle. Check if there
393 // really is a non-ASCII character in the needle and bail out if there
394 // is.
395 if (ascii_subject && sizeof(PatternChar) > 1) {
396 for (int i = 0; i < pat.length(); i++) {
397 uc16 c = pat[i];
398 if (c > String::kMaxAsciiCharCode) {
399 return SEARCH_FAIL;
400 }
401 }
402 }
403 if (pat.length() < kBMMinPatternLength) {
404 return SEARCH_SHORT;
405 }
406 algorithm = SIMPLE_SEARCH;
407 return SEARCH_LONG;
408 }
409
410
411 // Dispatch long needle searches to different algorithms.
412 template <typename SubjectChar, typename PatternChar> 561 template <typename SubjectChar, typename PatternChar>
413 static int ComplexIndexOf(Vector<const SubjectChar> sub, 562 static int SearchString(Vector<const SubjectChar> subject,
414 Vector<const PatternChar> pat, 563 Vector<const PatternChar> pattern,
415 int start_index) {
416 ASSERT(pat.length() >= kBMMinPatternLength);
417 // Try algorithms in order of increasing setup cost and expected performance.
418 bool complete;
419 int idx = start_index;
420 switch (algorithm) {
421 case SIMPLE_SEARCH:
422 idx = SimpleIndexOf(sub, pat, idx, &complete);
423 if (complete) return idx;
424 BoyerMoorePopulateBadCharTable(pat);
425 algorithm = BOYER_MOORE_HORSPOOL;
426 // FALLTHROUGH.
427 case BOYER_MOORE_HORSPOOL:
428 idx = BoyerMooreHorspool(sub, pat, idx, &complete);
429 if (complete) return idx;
430 // Build the Good Suffix table and continue searching.
431 BoyerMoorePopulateGoodSuffixTable(pat);
432 algorithm = BOYER_MOORE;
433 // FALLTHROUGH.
434 case BOYER_MOORE:
435 return BoyerMooreIndexOf(sub, pat, idx);
436 }
437 UNREACHABLE();
438 return -1;
439 }
440
441
442 // Dispatch to different search strategies for a single search.
443 // If searching multiple times on the same needle, the search
444 // strategy should only be computed once and then dispatch to different
445 // loops.
446 template <typename SubjectChar, typename PatternChar>
447 static int StringSearch(Vector<const SubjectChar> sub,
448 Vector<const PatternChar> pat,
449 int start_index) { 564 int start_index) {
450 bool ascii_subject = (sizeof(SubjectChar) == 1); 565 StringSearch<PatternChar, SubjectChar> search(pattern);
451 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); 566 return search.Search(subject, start_index);
452 switch (strategy) {
453 case SEARCH_FAIL: return -1;
454 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
455 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
456 }
457 UNREACHABLE();
458 return -1;
459 } 567 }
460 568
461 }} // namespace v8::internal 569 }} // namespace v8::internal
462 570
463 #endif // V8_STRING_SEARCH_H_ 571 #endif // V8_STRING_SEARCH_H_
OLDNEW
« no previous file with comments | « src/runtime.cc ('k') | tools/gyp/v8.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698