OLD | NEW |
1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 14 matching lines...) Expand all Loading... |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | 27 |
28 #ifndef V8_STRING_SEARCH_H_ | 28 #ifndef V8_STRING_SEARCH_H_ |
29 #define V8_STRING_SEARCH_H_ | 29 #define V8_STRING_SEARCH_H_ |
30 | 30 |
31 namespace v8 { | 31 namespace v8 { |
32 namespace internal { | 32 namespace internal { |
33 | 33 |
34 | 34 |
35 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a | 35 //--------------------------------------------------------------------- |
36 // limit, we can fix the size of tables. For a needle longer than this limit, | 36 // String Search object. |
37 // search will not be optimal, since we only build tables for a smaller suffix | 37 //--------------------------------------------------------------------- |
38 // of the string, which is a safe approximation. | 38 |
39 static const int kBMMaxShift = 250; | 39 // Class holding constants and methods that apply to all string search variants, |
40 // Reduce alphabet to this size. | 40 // independently of subject and pattern char size. |
41 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size | 41 class StringSearchBase { |
42 // proportional to the input alphabet. We reduce the alphabet size by | 42 protected: |
43 // equating input characters modulo a smaller alphabet size. This gives | 43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
44 // a potentially less efficient searching, but is a safe approximation. | 44 // limit, we can fix the size of tables. For a needle longer than this limit, |
45 // For needles using only characters in the same Unicode 256-code point page, | 45 // search will not be optimal, since we only build tables for a suffix |
46 // there is no search speed degradation. | 46 // of the string, but it is a safe approximation. |
47 static const int kBMAlphabetSize = 256; | 47 static const int kBMMaxShift = 250; |
48 // For patterns below this length, the skip length of Boyer-Moore is too short | 48 |
49 // to compensate for the algorithmic overhead compared to simple brute force. | 49 // Reduce alphabet to this size. |
50 static const int kBMMinPatternLength = 7; | 50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
51 | 51 // proportional to the input alphabet. We reduce the alphabet size by |
52 // Holds the two buffers used by Boyer-Moore string search's Good Suffix | 52 // equating input characters modulo a smaller alphabet size. This gives |
53 // shift. Only allows the last kBMMaxShift characters of the needle | 53 // a potentially less efficient searching, but is a safe approximation. |
54 // to be indexed. | 54 // For needles using only characters in the same Unicode 256-code point page, |
55 class BMGoodSuffixBuffers { | 55 // there is no search speed degradation. |
| 56 static const int kAsciiAlphabetSize = 128; |
| 57 static const int kUC16AlphabetSize = 256; |
| 58 |
| 59 // Bad-char shift table stored in the state. It's length is the alphabet size. |
| 60 // For patterns below this length, the skip length of Boyer-Moore is too short |
| 61 // to compensate for the algorithmic overhead compared to simple brute force. |
| 62 static const int kBMMinPatternLength = 7; |
| 63 |
| 64 static inline bool IsAsciiString(Vector<const char>) { |
| 65 return true; |
| 66 } |
| 67 |
| 68 static inline bool IsAsciiString(Vector<const uc16> string) { |
| 69 for (int i = 0, n = string.length(); i < n; i++) { |
| 70 if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) { |
| 71 return false; |
| 72 } |
| 73 } |
| 74 return true; |
| 75 } |
| 76 |
| 77 // The following tables are shared by all searches. |
| 78 // TODO(lrn): Introduce a way for a pattern to keep its tables |
| 79 // between searches (e.g., for an Atom RegExp). |
| 80 |
| 81 // Store for the BoyerMoore(Horspool) bad char shift table. |
| 82 static int kBadCharShiftTable[kUC16AlphabetSize]; |
| 83 // Store for the BoyerMoore good suffix shift table. |
| 84 static int kGoodSuffixShiftTable[kBMMaxShift + 1]; |
| 85 // Table used temporarily while building the BoyerMoore good suffix |
| 86 // shift table. |
| 87 static int kSuffixTable[kBMMaxShift + 1]; |
| 88 }; |
| 89 |
| 90 |
| 91 template <typename PatternChar, typename SubjectChar> |
| 92 class StringSearch : private StringSearchBase { |
56 public: | 93 public: |
57 BMGoodSuffixBuffers() {} | 94 explicit StringSearch(Vector<const PatternChar> pattern) |
58 inline void Initialize(int needle_length) { | 95 : pattern_(pattern), |
59 ASSERT(needle_length > 1); | 96 start_(Max(0, pattern.length() - kBMMaxShift)) { |
60 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; | 97 if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
61 int len = needle_length - start; | 98 if (!IsAsciiString(pattern_)) { |
62 biased_suffixes_ = suffixes_ - start; | 99 strategy_ = &FailSearch; |
63 biased_good_suffix_shift_ = good_suffix_shift_ - start; | 100 return; |
64 for (int i = 0; i <= len; i++) { | 101 } |
65 good_suffix_shift_[i] = len; | 102 } |
66 } | 103 int pattern_length = pattern_.length(); |
67 } | 104 if (pattern_length < kBMMinPatternLength) { |
68 inline int& suffix(int index) { | 105 if (pattern_length == 1) { |
69 ASSERT(biased_suffixes_ + index >= suffixes_); | 106 strategy_ = &SingleCharSearch; |
70 return biased_suffixes_[index]; | 107 return; |
71 } | 108 } |
72 inline int& shift(int index) { | 109 strategy_ = &LinearSearch; |
73 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_); | 110 return; |
74 return biased_good_suffix_shift_[index]; | 111 } |
75 } | 112 strategy_ = &InitialSearch; |
| 113 } |
| 114 |
| 115 int Search(Vector<const SubjectChar> subject, int index) { |
| 116 return strategy_(this, subject, index); |
| 117 } |
| 118 |
| 119 static inline int AlphabetSize() { |
| 120 if (sizeof(PatternChar) == 1) { |
| 121 // ASCII needle. |
| 122 return kAsciiAlphabetSize; |
| 123 } else { |
| 124 ASSERT(sizeof(PatternChar) == 2); |
| 125 // UC16 needle. |
| 126 return kUC16AlphabetSize; |
| 127 } |
| 128 } |
| 129 |
76 private: | 130 private: |
77 int suffixes_[kBMMaxShift + 1]; | 131 typedef int (*SearchFunction)( // NOLINT - it's not a cast! |
78 int good_suffix_shift_[kBMMaxShift + 1]; | 132 StringSearch<PatternChar, SubjectChar>*, |
79 int* biased_suffixes_; | 133 Vector<const SubjectChar>, |
80 int* biased_good_suffix_shift_; | 134 int); |
81 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); | 135 |
| 136 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, |
| 137 Vector<const SubjectChar>, |
| 138 int) { |
| 139 return -1; |
| 140 } |
| 141 |
| 142 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 143 Vector<const SubjectChar> subject, |
| 144 int start_index); |
| 145 |
| 146 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 147 Vector<const SubjectChar> subject, |
| 148 int start_index); |
| 149 |
| 150 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 151 Vector<const SubjectChar> subject, |
| 152 int start_index); |
| 153 |
| 154 static int BoyerMooreHorspoolSearch( |
| 155 StringSearch<PatternChar, SubjectChar>* search, |
| 156 Vector<const SubjectChar> subject, |
| 157 int start_index); |
| 158 |
| 159 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, |
| 160 Vector<const SubjectChar> subject, |
| 161 int start_index); |
| 162 |
| 163 void PopulateBoyerMooreHorspoolTable(); |
| 164 |
| 165 void PopulateBoyerMooreTable(); |
| 166 |
| 167 static inline int CharOccurrence(int* bad_char_occurrence, |
| 168 SubjectChar char_code) { |
| 169 if (sizeof(SubjectChar) == 1) { |
| 170 return bad_char_occurrence[static_cast<int>(char_code)]; |
| 171 } |
| 172 if (sizeof(PatternChar) == 1) { |
| 173 if (static_cast<unsigned char>(char_code) > String::kMaxAsciiCharCode) { |
| 174 return -1; |
| 175 } |
| 176 return bad_char_occurrence[static_cast<int>(char_code)]; |
| 177 } |
| 178 // Reduce to equivalence class. |
| 179 int equiv_class = char_code % kUC16AlphabetSize; |
| 180 return bad_char_occurrence[equiv_class]; |
| 181 } |
| 182 |
| 183 // Return a table covering the last kBMMaxShift+1 positions of |
| 184 // pattern. |
| 185 int* bad_char_table() { |
| 186 return kBadCharShiftTable; |
| 187 } |
| 188 |
| 189 int* good_suffix_shift_table() { |
| 190 // Return biased pointer that maps the range [start_..pattern_.length() |
| 191 // to the kGoodSuffixShiftTable array. |
| 192 return kGoodSuffixShiftTable - start_; |
| 193 } |
| 194 |
| 195 int* suffix_table() { |
| 196 // Return biased pointer that maps the range [start_..pattern_.length() |
| 197 // to the kSuffixTable array. |
| 198 return kSuffixTable - start_; |
| 199 } |
| 200 |
| 201 // The pattern to search for. |
| 202 Vector<const PatternChar> pattern_; |
| 203 // Pointer to implementation of the search. |
| 204 SearchFunction strategy_; |
| 205 // Cache value of Max(0, pattern_length() - kBMMaxShift) |
| 206 int start_; |
82 }; | 207 }; |
83 | 208 |
84 // buffers reused by BoyerMoore | 209 |
85 struct BMBuffers { | 210 //--------------------------------------------------------------------- |
86 public: | 211 // Single Character Pattern Search Strategy |
87 static int bad_char_occurrence[kBMAlphabetSize]; | 212 //--------------------------------------------------------------------- |
88 static BMGoodSuffixBuffers bmgs_buffers; | 213 |
89 }; | 214 template <typename PatternChar, typename SubjectChar> |
90 | 215 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( |
91 // State of the string match tables. | 216 StringSearch<PatternChar, SubjectChar>* search, |
92 // SIMPLE: No usable content in the buffers. | 217 Vector<const SubjectChar> subject, |
93 // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated. | 218 int index) { |
94 // BOYER_MOORE: The bmgs_buffers tables have also been populated. | 219 ASSERT_EQ(1, search->pattern_.length()); |
95 // Whenever starting with a new needle, one should call InitializeStringSearch | 220 PatternChar pattern_first_char = search->pattern_[0]; |
96 // to determine which search strategy to use, and in the case of a long-needle | 221 int i = index; |
97 // strategy, the call also initializes the algorithm to SIMPLE. | 222 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
98 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; | 223 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
99 static StringSearchAlgorithm algorithm; | 224 memchr(subject.start() + i, |
100 | 225 pattern_first_char, |
101 | 226 subject.length() - i)); |
102 // Compute the bad-char table for Boyer-Moore in the static buffer. | 227 if (pos == NULL) return -1; |
103 template <typename PatternChar> | 228 return static_cast<int>(pos - subject.start()); |
104 static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) { | 229 } else { |
| 230 if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
| 231 if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) { |
| 232 return -1; |
| 233 } |
| 234 } |
| 235 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); |
| 236 int n = subject.length(); |
| 237 while (i < n) { |
| 238 if (subject[i++] == search_char) return i - 1; |
| 239 } |
| 240 return -1; |
| 241 } |
| 242 } |
| 243 |
| 244 //--------------------------------------------------------------------- |
| 245 // Linear Search Strategy |
| 246 //--------------------------------------------------------------------- |
| 247 |
| 248 |
| 249 template <typename PatternChar, typename SubjectChar> |
| 250 static inline bool CharCompare(const PatternChar* pattern, |
| 251 const SubjectChar* subject, |
| 252 int length) { |
| 253 ASSERT(length > 0); |
| 254 int pos = 0; |
| 255 do { |
| 256 if (pattern[pos] != subject[pos]) { |
| 257 return false; |
| 258 } |
| 259 pos++; |
| 260 } while (pos < length); |
| 261 return true; |
| 262 } |
| 263 |
| 264 |
| 265 // Simple linear search for short patterns. Never bails out. |
| 266 template <typename PatternChar, typename SubjectChar> |
| 267 int StringSearch<PatternChar, SubjectChar>::LinearSearch( |
| 268 StringSearch<PatternChar, SubjectChar>* search, |
| 269 Vector<const SubjectChar> subject, |
| 270 int index) { |
| 271 Vector<const PatternChar> pattern = search->pattern_; |
| 272 ASSERT(pattern.length() > 1); |
| 273 int pattern_length = pattern.length(); |
| 274 PatternChar pattern_first_char = pattern[0]; |
| 275 int i = index; |
| 276 int n = subject.length() - pattern_length; |
| 277 while (i <= n) { |
| 278 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
| 279 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
| 280 memchr(subject.start() + i, |
| 281 pattern_first_char, |
| 282 n - i + 1)); |
| 283 if (pos == NULL) return -1; |
| 284 i = static_cast<int>(pos - subject.start()) + 1; |
| 285 } else { |
| 286 if (subject[i++] != pattern_first_char) continue; |
| 287 } |
| 288 // Loop extracted to separate function to allow using return to do |
| 289 // a deeper break. |
| 290 if (CharCompare(pattern.start() + 1, |
| 291 subject.start() + i, |
| 292 pattern_length - 1)) { |
| 293 return i - 1; |
| 294 } |
| 295 } |
| 296 return -1; |
| 297 } |
| 298 |
| 299 //--------------------------------------------------------------------- |
| 300 // Boyer-Moore string search |
| 301 //--------------------------------------------------------------------- |
| 302 |
| 303 template <typename PatternChar, typename SubjectChar> |
| 304 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( |
| 305 StringSearch<PatternChar, SubjectChar>* search, |
| 306 Vector<const SubjectChar> subject, |
| 307 int start_index) { |
| 308 Vector<const PatternChar> pattern = search->pattern_; |
| 309 int subject_length = subject.length(); |
| 310 int pattern_length = pattern.length(); |
105 // Only preprocess at most kBMMaxShift last characters of pattern. | 311 // Only preprocess at most kBMMaxShift last characters of pattern. |
106 int start = Max(pattern.length() - kBMMaxShift, 0); | 312 int start = search->start_; |
107 // Run forwards to populate bad_char_table, so that *last* instance | 313 |
108 // of character equivalence class is the one registered. | 314 int* bad_char_occurence = search->bad_char_table(); |
109 // Notice: Doesn't include the last character. | 315 int* good_suffix_shift = search->good_suffix_shift_table(); |
110 int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1 | 316 |
111 : kBMAlphabetSize; | 317 PatternChar last_char = pattern[pattern_length - 1]; |
112 if (start == 0) { // All patterns less than kBMMaxShift in length. | 318 int index = start_index; |
113 memset(BMBuffers::bad_char_occurrence, | 319 // Continue search from i. |
114 -1, | 320 while (index <= subject_length - pattern_length) { |
115 table_size * sizeof(*BMBuffers::bad_char_occurrence)); | 321 int j = pattern_length - 1; |
116 } else { | 322 int c; |
117 for (int i = 0; i < table_size; i++) { | 323 while (last_char != (c = subject[index + j])) { |
118 BMBuffers::bad_char_occurrence[i] = start - 1; | 324 int shift = |
119 } | 325 j - CharOccurrence(bad_char_occurence, c); |
120 } | 326 index += shift; |
121 for (int i = start; i < pattern.length() - 1; i++) { | 327 if (index > subject_length - pattern_length) { |
122 PatternChar c = pattern[i]; | 328 return -1; |
123 int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize; | 329 } |
124 BMBuffers::bad_char_occurrence[bucket] = i; | 330 } |
125 } | 331 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; |
126 } | 332 if (j < 0) { |
127 | 333 return index; |
128 | 334 } else if (j < start) { |
129 template <typename PatternChar> | 335 // we have matched more than our tables allow us to be smart about. |
130 static void BoyerMoorePopulateGoodSuffixTable( | 336 // Fall back on BMH shift. |
131 Vector<const PatternChar> pattern) { | 337 index += |
132 int m = pattern.length(); | 338 pattern_length - 1 - CharOccurrence(bad_char_occurence, last_char); |
133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; | 339 } else { |
134 int len = m - start; | 340 int gs_shift = good_suffix_shift[j + 1]; |
135 // Compute Good Suffix tables. | 341 int bc_occ = |
136 BMBuffers::bmgs_buffers.Initialize(m); | 342 CharOccurrence(bad_char_occurence, c); |
137 | 343 int shift = j - bc_occ; |
138 BMBuffers::bmgs_buffers.shift(m-1) = 1; | 344 if (gs_shift > shift) { |
139 BMBuffers::bmgs_buffers.suffix(m) = m + 1; | 345 shift = gs_shift; |
140 PatternChar last_char = pattern[m - 1]; | 346 } |
141 int suffix = m + 1; | 347 index += shift; |
| 348 } |
| 349 } |
| 350 |
| 351 return -1; |
| 352 } |
| 353 |
| 354 |
| 355 template <typename PatternChar, typename SubjectChar> |
| 356 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { |
| 357 int pattern_length = pattern_.length(); |
| 358 const PatternChar* pattern = pattern_.start(); |
| 359 // Only look at the last kBMMaxShift characters of pattern (from start_ |
| 360 // to pattern_length). |
| 361 int start = start_; |
| 362 int length = pattern_length - start; |
| 363 |
| 364 // Biased tables so that we can use pattern indices as table indices, |
| 365 // even if we only cover the part of the pattern from offset start. |
| 366 int* shift_table = good_suffix_shift_table(); |
| 367 int* suffix_table = this->suffix_table(); |
| 368 |
| 369 // Initialize table. |
| 370 for (int i = start; i < pattern_length; i++) { |
| 371 shift_table[i] = length; |
| 372 } |
| 373 shift_table[pattern_length] = 1; |
| 374 suffix_table[pattern_length] = pattern_length + 1; |
| 375 |
| 376 // Find suffixes. |
| 377 PatternChar last_char = pattern[pattern_length - 1]; |
| 378 int suffix = pattern_length + 1; |
142 { | 379 { |
143 int i = m; | 380 int i = pattern_length; |
144 while (i > start) { | 381 while (i > start) { |
145 PatternChar c = pattern[i - 1]; | 382 PatternChar c = pattern[i - 1]; |
146 while (suffix <= m && c != pattern[suffix - 1]) { | 383 while (suffix <= pattern_length && c != pattern[suffix - 1]) { |
147 if (BMBuffers::bmgs_buffers.shift(suffix) == len) { | 384 if (shift_table[suffix] == length) { |
148 BMBuffers::bmgs_buffers.shift(suffix) = suffix - i; | 385 shift_table[suffix] = suffix - i; |
149 } | 386 } |
150 suffix = BMBuffers::bmgs_buffers.suffix(suffix); | 387 suffix = suffix_table[suffix]; |
151 } | 388 } |
152 BMBuffers::bmgs_buffers.suffix(--i) = --suffix; | 389 suffix_table[--i] = --suffix; |
153 if (suffix == m) { | 390 if (suffix == pattern_length) { |
154 // No suffix to extend, so we check against last_char only. | 391 // No suffix to extend, so we check against last_char only. |
155 while ((i > start) && (pattern[i - 1] != last_char)) { | 392 while ((i > start) && (pattern[i - 1] != last_char)) { |
156 if (BMBuffers::bmgs_buffers.shift(m) == len) { | 393 if (shift_table[pattern_length] == length) { |
157 BMBuffers::bmgs_buffers.shift(m) = m - i; | 394 shift_table[pattern_length] = pattern_length - i; |
158 } | 395 } |
159 BMBuffers::bmgs_buffers.suffix(--i) = m; | 396 suffix_table[--i] = pattern_length; |
160 } | 397 } |
161 if (i > start) { | 398 if (i > start) { |
162 BMBuffers::bmgs_buffers.suffix(--i) = --suffix; | 399 suffix_table[--i] = --suffix; |
163 } | 400 } |
164 } | 401 } |
165 } | 402 } |
166 } | 403 } |
167 if (suffix < m) { | 404 // Build shift table using suffixes. |
168 for (int i = start; i <= m; i++) { | 405 if (suffix < pattern_length) { |
169 if (BMBuffers::bmgs_buffers.shift(i) == len) { | 406 for (int i = start; i <= pattern_length; i++) { |
170 BMBuffers::bmgs_buffers.shift(i) = suffix - start; | 407 if (shift_table[i] == length) { |
| 408 shift_table[i] = suffix - start; |
171 } | 409 } |
172 if (i == suffix) { | 410 if (i == suffix) { |
173 suffix = BMBuffers::bmgs_buffers.suffix(suffix); | 411 suffix = suffix_table[suffix]; |
174 } | 412 } |
175 } | 413 } |
176 } | 414 } |
177 } | 415 } |
178 | 416 |
179 | 417 //--------------------------------------------------------------------- |
180 template <typename SubjectChar, typename PatternChar> | 418 // Boyer-Moore-Horspool string search. |
181 static inline int CharOccurrence(int char_code) { | 419 //--------------------------------------------------------------------- |
182 if (sizeof(SubjectChar) == 1) { | 420 |
183 return BMBuffers::bad_char_occurrence[char_code]; | 421 template <typename PatternChar, typename SubjectChar> |
184 } | 422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( |
185 if (sizeof(PatternChar) == 1) { | 423 StringSearch<PatternChar, SubjectChar>* search, |
186 if (char_code > String::kMaxAsciiCharCode) { | 424 Vector<const SubjectChar> subject, |
187 return -1; | 425 int start_index) { |
188 } | 426 Vector<const PatternChar> pattern = search->pattern_; |
189 return BMBuffers::bad_char_occurrence[char_code]; | 427 int subject_length = subject.length(); |
190 } | 428 int pattern_length = pattern.length(); |
191 return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize]; | 429 int* char_occurrences = search->bad_char_table(); |
192 } | 430 int badness = -pattern_length; |
193 | |
194 | |
195 // Restricted simplified Boyer-Moore string matching. | |
196 // Uses only the bad-shift table of Boyer-Moore and only uses it | |
197 // for the character compared to the last character of the needle. | |
198 template <typename SubjectChar, typename PatternChar> | |
199 static int BoyerMooreHorspool(Vector<const SubjectChar> subject, | |
200 Vector<const PatternChar> pattern, | |
201 int start_index, | |
202 bool* complete) { | |
203 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); | |
204 int n = subject.length(); | |
205 int m = pattern.length(); | |
206 | |
207 int badness = -m; | |
208 | 431 |
209 // How bad we are doing without a good-suffix table. | 432 // How bad we are doing without a good-suffix table. |
210 int idx; // No matches found prior to this index. | 433 PatternChar last_char = pattern[pattern_length - 1]; |
211 PatternChar last_char = pattern[m - 1]; | 434 int last_char_shift = pattern_length - 1 - |
212 int last_char_shift = | 435 CharOccurrence(char_occurrences, last_char); |
213 m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); | |
214 // Perform search | 436 // Perform search |
215 for (idx = start_index; idx <= n - m;) { | 437 int index = start_index; // No matches found prior to this index. |
216 int j = m - 1; | 438 while (index <= subject_length - pattern_length) { |
217 int c; | 439 int j = pattern_length - 1; |
218 while (last_char != (c = subject[idx + j])) { | 440 int subject_char; |
219 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); | 441 while (last_char != (subject_char = subject[index + j])) { |
| 442 int bc_occ = CharOccurrence(char_occurrences, subject_char); |
220 int shift = j - bc_occ; | 443 int shift = j - bc_occ; |
221 idx += shift; | 444 index += shift; |
222 badness += 1 - shift; // at most zero, so badness cannot increase. | 445 badness += 1 - shift; // at most zero, so badness cannot increase. |
223 if (idx > n - m) { | 446 if (index > subject_length - pattern_length) { |
224 *complete = true; | |
225 return -1; | 447 return -1; |
226 } | 448 } |
227 } | 449 } |
228 j--; | 450 j--; |
229 while (j >= 0 && pattern[j] == (subject[idx + j])) j--; | 451 while (j >= 0 && pattern[j] == (subject[index + j])) j--; |
230 if (j < 0) { | 452 if (j < 0) { |
231 *complete = true; | 453 return index; |
232 return idx; | |
233 } else { | 454 } else { |
234 idx += last_char_shift; | 455 index += last_char_shift; |
235 // Badness increases by the number of characters we have | 456 // Badness increases by the number of characters we have |
236 // checked, and decreases by the number of characters we | 457 // checked, and decreases by the number of characters we |
237 // can skip by shifting. It's a measure of how we are doing | 458 // can skip by shifting. It's a measure of how we are doing |
238 // compared to reading each character exactly once. | 459 // compared to reading each character exactly once. |
239 badness += (m - j) - last_char_shift; | 460 badness += (pattern_length - j) - last_char_shift; |
240 if (badness > 0) { | 461 if (badness > 0) { |
241 *complete = false; | 462 search->PopulateBoyerMooreTable(); |
242 return idx; | 463 search->strategy_ = &BoyerMooreSearch; |
243 } | 464 return BoyerMooreSearch(search, subject, index); |
244 } | 465 } |
245 } | 466 } |
246 *complete = true; | 467 } |
247 return -1; | 468 return -1; |
248 } | 469 } |
249 | 470 |
250 | 471 |
251 template <typename SubjectChar, typename PatternChar> | 472 template <typename PatternChar, typename SubjectChar> |
252 static int BoyerMooreIndexOf(Vector<const SubjectChar> subject, | 473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { |
253 Vector<const PatternChar> pattern, | 474 int pattern_length = pattern_.length(); |
254 int idx) { | 475 |
255 ASSERT(algorithm <= BOYER_MOORE); | 476 int* bad_char_occurrence = bad_char_table(); |
256 int n = subject.length(); | 477 |
257 int m = pattern.length(); | |
258 // Only preprocess at most kBMMaxShift last characters of pattern. | 478 // Only preprocess at most kBMMaxShift last characters of pattern. |
259 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; | 479 int start = start_; |
260 | 480 // Run forwards to populate bad_char_table, so that *last* instance |
261 PatternChar last_char = pattern[m - 1]; | 481 // of character equivalence class is the one registered. |
262 // Continue search from i. | 482 // Notice: Doesn't include the last character. |
263 while (idx <= n - m) { | 483 int table_size = AlphabetSize(); |
264 int j = m - 1; | 484 if (start == 0) { // All patterns less than kBMMaxShift in length. |
265 SubjectChar c; | 485 memset(bad_char_occurrence, |
266 while (last_char != (c = subject[idx + j])) { | 486 -1, |
267 int shift = j - CharOccurrence<SubjectChar, PatternChar>(c); | 487 table_size * sizeof(*bad_char_occurrence)); |
268 idx += shift; | 488 } else { |
269 if (idx > n - m) { | 489 for (int i = 0; i < table_size; i++) { |
270 return -1; | 490 bad_char_occurrence[i] = start - 1; |
271 } | 491 } |
272 } | 492 } |
273 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; | 493 for (int i = start; i < pattern_length - 1; i++) { |
274 if (j < 0) { | 494 PatternChar c = pattern_[i]; |
275 return idx; | 495 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); |
276 } else if (j < start) { | 496 bad_char_occurrence[bucket] = i; |
277 // we have matched more than our tables allow us to be smart about. | 497 } |
278 // Fall back on BMH shift. | 498 } |
279 idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); | 499 |
280 } else { | 500 //--------------------------------------------------------------------- |
281 int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); | 501 // Linear string search with bailout to BMH. |
282 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); | 502 //--------------------------------------------------------------------- |
283 int shift = j - bc_occ; | 503 |
284 if (gs_shift > shift) { | 504 // Simple linear search for short patterns, which bails out if the string |
285 shift = gs_shift; | 505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
286 } | 506 template <typename PatternChar, typename SubjectChar> |
287 idx += shift; | 507 int StringSearch<PatternChar, SubjectChar>::InitialSearch( |
288 } | 508 StringSearch<PatternChar, SubjectChar>* search, |
289 } | 509 Vector<const SubjectChar> subject, |
290 | 510 int index) { |
291 return -1; | 511 Vector<const PatternChar> pattern = search->pattern_; |
292 } | |
293 | |
294 | |
295 // Trivial string search for shorter strings. | |
296 // On return, if "complete" is set to true, the return value is the | |
297 // final result of searching for the patter in the subject. | |
298 // If "complete" is set to false, the return value is the index where | |
299 // further checking should start, i.e., it's guaranteed that the pattern | |
300 // does not occur at a position prior to the returned index. | |
301 template <typename PatternChar, typename SubjectChar> | |
302 static int SimpleIndexOf(Vector<const SubjectChar> subject, | |
303 Vector<const PatternChar> pattern, | |
304 int idx, | |
305 bool* complete) { | |
306 ASSERT(pattern.length() > 1); | |
307 int pattern_length = pattern.length(); | 512 int pattern_length = pattern.length(); |
308 // Badness is a count of how much work we have done. When we have | 513 // Badness is a count of how much work we have done. When we have |
309 // done enough work we decide it's probably worth switching to a better | 514 // done enough work we decide it's probably worth switching to a better |
310 // algorithm. | 515 // algorithm. |
311 int badness = -10 - (pattern_length << 2); | 516 int badness = -10 - (pattern_length << 2); |
312 | 517 |
313 // We know our pattern is at least 2 characters, we cache the first so | 518 // We know our pattern is at least 2 characters, we cache the first so |
314 // the common case of the first character not matching is faster. | 519 // the common case of the first character not matching is faster. |
315 PatternChar pattern_first_char = pattern[0]; | 520 PatternChar pattern_first_char = pattern[0]; |
316 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { | 521 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { |
317 badness++; | 522 badness++; |
318 if (badness > 0) { | 523 if (badness <= 0) { |
319 *complete = false; | 524 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
320 return i; | 525 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
321 } | 526 memchr(subject.start() + i, |
322 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | 527 pattern_first_char, |
323 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | 528 n - i + 1)); |
324 memchr(subject.start() + i, | 529 if (pos == NULL) { |
325 pattern_first_char, | 530 return -1; |
326 n - i + 1)); | 531 } |
327 if (pos == NULL) { | 532 i = static_cast<int>(pos - subject.start()); |
328 *complete = true; | 533 } else { |
329 return -1; | 534 if (subject[i] != pattern_first_char) continue; |
330 } | 535 } |
331 i = static_cast<int>(pos - subject.start()); | 536 int j = 1; |
| 537 do { |
| 538 if (pattern[j] != subject[i + j]) { |
| 539 break; |
| 540 } |
| 541 j++; |
| 542 } while (j < pattern_length); |
| 543 if (j == pattern_length) { |
| 544 return i; |
| 545 } |
| 546 badness += j; |
332 } else { | 547 } else { |
333 if (subject[i] != pattern_first_char) continue; | 548 search->PopulateBoyerMooreHorspoolTable(); |
334 } | 549 search->strategy_ = &BoyerMooreHorspoolSearch; |
335 int j = 1; | 550 return BoyerMooreHorspoolSearch(search, subject, i); |
336 do { | |
337 if (pattern[j] != subject[i+j]) { | |
338 break; | |
339 } | |
340 j++; | |
341 } while (j < pattern_length); | |
342 if (j == pattern_length) { | |
343 *complete = true; | |
344 return i; | |
345 } | |
346 badness += j; | |
347 } | |
348 *complete = true; | |
349 return -1; | |
350 } | |
351 | |
352 // Simple indexOf that never bails out. For short patterns only. | |
353 template <typename PatternChar, typename SubjectChar> | |
354 static int SimpleIndexOf(Vector<const SubjectChar> subject, | |
355 Vector<const PatternChar> pattern, | |
356 int idx) { | |
357 int pattern_length = pattern.length(); | |
358 PatternChar pattern_first_char = pattern[0]; | |
359 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { | |
360 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | |
361 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | |
362 memchr(subject.start() + i, | |
363 pattern_first_char, | |
364 n - i + 1)); | |
365 if (pos == NULL) return -1; | |
366 i = static_cast<int>(pos - subject.start()); | |
367 } else { | |
368 if (subject[i] != pattern_first_char) continue; | |
369 } | |
370 int j = 1; | |
371 while (j < pattern_length) { | |
372 if (pattern[j] != subject[i+j]) { | |
373 break; | |
374 } | |
375 j++; | |
376 } | |
377 if (j == pattern_length) { | |
378 return i; | |
379 } | 551 } |
380 } | 552 } |
381 return -1; | 553 return -1; |
382 } | 554 } |
383 | 555 |
384 | 556 |
385 // Strategy for searching for a string in another string. | 557 // Perform a a single stand-alone search. |
386 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; | 558 // If searching multiple times for the same pattern, a search |
387 | 559 // object should be constructed once and the Search function then called |
388 | 560 // for each search. |
389 template <typename PatternChar> | |
390 static inline StringSearchStrategy InitializeStringSearch( | |
391 Vector<const PatternChar> pat, bool ascii_subject) { | |
392 // We have an ASCII haystack and a non-ASCII needle. Check if there | |
393 // really is a non-ASCII character in the needle and bail out if there | |
394 // is. | |
395 if (ascii_subject && sizeof(PatternChar) > 1) { | |
396 for (int i = 0; i < pat.length(); i++) { | |
397 uc16 c = pat[i]; | |
398 if (c > String::kMaxAsciiCharCode) { | |
399 return SEARCH_FAIL; | |
400 } | |
401 } | |
402 } | |
403 if (pat.length() < kBMMinPatternLength) { | |
404 return SEARCH_SHORT; | |
405 } | |
406 algorithm = SIMPLE_SEARCH; | |
407 return SEARCH_LONG; | |
408 } | |
409 | |
410 | |
411 // Dispatch long needle searches to different algorithms. | |
412 template <typename SubjectChar, typename PatternChar> | 561 template <typename SubjectChar, typename PatternChar> |
413 static int ComplexIndexOf(Vector<const SubjectChar> sub, | 562 static int SearchString(Vector<const SubjectChar> subject, |
414 Vector<const PatternChar> pat, | 563 Vector<const PatternChar> pattern, |
415 int start_index) { | |
416 ASSERT(pat.length() >= kBMMinPatternLength); | |
417 // Try algorithms in order of increasing setup cost and expected performance. | |
418 bool complete; | |
419 int idx = start_index; | |
420 switch (algorithm) { | |
421 case SIMPLE_SEARCH: | |
422 idx = SimpleIndexOf(sub, pat, idx, &complete); | |
423 if (complete) return idx; | |
424 BoyerMoorePopulateBadCharTable(pat); | |
425 algorithm = BOYER_MOORE_HORSPOOL; | |
426 // FALLTHROUGH. | |
427 case BOYER_MOORE_HORSPOOL: | |
428 idx = BoyerMooreHorspool(sub, pat, idx, &complete); | |
429 if (complete) return idx; | |
430 // Build the Good Suffix table and continue searching. | |
431 BoyerMoorePopulateGoodSuffixTable(pat); | |
432 algorithm = BOYER_MOORE; | |
433 // FALLTHROUGH. | |
434 case BOYER_MOORE: | |
435 return BoyerMooreIndexOf(sub, pat, idx); | |
436 } | |
437 UNREACHABLE(); | |
438 return -1; | |
439 } | |
440 | |
441 | |
442 // Dispatch to different search strategies for a single search. | |
443 // If searching multiple times on the same needle, the search | |
444 // strategy should only be computed once and then dispatch to different | |
445 // loops. | |
446 template <typename SubjectChar, typename PatternChar> | |
447 static int StringSearch(Vector<const SubjectChar> sub, | |
448 Vector<const PatternChar> pat, | |
449 int start_index) { | 564 int start_index) { |
450 bool ascii_subject = (sizeof(SubjectChar) == 1); | 565 StringSearch<PatternChar, SubjectChar> search(pattern); |
451 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); | 566 return search.Search(subject, start_index); |
452 switch (strategy) { | |
453 case SEARCH_FAIL: return -1; | |
454 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); | |
455 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); | |
456 } | |
457 UNREACHABLE(); | |
458 return -1; | |
459 } | 567 } |
460 | 568 |
461 }} // namespace v8::internal | 569 }} // namespace v8::internal |
462 | 570 |
463 #endif // V8_STRING_SEARCH_H_ | 571 #endif // V8_STRING_SEARCH_H_ |
OLD | NEW |