| OLD | NEW | 
|---|
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 //     * Redistributions of source code must retain the above copyright | 6 //     * Redistributions of source code must retain the above copyright | 
| 7 //       notice, this list of conditions and the following disclaimer. | 7 //       notice, this list of conditions and the following disclaimer. | 
| 8 //     * Redistributions in binary form must reproduce the above | 8 //     * Redistributions in binary form must reproduce the above | 
| 9 //       copyright notice, this list of conditions and the following | 9 //       copyright notice, this list of conditions and the following | 
| 10 //       disclaimer in the documentation and/or other materials provided | 10 //       disclaimer in the documentation and/or other materials provided | 
| (...skipping 14 matching lines...) Expand all  Loading... | 
| 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
| 27 | 27 | 
| 28 #ifndef V8_STRING_SEARCH_H_ | 28 #ifndef V8_STRING_SEARCH_H_ | 
| 29 #define V8_STRING_SEARCH_H_ | 29 #define V8_STRING_SEARCH_H_ | 
| 30 | 30 | 
| 31 namespace v8 { | 31 namespace v8 { | 
| 32 namespace internal { | 32 namespace internal { | 
| 33 | 33 | 
| 34 | 34 | 
| 35 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a | 35 //--------------------------------------------------------------------- | 
| 36 // limit, we can fix the size of tables. For a needle longer than this limit, | 36 // String Search object. | 
| 37 // search will not be optimal, since we only build tables for a smaller suffix | 37 //--------------------------------------------------------------------- | 
| 38 // of the string, which is a safe approximation. | 38 | 
| 39 static const int kBMMaxShift = 250; | 39 // Class holding constants and methods that apply to all string search variants, | 
| 40 // Reduce alphabet to this size. | 40 // independently of subject and pattern char size. | 
| 41 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size | 41 class StringSearchBase { | 
| 42 // proportional to the input alphabet. We reduce the alphabet size by | 42  protected: | 
| 43 // equating input characters modulo a smaller alphabet size. This gives | 43   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a | 
| 44 // a potentially less efficient searching, but is a safe approximation. | 44   // limit, we can fix the size of tables. For a needle longer than this limit, | 
| 45 // For needles using only characters in the same Unicode 256-code point page, | 45   // search will not be optimal, since we only build tables for a suffix | 
| 46 // there is no search speed degradation. | 46   // of the string, but it is a safe approximation. | 
| 47 static const int kBMAlphabetSize = 256; | 47   static const int kBMMaxShift = 250; | 
| 48 // For patterns below this length, the skip length of Boyer-Moore is too short | 48 | 
| 49 // to compensate for the algorithmic overhead compared to simple brute force. | 49   // Reduce alphabet to this size. | 
| 50 static const int kBMMinPatternLength = 7; | 50   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size | 
| 51 | 51   // proportional to the input alphabet. We reduce the alphabet size by | 
| 52 // Holds the two buffers used by Boyer-Moore string search's Good Suffix | 52   // equating input characters modulo a smaller alphabet size. This gives | 
| 53 // shift. Only allows the last kBMMaxShift characters of the needle | 53   // a potentially less efficient searching, but is a safe approximation. | 
| 54 // to be indexed. | 54   // For needles using only characters in the same Unicode 256-code point page, | 
| 55 class BMGoodSuffixBuffers { | 55   // there is no search speed degradation. | 
|  | 56   static const int kAsciiAlphabetSize = 128; | 
|  | 57   static const int kUC16AlphabetSize = 256; | 
|  | 58 | 
|  | 59   // Bad-char shift table stored in the state. It's length is the alphabet size. | 
|  | 60   // For patterns below this length, the skip length of Boyer-Moore is too short | 
|  | 61   // to compensate for the algorithmic overhead compared to simple brute force. | 
|  | 62   static const int kBMMinPatternLength = 7; | 
|  | 63 | 
|  | 64   static inline bool IsAsciiString(Vector<const char>) { | 
|  | 65     return true; | 
|  | 66   } | 
|  | 67 | 
|  | 68   static inline bool IsAsciiString(Vector<const uc16> string) { | 
|  | 69     for (int i = 0, n = string.length(); i < n; i++) { | 
|  | 70       if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) { | 
|  | 71         return false; | 
|  | 72       } | 
|  | 73     } | 
|  | 74     return true; | 
|  | 75   } | 
|  | 76 | 
|  | 77   // The following tables are shared by all searches. | 
|  | 78   // TODO(lrn): Introduce a way for a pattern to keep its tables | 
|  | 79   // between searches (e.g., for an Atom RegExp). | 
|  | 80 | 
|  | 81   // Store for the BoyerMoore(Horspool) bad char shift table. | 
|  | 82   static int kBadCharShiftTable[kUC16AlphabetSize]; | 
|  | 83   // Store for the BoyerMoore good suffix shift table. | 
|  | 84   static int kGoodSuffixShiftTable[kBMMaxShift + 1]; | 
|  | 85   // Table used temporarily while building the BoyerMoore good suffix | 
|  | 86   // shift table. | 
|  | 87   static int kSuffixTable[kBMMaxShift + 1]; | 
|  | 88 }; | 
|  | 89 | 
|  | 90 | 
|  | 91 template <typename PatternChar, typename SubjectChar> | 
|  | 92 class StringSearch : private StringSearchBase { | 
| 56  public: | 93  public: | 
| 57   BMGoodSuffixBuffers() {} | 94   explicit StringSearch(Vector<const PatternChar> pattern) | 
| 58   inline void Initialize(int needle_length) { | 95       : pattern_(pattern), | 
| 59     ASSERT(needle_length > 1); | 96         start_(Max(0, pattern.length() - kBMMaxShift)) { | 
| 60     int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; | 97     if (sizeof(PatternChar) > sizeof(SubjectChar)) { | 
| 61     int len = needle_length - start; | 98       if (!IsAsciiString(pattern_)) { | 
| 62     biased_suffixes_ = suffixes_ - start; | 99         strategy_ = &FailSearch; | 
| 63     biased_good_suffix_shift_ = good_suffix_shift_ - start; | 100         return; | 
| 64     for (int i = 0; i <= len; i++) { | 101       } | 
| 65       good_suffix_shift_[i] = len; | 102     } | 
| 66     } | 103     int pattern_length = pattern_.length(); | 
| 67   } | 104     if (pattern_length < kBMMinPatternLength) { | 
| 68   inline int& suffix(int index) { | 105       if (pattern_length == 1) { | 
| 69     ASSERT(biased_suffixes_ + index >= suffixes_); | 106         strategy_ = &SingleCharSearch; | 
| 70     return biased_suffixes_[index]; | 107         return; | 
| 71   } | 108       } | 
| 72   inline int& shift(int index) { | 109       strategy_ = &LinearSearch; | 
| 73     ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_); | 110       return; | 
| 74     return biased_good_suffix_shift_[index]; | 111     } | 
| 75   } | 112     strategy_ = &InitialSearch; | 
|  | 113   } | 
|  | 114 | 
|  | 115   int Search(Vector<const SubjectChar> subject, int index) { | 
|  | 116     return strategy_(this, subject, index); | 
|  | 117   } | 
|  | 118 | 
|  | 119   static inline int AlphabetSize() { | 
|  | 120     if (sizeof(PatternChar) == 1) { | 
|  | 121       // ASCII needle. | 
|  | 122       return kAsciiAlphabetSize; | 
|  | 123     } else { | 
|  | 124       ASSERT(sizeof(PatternChar) == 2); | 
|  | 125       // UC16 needle. | 
|  | 126       return kUC16AlphabetSize; | 
|  | 127     } | 
|  | 128   } | 
|  | 129 | 
| 76  private: | 130  private: | 
| 77   int suffixes_[kBMMaxShift + 1]; | 131   typedef int (*SearchFunction)(  // NOLINT - it's not a cast! | 
| 78   int good_suffix_shift_[kBMMaxShift + 1]; | 132       StringSearch<PatternChar, SubjectChar>*, | 
| 79   int* biased_suffixes_; | 133       Vector<const SubjectChar>, | 
| 80   int* biased_good_suffix_shift_; | 134       int); | 
| 81   DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); | 135 | 
|  | 136   static int FailSearch(StringSearch<PatternChar, SubjectChar>*, | 
|  | 137                         Vector<const SubjectChar>, | 
|  | 138                         int) { | 
|  | 139     return -1; | 
|  | 140   } | 
|  | 141 | 
|  | 142   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, | 
|  | 143                               Vector<const SubjectChar> subject, | 
|  | 144                               int start_index); | 
|  | 145 | 
|  | 146   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, | 
|  | 147                           Vector<const SubjectChar> subject, | 
|  | 148                           int start_index); | 
|  | 149 | 
|  | 150   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, | 
|  | 151                            Vector<const SubjectChar> subject, | 
|  | 152                            int start_index); | 
|  | 153 | 
|  | 154   static int BoyerMooreHorspoolSearch( | 
|  | 155       StringSearch<PatternChar, SubjectChar>* search, | 
|  | 156       Vector<const SubjectChar> subject, | 
|  | 157       int start_index); | 
|  | 158 | 
|  | 159   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, | 
|  | 160                               Vector<const SubjectChar> subject, | 
|  | 161                               int start_index); | 
|  | 162 | 
|  | 163   void PopulateBoyerMooreHorspoolTable(); | 
|  | 164 | 
|  | 165   void PopulateBoyerMooreTable(); | 
|  | 166 | 
|  | 167   static inline int CharOccurrence(int* bad_char_occurrence, | 
|  | 168                                    SubjectChar char_code) { | 
|  | 169     if (sizeof(SubjectChar) == 1) { | 
|  | 170       return bad_char_occurrence[static_cast<int>(char_code)]; | 
|  | 171     } | 
|  | 172     if (sizeof(PatternChar) == 1) { | 
|  | 173       if (static_cast<unsigned char>(char_code) > String::kMaxAsciiCharCode) { | 
|  | 174         return -1; | 
|  | 175       } | 
|  | 176       return bad_char_occurrence[static_cast<int>(char_code)]; | 
|  | 177     } | 
|  | 178     // Reduce to equivalence class. | 
|  | 179     int equiv_class = char_code % kUC16AlphabetSize; | 
|  | 180     return bad_char_occurrence[equiv_class]; | 
|  | 181   } | 
|  | 182 | 
|  | 183   // Return a table covering the last kBMMaxShift+1 positions of | 
|  | 184   // pattern. | 
|  | 185   int* bad_char_table() { | 
|  | 186     return kBadCharShiftTable; | 
|  | 187   } | 
|  | 188 | 
|  | 189   int* good_suffix_shift_table() { | 
|  | 190     // Return biased pointer that maps the range  [start_..pattern_.length() | 
|  | 191     // to the kGoodSuffixShiftTable array. | 
|  | 192     return kGoodSuffixShiftTable - start_; | 
|  | 193   } | 
|  | 194 | 
|  | 195   int* suffix_table() { | 
|  | 196     // Return biased pointer that maps the range  [start_..pattern_.length() | 
|  | 197     // to the kSuffixTable array. | 
|  | 198     return kSuffixTable - start_; | 
|  | 199   } | 
|  | 200 | 
|  | 201   // The pattern to search for. | 
|  | 202   Vector<const PatternChar> pattern_; | 
|  | 203   // Pointer to implementation of the search. | 
|  | 204   SearchFunction strategy_; | 
|  | 205   // Cache value of Max(0, pattern_length() - kBMMaxShift) | 
|  | 206   int start_; | 
| 82 }; | 207 }; | 
| 83 | 208 | 
| 84 // buffers reused by BoyerMoore | 209 | 
| 85 struct BMBuffers { | 210 //--------------------------------------------------------------------- | 
| 86  public: | 211 // Single Character Pattern Search Strategy | 
| 87   static int bad_char_occurrence[kBMAlphabetSize]; | 212 //--------------------------------------------------------------------- | 
| 88   static BMGoodSuffixBuffers bmgs_buffers; | 213 | 
| 89 }; | 214 template <typename PatternChar, typename SubjectChar> | 
| 90 | 215 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( | 
| 91 // State of the string match tables. | 216     StringSearch<PatternChar, SubjectChar>* search, | 
| 92 // SIMPLE: No usable content in the buffers. | 217     Vector<const SubjectChar> subject, | 
| 93 // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated. | 218     int index) { | 
| 94 // BOYER_MOORE: The bmgs_buffers tables have also been populated. | 219   ASSERT_EQ(1, search->pattern_.length()); | 
| 95 // Whenever starting with a new needle, one should call InitializeStringSearch | 220   PatternChar pattern_first_char = search->pattern_[0]; | 
| 96 // to determine which search strategy to use, and in the case of a long-needle | 221   int i = index; | 
| 97 // strategy, the call also initializes the algorithm to SIMPLE. | 222   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | 
| 98 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; | 223     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | 
| 99 static StringSearchAlgorithm algorithm; | 224         memchr(subject.start() + i, | 
| 100 | 225                pattern_first_char, | 
| 101 | 226                subject.length() - i)); | 
| 102 // Compute the bad-char table for Boyer-Moore in the static buffer. | 227     if (pos == NULL) return -1; | 
| 103 template <typename PatternChar> | 228     return static_cast<int>(pos - subject.start()); | 
| 104 static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) { | 229   } else { | 
|  | 230     if (sizeof(PatternChar) > sizeof(SubjectChar)) { | 
|  | 231       if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) { | 
|  | 232         return -1; | 
|  | 233       } | 
|  | 234     } | 
|  | 235     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); | 
|  | 236     int n = subject.length(); | 
|  | 237     while (i < n) { | 
|  | 238       if (subject[i++] == search_char) return i - 1; | 
|  | 239     } | 
|  | 240     return -1; | 
|  | 241   } | 
|  | 242 } | 
|  | 243 | 
|  | 244 //--------------------------------------------------------------------- | 
|  | 245 // Linear Search Strategy | 
|  | 246 //--------------------------------------------------------------------- | 
|  | 247 | 
|  | 248 | 
|  | 249 template <typename PatternChar, typename SubjectChar> | 
|  | 250 static inline bool CharCompare(const PatternChar* pattern, | 
|  | 251                                const SubjectChar* subject, | 
|  | 252                                int length) { | 
|  | 253   ASSERT(length > 0); | 
|  | 254   int pos = 0; | 
|  | 255   do { | 
|  | 256     if (pattern[pos] != subject[pos]) { | 
|  | 257       return false; | 
|  | 258     } | 
|  | 259     pos++; | 
|  | 260   } while (pos < length); | 
|  | 261   return true; | 
|  | 262 } | 
|  | 263 | 
|  | 264 | 
|  | 265 // Simple linear search for short patterns. Never bails out. | 
|  | 266 template <typename PatternChar, typename SubjectChar> | 
|  | 267 int StringSearch<PatternChar, SubjectChar>::LinearSearch( | 
|  | 268     StringSearch<PatternChar, SubjectChar>* search, | 
|  | 269     Vector<const SubjectChar> subject, | 
|  | 270     int index) { | 
|  | 271   Vector<const PatternChar> pattern = search->pattern_; | 
|  | 272   ASSERT(pattern.length() > 1); | 
|  | 273   int pattern_length = pattern.length(); | 
|  | 274   PatternChar pattern_first_char = pattern[0]; | 
|  | 275   int i = index; | 
|  | 276   int n = subject.length() - pattern_length; | 
|  | 277   while (i <= n) { | 
|  | 278     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | 
|  | 279       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | 
|  | 280           memchr(subject.start() + i, | 
|  | 281                  pattern_first_char, | 
|  | 282                  n - i + 1)); | 
|  | 283       if (pos == NULL) return -1; | 
|  | 284       i = static_cast<int>(pos - subject.start()) + 1; | 
|  | 285     } else { | 
|  | 286       if (subject[i++] != pattern_first_char) continue; | 
|  | 287     } | 
|  | 288     // Loop extracted to separate function to allow using return to do | 
|  | 289     // a deeper break. | 
|  | 290     if (CharCompare(pattern.start() + 1, | 
|  | 291                     subject.start() + i, | 
|  | 292                     pattern_length - 1)) { | 
|  | 293       return i - 1; | 
|  | 294     } | 
|  | 295   } | 
|  | 296   return -1; | 
|  | 297 } | 
|  | 298 | 
|  | 299 //--------------------------------------------------------------------- | 
|  | 300 // Boyer-Moore string search | 
|  | 301 //--------------------------------------------------------------------- | 
|  | 302 | 
|  | 303 template <typename PatternChar, typename SubjectChar> | 
|  | 304 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( | 
|  | 305     StringSearch<PatternChar, SubjectChar>* search, | 
|  | 306     Vector<const SubjectChar> subject, | 
|  | 307     int start_index) { | 
|  | 308   Vector<const PatternChar> pattern = search->pattern_; | 
|  | 309   int subject_length = subject.length(); | 
|  | 310   int pattern_length = pattern.length(); | 
| 105   // Only preprocess at most kBMMaxShift last characters of pattern. | 311   // Only preprocess at most kBMMaxShift last characters of pattern. | 
| 106   int start = Max(pattern.length() - kBMMaxShift, 0); | 312   int start = search->start_; | 
| 107   // Run forwards to populate bad_char_table, so that *last* instance | 313 | 
| 108   // of character equivalence class is the one registered. | 314   int* bad_char_occurence = search->bad_char_table(); | 
| 109   // Notice: Doesn't include the last character. | 315   int* good_suffix_shift = search->good_suffix_shift_table(); | 
| 110   int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1 | 316 | 
| 111                                         : kBMAlphabetSize; | 317   PatternChar last_char = pattern[pattern_length - 1]; | 
| 112   if (start == 0) {  // All patterns less than kBMMaxShift in length. | 318   int index = start_index; | 
| 113     memset(BMBuffers::bad_char_occurrence, | 319   // Continue search from i. | 
| 114            -1, | 320   while (index <= subject_length - pattern_length) { | 
| 115            table_size * sizeof(*BMBuffers::bad_char_occurrence)); | 321     int j = pattern_length - 1; | 
| 116   } else { | 322     int c; | 
| 117     for (int i = 0; i < table_size; i++) { | 323     while (last_char != (c = subject[index + j])) { | 
| 118       BMBuffers::bad_char_occurrence[i] = start - 1; | 324       int shift = | 
| 119     } | 325           j - CharOccurrence(bad_char_occurence, c); | 
| 120   } | 326       index += shift; | 
| 121   for (int i = start; i < pattern.length() - 1; i++) { | 327       if (index > subject_length - pattern_length) { | 
| 122     PatternChar c = pattern[i]; | 328         return -1; | 
| 123     int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize; | 329       } | 
| 124     BMBuffers::bad_char_occurrence[bucket] = i; | 330     } | 
| 125   } | 331     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; | 
| 126 } | 332     if (j < 0) { | 
| 127 | 333       return index; | 
| 128 | 334     } else if (j < start) { | 
| 129 template <typename PatternChar> | 335       // we have matched more than our tables allow us to be smart about. | 
| 130 static void BoyerMoorePopulateGoodSuffixTable( | 336       // Fall back on BMH shift. | 
| 131     Vector<const PatternChar> pattern) { | 337       index += | 
| 132   int m = pattern.length(); | 338           pattern_length - 1 - CharOccurrence(bad_char_occurence, last_char); | 
| 133   int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; | 339     } else { | 
| 134   int len = m - start; | 340       int gs_shift = good_suffix_shift[j + 1]; | 
| 135   // Compute Good Suffix tables. | 341       int bc_occ = | 
| 136   BMBuffers::bmgs_buffers.Initialize(m); | 342           CharOccurrence(bad_char_occurence, c); | 
| 137 | 343       int shift = j - bc_occ; | 
| 138   BMBuffers::bmgs_buffers.shift(m-1) = 1; | 344       if (gs_shift > shift) { | 
| 139   BMBuffers::bmgs_buffers.suffix(m) = m + 1; | 345         shift = gs_shift; | 
| 140   PatternChar last_char = pattern[m - 1]; | 346       } | 
| 141   int suffix = m + 1; | 347       index += shift; | 
|  | 348     } | 
|  | 349   } | 
|  | 350 | 
|  | 351   return -1; | 
|  | 352 } | 
|  | 353 | 
|  | 354 | 
|  | 355 template <typename PatternChar, typename SubjectChar> | 
|  | 356 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { | 
|  | 357   int pattern_length = pattern_.length(); | 
|  | 358   const PatternChar* pattern = pattern_.start(); | 
|  | 359   // Only look at the last kBMMaxShift characters of pattern (from start_ | 
|  | 360   // to pattern_length). | 
|  | 361   int start = start_; | 
|  | 362   int length = pattern_length - start; | 
|  | 363 | 
|  | 364   // Biased tables so that we can use pattern indices as table indices, | 
|  | 365   // even if we only cover the part of the pattern from offset start. | 
|  | 366   int* shift_table = good_suffix_shift_table(); | 
|  | 367   int* suffix_table = this->suffix_table(); | 
|  | 368 | 
|  | 369   // Initialize table. | 
|  | 370   for (int i = start; i < pattern_length; i++) { | 
|  | 371     shift_table[i] = length; | 
|  | 372   } | 
|  | 373   shift_table[pattern_length] = 1; | 
|  | 374   suffix_table[pattern_length] = pattern_length + 1; | 
|  | 375 | 
|  | 376   // Find suffixes. | 
|  | 377   PatternChar last_char = pattern[pattern_length - 1]; | 
|  | 378   int suffix = pattern_length + 1; | 
| 142   { | 379   { | 
| 143     int i = m; | 380     int i = pattern_length; | 
| 144     while (i > start) { | 381     while (i > start) { | 
| 145       PatternChar c = pattern[i - 1]; | 382       PatternChar c = pattern[i - 1]; | 
| 146       while (suffix <= m && c != pattern[suffix - 1]) { | 383       while (suffix <= pattern_length && c != pattern[suffix - 1]) { | 
| 147         if (BMBuffers::bmgs_buffers.shift(suffix) == len) { | 384         if (shift_table[suffix] == length) { | 
| 148           BMBuffers::bmgs_buffers.shift(suffix) = suffix - i; | 385           shift_table[suffix] = suffix - i; | 
| 149         } | 386         } | 
| 150         suffix = BMBuffers::bmgs_buffers.suffix(suffix); | 387         suffix = suffix_table[suffix]; | 
| 151       } | 388       } | 
| 152       BMBuffers::bmgs_buffers.suffix(--i) = --suffix; | 389       suffix_table[--i] = --suffix; | 
| 153       if (suffix == m) { | 390       if (suffix == pattern_length) { | 
| 154         // No suffix to extend, so we check against last_char only. | 391         // No suffix to extend, so we check against last_char only. | 
| 155         while ((i > start) && (pattern[i - 1] != last_char)) { | 392         while ((i > start) && (pattern[i - 1] != last_char)) { | 
| 156           if (BMBuffers::bmgs_buffers.shift(m) == len) { | 393           if (shift_table[pattern_length] == length) { | 
| 157             BMBuffers::bmgs_buffers.shift(m) = m - i; | 394             shift_table[pattern_length] = pattern_length - i; | 
| 158           } | 395           } | 
| 159           BMBuffers::bmgs_buffers.suffix(--i) = m; | 396           suffix_table[--i] = pattern_length; | 
| 160         } | 397         } | 
| 161         if (i > start) { | 398         if (i > start) { | 
| 162           BMBuffers::bmgs_buffers.suffix(--i) = --suffix; | 399           suffix_table[--i] = --suffix; | 
| 163         } | 400         } | 
| 164       } | 401       } | 
| 165     } | 402     } | 
| 166   } | 403   } | 
| 167   if (suffix < m) { | 404   // Build shift table using suffixes. | 
| 168     for (int i = start; i <= m; i++) { | 405   if (suffix < pattern_length) { | 
| 169       if (BMBuffers::bmgs_buffers.shift(i) == len) { | 406     for (int i = start; i <= pattern_length; i++) { | 
| 170         BMBuffers::bmgs_buffers.shift(i) = suffix - start; | 407       if (shift_table[i] == length) { | 
|  | 408         shift_table[i] = suffix - start; | 
| 171       } | 409       } | 
| 172       if (i == suffix) { | 410       if (i == suffix) { | 
| 173         suffix = BMBuffers::bmgs_buffers.suffix(suffix); | 411         suffix = suffix_table[suffix]; | 
| 174       } | 412       } | 
| 175     } | 413     } | 
| 176   } | 414   } | 
| 177 } | 415 } | 
| 178 | 416 | 
| 179 | 417 //--------------------------------------------------------------------- | 
| 180 template <typename SubjectChar, typename PatternChar> | 418 // Boyer-Moore-Horspool string search. | 
| 181 static inline int CharOccurrence(int char_code) { | 419 //--------------------------------------------------------------------- | 
| 182   if (sizeof(SubjectChar) == 1) { | 420 | 
| 183     return BMBuffers::bad_char_occurrence[char_code]; | 421 template <typename PatternChar, typename SubjectChar> | 
| 184   } | 422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( | 
| 185   if (sizeof(PatternChar) == 1) { | 423     StringSearch<PatternChar, SubjectChar>* search, | 
| 186     if (char_code > String::kMaxAsciiCharCode) { | 424     Vector<const SubjectChar> subject, | 
| 187       return -1; | 425     int start_index) { | 
| 188     } | 426   Vector<const PatternChar> pattern = search->pattern_; | 
| 189     return BMBuffers::bad_char_occurrence[char_code]; | 427   int subject_length = subject.length(); | 
| 190   } | 428   int pattern_length = pattern.length(); | 
| 191   return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize]; | 429   int* char_occurrences = search->bad_char_table(); | 
| 192 } | 430   int badness = -pattern_length; | 
| 193 |  | 
| 194 |  | 
| 195 // Restricted simplified Boyer-Moore string matching. |  | 
| 196 // Uses only the bad-shift table of Boyer-Moore and only uses it |  | 
| 197 // for the character compared to the last character of the needle. |  | 
| 198 template <typename SubjectChar, typename PatternChar> |  | 
| 199 static int BoyerMooreHorspool(Vector<const SubjectChar> subject, |  | 
| 200                               Vector<const PatternChar> pattern, |  | 
| 201                               int start_index, |  | 
| 202                               bool* complete) { |  | 
| 203   ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); |  | 
| 204   int n = subject.length(); |  | 
| 205   int m = pattern.length(); |  | 
| 206 |  | 
| 207   int badness = -m; |  | 
| 208 | 431 | 
| 209   // How bad we are doing without a good-suffix table. | 432   // How bad we are doing without a good-suffix table. | 
| 210   int idx;  // No matches found prior to this index. | 433   PatternChar last_char = pattern[pattern_length - 1]; | 
| 211   PatternChar last_char = pattern[m - 1]; | 434   int last_char_shift = pattern_length - 1 - | 
| 212   int last_char_shift = | 435       CharOccurrence(char_occurrences, last_char); | 
| 213       m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); |  | 
| 214   // Perform search | 436   // Perform search | 
| 215   for (idx = start_index; idx <= n - m;) { | 437   int index = start_index;  // No matches found prior to this index. | 
| 216     int j = m - 1; | 438   while (index <= subject_length - pattern_length) { | 
| 217     int c; | 439     int j = pattern_length - 1; | 
| 218     while (last_char != (c = subject[idx + j])) { | 440     int subject_char; | 
| 219       int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); | 441     while (last_char != (subject_char = subject[index + j])) { | 
|  | 442       int bc_occ = CharOccurrence(char_occurrences, subject_char); | 
| 220       int shift = j - bc_occ; | 443       int shift = j - bc_occ; | 
| 221       idx += shift; | 444       index += shift; | 
| 222       badness += 1 - shift;  // at most zero, so badness cannot increase. | 445       badness += 1 - shift;  // at most zero, so badness cannot increase. | 
| 223       if (idx > n - m) { | 446       if (index > subject_length - pattern_length) { | 
| 224         *complete = true; |  | 
| 225         return -1; | 447         return -1; | 
| 226       } | 448       } | 
| 227     } | 449     } | 
| 228     j--; | 450     j--; | 
| 229     while (j >= 0 && pattern[j] == (subject[idx + j])) j--; | 451     while (j >= 0 && pattern[j] == (subject[index + j])) j--; | 
| 230     if (j < 0) { | 452     if (j < 0) { | 
| 231       *complete = true; | 453       return index; | 
| 232       return idx; |  | 
| 233     } else { | 454     } else { | 
| 234       idx += last_char_shift; | 455       index += last_char_shift; | 
| 235       // Badness increases by the number of characters we have | 456       // Badness increases by the number of characters we have | 
| 236       // checked, and decreases by the number of characters we | 457       // checked, and decreases by the number of characters we | 
| 237       // can skip by shifting. It's a measure of how we are doing | 458       // can skip by shifting. It's a measure of how we are doing | 
| 238       // compared to reading each character exactly once. | 459       // compared to reading each character exactly once. | 
| 239       badness += (m - j) - last_char_shift; | 460       badness += (pattern_length - j) - last_char_shift; | 
| 240       if (badness > 0) { | 461       if (badness > 0) { | 
| 241         *complete = false; | 462         search->PopulateBoyerMooreTable(); | 
| 242         return idx; | 463         search->strategy_ = &BoyerMooreSearch; | 
| 243       } | 464         return BoyerMooreSearch(search, subject, index); | 
| 244     } | 465       } | 
| 245   } | 466     } | 
| 246   *complete = true; | 467   } | 
| 247   return -1; | 468   return -1; | 
| 248 } | 469 } | 
| 249 | 470 | 
| 250 | 471 | 
| 251 template <typename SubjectChar, typename PatternChar> | 472 template <typename PatternChar, typename SubjectChar> | 
| 252 static int BoyerMooreIndexOf(Vector<const SubjectChar> subject, | 473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { | 
| 253                              Vector<const PatternChar> pattern, | 474   int pattern_length = pattern_.length(); | 
| 254                              int idx) { | 475 | 
| 255   ASSERT(algorithm <= BOYER_MOORE); | 476   int* bad_char_occurrence = bad_char_table(); | 
| 256   int n = subject.length(); | 477 | 
| 257   int m = pattern.length(); |  | 
| 258   // Only preprocess at most kBMMaxShift last characters of pattern. | 478   // Only preprocess at most kBMMaxShift last characters of pattern. | 
| 259   int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; | 479   int start = start_; | 
| 260 | 480   // Run forwards to populate bad_char_table, so that *last* instance | 
| 261   PatternChar last_char = pattern[m - 1]; | 481   // of character equivalence class is the one registered. | 
| 262   // Continue search from i. | 482   // Notice: Doesn't include the last character. | 
| 263   while (idx <= n - m) { | 483   int table_size = AlphabetSize(); | 
| 264     int j = m - 1; | 484   if (start == 0) {  // All patterns less than kBMMaxShift in length. | 
| 265     SubjectChar c; | 485     memset(bad_char_occurrence, | 
| 266     while (last_char != (c = subject[idx + j])) { | 486            -1, | 
| 267       int shift = j - CharOccurrence<SubjectChar, PatternChar>(c); | 487            table_size * sizeof(*bad_char_occurrence)); | 
| 268       idx += shift; | 488   } else { | 
| 269       if (idx > n - m) { | 489     for (int i = 0; i < table_size; i++) { | 
| 270         return -1; | 490       bad_char_occurrence[i] = start - 1; | 
| 271       } | 491     } | 
| 272     } | 492   } | 
| 273     while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; | 493   for (int i = start; i < pattern_length - 1; i++) { | 
| 274     if (j < 0) { | 494     PatternChar c = pattern_[i]; | 
| 275       return idx; | 495     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); | 
| 276     } else if (j < start) { | 496     bad_char_occurrence[bucket] = i; | 
| 277       // we have matched more than our tables allow us to be smart about. | 497   } | 
| 278       // Fall back on BMH shift. | 498 } | 
| 279       idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); | 499 | 
| 280     } else { | 500 //--------------------------------------------------------------------- | 
| 281       int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); | 501 // Linear string search with bailout to BMH. | 
| 282       int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); | 502 //--------------------------------------------------------------------- | 
| 283       int shift = j - bc_occ; | 503 | 
| 284       if (gs_shift > shift) { | 504 // Simple linear search for short patterns, which bails out if the string | 
| 285         shift = gs_shift; | 505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. | 
| 286       } | 506 template <typename PatternChar, typename SubjectChar> | 
| 287       idx += shift; | 507 int StringSearch<PatternChar, SubjectChar>::InitialSearch( | 
| 288     } | 508     StringSearch<PatternChar, SubjectChar>* search, | 
| 289   } | 509     Vector<const SubjectChar> subject, | 
| 290 | 510     int index) { | 
| 291   return -1; | 511   Vector<const PatternChar> pattern = search->pattern_; | 
| 292 } |  | 
| 293 |  | 
| 294 |  | 
| 295 // Trivial string search for shorter strings. |  | 
| 296 // On return, if "complete" is set to true, the return value is the |  | 
| 297 // final result of searching for the patter in the subject. |  | 
| 298 // If "complete" is set to false, the return value is the index where |  | 
| 299 // further checking should start, i.e., it's guaranteed that the pattern |  | 
| 300 // does not occur at a position prior to the returned index. |  | 
| 301 template <typename PatternChar, typename SubjectChar> |  | 
| 302 static int SimpleIndexOf(Vector<const SubjectChar> subject, |  | 
| 303                          Vector<const PatternChar> pattern, |  | 
| 304                          int idx, |  | 
| 305                          bool* complete) { |  | 
| 306   ASSERT(pattern.length() > 1); |  | 
| 307   int pattern_length = pattern.length(); | 512   int pattern_length = pattern.length(); | 
| 308   // Badness is a count of how much work we have done.  When we have | 513   // Badness is a count of how much work we have done.  When we have | 
| 309   // done enough work we decide it's probably worth switching to a better | 514   // done enough work we decide it's probably worth switching to a better | 
| 310   // algorithm. | 515   // algorithm. | 
| 311   int badness = -10 - (pattern_length << 2); | 516   int badness = -10 - (pattern_length << 2); | 
| 312 | 517 | 
| 313   // We know our pattern is at least 2 characters, we cache the first so | 518   // We know our pattern is at least 2 characters, we cache the first so | 
| 314   // the common case of the first character not matching is faster. | 519   // the common case of the first character not matching is faster. | 
| 315   PatternChar pattern_first_char = pattern[0]; | 520   PatternChar pattern_first_char = pattern[0]; | 
| 316   for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { | 521   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { | 
| 317     badness++; | 522     badness++; | 
| 318     if (badness > 0) { | 523     if (badness <= 0) { | 
| 319       *complete = false; | 524       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | 
| 320       return i; | 525         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | 
| 321     } | 526             memchr(subject.start() + i, | 
| 322     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { | 527                    pattern_first_char, | 
| 323       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( | 528                    n - i + 1)); | 
| 324           memchr(subject.start() + i, | 529         if (pos == NULL) { | 
| 325                  pattern_first_char, | 530           return -1; | 
| 326                  n - i + 1)); | 531         } | 
| 327       if (pos == NULL) { | 532         i = static_cast<int>(pos - subject.start()); | 
| 328         *complete = true; | 533       } else { | 
| 329         return -1; | 534         if (subject[i] != pattern_first_char) continue; | 
| 330       } | 535       } | 
| 331       i = static_cast<int>(pos - subject.start()); | 536       int j = 1; | 
|  | 537       do { | 
|  | 538         if (pattern[j] != subject[i + j]) { | 
|  | 539           break; | 
|  | 540         } | 
|  | 541         j++; | 
|  | 542       } while (j < pattern_length); | 
|  | 543       if (j == pattern_length) { | 
|  | 544         return i; | 
|  | 545       } | 
|  | 546       badness += j; | 
| 332     } else { | 547     } else { | 
| 333       if (subject[i] != pattern_first_char) continue; | 548       search->PopulateBoyerMooreHorspoolTable(); | 
| 334     } | 549       search->strategy_ = &BoyerMooreHorspoolSearch; | 
| 335     int j = 1; | 550       return BoyerMooreHorspoolSearch(search, subject, i); | 
| 336     do { |  | 
| 337       if (pattern[j] != subject[i+j]) { |  | 
| 338         break; |  | 
| 339       } |  | 
| 340       j++; |  | 
| 341     } while (j < pattern_length); |  | 
| 342     if (j == pattern_length) { |  | 
| 343       *complete = true; |  | 
| 344       return i; |  | 
| 345     } |  | 
| 346     badness += j; |  | 
| 347   } |  | 
| 348   *complete = true; |  | 
| 349   return -1; |  | 
| 350 } |  | 
| 351 |  | 
| 352 // Simple indexOf that never bails out. For short patterns only. |  | 
| 353 template <typename PatternChar, typename SubjectChar> |  | 
| 354 static int SimpleIndexOf(Vector<const SubjectChar> subject, |  | 
| 355                          Vector<const PatternChar> pattern, |  | 
| 356                          int idx) { |  | 
| 357   int pattern_length = pattern.length(); |  | 
| 358   PatternChar pattern_first_char = pattern[0]; |  | 
| 359   for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { |  | 
| 360     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |  | 
| 361       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |  | 
| 362           memchr(subject.start() + i, |  | 
| 363                  pattern_first_char, |  | 
| 364                  n - i + 1)); |  | 
| 365       if (pos == NULL) return -1; |  | 
| 366       i = static_cast<int>(pos - subject.start()); |  | 
| 367     } else { |  | 
| 368       if (subject[i] != pattern_first_char) continue; |  | 
| 369     } |  | 
| 370     int j = 1; |  | 
| 371     while (j < pattern_length) { |  | 
| 372       if (pattern[j] != subject[i+j]) { |  | 
| 373         break; |  | 
| 374       } |  | 
| 375       j++; |  | 
| 376     } |  | 
| 377     if (j == pattern_length) { |  | 
| 378       return i; |  | 
| 379     } | 551     } | 
| 380   } | 552   } | 
| 381   return -1; | 553   return -1; | 
| 382 } | 554 } | 
| 383 | 555 | 
| 384 | 556 | 
| 385 // Strategy for searching for a string in another string. | 557 // Perform a a single stand-alone search. | 
| 386 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; | 558 // If searching multiple times for the same pattern, a search | 
| 387 | 559 // object should be constructed once and the Search function then called | 
| 388 | 560 // for each search. | 
| 389 template <typename PatternChar> |  | 
| 390 static inline StringSearchStrategy InitializeStringSearch( |  | 
| 391     Vector<const PatternChar> pat, bool ascii_subject) { |  | 
| 392   // We have an ASCII haystack and a non-ASCII needle. Check if there |  | 
| 393   // really is a non-ASCII character in the needle and bail out if there |  | 
| 394   // is. |  | 
| 395   if (ascii_subject && sizeof(PatternChar) > 1) { |  | 
| 396     for (int i = 0; i < pat.length(); i++) { |  | 
| 397       uc16 c = pat[i]; |  | 
| 398       if (c > String::kMaxAsciiCharCode) { |  | 
| 399         return SEARCH_FAIL; |  | 
| 400       } |  | 
| 401     } |  | 
| 402   } |  | 
| 403   if (pat.length() < kBMMinPatternLength) { |  | 
| 404     return SEARCH_SHORT; |  | 
| 405   } |  | 
| 406   algorithm = SIMPLE_SEARCH; |  | 
| 407   return SEARCH_LONG; |  | 
| 408 } |  | 
| 409 |  | 
| 410 |  | 
| 411 // Dispatch long needle searches to different algorithms. |  | 
| 412 template <typename SubjectChar, typename PatternChar> | 561 template <typename SubjectChar, typename PatternChar> | 
| 413 static int ComplexIndexOf(Vector<const SubjectChar> sub, | 562 static int SearchString(Vector<const SubjectChar> subject, | 
| 414                           Vector<const PatternChar> pat, | 563                         Vector<const PatternChar> pattern, | 
| 415                           int start_index) { |  | 
| 416   ASSERT(pat.length() >= kBMMinPatternLength); |  | 
| 417   // Try algorithms in order of increasing setup cost and expected performance. |  | 
| 418   bool complete; |  | 
| 419   int idx = start_index; |  | 
| 420   switch (algorithm) { |  | 
| 421     case SIMPLE_SEARCH: |  | 
| 422       idx = SimpleIndexOf(sub, pat, idx, &complete); |  | 
| 423       if (complete) return idx; |  | 
| 424       BoyerMoorePopulateBadCharTable(pat); |  | 
| 425       algorithm = BOYER_MOORE_HORSPOOL; |  | 
| 426       // FALLTHROUGH. |  | 
| 427     case BOYER_MOORE_HORSPOOL: |  | 
| 428       idx = BoyerMooreHorspool(sub, pat, idx, &complete); |  | 
| 429       if (complete) return idx; |  | 
| 430       // Build the Good Suffix table and continue searching. |  | 
| 431       BoyerMoorePopulateGoodSuffixTable(pat); |  | 
| 432       algorithm = BOYER_MOORE; |  | 
| 433       // FALLTHROUGH. |  | 
| 434     case BOYER_MOORE: |  | 
| 435       return BoyerMooreIndexOf(sub, pat, idx); |  | 
| 436   } |  | 
| 437   UNREACHABLE(); |  | 
| 438   return -1; |  | 
| 439 } |  | 
| 440 |  | 
| 441 |  | 
| 442 // Dispatch to different search strategies for a single search. |  | 
| 443 // If searching multiple times on the same needle, the search |  | 
| 444 // strategy should only be computed once and then dispatch to different |  | 
| 445 // loops. |  | 
| 446 template <typename SubjectChar, typename PatternChar> |  | 
| 447 static int StringSearch(Vector<const SubjectChar> sub, |  | 
| 448                         Vector<const PatternChar> pat, |  | 
| 449                         int start_index) { | 564                         int start_index) { | 
| 450   bool ascii_subject = (sizeof(SubjectChar) == 1); | 565   StringSearch<PatternChar, SubjectChar> search(pattern); | 
| 451   StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); | 566   return search.Search(subject, start_index); | 
| 452   switch (strategy) { |  | 
| 453     case SEARCH_FAIL: return -1; |  | 
| 454     case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); |  | 
| 455     case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); |  | 
| 456   } |  | 
| 457   UNREACHABLE(); |  | 
| 458   return -1; |  | 
| 459 } | 567 } | 
| 460 | 568 | 
| 461 }}  // namespace v8::internal | 569 }}  // namespace v8::internal | 
| 462 | 570 | 
| 463 #endif  // V8_STRING_SEARCH_H_ | 571 #endif  // V8_STRING_SEARCH_H_ | 
| OLD | NEW | 
|---|