Index: src/string-search.h |
diff --git a/src/string-search.h b/src/string-search.h |
index d7959c0be982c35208c14cfc8803ec00a7f14a9b..1de375018b1d0c9cdd69d2dabceaf077d259f1a7 100644 |
--- a/src/string-search.h |
+++ b/src/string-search.h |
@@ -32,278 +32,483 @@ namespace v8 { |
namespace internal { |
-// Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
-// limit, we can fix the size of tables. For a needle longer than this limit, |
-// search will not be optimal, since we only build tables for a smaller suffix |
-// of the string, which is a safe approximation. |
-static const int kBMMaxShift = 250; |
-// Reduce alphabet to this size. |
-// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
-// proportional to the input alphabet. We reduce the alphabet size by |
-// equating input characters modulo a smaller alphabet size. This gives |
-// a potentially less efficient searching, but is a safe approximation. |
-// For needles using only characters in the same Unicode 256-code point page, |
-// there is no search speed degradation. |
-static const int kBMAlphabetSize = 256; |
-// For patterns below this length, the skip length of Boyer-Moore is too short |
-// to compensate for the algorithmic overhead compared to simple brute force. |
-static const int kBMMinPatternLength = 7; |
- |
-// Holds the two buffers used by Boyer-Moore string search's Good Suffix |
-// shift. Only allows the last kBMMaxShift characters of the needle |
-// to be indexed. |
-class BMGoodSuffixBuffers { |
+//--------------------------------------------------------------------- |
+// String Search object. |
+//--------------------------------------------------------------------- |
+ |
+// Class holding constants and methods that apply to all string search variants, |
+// independently of subject and pattern char size. |
+class StringSearchBase { |
+ protected: |
+ // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
+ // limit, we can fix the size of tables. For a needle longer than this limit, |
+ // search will not be optimal, since we only build tables for a suffix |
+ // of the string, but it is a safe approximation. |
+ static const int kBMMaxShift = 250; |
+ |
+ // Reduce alphabet to this size. |
+ // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
+ // proportional to the input alphabet. We reduce the alphabet size by |
+ // equating input characters modulo a smaller alphabet size. This gives |
+ // a potentially less efficient searching, but is a safe approximation. |
+ // For needles using only characters in the same Unicode 256-code point page, |
+ // there is no search speed degradation. |
+ static const int kAsciiAlphabetSize = 128; |
+ static const int kUC16AlphabetSize = 256; |
+ |
+ // Bad-char shift table stored in the state. It's length is the alphabet size. |
+ // For patterns below this length, the skip length of Boyer-Moore is too short |
+ // to compensate for the algorithmic overhead compared to simple brute force. |
+ static const int kBMMinPatternLength = 7; |
+ |
+ static inline bool IsAsciiString(Vector<const char>) { |
+ return true; |
+ } |
+ |
+ static inline bool IsAsciiString(Vector<const uc16> string) { |
+ for (int i = 0, n = string.length(); i < n; i++) { |
+ if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) { |
+ return false; |
+ } |
+ } |
+ return true; |
+ } |
+ |
+ // The following tables are shared by all searches. |
+ // TODO(lrn): Introduce a way for a pattern to keep its tables |
+ // between searches (e.g., for an Atom RegExp). |
+ |
+ // Store for the BoyerMoore(Horspool) bad char shift table. |
+ static int kBadCharShiftTable[kUC16AlphabetSize]; |
+ // Store for the BoyerMoore good suffix shift table. |
+ static int kGoodSuffixShiftTable[kBMMaxShift + 1]; |
+ // Table used temporarily while building the BoyerMoore good suffix |
+ // shift table. |
+ static int kSuffixTable[kBMMaxShift + 1]; |
+}; |
+ |
+ |
+template <typename PatternChar, typename SubjectChar> |
+class StringSearch : private StringSearchBase { |
public: |
- BMGoodSuffixBuffers() {} |
- inline void Initialize(int needle_length) { |
- ASSERT(needle_length > 1); |
- int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift; |
- int len = needle_length - start; |
- biased_suffixes_ = suffixes_ - start; |
- biased_good_suffix_shift_ = good_suffix_shift_ - start; |
- for (int i = 0; i <= len; i++) { |
- good_suffix_shift_[i] = len; |
+ explicit StringSearch(Vector<const PatternChar> pattern) |
+ : pattern_(pattern), |
+ start_(Max(0, pattern.length() - kBMMaxShift)) { |
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
+ if (!IsAsciiString(pattern_)) { |
+ strategy_ = &FailSearch; |
+ return; |
+ } |
+ } |
+ int pattern_length = pattern_.length(); |
+ if (pattern_length < kBMMinPatternLength) { |
+ if (pattern_length == 1) { |
+ strategy_ = &SingleCharSearch; |
+ return; |
+ } |
+ strategy_ = &LinearSearch; |
+ return; |
} |
+ strategy_ = &InitialSearch; |
} |
- inline int& suffix(int index) { |
- ASSERT(biased_suffixes_ + index >= suffixes_); |
- return biased_suffixes_[index]; |
+ |
+ int Search(Vector<const SubjectChar> subject, int index) { |
+ return strategy_(this, subject, index); |
} |
- inline int& shift(int index) { |
- ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_); |
- return biased_good_suffix_shift_[index]; |
+ |
+ static inline int AlphabetSize() { |
+ if (sizeof(PatternChar) == 1) { |
+ // ASCII needle. |
+ return kAsciiAlphabetSize; |
+ } else { |
+ ASSERT(sizeof(PatternChar) == 2); |
+ // UC16 needle. |
+ return kUC16AlphabetSize; |
+ } |
} |
+ |
private: |
- int suffixes_[kBMMaxShift + 1]; |
- int good_suffix_shift_[kBMMaxShift + 1]; |
- int* biased_suffixes_; |
- int* biased_good_suffix_shift_; |
- DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers); |
-}; |
+ typedef int (*SearchFunction)( // NOLINT - it's not a cast! |
+ StringSearch<PatternChar, SubjectChar>*, |
+ Vector<const SubjectChar>, |
+ int); |
+ |
+ static int FailSearch(StringSearch<PatternChar, SubjectChar>*, |
+ Vector<const SubjectChar>, |
+ int) { |
+ return -1; |
+ } |
-// buffers reused by BoyerMoore |
-struct BMBuffers { |
- public: |
- static int bad_char_occurrence[kBMAlphabetSize]; |
- static BMGoodSuffixBuffers bmgs_buffers; |
+ static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index); |
+ |
+ static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index); |
+ |
+ static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index); |
+ |
+ static int BoyerMooreHorspoolSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index); |
+ |
+ static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index); |
+ |
+ void PopulateBoyerMooreHorspoolTable(); |
+ |
+ void PopulateBoyerMooreTable(); |
+ |
+ static inline int CharOccurrence(int* bad_char_occurrence, |
+ SubjectChar char_code) { |
+ if (sizeof(SubjectChar) == 1) { |
+ return bad_char_occurrence[static_cast<int>(char_code)]; |
+ } |
+ if (sizeof(PatternChar) == 1) { |
+ if (static_cast<unsigned char>(char_code) > String::kMaxAsciiCharCode) { |
+ return -1; |
+ } |
+ return bad_char_occurrence[static_cast<int>(char_code)]; |
+ } |
+ // Reduce to equivalence class. |
+ int equiv_class = char_code % kUC16AlphabetSize; |
+ return bad_char_occurrence[equiv_class]; |
+ } |
+ |
+ // Return a table covering the last kBMMaxShift+1 positions of |
+ // pattern. |
+ int* bad_char_table() { |
+ return kBadCharShiftTable; |
+ } |
+ |
+ int* good_suffix_shift_table() { |
+ // Return biased pointer that maps the range [start_..pattern_.length() |
+ // to the kGoodSuffixShiftTable array. |
+ return kGoodSuffixShiftTable - start_; |
+ } |
+ |
+ int* suffix_table() { |
+ // Return biased pointer that maps the range [start_..pattern_.length() |
+ // to the kSuffixTable array. |
+ return kSuffixTable - start_; |
+ } |
+ |
+ // The pattern to search for. |
+ Vector<const PatternChar> pattern_; |
+ // Pointer to implementation of the search. |
+ SearchFunction strategy_; |
+ // Cache value of Max(0, pattern_length() - kBMMaxShift) |
+ int start_; |
}; |
-// State of the string match tables. |
-// SIMPLE: No usable content in the buffers. |
-// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated. |
-// BOYER_MOORE: The bmgs_buffers tables have also been populated. |
-// Whenever starting with a new needle, one should call InitializeStringSearch |
-// to determine which search strategy to use, and in the case of a long-needle |
-// strategy, the call also initializes the algorithm to SIMPLE. |
-enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; |
-static StringSearchAlgorithm algorithm; |
+//--------------------------------------------------------------------- |
+// Single Character Pattern Search Strategy |
+//--------------------------------------------------------------------- |
-// Compute the bad-char table for Boyer-Moore in the static buffer. |
-template <typename PatternChar> |
-static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) { |
- // Only preprocess at most kBMMaxShift last characters of pattern. |
- int start = Max(pattern.length() - kBMMaxShift, 0); |
- // Run forwards to populate bad_char_table, so that *last* instance |
- // of character equivalence class is the one registered. |
- // Notice: Doesn't include the last character. |
- int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1 |
- : kBMAlphabetSize; |
- if (start == 0) { // All patterns less than kBMMaxShift in length. |
- memset(BMBuffers::bad_char_occurrence, |
- -1, |
- table_size * sizeof(*BMBuffers::bad_char_occurrence)); |
+template <typename PatternChar, typename SubjectChar> |
+int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int index) { |
+ ASSERT_EQ(1, search->pattern_.length()); |
+ PatternChar pattern_first_char = search->pattern_[0]; |
+ int i = index; |
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
+ memchr(subject.start() + i, |
+ pattern_first_char, |
+ subject.length() - i)); |
+ if (pos == NULL) return -1; |
+ return static_cast<int>(pos - subject.start()); |
} else { |
- for (int i = 0; i < table_size; i++) { |
- BMBuffers::bad_char_occurrence[i] = start - 1; |
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
+ if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) { |
+ return -1; |
+ } |
} |
+ SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); |
+ int n = subject.length(); |
+ while (i < n) { |
+ if (subject[i++] == search_char) return i - 1; |
+ } |
+ return -1; |
} |
- for (int i = start; i < pattern.length() - 1; i++) { |
- PatternChar c = pattern[i]; |
- int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize; |
- BMBuffers::bad_char_occurrence[bucket] = i; |
+} |
+ |
+//--------------------------------------------------------------------- |
+// Linear Search Strategy |
+//--------------------------------------------------------------------- |
+ |
+ |
+template <typename PatternChar, typename SubjectChar> |
+static inline bool CharCompare(const PatternChar* pattern, |
+ const SubjectChar* subject, |
+ int length) { |
+ ASSERT(length > 0); |
+ int pos = 0; |
+ do { |
+ if (pattern[pos] != subject[pos]) { |
+ return false; |
+ } |
+ pos++; |
+ } while (pos < length); |
+ return true; |
+} |
+ |
+ |
+// Simple linear search for short patterns. Never bails out. |
+template <typename PatternChar, typename SubjectChar> |
+int StringSearch<PatternChar, SubjectChar>::LinearSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int index) { |
+ Vector<const PatternChar> pattern = search->pattern_; |
+ ASSERT(pattern.length() > 1); |
+ int pattern_length = pattern.length(); |
+ PatternChar pattern_first_char = pattern[0]; |
+ int i = index; |
+ int n = subject.length() - pattern_length; |
+ while (i <= n) { |
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
+ memchr(subject.start() + i, |
+ pattern_first_char, |
+ n - i + 1)); |
+ if (pos == NULL) return -1; |
+ i = static_cast<int>(pos - subject.start()) + 1; |
+ } else { |
+ if (subject[i++] != pattern_first_char) continue; |
+ } |
+ // Loop extracted to separate function to allow using return to do |
+ // a deeper break. |
+ if (CharCompare(pattern.start() + 1, |
+ subject.start() + i, |
+ pattern_length - 1)) { |
+ return i - 1; |
+ } |
+ } |
+ return -1; |
+} |
+ |
+//--------------------------------------------------------------------- |
+// Boyer-Moore string search |
+//--------------------------------------------------------------------- |
+ |
+template <typename PatternChar, typename SubjectChar> |
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index) { |
+ Vector<const PatternChar> pattern = search->pattern_; |
+ int subject_length = subject.length(); |
+ int pattern_length = pattern.length(); |
+ // Only preprocess at most kBMMaxShift last characters of pattern. |
+ int start = search->start_; |
+ |
+ int* bad_char_occurence = search->bad_char_table(); |
+ int* good_suffix_shift = search->good_suffix_shift_table(); |
+ |
+ PatternChar last_char = pattern[pattern_length - 1]; |
+ int index = start_index; |
+ // Continue search from i. |
+ while (index <= subject_length - pattern_length) { |
+ int j = pattern_length - 1; |
+ int c; |
+ while (last_char != (c = subject[index + j])) { |
+ int shift = |
+ j - CharOccurrence(bad_char_occurence, c); |
+ index += shift; |
+ if (index > subject_length - pattern_length) { |
+ return -1; |
+ } |
+ } |
+ while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; |
+ if (j < 0) { |
+ return index; |
+ } else if (j < start) { |
+ // we have matched more than our tables allow us to be smart about. |
+ // Fall back on BMH shift. |
+ index += |
+ pattern_length - 1 - CharOccurrence(bad_char_occurence, last_char); |
+ } else { |
+ int gs_shift = good_suffix_shift[j + 1]; |
+ int bc_occ = |
+ CharOccurrence(bad_char_occurence, c); |
+ int shift = j - bc_occ; |
+ if (gs_shift > shift) { |
+ shift = gs_shift; |
+ } |
+ index += shift; |
+ } |
} |
+ |
+ return -1; |
} |
-template <typename PatternChar> |
-static void BoyerMoorePopulateGoodSuffixTable( |
- Vector<const PatternChar> pattern) { |
- int m = pattern.length(); |
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
- int len = m - start; |
- // Compute Good Suffix tables. |
- BMBuffers::bmgs_buffers.Initialize(m); |
+template <typename PatternChar, typename SubjectChar> |
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { |
+ int pattern_length = pattern_.length(); |
+ const PatternChar* pattern = pattern_.start(); |
+ // Only look at the last kBMMaxShift characters of pattern (from start_ |
+ // to pattern_length). |
+ int start = start_; |
+ int length = pattern_length - start; |
+ |
+ // Biased tables so that we can use pattern indices as table indices, |
+ // even if we only cover the part of the pattern from offset start. |
+ int* shift_table = good_suffix_shift_table(); |
+ int* suffix_table = this->suffix_table(); |
+ |
+ // Initialize table. |
+ for (int i = start; i < pattern_length; i++) { |
+ shift_table[i] = length; |
+ } |
+ shift_table[pattern_length] = 1; |
+ suffix_table[pattern_length] = pattern_length + 1; |
- BMBuffers::bmgs_buffers.shift(m-1) = 1; |
- BMBuffers::bmgs_buffers.suffix(m) = m + 1; |
- PatternChar last_char = pattern[m - 1]; |
- int suffix = m + 1; |
+ // Find suffixes. |
+ PatternChar last_char = pattern[pattern_length - 1]; |
+ int suffix = pattern_length + 1; |
{ |
- int i = m; |
+ int i = pattern_length; |
while (i > start) { |
PatternChar c = pattern[i - 1]; |
- while (suffix <= m && c != pattern[suffix - 1]) { |
- if (BMBuffers::bmgs_buffers.shift(suffix) == len) { |
- BMBuffers::bmgs_buffers.shift(suffix) = suffix - i; |
+ while (suffix <= pattern_length && c != pattern[suffix - 1]) { |
+ if (shift_table[suffix] == length) { |
+ shift_table[suffix] = suffix - i; |
} |
- suffix = BMBuffers::bmgs_buffers.suffix(suffix); |
+ suffix = suffix_table[suffix]; |
} |
- BMBuffers::bmgs_buffers.suffix(--i) = --suffix; |
- if (suffix == m) { |
+ suffix_table[--i] = --suffix; |
+ if (suffix == pattern_length) { |
// No suffix to extend, so we check against last_char only. |
while ((i > start) && (pattern[i - 1] != last_char)) { |
- if (BMBuffers::bmgs_buffers.shift(m) == len) { |
- BMBuffers::bmgs_buffers.shift(m) = m - i; |
+ if (shift_table[pattern_length] == length) { |
+ shift_table[pattern_length] = pattern_length - i; |
} |
- BMBuffers::bmgs_buffers.suffix(--i) = m; |
+ suffix_table[--i] = pattern_length; |
} |
if (i > start) { |
- BMBuffers::bmgs_buffers.suffix(--i) = --suffix; |
+ suffix_table[--i] = --suffix; |
} |
} |
} |
} |
- if (suffix < m) { |
- for (int i = start; i <= m; i++) { |
- if (BMBuffers::bmgs_buffers.shift(i) == len) { |
- BMBuffers::bmgs_buffers.shift(i) = suffix - start; |
+ // Build shift table using suffixes. |
+ if (suffix < pattern_length) { |
+ for (int i = start; i <= pattern_length; i++) { |
+ if (shift_table[i] == length) { |
+ shift_table[i] = suffix - start; |
} |
if (i == suffix) { |
- suffix = BMBuffers::bmgs_buffers.suffix(suffix); |
+ suffix = suffix_table[suffix]; |
} |
} |
} |
} |
+//--------------------------------------------------------------------- |
+// Boyer-Moore-Horspool string search. |
+//--------------------------------------------------------------------- |
-template <typename SubjectChar, typename PatternChar> |
-static inline int CharOccurrence(int char_code) { |
- if (sizeof(SubjectChar) == 1) { |
- return BMBuffers::bad_char_occurrence[char_code]; |
- } |
- if (sizeof(PatternChar) == 1) { |
- if (char_code > String::kMaxAsciiCharCode) { |
- return -1; |
- } |
- return BMBuffers::bad_char_occurrence[char_code]; |
- } |
- return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize]; |
-} |
- |
- |
-// Restricted simplified Boyer-Moore string matching. |
-// Uses only the bad-shift table of Boyer-Moore and only uses it |
-// for the character compared to the last character of the needle. |
-template <typename SubjectChar, typename PatternChar> |
-static int BoyerMooreHorspool(Vector<const SubjectChar> subject, |
- Vector<const PatternChar> pattern, |
- int start_index, |
- bool* complete) { |
- ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); |
- int n = subject.length(); |
- int m = pattern.length(); |
- |
- int badness = -m; |
+template <typename PatternChar, typename SubjectChar> |
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int start_index) { |
+ Vector<const PatternChar> pattern = search->pattern_; |
+ int subject_length = subject.length(); |
+ int pattern_length = pattern.length(); |
+ int* char_occurrences = search->bad_char_table(); |
+ int badness = -pattern_length; |
// How bad we are doing without a good-suffix table. |
- int idx; // No matches found prior to this index. |
- PatternChar last_char = pattern[m - 1]; |
- int last_char_shift = |
- m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); |
+ PatternChar last_char = pattern[pattern_length - 1]; |
+ int last_char_shift = pattern_length - 1 - |
+ CharOccurrence(char_occurrences, last_char); |
// Perform search |
- for (idx = start_index; idx <= n - m;) { |
- int j = m - 1; |
- int c; |
- while (last_char != (c = subject[idx + j])) { |
- int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); |
+ int index = start_index; // No matches found prior to this index. |
+ while (index <= subject_length - pattern_length) { |
+ int j = pattern_length - 1; |
+ int subject_char; |
+ while (last_char != (subject_char = subject[index + j])) { |
+ int bc_occ = CharOccurrence(char_occurrences, subject_char); |
int shift = j - bc_occ; |
- idx += shift; |
+ index += shift; |
badness += 1 - shift; // at most zero, so badness cannot increase. |
- if (idx > n - m) { |
- *complete = true; |
+ if (index > subject_length - pattern_length) { |
return -1; |
} |
} |
j--; |
- while (j >= 0 && pattern[j] == (subject[idx + j])) j--; |
+ while (j >= 0 && pattern[j] == (subject[index + j])) j--; |
if (j < 0) { |
- *complete = true; |
- return idx; |
+ return index; |
} else { |
- idx += last_char_shift; |
+ index += last_char_shift; |
// Badness increases by the number of characters we have |
// checked, and decreases by the number of characters we |
// can skip by shifting. It's a measure of how we are doing |
// compared to reading each character exactly once. |
- badness += (m - j) - last_char_shift; |
+ badness += (pattern_length - j) - last_char_shift; |
if (badness > 0) { |
- *complete = false; |
- return idx; |
+ search->PopulateBoyerMooreTable(); |
+ search->strategy_ = &BoyerMooreSearch; |
+ return BoyerMooreSearch(search, subject, index); |
} |
} |
} |
- *complete = true; |
return -1; |
} |
-template <typename SubjectChar, typename PatternChar> |
-static int BoyerMooreIndexOf(Vector<const SubjectChar> subject, |
- Vector<const PatternChar> pattern, |
- int idx) { |
- ASSERT(algorithm <= BOYER_MOORE); |
- int n = subject.length(); |
- int m = pattern.length(); |
- // Only preprocess at most kBMMaxShift last characters of pattern. |
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
+template <typename PatternChar, typename SubjectChar> |
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { |
+ int pattern_length = pattern_.length(); |
- PatternChar last_char = pattern[m - 1]; |
- // Continue search from i. |
- while (idx <= n - m) { |
- int j = m - 1; |
- SubjectChar c; |
- while (last_char != (c = subject[idx + j])) { |
- int shift = j - CharOccurrence<SubjectChar, PatternChar>(c); |
- idx += shift; |
- if (idx > n - m) { |
- return -1; |
- } |
- } |
- while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--; |
- if (j < 0) { |
- return idx; |
- } else if (j < start) { |
- // we have matched more than our tables allow us to be smart about. |
- // Fall back on BMH shift. |
- idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char); |
- } else { |
- int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); |
- int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c); |
- int shift = j - bc_occ; |
- if (gs_shift > shift) { |
- shift = gs_shift; |
- } |
- idx += shift; |
+ int* bad_char_occurrence = bad_char_table(); |
+ |
+ // Only preprocess at most kBMMaxShift last characters of pattern. |
+ int start = start_; |
+ // Run forwards to populate bad_char_table, so that *last* instance |
+ // of character equivalence class is the one registered. |
+ // Notice: Doesn't include the last character. |
+ int table_size = AlphabetSize(); |
+ if (start == 0) { // All patterns less than kBMMaxShift in length. |
+ memset(bad_char_occurrence, |
+ -1, |
+ table_size * sizeof(*bad_char_occurrence)); |
+ } else { |
+ for (int i = 0; i < table_size; i++) { |
+ bad_char_occurrence[i] = start - 1; |
} |
} |
- |
- return -1; |
+ for (int i = start; i < pattern_length - 1; i++) { |
+ PatternChar c = pattern_[i]; |
+ int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); |
+ bad_char_occurrence[bucket] = i; |
+ } |
} |
+//--------------------------------------------------------------------- |
+// Linear string search with bailout to BMH. |
+//--------------------------------------------------------------------- |
-// Trivial string search for shorter strings. |
-// On return, if "complete" is set to true, the return value is the |
-// final result of searching for the patter in the subject. |
-// If "complete" is set to false, the return value is the index where |
-// further checking should start, i.e., it's guaranteed that the pattern |
-// does not occur at a position prior to the returned index. |
+// Simple linear search for short patterns, which bails out if the string |
+// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
template <typename PatternChar, typename SubjectChar> |
-static int SimpleIndexOf(Vector<const SubjectChar> subject, |
- Vector<const PatternChar> pattern, |
- int idx, |
- bool* complete) { |
- ASSERT(pattern.length() > 1); |
+int StringSearch<PatternChar, SubjectChar>::InitialSearch( |
+ StringSearch<PatternChar, SubjectChar>* search, |
+ Vector<const SubjectChar> subject, |
+ int index) { |
+ Vector<const PatternChar> pattern = search->pattern_; |
int pattern_length = pattern.length(); |
// Badness is a count of how much work we have done. When we have |
// done enough work we decide it's probably worth switching to a better |
@@ -313,149 +518,52 @@ static int SimpleIndexOf(Vector<const SubjectChar> subject, |
// We know our pattern is at least 2 characters, we cache the first so |
// the common case of the first character not matching is faster. |
PatternChar pattern_first_char = pattern[0]; |
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { |
+ for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { |
badness++; |
- if (badness > 0) { |
- *complete = false; |
- return i; |
- } |
- if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
- const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
- memchr(subject.start() + i, |
- pattern_first_char, |
- n - i + 1)); |
- if (pos == NULL) { |
- *complete = true; |
- return -1; |
+ if (badness <= 0) { |
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
+ memchr(subject.start() + i, |
+ pattern_first_char, |
+ n - i + 1)); |
+ if (pos == NULL) { |
+ return -1; |
+ } |
+ i = static_cast<int>(pos - subject.start()); |
+ } else { |
+ if (subject[i] != pattern_first_char) continue; |
} |
- i = static_cast<int>(pos - subject.start()); |
- } else { |
- if (subject[i] != pattern_first_char) continue; |
- } |
- int j = 1; |
- do { |
- if (pattern[j] != subject[i+j]) { |
- break; |
+ int j = 1; |
+ do { |
+ if (pattern[j] != subject[i + j]) { |
+ break; |
+ } |
+ j++; |
+ } while (j < pattern_length); |
+ if (j == pattern_length) { |
+ return i; |
} |
- j++; |
- } while (j < pattern_length); |
- if (j == pattern_length) { |
- *complete = true; |
- return i; |
- } |
- badness += j; |
- } |
- *complete = true; |
- return -1; |
-} |
- |
-// Simple indexOf that never bails out. For short patterns only. |
-template <typename PatternChar, typename SubjectChar> |
-static int SimpleIndexOf(Vector<const SubjectChar> subject, |
- Vector<const PatternChar> pattern, |
- int idx) { |
- int pattern_length = pattern.length(); |
- PatternChar pattern_first_char = pattern[0]; |
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) { |
- if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
- const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( |
- memchr(subject.start() + i, |
- pattern_first_char, |
- n - i + 1)); |
- if (pos == NULL) return -1; |
- i = static_cast<int>(pos - subject.start()); |
+ badness += j; |
} else { |
- if (subject[i] != pattern_first_char) continue; |
- } |
- int j = 1; |
- while (j < pattern_length) { |
- if (pattern[j] != subject[i+j]) { |
- break; |
- } |
- j++; |
- } |
- if (j == pattern_length) { |
- return i; |
+ search->PopulateBoyerMooreHorspoolTable(); |
+ search->strategy_ = &BoyerMooreHorspoolSearch; |
+ return BoyerMooreHorspoolSearch(search, subject, i); |
} |
} |
return -1; |
} |
-// Strategy for searching for a string in another string. |
-enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; |
- |
- |
-template <typename PatternChar> |
-static inline StringSearchStrategy InitializeStringSearch( |
- Vector<const PatternChar> pat, bool ascii_subject) { |
- // We have an ASCII haystack and a non-ASCII needle. Check if there |
- // really is a non-ASCII character in the needle and bail out if there |
- // is. |
- if (ascii_subject && sizeof(PatternChar) > 1) { |
- for (int i = 0; i < pat.length(); i++) { |
- uc16 c = pat[i]; |
- if (c > String::kMaxAsciiCharCode) { |
- return SEARCH_FAIL; |
- } |
- } |
- } |
- if (pat.length() < kBMMinPatternLength) { |
- return SEARCH_SHORT; |
- } |
- algorithm = SIMPLE_SEARCH; |
- return SEARCH_LONG; |
-} |
- |
- |
-// Dispatch long needle searches to different algorithms. |
+// Perform a a single stand-alone search. |
+// If searching multiple times for the same pattern, a search |
+// object should be constructed once and the Search function then called |
+// for each search. |
template <typename SubjectChar, typename PatternChar> |
-static int ComplexIndexOf(Vector<const SubjectChar> sub, |
- Vector<const PatternChar> pat, |
- int start_index) { |
- ASSERT(pat.length() >= kBMMinPatternLength); |
- // Try algorithms in order of increasing setup cost and expected performance. |
- bool complete; |
- int idx = start_index; |
- switch (algorithm) { |
- case SIMPLE_SEARCH: |
- idx = SimpleIndexOf(sub, pat, idx, &complete); |
- if (complete) return idx; |
- BoyerMoorePopulateBadCharTable(pat); |
- algorithm = BOYER_MOORE_HORSPOOL; |
- // FALLTHROUGH. |
- case BOYER_MOORE_HORSPOOL: |
- idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
- if (complete) return idx; |
- // Build the Good Suffix table and continue searching. |
- BoyerMoorePopulateGoodSuffixTable(pat); |
- algorithm = BOYER_MOORE; |
- // FALLTHROUGH. |
- case BOYER_MOORE: |
- return BoyerMooreIndexOf(sub, pat, idx); |
- } |
- UNREACHABLE(); |
- return -1; |
-} |
- |
- |
-// Dispatch to different search strategies for a single search. |
-// If searching multiple times on the same needle, the search |
-// strategy should only be computed once and then dispatch to different |
-// loops. |
-template <typename SubjectChar, typename PatternChar> |
-static int StringSearch(Vector<const SubjectChar> sub, |
- Vector<const PatternChar> pat, |
+static int SearchString(Vector<const SubjectChar> subject, |
+ Vector<const PatternChar> pattern, |
int start_index) { |
- bool ascii_subject = (sizeof(SubjectChar) == 1); |
- StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); |
- switch (strategy) { |
- case SEARCH_FAIL: return -1; |
- case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); |
- case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); |
- } |
- UNREACHABLE(); |
- return -1; |
+ StringSearch<PatternChar, SubjectChar> search(pattern); |
+ return search.Search(subject, start_index); |
} |
}} // namespace v8::internal |