Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(200)

Unified Diff: src/string-search.h

Issue 3467010: Refactored string search code. (Closed)
Patch Set: Added to xcode project too. Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/runtime.cc ('k') | tools/gyp/v8.gyp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/string-search.h
diff --git a/src/string-search.h b/src/string-search.h
index d7959c0be982c35208c14cfc8803ec00a7f14a9b..1de375018b1d0c9cdd69d2dabceaf077d259f1a7 100644
--- a/src/string-search.h
+++ b/src/string-search.h
@@ -32,278 +32,483 @@ namespace v8 {
namespace internal {
-// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
-// limit, we can fix the size of tables. For a needle longer than this limit,
-// search will not be optimal, since we only build tables for a smaller suffix
-// of the string, which is a safe approximation.
-static const int kBMMaxShift = 250;
-// Reduce alphabet to this size.
-// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
-// proportional to the input alphabet. We reduce the alphabet size by
-// equating input characters modulo a smaller alphabet size. This gives
-// a potentially less efficient searching, but is a safe approximation.
-// For needles using only characters in the same Unicode 256-code point page,
-// there is no search speed degradation.
-static const int kBMAlphabetSize = 256;
-// For patterns below this length, the skip length of Boyer-Moore is too short
-// to compensate for the algorithmic overhead compared to simple brute force.
-static const int kBMMinPatternLength = 7;
-
-// Holds the two buffers used by Boyer-Moore string search's Good Suffix
-// shift. Only allows the last kBMMaxShift characters of the needle
-// to be indexed.
-class BMGoodSuffixBuffers {
+//---------------------------------------------------------------------
+// String Search object.
+//---------------------------------------------------------------------
+
+// Class holding constants and methods that apply to all string search variants,
+// independently of subject and pattern char size.
+class StringSearchBase {
+ protected:
+ // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
+ // limit, we can fix the size of tables. For a needle longer than this limit,
+ // search will not be optimal, since we only build tables for a suffix
+ // of the string, but it is a safe approximation.
+ static const int kBMMaxShift = 250;
+
+ // Reduce alphabet to this size.
+ // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
+ // proportional to the input alphabet. We reduce the alphabet size by
+ // equating input characters modulo a smaller alphabet size. This gives
+ // a potentially less efficient searching, but is a safe approximation.
+ // For needles using only characters in the same Unicode 256-code point page,
+ // there is no search speed degradation.
+ static const int kAsciiAlphabetSize = 128;
+ static const int kUC16AlphabetSize = 256;
+
+ // Bad-char shift table stored in the state. It's length is the alphabet size.
+ // For patterns below this length, the skip length of Boyer-Moore is too short
+ // to compensate for the algorithmic overhead compared to simple brute force.
+ static const int kBMMinPatternLength = 7;
+
+ static inline bool IsAsciiString(Vector<const char>) {
+ return true;
+ }
+
+ static inline bool IsAsciiString(Vector<const uc16> string) {
+ for (int i = 0, n = string.length(); i < n; i++) {
+ if (static_cast<unsigned>(string[i]) > String::kMaxAsciiCharCodeU) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // The following tables are shared by all searches.
+ // TODO(lrn): Introduce a way for a pattern to keep its tables
+ // between searches (e.g., for an Atom RegExp).
+
+ // Store for the BoyerMoore(Horspool) bad char shift table.
+ static int kBadCharShiftTable[kUC16AlphabetSize];
+ // Store for the BoyerMoore good suffix shift table.
+ static int kGoodSuffixShiftTable[kBMMaxShift + 1];
+ // Table used temporarily while building the BoyerMoore good suffix
+ // shift table.
+ static int kSuffixTable[kBMMaxShift + 1];
+};
+
+
+template <typename PatternChar, typename SubjectChar>
+class StringSearch : private StringSearchBase {
public:
- BMGoodSuffixBuffers() {}
- inline void Initialize(int needle_length) {
- ASSERT(needle_length > 1);
- int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
- int len = needle_length - start;
- biased_suffixes_ = suffixes_ - start;
- biased_good_suffix_shift_ = good_suffix_shift_ - start;
- for (int i = 0; i <= len; i++) {
- good_suffix_shift_[i] = len;
+ explicit StringSearch(Vector<const PatternChar> pattern)
+ : pattern_(pattern),
+ start_(Max(0, pattern.length() - kBMMaxShift)) {
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+ if (!IsAsciiString(pattern_)) {
+ strategy_ = &FailSearch;
+ return;
+ }
+ }
+ int pattern_length = pattern_.length();
+ if (pattern_length < kBMMinPatternLength) {
+ if (pattern_length == 1) {
+ strategy_ = &SingleCharSearch;
+ return;
+ }
+ strategy_ = &LinearSearch;
+ return;
}
+ strategy_ = &InitialSearch;
}
- inline int& suffix(int index) {
- ASSERT(biased_suffixes_ + index >= suffixes_);
- return biased_suffixes_[index];
+
+ int Search(Vector<const SubjectChar> subject, int index) {
+ return strategy_(this, subject, index);
}
- inline int& shift(int index) {
- ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
- return biased_good_suffix_shift_[index];
+
+ static inline int AlphabetSize() {
+ if (sizeof(PatternChar) == 1) {
+ // ASCII needle.
+ return kAsciiAlphabetSize;
+ } else {
+ ASSERT(sizeof(PatternChar) == 2);
+ // UC16 needle.
+ return kUC16AlphabetSize;
+ }
}
+
private:
- int suffixes_[kBMMaxShift + 1];
- int good_suffix_shift_[kBMMaxShift + 1];
- int* biased_suffixes_;
- int* biased_good_suffix_shift_;
- DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
-};
+ typedef int (*SearchFunction)( // NOLINT - it's not a cast!
+ StringSearch<PatternChar, SubjectChar>*,
+ Vector<const SubjectChar>,
+ int);
+
+ static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
+ Vector<const SubjectChar>,
+ int) {
+ return -1;
+ }
-// buffers reused by BoyerMoore
-struct BMBuffers {
- public:
- static int bad_char_occurrence[kBMAlphabetSize];
- static BMGoodSuffixBuffers bmgs_buffers;
+ static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ static int BoyerMooreHorspoolSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index);
+
+ void PopulateBoyerMooreHorspoolTable();
+
+ void PopulateBoyerMooreTable();
+
+ static inline int CharOccurrence(int* bad_char_occurrence,
+ SubjectChar char_code) {
+ if (sizeof(SubjectChar) == 1) {
+ return bad_char_occurrence[static_cast<int>(char_code)];
+ }
+ if (sizeof(PatternChar) == 1) {
+ if (static_cast<unsigned char>(char_code) > String::kMaxAsciiCharCode) {
+ return -1;
+ }
+ return bad_char_occurrence[static_cast<int>(char_code)];
+ }
+ // Reduce to equivalence class.
+ int equiv_class = char_code % kUC16AlphabetSize;
+ return bad_char_occurrence[equiv_class];
+ }
+
+ // Return a table covering the last kBMMaxShift+1 positions of
+ // pattern.
+ int* bad_char_table() {
+ return kBadCharShiftTable;
+ }
+
+ int* good_suffix_shift_table() {
+ // Return biased pointer that maps the range [start_..pattern_.length()
+ // to the kGoodSuffixShiftTable array.
+ return kGoodSuffixShiftTable - start_;
+ }
+
+ int* suffix_table() {
+ // Return biased pointer that maps the range [start_..pattern_.length()
+ // to the kSuffixTable array.
+ return kSuffixTable - start_;
+ }
+
+ // The pattern to search for.
+ Vector<const PatternChar> pattern_;
+ // Pointer to implementation of the search.
+ SearchFunction strategy_;
+ // Cache value of Max(0, pattern_length() - kBMMaxShift)
+ int start_;
};
-// State of the string match tables.
-// SIMPLE: No usable content in the buffers.
-// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.
-// BOYER_MOORE: The bmgs_buffers tables have also been populated.
-// Whenever starting with a new needle, one should call InitializeStringSearch
-// to determine which search strategy to use, and in the case of a long-needle
-// strategy, the call also initializes the algorithm to SIMPLE.
-enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
-static StringSearchAlgorithm algorithm;
+//---------------------------------------------------------------------
+// Single Character Pattern Search Strategy
+//---------------------------------------------------------------------
-// Compute the bad-char table for Boyer-Moore in the static buffer.
-template <typename PatternChar>
-static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {
- // Only preprocess at most kBMMaxShift last characters of pattern.
- int start = Max(pattern.length() - kBMMaxShift, 0);
- // Run forwards to populate bad_char_table, so that *last* instance
- // of character equivalence class is the one registered.
- // Notice: Doesn't include the last character.
- int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1
- : kBMAlphabetSize;
- if (start == 0) { // All patterns less than kBMMaxShift in length.
- memset(BMBuffers::bad_char_occurrence,
- -1,
- table_size * sizeof(*BMBuffers::bad_char_occurrence));
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int index) {
+ ASSERT_EQ(1, search->pattern_.length());
+ PatternChar pattern_first_char = search->pattern_[0];
+ int i = index;
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+ memchr(subject.start() + i,
+ pattern_first_char,
+ subject.length() - i));
+ if (pos == NULL) return -1;
+ return static_cast<int>(pos - subject.start());
} else {
- for (int i = 0; i < table_size; i++) {
- BMBuffers::bad_char_occurrence[i] = start - 1;
+ if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+ if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
+ return -1;
+ }
}
+ SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
+ int n = subject.length();
+ while (i < n) {
+ if (subject[i++] == search_char) return i - 1;
+ }
+ return -1;
}
- for (int i = start; i < pattern.length() - 1; i++) {
- PatternChar c = pattern[i];
- int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;
- BMBuffers::bad_char_occurrence[bucket] = i;
+}
+
+//---------------------------------------------------------------------
+// Linear Search Strategy
+//---------------------------------------------------------------------
+
+
+template <typename PatternChar, typename SubjectChar>
+static inline bool CharCompare(const PatternChar* pattern,
+ const SubjectChar* subject,
+ int length) {
+ ASSERT(length > 0);
+ int pos = 0;
+ do {
+ if (pattern[pos] != subject[pos]) {
+ return false;
+ }
+ pos++;
+ } while (pos < length);
+ return true;
+}
+
+
+// Simple linear search for short patterns. Never bails out.
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::LinearSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ ASSERT(pattern.length() > 1);
+ int pattern_length = pattern.length();
+ PatternChar pattern_first_char = pattern[0];
+ int i = index;
+ int n = subject.length() - pattern_length;
+ while (i <= n) {
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+ memchr(subject.start() + i,
+ pattern_first_char,
+ n - i + 1));
+ if (pos == NULL) return -1;
+ i = static_cast<int>(pos - subject.start()) + 1;
+ } else {
+ if (subject[i++] != pattern_first_char) continue;
+ }
+ // Loop extracted to separate function to allow using return to do
+ // a deeper break.
+ if (CharCompare(pattern.start() + 1,
+ subject.start() + i,
+ pattern_length - 1)) {
+ return i - 1;
+ }
+ }
+ return -1;
+}
+
+//---------------------------------------------------------------------
+// Boyer-Moore string search
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ int subject_length = subject.length();
+ int pattern_length = pattern.length();
+ // Only preprocess at most kBMMaxShift last characters of pattern.
+ int start = search->start_;
+
+ int* bad_char_occurence = search->bad_char_table();
+ int* good_suffix_shift = search->good_suffix_shift_table();
+
+ PatternChar last_char = pattern[pattern_length - 1];
+ int index = start_index;
+ // Continue search from i.
+ while (index <= subject_length - pattern_length) {
+ int j = pattern_length - 1;
+ int c;
+ while (last_char != (c = subject[index + j])) {
+ int shift =
+ j - CharOccurrence(bad_char_occurence, c);
+ index += shift;
+ if (index > subject_length - pattern_length) {
+ return -1;
+ }
+ }
+ while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
+ if (j < 0) {
+ return index;
+ } else if (j < start) {
+ // we have matched more than our tables allow us to be smart about.
+ // Fall back on BMH shift.
+ index +=
+ pattern_length - 1 - CharOccurrence(bad_char_occurence, last_char);
+ } else {
+ int gs_shift = good_suffix_shift[j + 1];
+ int bc_occ =
+ CharOccurrence(bad_char_occurence, c);
+ int shift = j - bc_occ;
+ if (gs_shift > shift) {
+ shift = gs_shift;
+ }
+ index += shift;
+ }
}
+
+ return -1;
}
-template <typename PatternChar>
-static void BoyerMoorePopulateGoodSuffixTable(
- Vector<const PatternChar> pattern) {
- int m = pattern.length();
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
- int len = m - start;
- // Compute Good Suffix tables.
- BMBuffers::bmgs_buffers.Initialize(m);
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
+ int pattern_length = pattern_.length();
+ const PatternChar* pattern = pattern_.start();
+ // Only look at the last kBMMaxShift characters of pattern (from start_
+ // to pattern_length).
+ int start = start_;
+ int length = pattern_length - start;
+
+ // Biased tables so that we can use pattern indices as table indices,
+ // even if we only cover the part of the pattern from offset start.
+ int* shift_table = good_suffix_shift_table();
+ int* suffix_table = this->suffix_table();
+
+ // Initialize table.
+ for (int i = start; i < pattern_length; i++) {
+ shift_table[i] = length;
+ }
+ shift_table[pattern_length] = 1;
+ suffix_table[pattern_length] = pattern_length + 1;
- BMBuffers::bmgs_buffers.shift(m-1) = 1;
- BMBuffers::bmgs_buffers.suffix(m) = m + 1;
- PatternChar last_char = pattern[m - 1];
- int suffix = m + 1;
+ // Find suffixes.
+ PatternChar last_char = pattern[pattern_length - 1];
+ int suffix = pattern_length + 1;
{
- int i = m;
+ int i = pattern_length;
while (i > start) {
PatternChar c = pattern[i - 1];
- while (suffix <= m && c != pattern[suffix - 1]) {
- if (BMBuffers::bmgs_buffers.shift(suffix) == len) {
- BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;
+ while (suffix <= pattern_length && c != pattern[suffix - 1]) {
+ if (shift_table[suffix] == length) {
+ shift_table[suffix] = suffix - i;
}
- suffix = BMBuffers::bmgs_buffers.suffix(suffix);
+ suffix = suffix_table[suffix];
}
- BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
- if (suffix == m) {
+ suffix_table[--i] = --suffix;
+ if (suffix == pattern_length) {
// No suffix to extend, so we check against last_char only.
while ((i > start) && (pattern[i - 1] != last_char)) {
- if (BMBuffers::bmgs_buffers.shift(m) == len) {
- BMBuffers::bmgs_buffers.shift(m) = m - i;
+ if (shift_table[pattern_length] == length) {
+ shift_table[pattern_length] = pattern_length - i;
}
- BMBuffers::bmgs_buffers.suffix(--i) = m;
+ suffix_table[--i] = pattern_length;
}
if (i > start) {
- BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
+ suffix_table[--i] = --suffix;
}
}
}
}
- if (suffix < m) {
- for (int i = start; i <= m; i++) {
- if (BMBuffers::bmgs_buffers.shift(i) == len) {
- BMBuffers::bmgs_buffers.shift(i) = suffix - start;
+ // Build shift table using suffixes.
+ if (suffix < pattern_length) {
+ for (int i = start; i <= pattern_length; i++) {
+ if (shift_table[i] == length) {
+ shift_table[i] = suffix - start;
}
if (i == suffix) {
- suffix = BMBuffers::bmgs_buffers.suffix(suffix);
+ suffix = suffix_table[suffix];
}
}
}
}
+//---------------------------------------------------------------------
+// Boyer-Moore-Horspool string search.
+//---------------------------------------------------------------------
-template <typename SubjectChar, typename PatternChar>
-static inline int CharOccurrence(int char_code) {
- if (sizeof(SubjectChar) == 1) {
- return BMBuffers::bad_char_occurrence[char_code];
- }
- if (sizeof(PatternChar) == 1) {
- if (char_code > String::kMaxAsciiCharCode) {
- return -1;
- }
- return BMBuffers::bad_char_occurrence[char_code];
- }
- return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];
-}
-
-
-// Restricted simplified Boyer-Moore string matching.
-// Uses only the bad-shift table of Boyer-Moore and only uses it
-// for the character compared to the last character of the needle.
-template <typename SubjectChar, typename PatternChar>
-static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
- Vector<const PatternChar> pattern,
- int start_index,
- bool* complete) {
- ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
- int n = subject.length();
- int m = pattern.length();
-
- int badness = -m;
+template <typename PatternChar, typename SubjectChar>
+int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int start_index) {
+ Vector<const PatternChar> pattern = search->pattern_;
+ int subject_length = subject.length();
+ int pattern_length = pattern.length();
+ int* char_occurrences = search->bad_char_table();
+ int badness = -pattern_length;
// How bad we are doing without a good-suffix table.
- int idx; // No matches found prior to this index.
- PatternChar last_char = pattern[m - 1];
- int last_char_shift =
- m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
+ PatternChar last_char = pattern[pattern_length - 1];
+ int last_char_shift = pattern_length - 1 -
+ CharOccurrence(char_occurrences, last_char);
// Perform search
- for (idx = start_index; idx <= n - m;) {
- int j = m - 1;
- int c;
- while (last_char != (c = subject[idx + j])) {
- int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
+ int index = start_index; // No matches found prior to this index.
+ while (index <= subject_length - pattern_length) {
+ int j = pattern_length - 1;
+ int subject_char;
+ while (last_char != (subject_char = subject[index + j])) {
+ int bc_occ = CharOccurrence(char_occurrences, subject_char);
int shift = j - bc_occ;
- idx += shift;
+ index += shift;
badness += 1 - shift; // at most zero, so badness cannot increase.
- if (idx > n - m) {
- *complete = true;
+ if (index > subject_length - pattern_length) {
return -1;
}
}
j--;
- while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
+ while (j >= 0 && pattern[j] == (subject[index + j])) j--;
if (j < 0) {
- *complete = true;
- return idx;
+ return index;
} else {
- idx += last_char_shift;
+ index += last_char_shift;
// Badness increases by the number of characters we have
// checked, and decreases by the number of characters we
// can skip by shifting. It's a measure of how we are doing
// compared to reading each character exactly once.
- badness += (m - j) - last_char_shift;
+ badness += (pattern_length - j) - last_char_shift;
if (badness > 0) {
- *complete = false;
- return idx;
+ search->PopulateBoyerMooreTable();
+ search->strategy_ = &BoyerMooreSearch;
+ return BoyerMooreSearch(search, subject, index);
}
}
}
- *complete = true;
return -1;
}
-template <typename SubjectChar, typename PatternChar>
-static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,
- Vector<const PatternChar> pattern,
- int idx) {
- ASSERT(algorithm <= BOYER_MOORE);
- int n = subject.length();
- int m = pattern.length();
- // Only preprocess at most kBMMaxShift last characters of pattern.
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
+ int pattern_length = pattern_.length();
- PatternChar last_char = pattern[m - 1];
- // Continue search from i.
- while (idx <= n - m) {
- int j = m - 1;
- SubjectChar c;
- while (last_char != (c = subject[idx + j])) {
- int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);
- idx += shift;
- if (idx > n - m) {
- return -1;
- }
- }
- while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
- if (j < 0) {
- return idx;
- } else if (j < start) {
- // we have matched more than our tables allow us to be smart about.
- // Fall back on BMH shift.
- idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
- } else {
- int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1);
- int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
- int shift = j - bc_occ;
- if (gs_shift > shift) {
- shift = gs_shift;
- }
- idx += shift;
+ int* bad_char_occurrence = bad_char_table();
+
+ // Only preprocess at most kBMMaxShift last characters of pattern.
+ int start = start_;
+ // Run forwards to populate bad_char_table, so that *last* instance
+ // of character equivalence class is the one registered.
+ // Notice: Doesn't include the last character.
+ int table_size = AlphabetSize();
+ if (start == 0) { // All patterns less than kBMMaxShift in length.
+ memset(bad_char_occurrence,
+ -1,
+ table_size * sizeof(*bad_char_occurrence));
+ } else {
+ for (int i = 0; i < table_size; i++) {
+ bad_char_occurrence[i] = start - 1;
}
}
-
- return -1;
+ for (int i = start; i < pattern_length - 1; i++) {
+ PatternChar c = pattern_[i];
+ int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
+ bad_char_occurrence[bucket] = i;
+ }
}
+//---------------------------------------------------------------------
+// Linear string search with bailout to BMH.
+//---------------------------------------------------------------------
-// Trivial string search for shorter strings.
-// On return, if "complete" is set to true, the return value is the
-// final result of searching for the patter in the subject.
-// If "complete" is set to false, the return value is the index where
-// further checking should start, i.e., it's guaranteed that the pattern
-// does not occur at a position prior to the returned index.
+// Simple linear search for short patterns, which bails out if the string
+// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
template <typename PatternChar, typename SubjectChar>
-static int SimpleIndexOf(Vector<const SubjectChar> subject,
- Vector<const PatternChar> pattern,
- int idx,
- bool* complete) {
- ASSERT(pattern.length() > 1);
+int StringSearch<PatternChar, SubjectChar>::InitialSearch(
+ StringSearch<PatternChar, SubjectChar>* search,
+ Vector<const SubjectChar> subject,
+ int index) {
+ Vector<const PatternChar> pattern = search->pattern_;
int pattern_length = pattern.length();
// Badness is a count of how much work we have done. When we have
// done enough work we decide it's probably worth switching to a better
@@ -313,149 +518,52 @@ static int SimpleIndexOf(Vector<const SubjectChar> subject,
// We know our pattern is at least 2 characters, we cache the first so
// the common case of the first character not matching is faster.
PatternChar pattern_first_char = pattern[0];
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
+ for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
badness++;
- if (badness > 0) {
- *complete = false;
- return i;
- }
- if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
- const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
- memchr(subject.start() + i,
- pattern_first_char,
- n - i + 1));
- if (pos == NULL) {
- *complete = true;
- return -1;
+ if (badness <= 0) {
+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+ memchr(subject.start() + i,
+ pattern_first_char,
+ n - i + 1));
+ if (pos == NULL) {
+ return -1;
+ }
+ i = static_cast<int>(pos - subject.start());
+ } else {
+ if (subject[i] != pattern_first_char) continue;
}
- i = static_cast<int>(pos - subject.start());
- } else {
- if (subject[i] != pattern_first_char) continue;
- }
- int j = 1;
- do {
- if (pattern[j] != subject[i+j]) {
- break;
+ int j = 1;
+ do {
+ if (pattern[j] != subject[i + j]) {
+ break;
+ }
+ j++;
+ } while (j < pattern_length);
+ if (j == pattern_length) {
+ return i;
}
- j++;
- } while (j < pattern_length);
- if (j == pattern_length) {
- *complete = true;
- return i;
- }
- badness += j;
- }
- *complete = true;
- return -1;
-}
-
-// Simple indexOf that never bails out. For short patterns only.
-template <typename PatternChar, typename SubjectChar>
-static int SimpleIndexOf(Vector<const SubjectChar> subject,
- Vector<const PatternChar> pattern,
- int idx) {
- int pattern_length = pattern.length();
- PatternChar pattern_first_char = pattern[0];
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
- if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
- const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
- memchr(subject.start() + i,
- pattern_first_char,
- n - i + 1));
- if (pos == NULL) return -1;
- i = static_cast<int>(pos - subject.start());
+ badness += j;
} else {
- if (subject[i] != pattern_first_char) continue;
- }
- int j = 1;
- while (j < pattern_length) {
- if (pattern[j] != subject[i+j]) {
- break;
- }
- j++;
- }
- if (j == pattern_length) {
- return i;
+ search->PopulateBoyerMooreHorspoolTable();
+ search->strategy_ = &BoyerMooreHorspoolSearch;
+ return BoyerMooreHorspoolSearch(search, subject, i);
}
}
return -1;
}
-// Strategy for searching for a string in another string.
-enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
-
-
-template <typename PatternChar>
-static inline StringSearchStrategy InitializeStringSearch(
- Vector<const PatternChar> pat, bool ascii_subject) {
- // We have an ASCII haystack and a non-ASCII needle. Check if there
- // really is a non-ASCII character in the needle and bail out if there
- // is.
- if (ascii_subject && sizeof(PatternChar) > 1) {
- for (int i = 0; i < pat.length(); i++) {
- uc16 c = pat[i];
- if (c > String::kMaxAsciiCharCode) {
- return SEARCH_FAIL;
- }
- }
- }
- if (pat.length() < kBMMinPatternLength) {
- return SEARCH_SHORT;
- }
- algorithm = SIMPLE_SEARCH;
- return SEARCH_LONG;
-}
-
-
-// Dispatch long needle searches to different algorithms.
+// Perform a a single stand-alone search.
+// If searching multiple times for the same pattern, a search
+// object should be constructed once and the Search function then called
+// for each search.
template <typename SubjectChar, typename PatternChar>
-static int ComplexIndexOf(Vector<const SubjectChar> sub,
- Vector<const PatternChar> pat,
- int start_index) {
- ASSERT(pat.length() >= kBMMinPatternLength);
- // Try algorithms in order of increasing setup cost and expected performance.
- bool complete;
- int idx = start_index;
- switch (algorithm) {
- case SIMPLE_SEARCH:
- idx = SimpleIndexOf(sub, pat, idx, &complete);
- if (complete) return idx;
- BoyerMoorePopulateBadCharTable(pat);
- algorithm = BOYER_MOORE_HORSPOOL;
- // FALLTHROUGH.
- case BOYER_MOORE_HORSPOOL:
- idx = BoyerMooreHorspool(sub, pat, idx, &complete);
- if (complete) return idx;
- // Build the Good Suffix table and continue searching.
- BoyerMoorePopulateGoodSuffixTable(pat);
- algorithm = BOYER_MOORE;
- // FALLTHROUGH.
- case BOYER_MOORE:
- return BoyerMooreIndexOf(sub, pat, idx);
- }
- UNREACHABLE();
- return -1;
-}
-
-
-// Dispatch to different search strategies for a single search.
-// If searching multiple times on the same needle, the search
-// strategy should only be computed once and then dispatch to different
-// loops.
-template <typename SubjectChar, typename PatternChar>
-static int StringSearch(Vector<const SubjectChar> sub,
- Vector<const PatternChar> pat,
+static int SearchString(Vector<const SubjectChar> subject,
+ Vector<const PatternChar> pattern,
int start_index) {
- bool ascii_subject = (sizeof(SubjectChar) == 1);
- StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
- switch (strategy) {
- case SEARCH_FAIL: return -1;
- case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
- case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
- }
- UNREACHABLE();
- return -1;
+ StringSearch<PatternChar, SubjectChar> search(pattern);
+ return search.Search(subject, start_index);
}
}} // namespace v8::internal
« no previous file with comments | « src/runtime.cc ('k') | tools/gyp/v8.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698