Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(118)

Unified Diff: src/runtime.cc

Issue 3291021: Move string-search functions to separate file. (Closed)
Patch Set: Address (some)co Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/string-search.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index f9c2e5bea6995953adc9f7cd1e720d337da3b37d..30ba42ff2042a8d34032fbc7ebf2a0c8fc27e9f7 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -47,6 +47,7 @@
#include "smart-pointer.h"
#include "stub-cache.h"
#include "v8threads.h"
+#include "string-search.h"
namespace v8 {
namespace internal {
@@ -2571,420 +2572,6 @@ static Object* Runtime_StringReplaceRegExpWithString(Arguments args) {
}
-// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
-// limit, we can fix the size of tables.
-static const int kBMMaxShift = 0xff;
-// Reduce alphabet to this size.
-static const int kBMAlphabetSize = 0x100;
-// For patterns below this length, the skip length of Boyer-Moore is too short
-// to compensate for the algorithmic overhead compared to simple brute force.
-static const int kBMMinPatternLength = 7;
-
-// Holds the two buffers used by Boyer-Moore string search's Good Suffix
-// shift. Only allows the last kBMMaxShift characters of the needle
-// to be indexed.
-class BMGoodSuffixBuffers {
- public:
- BMGoodSuffixBuffers() {}
- inline void init(int needle_length) {
- ASSERT(needle_length > 1);
- int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
- int len = needle_length - start;
- biased_suffixes_ = suffixes_ - start;
- biased_good_suffix_shift_ = good_suffix_shift_ - start;
- for (int i = 0; i <= len; i++) {
- good_suffix_shift_[i] = len;
- }
- }
- inline int& suffix(int index) {
- ASSERT(biased_suffixes_ + index >= suffixes_);
- return biased_suffixes_[index];
- }
- inline int& shift(int index) {
- ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
- return biased_good_suffix_shift_[index];
- }
- private:
- int suffixes_[kBMMaxShift + 1];
- int good_suffix_shift_[kBMMaxShift + 1];
- int* biased_suffixes_;
- int* biased_good_suffix_shift_;
- DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
-};
-
-// buffers reused by BoyerMoore
-static int bad_char_occurrence[kBMAlphabetSize];
-static BMGoodSuffixBuffers bmgs_buffers;
-
-// State of the string match tables.
-// SIMPLE: No usable content in the buffers.
-// BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated.
-// BOYER_MOORE: The bmgs_buffers tables have also been populated.
-// Whenever starting with a new needle, one should call InitializeStringSearch
-// to determine which search strategy to use, and in the case of a long-needle
-// strategy, the call also initializes the algorithm to SIMPLE.
-enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
-static StringSearchAlgorithm algorithm;
-
-
-// Compute the bad-char table for Boyer-Moore in the static buffer.
-template <typename pchar>
-static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) {
- // Only preprocess at most kBMMaxShift last characters of pattern.
- int start = pattern.length() < kBMMaxShift ? 0
- : pattern.length() - kBMMaxShift;
- // Run forwards to populate bad_char_table, so that *last* instance
- // of character equivalence class is the one registered.
- // Notice: Doesn't include the last character.
- int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1
- : kBMAlphabetSize;
- if (start == 0) { // All patterns less than kBMMaxShift in length.
- memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
- } else {
- for (int i = 0; i < table_size; i++) {
- bad_char_occurrence[i] = start - 1;
- }
- }
- for (int i = start; i < pattern.length() - 1; i++) {
- pchar c = pattern[i];
- int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize;
- bad_char_occurrence[bucket] = i;
- }
-}
-
-
-template <typename pchar>
-static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) {
- int m = pattern.length();
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
- int len = m - start;
- // Compute Good Suffix tables.
- bmgs_buffers.init(m);
-
- bmgs_buffers.shift(m-1) = 1;
- bmgs_buffers.suffix(m) = m + 1;
- pchar last_char = pattern[m - 1];
- int suffix = m + 1;
- for (int i = m; i > start;) {
- for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
- if (bmgs_buffers.shift(suffix) == len) {
- bmgs_buffers.shift(suffix) = suffix - i;
- }
- suffix = bmgs_buffers.suffix(suffix);
- }
- i--;
- suffix--;
- bmgs_buffers.suffix(i) = suffix;
- if (suffix == m) {
- // No suffix to extend, so we check against last_char only.
- while (i > start && pattern[i - 1] != last_char) {
- if (bmgs_buffers.shift(m) == len) {
- bmgs_buffers.shift(m) = m - i;
- }
- i--;
- bmgs_buffers.suffix(i) = m;
- }
- if (i > start) {
- i--;
- suffix--;
- bmgs_buffers.suffix(i) = suffix;
- }
- }
- }
- if (suffix < m) {
- for (int i = start; i <= m; i++) {
- if (bmgs_buffers.shift(i) == len) {
- bmgs_buffers.shift(i) = suffix - start;
- }
- if (i == suffix) {
- suffix = bmgs_buffers.suffix(suffix);
- }
- }
- }
-}
-
-
-template <typename schar, typename pchar>
-static inline int CharOccurrence(int char_code) {
- if (sizeof(schar) == 1) {
- return bad_char_occurrence[char_code];
- }
- if (sizeof(pchar) == 1) {
- if (char_code > String::kMaxAsciiCharCode) {
- return -1;
- }
- return bad_char_occurrence[char_code];
- }
- return bad_char_occurrence[char_code % kBMAlphabetSize];
-}
-
-
-// Restricted simplified Boyer-Moore string matching.
-// Uses only the bad-shift table of Boyer-Moore and only uses it
-// for the character compared to the last character of the needle.
-template <typename schar, typename pchar>
-static int BoyerMooreHorspool(Vector<const schar> subject,
- Vector<const pchar> pattern,
- int start_index,
- bool* complete) {
- ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
- int n = subject.length();
- int m = pattern.length();
-
- int badness = -m;
-
- // How bad we are doing without a good-suffix table.
- int idx; // No matches found prior to this index.
- pchar last_char = pattern[m - 1];
- int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char);
- // Perform search
- for (idx = start_index; idx <= n - m;) {
- int j = m - 1;
- int c;
- while (last_char != (c = subject[idx + j])) {
- int bc_occ = CharOccurrence<schar, pchar>(c);
- int shift = j - bc_occ;
- idx += shift;
- badness += 1 - shift; // at most zero, so badness cannot increase.
- if (idx > n - m) {
- *complete = true;
- return -1;
- }
- }
- j--;
- while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
- if (j < 0) {
- *complete = true;
- return idx;
- } else {
- idx += last_char_shift;
- // Badness increases by the number of characters we have
- // checked, and decreases by the number of characters we
- // can skip by shifting. It's a measure of how we are doing
- // compared to reading each character exactly once.
- badness += (m - j) - last_char_shift;
- if (badness > 0) {
- *complete = false;
- return idx;
- }
- }
- }
- *complete = true;
- return -1;
-}
-
-
-template <typename schar, typename pchar>
-static int BoyerMooreIndexOf(Vector<const schar> subject,
- Vector<const pchar> pattern,
- int idx) {
- ASSERT(algorithm <= BOYER_MOORE);
- int n = subject.length();
- int m = pattern.length();
- // Only preprocess at most kBMMaxShift last characters of pattern.
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
-
- pchar last_char = pattern[m - 1];
- // Continue search from i.
- while (idx <= n - m) {
- int j = m - 1;
- schar c;
- while (last_char != (c = subject[idx + j])) {
- int shift = j - CharOccurrence<schar, pchar>(c);
- idx += shift;
- if (idx > n - m) {
- return -1;
- }
- }
- while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
- if (j < 0) {
- return idx;
- } else if (j < start) {
- // we have matched more than our tables allow us to be smart about.
- // Fall back on BMH shift.
- idx += m - 1 - CharOccurrence<schar, pchar>(last_char);
- } else {
- int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
- int bc_occ = CharOccurrence<schar, pchar>(c);
- int shift = j - bc_occ; // Bad-char shift.
- if (gs_shift > shift) {
- shift = gs_shift;
- }
- idx += shift;
- }
- }
-
- return -1;
-}
-
-
-// Trivial string search for shorter strings.
-// On return, if "complete" is set to true, the return value is the
-// final result of searching for the patter in the subject.
-// If "complete" is set to false, the return value is the index where
-// further checking should start, i.e., it's guaranteed that the pattern
-// does not occur at a position prior to the returned index.
-template <typename pchar, typename schar>
-static int SimpleIndexOf(Vector<const schar> subject,
- Vector<const pchar> pattern,
- int idx,
- bool* complete) {
- ASSERT(pattern.length() > 1);
- int pattern_length = pattern.length();
- // Badness is a count of how much work we have done. When we have
- // done enough work we decide it's probably worth switching to a better
- // algorithm.
- int badness = -10 - (pattern_length << 2);
-
- // We know our pattern is at least 2 characters, we cache the first so
- // the common case of the first character not matching is faster.
- pchar pattern_first_char = pattern[0];
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
- badness++;
- if (badness > 0) {
- *complete = false;
- return i;
- }
- if (sizeof(schar) == 1 && sizeof(pchar) == 1) {
- const schar* pos = reinterpret_cast<const schar*>(
- memchr(subject.start() + i,
- pattern_first_char,
- n - i + 1));
- if (pos == NULL) {
- *complete = true;
- return -1;
- }
- i = static_cast<int>(pos - subject.start());
- } else {
- if (subject[i] != pattern_first_char) continue;
- }
- int j = 1;
- do {
- if (pattern[j] != subject[i+j]) {
- break;
- }
- j++;
- } while (j < pattern_length);
- if (j == pattern_length) {
- *complete = true;
- return i;
- }
- badness += j;
- }
- *complete = true;
- return -1;
-}
-
-// Simple indexOf that never bails out. For short patterns only.
-template <typename pchar, typename schar>
-static int SimpleIndexOf(Vector<const schar> subject,
- Vector<const pchar> pattern,
- int idx) {
- int pattern_length = pattern.length();
- pchar pattern_first_char = pattern[0];
- for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
- if (sizeof(schar) == 1 && sizeof(pchar) == 1) {
- const schar* pos = reinterpret_cast<const schar*>(
- memchr(subject.start() + i,
- pattern_first_char,
- n - i + 1));
- if (pos == NULL) return -1;
- i = static_cast<int>(pos - subject.start());
- } else {
- if (subject[i] != pattern_first_char) continue;
- }
- int j = 1;
- while (j < pattern_length) {
- if (pattern[j] != subject[i+j]) {
- break;
- }
- j++;
- }
- if (j == pattern_length) {
- return i;
- }
- }
- return -1;
-}
-
-
-// Strategy for searching for a string in another string.
-enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
-
-
-template <typename pchar>
-static inline StringSearchStrategy InitializeStringSearch(
- Vector<const pchar> pat, bool ascii_subject) {
- // We have an ASCII haystack and a non-ASCII needle. Check if there
- // really is a non-ASCII character in the needle and bail out if there
- // is.
- if (ascii_subject && sizeof(pchar) > 1) {
- for (int i = 0; i < pat.length(); i++) {
- uc16 c = pat[i];
- if (c > String::kMaxAsciiCharCode) {
- return SEARCH_FAIL;
- }
- }
- }
- if (pat.length() < kBMMinPatternLength) {
- return SEARCH_SHORT;
- }
- algorithm = SIMPLE_SEARCH;
- return SEARCH_LONG;
-}
-
-
-// Dispatch long needle searches to different algorithms.
-template <typename schar, typename pchar>
-static int ComplexIndexOf(Vector<const schar> sub,
- Vector<const pchar> pat,
- int start_index) {
- ASSERT(pat.length() >= kBMMinPatternLength);
- // Try algorithms in order of increasing setup cost and expected performance.
- bool complete;
- int idx = start_index;
- switch (algorithm) {
- case SIMPLE_SEARCH:
- idx = SimpleIndexOf(sub, pat, idx, &complete);
- if (complete) return idx;
- BoyerMoorePopulateBadCharTable(pat);
- algorithm = BOYER_MOORE_HORSPOOL;
- // FALLTHROUGH.
- case BOYER_MOORE_HORSPOOL:
- idx = BoyerMooreHorspool(sub, pat, idx, &complete);
- if (complete) return idx;
- // Build the Good Suffix table and continue searching.
- BoyerMoorePopulateGoodSuffixTable(pat);
- algorithm = BOYER_MOORE;
- // FALLTHROUGH.
- case BOYER_MOORE:
- return BoyerMooreIndexOf(sub, pat, idx);
- }
- UNREACHABLE();
- return -1;
-}
-
-
-// Dispatch to different search strategies for a single search.
-// If searching multiple times on the same needle, the search
-// strategy should only be computed once and then dispatch to different
-// loops.
-template <typename schar, typename pchar>
-static int StringSearch(Vector<const schar> sub,
- Vector<const pchar> pat,
- int start_index) {
- bool ascii_subject = (sizeof(schar) == 1);
- StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
- switch (strategy) {
- case SEARCH_FAIL: return -1;
- case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
- case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
- }
- UNREACHABLE();
- return -1;
-}
-
-
// Perform string match of pattern on subject, starting at start index.
// Caller must ensure that 0 <= start_index <= sub->length(),
// and should check that pat->length() + start_index <= sub->length()
@@ -5170,6 +4757,12 @@ static Object* Runtime_StringTrim(Arguments args) {
}
+// Define storage for buffers declared in header file.
+// TODO(lrn): Remove these when rewriting search code.
+int BMBuffers::bad_char_occurrence[kBMAlphabetSize];
+BMGoodSuffixBuffers BMBuffers::bmgs_buffers;
+
+
template <typename schar, typename pchar>
void FindStringIndices(Vector<const schar> subject,
Vector<const pchar> pattern,
« no previous file with comments | « no previous file | src/string-search.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698