| Index: src/string-search.h
|
| diff --git a/src/string-search.h b/src/string-search.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1503652bf1812d649eb97cdd1aa5a4f0ee81c98e
|
| --- /dev/null
|
| +++ b/src/string-search.h
|
| @@ -0,0 +1,463 @@
|
| +// Copyright 2010 the V8 project authors. All rights reserved.
|
| +// Redistribution and use in source and binary forms, with or without
|
| +// modification, are permitted provided that the following conditions are
|
| +// met:
|
| +//
|
| +// * Redistributions of source code must retain the above copyright
|
| +// notice, this list of conditions and the following disclaimer.
|
| +// * Redistributions in binary form must reproduce the above
|
| +// copyright notice, this list of conditions and the following
|
| +// disclaimer in the documentation and/or other materials provided
|
| +// with the distribution.
|
| +// * Neither the name of Google Inc. nor the names of its
|
| +// contributors may be used to endorse or promote products derived
|
| +// from this software without specific prior written permission.
|
| +//
|
| +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| +
|
| +#ifndef V8_STRING_SEARCH_H_
|
| +#define V8_STRING_SEARCH_H_
|
| +
|
| +namespace v8 {
|
| +namespace internal {
|
| +
|
| +
|
| +// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
| +// limit, we can fix the size of tables. For a needle longer than this limit,
|
| +// search will not be optimal, since we only build tables for a smaller suffix
|
| +// of the string, which is a safe approximation.
|
| +static const int kBMMaxShift = 250;
|
| +// Reduce alphabet to this size.
|
| +// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
|
| +// proportional to the input alphabet. We reduce the alphabet size by
|
| +// equating input characters modulo a smaller alphabet size. This gives
|
| +// a potentially less efficient searching, but is a safe approximation.
|
| +// For needles using only characters in the same Unicode 256-code point page,
|
| +// there is no search speed degradation.
|
| +static const int kBMAlphabetSize = 256;
|
| +// For patterns below this length, the skip length of Boyer-Moore is too short
|
| +// to compensate for the algorithmic overhead compared to simple brute force.
|
| +static const int kBMMinPatternLength = 7;
|
| +
|
| +// Holds the two buffers used by Boyer-Moore string search's Good Suffix
|
| +// shift. Only allows the last kBMMaxShift characters of the needle
|
| +// to be indexed.
|
| +class BMGoodSuffixBuffers {
|
| + public:
|
| + BMGoodSuffixBuffers() {}
|
| + inline void Initialize(int needle_length) {
|
| + ASSERT(needle_length > 1);
|
| + int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
|
| + int len = needle_length - start;
|
| + biased_suffixes_ = suffixes_ - start;
|
| + biased_good_suffix_shift_ = good_suffix_shift_ - start;
|
| + for (int i = 0; i <= len; i++) {
|
| + good_suffix_shift_[i] = len;
|
| + }
|
| + }
|
| + inline int& suffix(int index) {
|
| + ASSERT(biased_suffixes_ + index >= suffixes_);
|
| + return biased_suffixes_[index];
|
| + }
|
| + inline int& shift(int index) {
|
| + ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
|
| + return biased_good_suffix_shift_[index];
|
| + }
|
| + private:
|
| + int suffixes_[kBMMaxShift + 1];
|
| + int good_suffix_shift_[kBMMaxShift + 1];
|
| + int* biased_suffixes_;
|
| + int* biased_good_suffix_shift_;
|
| + DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
|
| +};
|
| +
|
| +// buffers reused by BoyerMoore
|
| +struct BMBuffers {
|
| + public:
|
| + static int bad_char_occurrence[kBMAlphabetSize];
|
| + static BMGoodSuffixBuffers bmgs_buffers;
|
| +};
|
| +
|
| +// State of the string match tables.
|
| +// SIMPLE: No usable content in the buffers.
|
| +// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.
|
| +// BOYER_MOORE: The bmgs_buffers tables have also been populated.
|
| +// Whenever starting with a new needle, one should call InitializeStringSearch
|
| +// to determine which search strategy to use, and in the case of a long-needle
|
| +// strategy, the call also initializes the algorithm to SIMPLE.
|
| +enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
|
| +static StringSearchAlgorithm algorithm;
|
| +
|
| +
|
| +// Compute the bad-char table for Boyer-Moore in the static buffer.
|
| +template <typename PatternChar>
|
| +static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {
|
| + // Only preprocess at most kBMMaxShift last characters of pattern.
|
| + int start = Max(pattern.length() - kBMMaxShift, 0);
|
| + // Run forwards to populate bad_char_table, so that *last* instance
|
| + // of character equivalence class is the one registered.
|
| + // Notice: Doesn't include the last character.
|
| + int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1
|
| + : kBMAlphabetSize;
|
| + if (start == 0) { // All patterns less than kBMMaxShift in length.
|
| + memset(BMBuffers::bad_char_occurrence,
|
| + -1,
|
| + table_size * sizeof(*BMBuffers::bad_char_occurrence));
|
| + } else {
|
| + for (int i = 0; i < table_size; i++) {
|
| + BMBuffers::bad_char_occurrence[i] = start - 1;
|
| + }
|
| + }
|
| + for (int i = start; i < pattern.length() - 1; i++) {
|
| + PatternChar c = pattern[i];
|
| + int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;
|
| + BMBuffers::bad_char_occurrence[bucket] = i;
|
| + }
|
| +}
|
| +
|
| +
|
| +template <typename PatternChar>
|
| +static void BoyerMoorePopulateGoodSuffixTable(
|
| + Vector<const PatternChar> pattern) {
|
| + int m = pattern.length();
|
| + int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
|
| + int len = m - start;
|
| + // Compute Good Suffix tables.
|
| + BMBuffers::bmgs_buffers.Initialize(m);
|
| +
|
| + BMBuffers::bmgs_buffers.shift(m-1) = 1;
|
| + BMBuffers::bmgs_buffers.suffix(m) = m + 1;
|
| + PatternChar last_char = pattern[m - 1];
|
| + int suffix = m + 1;
|
| + {
|
| + int i = m;
|
| + while (i > start) {
|
| + PatternChar c = pattern[i - 1];
|
| + while (suffix <= m && c != pattern[suffix - 1]) {
|
| + if (BMBuffers::bmgs_buffers.shift(suffix) == len) {
|
| + BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;
|
| + }
|
| + suffix = BMBuffers::bmgs_buffers.suffix(suffix);
|
| + }
|
| + BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
|
| + if (suffix == m) {
|
| + // No suffix to extend, so we check against last_char only.
|
| + while ((i > start) && (pattern[i - 1] != last_char)) {
|
| + if (BMBuffers::bmgs_buffers.shift(m) == len) {
|
| + BMBuffers::bmgs_buffers.shift(m) = m - i;
|
| + }
|
| + BMBuffers::bmgs_buffers.suffix(--i) = m;
|
| + }
|
| + if (i > start) {
|
| + BMBuffers::bmgs_buffers.suffix(--i) = --suffix;
|
| + }
|
| + }
|
| + }
|
| + }
|
| + if (suffix < m) {
|
| + for (int i = start; i <= m; i++) {
|
| + if (BMBuffers::bmgs_buffers.shift(i) == len) {
|
| + BMBuffers::bmgs_buffers.shift(i) = suffix - start;
|
| + }
|
| + if (i == suffix) {
|
| + suffix = BMBuffers::bmgs_buffers.suffix(suffix);
|
| + }
|
| + }
|
| + }
|
| +}
|
| +
|
| +
|
| +template <typename SubjectChar, typename PatternChar>
|
| +static inline int CharOccurrence(int char_code) {
|
| + if (sizeof(SubjectChar) == 1) {
|
| + return BMBuffers::bad_char_occurrence[char_code];
|
| + }
|
| + if (sizeof(PatternChar) == 1) {
|
| + if (char_code > String::kMaxAsciiCharCode) {
|
| + return -1;
|
| + }
|
| + return BMBuffers::bad_char_occurrence[char_code];
|
| + }
|
| + return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];
|
| +}
|
| +
|
| +
|
| +// Restricted simplified Boyer-Moore string matching.
|
| +// Uses only the bad-shift table of Boyer-Moore and only uses it
|
| +// for the character compared to the last character of the needle.
|
| +template <typename SubjectChar, typename PatternChar>
|
| +static int BoyerMooreHorspool(Vector<const SubjectChar> subject,
|
| + Vector<const PatternChar> pattern,
|
| + int start_index,
|
| + bool* complete) {
|
| + ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
|
| + int n = subject.length();
|
| + int m = pattern.length();
|
| +
|
| + int badness = -m;
|
| +
|
| + // How bad we are doing without a good-suffix table.
|
| + int idx; // No matches found prior to this index.
|
| + PatternChar last_char = pattern[m - 1];
|
| + int last_char_shift =
|
| + m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
|
| + // Perform search
|
| + for (idx = start_index; idx <= n - m;) {
|
| + int j = m - 1;
|
| + int c;
|
| + while (last_char != (c = subject[idx + j])) {
|
| + int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
|
| + int shift = j - bc_occ;
|
| + idx += shift;
|
| + badness += 1 - shift; // at most zero, so badness cannot increase.
|
| + if (idx > n - m) {
|
| + *complete = true;
|
| + return -1;
|
| + }
|
| + }
|
| + j--;
|
| + while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
|
| + if (j < 0) {
|
| + *complete = true;
|
| + return idx;
|
| + } else {
|
| + idx += last_char_shift;
|
| + // Badness increases by the number of characters we have
|
| + // checked, and decreases by the number of characters we
|
| + // can skip by shifting. It's a measure of how we are doing
|
| + // compared to reading each character exactly once.
|
| + badness += (m - j) - last_char_shift;
|
| + if (badness > 0) {
|
| + *complete = false;
|
| + return idx;
|
| + }
|
| + }
|
| + }
|
| + *complete = true;
|
| + return -1;
|
| +}
|
| +
|
| +
|
| +template <typename SubjectChar, typename PatternChar>
|
| +static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,
|
| + Vector<const PatternChar> pattern,
|
| + int idx) {
|
| + ASSERT(algorithm <= BOYER_MOORE);
|
| + int n = subject.length();
|
| + int m = pattern.length();
|
| + // Only preprocess at most kBMMaxShift last characters of pattern.
|
| + int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
|
| +
|
| + PatternChar last_char = pattern[m - 1];
|
| + // Continue search from i.
|
| + while (idx <= n - m) {
|
| + int j = m - 1;
|
| + SubjectChar c;
|
| + while (last_char != (c = subject[idx + j])) {
|
| + int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);
|
| + idx += shift;
|
| + if (idx > n - m) {
|
| + return -1;
|
| + }
|
| + }
|
| + while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
|
| + if (j < 0) {
|
| + return idx;
|
| + } else if (j < start) {
|
| + // we have matched more than our tables allow us to be smart about.
|
| + // Fall back on BMH shift.
|
| + idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);
|
| + } else {
|
| + int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); // Good suffix shift.
|
| + int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);
|
| + int shift = j - bc_occ; // Bad-char shift.
|
| + if (gs_shift > shift) {
|
| + shift = gs_shift;
|
| + }
|
| + idx += shift;
|
| + }
|
| + }
|
| +
|
| + return -1;
|
| +}
|
| +
|
| +
|
| +// Trivial string search for shorter strings.
|
| +// On return, if "complete" is set to true, the return value is the
|
| +// final result of searching for the patter in the subject.
|
| +// If "complete" is set to false, the return value is the index where
|
| +// further checking should start, i.e., it's guaranteed that the pattern
|
| +// does not occur at a position prior to the returned index.
|
| +template <typename PatternChar, typename SubjectChar>
|
| +static int SimpleIndexOf(Vector<const SubjectChar> subject,
|
| + Vector<const PatternChar> pattern,
|
| + int idx,
|
| + bool* complete) {
|
| + ASSERT(pattern.length() > 1);
|
| + int pattern_length = pattern.length();
|
| + // Badness is a count of how much work we have done. When we have
|
| + // done enough work we decide it's probably worth switching to a better
|
| + // algorithm.
|
| + int badness = -10 - (pattern_length << 2);
|
| +
|
| + // We know our pattern is at least 2 characters, we cache the first so
|
| + // the common case of the first character not matching is faster.
|
| + PatternChar pattern_first_char = pattern[0];
|
| + for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
|
| + badness++;
|
| + if (badness > 0) {
|
| + *complete = false;
|
| + return i;
|
| + }
|
| + if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
| + const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
| + memchr(subject.start() + i,
|
| + pattern_first_char,
|
| + n - i + 1));
|
| + if (pos == NULL) {
|
| + *complete = true;
|
| + return -1;
|
| + }
|
| + i = static_cast<int>(pos - subject.start());
|
| + } else {
|
| + if (subject[i] != pattern_first_char) continue;
|
| + }
|
| + int j = 1;
|
| + do {
|
| + if (pattern[j] != subject[i+j]) {
|
| + break;
|
| + }
|
| + j++;
|
| + } while (j < pattern_length);
|
| + if (j == pattern_length) {
|
| + *complete = true;
|
| + return i;
|
| + }
|
| + badness += j;
|
| + }
|
| + *complete = true;
|
| + return -1;
|
| +}
|
| +
|
| +// Simple indexOf that never bails out. For short patterns only.
|
| +template <typename PatternChar, typename SubjectChar>
|
| +static int SimpleIndexOf(Vector<const SubjectChar> subject,
|
| + Vector<const PatternChar> pattern,
|
| + int idx) {
|
| + int pattern_length = pattern.length();
|
| + PatternChar pattern_first_char = pattern[0];
|
| + for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
|
| + if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
|
| + const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
|
| + memchr(subject.start() + i,
|
| + pattern_first_char,
|
| + n - i + 1));
|
| + if (pos == NULL) return -1;
|
| + i = static_cast<int>(pos - subject.start());
|
| + } else {
|
| + if (subject[i] != pattern_first_char) continue;
|
| + }
|
| + int j = 1;
|
| + while (j < pattern_length) {
|
| + if (pattern[j] != subject[i+j]) {
|
| + break;
|
| + }
|
| + j++;
|
| + }
|
| + if (j == pattern_length) {
|
| + return i;
|
| + }
|
| + }
|
| + return -1;
|
| +}
|
| +
|
| +
|
| +// Strategy for searching for a string in another string.
|
| +enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
|
| +
|
| +
|
| +template <typename PatternChar>
|
| +static inline StringSearchStrategy InitializeStringSearch(
|
| + Vector<const PatternChar> pat, bool ascii_subject) {
|
| + // We have an ASCII haystack and a non-ASCII needle. Check if there
|
| + // really is a non-ASCII character in the needle and bail out if there
|
| + // is.
|
| + if (ascii_subject && sizeof(PatternChar) > 1) {
|
| + for (int i = 0; i < pat.length(); i++) {
|
| + uc16 c = pat[i];
|
| + if (c > String::kMaxAsciiCharCode) {
|
| + return SEARCH_FAIL;
|
| + }
|
| + }
|
| + }
|
| + if (pat.length() < kBMMinPatternLength) {
|
| + return SEARCH_SHORT;
|
| + }
|
| + algorithm = SIMPLE_SEARCH;
|
| + return SEARCH_LONG;
|
| +}
|
| +
|
| +
|
| +// Dispatch long needle searches to different algorithms.
|
| +template <typename SubjectChar, typename PatternChar>
|
| +static int ComplexIndexOf(Vector<const SubjectChar> sub,
|
| + Vector<const PatternChar> pat,
|
| + int start_index) {
|
| + ASSERT(pat.length() >= kBMMinPatternLength);
|
| + // Try algorithms in order of increasing setup cost and expected performance.
|
| + bool complete;
|
| + int idx = start_index;
|
| + switch (algorithm) {
|
| + case SIMPLE_SEARCH:
|
| + idx = SimpleIndexOf(sub, pat, idx, &complete);
|
| + if (complete) return idx;
|
| + BoyerMoorePopulateBadCharTable(pat);
|
| + algorithm = BOYER_MOORE_HORSPOOL;
|
| + // FALLTHROUGH.
|
| + case BOYER_MOORE_HORSPOOL:
|
| + idx = BoyerMooreHorspool(sub, pat, idx, &complete);
|
| + if (complete) return idx;
|
| + // Build the Good Suffix table and continue searching.
|
| + BoyerMoorePopulateGoodSuffixTable(pat);
|
| + algorithm = BOYER_MOORE;
|
| + // FALLTHROUGH.
|
| + case BOYER_MOORE:
|
| + return BoyerMooreIndexOf(sub, pat, idx);
|
| + }
|
| + UNREACHABLE();
|
| + return -1;
|
| +}
|
| +
|
| +
|
| +// Dispatch to different search strategies for a single search.
|
| +// If searching multiple times on the same needle, the search
|
| +// strategy should only be computed once and then dispatch to different
|
| +// loops.
|
| +template <typename SubjectChar, typename PatternChar>
|
| +static int StringSearch(Vector<const SubjectChar> sub,
|
| + Vector<const PatternChar> pat,
|
| + int start_index) {
|
| + bool ascii_subject = (sizeof(SubjectChar) == 1);
|
| + StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
|
| + switch (strategy) {
|
| + case SEARCH_FAIL: return -1;
|
| + case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
|
| + case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
|
| + }
|
| + UNREACHABLE();
|
| + return -1;
|
| +}
|
| +
|
| +}} // namespace v8::internal
|
| +
|
| +#endif // V8_STRING_SEARCH_H_
|
|
|