src/string-search.h - Issue 3291021: Move string-search functions to separate file.

Unified Diff: src/string-search.h

Issue 3291021: Move string-search functions to separate file. (Closed)

Patch Set: Address (some)co Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/string-search.h

diff --git a/src/string-search.h b/src/string-search.h

new file mode 100644

index 0000000000000000000000000000000000000000..1503652bf1812d649eb97cdd1aa5a4f0ee81c98e

--- /dev/null

+++ b/src/string-search.h

@@ -0,0 +1,463 @@

+// Redistribution and use in source and binary forms, with or without

+// modification, are permitted provided that the following conditions are

+// met:

+//

+// * Redistributions of source code must retain the above copyright

+// notice, this list of conditions and the following disclaimer.

+// * Redistributions in binary form must reproduce the above

+// copyright notice, this list of conditions and the following

+// disclaimer in the documentation and/or other materials provided

+// with the distribution.

+// * Neither the name of Google Inc. nor the names of its

+// contributors may be used to endorse or promote products derived

+// from this software without specific prior written permission.

+//

+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#ifndef V8_STRING_SEARCH_H_

+#define V8_STRING_SEARCH_H_

+namespace v8 {

+namespace internal {

+// Cap on the maximal shift in the Boyer-Moore implementation. By setting a

+// limit, we can fix the size of tables. For a needle longer than this limit,

+// search will not be optimal, since we only build tables for a smaller suffix

+// of the string, which is a safe approximation.

+static const int kBMMaxShift = 250;

+// Reduce alphabet to this size.

+// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size

+// proportional to the input alphabet. We reduce the alphabet size by

+// equating input characters modulo a smaller alphabet size. This gives

+// a potentially less efficient searching, but is a safe approximation.

+// For needles using only characters in the same Unicode 256-code point page,

+// there is no search speed degradation.

+static const int kBMAlphabetSize = 256;

+// For patterns below this length, the skip length of Boyer-Moore is too short

+// to compensate for the algorithmic overhead compared to simple brute force.

+static const int kBMMinPatternLength = 7;

+// Holds the two buffers used by Boyer-Moore string search's Good Suffix

+// shift. Only allows the last kBMMaxShift characters of the needle

+// to be indexed.

+class BMGoodSuffixBuffers {

+ public:

+ BMGoodSuffixBuffers() {}

+ inline void Initialize(int needle_length) {

+ ASSERT(needle_length > 1);

+ int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;

+ int len = needle_length - start;

+ biased_suffixes_ = suffixes_ - start;

+ biased_good_suffix_shift_ = good_suffix_shift_ - start;

+ for (int i = 0; i <= len; i++) {

+ good_suffix_shift_[i] = len;

+ }

+ inline int& suffix(int index) {

+ ASSERT(biased_suffixes_ + index >= suffixes_);

+ return biased_suffixes_[index];

+ }

+ inline int& shift(int index) {

+ ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);

+ return biased_good_suffix_shift_[index];

+ }

+ private:

+ int suffixes_[kBMMaxShift + 1];

+ int good_suffix_shift_[kBMMaxShift + 1];

+ int* biased_suffixes_;

+ int* biased_good_suffix_shift_;

+ DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);

+};

+// buffers reused by BoyerMoore

+struct BMBuffers {

+ public:

+ static int bad_char_occurrence[kBMAlphabetSize];

+ static BMGoodSuffixBuffers bmgs_buffers;

+};

+// State of the string match tables.

+// SIMPLE: No usable content in the buffers.

+// BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.

+// BOYER_MOORE: The bmgs_buffers tables have also been populated.

+// Whenever starting with a new needle, one should call InitializeStringSearch

+// to determine which search strategy to use, and in the case of a long-needle

+// strategy, the call also initializes the algorithm to SIMPLE.

+enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };

+static StringSearchAlgorithm algorithm;

+// Compute the bad-char table for Boyer-Moore in the static buffer.

+template <typename PatternChar>

+static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {

+ // Only preprocess at most kBMMaxShift last characters of pattern.

+ int start = Max(pattern.length() - kBMMaxShift, 0);

+ // Run forwards to populate bad_char_table, so that *last* instance

+ // of character equivalence class is the one registered.

+ // Notice: Doesn't include the last character.

+ int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1

+ : kBMAlphabetSize;

+ if (start == 0) { // All patterns less than kBMMaxShift in length.

+ memset(BMBuffers::bad_char_occurrence,

+ -1,

+ table_size * sizeof(*BMBuffers::bad_char_occurrence));

+ } else {

+ for (int i = 0; i < table_size; i++) {

+ BMBuffers::bad_char_occurrence[i] = start - 1;

+ }

+ for (int i = start; i < pattern.length() - 1; i++) {

+ PatternChar c = pattern[i];

+ int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;

+ BMBuffers::bad_char_occurrence[bucket] = i;

+ }

+template <typename PatternChar>

+static void BoyerMoorePopulateGoodSuffixTable(

+ Vector<const PatternChar> pattern) {

+ int m = pattern.length();

+ int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

+ int len = m - start;

+ // Compute Good Suffix tables.

+ BMBuffers::bmgs_buffers.Initialize(m);

+ BMBuffers::bmgs_buffers.shift(m-1) = 1;

+ BMBuffers::bmgs_buffers.suffix(m) = m + 1;

+ PatternChar last_char = pattern[m - 1];

+ int suffix = m + 1;

+ {

+ int i = m;

+ while (i > start) {

+ PatternChar c = pattern[i - 1];

+ while (suffix <= m && c != pattern[suffix - 1]) {

+ if (BMBuffers::bmgs_buffers.shift(suffix) == len) {

+ BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;

+ }

+ suffix = BMBuffers::bmgs_buffers.suffix(suffix);

+ }

+ BMBuffers::bmgs_buffers.suffix(--i) = --suffix;

+ if (suffix == m) {

+ // No suffix to extend, so we check against last_char only.

+ while ((i > start) && (pattern[i - 1] != last_char)) {

+ if (BMBuffers::bmgs_buffers.shift(m) == len) {

+ BMBuffers::bmgs_buffers.shift(m) = m - i;

+ }

+ BMBuffers::bmgs_buffers.suffix(--i) = m;

+ }

+ if (i > start) {

+ BMBuffers::bmgs_buffers.suffix(--i) = --suffix;

+ }

+ if (suffix < m) {

+ for (int i = start; i <= m; i++) {

+ if (BMBuffers::bmgs_buffers.shift(i) == len) {

+ BMBuffers::bmgs_buffers.shift(i) = suffix - start;

+ }

+ if (i == suffix) {

+ suffix = BMBuffers::bmgs_buffers.suffix(suffix);

+ }

+template <typename SubjectChar, typename PatternChar>

+static inline int CharOccurrence(int char_code) {

+ if (sizeof(SubjectChar) == 1) {

+ return BMBuffers::bad_char_occurrence[char_code];

+ }

+ if (sizeof(PatternChar) == 1) {

+ if (char_code > String::kMaxAsciiCharCode) {

+ return -1;

+ }

+ return BMBuffers::bad_char_occurrence[char_code];

+ }

+ return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];

+// Restricted simplified Boyer-Moore string matching.

+// Uses only the bad-shift table of Boyer-Moore and only uses it

+// for the character compared to the last character of the needle.

+template <typename SubjectChar, typename PatternChar>

+static int BoyerMooreHorspool(Vector<const SubjectChar> subject,

+ Vector<const PatternChar> pattern,

+ int start_index,

+ bool* complete) {

+ ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);

+ int n = subject.length();

+ int m = pattern.length();

+ int badness = -m;

+ // How bad we are doing without a good-suffix table.

+ int idx; // No matches found prior to this index.

+ PatternChar last_char = pattern[m - 1];

+ int last_char_shift =

+ m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);

+ // Perform search

+ for (idx = start_index; idx <= n - m;) {

+ int j = m - 1;

+ int c;

+ while (last_char != (c = subject[idx + j])) {

+ int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);

+ int shift = j - bc_occ;

+ idx += shift;

+ badness += 1 - shift; // at most zero, so badness cannot increase.

+ if (idx > n - m) {

+ *complete = true;

+ return -1;

+ }

+ j--;

+ while (j >= 0 && pattern[j] == (subject[idx + j])) j--;

+ if (j < 0) {

+ *complete = true;

+ return idx;

+ } else {

+ idx += last_char_shift;

+ // Badness increases by the number of characters we have

+ // checked, and decreases by the number of characters we

+ // can skip by shifting. It's a measure of how we are doing

+ // compared to reading each character exactly once.

+ badness += (m - j) - last_char_shift;

+ if (badness > 0) {

+ *complete = false;

+ return idx;

+ }

+ *complete = true;

+ return -1;

+template <typename SubjectChar, typename PatternChar>

+static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,

+ Vector<const PatternChar> pattern,

+ int idx) {

+ ASSERT(algorithm <= BOYER_MOORE);

+ int n = subject.length();

+ int m = pattern.length();

+ // Only preprocess at most kBMMaxShift last characters of pattern.

+ int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

+ PatternChar last_char = pattern[m - 1];

+ // Continue search from i.

+ while (idx <= n - m) {

+ int j = m - 1;

+ SubjectChar c;

+ while (last_char != (c = subject[idx + j])) {

+ int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);

+ idx += shift;

+ if (idx > n - m) {

+ return -1;

+ }

+ while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;

+ if (j < 0) {

+ return idx;

+ } else if (j < start) {

+ // we have matched more than our tables allow us to be smart about.

+ // Fall back on BMH shift.

+ idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);

+ } else {

+ int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); // Good suffix shift.

+ int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);

+ int shift = j - bc_occ; // Bad-char shift.

+ if (gs_shift > shift) {

+ shift = gs_shift;

+ }

+ idx += shift;

+ }

+ return -1;

+// Trivial string search for shorter strings.

+// On return, if "complete" is set to true, the return value is the

+// final result of searching for the patter in the subject.

+// If "complete" is set to false, the return value is the index where

+// further checking should start, i.e., it's guaranteed that the pattern

+// does not occur at a position prior to the returned index.

+template <typename PatternChar, typename SubjectChar>

+static int SimpleIndexOf(Vector<const SubjectChar> subject,

+ Vector<const PatternChar> pattern,

+ int idx,

+ bool* complete) {

+ ASSERT(pattern.length() > 1);

+ int pattern_length = pattern.length();

+ // Badness is a count of how much work we have done. When we have

+ // done enough work we decide it's probably worth switching to a better

+ // algorithm.

+ int badness = -10 - (pattern_length << 2);

+ // We know our pattern is at least 2 characters, we cache the first so

+ // the common case of the first character not matching is faster.

+ PatternChar pattern_first_char = pattern[0];

+ for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {

+ badness++;

+ if (badness > 0) {

+ *complete = false;

+ return i;

+ }

+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {

+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(

+ memchr(subject.start() + i,

+ pattern_first_char,

+ n - i + 1));

+ if (pos == NULL) {

+ *complete = true;

+ return -1;

+ }

+ i = static_cast<int>(pos - subject.start());

+ } else {

+ if (subject[i] != pattern_first_char) continue;

+ }

+ int j = 1;

+ do {

+ if (pattern[j] != subject[i+j]) {

+ break;

+ }

+ j++;

+ } while (j < pattern_length);

+ if (j == pattern_length) {

+ *complete = true;

+ return i;

+ }

+ badness += j;

+ }

+ *complete = true;

+ return -1;

+// Simple indexOf that never bails out. For short patterns only.

+template <typename PatternChar, typename SubjectChar>

+static int SimpleIndexOf(Vector<const SubjectChar> subject,

+ Vector<const PatternChar> pattern,

+ int idx) {

+ int pattern_length = pattern.length();

+ PatternChar pattern_first_char = pattern[0];

+ for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {

+ if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {

+ const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(

+ memchr(subject.start() + i,

+ pattern_first_char,

+ n - i + 1));

+ if (pos == NULL) return -1;

+ i = static_cast<int>(pos - subject.start());

+ } else {

+ if (subject[i] != pattern_first_char) continue;

+ }

+ int j = 1;

+ while (j < pattern_length) {

+ if (pattern[j] != subject[i+j]) {

+ break;

+ }

+ j++;

+ }

+ if (j == pattern_length) {

+ return i;

+ }

+ return -1;

+// Strategy for searching for a string in another string.

+enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };

+template <typename PatternChar>

+static inline StringSearchStrategy InitializeStringSearch(

+ Vector<const PatternChar> pat, bool ascii_subject) {

+ // We have an ASCII haystack and a non-ASCII needle. Check if there

+ // really is a non-ASCII character in the needle and bail out if there

+ // is.

+ if (ascii_subject && sizeof(PatternChar) > 1) {

+ for (int i = 0; i < pat.length(); i++) {

+ uc16 c = pat[i];

+ if (c > String::kMaxAsciiCharCode) {

+ return SEARCH_FAIL;

+ }

+ if (pat.length() < kBMMinPatternLength) {

+ return SEARCH_SHORT;

+ }

+ algorithm = SIMPLE_SEARCH;

+ return SEARCH_LONG;

+// Dispatch long needle searches to different algorithms.

+template <typename SubjectChar, typename PatternChar>

+static int ComplexIndexOf(Vector<const SubjectChar> sub,

+ Vector<const PatternChar> pat,

+ int start_index) {

+ ASSERT(pat.length() >= kBMMinPatternLength);

+ // Try algorithms in order of increasing setup cost and expected performance.

+ bool complete;

+ int idx = start_index;

+ switch (algorithm) {

+ case SIMPLE_SEARCH:

+ idx = SimpleIndexOf(sub, pat, idx, &complete);

+ if (complete) return idx;

+ BoyerMoorePopulateBadCharTable(pat);

+ algorithm = BOYER_MOORE_HORSPOOL;

+ // FALLTHROUGH.

+ case BOYER_MOORE_HORSPOOL:

+ idx = BoyerMooreHorspool(sub, pat, idx, &complete);

+ if (complete) return idx;

+ // Build the Good Suffix table and continue searching.

+ BoyerMoorePopulateGoodSuffixTable(pat);

+ algorithm = BOYER_MOORE;

+ // FALLTHROUGH.

+ case BOYER_MOORE:

+ return BoyerMooreIndexOf(sub, pat, idx);

+ }

+ UNREACHABLE();

+ return -1;

+// Dispatch to different search strategies for a single search.

+// If searching multiple times on the same needle, the search

+// strategy should only be computed once and then dispatch to different

+// loops.

+template <typename SubjectChar, typename PatternChar>

+static int StringSearch(Vector<const SubjectChar> sub,

+ Vector<const PatternChar> pat,

+ int start_index) {

+ bool ascii_subject = (sizeof(SubjectChar) == 1);

+ StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);

+ switch (strategy) {

+ case SEARCH_FAIL: return -1;

+ case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);

+ case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);

+ }

+ UNREACHABLE();

+ return -1;

+}} // namespace v8::internal

+#endif // V8_STRING_SEARCH_H_

« no previous file with comments | « src/runtime.cc ('k') | no next file » | no next file with comments »