| Index: src/runtime.cc
|
| diff --git a/src/runtime.cc b/src/runtime.cc
|
| index f3fb849de4dd81aad03cda7f96053aa8b822b2c5..2f69ffad83ba06b89a44d1b8eafc1f5b53ead2c0 100644
|
| --- a/src/runtime.cc
|
| +++ b/src/runtime.cc
|
| @@ -952,148 +952,175 @@ static Object* Runtime_CharFromCode(Arguments args) {
|
| }
|
|
|
|
|
| -template <typename schar, typename pchar>
|
| -static int SingleCharIndexOf(Vector<const schar> string,
|
| - pchar pattern_char,
|
| - int start_index) {
|
| - for (int i = start_index, n = string.length(); i < n; i++) {
|
| - if (pattern_char == string[i]) {
|
| - return i;
|
| +// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
| +// limit, we can fix the size of tables.
|
| +static const int kBMMaxShift = 0xff;
|
| +static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.
|
| +
|
| +// Holds the two buffers used by Boyer-Moore string search's Good Suffix
|
| +// shift. Only allows the last kBMMaxShift characters of the needle
|
| +// to be indexed.
|
| +class BMGoodSuffixBuffers: public AllStatic {
|
| + public:
|
| + BMGoodSuffixBuffers() {}
|
| + inline void init(int needle_length) {
|
| + ASSERT(needle_length > 1);
|
| + int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
|
| + int len = needle_length - start;
|
| + biased_suffixes_ = suffixes_ - start;
|
| + biased_good_suffix_shift_ = good_suffix_shift_ - start;
|
| + for (int i = 0; i <= len; i++) {
|
| + good_suffix_shift_[i] = len;
|
| }
|
| }
|
| - return -1;
|
| -}
|
| -
|
| -// Trivial string search for shorter strings.
|
| -template <typename pchar, typename schar>
|
| -static int SimpleIndexOf(Vector<const schar> subject,
|
| - Vector<const pchar> pattern,
|
| - int start_index) {
|
| - int pattern_length = pattern.length();
|
| - int subject_length = subject.length();
|
| - // We know our pattern is at least 2 characters, we cache the first so
|
| - // the common case of the first character not matching is faster.
|
| - pchar pattern_first_char = pattern[0];
|
| - for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
|
| - if (subject[i] != pattern_first_char) continue;
|
| -
|
| - bool failure = false;
|
| - for (int j = 1; j < pattern_length; j++) {
|
| - if (pattern[j] != subject[j+i]) {
|
| - failure = true;
|
| - break;
|
| - }
|
| - }
|
| - if (!failure) {
|
| - return i;
|
| - }
|
| + inline int& suffix(int index) {
|
| + ASSERT(biased_suffixes_ + index >= suffixes_);
|
| + return biased_suffixes_[index];
|
| }
|
| - return -1;
|
| -}
|
| -
|
| -// Maximal length (+1) of suffix that is indexed. Also the size of the
|
| -// maximal bad-character skip.
|
| -static const int kBMHSignificantSuffixLength = 0xff;
|
| -
|
| -// Significant bits taken from characters to use in bad-character
|
| -// skips, to reduce size of the table for Unicode letters.
|
| -static const int kBMHSignificantBitsMask = 0xff;
|
| + inline int& shift(int index) {
|
| + ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
|
| + return biased_good_suffix_shift_[index];
|
| + }
|
| + private:
|
| + int suffixes_[kBMMaxShift + 1];
|
| + int good_suffix_shift_[kBMMaxShift + 1];
|
| + int *biased_suffixes_;
|
| + int *biased_good_suffix_shift_;
|
| + DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
|
| +};
|
|
|
| -// Number of elements in bad-char table.
|
| -static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
|
| +// buffers reused by BoyerMoore
|
| +static int bad_char_occurence[kBMAlphabetSize];
|
| +static BMGoodSuffixBuffers bmgs_buffers;
|
|
|
| -// Simplified Boyer-Moore string matching. Only uses bad-char skipping,
|
| -// and restricts table to a suffix of long strings (also restricting
|
| -// the maximum possible skip-length) in order to reduce space.
|
| +// Restricted Boyer-Moore string matching. Restricts tables to a
|
| +// suffix of long pattern strings and handles only equivalence classes
|
| +// of the full alphabet. This allows us to ensure that tables take only
|
| +// a fixed amount of space.
|
| template <typename schar, typename pchar>
|
| -static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
|
| - Vector<const pchar> pattern,
|
| - int start_index) {
|
| - ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.
|
| - static byte bad_char_map[kBMHBadCharCount];
|
| -
|
| +static int BoyerMooreIndexOf(Vector<const schar> subject,
|
| + Vector<const pchar> pattern,
|
| + int start_index) {
|
| int m = pattern.length();
|
| int n = subject.length();
|
| - // Cap bad char table to last p chars of pattern. Also max skip value.
|
| - int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;
|
|
|
| - memset(bad_char_map, p, kBMHBadCharCount);
|
| + // Only preprocess at most kBMMaxShift last characters of pattern.
|
| + int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
|
| + int len = m - start;
|
|
|
| // Run forwards to populate bad_char_table, so that *last* instance
|
| // of character equivalence class is the one registered.
|
| // Notice: Doesn't include last character.
|
| - for (int i = p < m ? m - p : 0; i < m - 1; i++) {
|
| + for (int i = 0; i < kBMAlphabetSize; i++) {
|
| + bad_char_occurence[i] = start - 1;
|
| + }
|
| + for (int i = start; i < m; i++) {
|
| uc32 c = pattern[i];
|
| + bad_char_occurence[c % kBMAlphabetSize] = i;
|
| if (sizeof(schar) == 1 &&
|
| sizeof(pchar) > 1 &&
|
| c > String::kMaxAsciiCharCode) {
|
| return -1;
|
| }
|
| - bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
|
| }
|
| + // End of Bad Char computation.
|
|
|
| - for (int i = start_index + m - 1, j = m - 1; i < n;) {
|
| - schar c = subject[i];
|
| - if (c == pattern[j]) {
|
| - if (j == 0) {
|
| - return i;
|
| + // Compute Good Suffix shift table.
|
| + bmgs_buffers.init(m);
|
| +
|
| + bmgs_buffers.shift(m-1) = 1;
|
| + bmgs_buffers.suffix(m) = m + 1;
|
| + pchar last_char = pattern[m - 1];
|
| + int suffix = m + 1;
|
| + for (int i = m; i > start;) {
|
| + for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
|
| + if (bmgs_buffers.shift(suffix) == len) {
|
| + bmgs_buffers.shift(suffix) = suffix - i;
|
| }
|
| - j--;
|
| - i--;
|
| - } else {
|
| - int skip = bad_char_map[c & kBMHSignificantBitsMask];
|
| - if (skip < (m - j)) {
|
| - skip = m - j;
|
| + suffix = bmgs_buffers.suffix(suffix);
|
| + }
|
| + i--;
|
| + suffix--;
|
| + bmgs_buffers.suffix(i) = suffix;
|
| + if (suffix == m) {
|
| + // no suffix to extend, so we check against last_char only.
|
| + while (i > start && pattern[i - 1] != last_char) {
|
| + if (bmgs_buffers.shift(m) == len) {
|
| + bmgs_buffers.shift(m) = m - i;
|
| + }
|
| + i--;
|
| + bmgs_buffers.suffix(i) = m;
|
| + }
|
| + if (i > start) {
|
| + i--;
|
| + suffix--;
|
| + bmgs_buffers.suffix(i) = suffix;
|
| }
|
| - i += skip;
|
| - j = m - 1;
|
| }
|
| }
|
| - return -1;
|
| -}
|
| -
|
| -
|
| -// Full KMP pattern match.
|
| -template <typename schar, typename pchar> // Pattern & subject char types
|
| -static int KMPIndexOf(Vector<const schar> subject,
|
| - Vector<const pchar> pattern,
|
| - int start_index) {
|
| - int subject_length = subject.length();
|
| - int pattern_length = pattern.length();
|
| - SmartPointer<int> next_table(NewArray<int>(pattern_length));
|
| + if (suffix < m) {
|
| + for (int i = start; i <= m; i++) {
|
| + if (bmgs_buffers.shift(i) == len) {
|
| + bmgs_buffers.shift(i) = suffix - start;
|
| + }
|
| + if (i == suffix) {
|
| + suffix = bmgs_buffers.suffix(suffix);
|
| + }
|
| + }
|
| + }
|
| + // End of Good Suffix computation.
|
|
|
| - // Compute KMP "next" table
|
| - int i = 0;
|
| - int j = -1;
|
| - next_table[0] = -1;
|
|
|
| - pchar p = pattern[0];
|
| - while (i < pattern_length - 1) {
|
| - while (j > -1 && p != pattern[j]) {
|
| - j = next_table[j];
|
| - }
|
| - i++;
|
| - j++;
|
| - p = pattern[i];
|
| - if (p == pattern[j]) {
|
| - next_table[i] = next_table[j];
|
| + // Perform search
|
| + for (int i = start_index; i <= n - m;) {
|
| + int j = m - 1;
|
| + schar c;
|
| + while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;
|
| + if (j < 0) {
|
| + return i;
|
| + } else if (j < start) {
|
| + // we have matched more than our tables allow us to be smart about.
|
| + i += 1;
|
| } else {
|
| - next_table[i] = j;
|
| + int gs_shift = bmgs_buffers.shift(j + 1);
|
| + int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
|
| + int bc_shift = j - bc_occ;
|
| + i += (gs_shift > bc_shift) ? gs_shift : bc_shift;
|
| }
|
| }
|
| + return -1;
|
| +}
|
|
|
| - // Search using the 'next' table.
|
| - int pattern_index = 0;
|
| - int subject_index = start_index;
|
| - while (subject_index < subject_length) {
|
| - schar subject_char = subject[subject_index];
|
| - while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
|
| - pattern_index = next_table[pattern_index];
|
| +template <typename schar, typename pchar>
|
| +static int SingleCharIndexOf(Vector<const schar> string,
|
| + pchar pattern_char,
|
| + int start_index) {
|
| + for (int i = start_index, n = string.length(); i < n; i++) {
|
| + if (pattern_char == string[i]) {
|
| + return i;
|
| }
|
| - pattern_index++;
|
| - subject_index++;
|
| - if (pattern_index >= pattern_length) {
|
| - return subject_index - pattern_index;
|
| + }
|
| + return -1;
|
| +}
|
| +
|
| +// Trivial string search for shorter strings.
|
| +template <typename pchar, typename schar>
|
| +static int SimpleIndexOf(Vector<const schar> subject,
|
| + Vector<const pchar> pattern,
|
| + int start_index) {
|
| + int pattern_length = pattern.length();
|
| + int subject_length = subject.length();
|
| + // We know our pattern is at least 2 characters, we cache the first so
|
| + // the common case of the first character not matching is faster.
|
| + pchar pattern_first_char = pattern[0];
|
| + for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
|
| + if (subject[i] != pattern_first_char) continue;
|
| + int j = 1;
|
| + while (pattern[j] == subject[j+i]) {
|
| + j++;
|
| + if (j == pattern_length) {
|
| + return i;
|
| + }
|
| }
|
| }
|
| return -1;
|
| @@ -1105,19 +1132,15 @@ static int StringMatchStrategy(Vector<const schar> sub,
|
| Vector<const pchar> pat,
|
| int start_index) {
|
| int pattern_length = pat.length();
|
| - // Searching for one specific character is common. For one
|
| - // character patterns the KMP algorithm is guaranteed to slow down
|
| - // the search, so we just run through the subject string.
|
| - if (pattern_length == 1) {
|
| - return SingleCharIndexOf(sub, pat[0], start_index);
|
| - }
|
| + ASSERT(pattern_length > 1);
|
|
|
| // For small searches, a complex sort is not worth the setup overhead.
|
| - if (sub.length() - start_index < 25) {
|
| + int subject_length = sub.length() - start_index;
|
| + if (subject_length < 100 || pattern_length < 4) {
|
| return SimpleIndexOf(sub, pat, start_index);
|
| }
|
|
|
| - return BoyerMooreHorspoolIndexOf(sub, pat, start_index);
|
| + return BoyerMooreIndexOf(sub, pat, start_index);
|
| }
|
|
|
| // Perform string match of pattern on subject, starting at start index.
|
| @@ -1136,6 +1159,17 @@ int Runtime::StringMatch(Handle<String> sub,
|
| if (start_index + pattern_length > subject_length) return -1;
|
|
|
| FlattenString(sub);
|
| + // Searching for one specific character is common. For one
|
| + // character patterns linear search is necessary, so any smart
|
| + // algorithm is unnecessary overhead.
|
| + if (pattern_length == 1) {
|
| + AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
|
| + if (sub->is_ascii_representation()) {
|
| + return SingleCharIndexOf(sub->ToAsciiVector(), pat->Get(0), start_index);
|
| + }
|
| + return SingleCharIndexOf(sub->ToUC16Vector(), pat->Get(0), start_index);
|
| + }
|
| +
|
| FlattenString(pat);
|
|
|
| AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
|
|
|