| Index: src/runtime.cc
|
| diff --git a/src/runtime.cc b/src/runtime.cc
|
| index 74d93def1f989ef4d04ce8da7026aa0539bfb281..0525b5e4f29a3efd2c9a653763fe695dcc3f04e1 100644
|
| --- a/src/runtime.cc
|
| +++ b/src/runtime.cc
|
| @@ -955,7 +955,11 @@ static Object* Runtime_CharFromCode(Arguments args) {
|
| // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
| // limit, we can fix the size of tables.
|
| static const int kBMMaxShift = 0xff;
|
| -static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.
|
| +// Reduce alphabet to this size.
|
| +static const int kBMAlphabetSize = 0x100;
|
| +// For patterns below this length, the skip length of Boyer-Moore is too short
|
| +// to compensate for the algorithmic overhead compared to simple brute force.
|
| +static const int kBMMinPatternLength = 5;
|
|
|
| // Holds the two buffers used by Boyer-Moore string search's Good Suffix
|
| // shift. Only allows the last kBMMaxShift characters of the needle
|
| @@ -994,8 +998,6 @@ static int bad_char_occurence[kBMAlphabetSize];
|
| static BMGoodSuffixBuffers bmgs_buffers;
|
|
|
| // Compute the bad-char table for Boyer-Moore in the static buffer.
|
| -// Return false if the pattern contains non-ASCII characters that cannot be
|
| -// in the searched string.
|
| template <typename pchar>
|
| static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
|
| int start) {
|
| @@ -1005,16 +1007,18 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
|
| for (int i = 0; i < kBMAlphabetSize; i++) {
|
| bad_char_occurence[i] = start - 1;
|
| }
|
| - for (int i = start; i < pattern.length(); i++) {
|
| - bad_char_occurence[pattern[i] % kBMAlphabetSize] = i;
|
| + for (int i = start; i < pattern.length() - 1; i++) {
|
| + pchar c = pattern[i];
|
| + int bucket = c % kBMAlphabetSize;
|
| + bad_char_occurence[bucket] = i;
|
| }
|
| }
|
|
|
| template <typename pchar>
|
| static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
|
| - int start,
|
| - int len) {
|
| + int start) {
|
| int m = pattern.length();
|
| + int len = m - start;
|
| // Compute Good Suffix tables.
|
| bmgs_buffers.init(m);
|
|
|
| @@ -1060,31 +1064,44 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
|
| }
|
| }
|
|
|
| -// Restricted Boyer-Moore string matching. Restricts tables to a
|
| +
|
| +// Restricted simplified Boyer-Moore string matching. Restricts tables to a
|
| // suffix of long pattern strings and handles only equivalence classes
|
| // of the full alphabet. This allows us to ensure that tables take only
|
| // a fixed amount of space.
|
| template <typename schar, typename pchar>
|
| -static int BoyerMooreIndexOf(Vector<const schar> subject,
|
| - Vector<const pchar> pattern,
|
| - int start_index) {
|
| - int m = pattern.length();
|
| +static int BoyerMooreSimplified(Vector<const schar> subject,
|
| + Vector<const pchar> pattern,
|
| + int start_index,
|
| + bool& complete) {
|
| int n = subject.length();
|
| -
|
| + int m = pattern.length();
|
| // Only preprocess at most kBMMaxShift last characters of pattern.
|
| int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
|
| - int len = m - start;
|
|
|
| BoyerMoorePopulateBadCharTable(pattern, start);
|
|
|
| - int badness = 0; // How bad we are doing without a good-suffix table.
|
| + int badness = -m; // How bad we are doing without a good-suffix table.
|
| int idx; // No matches found prior to this index.
|
| + pchar last_char = pattern[m - 1];
|
| // Perform search
|
| for (idx = start_index; idx <= n - m;) {
|
| int j = m - 1;
|
| - schar c;
|
| + int c;
|
| + while (last_char != (c = subject[idx + j])) {
|
| + int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
|
| + int shift = j - bc_occ;
|
| + idx += shift;
|
| + badness += 1 - shift; // at most zero, so badness cannot increase.
|
| + if (idx > n - m) {
|
| + complete = true;
|
| + return -1;
|
| + }
|
| + }
|
| + j--;
|
| while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
|
| if (j < 0) {
|
| + complete = true;
|
| return idx;
|
| } else {
|
| int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
|
| @@ -1095,40 +1112,66 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
|
| // can skip by shifting. It's a measure of how we are doing
|
| // compared to reading each character exactly once.
|
| badness += (m - j) - shift;
|
| - if (badness > m) break;
|
| + if (badness > 0) {
|
| + complete = false;
|
| + return idx;
|
| + }
|
| }
|
| }
|
| + complete = true;
|
| + return -1;
|
| +}
|
|
|
| - // If we are not done, we got here because we should build the Good Suffix
|
| - // table and continue searching.
|
| - if (idx <= n - m) {
|
| - BoyerMoorePopulateGoodSuffixTable(pattern, start, len);
|
| - // Continue search from i.
|
| - do {
|
| - int j = m - 1;
|
| - schar c;
|
| - while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
|
| - if (j < 0) {
|
| - return idx;
|
| - } else if (j < start) {
|
| - // we have matched more than our tables allow us to be smart about.
|
| - idx += 1;
|
| - } else {
|
| - int gs_shift = bmgs_buffers.shift(j + 1);
|
| - int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
|
| - int bc_shift = j - bc_occ;
|
| - idx += (gs_shift > bc_shift) ? gs_shift : bc_shift;
|
| +
|
| +template <typename schar, typename pchar>
|
| +static int BoyerMooreIndexOf(Vector<const schar> subject,
|
| + Vector<const pchar> pattern,
|
| + int idx) {
|
| + int n = subject.length();
|
| + int m = pattern.length();
|
| + // Only preprocess at most kBMMaxShift last characters of pattern.
|
| + int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
|
| +
|
| + // Build the Good Suffix table and continue searching.
|
| + BoyerMoorePopulateGoodSuffixTable(pattern, start);
|
| + pchar last_char = pattern[m - 1];
|
| + // Continue search from i.
|
| + do {
|
| + int j = m - 1;
|
| + schar c;
|
| + while (last_char != (c = subject[idx + j])) {
|
| + int shift = j - bad_char_occurence[c % kBMAlphabetSize];
|
| + idx += shift;
|
| + if (idx > n - m) {
|
| + return -1;
|
| }
|
| - } while (idx <= n - m);
|
| - }
|
| + }
|
| + while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
|
| + if (j < 0) {
|
| + return idx;
|
| + } else if (j < start) {
|
| + // we have matched more than our tables allow us to be smart about.
|
| + idx += 1;
|
| + } else {
|
| + int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
|
| + int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
|
| + int shift = j - bc_occ; // Bad-char shift.
|
| + shift = (gs_shift > shift) ? gs_shift : shift;
|
| + idx += shift;
|
| + }
|
| + } while (idx <= n - m);
|
|
|
| return -1;
|
| }
|
|
|
| -template <typename schar, typename pchar>
|
| +
|
| +template <typename schar>
|
| static int SingleCharIndexOf(Vector<const schar> string,
|
| - pchar pattern_char,
|
| + uc16 pattern_char,
|
| int start_index) {
|
| + if (sizeof(schar) == 1 && pattern_char > String::kMaxAsciiCharCode) {
|
| + return -1;
|
| + }
|
| for (int i = start_index, n = string.length(); i < n; i++) {
|
| if (pattern_char == string[i]) {
|
| return i;
|
| @@ -1146,20 +1189,22 @@ static int SingleCharIndexOf(Vector<const schar> string,
|
| template <typename pchar, typename schar>
|
| static int SimpleIndexOf(Vector<const schar> subject,
|
| Vector<const pchar> pattern,
|
| - int start_index,
|
| + int idx,
|
| bool &complete) {
|
| - int pattern_length = pattern.length();
|
| - int subject_length = subject.length();
|
| - // Badness is a count of how many extra times the same character
|
| - // is checked. We compare it to the index counter, so we start
|
| - // it at the start_index, and give it a little discount to avoid
|
| - // very early bail-outs.
|
| - int badness = start_index - pattern_length;
|
| + // Badness is a count of how much work we have done. When we have
|
| + // done enough work we decide it's probably worth switching to a better
|
| + // algorithm.
|
| + int badness = -10 - (pattern.length() << 2);
|
| // We know our pattern is at least 2 characters, we cache the first so
|
| // the common case of the first character not matching is faster.
|
| pchar pattern_first_char = pattern[0];
|
|
|
| - for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
|
| + for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {
|
| + badness++;
|
| + if (badness > 0) {
|
| + complete = false;
|
| + return (i);
|
| + }
|
| if (subject[i] != pattern_first_char) continue;
|
| int j = 1;
|
| do {
|
| @@ -1167,22 +1212,41 @@ static int SimpleIndexOf(Vector<const schar> subject,
|
| break;
|
| }
|
| j++;
|
| - } while (j < pattern_length);
|
| - if (j == pattern_length) {
|
| + } while (j < pattern.length());
|
| + if (j == pattern.length()) {
|
| complete = true;
|
| return i;
|
| }
|
| badness += j;
|
| - if (badness > i) { // More than one extra character on average.
|
| - complete = false;
|
| - return (i + 1); // No matches up to index i+1.
|
| - }
|
| }
|
| complete = true;
|
| return -1;
|
| }
|
|
|
| -// Dispatch to different algorithms for different length of pattern/subject
|
| +// Simple indexOf that never bails out. For short patterns only.
|
| +template <typename pchar, typename schar>
|
| +static int SimpleIndexOf(Vector<const schar> subject,
|
| + Vector<const pchar> pattern,
|
| + int idx) {
|
| + pchar pattern_first_char = pattern[0];
|
| + for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {
|
| + if (subject[i] != pattern_first_char) continue;
|
| + int j = 1;
|
| + do {
|
| + if (pattern[j] != subject[i+j]) {
|
| + break;
|
| + }
|
| + j++;
|
| + } while (j < pattern.length());
|
| + if (j == pattern.length()) {
|
| + return i;
|
| + }
|
| + }
|
| + return -1;
|
| +}
|
| +
|
| +
|
| +// Dispatch to different algorithms.
|
| template <typename schar, typename pchar>
|
| static int StringMatchStrategy(Vector<const schar> sub,
|
| Vector<const pchar> pat,
|
| @@ -1200,10 +1264,18 @@ static int StringMatchStrategy(Vector<const schar> sub,
|
| }
|
| }
|
| }
|
| - // For small searches, a complex sort is not worth the setup overhead.
|
| + if (pat.length() < kBMMinPatternLength) {
|
| + // We don't believe fancy searching can ever be more efficient.
|
| + // The max shift of Boyer-Moore on a pattern of this length does
|
| + // not compensate for the overhead.
|
| + return SimpleIndexOf(sub, pat, start_index);
|
| + }
|
| + // Try algorithms in order of increasing setup cost and expected performance.
|
| bool complete;
|
| int idx = SimpleIndexOf(sub, pat, start_index, complete);
|
| if (complete) return idx;
|
| + idx = BoyerMooreSimplified(sub, pat, idx, complete);
|
| + if (complete) return idx;
|
| return BoyerMooreIndexOf(sub, pat, idx);
|
| }
|
|
|
|
|