src/runtime.cc - Issue 14505: * Changed simplified index-of to use correct BMH algorithm, for a theoretical better performance.

Unified Diff: src/runtime.cc

Issue 14505: * Changed simplified index-of to use correct BMH algorithm, for a theoretical better performance. (Closed)

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/runtime.cc

diff --git a/src/runtime.cc b/src/runtime.cc

index 24b19047eb564ff6b4c1842ed99dba46bb9112f2..b9f1f48ef2138a4179b5436b4572e70d851e6e49 100644

--- a/src/runtime.cc

+++ b/src/runtime.cc

@@ -1151,15 +1151,14 @@ static inline int CharOccurence(int char_code) {

return bad_char_occurence[char_code % kBMAlphabetSize];

}

-// Restricted simplified Boyer-Moore string matching. Restricts tables to a

-// suffix of long pattern strings and handles only equivalence classes

-// of the full alphabet. This allows us to ensure that tables take only

-// a fixed amount of space.

+// Restricted simplified Boyer-Moore string matching.

+// Uses only the bad-shift table of Boyer-Moore and only uses it

+// for the character compared to the last character of the needle.

template <typename schar, typename pchar>

-static int BoyerMooreSimplified(Vector<const schar> subject,

- Vector<const pchar> pattern,

- int start_index,

- bool* complete) {

+static int BoyerMooreHorsepool(Vector<const schar> subject,

+ Vector<const pchar> pattern,

+ int start_index,

+ bool* complete) {

int n = subject.length();

int m = pattern.length();

// Only preprocess at most kBMMaxShift last characters of pattern.

@@ -1170,6 +1169,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject,

int badness = -m; // How bad we are doing without a good-suffix table.

int idx; // No matches found prior to this index.

pchar last_char = pattern[m - 1];

+ int last_char_shift = m - 1 - CharOccurence<schar, pchar>(last_char);

Erik Corry 2008/12/17 09:07:18 Noun. occurence. Common misspelling of occurrence.

// Perform search

for (idx = start_index; idx <= n - m;) {

int j = m - 1;

@@ -1185,19 +1185,17 @@ static int BoyerMooreSimplified(Vector<const schar> subject,

}

j--;

- while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;

+ while (j >= 0 && pattern[j] == (subject[idx + j])) j--;

if (j < 0) {

*complete = true;

return idx;

} else {

- int bc_occ = CharOccurence<schar, pchar>(c);

- int shift = bc_occ < j ? j - bc_occ : 1;

- idx += shift;

+ idx += last_char_shift;

// Badness increases by the number of characters we have

// checked, and decreases by the number of characters we

// can skip by shifting. It's a measure of how we are doing

// compared to reading each character exactly once.

- badness += (m - j) - shift;

+ badness += (m - j) - last_char_shift;

if (badness > 0) {

*complete = false;

return idx;

@@ -1237,12 +1235,15 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,

return idx;

} else if (j < start) {

// we have matched more than our tables allow us to be smart about.

- idx += 1;

+ // Fall back on BMH shift.

+ idx += m - 1 - CharOccurence<schar, pchar>(last_char);

} else {

int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.

int bc_occ = CharOccurence<schar, pchar>(c);

int shift = j - bc_occ; // Bad-char shift.

- shift = (gs_shift > shift) ? gs_shift : shift;

+ if (gs_shift > shift) {

+ shift = gs_shift;

+ }

idx += shift;

}

} while (idx <= n - m);

@@ -1286,7 +1287,7 @@ static int SimpleIndexOf(Vector<const schar> subject,

badness++;

if (badness > 0) {

*complete = false;

- return (i);

+ return i;

}

if (subject[i] != pattern_first_char) continue;

int j = 1;

@@ -1357,7 +1358,7 @@ static int StringMatchStrategy(Vector<const schar> sub,

bool complete;

int idx = SimpleIndexOf(sub, pat, start_index, &complete);

if (complete) return idx;

- idx = BoyerMooreSimplified(sub, pat, idx, &complete);

+ idx = BoyerMooreHorsepool(sub, pat, idx, &complete);

if (complete) return idx;

return BoyerMooreIndexOf(sub, pat, idx);

}

« no previous file with comments | « no previous file | no next file » | no next file with comments »