src/runtime.cc - Issue 7621: * Optimized the bad-char table of Boyer-Moore

Unified Diff: src/runtime.cc

Issue 7621: * Optimized the bad-char table of Boyer-Moore (Closed)

Patch Set: Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/runtime.cc

diff --git a/src/runtime.cc b/src/runtime.cc

index bcdd76decaa0f23c795f50853838371d292278c4..bc1152142afade11849fc0dcf0c0ee96fe811ab3 100644

--- a/src/runtime.cc

+++ b/src/runtime.cc

@@ -965,7 +965,7 @@ static const int kBMMinPatternLength = 5;

// Holds the two buffers used by Boyer-Moore string search's Good Suffix

// shift. Only allows the last kBMMaxShift characters of the needle

// to be indexed.

-class BMGoodSuffixBuffers: public AllStatic {

+class BMGoodSuffixBuffers {

public:

BMGoodSuffixBuffers() {}

inline void init(int needle_length) {

@@ -1005,12 +1005,18 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,

// Run forwards to populate bad_char_table, so that *last* instance

// of character equivalence class is the one registered.

// Notice: Doesn't include the last character.

- for (int i = 0; i < kBMAlphabetSize; i++) {

- bad_char_occurence[i] = start - 1;

+ int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1

+ : kBMAlphabetSize;

+ if (start == 0) { // All patterns less than kBMMaxShift in length.

+ memset(bad_char_occurence, -1, table_size * sizeof(*bad_char_occurence));

+ } else {

+ for (int i = 0; i < table_size; i++) {

+ bad_char_occurence[i] = start - 1;

+ }

}

for (int i = start; i < pattern.length() - 1; i++) {

pchar c = pattern[i];

- int bucket = c % kBMAlphabetSize;

+ int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize;

bad_char_occurence[bucket] = i;

}

@@ -1065,6 +1071,19 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,

}

+template <typename schar, typename pchar>

+static inline int CharOccurence(int char_code) {

+ if (sizeof(schar) == 1) {

+ return bad_char_occurence[char_code];

+ }

+ if (sizeof(pchar) == 1) {

+ if (char_code > String::kMaxAsciiCharCode) {

+ return -1;

+ }

+ return bad_char_occurence[char_code];

+ }

+ return bad_char_occurence[char_code % kBMAlphabetSize];

// Restricted simplified Boyer-Moore string matching. Restricts tables to a

// suffix of long pattern strings and handles only equivalence classes

@@ -1090,7 +1109,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject,

int j = m - 1;

int c;

while (last_char != (c = subject[idx + j])) {

- int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

+ int bc_occ = CharOccurence<schar, pchar>(c);

int shift = j - bc_occ;

idx += shift;

badness += 1 - shift; // at most zero, so badness cannot increase.

@@ -1105,7 +1124,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject,

complete = true;

return idx;

} else {

- int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

+ int bc_occ = CharOccurence<schar, pchar>(c);

int shift = bc_occ < j ? j - bc_occ : 1;

idx += shift;

// Badness increases by the number of characters we have

@@ -1141,7 +1160,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,

int j = m - 1;

schar c;

while (last_char != (c = subject[idx + j])) {

- int shift = j - bad_char_occurence[c % kBMAlphabetSize];

+ int shift = j - CharOccurence<schar, pchar>(c);

idx += shift;

if (idx > n - m) {

return -1;

@@ -1155,7 +1174,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,

idx += 1;

} else {

int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.

- int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

+ int bc_occ = CharOccurence<schar, pchar>(c);

int shift = j - bc_occ; // Bad-char shift.

shift = (gs_shift > shift) ? gs_shift : shift;

idx += shift;

« no previous file with comments | « no previous file | no next file » | no next file with comments »