Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(271)

Unified Diff: src/runtime.cc

Issue 6533: * Added simplified Boyer-Moore-Horspool text search. (Closed)
Patch Set: Included comments from review Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/runtime.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index be1dbcddd3b64f06e22019a770e5073e32697b70..924b3b1f41dacad592c4899979d94a25aad4ded2 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1029,6 +1029,64 @@ static int SimpleIndexOf(Vector<const schar> subject,
return -1;
}
+// Maximal length (+1) of suffix that is indexed. Also the size of the
+// maximal bad-character skip.
+static const int kBMHSignificantSuffixLength = 0xff;
+
+// Significant bits taken from characters to use in bad-character
+// skips, to reduce size of the table for Unicode letters.
+static const int kBMHSignificantBitsMask = 0xff;
+
+// Number of elements in bad-char table.
+static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
+
+// Simplified Boyer-Moore string matching. Only uses bad-char skipping,
+// and restricts table to a suffix of long strings (also restricting
+// the maximum possible skip-length) in order to reduce space.
+template <typename schar, typename pchar>
+static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
+ Vector<const pchar> pattern,
+ int start_index) {
+ ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.
+ static byte bad_char_map[kBMHBadCharCount];
+
+ int m = pattern.length();
+ int n = subject.length();
+ // Cap bad char table to last p chars of pattern. Also max skip value.
+ int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;
+
+ memset(bad_char_map, p, kBMHBadCharCount);
+
+ // Run forwards to populate bad_char_table, so that *last* instance
+ // of character equivalence class is the one registered.
+ // Notice: Doesn't include last character.
+ for (int i = p < m ? m - p : 0; i < m - 1; i++) {
+ pchar c = pattern[i];
+ if (sizeof(schar) == 1 && c > 255) return -1;
+ bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
+ }
+
+ for (int i = start_index + m - 1, j = m - 1; i < n;) {
+ schar c = subject[i];
+ if (c == pattern[j]) {
+ if (j == 0) {
+ return i;
+ }
+ j--;
+ i--;
+ } else {
+ int skip = bad_char_map[c & kBMHSignificantBitsMask];
+ if (skip < (m - j)) {
+ skip = m - j;
+ }
+ i += skip;
+ j = m - 1;
+ }
+ }
+ return -1;
+}
+
+
// Full KMP pattern match.
template <typename schar, typename pchar> // Pattern & subject char types
static int KMPIndexOf(Vector<const schar> subject,
@@ -1077,35 +1135,40 @@ static int KMPIndexOf(Vector<const schar> subject,
// Dispatch to different algorithms for different length of pattern/subject
template <typename schar, typename pchar>
-static int StringMatchKMP(Vector<const schar> sub,
- Vector<const pchar> pat,
- int start_index) {
+static int StringMatchStrategy(Vector<const schar> sub,
+ Vector<const pchar> pat,
+ int start_index) {
+ int pattern_length = pat.length();
// Searching for one specific character is common. For one
// character patterns the KMP algorithm is guaranteed to slow down
// the search, so we just run through the subject string.
- if (pat.length() == 1) {
+ if (pattern_length == 1) {
return SingleCharIndexOf(sub, pat[0], start_index);
}
- // For small searches, KMP is not worth the setup overhead.
- if (sub.length() - start_index < 100) {
+ // For small searches, a complex sort is not worth the setup overhead.
+ if (sub.length() - start_index < 25) {
return SimpleIndexOf(sub, pat, start_index);
}
- // For patterns with a larger length we use the KMP algorithm.
- return KMPIndexOf(sub, pat, start_index);
+ return BoyerMooreHorspoolIndexOf(sub, pat, start_index);
}
// Perform string match of pattern on subject, starting at start index.
// Caller must ensure that 0 <= start_index <= sub->length(),
// and should check that pat->length() + start_index <= sub->length()
-int Runtime::StringMatchKmp(Handle<String> sub,
- Handle<String> pat,
- int start_index) {
+int Runtime::StringMatch(Handle<String> sub,
+ Handle<String> pat,
+ int start_index) {
ASSERT(0 <= start_index);
ASSERT(start_index <= sub->length());
- if (pat->length() == 0) return start_index;
+ int pattern_length = pat->length();
+ if (pattern_length == 0) return start_index;
+
+ int subject_length = sub->length();
+ if (start_index + pattern_length > subject_length) return -1;
+
FlattenString(sub);
FlattenString(pat);
@@ -1114,15 +1177,15 @@ int Runtime::StringMatchKmp(Handle<String> sub,
if (pat->is_ascii()) {
Vector<const char> pat_vector = ToAsciiVector(*pat);
if (sub->is_ascii()) {
- return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+ return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index);
}
- return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+ return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index);
}
Vector<const uc16> pat_vector = ToUC16Vector(*pat);
if (sub->is_ascii()) {
- return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+ return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index);
}
- return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+ return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index);
}
@@ -1137,7 +1200,7 @@ static Object* Runtime_StringIndexOf(Arguments args) {
uint32_t start_index;
if (!Array::IndexFromObject(index, &start_index)) return Smi::FromInt(-1);
- int position = Runtime::StringMatchKmp(sub, pat, start_index);
+ int position = Runtime::StringMatch(sub, pat, start_index);
return Smi::FromInt(position);
}
« no previous file with comments | « src/runtime.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698