Index: src/runtime.cc |
diff --git a/src/runtime.cc b/src/runtime.cc |
index 04ea02bb6cac0641a1b6480a58b1ef854a25cd02..6c46d6ace67c89e576167dd524d8fddfd18f8857 100644 |
--- a/src/runtime.cc |
+++ b/src/runtime.cc |
@@ -2136,10 +2136,23 @@ class BMGoodSuffixBuffers { |
static int bad_char_occurrence[kBMAlphabetSize]; |
static BMGoodSuffixBuffers bmgs_buffers; |
+// State of the string match tables. |
+// SIMPLE: No usable content in the buffers. |
+// BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated. |
+// BOYER_MOORE: The bmgs_buffers tables have also been populated. |
+// Whenever starting with a new needle, one should call InitializeStringSearch |
+// to determine which search strategy to use, and in the case of a long-needle |
+// strategy, the call also initializes the algorithm to SIMPLE. |
+enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; |
+static StringSearchAlgorithm algorithm; |
+ |
+ |
// Compute the bad-char table for Boyer-Moore in the static buffer. |
template <typename pchar> |
-static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, |
- int start) { |
+static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) { |
+ // Only preprocess at most kBMMaxShift last characters of pattern. |
+ int start = pattern.length() < kBMMaxShift ? 0 |
+ : pattern.length() - kBMMaxShift; |
// Run forwards to populate bad_char_table, so that *last* instance |
// of character equivalence class is the one registered. |
// Notice: Doesn't include the last character. |
@@ -2159,10 +2172,11 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, |
} |
} |
+ |
template <typename pchar> |
-static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, |
- int start) { |
+static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) { |
int m = pattern.length(); |
+ int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
int len = m - start; |
// Compute Good Suffix tables. |
bmgs_buffers.init(m); |
@@ -2209,6 +2223,7 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, |
} |
} |
+ |
template <typename schar, typename pchar> |
static inline int CharOccurrence(int char_code) { |
if (sizeof(schar) == 1) { |
@@ -2223,6 +2238,7 @@ static inline int CharOccurrence(int char_code) { |
return bad_char_occurrence[char_code % kBMAlphabetSize]; |
} |
+ |
// Restricted simplified Boyer-Moore string matching. |
// Uses only the bad-shift table of Boyer-Moore and only uses it |
// for the character compared to the last character of the needle. |
@@ -2231,14 +2247,13 @@ static int BoyerMooreHorspool(Vector<const schar> subject, |
Vector<const pchar> pattern, |
int start_index, |
bool* complete) { |
+ ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); |
int n = subject.length(); |
int m = pattern.length(); |
- // Only preprocess at most kBMMaxShift last characters of pattern. |
- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
- BoyerMoorePopulateBadCharTable(pattern, start); |
+ int badness = -m; |
- int badness = -m; // How bad we are doing without a good-suffix table. |
+ // How bad we are doing without a good-suffix table. |
int idx; // No matches found prior to this index. |
pchar last_char = pattern[m - 1]; |
int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char); |
@@ -2252,6 +2267,7 @@ static int BoyerMooreHorspool(Vector<const schar> subject, |
idx += shift; |
badness += 1 - shift; // at most zero, so badness cannot increase. |
if (idx > n - m) { |
+ // If we could just break out of the for loop here. |
Erik Corry
2010/03/15 12:57:48
What would then happen?
|
*complete = true; |
return -1; |
} |
@@ -2283,13 +2299,12 @@ template <typename schar, typename pchar> |
static int BoyerMooreIndexOf(Vector<const schar> subject, |
Vector<const pchar> pattern, |
int idx) { |
+ ASSERT(algorithm <= BOYER_MOORE); |
int n = subject.length(); |
int m = pattern.length(); |
// Only preprocess at most kBMMaxShift last characters of pattern. |
int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
- // Build the Good Suffix table and continue searching. |
- BoyerMoorePopulateGoodSuffixTable(pattern, start); |
pchar last_char = pattern[m - 1]; |
// Continue search from i. |
while (idx <= n - m) { |
@@ -2325,9 +2340,9 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, |
template <typename schar> |
-static int SingleCharIndexOf(Vector<const schar> string, |
- schar pattern_char, |
- int start_index) { |
+static inline int SingleCharIndexOf(Vector<const schar> string, |
+ schar pattern_char, |
+ int start_index) { |
for (int i = start_index, n = string.length(); i < n; i++) { |
if (pattern_char == string[i]) { |
return i; |
@@ -2364,11 +2379,11 @@ static int SimpleIndexOf(Vector<const schar> subject, |
// Badness is a count of how much work we have done. When we have |
// done enough work we decide it's probably worth switching to a better |
// algorithm. |
- int badness = -10 - (pattern.length() << 2); |
+ |
// We know our pattern is at least 2 characters, we cache the first so |
// the common case of the first character not matching is faster. |
pchar pattern_first_char = pattern[0]; |
- |
+ int badness = -10 - pattern.length() << 2; |
for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) { |
badness++; |
if (badness > 0) { |
@@ -2416,39 +2431,84 @@ static int SimpleIndexOf(Vector<const schar> subject, |
} |
-// Dispatch to different algorithms. |
-template <typename schar, typename pchar> |
-static int StringMatchStrategy(Vector<const schar> sub, |
- Vector<const pchar> pat, |
- int start_index) { |
- ASSERT(pat.length() > 1); |
+// Strategy for searching for a string in another string. |
+enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; |
+ |
+template <typename pchar> |
+static inline StringSearchStrategy InitializeStringSearch( |
+ Vector<const pchar> pat, bool ascii_subject) { |
+ ASSERT(pat.length() > 1); |
// We have an ASCII haystack and a non-ASCII needle. Check if there |
// really is a non-ASCII character in the needle and bail out if there |
// is. |
- if (sizeof(schar) == 1 && sizeof(pchar) > 1) { |
+ if (ascii_subject && sizeof(pchar) > 1) { |
for (int i = 0; i < pat.length(); i++) { |
uc16 c = pat[i]; |
if (c > String::kMaxAsciiCharCode) { |
- return -1; |
+ return SEARCH_FAIL; |
} |
} |
} |
if (pat.length() < kBMMinPatternLength) { |
- // We don't believe fancy searching can ever be more efficient. |
- // The max shift of Boyer-Moore on a pattern of this length does |
- // not compensate for the overhead. |
- return SimpleIndexOf(sub, pat, start_index); |
+ return SEARCH_SHORT; |
} |
+ algorithm = SIMPLE_SEARCH; |
+ return SEARCH_LONG; |
+} |
+ |
+ |
+// Dispatch long needle searches to different algorithms. |
+template <typename schar, typename pchar> |
+static int ComplexIndexOf(Vector<const schar> sub, |
+ Vector<const pchar> pat, |
+ int start_index) { |
+ ASSERT(pat.length() >= kBMMinPatternLength); |
// Try algorithms in order of increasing setup cost and expected performance. |
bool complete; |
- int idx = SimpleIndexOf(sub, pat, start_index, &complete); |
- if (complete) return idx; |
- idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
- if (complete) return idx; |
- return BoyerMooreIndexOf(sub, pat, idx); |
+ int idx = start_index; |
+ switch (algorithm) { |
+ case SIMPLE_SEARCH: |
+ idx = SimpleIndexOf(sub, pat, idx, &complete); |
+ if (complete) return idx; |
+ BoyerMoorePopulateBadCharTable(pat); |
+ algorithm = BOYER_MOORE_HORSPOOL; |
+ // FALLTHROUGH. |
Erik Corry
2010/03/15 12:57:48
Does the linter insist on this annoying capitaliza
|
+ case BOYER_MOORE_HORSPOOL: |
+ idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
+ if (complete) return idx; |
+ // Build the Good Suffix table and continue searching. |
+ BoyerMoorePopulateGoodSuffixTable(pat); |
+ algorithm = BOYER_MOORE; |
+ // FALLTHROUGH. |
+ case BOYER_MOORE: |
+ return BoyerMooreIndexOf(sub, pat, idx); |
+ } |
+ UNREACHABLE(); |
+ return -1; |
} |
+ |
+// Dispatch to different search strategies for a single search. |
+// If searching multiple times on the same needle, the search |
+// strategy should only be computed once and then dispatch to different |
+// loops. |
+template <typename schar, typename pchar> |
+static int StringSearch(Vector<const schar> sub, |
+ Vector<const pchar> pat, |
+ int start_index) { |
+ bool ascii_subject = (sizeof(schar) == 1); |
+ StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); |
+ switch (strategy) { |
+ case SEARCH_FAIL: return -1; |
+ case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); |
+ case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); |
+ } |
+ UNREACHABLE(); |
+ return -1; |
+} |
+ |
+ |
// Perform string match of pattern on subject, starting at start index. |
// Caller must ensure that 0 <= start_index <= sub->length(), |
// and should check that pat->length() + start_index <= sub->length() |
@@ -2467,6 +2527,7 @@ int Runtime::StringMatch(Handle<String> sub, |
if (!sub->IsFlat()) { |
FlattenString(sub); |
} |
+ |
// Searching for one specific character is common. For one |
// character patterns linear search is necessary, so any smart |
// algorithm is unnecessary overhead. |
@@ -2500,15 +2561,15 @@ int Runtime::StringMatch(Handle<String> sub, |
if (pat->IsAsciiRepresentation()) { |
Vector<const char> pat_vector = pat->ToAsciiVector(); |
if (sub->IsAsciiRepresentation()) { |
- return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index); |
+ return StringSearch(sub->ToAsciiVector(), pat_vector, start_index); |
} |
- return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index); |
+ return StringSearch(sub->ToUC16Vector(), pat_vector, start_index); |
} |
Vector<const uc16> pat_vector = pat->ToUC16Vector(); |
if (sub->IsAsciiRepresentation()) { |
- return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index); |
+ return StringSearch(sub->ToAsciiVector(), pat_vector, start_index); |
} |
- return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index); |
+ return StringSearch(sub->ToUC16Vector(), pat_vector, start_index); |
} |
@@ -4263,6 +4324,164 @@ static Object* Runtime_StringTrim(Arguments args) { |
} |
+template <typename schar, typename pchar> |
+void FindStringIndices(Vector<const schar> subject, |
+ Vector<const pchar> pattern, |
+ ZoneList<int>* indices, |
+ unsigned int limit) { |
+ ASSERT(limit > 0); |
+ // Collect indices of pattern in subject, and the end-of-string index. |
+ // Stop after finding at most limit values. |
+ StringSearchStrategy strategy = |
+ InitializeStringSearch(pattern, sizeof(schar) == 1); |
+ switch (strategy) { |
+ case SEARCH_FAIL: return; |
+ case SEARCH_SHORT: { |
+ int pattern_length = pattern.length(); |
+ int index = 0; |
+ while (limit > 0) { |
+ index = SimpleIndexOf(subject, pattern, index); |
+ if (index < 0) return; |
+ indices->Add(index); |
+ index += pattern_length; |
+ limit--; |
+ } |
+ return; |
+ } |
+ case SEARCH_LONG: { |
+ int pattern_length = pattern.length(); |
+ int index = 0; |
+ while (limit > 0) { |
+ index = ComplexIndexOf(subject, pattern, index); |
+ if (index < 0) return; |
+ indices->Add(index); |
+ index += pattern_length; |
+ limit--; |
+ } |
+ return; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ return; |
+ } |
+} |
+ |
+template <typename schar> |
+void inline FindCharIndices(Vector<const schar> subject, |
+ const schar pattern_char, |
+ ZoneList<int>* indices, |
+ unsigned int limit) { |
+ // Collect indices of pattern_char in subject, and the end-of-string index. |
+ // Stop after finding at most limit values. |
+ int index = 0; |
+ while (limit > 0) { |
+ index = SingleCharIndexOf(subject, pattern_char, index); |
+ if (index < 0) return; |
+ indices->Add(index); |
+ index++; |
+ limit--; |
+ } |
+} |
+ |
+ |
+static Object* Runtime_StringSplit(Arguments args) { |
+ ASSERT(args.length() == 3); |
+ HandleScope handle_scope; |
+ CONVERT_ARG_CHECKED(String, subject, 0); |
+ CONVERT_ARG_CHECKED(String, pattern, 1); |
+ CONVERT_NUMBER_CHECKED(uint32_t, limit, Uint32, args[2]); |
+ |
+ int subject_length = subject->length(); |
+ int pattern_length = pattern->length(); |
+ |
+ if (!subject->IsFlat()) FlattenString(subject); |
+ |
+ static const int kMaxInitialListCapacity = 16; |
+ |
+ ZoneScope scope(DELETE_ON_EXIT); |
+ |
+ // Find (up to limit) indices of separator and end-of-string in subject |
+ int initial_capacity = Min<uint32_t>(kMaxInitialListCapacity, limit); |
+ ZoneList<int> indices(initial_capacity); |
+ if (pattern_length == 1) { |
+ // Special case, go directly to fast single-character split. |
+ AssertNoAllocation nogc; |
+ uc16 pattern_char = pattern->Get(0); |
+ if (subject->IsTwoByteRepresentation()) { |
+ FindCharIndices(subject->ToUC16Vector(), pattern_char, |
+ &indices, |
+ limit); |
+ } else if (pattern_char <= String::kMaxAsciiCharCode) { |
+ FindCharIndices(subject->ToAsciiVector(), |
+ static_cast<char>(pattern_char), |
+ &indices, |
+ limit); |
+ } |
+ } else { |
+ if (!pattern->IsFlat()) FlattenString(pattern); |
+ AssertNoAllocation nogc; |
+ if (subject->IsAsciiRepresentation()) { |
+ Vector<const char> subject_vector = subject->ToAsciiVector(); |
+ if (pattern->IsAsciiRepresentation()) { |
+ FindStringIndices(subject_vector, |
+ pattern->ToAsciiVector(), |
+ &indices, |
+ limit); |
+ } else { |
+ FindStringIndices(subject_vector, |
+ pattern->ToUC16Vector(), |
+ &indices, |
+ limit); |
+ } |
+ } else { |
+ Vector<const uc16> subject_vector = subject->ToUC16Vector(); |
+ if (pattern->IsAsciiRepresentation()) { |
+ FindStringIndices(subject_vector, |
+ pattern->ToAsciiVector(), |
+ &indices, |
+ limit); |
+ } else { |
+ FindStringIndices(subject_vector, |
+ pattern->ToUC16Vector(), |
+ &indices, |
+ limit); |
+ } |
+ } |
+ } |
+ if (static_cast<uint32_t>(indices.length()) < limit) { |
+ indices.Add(subject_length); |
+ } |
+ // The list indices now contains the end of each part to create. |
+ |
+ |
+ // Create JSArray of substrings separated by separator. |
+ int part_count = indices.length(); |
+ |
+ Handle<JSArray> result = Factory::NewJSArray(part_count); |
+ result->set_length(Smi::FromInt(part_count)); |
+ |
+ ASSERT(result->HasFastElements()); |
+ |
+ if (part_count == 1 && indices.at(0) == subject_length) { |
+ FixedArray::cast(result->elements())->set(0, *subject); |
+ return *result; |
+ } |
+ |
+ Handle<FixedArray> elements(FixedArray::cast(result->elements())); |
+ int part_start = 0; |
+ for (int i = 0; i < part_count; i++) { |
+ HandleScope local_loop_handle; |
+ int part_end = indices.at(i); |
+ Handle<String> substring = |
+ Factory::NewSubString(subject, part_start, part_end); |
+ elements->set(i, *substring); |
+ part_start = part_end + pattern_length; |
+ } |
+ |
+ return *result; |
+} |
+ |
+ |
// Copies ascii characters to the given fixed array looking up |
// one-char strings in the cache. Gives up on the first char that is |
// not in the cache and fills the remainder with smi zeros. Returns |