Chromium Code Reviews| Index: src/runtime.cc |
| diff --git a/src/runtime.cc b/src/runtime.cc |
| index 04ea02bb6cac0641a1b6480a58b1ef854a25cd02..6c46d6ace67c89e576167dd524d8fddfd18f8857 100644 |
| --- a/src/runtime.cc |
| +++ b/src/runtime.cc |
| @@ -2136,10 +2136,23 @@ class BMGoodSuffixBuffers { |
| static int bad_char_occurrence[kBMAlphabetSize]; |
| static BMGoodSuffixBuffers bmgs_buffers; |
| +// State of the string match tables. |
| +// SIMPLE: No usable content in the buffers. |
| +// BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated. |
| +// BOYER_MOORE: The bmgs_buffers tables have also been populated. |
| +// Whenever starting with a new needle, one should call InitializeStringSearch |
| +// to determine which search strategy to use, and in the case of a long-needle |
| +// strategy, the call also initializes the algorithm to SIMPLE. |
| +enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE }; |
| +static StringSearchAlgorithm algorithm; |
| + |
| + |
| // Compute the bad-char table for Boyer-Moore in the static buffer. |
| template <typename pchar> |
| -static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, |
| - int start) { |
| +static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) { |
| + // Only preprocess at most kBMMaxShift last characters of pattern. |
| + int start = pattern.length() < kBMMaxShift ? 0 |
| + : pattern.length() - kBMMaxShift; |
| // Run forwards to populate bad_char_table, so that *last* instance |
| // of character equivalence class is the one registered. |
| // Notice: Doesn't include the last character. |
| @@ -2159,10 +2172,11 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, |
| } |
| } |
| + |
| template <typename pchar> |
| -static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, |
| - int start) { |
| +static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) { |
| int m = pattern.length(); |
| + int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
| int len = m - start; |
| // Compute Good Suffix tables. |
| bmgs_buffers.init(m); |
| @@ -2209,6 +2223,7 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, |
| } |
| } |
| + |
| template <typename schar, typename pchar> |
| static inline int CharOccurrence(int char_code) { |
| if (sizeof(schar) == 1) { |
| @@ -2223,6 +2238,7 @@ static inline int CharOccurrence(int char_code) { |
| return bad_char_occurrence[char_code % kBMAlphabetSize]; |
| } |
| + |
| // Restricted simplified Boyer-Moore string matching. |
| // Uses only the bad-shift table of Boyer-Moore and only uses it |
| // for the character compared to the last character of the needle. |
| @@ -2231,14 +2247,13 @@ static int BoyerMooreHorspool(Vector<const schar> subject, |
| Vector<const pchar> pattern, |
| int start_index, |
| bool* complete) { |
| + ASSERT(algorithm <= BOYER_MOORE_HORSPOOL); |
| int n = subject.length(); |
| int m = pattern.length(); |
| - // Only preprocess at most kBMMaxShift last characters of pattern. |
| - int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
| - BoyerMoorePopulateBadCharTable(pattern, start); |
| + int badness = -m; |
| - int badness = -m; // How bad we are doing without a good-suffix table. |
| + // How bad we are doing without a good-suffix table. |
| int idx; // No matches found prior to this index. |
| pchar last_char = pattern[m - 1]; |
| int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char); |
| @@ -2252,6 +2267,7 @@ static int BoyerMooreHorspool(Vector<const schar> subject, |
| idx += shift; |
| badness += 1 - shift; // at most zero, so badness cannot increase. |
| if (idx > n - m) { |
| + // If we could just break out of the for loop here. |
|
Erik Corry
2010/03/15 12:57:48
What would then happen?
|
| *complete = true; |
| return -1; |
| } |
| @@ -2283,13 +2299,12 @@ template <typename schar, typename pchar> |
| static int BoyerMooreIndexOf(Vector<const schar> subject, |
| Vector<const pchar> pattern, |
| int idx) { |
| + ASSERT(algorithm <= BOYER_MOORE); |
| int n = subject.length(); |
| int m = pattern.length(); |
| // Only preprocess at most kBMMaxShift last characters of pattern. |
| int start = m < kBMMaxShift ? 0 : m - kBMMaxShift; |
| - // Build the Good Suffix table and continue searching. |
| - BoyerMoorePopulateGoodSuffixTable(pattern, start); |
| pchar last_char = pattern[m - 1]; |
| // Continue search from i. |
| while (idx <= n - m) { |
| @@ -2325,9 +2340,9 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, |
| template <typename schar> |
| -static int SingleCharIndexOf(Vector<const schar> string, |
| - schar pattern_char, |
| - int start_index) { |
| +static inline int SingleCharIndexOf(Vector<const schar> string, |
| + schar pattern_char, |
| + int start_index) { |
| for (int i = start_index, n = string.length(); i < n; i++) { |
| if (pattern_char == string[i]) { |
| return i; |
| @@ -2364,11 +2379,11 @@ static int SimpleIndexOf(Vector<const schar> subject, |
| // Badness is a count of how much work we have done. When we have |
| // done enough work we decide it's probably worth switching to a better |
| // algorithm. |
| - int badness = -10 - (pattern.length() << 2); |
| + |
| // We know our pattern is at least 2 characters, we cache the first so |
| // the common case of the first character not matching is faster. |
| pchar pattern_first_char = pattern[0]; |
| - |
| + int badness = -10 - pattern.length() << 2; |
| for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) { |
| badness++; |
| if (badness > 0) { |
| @@ -2416,39 +2431,84 @@ static int SimpleIndexOf(Vector<const schar> subject, |
| } |
| -// Dispatch to different algorithms. |
| -template <typename schar, typename pchar> |
| -static int StringMatchStrategy(Vector<const schar> sub, |
| - Vector<const pchar> pat, |
| - int start_index) { |
| - ASSERT(pat.length() > 1); |
| +// Strategy for searching for a string in another string. |
| +enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG }; |
| + |
| +template <typename pchar> |
| +static inline StringSearchStrategy InitializeStringSearch( |
| + Vector<const pchar> pat, bool ascii_subject) { |
| + ASSERT(pat.length() > 1); |
| // We have an ASCII haystack and a non-ASCII needle. Check if there |
| // really is a non-ASCII character in the needle and bail out if there |
| // is. |
| - if (sizeof(schar) == 1 && sizeof(pchar) > 1) { |
| + if (ascii_subject && sizeof(pchar) > 1) { |
| for (int i = 0; i < pat.length(); i++) { |
| uc16 c = pat[i]; |
| if (c > String::kMaxAsciiCharCode) { |
| - return -1; |
| + return SEARCH_FAIL; |
| } |
| } |
| } |
| if (pat.length() < kBMMinPatternLength) { |
| - // We don't believe fancy searching can ever be more efficient. |
| - // The max shift of Boyer-Moore on a pattern of this length does |
| - // not compensate for the overhead. |
| - return SimpleIndexOf(sub, pat, start_index); |
| + return SEARCH_SHORT; |
| } |
| + algorithm = SIMPLE_SEARCH; |
| + return SEARCH_LONG; |
| +} |
| + |
| + |
| +// Dispatch long needle searches to different algorithms. |
| +template <typename schar, typename pchar> |
| +static int ComplexIndexOf(Vector<const schar> sub, |
| + Vector<const pchar> pat, |
| + int start_index) { |
| + ASSERT(pat.length() >= kBMMinPatternLength); |
| // Try algorithms in order of increasing setup cost and expected performance. |
| bool complete; |
| - int idx = SimpleIndexOf(sub, pat, start_index, &complete); |
| - if (complete) return idx; |
| - idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
| - if (complete) return idx; |
| - return BoyerMooreIndexOf(sub, pat, idx); |
| + int idx = start_index; |
| + switch (algorithm) { |
| + case SIMPLE_SEARCH: |
| + idx = SimpleIndexOf(sub, pat, idx, &complete); |
| + if (complete) return idx; |
| + BoyerMoorePopulateBadCharTable(pat); |
| + algorithm = BOYER_MOORE_HORSPOOL; |
| + // FALLTHROUGH. |
|
Erik Corry
2010/03/15 12:57:48
Does the linter insist on this annoying capitaliza
|
| + case BOYER_MOORE_HORSPOOL: |
| + idx = BoyerMooreHorspool(sub, pat, idx, &complete); |
| + if (complete) return idx; |
| + // Build the Good Suffix table and continue searching. |
| + BoyerMoorePopulateGoodSuffixTable(pat); |
| + algorithm = BOYER_MOORE; |
| + // FALLTHROUGH. |
| + case BOYER_MOORE: |
| + return BoyerMooreIndexOf(sub, pat, idx); |
| + } |
| + UNREACHABLE(); |
| + return -1; |
| } |
| + |
| +// Dispatch to different search strategies for a single search. |
| +// If searching multiple times on the same needle, the search |
| +// strategy should only be computed once and then dispatch to different |
| +// loops. |
| +template <typename schar, typename pchar> |
| +static int StringSearch(Vector<const schar> sub, |
| + Vector<const pchar> pat, |
| + int start_index) { |
| + bool ascii_subject = (sizeof(schar) == 1); |
| + StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject); |
| + switch (strategy) { |
| + case SEARCH_FAIL: return -1; |
| + case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index); |
| + case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index); |
| + } |
| + UNREACHABLE(); |
| + return -1; |
| +} |
| + |
| + |
| // Perform string match of pattern on subject, starting at start index. |
| // Caller must ensure that 0 <= start_index <= sub->length(), |
| // and should check that pat->length() + start_index <= sub->length() |
| @@ -2467,6 +2527,7 @@ int Runtime::StringMatch(Handle<String> sub, |
| if (!sub->IsFlat()) { |
| FlattenString(sub); |
| } |
| + |
| // Searching for one specific character is common. For one |
| // character patterns linear search is necessary, so any smart |
| // algorithm is unnecessary overhead. |
| @@ -2500,15 +2561,15 @@ int Runtime::StringMatch(Handle<String> sub, |
| if (pat->IsAsciiRepresentation()) { |
| Vector<const char> pat_vector = pat->ToAsciiVector(); |
| if (sub->IsAsciiRepresentation()) { |
| - return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index); |
| + return StringSearch(sub->ToAsciiVector(), pat_vector, start_index); |
| } |
| - return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index); |
| + return StringSearch(sub->ToUC16Vector(), pat_vector, start_index); |
| } |
| Vector<const uc16> pat_vector = pat->ToUC16Vector(); |
| if (sub->IsAsciiRepresentation()) { |
| - return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index); |
| + return StringSearch(sub->ToAsciiVector(), pat_vector, start_index); |
| } |
| - return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index); |
| + return StringSearch(sub->ToUC16Vector(), pat_vector, start_index); |
| } |
| @@ -4263,6 +4324,164 @@ static Object* Runtime_StringTrim(Arguments args) { |
| } |
| +template <typename schar, typename pchar> |
| +void FindStringIndices(Vector<const schar> subject, |
| + Vector<const pchar> pattern, |
| + ZoneList<int>* indices, |
| + unsigned int limit) { |
| + ASSERT(limit > 0); |
| + // Collect indices of pattern in subject, and the end-of-string index. |
| + // Stop after finding at most limit values. |
| + StringSearchStrategy strategy = |
| + InitializeStringSearch(pattern, sizeof(schar) == 1); |
| + switch (strategy) { |
| + case SEARCH_FAIL: return; |
| + case SEARCH_SHORT: { |
| + int pattern_length = pattern.length(); |
| + int index = 0; |
| + while (limit > 0) { |
| + index = SimpleIndexOf(subject, pattern, index); |
| + if (index < 0) return; |
| + indices->Add(index); |
| + index += pattern_length; |
| + limit--; |
| + } |
| + return; |
| + } |
| + case SEARCH_LONG: { |
| + int pattern_length = pattern.length(); |
| + int index = 0; |
| + while (limit > 0) { |
| + index = ComplexIndexOf(subject, pattern, index); |
| + if (index < 0) return; |
| + indices->Add(index); |
| + index += pattern_length; |
| + limit--; |
| + } |
| + return; |
| + } |
| + default: |
| + UNREACHABLE(); |
| + return; |
| + } |
| +} |
| + |
| +template <typename schar> |
| +void inline FindCharIndices(Vector<const schar> subject, |
| + const schar pattern_char, |
| + ZoneList<int>* indices, |
| + unsigned int limit) { |
| + // Collect indices of pattern_char in subject, and the end-of-string index. |
| + // Stop after finding at most limit values. |
| + int index = 0; |
| + while (limit > 0) { |
| + index = SingleCharIndexOf(subject, pattern_char, index); |
| + if (index < 0) return; |
| + indices->Add(index); |
| + index++; |
| + limit--; |
| + } |
| +} |
| + |
| + |
| +static Object* Runtime_StringSplit(Arguments args) { |
| + ASSERT(args.length() == 3); |
| + HandleScope handle_scope; |
| + CONVERT_ARG_CHECKED(String, subject, 0); |
| + CONVERT_ARG_CHECKED(String, pattern, 1); |
| + CONVERT_NUMBER_CHECKED(uint32_t, limit, Uint32, args[2]); |
| + |
| + int subject_length = subject->length(); |
| + int pattern_length = pattern->length(); |
| + |
| + if (!subject->IsFlat()) FlattenString(subject); |
| + |
| + static const int kMaxInitialListCapacity = 16; |
| + |
| + ZoneScope scope(DELETE_ON_EXIT); |
| + |
| + // Find (up to limit) indices of separator and end-of-string in subject |
| + int initial_capacity = Min<uint32_t>(kMaxInitialListCapacity, limit); |
| + ZoneList<int> indices(initial_capacity); |
| + if (pattern_length == 1) { |
| + // Special case, go directly to fast single-character split. |
| + AssertNoAllocation nogc; |
| + uc16 pattern_char = pattern->Get(0); |
| + if (subject->IsTwoByteRepresentation()) { |
| + FindCharIndices(subject->ToUC16Vector(), pattern_char, |
| + &indices, |
| + limit); |
| + } else if (pattern_char <= String::kMaxAsciiCharCode) { |
| + FindCharIndices(subject->ToAsciiVector(), |
| + static_cast<char>(pattern_char), |
| + &indices, |
| + limit); |
| + } |
| + } else { |
| + if (!pattern->IsFlat()) FlattenString(pattern); |
| + AssertNoAllocation nogc; |
| + if (subject->IsAsciiRepresentation()) { |
| + Vector<const char> subject_vector = subject->ToAsciiVector(); |
| + if (pattern->IsAsciiRepresentation()) { |
| + FindStringIndices(subject_vector, |
| + pattern->ToAsciiVector(), |
| + &indices, |
| + limit); |
| + } else { |
| + FindStringIndices(subject_vector, |
| + pattern->ToUC16Vector(), |
| + &indices, |
| + limit); |
| + } |
| + } else { |
| + Vector<const uc16> subject_vector = subject->ToUC16Vector(); |
| + if (pattern->IsAsciiRepresentation()) { |
| + FindStringIndices(subject_vector, |
| + pattern->ToAsciiVector(), |
| + &indices, |
| + limit); |
| + } else { |
| + FindStringIndices(subject_vector, |
| + pattern->ToUC16Vector(), |
| + &indices, |
| + limit); |
| + } |
| + } |
| + } |
| + if (static_cast<uint32_t>(indices.length()) < limit) { |
| + indices.Add(subject_length); |
| + } |
| + // The list indices now contains the end of each part to create. |
| + |
| + |
| + // Create JSArray of substrings separated by separator. |
| + int part_count = indices.length(); |
| + |
| + Handle<JSArray> result = Factory::NewJSArray(part_count); |
| + result->set_length(Smi::FromInt(part_count)); |
| + |
| + ASSERT(result->HasFastElements()); |
| + |
| + if (part_count == 1 && indices.at(0) == subject_length) { |
| + FixedArray::cast(result->elements())->set(0, *subject); |
| + return *result; |
| + } |
| + |
| + Handle<FixedArray> elements(FixedArray::cast(result->elements())); |
| + int part_start = 0; |
| + for (int i = 0; i < part_count; i++) { |
| + HandleScope local_loop_handle; |
| + int part_end = indices.at(i); |
| + Handle<String> substring = |
| + Factory::NewSubString(subject, part_start, part_end); |
| + elements->set(i, *substring); |
| + part_start = part_end + pattern_length; |
| + } |
| + |
| + return *result; |
| +} |
| + |
| + |
| // Copies ascii characters to the given fixed array looking up |
| // one-char strings in the cache. Gives up on the first char that is |
| // not in the cache and fills the remainder with smi zeros. Returns |