src/runtime.cc - Issue 875001: Converted String.prototype.split with string to C++.

Unified Diff: src/runtime.cc

Issue 875001: Converted String.prototype.split with string to C++. (Closed)

Patch Set: Rewrote conditions to switch with fallthrough. Created 10 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/runtime.cc

diff --git a/src/runtime.cc b/src/runtime.cc

index 04ea02bb6cac0641a1b6480a58b1ef854a25cd02..6c46d6ace67c89e576167dd524d8fddfd18f8857 100644

--- a/src/runtime.cc

+++ b/src/runtime.cc

@@ -2136,10 +2136,23 @@ class BMGoodSuffixBuffers {

static int bad_char_occurrence[kBMAlphabetSize];

static BMGoodSuffixBuffers bmgs_buffers;

+// State of the string match tables.

+// SIMPLE: No usable content in the buffers.

+// BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated.

+// BOYER_MOORE: The bmgs_buffers tables have also been populated.

+// Whenever starting with a new needle, one should call InitializeStringSearch

+// to determine which search strategy to use, and in the case of a long-needle

+// strategy, the call also initializes the algorithm to SIMPLE.

+enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };

+static StringSearchAlgorithm algorithm;

// Compute the bad-char table for Boyer-Moore in the static buffer.

template <typename pchar>

-static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,

- int start) {

+static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) {

+ // Only preprocess at most kBMMaxShift last characters of pattern.

+ int start = pattern.length() < kBMMaxShift ? 0

+ : pattern.length() - kBMMaxShift;

// Run forwards to populate bad_char_table, so that *last* instance

// of character equivalence class is the one registered.

// Notice: Doesn't include the last character.

@@ -2159,10 +2172,11 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,

}

template <typename pchar>

-static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,

- int start) {

+static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) {

int m = pattern.length();

+ int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

int len = m - start;

// Compute Good Suffix tables.

bmgs_buffers.init(m);

@@ -2209,6 +2223,7 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,

}

template <typename schar, typename pchar>

static inline int CharOccurrence(int char_code) {

if (sizeof(schar) == 1) {

@@ -2223,6 +2238,7 @@ static inline int CharOccurrence(int char_code) {

return bad_char_occurrence[char_code % kBMAlphabetSize];

}

// Restricted simplified Boyer-Moore string matching.

// Uses only the bad-shift table of Boyer-Moore and only uses it

// for the character compared to the last character of the needle.

@@ -2231,14 +2247,13 @@ static int BoyerMooreHorspool(Vector<const schar> subject,

Vector<const pchar> pattern,

int start_index,

bool* complete) {

+ ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);

int n = subject.length();

int m = pattern.length();

- // Only preprocess at most kBMMaxShift last characters of pattern.

- int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

- BoyerMoorePopulateBadCharTable(pattern, start);

+ int badness = -m;

- int badness = -m; // How bad we are doing without a good-suffix table.

+ // How bad we are doing without a good-suffix table.

int idx; // No matches found prior to this index.

pchar last_char = pattern[m - 1];

int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char);

@@ -2252,6 +2267,7 @@ static int BoyerMooreHorspool(Vector<const schar> subject,

idx += shift;

badness += 1 - shift; // at most zero, so badness cannot increase.

if (idx > n - m) {

+ // If we could just break out of the for loop here.

Erik Corry 2010/03/15 12:57:48 What would then happen?

*complete = true;

return -1;

}

@@ -2283,13 +2299,12 @@ template <typename schar, typename pchar>

static int BoyerMooreIndexOf(Vector<const schar> subject,

Vector<const pchar> pattern,

int idx) {

+ ASSERT(algorithm <= BOYER_MOORE);

int n = subject.length();

int m = pattern.length();

// Only preprocess at most kBMMaxShift last characters of pattern.

int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

- // Build the Good Suffix table and continue searching.

- BoyerMoorePopulateGoodSuffixTable(pattern, start);

pchar last_char = pattern[m - 1];

// Continue search from i.

while (idx <= n - m) {

@@ -2325,9 +2340,9 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,

template <typename schar>

-static int SingleCharIndexOf(Vector<const schar> string,

- schar pattern_char,

- int start_index) {

+static inline int SingleCharIndexOf(Vector<const schar> string,

+ schar pattern_char,

+ int start_index) {

for (int i = start_index, n = string.length(); i < n; i++) {

if (pattern_char == string[i]) {

return i;

@@ -2364,11 +2379,11 @@ static int SimpleIndexOf(Vector<const schar> subject,

// Badness is a count of how much work we have done. When we have

// done enough work we decide it's probably worth switching to a better

// algorithm.

- int badness = -10 - (pattern.length() << 2);

// We know our pattern is at least 2 characters, we cache the first so

// the common case of the first character not matching is faster.

pchar pattern_first_char = pattern[0];

+ int badness = -10 - pattern.length() << 2;

for (int i = idx, n = subject.length() - pattern.length(); i <= n; i++) {

badness++;

if (badness > 0) {

@@ -2416,39 +2431,84 @@ static int SimpleIndexOf(Vector<const schar> subject,

}

-// Dispatch to different algorithms.

-template <typename schar, typename pchar>

-static int StringMatchStrategy(Vector<const schar> sub,

- Vector<const pchar> pat,

- int start_index) {

- ASSERT(pat.length() > 1);

+// Strategy for searching for a string in another string.

+enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };

+template <typename pchar>

+static inline StringSearchStrategy InitializeStringSearch(

+ Vector<const pchar> pat, bool ascii_subject) {

+ ASSERT(pat.length() > 1);

// We have an ASCII haystack and a non-ASCII needle. Check if there

// really is a non-ASCII character in the needle and bail out if there

// is.

- if (sizeof(schar) == 1 && sizeof(pchar) > 1) {

+ if (ascii_subject && sizeof(pchar) > 1) {

for (int i = 0; i < pat.length(); i++) {

uc16 c = pat[i];

if (c > String::kMaxAsciiCharCode) {

- return -1;

+ return SEARCH_FAIL;

}

if (pat.length() < kBMMinPatternLength) {

- // We don't believe fancy searching can ever be more efficient.

- // The max shift of Boyer-Moore on a pattern of this length does

- // not compensate for the overhead.

- return SimpleIndexOf(sub, pat, start_index);

+ return SEARCH_SHORT;

}

+ algorithm = SIMPLE_SEARCH;

+ return SEARCH_LONG;

+// Dispatch long needle searches to different algorithms.

+template <typename schar, typename pchar>

+static int ComplexIndexOf(Vector<const schar> sub,

+ Vector<const pchar> pat,

+ int start_index) {

+ ASSERT(pat.length() >= kBMMinPatternLength);

// Try algorithms in order of increasing setup cost and expected performance.

bool complete;

- int idx = SimpleIndexOf(sub, pat, start_index, &complete);

- if (complete) return idx;

- idx = BoyerMooreHorspool(sub, pat, idx, &complete);

- if (complete) return idx;

- return BoyerMooreIndexOf(sub, pat, idx);

+ int idx = start_index;

+ switch (algorithm) {

+ case SIMPLE_SEARCH:

+ idx = SimpleIndexOf(sub, pat, idx, &complete);

+ if (complete) return idx;

+ BoyerMoorePopulateBadCharTable(pat);

+ algorithm = BOYER_MOORE_HORSPOOL;

+ // FALLTHROUGH.

Erik Corry 2010/03/15 12:57:48 Does the linter insist on this annoying capitaliza

+ case BOYER_MOORE_HORSPOOL:

+ idx = BoyerMooreHorspool(sub, pat, idx, &complete);

+ if (complete) return idx;

+ // Build the Good Suffix table and continue searching.

+ BoyerMoorePopulateGoodSuffixTable(pat);

+ algorithm = BOYER_MOORE;

+ // FALLTHROUGH.

+ case BOYER_MOORE:

+ return BoyerMooreIndexOf(sub, pat, idx);

+ }

+ UNREACHABLE();

+ return -1;

}

+// Dispatch to different search strategies for a single search.

+// If searching multiple times on the same needle, the search

+// strategy should only be computed once and then dispatch to different

+// loops.

+template <typename schar, typename pchar>

+static int StringSearch(Vector<const schar> sub,

+ Vector<const pchar> pat,

+ int start_index) {

+ bool ascii_subject = (sizeof(schar) == 1);

+ StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);

+ switch (strategy) {

+ case SEARCH_FAIL: return -1;

+ case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);

+ case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);

+ }

+ UNREACHABLE();

+ return -1;

// Perform string match of pattern on subject, starting at start index.

// Caller must ensure that 0 <= start_index <= sub->length(),

// and should check that pat->length() + start_index <= sub->length()

@@ -2467,6 +2527,7 @@ int Runtime::StringMatch(Handle<String> sub,

if (!sub->IsFlat()) {

FlattenString(sub);

}

// Searching for one specific character is common. For one

// character patterns linear search is necessary, so any smart

// algorithm is unnecessary overhead.

@@ -2500,15 +2561,15 @@ int Runtime::StringMatch(Handle<String> sub,

if (pat->IsAsciiRepresentation()) {

Vector<const char> pat_vector = pat->ToAsciiVector();

if (sub->IsAsciiRepresentation()) {

- return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);

+ return StringSearch(sub->ToAsciiVector(), pat_vector, start_index);

}

- return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);

+ return StringSearch(sub->ToUC16Vector(), pat_vector, start_index);

}

Vector<const uc16> pat_vector = pat->ToUC16Vector();

if (sub->IsAsciiRepresentation()) {

- return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);

+ return StringSearch(sub->ToAsciiVector(), pat_vector, start_index);

}

- return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);

+ return StringSearch(sub->ToUC16Vector(), pat_vector, start_index);

}

@@ -4263,6 +4324,164 @@ static Object* Runtime_StringTrim(Arguments args) {

}

+template <typename schar, typename pchar>

+void FindStringIndices(Vector<const schar> subject,

+ Vector<const pchar> pattern,

+ ZoneList<int>* indices,

+ unsigned int limit) {

+ ASSERT(limit > 0);

+ // Collect indices of pattern in subject, and the end-of-string index.

+ // Stop after finding at most limit values.

+ StringSearchStrategy strategy =

+ InitializeStringSearch(pattern, sizeof(schar) == 1);

+ switch (strategy) {

+ case SEARCH_FAIL: return;

+ case SEARCH_SHORT: {

+ int pattern_length = pattern.length();

+ int index = 0;

+ while (limit > 0) {

+ index = SimpleIndexOf(subject, pattern, index);

+ if (index < 0) return;

+ indices->Add(index);

+ index += pattern_length;

+ limit--;

+ }

+ return;

+ }

+ case SEARCH_LONG: {

+ int pattern_length = pattern.length();

+ int index = 0;

+ while (limit > 0) {

+ index = ComplexIndexOf(subject, pattern, index);

+ if (index < 0) return;

+ indices->Add(index);

+ index += pattern_length;

+ limit--;

+ }

+ return;

+ }

+ default:

+ UNREACHABLE();

+ return;

+ }

+template <typename schar>

+void inline FindCharIndices(Vector<const schar> subject,

+ const schar pattern_char,

+ ZoneList<int>* indices,

+ unsigned int limit) {

+ // Collect indices of pattern_char in subject, and the end-of-string index.

+ // Stop after finding at most limit values.

+ int index = 0;

+ while (limit > 0) {

+ index = SingleCharIndexOf(subject, pattern_char, index);

+ if (index < 0) return;

+ indices->Add(index);

+ index++;

+ limit--;

+ }

+static Object* Runtime_StringSplit(Arguments args) {

+ ASSERT(args.length() == 3);

+ HandleScope handle_scope;

+ CONVERT_ARG_CHECKED(String, subject, 0);

+ CONVERT_ARG_CHECKED(String, pattern, 1);

+ CONVERT_NUMBER_CHECKED(uint32_t, limit, Uint32, args[2]);

+ int subject_length = subject->length();

+ int pattern_length = pattern->length();

+ if (!subject->IsFlat()) FlattenString(subject);

+ static const int kMaxInitialListCapacity = 16;

+ ZoneScope scope(DELETE_ON_EXIT);

+ // Find (up to limit) indices of separator and end-of-string in subject

+ int initial_capacity = Min<uint32_t>(kMaxInitialListCapacity, limit);

+ ZoneList<int> indices(initial_capacity);

+ if (pattern_length == 1) {

+ // Special case, go directly to fast single-character split.

+ AssertNoAllocation nogc;

+ uc16 pattern_char = pattern->Get(0);

+ if (subject->IsTwoByteRepresentation()) {

+ FindCharIndices(subject->ToUC16Vector(), pattern_char,

+ &indices,

+ limit);

+ } else if (pattern_char <= String::kMaxAsciiCharCode) {

+ FindCharIndices(subject->ToAsciiVector(),

+ static_cast<char>(pattern_char),

+ &indices,

+ limit);

+ }

+ } else {

+ if (!pattern->IsFlat()) FlattenString(pattern);

+ AssertNoAllocation nogc;

+ if (subject->IsAsciiRepresentation()) {

+ Vector<const char> subject_vector = subject->ToAsciiVector();

+ if (pattern->IsAsciiRepresentation()) {

+ FindStringIndices(subject_vector,

+ pattern->ToAsciiVector(),

+ &indices,

+ limit);

+ } else {

+ FindStringIndices(subject_vector,

+ pattern->ToUC16Vector(),

+ &indices,

+ limit);

+ }

+ } else {

+ Vector<const uc16> subject_vector = subject->ToUC16Vector();

+ if (pattern->IsAsciiRepresentation()) {

+ FindStringIndices(subject_vector,

+ pattern->ToAsciiVector(),

+ &indices,

+ limit);

+ } else {

+ FindStringIndices(subject_vector,

+ pattern->ToUC16Vector(),

+ &indices,

+ limit);

+ }

+ if (static_cast<uint32_t>(indices.length()) < limit) {

+ indices.Add(subject_length);

+ }

+ // The list indices now contains the end of each part to create.

+ // Create JSArray of substrings separated by separator.

+ int part_count = indices.length();

+ Handle<JSArray> result = Factory::NewJSArray(part_count);

+ result->set_length(Smi::FromInt(part_count));

+ ASSERT(result->HasFastElements());

+ if (part_count == 1 && indices.at(0) == subject_length) {

+ FixedArray::cast(result->elements())->set(0, *subject);

+ return *result;

+ }

+ Handle<FixedArray> elements(FixedArray::cast(result->elements()));

+ int part_start = 0;

+ for (int i = 0; i < part_count; i++) {

+ HandleScope local_loop_handle;

+ int part_end = indices.at(i);

+ Handle<String> substring =

+ Factory::NewSubString(subject, part_start, part_end);

+ elements->set(i, *substring);

+ part_start = part_end + pattern_length;

+ }

+ return *result;

// Copies ascii characters to the given fixed array looking up

// one-char strings in the cache. Gives up on the first char that is

// not in the cache and fills the remainder with smi zeros. Returns

« no previous file with comments | « src/runtime.h ('k') | src/string.js » ('j') | src/string.js » ('J')