Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(102)

Unified Diff: src/runtime.cc

Issue 42115: Faster string.replace with regexp pattern. (Closed)
Patch Set: Addressed review comments Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/runtime.h ('k') | src/string.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index bd81704df05b98a68e1c7cbe4f99c567a3411abc..df3a5628e386086ddc6c3e9243dfec7f49abae45 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1180,6 +1180,503 @@ static Object* Runtime_CharFromCode(Arguments args) {
return Heap::empty_string();
}
+// Forward declaration.
+template <typename schar>
+void StringBuilderConcatHelper(String*, StringShape, schar*, FixedArray*, int);
+
+class ReplacementStringBuilder {
+ public:
+ ReplacementStringBuilder(Handle<String> subject, int estimated_part_count)
+ : subject_(subject),
+ parts_(Factory::NewFixedArray(estimated_part_count)),
+ part_count_(0),
+ character_count_(0),
+ is_ascii_(StringShape(*subject).IsAsciiRepresentation()) {
+ // Require a non-zero initial size. Ensures that doubling the size to
+ // extend the array will work.
+ ASSERT(estimated_part_count > 0);
+ }
+
+ void AddSubjectSlice(int from, int to) {
+ ASSERT(from >= 0);
+ int length = to - from;
+ ASSERT(length >= 0);
+ if (length > 0) {
+ // Can we encode the slice in 11 bits for length and 19 bits for
+ // start position - as used by StringBuilderConcatHelper?
+ if (length < (1 << 11) && from < (1 << 19)) {
Erik Corry 2009/03/13 09:12:34 Not your fault (it's mine :-) but 11 and 19 should
Lasse Reichstein 2011/06/14 11:51:17 Fixed to use BitField<int,0,11> and BitField<int,1
+ int encoded_slice = (from << 11) + length;
+ AddElement(Smi::FromInt(encoded_slice));
+ } else {
+ Handle<String> slice = Factory::NewStringSlice(subject_, from, to);
Erik Corry 2009/03/13 09:12:34 Do we have test coverage of this branch?
Lasse Reichstein 2011/06/14 11:51:17 Probably not. Lets make sure. Some extra string.re
+ AddElement(*slice);
+ }
+ IncrementCharacterCount(length);
+ }
+ }
+
+
+ void AddString(Handle<String> string) {
+ StringShape shape(*string);
+ int length = string->length(shape);
+ if (length > 0) {
+ AddElement(*string);
+ if (!shape.IsAsciiRepresentation()) {
+ is_ascii_ = false;
+ }
+ IncrementCharacterCount(length);
+ }
+ }
+
+
+ Handle<String> ToString() {
+ if (part_count_ == 0) {
+ return Factory::empty_string();
+ }
+
+ Handle<String> joined_string;
+ if (is_ascii_) {
+ joined_string = NewRawAsciiString(character_count_);
+ AssertNoAllocation no_alloc;
+ SeqAsciiString* seq = SeqAsciiString::cast(*joined_string);
+ char* char_buffer = seq->GetChars();
+ StringBuilderConcatHelper(*subject_,
+ StringShape(*subject_),
+ char_buffer,
+ *parts_,
+ part_count_);
+ } else {
+ // Non-ASCII.
+ joined_string = NewRawTwoByteString(character_count_);
+ AssertNoAllocation no_alloc;
+ SeqTwoByteString* seq = SeqTwoByteString::cast(*joined_string);
+ uc16* char_buffer = seq->GetChars();
+ StringBuilderConcatHelper(*subject_,
+ StringShape(*subject_),
+ char_buffer,
+ *parts_,
+ part_count_);
+ }
+ return joined_string;
+ }
+
+
+ void IncrementCharacterCount(int by) {
+ if ( character_count_ > Smi::kMaxValue - by) {
+ V8::FatalProcessOutOfMemory("String.replace result too large.");
+ }
+ character_count_ += by;
+ }
+
+ private:
+
+ Handle<String> NewRawAsciiString(int size) {
+ CALL_HEAP_FUNCTION(Heap::AllocateRawAsciiString(size), String);
+ }
+
+
+ Handle<String> NewRawTwoByteString(int size) {
+ CALL_HEAP_FUNCTION(Heap::AllocateRawTwoByteString(size), String);
+ }
+
+
+ void AddElement(Object* element) {
+ ASSERT(element->IsSmi() || element->IsString());
+ // Extend parts_ array if necessary.
+ if (parts_->length() == part_count_) {
+ Handle<FixedArray> extended_array =
+ Factory::NewFixedArray(part_count_ * 2);
+ parts_->CopyTo(0, *extended_array, 0, part_count_);
+ parts_ = extended_array;
+ }
+ parts_->set(part_count_, element);
+ part_count_++;
+ }
+
+ Handle<String> subject_;
+ Handle<FixedArray> parts_;
+ int part_count_;
+ int character_count_;
+ bool is_ascii_;
+};
+
+
+class CompiledReplacement {
+ public:
+ CompiledReplacement()
+ : parts_(1), replacement_substrings_(0) {}
+
+ void Compile(Handle<String> replacement,
Erik Corry 2009/03/13 09:12:34 This method is (way!) too big to be inline in the
Lasse Reichstein 2011/06/14 11:51:17 Refactored outside. Just after, but outside.
+ int capture_count,
+ int subject_length) {
+ StringShape shape(*replacement);
+ ASSERT(replacement->IsFlat(shape));
+ if (shape.IsAsciiRepresentation()) {
+ AssertNoAllocation no_alloc;
+ Parse(&parts_,
+ replacement->ToAsciiVector(),
+ capture_count,
+ subject_length);
+ } else {
+ ASSERT(shape.IsTwoByteRepresentation());
+ AssertNoAllocation no_alloc;
+
+ Parse(&parts_,
+ replacement->ToUC16Vector(),
+ capture_count,
+ subject_length);
+ }
+ // Find substrings of replacement string and create them as String objects..
+ int substring_index = 0;
+ for (int i = 0, n = parts_.length(); i < n; i++) {
+ int tag = parts_[i].tag;
+ if (tag <= 0) { // A replacement string slice.
+ int from = -tag;
+ int to = parts_[i].data;
+ replacement_substrings_.Add(Factory::NewStringSlice(replacement,
+ from,
+ to));
+ parts_[i].tag = REPLACEMENT_SUBSTRING;
+ parts_[i].data = substring_index;
+ substring_index++;
+ } else if (tag == REPLACEMENT_STRING) {
+ replacement_substrings_.Add(replacement);
+ parts_[i].data = substring_index;
+ substring_index++;
+ }
+ }
+ }
+
+
+ void Apply(ReplacementStringBuilder* builder,
Erik Corry 2009/03/13 09:12:34 Ditto.
Lasse Reichstein 2011/06/14 11:51:17 Done.
+ int match_from,
+ int match_to,
+ Handle<JSArray> last_match_info) {
+ for (int i = 0, n = parts_.length(); i < n; i++) {
+ ReplacementPart part = parts_[i];
+ switch (part.tag) {
+ case SUBJECT_PREFIX:
+ builder->AddSubjectSlice(0, match_from);
+ break;
+ case SUBJECT_SUFFIX: {
+ int subject_length = part.data;
+ builder->AddSubjectSlice(match_to, subject_length);
+ break;
+ }
+ case SUBJECT_CAPTURE: {
+ int capture = part.data;
+ FixedArray* match_info = last_match_info->elements();
+ int from = RegExpImpl::GetCapture(match_info, capture * 2);
+ int to = RegExpImpl::GetCapture(match_info, capture * 2 + 1);
+ if (from >= 0 && to > from) {
+ builder->AddSubjectSlice(from, to);
+ }
+ break;
+ }
+ case REPLACEMENT_SUBSTRING:
+ case REPLACEMENT_STRING:
+ builder->AddString(replacement_substrings_[part.data]);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ }
+ }
+ // Number of distinct parts of the replacement pattern.
+ int parts() {
+ return parts_.length();
+ }
+ private:
+ enum PartType {
+ SUBJECT_PREFIX = 1,
+ SUBJECT_SUFFIX,
+ SUBJECT_CAPTURE,
+ REPLACEMENT_SUBSTRING,
+ REPLACEMENT_STRING,
+
+ NUMBER_OF_PART_TYPES
+ };
+
+ struct ReplacementPart {
+ static inline ReplacementPart SubjectMatch() {
+ return ReplacementPart(SUBJECT_CAPTURE, 0);
+ }
+ static inline ReplacementPart SubjectCapture(int capture_index) {
+ return ReplacementPart(SUBJECT_CAPTURE, capture_index);
+ }
+ static inline ReplacementPart SubjectPrefix() {
+ return ReplacementPart(SUBJECT_PREFIX, 0);
+ }
+ static inline ReplacementPart SubjectSuffix(int subject_length) {
+ return ReplacementPart(SUBJECT_SUFFIX, subject_length);
+ }
+ static inline ReplacementPart ReplacementString() {
+ return ReplacementPart(REPLACEMENT_STRING, 0);
+ }
+ static inline ReplacementPart ReplacementSubString(int from, int to) {
+ ASSERT(from >= 0);
+ ASSERT(to > from);
+ return ReplacementPart(-from, to);
+ }
+
+ // If tag <= 0 then it is the negation of a start index of a substring of
+ // the replacement pattern, otherwise it's a value from PartType.
+ ReplacementPart(int tag, int data)
+ : tag(tag), data(data) {
+ // Must be non-positive or a PartType value.
+ ASSERT(tag < NUMBER_OF_PART_TYPES);
+ }
+ // Either a value of PartType or a non-positive number that is
+ // the negation of an index into the replacement string.
+ int tag;
+ // The data value's interpretation depends on the value of tag:
+ // tag == SUBJECT_PREFIX ||
+ // tag == SUBJECT_SUFFIX: data is unused.
+ // tag == SUBJECT_CAPTURE: data is the number of the capture.
+ // tag == REPLACEMENT_SUBSTRING ||
+ // tag == REPLACEMENT_STRING: data is index into array of substrings
+ // of the replacement string.
+ // tag <= 0: Temporary representation of the substring of the replacement
+ // string ranging over -tag .. data.
+ // Is replaced by REPLACEMENT_{SUB,}STRING when we create the
+ // substring objects.
+ int data;
+ };
+
+ template<typename Char>
+ static void Parse(ZoneList<ReplacementPart>* parts,
+ Vector<Char> characters,
+ int capture_count,
+ int subject_length) {
+ int length = characters.length();
+ int last = 0;
+ for (int i = 0; i < length; i++) {
+ Char c = characters[i];
+ if (c == '$') {
+ int next_index = i + 1;
+ if (next_index == length) { // No next character!
+ break;
+ }
+ Char c2 = characters[next_index];
+ switch (c2) {
+ case '$':
+ if (i > last) {
+ // There is a substring before. Include the first "$".
+ parts->Add(ReplacementPart::ReplacementSubString(last, next_index));
+ last = next_index + 1; // Continue after the second "$".
+ } else {
+ // Let the next substring start with the second "$".
+ last = next_index;
+ }
+ i = next_index;
+ break;
+ case '`':
+ if (i > last) {
+ parts->Add(ReplacementPart::ReplacementSubString(last, i));
+ }
+ parts->Add(ReplacementPart::SubjectPrefix());
+ i = next_index;
+ last = i + 1;
+ break;
+ case '\'':
+ if (i > last) {
+ parts->Add(ReplacementPart::ReplacementSubString(last, i));
+ }
+ parts->Add(ReplacementPart::SubjectSuffix(subject_length));
+ i = next_index;
+ last = i + 1;
+ break;
+ case '&':
+ if (i > last) {
+ parts->Add(ReplacementPart::ReplacementSubString(last, i));
+ }
+ parts->Add(ReplacementPart::SubjectMatch());
+ i = next_index;
+ last = i + 1;
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ int capture_ref = c2 - '0';
+ if (capture_ref > capture_count) {
+ i = next_index;
+ continue;
+ }
+ int second_digit_index = next_index + 1;
+ if (second_digit_index < length) {
+ // Peek ahead to see if we have two digits.
+ Char c3 = characters[second_digit_index];
+ if ('0' <= c3 && c3 <= '9') { // Double digits.
+ int double_digit_ref = capture_ref * 10 + c3 - '0';
+ if (double_digit_ref <= capture_count) {
+ next_index = second_digit_index;
+ capture_ref = double_digit_ref;
+ }
+ }
+ }
+ if (capture_ref > 0) {
+ if (i > last) {
+ parts->Add(ReplacementPart::ReplacementSubString(last, i));
+ }
+ parts->Add(ReplacementPart::SubjectCapture(capture_ref));
+ last = next_index + 1;
+ }
+ i = next_index;
+ break;
+ }
+ default:
+ i = next_index;
+ break;
+ }
+ }
+ }
+ if (length > last) {
+ if (last == 0) {
+ parts->Add(ReplacementPart::ReplacementString());
+ } else {
+ parts->Add(ReplacementPart::ReplacementSubString(last, length));
+ }
+ }
+ }
+
+ ZoneList<ReplacementPart> parts_;
+ ZoneList<Handle<String> > replacement_substrings_;
+};
+
+
+static Object* StringReplaceRegExpWithString(String* subject,
+ JSRegExp* regexp,
+ String* replacement,
+ JSArray* last_match_info) {
+ ASSERT(subject->IsFlat(StringShape(subject)));
+ ASSERT(replacement->IsFlat(StringShape(replacement)));
+
+ HandleScope handles;
+
+ int length = subject->length();
+ Handle<String> subject_handle(subject);
+ Handle<JSRegExp> regexp_handle(regexp);
+ Handle<String> replacement_handle(replacement);
+ Handle<JSArray> last_match_info_handle(last_match_info);
+ Handle<Object> match = RegExpImpl::Exec(regexp_handle,
+ subject_handle,
+ 0,
+ last_match_info_handle);
+ if (match.is_null()) {
+ return Failure::Exception();
+ }
+ if (match->IsNull()) {
+ return *subject_handle;
+ }
+
+ int capture_count = regexp_handle->CaptureCount();
+
+ // CompiledReplacement uses zone allocation.
+ ZoneScope zone(DELETE_ON_EXIT);
+ CompiledReplacement compiled_replacement;
+ compiled_replacement.Compile(replacement_handle,
+ capture_count,
+ length);
+
+ bool is_global = regexp_handle->GetFlags().is_global();
+
+ // Guessing the number of parts that the final result string is build
Erik Corry 2009/03/13 09:12:34 built
Lasse Reichstein 2011/06/14 11:51:17 Done.
+ // from. Global regexps can match any number of times, so we guess
+ // conservatively.
+ int expected_parts =
+ (compiled_replacement.parts() + 1) * (is_global ? 4 : 1) + 1;
+ ReplacementStringBuilder builder(subject_handle, expected_parts);
+
+ // Index of end of last match.
+ int prev = 0;
+
+ do {
+ ASSERT(last_match_info_handle->HasFastElements());
+ FixedArray* match_info_array = last_match_info_handle->elements();
+
+ ASSERT_EQ(capture_count * 2 + 2,
+ RegExpImpl::GetLastCaptureCount(match_info_array));
+ int start = RegExpImpl::GetCapture(match_info_array, 0);
+ int end = RegExpImpl::GetCapture(match_info_array, 1);
+
+ if (prev < start) {
+ builder.AddSubjectSlice(prev, start);
+ }
+ compiled_replacement.Apply(&builder,
+ start,
+ end,
+ last_match_info_handle);
+ prev = end;
+
+ // Only continue checking for global regexps.
+ if (!is_global) break;
+
+ // Continue from where the match ended, unless it was an empty match.
+ int next = end;
+ if (start == end) {
+ next = end + 1;
+ if (next > length) break;
+ }
+
+ match = RegExpImpl::Exec(regexp_handle,
+ subject_handle,
+ next,
+ last_match_info_handle);
+ if (match.is_null()) {
+ return Failure::Exception();
+ }
+ } while (!match->IsNull());
+
+ if (prev < length) {
+ builder.AddSubjectSlice(prev, length);
+ }
+
+ return *(builder.ToString());
+}
+
+
+static Object* Runtime_StringReplaceRegExpWithString(Arguments args) {
+ ASSERT(args.length() == 4);
+
+ CONVERT_CHECKED(String, subject, args[0]);
+ StringShape subject_shape(subject);
+ if (!subject->IsFlat(subject_shape)) {
+ Object* flat_subject = subject->TryFlatten(subject_shape);
+ if (flat_subject->IsFailure()) {
+ return flat_subject;
+ }
+ subject = String::cast(flat_subject);
+ }
+
+ CONVERT_CHECKED(String, replacement, args[2]);
+ StringShape replacement_shape(replacement);
+ if (!replacement->IsFlat(replacement_shape)) {
+ Object* flat_replacement = replacement->TryFlatten(replacement_shape);
+ if (flat_replacement->IsFailure()) {
+ return flat_replacement;
+ }
+ replacement = String::cast(flat_replacement);
+ }
+
+ CONVERT_CHECKED(JSRegExp, regexp, args[1]);
+ CONVERT_CHECKED(JSArray, last_match_info, args[3]);
+
+ ASSERT(last_match_info->HasFastElements());
+
+ return StringReplaceRegExpWithString(subject,
+ regexp,
+ replacement,
+ last_match_info);
+}
+
+
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
// limit, we can fix the size of tables.
« no previous file with comments | « src/runtime.h ('k') | src/string.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698