Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(233)

Unified Diff: src/runtime.cc

Issue 42115: Faster string.replace with regexp pattern. (Closed)
Patch Set: Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index bd81704df05b98a68e1c7cbe4f99c567a3411abc..0144b122c75e2cb2c643250cc3e5ae573a6d2f5c 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1180,6 +1180,336 @@ static Object* Runtime_CharFromCode(Arguments args) {
return Heap::empty_string();
}
+class ReplacementStringBuilder {
Erik Corry 2009/03/12 10:13:00 I think you should use StringBuilderConcatHelper i
Lasse Reichstein 2009/03/13 08:36:51 Done.
+ public:
+ ReplacementStringBuilder() : string_() {}
+
+ void Add(Handle<String> string) {
+ StringShape shape(*string);
+ if (string->length(shape) > 0) {
+ if (string_.is_null()) {
+ string_ = string;
+ } else {
+ string_ =
+ Factory::NewConsString(string_, StringShape(*string_),
+ string, shape);
+ }
+ }
+ }
+
+ Handle<String> ToString() {
+ if (string_.is_null()) {
+ return Factory::empty_string();
+ }
+ return string_;
+ }
+ private:
+ Handle<String> string_;
+};
+
+
+class CompiledReplacement {
+ public:
+ CompiledReplacement(Handle<String> replacement, int capture_count)
+ : parts_(1), replacement_substrings_(0) {
+ ParseReplacement(replacement, capture_count);
Erik Corry 2009/03/12 10:13:00 Nontrivial work shouldn't be in constructors.
Lasse Reichstein 2009/03/13 08:36:51 Done.
+ }
+ void Apply(ReplacementStringBuilder* builder,
+ int match_from,
+ int match_to,
+ Handle<String> subject,
+ Handle<JSArray> last_match_info) {
+ for (int i = 0, n = parts_.length(); i < n; i++) {
+ ReplacementPart part = parts_[i];
+ switch (part.tag) {
+ case SUBJECT_MATCH:
+ builder->Add(Factory::NewStringSlice(subject, match_from, match_to));
+ break;
+ case SUBJECT_PREFIX:
+ builder->Add(Factory::NewStringSlice(subject, 0, match_from));
+ break;
+ case SUBJECT_SUFFIX: {
+ int length = subject->length();
+ builder->Add(Factory::NewStringSlice(subject, match_to, length));
+ break;
+ }
+ case SUBJECT_CAPTURE: {
+ int capture = part.data;
+ FixedArray* match_info = last_match_info->elements();
Erik Corry 2009/03/12 10:13:00 Move outside loop? Actually, you should move this
Lasse Reichstein 2009/03/13 08:36:51 As commented later, it can't be moved that far out
+ int from = RegExpImpl::GetCapture(match_info, capture * 2);
+ int to = RegExpImpl::GetCapture(match_info, capture * 2 + 1);
+ builder->Add(Factory::NewStringSlice(subject, from, to));
+ break;
+ }
+ case REPLACEMENT_SUBSTRING:
+ builder->Add(replacement_substrings_[part.data]);
+ break;
+ default:
+ UNREACHABLE();
+ }
+ }
+ }
+ private:
+ enum PartType {
+ SUBJECT_MATCH = -1,
Erik Corry 2009/03/12 10:13:00 Subject match seems like a special case of subject
Lasse Reichstein 2009/03/13 08:36:51 True. Slightly faster to implement since I already
+ SUBJECT_PREFIX = -2,
+ SUBJECT_SUFFIX = -3,
+ SUBJECT_CAPTURE = -4,
+ REPLACEMENT_SUBSTRING = -5
Erik Corry 2009/03/12 10:13:00 I'd prefer these to be positive and the offsets ne
Lasse Reichstein 2009/03/13 08:36:51 Done.
+ };
+
+ struct ReplacementPart {
+ // If tag >= 0 then it is the start index of a substring of the replacement
+ // pattern.
+ ReplacementPart(int tag, int data)
+ : tag(tag), data(data) {}
+ int tag;
+ int data;
Erik Corry 2009/03/12 10:13:00 'data' is too generic a name, there's no comment t
Lasse Reichstein 2009/03/13 08:36:51 Comments added.
+ };
+
+ template<typename Char>
+ static void Parse(ZoneList<ReplacementPart>* parts,
+ Vector<Char> characters,
+ int capture_count) {
+ int length = characters.length();
+ int last = 0;
+ for (int i = 0; i < length; i++) {
+ Char c = characters[i];
+ if (c == '$') {
+ int next_index = i + 1;
+ if (next_index == length) { // No next character!
+ break;
+ }
+ Char c2 = characters[next_index];
+ switch (c2) {
+ case '$':
+ if (i > last) {
+ // Up to and including the first "$".
+ parts->Add(ReplacementPart(last, next_index));
+ last = next_index + 1; // Continue after the second "$".
+ } else {
+ last = next_index; // Continue with the second "$".
+ }
+ i = next_index;
+ break;
+ case '`':
+ if (i > last) {
+ parts->Add(ReplacementPart(last, i));
Erik Corry 2009/03/12 10:13:00 Perhaps parts->Add could check for last == i to sa
Lasse Reichstein 2009/03/13 08:36:51 Parts is just a ZoneList, and this is a self-conta
+ }
+ parts->Add(ReplacementPart(SUBJECT_PREFIX, 0));
+ i = next_index;
+ last = i + 1;
+ break;
+ case '\'':
+ if (i > last) {
+ parts->Add(ReplacementPart(last, i));
+ }
+ parts->Add(ReplacementPart(SUBJECT_SUFFIX, 0));
+ i = next_index;
+ last = i + 1;
+ break;
+ case '&':
+ if (i > last) {
+ parts->Add(ReplacementPart(last, i));
+ }
+ parts->Add(ReplacementPart(SUBJECT_MATCH, 0));
+ i = next_index;
+ last = i + 1;
+ break;
+ case '0':
Erik Corry 2009/03/12 10:13:00 Are $0 and $04 allowed? Should $0 be taken verbat
Lasse Reichstein 2009/03/13 08:36:51 Yes and yes.
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ int capture_ref = c2 - '0';
+ if (capture_ref > capture_count) {
+ i = next_index;
+ continue;
+ }
+ int second_digit_index = next_index + 1;
+ if (second_digit_index < length) {
+ // Peek ahead to see if we have two digits.
+ Char c3 = characters[second_digit_index];
+ if ('0' <= c3 && c3 <= '9') { // Double digits.
+ int double_digit_ref = capture_ref * 10 + c3 - '0';
+ if (double_digit_ref <= capture_count) {
+ next_index = second_digit_index;
+ capture_ref = double_digit_ref;
+ }
+ }
+ }
+ if (capture_ref > 0) {
+ if (i > last) {
+ parts->Add(ReplacementPart(last, i));
+ }
+ parts->Add(ReplacementPart(SUBJECT_CAPTURE, capture_ref));
+ last = next_index + 1;
+ }
+ i = next_index;
+ break;
+ }
+ default:
Erik Corry 2009/03/12 10:13:00 Should $F00 replace with $FOO or FOO?
Lasse Reichstein 2009/03/13 08:36:51 With "$F00".
+ i = next_index;
+ break;
+ }
+ }
+ }
+ if (length > last) {
+ parts->Add(ReplacementPart(last, length));
+ }
+ }
+
+ void ParseReplacement(Handle<String> replacement, int capture_count) {
+ StringShape shape(*replacement);
+ ASSERT(replacement->IsFlat(shape));
+ if (shape.IsAsciiRepresentation()) {
+ Parse(&parts_, replacement->ToAsciiVector(), capture_count);
+ } else {
+ ASSERT(shape.IsTwoByteRepresentation());
+ Parse(&parts_, replacement->ToUC16Vector(), capture_count);
+ }
+ // Find substrings of replacement string and create them as String objcts..
+ int index = 0;
+ for (int i = 0, n = parts_.length(); i < n; i++) {
+ int tag = parts_[i].tag;
+ if (tag >= 0) { // A replacement string slice.
+ int end = parts_[i].data;
+ replacement_substrings_.Add(Factory::NewStringSlice(replacement,
+ tag,
+ end));
+ parts_[i].tag = REPLACEMENT_SUBSTRING;
+ parts_[i].data = index;
+ index++;
+ }
+ }
+ }
+
+ ZoneList<ReplacementPart> parts_;
+ ZoneList<Handle<String> > replacement_substrings_;
+};
+
+
+static Object* StringReplaceRegExpWithString(String* subject,
+ JSRegExp* regexp,
+ String* replacement,
+ JSArray* last_match_info) {
+ ASSERT(subject->IsFlat(StringShape(subject)));
+ ASSERT(replacement->IsFlat(StringShape(replacement)));
+
+ HandleScope handles;
+
+ int length = subject->length();
+ Handle<String> subject_handle(subject);
+ Handle<JSRegExp> regexp_handle(regexp);
+ Handle<String> replacement_handle(replacement);
+ Handle<JSArray> last_match_info_handle(last_match_info);
+ Handle<Object> match = RegExpImpl::Exec(regexp_handle,
+ subject_handle,
+ 0,
+ last_match_info_handle);
+ if (match.is_null()) {
+ return Failure::Exception();
+ }
+ if (match->IsNull()) {
+ return *subject_handle;
+ }
+
+ int capture_count = regexp_handle->CaptureCount();
+
+ // CompiledReplacement uses zone allocation.
+ ZoneScope zone(DELETE_ON_EXIT);
+ CompiledReplacement compiled_replacement(replacement_handle, capture_count);
+ ReplacementStringBuilder builder;
+ bool is_global = regexp_handle->GetFlags().is_global();
+
+ // Index of end of last match.
+ int prev = 0;
+
+ do {
+ ASSERT(last_match_info_handle->HasFastElements());
+ FixedArray* match_data = last_match_info_handle->elements();
Erik Corry 2009/03/12 10:13:00 Not using a handle is a little dodgy. If you move
Lasse Reichstein 2009/03/13 08:36:51 Won't work. The loop might be preempted (since it
Erik Corry 2009/03/13 09:12:34 If you retain a reference to the old FixedArray th
+ ASSERT_EQ(capture_count * 2 + 2,
+ Smi::cast(
+ match_data->get(RegExpImpl::kLastCaptureCount))->value());
+
+ int start = RegExpImpl::GetCapture(match_data, 0);
+ int end = RegExpImpl::GetCapture(match_data, 1);
+
+ if (prev < start) {
+ builder.Add(Factory::NewStringSlice(subject_handle, prev, start));
+ }
+ compiled_replacement.Apply(&builder,
+ start,
+ end,
+ subject_handle,
+ last_match_info_handle);
+ prev = end;
+
+ // Only continue checking for global regexps.
+ if (!is_global) break;
+
+ // Continue from where the match ended, unless it was an empty match.
+ int next = end;
+ if (start == end) {
+ next = end + 1;
+ if (next > length) break;
+ }
+
+ match = RegExpImpl::Exec(regexp_handle,
+ subject_handle,
+ next,
+ last_match_info_handle);
+ if (match.is_null()) {
+ return Failure::Exception();
+ }
+ } while (!match->IsNull());
+
+ if (prev < length) {
+ builder.Add(Factory::NewStringSlice(subject_handle, prev, length));
+ }
+
+ return *(builder.ToString());
+}
+
+
+static Object* Runtime_StringReplaceRegExpWithString(Arguments args) {
+ ASSERT(args.length() == 4);
+
+ CONVERT_CHECKED(String, subject, args[0]);
+ StringShape subject_shape(subject);
+ if (!subject->IsFlat(subject_shape)) {
+ Object* flat_subject = subject->TryFlatten(subject_shape);
+ if (flat_subject->IsFailure()) {
+ return flat_subject;
+ }
+ subject = String::cast(flat_subject);
+ }
+
+ CONVERT_CHECKED(String, replacement, args[2]);
+ StringShape replacement_shape(replacement);
+ if (!replacement->IsFlat(replacement_shape)) {
+ Object* flat_replacement = replacement->TryFlatten(replacement_shape);
+ if (flat_replacement->IsFailure()) {
+ return flat_replacement;
+ }
+ replacement = String::cast(flat_replacement);
+ }
+
+ CONVERT_CHECKED(JSRegExp, regexp, args[1]);
+ CONVERT_CHECKED(JSArray, last_match_info, args[3]);
Erik Corry 2009/03/12 10:13:00 You need to check here that the last_match_info ha
Lasse Reichstein 2009/03/13 08:36:51 I only ever use last_match_info after having execu
+
+ return StringReplaceRegExpWithString(subject,
+ regexp,
+ replacement,
+ last_match_info);
+}
+
+
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
// limit, we can fix the size of tables.

Powered by Google App Engine
This is Rietveld 408576698