Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(210)

Side by Side Diff: src/runtime.cc

Issue 42115: Faster string.replace with regexp pattern. (Closed)
Patch Set: Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1162 matching lines...) Expand 10 before | Expand all | Expand 10 after
1173 ASSERT(args.length() == 1); 1173 ASSERT(args.length() == 1);
1174 uint32_t code; 1174 uint32_t code;
1175 if (Array::IndexFromObject(args[0], &code)) { 1175 if (Array::IndexFromObject(args[0], &code)) {
1176 if (code <= 0xffff) { 1176 if (code <= 0xffff) {
1177 return Heap::LookupSingleCharacterStringFromCode(code); 1177 return Heap::LookupSingleCharacterStringFromCode(code);
1178 } 1178 }
1179 } 1179 }
1180 return Heap::empty_string(); 1180 return Heap::empty_string();
1181 } 1181 }
1182 1182
1183 class ReplacementStringBuilder {
Erik Corry 2009/03/12 10:13:00 I think you should use StringBuilderConcatHelper i
Lasse Reichstein 2009/03/13 08:36:51 Done.
1184 public:
1185 ReplacementStringBuilder() : string_() {}
1186
1187 void Add(Handle<String> string) {
1188 StringShape shape(*string);
1189 if (string->length(shape) > 0) {
1190 if (string_.is_null()) {
1191 string_ = string;
1192 } else {
1193 string_ =
1194 Factory::NewConsString(string_, StringShape(*string_),
1195 string, shape);
1196 }
1197 }
1198 }
1199
1200 Handle<String> ToString() {
1201 if (string_.is_null()) {
1202 return Factory::empty_string();
1203 }
1204 return string_;
1205 }
1206 private:
1207 Handle<String> string_;
1208 };
1209
1210
1211 class CompiledReplacement {
1212 public:
1213 CompiledReplacement(Handle<String> replacement, int capture_count)
1214 : parts_(1), replacement_substrings_(0) {
1215 ParseReplacement(replacement, capture_count);
Erik Corry 2009/03/12 10:13:00 Nontrivial work shouldn't be in constructors.
Lasse Reichstein 2009/03/13 08:36:51 Done.
1216 }
1217 void Apply(ReplacementStringBuilder* builder,
1218 int match_from,
1219 int match_to,
1220 Handle<String> subject,
1221 Handle<JSArray> last_match_info) {
1222 for (int i = 0, n = parts_.length(); i < n; i++) {
1223 ReplacementPart part = parts_[i];
1224 switch (part.tag) {
1225 case SUBJECT_MATCH:
1226 builder->Add(Factory::NewStringSlice(subject, match_from, match_to));
1227 break;
1228 case SUBJECT_PREFIX:
1229 builder->Add(Factory::NewStringSlice(subject, 0, match_from));
1230 break;
1231 case SUBJECT_SUFFIX: {
1232 int length = subject->length();
1233 builder->Add(Factory::NewStringSlice(subject, match_to, length));
1234 break;
1235 }
1236 case SUBJECT_CAPTURE: {
1237 int capture = part.data;
1238 FixedArray* match_info = last_match_info->elements();
Erik Corry 2009/03/12 10:13:00 Move outside loop? Actually, you should move this
Lasse Reichstein 2009/03/13 08:36:51 As commented later, it can't be moved that far out
1239 int from = RegExpImpl::GetCapture(match_info, capture * 2);
1240 int to = RegExpImpl::GetCapture(match_info, capture * 2 + 1);
1241 builder->Add(Factory::NewStringSlice(subject, from, to));
1242 break;
1243 }
1244 case REPLACEMENT_SUBSTRING:
1245 builder->Add(replacement_substrings_[part.data]);
1246 break;
1247 default:
1248 UNREACHABLE();
1249 }
1250 }
1251 }
1252 private:
1253 enum PartType {
1254 SUBJECT_MATCH = -1,
Erik Corry 2009/03/12 10:13:00 Subject match seems like a special case of subject
Lasse Reichstein 2009/03/13 08:36:51 True. Slightly faster to implement since I already
1255 SUBJECT_PREFIX = -2,
1256 SUBJECT_SUFFIX = -3,
1257 SUBJECT_CAPTURE = -4,
1258 REPLACEMENT_SUBSTRING = -5
Erik Corry 2009/03/12 10:13:00 I'd prefer these to be positive and the offsets ne
Lasse Reichstein 2009/03/13 08:36:51 Done.
1259 };
1260
1261 struct ReplacementPart {
1262 // If tag >= 0 then it is the start index of a substring of the replacement
1263 // pattern.
1264 ReplacementPart(int tag, int data)
1265 : tag(tag), data(data) {}
1266 int tag;
1267 int data;
Erik Corry 2009/03/12 10:13:00 'data' is too generic a name, there's no comment t
Lasse Reichstein 2009/03/13 08:36:51 Comments added.
1268 };
1269
1270 template<typename Char>
1271 static void Parse(ZoneList<ReplacementPart>* parts,
1272 Vector<Char> characters,
1273 int capture_count) {
1274 int length = characters.length();
1275 int last = 0;
1276 for (int i = 0; i < length; i++) {
1277 Char c = characters[i];
1278 if (c == '$') {
1279 int next_index = i + 1;
1280 if (next_index == length) { // No next character!
1281 break;
1282 }
1283 Char c2 = characters[next_index];
1284 switch (c2) {
1285 case '$':
1286 if (i > last) {
1287 // Up to and including the first "$".
1288 parts->Add(ReplacementPart(last, next_index));
1289 last = next_index + 1; // Continue after the second "$".
1290 } else {
1291 last = next_index; // Continue with the second "$".
1292 }
1293 i = next_index;
1294 break;
1295 case '`':
1296 if (i > last) {
1297 parts->Add(ReplacementPart(last, i));
Erik Corry 2009/03/12 10:13:00 Perhaps parts->Add could check for last == i to sa
Lasse Reichstein 2009/03/13 08:36:51 Parts is just a ZoneList, and this is a self-conta
1298 }
1299 parts->Add(ReplacementPart(SUBJECT_PREFIX, 0));
1300 i = next_index;
1301 last = i + 1;
1302 break;
1303 case '\'':
1304 if (i > last) {
1305 parts->Add(ReplacementPart(last, i));
1306 }
1307 parts->Add(ReplacementPart(SUBJECT_SUFFIX, 0));
1308 i = next_index;
1309 last = i + 1;
1310 break;
1311 case '&':
1312 if (i > last) {
1313 parts->Add(ReplacementPart(last, i));
1314 }
1315 parts->Add(ReplacementPart(SUBJECT_MATCH, 0));
1316 i = next_index;
1317 last = i + 1;
1318 break;
1319 case '0':
Erik Corry 2009/03/12 10:13:00 Are $0 and $04 allowed? Should $0 be taken verbat
Lasse Reichstein 2009/03/13 08:36:51 Yes and yes.
1320 case '1':
1321 case '2':
1322 case '3':
1323 case '4':
1324 case '5':
1325 case '6':
1326 case '7':
1327 case '8':
1328 case '9': {
1329 int capture_ref = c2 - '0';
1330 if (capture_ref > capture_count) {
1331 i = next_index;
1332 continue;
1333 }
1334 int second_digit_index = next_index + 1;
1335 if (second_digit_index < length) {
1336 // Peek ahead to see if we have two digits.
1337 Char c3 = characters[second_digit_index];
1338 if ('0' <= c3 && c3 <= '9') { // Double digits.
1339 int double_digit_ref = capture_ref * 10 + c3 - '0';
1340 if (double_digit_ref <= capture_count) {
1341 next_index = second_digit_index;
1342 capture_ref = double_digit_ref;
1343 }
1344 }
1345 }
1346 if (capture_ref > 0) {
1347 if (i > last) {
1348 parts->Add(ReplacementPart(last, i));
1349 }
1350 parts->Add(ReplacementPart(SUBJECT_CAPTURE, capture_ref));
1351 last = next_index + 1;
1352 }
1353 i = next_index;
1354 break;
1355 }
1356 default:
Erik Corry 2009/03/12 10:13:00 Should $F00 replace with $FOO or FOO?
Lasse Reichstein 2009/03/13 08:36:51 With "$F00".
1357 i = next_index;
1358 break;
1359 }
1360 }
1361 }
1362 if (length > last) {
1363 parts->Add(ReplacementPart(last, length));
1364 }
1365 }
1366
1367 void ParseReplacement(Handle<String> replacement, int capture_count) {
1368 StringShape shape(*replacement);
1369 ASSERT(replacement->IsFlat(shape));
1370 if (shape.IsAsciiRepresentation()) {
1371 Parse(&parts_, replacement->ToAsciiVector(), capture_count);
1372 } else {
1373 ASSERT(shape.IsTwoByteRepresentation());
1374 Parse(&parts_, replacement->ToUC16Vector(), capture_count);
1375 }
1376 // Find substrings of replacement string and create them as String objcts..
1377 int index = 0;
1378 for (int i = 0, n = parts_.length(); i < n; i++) {
1379 int tag = parts_[i].tag;
1380 if (tag >= 0) { // A replacement string slice.
1381 int end = parts_[i].data;
1382 replacement_substrings_.Add(Factory::NewStringSlice(replacement,
1383 tag,
1384 end));
1385 parts_[i].tag = REPLACEMENT_SUBSTRING;
1386 parts_[i].data = index;
1387 index++;
1388 }
1389 }
1390 }
1391
1392 ZoneList<ReplacementPart> parts_;
1393 ZoneList<Handle<String> > replacement_substrings_;
1394 };
1395
1396
1397 static Object* StringReplaceRegExpWithString(String* subject,
1398 JSRegExp* regexp,
1399 String* replacement,
1400 JSArray* last_match_info) {
1401 ASSERT(subject->IsFlat(StringShape(subject)));
1402 ASSERT(replacement->IsFlat(StringShape(replacement)));
1403
1404 HandleScope handles;
1405
1406 int length = subject->length();
1407 Handle<String> subject_handle(subject);
1408 Handle<JSRegExp> regexp_handle(regexp);
1409 Handle<String> replacement_handle(replacement);
1410 Handle<JSArray> last_match_info_handle(last_match_info);
1411 Handle<Object> match = RegExpImpl::Exec(regexp_handle,
1412 subject_handle,
1413 0,
1414 last_match_info_handle);
1415 if (match.is_null()) {
1416 return Failure::Exception();
1417 }
1418 if (match->IsNull()) {
1419 return *subject_handle;
1420 }
1421
1422 int capture_count = regexp_handle->CaptureCount();
1423
1424 // CompiledReplacement uses zone allocation.
1425 ZoneScope zone(DELETE_ON_EXIT);
1426 CompiledReplacement compiled_replacement(replacement_handle, capture_count);
1427 ReplacementStringBuilder builder;
1428 bool is_global = regexp_handle->GetFlags().is_global();
1429
1430 // Index of end of last match.
1431 int prev = 0;
1432
1433 do {
1434 ASSERT(last_match_info_handle->HasFastElements());
1435 FixedArray* match_data = last_match_info_handle->elements();
Erik Corry 2009/03/12 10:13:00 Not using a handle is a little dodgy. If you move
Lasse Reichstein 2009/03/13 08:36:51 Won't work. The loop might be preempted (since it
Erik Corry 2009/03/13 09:12:34 If you retain a reference to the old FixedArray th
1436 ASSERT_EQ(capture_count * 2 + 2,
1437 Smi::cast(
1438 match_data->get(RegExpImpl::kLastCaptureCount))->value());
1439
1440 int start = RegExpImpl::GetCapture(match_data, 0);
1441 int end = RegExpImpl::GetCapture(match_data, 1);
1442
1443 if (prev < start) {
1444 builder.Add(Factory::NewStringSlice(subject_handle, prev, start));
1445 }
1446 compiled_replacement.Apply(&builder,
1447 start,
1448 end,
1449 subject_handle,
1450 last_match_info_handle);
1451 prev = end;
1452
1453 // Only continue checking for global regexps.
1454 if (!is_global) break;
1455
1456 // Continue from where the match ended, unless it was an empty match.
1457 int next = end;
1458 if (start == end) {
1459 next = end + 1;
1460 if (next > length) break;
1461 }
1462
1463 match = RegExpImpl::Exec(regexp_handle,
1464 subject_handle,
1465 next,
1466 last_match_info_handle);
1467 if (match.is_null()) {
1468 return Failure::Exception();
1469 }
1470 } while (!match->IsNull());
1471
1472 if (prev < length) {
1473 builder.Add(Factory::NewStringSlice(subject_handle, prev, length));
1474 }
1475
1476 return *(builder.ToString());
1477 }
1478
1479
1480 static Object* Runtime_StringReplaceRegExpWithString(Arguments args) {
1481 ASSERT(args.length() == 4);
1482
1483 CONVERT_CHECKED(String, subject, args[0]);
1484 StringShape subject_shape(subject);
1485 if (!subject->IsFlat(subject_shape)) {
1486 Object* flat_subject = subject->TryFlatten(subject_shape);
1487 if (flat_subject->IsFailure()) {
1488 return flat_subject;
1489 }
1490 subject = String::cast(flat_subject);
1491 }
1492
1493 CONVERT_CHECKED(String, replacement, args[2]);
1494 StringShape replacement_shape(replacement);
1495 if (!replacement->IsFlat(replacement_shape)) {
1496 Object* flat_replacement = replacement->TryFlatten(replacement_shape);
1497 if (flat_replacement->IsFailure()) {
1498 return flat_replacement;
1499 }
1500 replacement = String::cast(flat_replacement);
1501 }
1502
1503 CONVERT_CHECKED(JSRegExp, regexp, args[1]);
1504 CONVERT_CHECKED(JSArray, last_match_info, args[3]);
Erik Corry 2009/03/12 10:13:00 You need to check here that the last_match_info ha
Lasse Reichstein 2009/03/13 08:36:51 I only ever use last_match_info after having execu
1505
1506 return StringReplaceRegExpWithString(subject,
1507 regexp,
1508 replacement,
1509 last_match_info);
1510 }
1511
1512
1183 1513
1184 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 1514 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
1185 // limit, we can fix the size of tables. 1515 // limit, we can fix the size of tables.
1186 static const int kBMMaxShift = 0xff; 1516 static const int kBMMaxShift = 0xff;
1187 // Reduce alphabet to this size. 1517 // Reduce alphabet to this size.
1188 static const int kBMAlphabetSize = 0x100; 1518 static const int kBMAlphabetSize = 0x100;
1189 // For patterns below this length, the skip length of Boyer-Moore is too short 1519 // For patterns below this length, the skip length of Boyer-Moore is too short
1190 // to compensate for the algorithmic overhead compared to simple brute force. 1520 // to compensate for the algorithmic overhead compared to simple brute force.
1191 static const int kBMMinPatternLength = 5; 1521 static const int kBMMinPatternLength = 5;
1192 1522
(...skipping 5004 matching lines...) Expand 10 before | Expand all | Expand 10 after
6197 } else { 6527 } else {
6198 // Handle last resort GC and make sure to allow future allocations 6528 // Handle last resort GC and make sure to allow future allocations
6199 // to grow the heap without causing GCs (if possible). 6529 // to grow the heap without causing GCs (if possible).
6200 Counters::gc_last_resort_from_js.Increment(); 6530 Counters::gc_last_resort_from_js.Increment();
6201 Heap::CollectAllGarbage(); 6531 Heap::CollectAllGarbage();
6202 } 6532 }
6203 } 6533 }
6204 6534
6205 6535
6206 } } // namespace v8::internal 6536 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698