Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(120)

Side by Side Diff: src/runtime.cc

Issue 42115: Faster string.replace with regexp pattern. (Closed)
Patch Set: Addressed review comments Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/runtime.h ('k') | src/string.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1162 matching lines...) Expand 10 before | Expand all | Expand 10 after
1173 ASSERT(args.length() == 1); 1173 ASSERT(args.length() == 1);
1174 uint32_t code; 1174 uint32_t code;
1175 if (Array::IndexFromObject(args[0], &code)) { 1175 if (Array::IndexFromObject(args[0], &code)) {
1176 if (code <= 0xffff) { 1176 if (code <= 0xffff) {
1177 return Heap::LookupSingleCharacterStringFromCode(code); 1177 return Heap::LookupSingleCharacterStringFromCode(code);
1178 } 1178 }
1179 } 1179 }
1180 return Heap::empty_string(); 1180 return Heap::empty_string();
1181 } 1181 }
1182 1182
1183 // Forward declaration.
1184 template <typename schar>
1185 void StringBuilderConcatHelper(String*, StringShape, schar*, FixedArray*, int);
1186
1187 class ReplacementStringBuilder {
1188 public:
1189 ReplacementStringBuilder(Handle<String> subject, int estimated_part_count)
1190 : subject_(subject),
1191 parts_(Factory::NewFixedArray(estimated_part_count)),
1192 part_count_(0),
1193 character_count_(0),
1194 is_ascii_(StringShape(*subject).IsAsciiRepresentation()) {
1195 // Require a non-zero initial size. Ensures that doubling the size to
1196 // extend the array will work.
1197 ASSERT(estimated_part_count > 0);
1198 }
1199
1200 void AddSubjectSlice(int from, int to) {
1201 ASSERT(from >= 0);
1202 int length = to - from;
1203 ASSERT(length >= 0);
1204 if (length > 0) {
1205 // Can we encode the slice in 11 bits for length and 19 bits for
1206 // start position - as used by StringBuilderConcatHelper?
1207 if (length < (1 << 11) && from < (1 << 19)) {
Erik Corry 2009/03/13 09:12:34 Not your fault (it's mine :-) but 11 and 19 should
Lasse Reichstein 2011/06/14 11:51:17 Fixed to use BitField<int,0,11> and BitField<int,1
1208 int encoded_slice = (from << 11) + length;
1209 AddElement(Smi::FromInt(encoded_slice));
1210 } else {
1211 Handle<String> slice = Factory::NewStringSlice(subject_, from, to);
Erik Corry 2009/03/13 09:12:34 Do we have test coverage of this branch?
Lasse Reichstein 2011/06/14 11:51:17 Probably not. Lets make sure. Some extra string.re
1212 AddElement(*slice);
1213 }
1214 IncrementCharacterCount(length);
1215 }
1216 }
1217
1218
1219 void AddString(Handle<String> string) {
1220 StringShape shape(*string);
1221 int length = string->length(shape);
1222 if (length > 0) {
1223 AddElement(*string);
1224 if (!shape.IsAsciiRepresentation()) {
1225 is_ascii_ = false;
1226 }
1227 IncrementCharacterCount(length);
1228 }
1229 }
1230
1231
1232 Handle<String> ToString() {
1233 if (part_count_ == 0) {
1234 return Factory::empty_string();
1235 }
1236
1237 Handle<String> joined_string;
1238 if (is_ascii_) {
1239 joined_string = NewRawAsciiString(character_count_);
1240 AssertNoAllocation no_alloc;
1241 SeqAsciiString* seq = SeqAsciiString::cast(*joined_string);
1242 char* char_buffer = seq->GetChars();
1243 StringBuilderConcatHelper(*subject_,
1244 StringShape(*subject_),
1245 char_buffer,
1246 *parts_,
1247 part_count_);
1248 } else {
1249 // Non-ASCII.
1250 joined_string = NewRawTwoByteString(character_count_);
1251 AssertNoAllocation no_alloc;
1252 SeqTwoByteString* seq = SeqTwoByteString::cast(*joined_string);
1253 uc16* char_buffer = seq->GetChars();
1254 StringBuilderConcatHelper(*subject_,
1255 StringShape(*subject_),
1256 char_buffer,
1257 *parts_,
1258 part_count_);
1259 }
1260 return joined_string;
1261 }
1262
1263
1264 void IncrementCharacterCount(int by) {
1265 if ( character_count_ > Smi::kMaxValue - by) {
1266 V8::FatalProcessOutOfMemory("String.replace result too large.");
1267 }
1268 character_count_ += by;
1269 }
1270
1271 private:
1272
1273 Handle<String> NewRawAsciiString(int size) {
1274 CALL_HEAP_FUNCTION(Heap::AllocateRawAsciiString(size), String);
1275 }
1276
1277
1278 Handle<String> NewRawTwoByteString(int size) {
1279 CALL_HEAP_FUNCTION(Heap::AllocateRawTwoByteString(size), String);
1280 }
1281
1282
1283 void AddElement(Object* element) {
1284 ASSERT(element->IsSmi() || element->IsString());
1285 // Extend parts_ array if necessary.
1286 if (parts_->length() == part_count_) {
1287 Handle<FixedArray> extended_array =
1288 Factory::NewFixedArray(part_count_ * 2);
1289 parts_->CopyTo(0, *extended_array, 0, part_count_);
1290 parts_ = extended_array;
1291 }
1292 parts_->set(part_count_, element);
1293 part_count_++;
1294 }
1295
1296 Handle<String> subject_;
1297 Handle<FixedArray> parts_;
1298 int part_count_;
1299 int character_count_;
1300 bool is_ascii_;
1301 };
1302
1303
1304 class CompiledReplacement {
1305 public:
1306 CompiledReplacement()
1307 : parts_(1), replacement_substrings_(0) {}
1308
1309 void Compile(Handle<String> replacement,
Erik Corry 2009/03/13 09:12:34 This method is (way!) too big to be inline in the
Lasse Reichstein 2011/06/14 11:51:17 Refactored outside. Just after, but outside.
1310 int capture_count,
1311 int subject_length) {
1312 StringShape shape(*replacement);
1313 ASSERT(replacement->IsFlat(shape));
1314 if (shape.IsAsciiRepresentation()) {
1315 AssertNoAllocation no_alloc;
1316 Parse(&parts_,
1317 replacement->ToAsciiVector(),
1318 capture_count,
1319 subject_length);
1320 } else {
1321 ASSERT(shape.IsTwoByteRepresentation());
1322 AssertNoAllocation no_alloc;
1323
1324 Parse(&parts_,
1325 replacement->ToUC16Vector(),
1326 capture_count,
1327 subject_length);
1328 }
1329 // Find substrings of replacement string and create them as String objects..
1330 int substring_index = 0;
1331 for (int i = 0, n = parts_.length(); i < n; i++) {
1332 int tag = parts_[i].tag;
1333 if (tag <= 0) { // A replacement string slice.
1334 int from = -tag;
1335 int to = parts_[i].data;
1336 replacement_substrings_.Add(Factory::NewStringSlice(replacement,
1337 from,
1338 to));
1339 parts_[i].tag = REPLACEMENT_SUBSTRING;
1340 parts_[i].data = substring_index;
1341 substring_index++;
1342 } else if (tag == REPLACEMENT_STRING) {
1343 replacement_substrings_.Add(replacement);
1344 parts_[i].data = substring_index;
1345 substring_index++;
1346 }
1347 }
1348 }
1349
1350
1351 void Apply(ReplacementStringBuilder* builder,
Erik Corry 2009/03/13 09:12:34 Ditto.
Lasse Reichstein 2011/06/14 11:51:17 Done.
1352 int match_from,
1353 int match_to,
1354 Handle<JSArray> last_match_info) {
1355 for (int i = 0, n = parts_.length(); i < n; i++) {
1356 ReplacementPart part = parts_[i];
1357 switch (part.tag) {
1358 case SUBJECT_PREFIX:
1359 builder->AddSubjectSlice(0, match_from);
1360 break;
1361 case SUBJECT_SUFFIX: {
1362 int subject_length = part.data;
1363 builder->AddSubjectSlice(match_to, subject_length);
1364 break;
1365 }
1366 case SUBJECT_CAPTURE: {
1367 int capture = part.data;
1368 FixedArray* match_info = last_match_info->elements();
1369 int from = RegExpImpl::GetCapture(match_info, capture * 2);
1370 int to = RegExpImpl::GetCapture(match_info, capture * 2 + 1);
1371 if (from >= 0 && to > from) {
1372 builder->AddSubjectSlice(from, to);
1373 }
1374 break;
1375 }
1376 case REPLACEMENT_SUBSTRING:
1377 case REPLACEMENT_STRING:
1378 builder->AddString(replacement_substrings_[part.data]);
1379 break;
1380 default:
1381 UNREACHABLE();
1382 }
1383 }
1384 }
1385 // Number of distinct parts of the replacement pattern.
1386 int parts() {
1387 return parts_.length();
1388 }
1389 private:
1390 enum PartType {
1391 SUBJECT_PREFIX = 1,
1392 SUBJECT_SUFFIX,
1393 SUBJECT_CAPTURE,
1394 REPLACEMENT_SUBSTRING,
1395 REPLACEMENT_STRING,
1396
1397 NUMBER_OF_PART_TYPES
1398 };
1399
1400 struct ReplacementPart {
1401 static inline ReplacementPart SubjectMatch() {
1402 return ReplacementPart(SUBJECT_CAPTURE, 0);
1403 }
1404 static inline ReplacementPart SubjectCapture(int capture_index) {
1405 return ReplacementPart(SUBJECT_CAPTURE, capture_index);
1406 }
1407 static inline ReplacementPart SubjectPrefix() {
1408 return ReplacementPart(SUBJECT_PREFIX, 0);
1409 }
1410 static inline ReplacementPart SubjectSuffix(int subject_length) {
1411 return ReplacementPart(SUBJECT_SUFFIX, subject_length);
1412 }
1413 static inline ReplacementPart ReplacementString() {
1414 return ReplacementPart(REPLACEMENT_STRING, 0);
1415 }
1416 static inline ReplacementPart ReplacementSubString(int from, int to) {
1417 ASSERT(from >= 0);
1418 ASSERT(to > from);
1419 return ReplacementPart(-from, to);
1420 }
1421
1422 // If tag <= 0 then it is the negation of a start index of a substring of
1423 // the replacement pattern, otherwise it's a value from PartType.
1424 ReplacementPart(int tag, int data)
1425 : tag(tag), data(data) {
1426 // Must be non-positive or a PartType value.
1427 ASSERT(tag < NUMBER_OF_PART_TYPES);
1428 }
1429 // Either a value of PartType or a non-positive number that is
1430 // the negation of an index into the replacement string.
1431 int tag;
1432 // The data value's interpretation depends on the value of tag:
1433 // tag == SUBJECT_PREFIX ||
1434 // tag == SUBJECT_SUFFIX: data is unused.
1435 // tag == SUBJECT_CAPTURE: data is the number of the capture.
1436 // tag == REPLACEMENT_SUBSTRING ||
1437 // tag == REPLACEMENT_STRING: data is index into array of substrings
1438 // of the replacement string.
1439 // tag <= 0: Temporary representation of the substring of the replacement
1440 // string ranging over -tag .. data.
1441 // Is replaced by REPLACEMENT_{SUB,}STRING when we create the
1442 // substring objects.
1443 int data;
1444 };
1445
1446 template<typename Char>
1447 static void Parse(ZoneList<ReplacementPart>* parts,
1448 Vector<Char> characters,
1449 int capture_count,
1450 int subject_length) {
1451 int length = characters.length();
1452 int last = 0;
1453 for (int i = 0; i < length; i++) {
1454 Char c = characters[i];
1455 if (c == '$') {
1456 int next_index = i + 1;
1457 if (next_index == length) { // No next character!
1458 break;
1459 }
1460 Char c2 = characters[next_index];
1461 switch (c2) {
1462 case '$':
1463 if (i > last) {
1464 // There is a substring before. Include the first "$".
1465 parts->Add(ReplacementPart::ReplacementSubString(last, next_index));
1466 last = next_index + 1; // Continue after the second "$".
1467 } else {
1468 // Let the next substring start with the second "$".
1469 last = next_index;
1470 }
1471 i = next_index;
1472 break;
1473 case '`':
1474 if (i > last) {
1475 parts->Add(ReplacementPart::ReplacementSubString(last, i));
1476 }
1477 parts->Add(ReplacementPart::SubjectPrefix());
1478 i = next_index;
1479 last = i + 1;
1480 break;
1481 case '\'':
1482 if (i > last) {
1483 parts->Add(ReplacementPart::ReplacementSubString(last, i));
1484 }
1485 parts->Add(ReplacementPart::SubjectSuffix(subject_length));
1486 i = next_index;
1487 last = i + 1;
1488 break;
1489 case '&':
1490 if (i > last) {
1491 parts->Add(ReplacementPart::ReplacementSubString(last, i));
1492 }
1493 parts->Add(ReplacementPart::SubjectMatch());
1494 i = next_index;
1495 last = i + 1;
1496 break;
1497 case '0':
1498 case '1':
1499 case '2':
1500 case '3':
1501 case '4':
1502 case '5':
1503 case '6':
1504 case '7':
1505 case '8':
1506 case '9': {
1507 int capture_ref = c2 - '0';
1508 if (capture_ref > capture_count) {
1509 i = next_index;
1510 continue;
1511 }
1512 int second_digit_index = next_index + 1;
1513 if (second_digit_index < length) {
1514 // Peek ahead to see if we have two digits.
1515 Char c3 = characters[second_digit_index];
1516 if ('0' <= c3 && c3 <= '9') { // Double digits.
1517 int double_digit_ref = capture_ref * 10 + c3 - '0';
1518 if (double_digit_ref <= capture_count) {
1519 next_index = second_digit_index;
1520 capture_ref = double_digit_ref;
1521 }
1522 }
1523 }
1524 if (capture_ref > 0) {
1525 if (i > last) {
1526 parts->Add(ReplacementPart::ReplacementSubString(last, i));
1527 }
1528 parts->Add(ReplacementPart::SubjectCapture(capture_ref));
1529 last = next_index + 1;
1530 }
1531 i = next_index;
1532 break;
1533 }
1534 default:
1535 i = next_index;
1536 break;
1537 }
1538 }
1539 }
1540 if (length > last) {
1541 if (last == 0) {
1542 parts->Add(ReplacementPart::ReplacementString());
1543 } else {
1544 parts->Add(ReplacementPart::ReplacementSubString(last, length));
1545 }
1546 }
1547 }
1548
1549 ZoneList<ReplacementPart> parts_;
1550 ZoneList<Handle<String> > replacement_substrings_;
1551 };
1552
1553
1554 static Object* StringReplaceRegExpWithString(String* subject,
1555 JSRegExp* regexp,
1556 String* replacement,
1557 JSArray* last_match_info) {
1558 ASSERT(subject->IsFlat(StringShape(subject)));
1559 ASSERT(replacement->IsFlat(StringShape(replacement)));
1560
1561 HandleScope handles;
1562
1563 int length = subject->length();
1564 Handle<String> subject_handle(subject);
1565 Handle<JSRegExp> regexp_handle(regexp);
1566 Handle<String> replacement_handle(replacement);
1567 Handle<JSArray> last_match_info_handle(last_match_info);
1568 Handle<Object> match = RegExpImpl::Exec(regexp_handle,
1569 subject_handle,
1570 0,
1571 last_match_info_handle);
1572 if (match.is_null()) {
1573 return Failure::Exception();
1574 }
1575 if (match->IsNull()) {
1576 return *subject_handle;
1577 }
1578
1579 int capture_count = regexp_handle->CaptureCount();
1580
1581 // CompiledReplacement uses zone allocation.
1582 ZoneScope zone(DELETE_ON_EXIT);
1583 CompiledReplacement compiled_replacement;
1584 compiled_replacement.Compile(replacement_handle,
1585 capture_count,
1586 length);
1587
1588 bool is_global = regexp_handle->GetFlags().is_global();
1589
1590 // Guessing the number of parts that the final result string is build
Erik Corry 2009/03/13 09:12:34 built
Lasse Reichstein 2011/06/14 11:51:17 Done.
1591 // from. Global regexps can match any number of times, so we guess
1592 // conservatively.
1593 int expected_parts =
1594 (compiled_replacement.parts() + 1) * (is_global ? 4 : 1) + 1;
1595 ReplacementStringBuilder builder(subject_handle, expected_parts);
1596
1597 // Index of end of last match.
1598 int prev = 0;
1599
1600 do {
1601 ASSERT(last_match_info_handle->HasFastElements());
1602 FixedArray* match_info_array = last_match_info_handle->elements();
1603
1604 ASSERT_EQ(capture_count * 2 + 2,
1605 RegExpImpl::GetLastCaptureCount(match_info_array));
1606 int start = RegExpImpl::GetCapture(match_info_array, 0);
1607 int end = RegExpImpl::GetCapture(match_info_array, 1);
1608
1609 if (prev < start) {
1610 builder.AddSubjectSlice(prev, start);
1611 }
1612 compiled_replacement.Apply(&builder,
1613 start,
1614 end,
1615 last_match_info_handle);
1616 prev = end;
1617
1618 // Only continue checking for global regexps.
1619 if (!is_global) break;
1620
1621 // Continue from where the match ended, unless it was an empty match.
1622 int next = end;
1623 if (start == end) {
1624 next = end + 1;
1625 if (next > length) break;
1626 }
1627
1628 match = RegExpImpl::Exec(regexp_handle,
1629 subject_handle,
1630 next,
1631 last_match_info_handle);
1632 if (match.is_null()) {
1633 return Failure::Exception();
1634 }
1635 } while (!match->IsNull());
1636
1637 if (prev < length) {
1638 builder.AddSubjectSlice(prev, length);
1639 }
1640
1641 return *(builder.ToString());
1642 }
1643
1644
1645 static Object* Runtime_StringReplaceRegExpWithString(Arguments args) {
1646 ASSERT(args.length() == 4);
1647
1648 CONVERT_CHECKED(String, subject, args[0]);
1649 StringShape subject_shape(subject);
1650 if (!subject->IsFlat(subject_shape)) {
1651 Object* flat_subject = subject->TryFlatten(subject_shape);
1652 if (flat_subject->IsFailure()) {
1653 return flat_subject;
1654 }
1655 subject = String::cast(flat_subject);
1656 }
1657
1658 CONVERT_CHECKED(String, replacement, args[2]);
1659 StringShape replacement_shape(replacement);
1660 if (!replacement->IsFlat(replacement_shape)) {
1661 Object* flat_replacement = replacement->TryFlatten(replacement_shape);
1662 if (flat_replacement->IsFailure()) {
1663 return flat_replacement;
1664 }
1665 replacement = String::cast(flat_replacement);
1666 }
1667
1668 CONVERT_CHECKED(JSRegExp, regexp, args[1]);
1669 CONVERT_CHECKED(JSArray, last_match_info, args[3]);
1670
1671 ASSERT(last_match_info->HasFastElements());
1672
1673 return StringReplaceRegExpWithString(subject,
1674 regexp,
1675 replacement,
1676 last_match_info);
1677 }
1678
1679
1183 1680
1184 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 1681 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
1185 // limit, we can fix the size of tables. 1682 // limit, we can fix the size of tables.
1186 static const int kBMMaxShift = 0xff; 1683 static const int kBMMaxShift = 0xff;
1187 // Reduce alphabet to this size. 1684 // Reduce alphabet to this size.
1188 static const int kBMAlphabetSize = 0x100; 1685 static const int kBMAlphabetSize = 0x100;
1189 // For patterns below this length, the skip length of Boyer-Moore is too short 1686 // For patterns below this length, the skip length of Boyer-Moore is too short
1190 // to compensate for the algorithmic overhead compared to simple brute force. 1687 // to compensate for the algorithmic overhead compared to simple brute force.
1191 static const int kBMMinPatternLength = 5; 1688 static const int kBMMinPatternLength = 5;
1192 1689
(...skipping 5004 matching lines...) Expand 10 before | Expand all | Expand 10 after
6197 } else { 6694 } else {
6198 // Handle last resort GC and make sure to allow future allocations 6695 // Handle last resort GC and make sure to allow future allocations
6199 // to grow the heap without causing GCs (if possible). 6696 // to grow the heap without causing GCs (if possible).
6200 Counters::gc_last_resort_from_js.Increment(); 6697 Counters::gc_last_resort_from_js.Increment();
6201 Heap::CollectAllGarbage(); 6698 Heap::CollectAllGarbage();
6202 } 6699 }
6203 } 6700 }
6204 6701
6205 6702
6206 } } // namespace v8::internal 6703 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « src/runtime.h ('k') | src/string.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698