Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(763)

Side by Side Diff: src/jsregexp.cc

Issue 9854020: RegExp: Add support for table-based character class (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1516 matching lines...) Expand 10 before | Expand all | Expand 10 after
1527 macro_assembler->Bind(&ok); 1527 macro_assembler->Bind(&ok);
1528 break; 1528 break;
1529 default: 1529 default:
1530 UNREACHABLE(); 1530 UNREACHABLE();
1531 break; 1531 break;
1532 } 1532 }
1533 return true; 1533 return true;
1534 } 1534 }
1535 1535
1536 1536
1537 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1538 int border,
1539 Label* fall_through,
1540 Label* above,
ulan 2012/03/27 11:02:43 'above_or_equal' would be more precise name.
Erik Corry 2012/03/28 09:40:26 Done.
1541 Label* below) {
1542 if (below != fall_through) {
1543 masm->CheckCharacterLT(border, below);
1544 if (above != fall_through) masm->GoTo(above);
1545 } else {
1546 masm->CheckCharacterGT(border - 1, above);
1547 }
1548 }
1549
1550
1551 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1552 int first,
1553 int last,
1554 Label* fall_through,
1555 Label* even_label,
ulan 2012/03/28 13:14:08 Odd/even doesn't make sense without the ranges2 ar
Erik Corry 2012/03/29 14:48:59 Done.
1556 Label* odd_label) {
1557 if (even_label == fall_through) {
1558 if (first == last) {
1559 masm->CheckNotCharacter(first, odd_label);
1560 } else {
1561 masm->CheckCharacterNotInRange(first, last, odd_label);
1562 }
1563 } else {
1564 if (first == last) {
1565 masm->CheckCharacter(first, even_label);
1566 } else {
1567 masm->CheckCharacterInRange(first, last, even_label);
1568 }
1569 if (odd_label != fall_through) masm->GoTo(odd_label);
1570 }
1571 }
1572
1573
ulan 2012/03/28 13:14:08 A comment about odd/even/ranges2 would be helpful
Erik Corry 2012/03/29 14:48:59 Done.
1574 static void EmitUseLookupTable(
1575 RegExpMacroAssembler* masm,
1576 ZoneList<int>* ranges2,
1577 int start_index,
1578 int end_index,
1579 int min_char,
1580 Label* fall_through,
1581 Label* even_label,
1582 Label* odd_label) {
1583 static const int kSize = RegExpMacroAssembler::kTableSize;
1584 static const int kMask = RegExpMacroAssembler::kTableMask;
1585
1586 char templ[kSize];
1587 Label* on_bit_set;
1588 Label* on_bit_clear;
1589 int bit;
1590 if (even_label == fall_through) {
1591 on_bit_set = odd_label;
1592 on_bit_clear = even_label;
1593 bit = 1;
1594 } else {
1595 on_bit_set = even_label;
1596 on_bit_clear = odd_label;
1597 bit = 0;
1598 }
1599 int base = (min_char & ~kMask);
1600 for (int i = 0; i < (ranges2->at(start_index) & kMask) && i < kSize; i++) {
1601 templ[i] = bit;
1602 }
1603 int j = 0;
1604 bit ^= 1;
1605 for (int i = start_index; i < end_index; i++) {
1606 for (j = (ranges2->at(i) & kMask);
1607 j < ranges2->at(i + 1) - base && j < kSize;
ulan 2012/03/28 13:14:08 Is (ranges2->at(i + 1) - base) == ranges2->at(i +
Erik Corry 2012/03/29 14:48:59 The answer is yes, and it is changed to use the kM
1608 j++) {
1609 templ[j] = bit;
1610 }
1611 bit ^= 1;
1612 }
1613 for (int i = j; i < kSize; i++) {
1614 templ[i] = bit;
1615 }
1616 // TODO(erikcorry): Cache these.
1617 Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED);
1618 for (int i = 0; i < kSize; i++) {
1619 ba->set(i, templ[i]);
1620 }
1621 masm->CheckBitInTable(ba, on_bit_set);
1622 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1623 }
1624
1625
1626 // Gets a series of segments representing a character class. If the character
ulan 2012/03/28 13:14:08 Gets a series of segment boundaries ...
Erik Corry 2012/03/29 14:48:59 Done.
1627 // is in the range between even and an odd boundary (counting from start_index)
1628 // then go to even_label, otherwise go to odd_label. We already know that
1629 // the character is in the range of min_char to max_char inclusive. Either
1630 // label can be NULL indicating backtracking. Either label can also be equal to
1631 // the fall_through label.
1632 static void GenerateBranches(RegExpMacroAssembler* masm,
1633 ZoneList<int>* ranges2,
1634 int start_index,
1635 int end_index,
1636 uc16 min_char,
1637 uc16 max_char,
1638 Label* fall_through,
1639 Label* even_label,
1640 Label* odd_label) {
1641 int first = ranges2->at(start_index);
1642 int last = ranges2->at(end_index) - 1;
1643
1644 // Just need to test if the character is before or on-or-after
1645 // a particular character.
1646 if (start_index == end_index) {
1647 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1648 return;
1649 }
1650
1651 // Another almost trivial case: There is one interval in the middle that is
1652 // different from the end intervals.
1653 if (start_index + 1 == end_index) {
1654 EmitDoubleBoundaryTest(
1655 masm, first, last, fall_through, even_label, odd_label);
1656 return;
1657 }
1658
1659 // It's not worth using table lookup if there are very few intervals in the
1660 // character class.
1661 if (end_index - start_index <= 6) {
1662 typedef enum { kSingleCharacter, kRange, kDone } SearchOrder;
1663 // It is faster to test for individual characters, so we look for those
1664 // first, then try arbitrary ranges in the second round.
1665 for (SearchOrder step = kSingleCharacter;
1666 step < kDone;
1667 step = (step == kSingleCharacter) ? kRange : kDone) {
1668 for (int i = start_index; i < end_index; i++) {
1669 int c = ranges2->at(i);
ulan 2012/03/28 13:14:08 If you extract the body of the loop into a functio
Erik Corry 2012/03/29 14:48:59 Done.
1670 int next = ranges2->at(i + 1) - 1;
1671 if (step == kRange || c == next) {
1672 bool odd = (((i - start_index) & 1) == 1);
1673 Label* in_range_label = odd ? odd_label : even_label;
1674 Label rest;
ulan 2012/03/28 13:14:08 'fall_through' seems to be a better name than 'res
Erik Corry 2012/03/29 14:48:59 Renamed to dummy since it is not actually used.
1675 EmitDoubleBoundaryTest(masm, c, next, &rest, in_range_label, &rest);
1676 ASSERT(!rest.is_linked());
1677 // Cut out the single range by rewriting the array.
ulan 2012/03/28 13:14:08 This implicitly creates a new range with endpoints
Erik Corry 2012/03/29 14:48:59 Done.
1678 for (int j = i; j > start_index; j--) {
1679 ranges2->at(j) = ranges2->at(j - 1);
1680 }
1681 for (int j = i + 1; j < end_index; j++) {
1682 ranges2->at(j) = ranges2->at(j + 1);
1683 }
1684 GenerateBranches(masm,
ulan 2012/03/28 13:14:08 I think start_index + 1 can be larger that end_ind
Erik Corry 2012/03/29 14:48:59 We do not get into this if() unless they are 6 apa
1685 ranges2,
1686 start_index + 1,
1687 end_index - 1,
1688 min_char,
1689 max_char,
1690 fall_through,
1691 even_label,
1692 odd_label);
1693 return;
1694 }
1695 }
1696 }
1697 UNREACHABLE();
1698 }
1699
1700 // If there are a lot of intervals in the regexp, then we will use tables to
1701 // determine whether the character is inside or outside the character class.
1702 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1703 static const int kSize = RegExpMacroAssembler::kTableSize;
1704 static const int kMask = RegExpMacroAssembler::kTableMask;
1705
1706 if ((max_char >> kBits) == (min_char >> kBits)) {
1707 EmitUseLookupTable(masm,
1708 ranges2,
1709 start_index,
1710 end_index,
1711 min_char,
1712 fall_through,
1713 even_label,
1714 odd_label);
1715 } else {
1716 if ((min_char >> kBits) != (first >> kBits)) {
1717 masm->CheckCharacterLT(first, odd_label);
1718 GenerateBranches(masm,
1719 ranges2,
1720 start_index + 1,
1721 end_index,
1722 first,
1723 max_char,
1724 fall_through,
1725 odd_label,
1726 even_label);
1727 return;
1728 }
1729
1730 // Unicode case. Split the search space into kSize spaces that are handled
1731 // with recursion.
1732 int new_start_index = start_index;
1733 int border = (first & ~kMask) + kSize;
1734 while (new_start_index <= end_index) {
1735 if (ranges2->at(new_start_index) > border) break;
ulan 2012/03/28 13:14:08 I would expect to break on >=
Erik Corry 2012/03/29 14:48:59 I think this is correct. If there is a border rig
1736 new_start_index++;
1737 }
1738 // new_start_index is the index of the first edge that is beyond the
1739 // current kSize space.
1740
1741 // For very large search spaces we do a binary chop search of the non-ASCII
1742 // space instead of just going to the end of the current kSize space. The
1743 // heuristics are complicated a little by the fact that any 128-character
1744 // encoding space can be quickly tested with a table lookup, so we don't
1745 // wish to do binary chop search at a smaller granularity than that. A
1746 // 128-character space can take up a lot of space in the ranges2 array if,
1747 // for example, we only want to match every second character (eg. the lower
1748 // case characters on some Unicode pages).
1749 int binary_chop_index = (end_index + start_index) / 2;
ulan 2012/03/28 13:14:08 Why not (end_index + new_start_index) / 2 ?
Erik Corry 2012/03/29 14:48:59 If we are doing binary chop then we would want to
1750 if (border - 1 > String::kMaxAsciiCharCode &&
ulan 2012/03/28 13:14:08 border >= String::kMaxAsciiCharCode
Erik Corry 2012/03/29 14:48:59 This line makes a cut at the ASCII non-ASCII bound
1751 end_index - start_index > (new_start_index - start_index) * 4 &&
1752 last - first > kSize * 4 &&
ulan 2012/03/28 13:14:08 Are 4s above magic?
Erik Corry 2012/03/29 14:48:59 Yes. I changed it to 2, which makes little differ
1753 binary_chop_index > new_start_index &&
1754 ranges2->at(binary_chop_index) > first + 2 * kSize) {
1755 int scan_forward_for_section_border = binary_chop_index;;
1756 int new_border = (ranges2->at(binary_chop_index) | kMask) + 1;
1757 while (scan_forward_for_section_border < end_index) {
ulan 2012/03/28 13:14:08 This loop construct appears twice. Maybe put it in
Erik Corry 2012/03/29 14:48:59 The two occurrences are slightly different. A fun
1758 if (ranges2->at(scan_forward_for_section_border) > new_border) {
1759 new_start_index = scan_forward_for_section_border;
1760 border = new_border;
1761 break;
1762 }
1763 scan_forward_for_section_border++;
1764 }
1765 }
1766
1767 ASSERT(new_start_index > start_index);
1768 Label handle_rest;
1769 Label* above = &handle_rest;
1770 int new_end_index = new_start_index - 1;
1771 if (new_start_index > end_index) {
1772 // We didn't find any section that started after the limit, so everything
1773 // above the border is one of the terminal labels.
1774 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1775 }
1776 if (ranges2->at(new_end_index) == border) {
1777 new_end_index--;
1778 }
ulan 2012/03/28 13:14:08 Can we extract the code above into a separate func
Erik Corry 2012/03/29 14:48:59 Extracted, and more tests added. It's fuzzing rat
1779 masm->CheckCharacterGT(border - 1, above);
1780 Label dummy;
1781 GenerateBranches(masm,
1782 ranges2,
1783 start_index,
1784 new_end_index,
1785 min_char,
1786 border - 1,
1787 &dummy,
1788 even_label,
1789 odd_label);
1790 if (handle_rest.is_linked()) {
1791 masm->Bind(&handle_rest);
1792 bool flip = (new_start_index & 1) != (start_index & 1);
1793 GenerateBranches(masm,
1794 ranges2,
1795 new_start_index,
1796 end_index,
1797 border,
1798 max_char,
1799 &dummy,
1800 flip ? odd_label : even_label,
1801 flip ? even_label : odd_label);
1802 }
1803 }
1804 }
1805
1806
1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1807 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1538 RegExpCharacterClass* cc, 1808 RegExpCharacterClass* cc,
1539 bool ascii, 1809 bool ascii,
1540 Label* on_failure, 1810 Label* on_failure,
1541 int cp_offset, 1811 int cp_offset,
1542 bool check_offset, 1812 bool check_offset,
1543 bool preloaded) { 1813 bool preloaded) {
1544 ZoneList<CharacterRange>* ranges = cc->ranges(); 1814 ZoneList<CharacterRange>* ranges = cc->ranges();
1815 if (!CharacterRange::IsCanonical(ranges)) {
1816 CharacterRange::Canonicalize(ranges);
1817 }
1818
1545 int max_char; 1819 int max_char;
1546 if (ascii) { 1820 if (ascii) {
1547 max_char = String::kMaxAsciiCharCode; 1821 max_char = String::kMaxAsciiCharCode;
1548 } else { 1822 } else {
1549 max_char = String::kMaxUtf16CodeUnit; 1823 max_char = String::kMaxUtf16CodeUnit;
1550 } 1824 }
1551 1825
1552 Label success;
1553
1554 Label* char_is_in_class =
1555 cc->is_negated() ? on_failure : &success;
1556
1557 int range_count = ranges->length(); 1826 int range_count = ranges->length();
1558 1827
1559 int last_valid_range = range_count - 1; 1828 int last_valid_range = range_count - 1;
1560 while (last_valid_range >= 0) { 1829 while (last_valid_range >= 0) {
1561 CharacterRange& range = ranges->at(last_valid_range); 1830 CharacterRange& range = ranges->at(last_valid_range);
1562 if (range.from() <= max_char) { 1831 if (range.from() <= max_char) {
1563 break; 1832 break;
1564 } 1833 }
1565 last_valid_range--; 1834 last_valid_range--;
1566 } 1835 }
1567 1836
1568 if (last_valid_range < 0) { 1837 if (last_valid_range < 0) {
1569 if (!cc->is_negated()) { 1838 if (!cc->is_negated()) {
1570 // TODO(plesner): We can remove this when the node level does our
1571 // ASCII optimizations for us.
1572 macro_assembler->GoTo(on_failure); 1839 macro_assembler->GoTo(on_failure);
1573 } 1840 }
1574 if (check_offset) { 1841 if (check_offset) {
1575 macro_assembler->CheckPosition(cp_offset, on_failure); 1842 macro_assembler->CheckPosition(cp_offset, on_failure);
1576 } 1843 }
1577 return; 1844 return;
1578 } 1845 }
1579 1846
1580 if (last_valid_range == 0 && 1847 if (last_valid_range == 0 &&
1848 ranges->at(0).IsEverything(max_char)) {
1849 if (cc->is_negated()) {
1850 macro_assembler->GoTo(on_failure);
1851 } else {
1852 // This is a common case hit by non-anchored expressions.
1853 if (check_offset) {
1854 macro_assembler->CheckPosition(cp_offset, on_failure);
1855 }
1856 }
1857 return;
1858 }
1859 if (last_valid_range == 0 &&
1581 !cc->is_negated() && 1860 !cc->is_negated() &&
1582 ranges->at(0).IsEverything(max_char)) { 1861 ranges->at(0).IsEverything(max_char)) {
1583 // This is a common case hit by non-anchored expressions. 1862 // This is a common case hit by non-anchored expressions.
1584 if (check_offset) { 1863 if (check_offset) {
1585 macro_assembler->CheckPosition(cp_offset, on_failure); 1864 macro_assembler->CheckPosition(cp_offset, on_failure);
1586 } 1865 }
1587 return; 1866 return;
1588 } 1867 }
1589 1868
1590 if (!preloaded) { 1869 if (!preloaded) {
1591 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1870 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1592 } 1871 }
1593 1872
1594 if (cc->is_standard() && 1873 if (cc->is_standard() &&
1595 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1874 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1596 on_failure)) { 1875 on_failure)) {
1597 return; 1876 return;
1598 } 1877 }
1599 1878
1600 for (int i = 0; i < last_valid_range; i++) { 1879
1601 CharacterRange& range = ranges->at(i); 1880 // A new list with ascending entries. Each entry is a code unit
1602 Label next_range; 1881 // where there is a boundary between code units that are part of
1603 uc16 from = range.from(); 1882 // the class and code units that are not. Normally we insert an
1604 uc16 to = range.to(); 1883 // entry at zero which goes to the failure label, but if there
1605 if (from > max_char) { 1884 // was already one there we fall through for success on that entry.
1606 continue; 1885 // Subsequent entries have alternating meaning (success/failure).
1607 } 1886 ZoneList<int>* ranges2 =
ulan 2012/03/28 13:14:08 This fits into one line. Can we find a better name
Erik Corry 2012/03/29 14:48:59 range_boundaries, but I changed it to just ranges
1608 if (to > max_char) to = max_char; 1887 new ZoneList<int>(last_valid_range);
1609 if (to == from) { 1888
1610 macro_assembler->CheckCharacter(to, char_is_in_class); 1889 bool zeroth_entry_is_failure = true;
1611 } else { 1890 if (cc->is_negated()) zeroth_entry_is_failure = false;
ulan 2012/03/28 13:14:08 zeroth_entry_is_failure = !cc->is_negated();
Erik Corry 2012/03/29 14:48:59 Done.
1612 if (from != 0) { 1891
1613 macro_assembler->CheckCharacterLT(from, &next_range); 1892 int start = ranges->at(0).from();
1614 } 1893 if (start == 0) {
1615 if (to != max_char) { 1894 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1616 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); 1895 } else {
1617 } else { 1896 ranges2->Add(0);
ulan 2012/03/28 13:14:08 I wonder why we need ranges2[0] to be zero? Genera
Erik Corry 2012/03/29 14:48:59 Done.
1618 macro_assembler->GoTo(char_is_in_class);
1619 }
1620 }
1621 macro_assembler->Bind(&next_range);
1622 } 1897 }
1623 1898
1624 CharacterRange& range = ranges->at(last_valid_range); 1899 for (int i = 0; i <= last_valid_range; i++) {
1625 uc16 from = range.from(); 1900 CharacterRange& range = ranges->at(i);
1626 uc16 to = range.to(); 1901 ranges2->Add(range.from());
1902 ranges2->Add(range.to() + 1);
1903 }
1904 int ranges2_length = ranges2->length() - 1;
ulan 2012/03/28 13:14:08 In 'ranges2_length' the 'length' suffix is mislead
Erik Corry 2012/03/29 14:48:59 Done.
1905 if (ranges2->at(ranges2_length) >= max_char) ranges2_length--;
1627 1906
1628 if (to > max_char) to = max_char; 1907 Label fall_through;
1629 ASSERT(to >= from); 1908 GenerateBranches(macro_assembler,
1630 1909 ranges2,
1631 if (to == from) { 1910 1,
1632 if (cc->is_negated()) { 1911 ranges2_length,
1633 macro_assembler->CheckCharacter(to, on_failure); 1912 0,
1634 } else { 1913 max_char,
1635 macro_assembler->CheckNotCharacter(to, on_failure); 1914 &fall_through,
1636 } 1915 zeroth_entry_is_failure ? &fall_through : on_failure,
1637 } else { 1916 zeroth_entry_is_failure ? on_failure : &fall_through);
1638 if (from != 0) { 1917 macro_assembler->Bind(&fall_through);
1639 if (cc->is_negated()) {
1640 macro_assembler->CheckCharacterLT(from, &success);
1641 } else {
1642 macro_assembler->CheckCharacterLT(from, on_failure);
1643 }
1644 }
1645 if (to != String::kMaxUtf16CodeUnit) {
1646 if (cc->is_negated()) {
1647 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1648 } else {
1649 macro_assembler->CheckCharacterGT(to, on_failure);
1650 }
1651 } else {
1652 if (cc->is_negated()) {
1653 macro_assembler->GoTo(on_failure);
1654 }
1655 }
1656 }
1657 macro_assembler->Bind(&success);
1658 } 1918 }
1659 1919
1660 1920
1661 RegExpNode::~RegExpNode() { 1921 RegExpNode::~RegExpNode() {
1662 } 1922 }
1663 1923
1664 1924
1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1925 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1666 Trace* trace) { 1926 Trace* trace) {
1667 // If we are generating a greedy loop then don't stop and don't reuse code. 1927 // If we are generating a greedy loop then don't stop and don't reuse code.
(...skipping 3666 matching lines...) Expand 10 before | Expand all | Expand 10 after
5334 } 5594 }
5335 5595
5336 return compiler.Assemble(&macro_assembler, 5596 return compiler.Assemble(&macro_assembler,
5337 node, 5597 node,
5338 data->capture_count, 5598 data->capture_count,
5339 pattern); 5599 pattern);
5340 } 5600 }
5341 5601
5342 5602
5343 }} // namespace v8::internal 5603 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698