OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 1516 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1527 macro_assembler->Bind(&ok); | 1527 macro_assembler->Bind(&ok); |
1528 break; | 1528 break; |
1529 default: | 1529 default: |
1530 UNREACHABLE(); | 1530 UNREACHABLE(); |
1531 break; | 1531 break; |
1532 } | 1532 } |
1533 return true; | 1533 return true; |
1534 } | 1534 } |
1535 | 1535 |
1536 | 1536 |
| 1537 static void EmitBoundaryTest(RegExpMacroAssembler* masm, |
| 1538 int border, |
| 1539 Label* fall_through, |
| 1540 Label* above_or_equal, |
| 1541 Label* below) { |
| 1542 if (below != fall_through) { |
| 1543 masm->CheckCharacterLT(border, below); |
| 1544 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); |
| 1545 } else { |
| 1546 masm->CheckCharacterGT(border - 1, above_or_equal); |
| 1547 } |
| 1548 } |
| 1549 |
| 1550 |
| 1551 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, |
| 1552 int first, |
| 1553 int last, |
| 1554 Label* fall_through, |
| 1555 Label* even_label, |
| 1556 Label* odd_label) { |
| 1557 if (even_label == fall_through) { |
| 1558 if (first == last) { |
| 1559 masm->CheckNotCharacter(first, odd_label); |
| 1560 } else { |
| 1561 masm->CheckCharacterNotInRange(first, last, odd_label); |
| 1562 } |
| 1563 } else { |
| 1564 if (first == last) { |
| 1565 masm->CheckCharacter(first, even_label); |
| 1566 } else { |
| 1567 masm->CheckCharacterInRange(first, last, even_label); |
| 1568 } |
| 1569 if (odd_label != fall_through) masm->GoTo(odd_label); |
| 1570 } |
| 1571 } |
| 1572 |
| 1573 |
| 1574 static void EmitUseLookupTable( |
| 1575 RegExpMacroAssembler* masm, |
| 1576 ZoneList<int>* ranges2, |
| 1577 int start_index, |
| 1578 int end_index, |
| 1579 int min_char, |
| 1580 Label* fall_through, |
| 1581 Label* even_label, |
| 1582 Label* odd_label) { |
| 1583 static const int kSize = RegExpMacroAssembler::kTableSize; |
| 1584 static const int kMask = RegExpMacroAssembler::kTableMask; |
| 1585 |
| 1586 char templ[kSize]; |
| 1587 Label* on_bit_set; |
| 1588 Label* on_bit_clear; |
| 1589 int bit; |
| 1590 if (even_label == fall_through) { |
| 1591 on_bit_set = odd_label; |
| 1592 on_bit_clear = even_label; |
| 1593 bit = 1; |
| 1594 } else { |
| 1595 on_bit_set = even_label; |
| 1596 on_bit_clear = odd_label; |
| 1597 bit = 0; |
| 1598 } |
| 1599 int base = (min_char & ~kMask); |
| 1600 for (int i = 0; i < (ranges2->at(start_index) & kMask) && i < kSize; i++) { |
| 1601 templ[i] = bit; |
| 1602 } |
| 1603 int j = 0; |
| 1604 bit ^= 1; |
| 1605 for (int i = start_index; i < end_index; i++) { |
| 1606 for (j = (ranges2->at(i) & kMask); |
| 1607 j < ranges2->at(i + 1) - base && j < kSize; |
| 1608 j++) { |
| 1609 templ[j] = bit; |
| 1610 } |
| 1611 bit ^= 1; |
| 1612 } |
| 1613 for (int i = j; i < kSize; i++) { |
| 1614 templ[i] = bit; |
| 1615 } |
| 1616 // TODO(erikcorry): Cache these. |
| 1617 Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED); |
| 1618 for (int i = 0; i < kSize; i++) { |
| 1619 ba->set(i, templ[i]); |
| 1620 } |
| 1621 masm->CheckBitInTable(ba, on_bit_set); |
| 1622 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); |
| 1623 } |
| 1624 |
| 1625 |
| 1626 // Gets a series of segments representing a character class. If the character |
| 1627 // is in the range between even and an odd boundary (counting from start_index) |
| 1628 // then go to even_label, otherwise go to odd_label. We already know that |
| 1629 // the character is in the range of min_char to max_char inclusive. Either |
| 1630 // label can be NULL indicating backtracking. Either label can also be equal to |
| 1631 // the fall_through label. |
| 1632 static void GenerateBranches(RegExpMacroAssembler* masm, |
| 1633 ZoneList<int>* ranges2, |
| 1634 int start_index, |
| 1635 int end_index, |
| 1636 uc16 min_char, |
| 1637 uc16 max_char, |
| 1638 Label* fall_through, |
| 1639 Label* even_label, |
| 1640 Label* odd_label) { |
| 1641 int first = ranges2->at(start_index); |
| 1642 int last = ranges2->at(end_index) - 1; |
| 1643 |
| 1644 // Just need to test if the character is before or on-or-after |
| 1645 // a particular character. |
| 1646 if (start_index == end_index) { |
| 1647 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); |
| 1648 return; |
| 1649 } |
| 1650 |
| 1651 // Another almost trivial case: There is one interval in the middle that is |
| 1652 // different from the end intervals. |
| 1653 if (start_index + 1 == end_index) { |
| 1654 EmitDoubleBoundaryTest( |
| 1655 masm, first, last, fall_through, even_label, odd_label); |
| 1656 return; |
| 1657 } |
| 1658 |
| 1659 // It's not worth using table lookup if there are very few intervals in the |
| 1660 // character class. |
| 1661 if (end_index - start_index <= 6) { |
| 1662 typedef enum { kSingleCharacter, kRange, kDone } SearchOrder; |
| 1663 // It is faster to test for individual characters, so we look for those |
| 1664 // first, then try arbitrary ranges in the second round. |
| 1665 for (SearchOrder step = kSingleCharacter; |
| 1666 step < kDone; |
| 1667 step = (step == kSingleCharacter) ? kRange : kDone) { |
| 1668 for (int i = start_index; i < end_index; i++) { |
| 1669 int c = ranges2->at(i); |
| 1670 int next = ranges2->at(i + 1) - 1; |
| 1671 if (step == kRange || c == next) { |
| 1672 bool odd = (((i - start_index) & 1) == 1); |
| 1673 Label* in_range_label = odd ? odd_label : even_label; |
| 1674 Label rest; |
| 1675 EmitDoubleBoundaryTest(masm, c, next, &rest, in_range_label, &rest); |
| 1676 ASSERT(!rest.is_linked()); |
| 1677 // Cut out the single range by rewriting the array. |
| 1678 for (int j = i; j > start_index; j--) { |
| 1679 ranges2->at(j) = ranges2->at(j - 1); |
| 1680 } |
| 1681 for (int j = i + 1; j < end_index; j++) { |
| 1682 ranges2->at(j) = ranges2->at(j + 1); |
| 1683 } |
| 1684 GenerateBranches(masm, |
| 1685 ranges2, |
| 1686 start_index + 1, |
| 1687 end_index - 1, |
| 1688 min_char, |
| 1689 max_char, |
| 1690 fall_through, |
| 1691 even_label, |
| 1692 odd_label); |
| 1693 return; |
| 1694 } |
| 1695 } |
| 1696 } |
| 1697 UNREACHABLE(); |
| 1698 } |
| 1699 |
| 1700 // If there are a lot of intervals in the regexp, then we will use tables to |
| 1701 // determine whether the character is inside or outside the character class. |
| 1702 static const int kBits = RegExpMacroAssembler::kTableSizeBits; |
| 1703 static const int kSize = RegExpMacroAssembler::kTableSize; |
| 1704 static const int kMask = RegExpMacroAssembler::kTableMask; |
| 1705 |
| 1706 if ((max_char >> kBits) == (min_char >> kBits)) { |
| 1707 EmitUseLookupTable(masm, |
| 1708 ranges2, |
| 1709 start_index, |
| 1710 end_index, |
| 1711 min_char, |
| 1712 fall_through, |
| 1713 even_label, |
| 1714 odd_label); |
| 1715 } else { |
| 1716 if ((min_char >> kBits) != (first >> kBits)) { |
| 1717 masm->CheckCharacterLT(first, odd_label); |
| 1718 GenerateBranches(masm, |
| 1719 ranges2, |
| 1720 start_index + 1, |
| 1721 end_index, |
| 1722 first, |
| 1723 max_char, |
| 1724 fall_through, |
| 1725 odd_label, |
| 1726 even_label); |
| 1727 return; |
| 1728 } |
| 1729 |
| 1730 // Unicode case. Split the search space into kSize spaces that are handled |
| 1731 // with recursion. |
| 1732 int new_start_index = start_index; |
| 1733 int border = (first & ~kMask) + kSize; |
| 1734 while (new_start_index <= end_index) { |
| 1735 if (ranges2->at(new_start_index) > border) break; |
| 1736 new_start_index++; |
| 1737 } |
| 1738 // new_start_index is the index of the first edge that is beyond the |
| 1739 // current kSize space. |
| 1740 |
| 1741 // For very large search spaces we do a binary chop search of the non-ASCII |
| 1742 // space instead of just going to the end of the current kSize space. The |
| 1743 // heuristics are complicated a little by the fact that any 128-character |
| 1744 // encoding space can be quickly tested with a table lookup, so we don't |
| 1745 // wish to do binary chop search at a smaller granularity than that. A |
| 1746 // 128-character space can take up a lot of space in the ranges2 array if, |
| 1747 // for example, we only want to match every second character (eg. the lower |
| 1748 // case characters on some Unicode pages). |
| 1749 int binary_chop_index = (end_index + start_index) / 2; |
| 1750 if (border - 1 > String::kMaxAsciiCharCode && |
| 1751 end_index - start_index > (new_start_index - start_index) * 4 && |
| 1752 last - first > kSize * 4 && |
| 1753 binary_chop_index > new_start_index && |
| 1754 ranges2->at(binary_chop_index) > first + 2 * kSize) { |
| 1755 int scan_forward_for_section_border = binary_chop_index;; |
| 1756 int new_border = (ranges2->at(binary_chop_index) | kMask) + 1; |
| 1757 while (scan_forward_for_section_border < end_index) { |
| 1758 if (ranges2->at(scan_forward_for_section_border) > new_border) { |
| 1759 new_start_index = scan_forward_for_section_border; |
| 1760 border = new_border; |
| 1761 break; |
| 1762 } |
| 1763 scan_forward_for_section_border++; |
| 1764 } |
| 1765 } |
| 1766 |
| 1767 ASSERT(new_start_index > start_index); |
| 1768 Label handle_rest; |
| 1769 Label* above = &handle_rest; |
| 1770 int new_end_index = new_start_index - 1; |
| 1771 if (new_start_index > end_index) { |
| 1772 // We didn't find any section that started after the limit, so everything |
| 1773 // above the border is one of the terminal labels. |
| 1774 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; |
| 1775 } |
| 1776 if (ranges2->at(new_end_index) == border) { |
| 1777 new_end_index--; |
| 1778 } |
| 1779 masm->CheckCharacterGT(border - 1, above); |
| 1780 Label dummy; |
| 1781 GenerateBranches(masm, |
| 1782 ranges2, |
| 1783 start_index, |
| 1784 new_end_index, |
| 1785 min_char, |
| 1786 border - 1, |
| 1787 &dummy, |
| 1788 even_label, |
| 1789 odd_label); |
| 1790 if (handle_rest.is_linked()) { |
| 1791 masm->Bind(&handle_rest); |
| 1792 bool flip = (new_start_index & 1) != (start_index & 1); |
| 1793 GenerateBranches(masm, |
| 1794 ranges2, |
| 1795 new_start_index, |
| 1796 end_index, |
| 1797 border, |
| 1798 max_char, |
| 1799 &dummy, |
| 1800 flip ? odd_label : even_label, |
| 1801 flip ? even_label : odd_label); |
| 1802 } |
| 1803 } |
| 1804 } |
| 1805 |
| 1806 |
1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 1807 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
1538 RegExpCharacterClass* cc, | 1808 RegExpCharacterClass* cc, |
1539 bool ascii, | 1809 bool ascii, |
1540 Label* on_failure, | 1810 Label* on_failure, |
1541 int cp_offset, | 1811 int cp_offset, |
1542 bool check_offset, | 1812 bool check_offset, |
1543 bool preloaded) { | 1813 bool preloaded) { |
1544 ZoneList<CharacterRange>* ranges = cc->ranges(); | 1814 ZoneList<CharacterRange>* ranges = cc->ranges(); |
| 1815 if (!CharacterRange::IsCanonical(ranges)) { |
| 1816 CharacterRange::Canonicalize(ranges); |
| 1817 } |
| 1818 |
1545 int max_char; | 1819 int max_char; |
1546 if (ascii) { | 1820 if (ascii) { |
1547 max_char = String::kMaxAsciiCharCode; | 1821 max_char = String::kMaxAsciiCharCode; |
1548 } else { | 1822 } else { |
1549 max_char = String::kMaxUtf16CodeUnit; | 1823 max_char = String::kMaxUtf16CodeUnit; |
1550 } | 1824 } |
1551 | 1825 |
1552 Label success; | |
1553 | |
1554 Label* char_is_in_class = | |
1555 cc->is_negated() ? on_failure : &success; | |
1556 | |
1557 int range_count = ranges->length(); | 1826 int range_count = ranges->length(); |
1558 | 1827 |
1559 int last_valid_range = range_count - 1; | 1828 int last_valid_range = range_count - 1; |
1560 while (last_valid_range >= 0) { | 1829 while (last_valid_range >= 0) { |
1561 CharacterRange& range = ranges->at(last_valid_range); | 1830 CharacterRange& range = ranges->at(last_valid_range); |
1562 if (range.from() <= max_char) { | 1831 if (range.from() <= max_char) { |
1563 break; | 1832 break; |
1564 } | 1833 } |
1565 last_valid_range--; | 1834 last_valid_range--; |
1566 } | 1835 } |
1567 | 1836 |
1568 if (last_valid_range < 0) { | 1837 if (last_valid_range < 0) { |
1569 if (!cc->is_negated()) { | 1838 if (!cc->is_negated()) { |
1570 // TODO(plesner): We can remove this when the node level does our | |
1571 // ASCII optimizations for us. | |
1572 macro_assembler->GoTo(on_failure); | 1839 macro_assembler->GoTo(on_failure); |
1573 } | 1840 } |
1574 if (check_offset) { | 1841 if (check_offset) { |
1575 macro_assembler->CheckPosition(cp_offset, on_failure); | 1842 macro_assembler->CheckPosition(cp_offset, on_failure); |
1576 } | 1843 } |
1577 return; | 1844 return; |
1578 } | 1845 } |
1579 | 1846 |
1580 if (last_valid_range == 0 && | 1847 if (last_valid_range == 0 && |
| 1848 ranges->at(0).IsEverything(max_char)) { |
| 1849 if (cc->is_negated()) { |
| 1850 macro_assembler->GoTo(on_failure); |
| 1851 } else { |
| 1852 // This is a common case hit by non-anchored expressions. |
| 1853 if (check_offset) { |
| 1854 macro_assembler->CheckPosition(cp_offset, on_failure); |
| 1855 } |
| 1856 } |
| 1857 return; |
| 1858 } |
| 1859 if (last_valid_range == 0 && |
1581 !cc->is_negated() && | 1860 !cc->is_negated() && |
1582 ranges->at(0).IsEverything(max_char)) { | 1861 ranges->at(0).IsEverything(max_char)) { |
1583 // This is a common case hit by non-anchored expressions. | 1862 // This is a common case hit by non-anchored expressions. |
1584 if (check_offset) { | 1863 if (check_offset) { |
1585 macro_assembler->CheckPosition(cp_offset, on_failure); | 1864 macro_assembler->CheckPosition(cp_offset, on_failure); |
1586 } | 1865 } |
1587 return; | 1866 return; |
1588 } | 1867 } |
1589 | 1868 |
1590 if (!preloaded) { | 1869 if (!preloaded) { |
1591 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); | 1870 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); |
1592 } | 1871 } |
1593 | 1872 |
1594 if (cc->is_standard() && | 1873 if (cc->is_standard() && |
1595 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), | 1874 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
1596 on_failure)) { | 1875 on_failure)) { |
1597 return; | 1876 return; |
1598 } | 1877 } |
1599 | 1878 |
1600 for (int i = 0; i < last_valid_range; i++) { | 1879 |
1601 CharacterRange& range = ranges->at(i); | 1880 // A new list with ascending entries. Each entry is a code unit |
1602 Label next_range; | 1881 // where there is a boundary between code units that are part of |
1603 uc16 from = range.from(); | 1882 // the class and code units that are not. Normally we insert an |
1604 uc16 to = range.to(); | 1883 // entry at zero which goes to the failure label, but if there |
1605 if (from > max_char) { | 1884 // was already one there we fall through for success on that entry. |
1606 continue; | 1885 // Subsequent entries have alternating meaning (success/failure). |
1607 } | 1886 ZoneList<int>* ranges2 = |
1608 if (to > max_char) to = max_char; | 1887 new ZoneList<int>(last_valid_range); |
1609 if (to == from) { | 1888 |
1610 macro_assembler->CheckCharacter(to, char_is_in_class); | 1889 bool zeroth_entry_is_failure = true; |
1611 } else { | 1890 if (cc->is_negated()) zeroth_entry_is_failure = false; |
1612 if (from != 0) { | 1891 |
1613 macro_assembler->CheckCharacterLT(from, &next_range); | 1892 int start = ranges->at(0).from(); |
1614 } | 1893 if (start == 0) { |
1615 if (to != max_char) { | 1894 zeroth_entry_is_failure = !zeroth_entry_is_failure; |
1616 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); | 1895 } else { |
1617 } else { | 1896 ranges2->Add(0); |
1618 macro_assembler->GoTo(char_is_in_class); | |
1619 } | |
1620 } | |
1621 macro_assembler->Bind(&next_range); | |
1622 } | 1897 } |
1623 | 1898 |
1624 CharacterRange& range = ranges->at(last_valid_range); | 1899 for (int i = 0; i <= last_valid_range; i++) { |
1625 uc16 from = range.from(); | 1900 CharacterRange& range = ranges->at(i); |
1626 uc16 to = range.to(); | 1901 ranges2->Add(range.from()); |
| 1902 ranges2->Add(range.to() + 1); |
| 1903 } |
| 1904 int ranges2_length = ranges2->length() - 1; |
| 1905 if (ranges2->at(ranges2_length) >= max_char) ranges2_length--; |
1627 | 1906 |
1628 if (to > max_char) to = max_char; | 1907 Label fall_through; |
1629 ASSERT(to >= from); | 1908 GenerateBranches(macro_assembler, |
1630 | 1909 ranges2, |
1631 if (to == from) { | 1910 1, |
1632 if (cc->is_negated()) { | 1911 ranges2_length, |
1633 macro_assembler->CheckCharacter(to, on_failure); | 1912 0, |
1634 } else { | 1913 max_char, |
1635 macro_assembler->CheckNotCharacter(to, on_failure); | 1914 &fall_through, |
1636 } | 1915 zeroth_entry_is_failure ? &fall_through : on_failure, |
1637 } else { | 1916 zeroth_entry_is_failure ? on_failure : &fall_through); |
1638 if (from != 0) { | 1917 macro_assembler->Bind(&fall_through); |
1639 if (cc->is_negated()) { | |
1640 macro_assembler->CheckCharacterLT(from, &success); | |
1641 } else { | |
1642 macro_assembler->CheckCharacterLT(from, on_failure); | |
1643 } | |
1644 } | |
1645 if (to != String::kMaxUtf16CodeUnit) { | |
1646 if (cc->is_negated()) { | |
1647 macro_assembler->CheckCharacterLT(to + 1, on_failure); | |
1648 } else { | |
1649 macro_assembler->CheckCharacterGT(to, on_failure); | |
1650 } | |
1651 } else { | |
1652 if (cc->is_negated()) { | |
1653 macro_assembler->GoTo(on_failure); | |
1654 } | |
1655 } | |
1656 } | |
1657 macro_assembler->Bind(&success); | |
1658 } | 1918 } |
1659 | 1919 |
1660 | 1920 |
1661 RegExpNode::~RegExpNode() { | 1921 RegExpNode::~RegExpNode() { |
1662 } | 1922 } |
1663 | 1923 |
1664 | 1924 |
1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, | 1925 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, |
1666 Trace* trace) { | 1926 Trace* trace) { |
1667 // If we are generating a greedy loop then don't stop and don't reuse code. | 1927 // If we are generating a greedy loop then don't stop and don't reuse code. |
(...skipping 3666 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5334 } | 5594 } |
5335 | 5595 |
5336 return compiler.Assemble(¯o_assembler, | 5596 return compiler.Assemble(¯o_assembler, |
5337 node, | 5597 node, |
5338 data->capture_count, | 5598 data->capture_count, |
5339 pattern); | 5599 pattern); |
5340 } | 5600 } |
5341 | 5601 |
5342 | 5602 |
5343 }} // namespace v8::internal | 5603 }} // namespace v8::internal |
OLD | NEW |