OLD | NEW |
---|---|
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 1721 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1732 trace->backtrack()); | 1732 trace->backtrack()); |
1733 break; | 1733 break; |
1734 } | 1734 } |
1735 } | 1735 } |
1736 | 1736 |
1737 | 1737 |
1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
1740 | 1740 |
1741 | 1741 |
1742 // Returns the number of characters in the equivalence class, omitting those | |
1743 // that cannot occur in the source string because it is ASCII. | |
1744 static int GetCaseIndependentLetters(uc16 character, | |
1745 bool ascii_subject, | |
1746 unibrow::uchar* letters) { | |
1747 int length = uncanonicalize.get(character, '\0', letters); | |
1748 // Unibrow returns 0 or 1 for characters where case independependence is | |
1749 // trivial. | |
1750 if (length == 0) { | |
1751 letters[0] = character; | |
1752 length = 1; | |
1753 } | |
1754 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | |
1755 return length; | |
1756 } | |
1757 // The standard requires that non-ASCII characters cannot have ASCII | |
1758 // character codes in their equivalence class. | |
1759 return 0; | |
1760 } | |
1761 | |
1762 | |
1742 // Only emits non-letters (things that don't have case). Only used for case | 1763 // Only emits non-letters (things that don't have case). Only used for case |
1743 // independent matches. | 1764 // independent matches. |
1744 static inline bool EmitAtomNonLetter( | 1765 static inline bool EmitAtomNonLetter( |
1745 RegExpMacroAssembler* macro_assembler, | 1766 RegExpMacroAssembler* macro_assembler, |
1746 uc16 c, | 1767 uc16 c, |
1768 bool ascii, | |
1747 Label* on_failure, | 1769 Label* on_failure, |
1748 int cp_offset, | 1770 int cp_offset, |
1749 bool check, | 1771 bool check, |
1750 bool preloaded) { | 1772 bool preloaded) { |
1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1773 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1752 int length = uncanonicalize.get(c, '\0', chars); | 1774 int length = GetCaseIndependentLetters(c, ascii, chars); |
1775 if (length < 1) { | |
1776 // This can't match. Must be an ASCII subject and a non-ASCII character. | |
1777 // We do not need to do anything since the ASCII pass already handled this. | |
1778 return false; // Bounds not checked. | |
1779 } | |
1753 bool checked = false; | 1780 bool checked = false; |
1754 if (length <= 1) { | 1781 // We handle the length > 1 case in a later pass. |
1782 if (length == 1) { | |
1783 if (ascii && c > String::kMaxAsciiCharCodeU) { | |
1784 // Can't match - see above. | |
1785 return false; // Bounds not checked. | |
1786 } | |
1755 if (!preloaded) { | 1787 if (!preloaded) { |
1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1788 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
1757 checked = check; | 1789 checked = check; |
1758 } | 1790 } |
1759 macro_assembler->CheckNotCharacter(c, on_failure); | 1791 macro_assembler->CheckNotCharacter(c, on_failure); |
1760 } | 1792 } |
1761 return checked; | 1793 return checked; |
1762 } | 1794 } |
1763 | 1795 |
1764 | 1796 |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1798 return true; | 1830 return true; |
1799 } | 1831 } |
1800 return false; | 1832 return false; |
1801 } | 1833 } |
1802 | 1834 |
1803 | 1835 |
1804 // Only emits letters (things that have case). Only used for case independent | 1836 // Only emits letters (things that have case). Only used for case independent |
1805 // matches. | 1837 // matches. |
1806 static inline bool EmitAtomLetter( | 1838 static inline bool EmitAtomLetter( |
1807 RegExpMacroAssembler* macro_assembler, | 1839 RegExpMacroAssembler* macro_assembler, |
1840 uc16 c, | |
1808 bool ascii, | 1841 bool ascii, |
1809 uc16 c, | |
1810 Label* on_failure, | 1842 Label* on_failure, |
1811 int cp_offset, | 1843 int cp_offset, |
1812 bool check, | 1844 bool check, |
1813 bool preloaded) { | 1845 bool preloaded) { |
1814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1846 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1815 int length = uncanonicalize.get(c, '\0', chars); | 1847 int length = GetCaseIndependentLetters(c, ascii, chars); |
1816 if (length <= 1) return false; | 1848 if (length <= 1) return false; |
1817 // We may not need to check against the end of the input string | 1849 // We may not need to check against the end of the input string |
1818 // if this character lies before a character that matched. | 1850 // if this character lies before a character that matched. |
1819 if (!preloaded) { | 1851 if (!preloaded) { |
1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1852 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
1821 } | 1853 } |
1822 Label ok; | 1854 Label ok; |
1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1855 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
1824 switch (length) { | 1856 switch (length) { |
1825 case 2: { | 1857 case 2: { |
(...skipping 21 matching lines...) Expand all Loading... | |
1847 default: | 1879 default: |
1848 UNREACHABLE(); | 1880 UNREACHABLE(); |
1849 break; | 1881 break; |
1850 } | 1882 } |
1851 return true; | 1883 return true; |
1852 } | 1884 } |
1853 | 1885 |
1854 | 1886 |
1855 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 1887 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
1856 RegExpCharacterClass* cc, | 1888 RegExpCharacterClass* cc, |
1889 bool ascii, | |
1890 Label* on_failure, | |
1857 int cp_offset, | 1891 int cp_offset, |
1858 Label* on_failure, | |
1859 bool check_offset, | 1892 bool check_offset, |
1860 bool ascii, | |
1861 bool preloaded) { | 1893 bool preloaded) { |
1862 if (cc->is_standard() && | 1894 if (cc->is_standard() && |
1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), | 1895 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
1864 cp_offset, | 1896 cp_offset, |
1865 check_offset, | 1897 check_offset, |
1866 on_failure)) { | 1898 on_failure)) { |
1867 return; | 1899 return; |
1868 } | 1900 } |
1869 | 1901 |
1870 ZoneList<CharacterRange>* ranges = cc->ranges(); | 1902 ZoneList<CharacterRange>* ranges = cc->ranges(); |
(...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2231 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2263 for (int i = 0; i < characters && i < quarks.length(); i++) { |
2232 QuickCheckDetails::Position* pos = | 2264 QuickCheckDetails::Position* pos = |
2233 details->positions(characters_filled_in); | 2265 details->positions(characters_filled_in); |
2234 uc16 c = quarks[i]; | 2266 uc16 c = quarks[i]; |
2235 if (c > char_mask) { | 2267 if (c > char_mask) { |
2236 // If we expect a non-ASCII character from an ASCII string, | 2268 // If we expect a non-ASCII character from an ASCII string, |
2237 // there is no way we can match. Not even case independent | 2269 // there is no way we can match. Not even case independent |
2238 // matching can turn an ASCII character into non-ASCII or | 2270 // matching can turn an ASCII character into non-ASCII or |
2239 // vice versa. | 2271 // vice versa. |
2240 details->set_cannot_match(); | 2272 details->set_cannot_match(); |
2273 pos->determines_perfectly = false; | |
Lasse Reichstein
2009/02/18 15:20:44
Should this be set to false? If the remaining case
Erik Corry
2009/02/18 16:04:24
Determines perfectly indicates that this particula
| |
2241 return; | 2274 return; |
2242 } | 2275 } |
2243 if (compiler->ignore_case()) { | 2276 if (compiler->ignore_case()) { |
2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2277 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
2245 int length = uncanonicalize.get(c, '\0', chars); | 2278 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
2246 if (length < 2) { | 2279 ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
2280 if (length == 1) { | |
2247 // This letter has no case equivalents, so it's nice and simple | 2281 // This letter has no case equivalents, so it's nice and simple |
2248 // and the mask-compare will determine definitely whether we have | 2282 // and the mask-compare will determine definitely whether we have |
2249 // a match at this character position. | 2283 // a match at this character position. |
2250 pos->mask = char_mask; | 2284 pos->mask = char_mask; |
2251 pos->value = c; | 2285 pos->value = c; |
2252 pos->determines_perfectly = true; | 2286 pos->determines_perfectly = true; |
2253 } else { | 2287 } else { |
2254 uint32_t common_bits = char_mask; | 2288 uint32_t common_bits = char_mask; |
2255 uint32_t bits = chars[0]; | 2289 uint32_t bits = chars[0]; |
2256 for (int j = 1; j < length; j++) { | 2290 for (int j = 1; j < length; j++) { |
(...skipping 24 matching lines...) Expand all Loading... | |
2281 ASSERT(characters_filled_in <= details->characters()); | 2315 ASSERT(characters_filled_in <= details->characters()); |
2282 if (characters_filled_in == details->characters()) { | 2316 if (characters_filled_in == details->characters()) { |
2283 return; | 2317 return; |
2284 } | 2318 } |
2285 } | 2319 } |
2286 } else { | 2320 } else { |
2287 QuickCheckDetails::Position* pos = | 2321 QuickCheckDetails::Position* pos = |
2288 details->positions(characters_filled_in); | 2322 details->positions(characters_filled_in); |
2289 RegExpCharacterClass* tree = elm.data.u_char_class; | 2323 RegExpCharacterClass* tree = elm.data.u_char_class; |
2290 ZoneList<CharacterRange>* ranges = tree->ranges(); | 2324 ZoneList<CharacterRange>* ranges = tree->ranges(); |
2291 CharacterRange range = ranges->at(0); | |
2292 if (tree->is_negated()) { | 2325 if (tree->is_negated()) { |
2293 // A quick check uses multi-character mask and compare. There is no | 2326 // A quick check uses multi-character mask and compare. There is no |
2294 // useful way to incorporate a negative char class into this scheme | 2327 // useful way to incorporate a negative char class into this scheme |
2295 // so we just conservatively create a mask and value that will always | 2328 // so we just conservatively create a mask and value that will always |
2296 // succeed. | 2329 // succeed. |
Lasse Reichstein
2009/02/18 15:20:44
This comment should be changed to just "not yet ha
Erik Corry
2009/02/18 16:04:24
I don't think that char classes like [^\u0000-ac-\
| |
2297 pos->mask = 0; | 2330 pos->mask = 0; |
2298 pos->value = 0; | 2331 pos->value = 0; |
2299 } else { | 2332 } else { |
2300 uint32_t differing_bits = (range.from() ^ range.to()); | 2333 int first_range = 0; |
2334 while (ranges->at(first_range).from() > char_mask) { | |
2335 first_range++; | |
2336 if (first_range == ranges->length()) { | |
2337 details->set_cannot_match(); | |
2338 pos->determines_perfectly = false; | |
2339 return; | |
2340 } | |
2341 } | |
2342 CharacterRange range = ranges->at(first_range); | |
2343 uc16 from = range.from(); | |
2344 uc16 to = range.to(); | |
2345 if (to > char_mask) { | |
2346 to = char_mask; | |
2347 } | |
2348 uint32_t differing_bits = (from ^ to); | |
2301 // A mask and compare is only perfect if the differing bits form a | 2349 // A mask and compare is only perfect if the differing bits form a |
2302 // number like 00011111 with one single block of trailing 1s. | 2350 // number like 00011111 with one single block of trailing 1s. |
2303 if ((differing_bits & (differing_bits + 1)) == 0) { | 2351 if ((differing_bits & (differing_bits + 1)) == 0) { |
2304 pos->determines_perfectly = true; | 2352 pos->determines_perfectly = true; |
2305 } | 2353 } |
2306 uint32_t common_bits = ~SmearBitsRight(differing_bits); | 2354 uint32_t common_bits = ~SmearBitsRight(differing_bits); |
2307 uint32_t bits = (range.from() & common_bits); | 2355 uint32_t bits = (from & common_bits); |
2308 for (int i = 1; i < ranges->length(); i++) { | 2356 for (int i = first_range + 1; i < ranges->length(); i++) { |
2357 CharacterRange range = ranges->at(i); | |
2358 uc16 from = range.from(); | |
2359 uc16 to = range.to(); | |
2360 if (from > char_mask) continue; | |
2361 if (to > char_mask) to = char_mask; | |
2309 // Here we are combining more ranges into the mask and compare | 2362 // Here we are combining more ranges into the mask and compare |
2310 // value. With each new range the mask becomes more sparse and | 2363 // value. With each new range the mask becomes more sparse and |
2311 // so the chances of a false positive rise. A character class | 2364 // so the chances of a false positive rise. A character class |
2312 // with multiple ranges is assumed never to be equivalent to a | 2365 // with multiple ranges is assumed never to be equivalent to a |
2313 // mask and compare operation. | 2366 // mask and compare operation. |
2314 pos->determines_perfectly = false; | 2367 pos->determines_perfectly = false; |
2315 CharacterRange range = ranges->at(i); | 2368 uint32_t new_common_bits = (from ^ to); |
2316 uint32_t new_common_bits = (range.from() ^ range.to()); | |
2317 new_common_bits = ~SmearBitsRight(new_common_bits); | 2369 new_common_bits = ~SmearBitsRight(new_common_bits); |
2318 common_bits &= new_common_bits; | 2370 common_bits &= new_common_bits; |
2319 bits &= new_common_bits; | 2371 bits &= new_common_bits; |
2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits; | 2372 uint32_t differing_bits = (from & common_bits) ^ bits; |
2321 common_bits ^= differing_bits; | 2373 common_bits ^= differing_bits; |
2322 bits &= common_bits; | 2374 bits &= common_bits; |
2323 } | 2375 } |
2324 pos->mask = common_bits; | 2376 pos->mask = common_bits; |
2325 pos->value = bits; | 2377 pos->value = bits; |
2326 } | 2378 } |
2327 characters_filled_in++; | 2379 characters_filled_in++; |
2328 ASSERT(characters_filled_in <= details->characters()); | 2380 ASSERT(characters_filled_in <= details->characters()); |
2329 if (characters_filled_in == details->characters()) { | 2381 if (characters_filled_in == details->characters()) { |
2330 return; | 2382 return; |
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2612 return; | 2664 return; |
2613 case AT_NON_BOUNDARY: | 2665 case AT_NON_BOUNDARY: |
2614 case AT_BOUNDARY: | 2666 case AT_BOUNDARY: |
2615 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2667 EmitBoundaryCheck(type_, compiler, on_success(), trace); |
2616 return; | 2668 return; |
2617 } | 2669 } |
2618 on_success()->Emit(compiler, trace); | 2670 on_success()->Emit(compiler, trace); |
2619 } | 2671 } |
2620 | 2672 |
2621 | 2673 |
2674 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { | |
2675 if (quick_check == NULL) return false; | |
2676 if (offset >= quick_check->characters()) return false; | |
2677 return quick_check->positions(offset)->determines_perfectly; | |
Lasse Reichstein
2009/02/18 15:20:44
Just checking: If we find that a position determin
Erik Corry
2009/02/18 16:04:24
See above.
| |
2678 } | |
2679 | |
2680 | |
2681 static void UpdateBoundsCheck(int index, int* checked_up_to) { | |
2682 if (index > *checked_up_to) { | |
2683 *checked_up_to = index; | |
2684 } | |
2685 } | |
2686 | |
2687 | |
2622 // We call this repeatedly to generate code for each pass over the text node. | 2688 // We call this repeatedly to generate code for each pass over the text node. |
2623 // The passes are in increasing order of difficulty because we hope one | 2689 // The passes are in increasing order of difficulty because we hope one |
2624 // of the first passes will fail in which case we are saved the work of the | 2690 // of the first passes will fail in which case we are saved the work of the |
2625 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ | 2691 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ |
2626 // we will check the '%' in the first pass, the case independent 'a' in the | 2692 // we will check the '%' in the first pass, the case independent 'a' in the |
2627 // second pass and the character class in the last pass. | 2693 // second pass and the character class in the last pass. |
2628 // | 2694 // |
2629 // The passes are done from right to left, so for example to test for /bar/ | 2695 // The passes are done from right to left, so for example to test for /bar/ |
2630 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 | 2696 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 |
2631 // and then a 'b' with offset 0. This means we can avoid the end-of-input | 2697 // and then a 'b' with offset 0. This means we can avoid the end-of-input |
(...skipping 14 matching lines...) Expand all Loading... | |
2646 // order to get to the code we are now generating. The quick check can involve | 2712 // order to get to the code we are now generating. The quick check can involve |
2647 // loading characters, which means we do not need to recheck the bounds | 2713 // loading characters, which means we do not need to recheck the bounds |
2648 // up to the limit the quick check already checked. In addition the quick | 2714 // up to the limit the quick check already checked. In addition the quick |
2649 // check can have involved a mask and compare operation which may simplify | 2715 // check can have involved a mask and compare operation which may simplify |
2650 // or obviate the need for further checks at some character positions. | 2716 // or obviate the need for further checks at some character positions. |
2651 void TextNode::TextEmitPass(RegExpCompiler* compiler, | 2717 void TextNode::TextEmitPass(RegExpCompiler* compiler, |
2652 TextEmitPassType pass, | 2718 TextEmitPassType pass, |
2653 bool preloaded, | 2719 bool preloaded, |
2654 Trace* trace, | 2720 Trace* trace, |
2655 bool first_element_checked, | 2721 bool first_element_checked, |
2656 int* checked_up_to) { | 2722 int* checked_up_to) { |
Lasse Reichstein
2009/02/18 15:20:44
This function is one big glob of special cases.
If
Erik Corry
2009/02/18 16:04:24
I've refactored a little more. It is now only 68
| |
2657 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2723 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
2658 bool ascii = compiler->ascii(); | 2724 bool ascii = compiler->ascii(); |
2659 Label* backtrack = trace->backtrack(); | 2725 Label* backtrack = trace->backtrack(); |
2660 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 2726 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
2661 int element_count = elms_->length(); | 2727 int element_count = elms_->length(); |
2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 2728 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
2663 TextElement elm = elms_->at(i); | 2729 TextElement elm = elms_->at(i); |
2664 int cp_offset = trace->cp_offset() + elm.cp_offset; | 2730 int cp_offset = trace->cp_offset() + elm.cp_offset; |
2665 if (elm.type == TextElement::ATOM) { | 2731 if (elm.type == TextElement::ATOM) { |
2666 if (pass == NON_ASCII_MATCH || | 2732 if (pass != CHARACTER_CLASS_MATCH) { |
2667 pass == CHARACTER_MATCH || | |
2668 pass == CASE_CHARACTER_MATCH) { | |
2669 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2733 Vector<const uc16> quarks = elm.data.u_atom->data(); |
2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { | 2734 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
2671 bool bound_checked = true; // Most ops will check their bounds. | 2735 bool bound_checked = false; |
2672 if (first_element_checked && i == 0 && j == 0) continue; | 2736 if (first_element_checked && i == 0 && j == 0) continue; |
2673 if (pass == NON_ASCII_MATCH) { | 2737 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; |
2674 ASSERT(ascii); | 2738 switch (pass) { |
2675 if (quarks[j] > String::kMaxAsciiCharCode) { | 2739 case NON_ASCII_MATCH: |
2676 assembler->GoTo(backtrack); | 2740 ASSERT(ascii); |
2677 return; | 2741 if (quarks[j] > String::kMaxAsciiCharCode) { |
2678 } | 2742 assembler->GoTo(backtrack); |
2679 } else { | 2743 return; |
2680 if (quick_check != NULL && | 2744 } |
2681 elm.cp_offset + j < quick_check->characters() && | 2745 break; |
2682 quick_check->positions(elm.cp_offset + j)-> | 2746 case NON_LETTER_CHARACTER_MATCH: |
2683 determines_perfectly) { | 2747 bound_checked = EmitAtomNonLetter(assembler, |
2684 continue; | 2748 quarks[j], |
2685 } | 2749 ascii, |
2686 if (pass == CHARACTER_MATCH) { | 2750 backtrack, |
2687 if (compiler->ignore_case()) { | 2751 cp_offset + j, |
2688 bound_checked = EmitAtomNonLetter( | 2752 *checked_up_to < cp_offset + j, |
2689 assembler, | 2753 preloaded); |
2690 quarks[j], | 2754 break; |
2755 case SIMPLE_CHARACTER_MATCH: | |
2756 if (!preloaded) { | |
2757 assembler->LoadCurrentCharacter( | |
2758 cp_offset + j, | |
2691 backtrack, | 2759 backtrack, |
2692 cp_offset + j, | 2760 *checked_up_to < cp_offset + j); |
2693 *checked_up_to < cp_offset + j, | 2761 bound_checked = true; |
2694 preloaded); | |
2695 } else { | |
2696 if (!preloaded) { | |
2697 assembler->LoadCurrentCharacter( | |
2698 cp_offset + j, | |
2699 backtrack, | |
2700 *checked_up_to < cp_offset + j); | |
2701 } | |
2702 assembler->CheckNotCharacter(quarks[j], backtrack); | |
2703 } | 2762 } |
2704 } else { | 2763 assembler->CheckNotCharacter(quarks[j], backtrack); |
2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH); | 2764 break; |
2706 ASSERT(compiler->ignore_case()); | 2765 case CASE_CHARACTER_MATCH: |
2707 bound_checked = EmitAtomLetter(assembler, | 2766 bound_checked = EmitAtomLetter(assembler, |
2708 compiler->ascii(), | |
2709 quarks[j], | 2767 quarks[j], |
2768 ascii, | |
2710 backtrack, | 2769 backtrack, |
2711 cp_offset + j, | 2770 cp_offset + j, |
2712 *checked_up_to < cp_offset + j, | 2771 *checked_up_to < cp_offset + j, |
2713 preloaded); | 2772 preloaded); |
2714 } | 2773 break; |
2715 if (bound_checked) { | 2774 default: |
2716 if (cp_offset + j > *checked_up_to) { | 2775 break; |
2717 *checked_up_to = cp_offset + j; | |
2718 } | |
2719 } | |
2720 } | 2776 } |
2777 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | |
2721 } | 2778 } |
2722 } | 2779 } |
2723 } else { | 2780 } else { |
2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); | 2781 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
2725 if (first_element_checked && i == 0) continue; | |
2726 if (quick_check != NULL && | |
2727 elm.cp_offset < quick_check->characters() && | |
2728 quick_check->positions(elm.cp_offset)->determines_perfectly) { | |
2729 continue; | |
2730 } | |
2731 if (pass == CHARACTER_CLASS_MATCH) { | 2782 if (pass == CHARACTER_CLASS_MATCH) { |
2783 if (first_element_checked && i == 0) continue; | |
2784 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; | |
2732 RegExpCharacterClass* cc = elm.data.u_char_class; | 2785 RegExpCharacterClass* cc = elm.data.u_char_class; |
2733 EmitCharClass(assembler, | 2786 EmitCharClass(assembler, |
2734 cc, | 2787 cc, |
2788 ascii, | |
2789 backtrack, | |
2735 cp_offset, | 2790 cp_offset, |
2736 backtrack, | |
2737 *checked_up_to < cp_offset, | 2791 *checked_up_to < cp_offset, |
2738 ascii, | |
2739 preloaded); | 2792 preloaded); |
2740 if (cp_offset > *checked_up_to) { | 2793 UpdateBoundsCheck(cp_offset, checked_up_to); |
2741 *checked_up_to = cp_offset; | |
2742 } | |
2743 } | 2794 } |
2744 } | 2795 } |
2745 } | 2796 } |
2746 } | 2797 } |
2747 | 2798 |
2748 | 2799 |
2749 int TextNode::Length() { | 2800 int TextNode::Length() { |
2750 TextElement elm = elms_->last(); | 2801 TextElement elm = elms_->last(); |
2751 ASSERT(elm.cp_offset >= 0); | 2802 ASSERT(elm.cp_offset >= 0); |
2752 if (elm.type == TextElement::ATOM) { | 2803 if (elm.type == TextElement::ATOM) { |
2753 return elm.cp_offset + elm.data.u_atom->data().length(); | 2804 return elm.cp_offset + elm.data.u_atom->data().length(); |
2754 } else { | 2805 } else { |
2755 return elm.cp_offset + 1; | 2806 return elm.cp_offset + 1; |
2756 } | 2807 } |
2757 } | 2808 } |
2758 | 2809 |
2759 | 2810 |
2811 bool TextNode::SkipPass(int int_pass, bool ignore_case) { | |
2812 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); | |
2813 if (ignore_case) { | |
2814 return pass == SIMPLE_CHARACTER_MATCH; | |
2815 } else { | |
2816 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; | |
2817 } | |
2818 } | |
2819 | |
2820 | |
2760 // This generates the code to match a text node. A text node can contain | 2821 // This generates the code to match a text node. A text node can contain |
2761 // straight character sequences (possibly to be matched in a case-independent | 2822 // straight character sequences (possibly to be matched in a case-independent |
2762 // way) and character classes. For efficiency we do not do this in a single | 2823 // way) and character classes. For efficiency we do not do this in a single |
2763 // pass from left to right. Instead we pass over the text node several times, | 2824 // pass from left to right. Instead we pass over the text node several times, |
2764 // emitting code for some character positions every time. See the comment on | 2825 // emitting code for some character positions every time. See the comment on |
2765 // TextEmitPass for details. | 2826 // TextEmitPass for details. |
2766 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | 2827 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
2767 LimitResult limit_result = LimitVersions(compiler, trace); | 2828 LimitResult limit_result = LimitVersions(compiler, trace); |
2768 if (limit_result == DONE) return; | 2829 if (limit_result == DONE) return; |
2769 ASSERT(limit_result == CONTINUE); | 2830 ASSERT(limit_result == CONTINUE); |
2770 | 2831 |
2771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | 2832 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { |
2772 compiler->SetRegExpTooBig(); | 2833 compiler->SetRegExpTooBig(); |
2773 return; | 2834 return; |
2774 } | 2835 } |
2775 | 2836 |
2776 if (compiler->ascii()) { | 2837 if (compiler->ascii()) { |
2777 int dummy = 0; | 2838 int dummy = 0; |
2778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | 2839 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); |
2779 } | 2840 } |
2780 | 2841 |
2781 bool first_elt_done = false; | 2842 bool first_elt_done = false; |
2782 int bound_checked_to = trace->cp_offset() - 1; | 2843 int bound_checked_to = trace->cp_offset() - 1; |
2783 bound_checked_to += trace->bound_checked_up_to(); | 2844 bound_checked_to += trace->bound_checked_up_to(); |
2784 | 2845 |
2785 // If a character is preloaded into the current character register then | 2846 // If a character is preloaded into the current character register then |
2786 // check that now. | 2847 // check that now. |
2787 if (trace->characters_preloaded() == 1) { | 2848 if (trace->characters_preloaded() == 1) { |
2788 TextEmitPass(compiler, | 2849 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
2789 CHARACTER_MATCH, | 2850 if (!SkipPass(pass, compiler->ignore_case())) { |
2790 true, | 2851 TextEmitPass(compiler, |
2791 trace, | 2852 static_cast<TextEmitPassType>(pass), |
2792 false, | 2853 true, |
2793 &bound_checked_to); | 2854 trace, |
2794 if (compiler->ignore_case()) { | 2855 false, |
2795 TextEmitPass(compiler, | 2856 &bound_checked_to); |
2796 CASE_CHARACTER_MATCH, | 2857 } |
2797 true, | |
2798 trace, | |
2799 false, | |
2800 &bound_checked_to); | |
2801 } | 2858 } |
2802 TextEmitPass(compiler, | |
2803 CHARACTER_CLASS_MATCH, | |
2804 true, | |
2805 trace, | |
2806 false, | |
2807 &bound_checked_to); | |
2808 first_elt_done = true; | 2859 first_elt_done = true; |
2809 } | 2860 } |
2810 | 2861 |
2811 TextEmitPass(compiler, | 2862 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
2812 CHARACTER_MATCH, | 2863 if (!SkipPass(pass, compiler->ignore_case())) { |
2813 false, | 2864 TextEmitPass(compiler, |
2814 trace, | 2865 static_cast<TextEmitPassType>(pass), |
2815 first_elt_done, | 2866 false, |
2816 &bound_checked_to); | 2867 trace, |
2817 if (compiler->ignore_case()) { | 2868 first_elt_done, |
2818 TextEmitPass(compiler, | 2869 &bound_checked_to); |
2819 CASE_CHARACTER_MATCH, | 2870 } |
2820 false, | |
2821 trace, | |
2822 first_elt_done, | |
2823 &bound_checked_to); | |
2824 } | 2871 } |
2825 TextEmitPass(compiler, | |
2826 CHARACTER_CLASS_MATCH, | |
2827 false, | |
2828 trace, | |
2829 first_elt_done, | |
2830 &bound_checked_to); | |
2831 | 2872 |
2832 Trace successor_trace(*trace); | 2873 Trace successor_trace(*trace); |
2833 successor_trace.set_at_start(false); | 2874 successor_trace.set_at_start(false); |
2834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); | 2875 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); |
2835 RecursionCheck rc(compiler); | 2876 RecursionCheck rc(compiler); |
2836 on_success()->Emit(compiler, &successor_trace); | 2877 on_success()->Emit(compiler, &successor_trace); |
2837 } | 2878 } |
2838 | 2879 |
2839 | 2880 |
2840 void Trace::InvalidateCurrentCharacter() { | 2881 void Trace::InvalidateCurrentCharacter() { |
(...skipping 2043 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4884 EmbeddedVector<byte, 1024> codes; | 4925 EmbeddedVector<byte, 1024> codes; |
4885 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4926 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
4886 return compiler.Assemble(¯o_assembler, | 4927 return compiler.Assemble(¯o_assembler, |
4887 node, | 4928 node, |
4888 data->capture_count, | 4929 data->capture_count, |
4889 pattern); | 4930 pattern); |
4890 } | 4931 } |
4891 | 4932 |
4892 | 4933 |
4893 }} // namespace v8::internal | 4934 }} // namespace v8::internal |
OLD | NEW |