Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 1721 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1732 trace->backtrack()); | 1732 trace->backtrack()); |
| 1733 break; | 1733 break; |
| 1734 } | 1734 } |
| 1735 } | 1735 } |
| 1736 | 1736 |
| 1737 | 1737 |
| 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
| 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
| 1740 | 1740 |
| 1741 | 1741 |
| 1742 // Returns the number of characters in the equivalence class, omitting those | |
| 1743 // that cannot occur in the source string because it is ASCII. | |
| 1744 static int GetCaseIndependentLetters(uc16 character, | |
| 1745 bool ascii_subject, | |
| 1746 unibrow::uchar* letters) { | |
| 1747 int length = uncanonicalize.get(character, '\0', letters); | |
| 1748 // Unibrow returns 0 or 1 for characters where case independependence is | |
| 1749 // trivial. | |
| 1750 if (length == 0) { | |
| 1751 letters[0] = character; | |
| 1752 length = 1; | |
| 1753 } | |
| 1754 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | |
| 1755 return length; | |
| 1756 } | |
| 1757 // The standard requires that non-ASCII characters cannot have ASCII | |
| 1758 // character codes in their equivalence class. | |
| 1759 return 0; | |
| 1760 } | |
| 1761 | |
| 1762 | |
| 1742 // Only emits non-letters (things that don't have case). Only used for case | 1763 // Only emits non-letters (things that don't have case). Only used for case |
| 1743 // independent matches. | 1764 // independent matches. |
| 1744 static inline bool EmitAtomNonLetter( | 1765 static inline bool EmitAtomNonLetter( |
| 1745 RegExpMacroAssembler* macro_assembler, | 1766 RegExpMacroAssembler* macro_assembler, |
| 1746 uc16 c, | 1767 uc16 c, |
| 1768 bool ascii, | |
| 1747 Label* on_failure, | 1769 Label* on_failure, |
| 1748 int cp_offset, | 1770 int cp_offset, |
| 1749 bool check, | 1771 bool check, |
| 1750 bool preloaded) { | 1772 bool preloaded) { |
| 1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1773 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1752 int length = uncanonicalize.get(c, '\0', chars); | 1774 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1775 if (length < 1) { | |
| 1776 // This can't match. Must be an ASCII subject and a non-ASCII character. | |
| 1777 // We do not need to do anything since the ASCII pass already handled this. | |
| 1778 return false; // Bounds not checked. | |
| 1779 } | |
| 1753 bool checked = false; | 1780 bool checked = false; |
| 1754 if (length <= 1) { | 1781 // We handle the length > 1 case in a later pass. |
| 1782 if (length == 1) { | |
| 1783 if (ascii && c > String::kMaxAsciiCharCodeU) { | |
| 1784 // Can't match - see above. | |
| 1785 return false; // Bounds not checked. | |
| 1786 } | |
| 1755 if (!preloaded) { | 1787 if (!preloaded) { |
| 1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1788 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1757 checked = check; | 1789 checked = check; |
| 1758 } | 1790 } |
| 1759 macro_assembler->CheckNotCharacter(c, on_failure); | 1791 macro_assembler->CheckNotCharacter(c, on_failure); |
| 1760 } | 1792 } |
| 1761 return checked; | 1793 return checked; |
| 1762 } | 1794 } |
| 1763 | 1795 |
| 1764 | 1796 |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1798 return true; | 1830 return true; |
| 1799 } | 1831 } |
| 1800 return false; | 1832 return false; |
| 1801 } | 1833 } |
| 1802 | 1834 |
| 1803 | 1835 |
| 1804 // Only emits letters (things that have case). Only used for case independent | 1836 // Only emits letters (things that have case). Only used for case independent |
| 1805 // matches. | 1837 // matches. |
| 1806 static inline bool EmitAtomLetter( | 1838 static inline bool EmitAtomLetter( |
| 1807 RegExpMacroAssembler* macro_assembler, | 1839 RegExpMacroAssembler* macro_assembler, |
| 1840 uc16 c, | |
| 1808 bool ascii, | 1841 bool ascii, |
| 1809 uc16 c, | |
| 1810 Label* on_failure, | 1842 Label* on_failure, |
| 1811 int cp_offset, | 1843 int cp_offset, |
| 1812 bool check, | 1844 bool check, |
| 1813 bool preloaded) { | 1845 bool preloaded) { |
| 1814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1846 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1815 int length = uncanonicalize.get(c, '\0', chars); | 1847 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1816 if (length <= 1) return false; | 1848 if (length <= 1) return false; |
| 1817 // We may not need to check against the end of the input string | 1849 // We may not need to check against the end of the input string |
| 1818 // if this character lies before a character that matched. | 1850 // if this character lies before a character that matched. |
| 1819 if (!preloaded) { | 1851 if (!preloaded) { |
| 1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1852 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1821 } | 1853 } |
| 1822 Label ok; | 1854 Label ok; |
| 1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1855 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| 1824 switch (length) { | 1856 switch (length) { |
| 1825 case 2: { | 1857 case 2: { |
| (...skipping 21 matching lines...) Expand all Loading... | |
| 1847 default: | 1879 default: |
| 1848 UNREACHABLE(); | 1880 UNREACHABLE(); |
| 1849 break; | 1881 break; |
| 1850 } | 1882 } |
| 1851 return true; | 1883 return true; |
| 1852 } | 1884 } |
| 1853 | 1885 |
| 1854 | 1886 |
| 1855 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 1887 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| 1856 RegExpCharacterClass* cc, | 1888 RegExpCharacterClass* cc, |
| 1889 bool ascii, | |
| 1890 Label* on_failure, | |
| 1857 int cp_offset, | 1891 int cp_offset, |
| 1858 Label* on_failure, | |
| 1859 bool check_offset, | 1892 bool check_offset, |
| 1860 bool ascii, | |
| 1861 bool preloaded) { | 1893 bool preloaded) { |
| 1862 if (cc->is_standard() && | 1894 if (cc->is_standard() && |
| 1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), | 1895 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
| 1864 cp_offset, | 1896 cp_offset, |
| 1865 check_offset, | 1897 check_offset, |
| 1866 on_failure)) { | 1898 on_failure)) { |
| 1867 return; | 1899 return; |
| 1868 } | 1900 } |
| 1869 | 1901 |
| 1870 ZoneList<CharacterRange>* ranges = cc->ranges(); | 1902 ZoneList<CharacterRange>* ranges = cc->ranges(); |
| (...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2231 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2263 for (int i = 0; i < characters && i < quarks.length(); i++) { |
| 2232 QuickCheckDetails::Position* pos = | 2264 QuickCheckDetails::Position* pos = |
| 2233 details->positions(characters_filled_in); | 2265 details->positions(characters_filled_in); |
| 2234 uc16 c = quarks[i]; | 2266 uc16 c = quarks[i]; |
| 2235 if (c > char_mask) { | 2267 if (c > char_mask) { |
| 2236 // If we expect a non-ASCII character from an ASCII string, | 2268 // If we expect a non-ASCII character from an ASCII string, |
| 2237 // there is no way we can match. Not even case independent | 2269 // there is no way we can match. Not even case independent |
| 2238 // matching can turn an ASCII character into non-ASCII or | 2270 // matching can turn an ASCII character into non-ASCII or |
| 2239 // vice versa. | 2271 // vice versa. |
| 2240 details->set_cannot_match(); | 2272 details->set_cannot_match(); |
| 2273 pos->determines_perfectly = false; | |
|
Lasse Reichstein
2009/02/18 15:20:44
Should this be set to false? If the remaining case
Erik Corry
2009/02/18 16:04:24
Determines perfectly indicates that this particula
| |
| 2241 return; | 2274 return; |
| 2242 } | 2275 } |
| 2243 if (compiler->ignore_case()) { | 2276 if (compiler->ignore_case()) { |
| 2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2277 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 2245 int length = uncanonicalize.get(c, '\0', chars); | 2278 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
| 2246 if (length < 2) { | 2279 ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
| 2280 if (length == 1) { | |
| 2247 // This letter has no case equivalents, so it's nice and simple | 2281 // This letter has no case equivalents, so it's nice and simple |
| 2248 // and the mask-compare will determine definitely whether we have | 2282 // and the mask-compare will determine definitely whether we have |
| 2249 // a match at this character position. | 2283 // a match at this character position. |
| 2250 pos->mask = char_mask; | 2284 pos->mask = char_mask; |
| 2251 pos->value = c; | 2285 pos->value = c; |
| 2252 pos->determines_perfectly = true; | 2286 pos->determines_perfectly = true; |
| 2253 } else { | 2287 } else { |
| 2254 uint32_t common_bits = char_mask; | 2288 uint32_t common_bits = char_mask; |
| 2255 uint32_t bits = chars[0]; | 2289 uint32_t bits = chars[0]; |
| 2256 for (int j = 1; j < length; j++) { | 2290 for (int j = 1; j < length; j++) { |
| (...skipping 24 matching lines...) Expand all Loading... | |
| 2281 ASSERT(characters_filled_in <= details->characters()); | 2315 ASSERT(characters_filled_in <= details->characters()); |
| 2282 if (characters_filled_in == details->characters()) { | 2316 if (characters_filled_in == details->characters()) { |
| 2283 return; | 2317 return; |
| 2284 } | 2318 } |
| 2285 } | 2319 } |
| 2286 } else { | 2320 } else { |
| 2287 QuickCheckDetails::Position* pos = | 2321 QuickCheckDetails::Position* pos = |
| 2288 details->positions(characters_filled_in); | 2322 details->positions(characters_filled_in); |
| 2289 RegExpCharacterClass* tree = elm.data.u_char_class; | 2323 RegExpCharacterClass* tree = elm.data.u_char_class; |
| 2290 ZoneList<CharacterRange>* ranges = tree->ranges(); | 2324 ZoneList<CharacterRange>* ranges = tree->ranges(); |
| 2291 CharacterRange range = ranges->at(0); | |
| 2292 if (tree->is_negated()) { | 2325 if (tree->is_negated()) { |
| 2293 // A quick check uses multi-character mask and compare. There is no | 2326 // A quick check uses multi-character mask and compare. There is no |
| 2294 // useful way to incorporate a negative char class into this scheme | 2327 // useful way to incorporate a negative char class into this scheme |
| 2295 // so we just conservatively create a mask and value that will always | 2328 // so we just conservatively create a mask and value that will always |
| 2296 // succeed. | 2329 // succeed. |
|
Lasse Reichstein
2009/02/18 15:20:44
This comment should be changed to just "not yet ha
Erik Corry
2009/02/18 16:04:24
I don't think that char classes like [^\u0000-ac-\
| |
| 2297 pos->mask = 0; | 2330 pos->mask = 0; |
| 2298 pos->value = 0; | 2331 pos->value = 0; |
| 2299 } else { | 2332 } else { |
| 2300 uint32_t differing_bits = (range.from() ^ range.to()); | 2333 int first_range = 0; |
| 2334 while (ranges->at(first_range).from() > char_mask) { | |
| 2335 first_range++; | |
| 2336 if (first_range == ranges->length()) { | |
| 2337 details->set_cannot_match(); | |
| 2338 pos->determines_perfectly = false; | |
| 2339 return; | |
| 2340 } | |
| 2341 } | |
| 2342 CharacterRange range = ranges->at(first_range); | |
| 2343 uc16 from = range.from(); | |
| 2344 uc16 to = range.to(); | |
| 2345 if (to > char_mask) { | |
| 2346 to = char_mask; | |
| 2347 } | |
| 2348 uint32_t differing_bits = (from ^ to); | |
| 2301 // A mask and compare is only perfect if the differing bits form a | 2349 // A mask and compare is only perfect if the differing bits form a |
| 2302 // number like 00011111 with one single block of trailing 1s. | 2350 // number like 00011111 with one single block of trailing 1s. |
| 2303 if ((differing_bits & (differing_bits + 1)) == 0) { | 2351 if ((differing_bits & (differing_bits + 1)) == 0) { |
| 2304 pos->determines_perfectly = true; | 2352 pos->determines_perfectly = true; |
| 2305 } | 2353 } |
| 2306 uint32_t common_bits = ~SmearBitsRight(differing_bits); | 2354 uint32_t common_bits = ~SmearBitsRight(differing_bits); |
| 2307 uint32_t bits = (range.from() & common_bits); | 2355 uint32_t bits = (from & common_bits); |
| 2308 for (int i = 1; i < ranges->length(); i++) { | 2356 for (int i = first_range + 1; i < ranges->length(); i++) { |
| 2357 CharacterRange range = ranges->at(i); | |
| 2358 uc16 from = range.from(); | |
| 2359 uc16 to = range.to(); | |
| 2360 if (from > char_mask) continue; | |
| 2361 if (to > char_mask) to = char_mask; | |
| 2309 // Here we are combining more ranges into the mask and compare | 2362 // Here we are combining more ranges into the mask and compare |
| 2310 // value. With each new range the mask becomes more sparse and | 2363 // value. With each new range the mask becomes more sparse and |
| 2311 // so the chances of a false positive rise. A character class | 2364 // so the chances of a false positive rise. A character class |
| 2312 // with multiple ranges is assumed never to be equivalent to a | 2365 // with multiple ranges is assumed never to be equivalent to a |
| 2313 // mask and compare operation. | 2366 // mask and compare operation. |
| 2314 pos->determines_perfectly = false; | 2367 pos->determines_perfectly = false; |
| 2315 CharacterRange range = ranges->at(i); | 2368 uint32_t new_common_bits = (from ^ to); |
| 2316 uint32_t new_common_bits = (range.from() ^ range.to()); | |
| 2317 new_common_bits = ~SmearBitsRight(new_common_bits); | 2369 new_common_bits = ~SmearBitsRight(new_common_bits); |
| 2318 common_bits &= new_common_bits; | 2370 common_bits &= new_common_bits; |
| 2319 bits &= new_common_bits; | 2371 bits &= new_common_bits; |
| 2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits; | 2372 uint32_t differing_bits = (from & common_bits) ^ bits; |
| 2321 common_bits ^= differing_bits; | 2373 common_bits ^= differing_bits; |
| 2322 bits &= common_bits; | 2374 bits &= common_bits; |
| 2323 } | 2375 } |
| 2324 pos->mask = common_bits; | 2376 pos->mask = common_bits; |
| 2325 pos->value = bits; | 2377 pos->value = bits; |
| 2326 } | 2378 } |
| 2327 characters_filled_in++; | 2379 characters_filled_in++; |
| 2328 ASSERT(characters_filled_in <= details->characters()); | 2380 ASSERT(characters_filled_in <= details->characters()); |
| 2329 if (characters_filled_in == details->characters()) { | 2381 if (characters_filled_in == details->characters()) { |
| 2330 return; | 2382 return; |
| (...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2612 return; | 2664 return; |
| 2613 case AT_NON_BOUNDARY: | 2665 case AT_NON_BOUNDARY: |
| 2614 case AT_BOUNDARY: | 2666 case AT_BOUNDARY: |
| 2615 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2667 EmitBoundaryCheck(type_, compiler, on_success(), trace); |
| 2616 return; | 2668 return; |
| 2617 } | 2669 } |
| 2618 on_success()->Emit(compiler, trace); | 2670 on_success()->Emit(compiler, trace); |
| 2619 } | 2671 } |
| 2620 | 2672 |
| 2621 | 2673 |
| 2674 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { | |
| 2675 if (quick_check == NULL) return false; | |
| 2676 if (offset >= quick_check->characters()) return false; | |
| 2677 return quick_check->positions(offset)->determines_perfectly; | |
|
Lasse Reichstein
2009/02/18 15:20:44
Just checking: If we find that a position determin
Erik Corry
2009/02/18 16:04:24
See above.
| |
| 2678 } | |
| 2679 | |
| 2680 | |
| 2681 static void UpdateBoundsCheck(int index, int* checked_up_to) { | |
| 2682 if (index > *checked_up_to) { | |
| 2683 *checked_up_to = index; | |
| 2684 } | |
| 2685 } | |
| 2686 | |
| 2687 | |
| 2622 // We call this repeatedly to generate code for each pass over the text node. | 2688 // We call this repeatedly to generate code for each pass over the text node. |
| 2623 // The passes are in increasing order of difficulty because we hope one | 2689 // The passes are in increasing order of difficulty because we hope one |
| 2624 // of the first passes will fail in which case we are saved the work of the | 2690 // of the first passes will fail in which case we are saved the work of the |
| 2625 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ | 2691 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ |
| 2626 // we will check the '%' in the first pass, the case independent 'a' in the | 2692 // we will check the '%' in the first pass, the case independent 'a' in the |
| 2627 // second pass and the character class in the last pass. | 2693 // second pass and the character class in the last pass. |
| 2628 // | 2694 // |
| 2629 // The passes are done from right to left, so for example to test for /bar/ | 2695 // The passes are done from right to left, so for example to test for /bar/ |
| 2630 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 | 2696 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 |
| 2631 // and then a 'b' with offset 0. This means we can avoid the end-of-input | 2697 // and then a 'b' with offset 0. This means we can avoid the end-of-input |
| (...skipping 14 matching lines...) Expand all Loading... | |
| 2646 // order to get to the code we are now generating. The quick check can involve | 2712 // order to get to the code we are now generating. The quick check can involve |
| 2647 // loading characters, which means we do not need to recheck the bounds | 2713 // loading characters, which means we do not need to recheck the bounds |
| 2648 // up to the limit the quick check already checked. In addition the quick | 2714 // up to the limit the quick check already checked. In addition the quick |
| 2649 // check can have involved a mask and compare operation which may simplify | 2715 // check can have involved a mask and compare operation which may simplify |
| 2650 // or obviate the need for further checks at some character positions. | 2716 // or obviate the need for further checks at some character positions. |
| 2651 void TextNode::TextEmitPass(RegExpCompiler* compiler, | 2717 void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| 2652 TextEmitPassType pass, | 2718 TextEmitPassType pass, |
| 2653 bool preloaded, | 2719 bool preloaded, |
| 2654 Trace* trace, | 2720 Trace* trace, |
| 2655 bool first_element_checked, | 2721 bool first_element_checked, |
| 2656 int* checked_up_to) { | 2722 int* checked_up_to) { |
|
Lasse Reichstein
2009/02/18 15:20:44
This function is one big glob of special cases.
If
Erik Corry
2009/02/18 16:04:24
I've refactored a little more. It is now only 68
| |
| 2657 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2723 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2658 bool ascii = compiler->ascii(); | 2724 bool ascii = compiler->ascii(); |
| 2659 Label* backtrack = trace->backtrack(); | 2725 Label* backtrack = trace->backtrack(); |
| 2660 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 2726 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
| 2661 int element_count = elms_->length(); | 2727 int element_count = elms_->length(); |
| 2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 2728 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
| 2663 TextElement elm = elms_->at(i); | 2729 TextElement elm = elms_->at(i); |
| 2664 int cp_offset = trace->cp_offset() + elm.cp_offset; | 2730 int cp_offset = trace->cp_offset() + elm.cp_offset; |
| 2665 if (elm.type == TextElement::ATOM) { | 2731 if (elm.type == TextElement::ATOM) { |
| 2666 if (pass == NON_ASCII_MATCH || | 2732 if (pass != CHARACTER_CLASS_MATCH) { |
| 2667 pass == CHARACTER_MATCH || | |
| 2668 pass == CASE_CHARACTER_MATCH) { | |
| 2669 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2733 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| 2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { | 2734 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
| 2671 bool bound_checked = true; // Most ops will check their bounds. | 2735 bool bound_checked = false; |
| 2672 if (first_element_checked && i == 0 && j == 0) continue; | 2736 if (first_element_checked && i == 0 && j == 0) continue; |
| 2673 if (pass == NON_ASCII_MATCH) { | 2737 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; |
| 2674 ASSERT(ascii); | 2738 switch (pass) { |
| 2675 if (quarks[j] > String::kMaxAsciiCharCode) { | 2739 case NON_ASCII_MATCH: |
| 2676 assembler->GoTo(backtrack); | 2740 ASSERT(ascii); |
| 2677 return; | 2741 if (quarks[j] > String::kMaxAsciiCharCode) { |
| 2678 } | 2742 assembler->GoTo(backtrack); |
| 2679 } else { | 2743 return; |
| 2680 if (quick_check != NULL && | 2744 } |
| 2681 elm.cp_offset + j < quick_check->characters() && | 2745 break; |
| 2682 quick_check->positions(elm.cp_offset + j)-> | 2746 case NON_LETTER_CHARACTER_MATCH: |
| 2683 determines_perfectly) { | 2747 bound_checked = EmitAtomNonLetter(assembler, |
| 2684 continue; | 2748 quarks[j], |
| 2685 } | 2749 ascii, |
| 2686 if (pass == CHARACTER_MATCH) { | 2750 backtrack, |
| 2687 if (compiler->ignore_case()) { | 2751 cp_offset + j, |
| 2688 bound_checked = EmitAtomNonLetter( | 2752 *checked_up_to < cp_offset + j, |
| 2689 assembler, | 2753 preloaded); |
| 2690 quarks[j], | 2754 break; |
| 2755 case SIMPLE_CHARACTER_MATCH: | |
| 2756 if (!preloaded) { | |
| 2757 assembler->LoadCurrentCharacter( | |
| 2758 cp_offset + j, | |
| 2691 backtrack, | 2759 backtrack, |
| 2692 cp_offset + j, | 2760 *checked_up_to < cp_offset + j); |
| 2693 *checked_up_to < cp_offset + j, | 2761 bound_checked = true; |
| 2694 preloaded); | |
| 2695 } else { | |
| 2696 if (!preloaded) { | |
| 2697 assembler->LoadCurrentCharacter( | |
| 2698 cp_offset + j, | |
| 2699 backtrack, | |
| 2700 *checked_up_to < cp_offset + j); | |
| 2701 } | |
| 2702 assembler->CheckNotCharacter(quarks[j], backtrack); | |
| 2703 } | 2762 } |
| 2704 } else { | 2763 assembler->CheckNotCharacter(quarks[j], backtrack); |
| 2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH); | 2764 break; |
| 2706 ASSERT(compiler->ignore_case()); | 2765 case CASE_CHARACTER_MATCH: |
| 2707 bound_checked = EmitAtomLetter(assembler, | 2766 bound_checked = EmitAtomLetter(assembler, |
| 2708 compiler->ascii(), | |
| 2709 quarks[j], | 2767 quarks[j], |
| 2768 ascii, | |
| 2710 backtrack, | 2769 backtrack, |
| 2711 cp_offset + j, | 2770 cp_offset + j, |
| 2712 *checked_up_to < cp_offset + j, | 2771 *checked_up_to < cp_offset + j, |
| 2713 preloaded); | 2772 preloaded); |
| 2714 } | 2773 break; |
| 2715 if (bound_checked) { | 2774 default: |
| 2716 if (cp_offset + j > *checked_up_to) { | 2775 break; |
| 2717 *checked_up_to = cp_offset + j; | |
| 2718 } | |
| 2719 } | |
| 2720 } | 2776 } |
| 2777 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | |
| 2721 } | 2778 } |
| 2722 } | 2779 } |
| 2723 } else { | 2780 } else { |
| 2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); | 2781 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| 2725 if (first_element_checked && i == 0) continue; | |
| 2726 if (quick_check != NULL && | |
| 2727 elm.cp_offset < quick_check->characters() && | |
| 2728 quick_check->positions(elm.cp_offset)->determines_perfectly) { | |
| 2729 continue; | |
| 2730 } | |
| 2731 if (pass == CHARACTER_CLASS_MATCH) { | 2782 if (pass == CHARACTER_CLASS_MATCH) { |
| 2783 if (first_element_checked && i == 0) continue; | |
| 2784 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; | |
| 2732 RegExpCharacterClass* cc = elm.data.u_char_class; | 2785 RegExpCharacterClass* cc = elm.data.u_char_class; |
| 2733 EmitCharClass(assembler, | 2786 EmitCharClass(assembler, |
| 2734 cc, | 2787 cc, |
| 2788 ascii, | |
| 2789 backtrack, | |
| 2735 cp_offset, | 2790 cp_offset, |
| 2736 backtrack, | |
| 2737 *checked_up_to < cp_offset, | 2791 *checked_up_to < cp_offset, |
| 2738 ascii, | |
| 2739 preloaded); | 2792 preloaded); |
| 2740 if (cp_offset > *checked_up_to) { | 2793 UpdateBoundsCheck(cp_offset, checked_up_to); |
| 2741 *checked_up_to = cp_offset; | |
| 2742 } | |
| 2743 } | 2794 } |
| 2744 } | 2795 } |
| 2745 } | 2796 } |
| 2746 } | 2797 } |
| 2747 | 2798 |
| 2748 | 2799 |
| 2749 int TextNode::Length() { | 2800 int TextNode::Length() { |
| 2750 TextElement elm = elms_->last(); | 2801 TextElement elm = elms_->last(); |
| 2751 ASSERT(elm.cp_offset >= 0); | 2802 ASSERT(elm.cp_offset >= 0); |
| 2752 if (elm.type == TextElement::ATOM) { | 2803 if (elm.type == TextElement::ATOM) { |
| 2753 return elm.cp_offset + elm.data.u_atom->data().length(); | 2804 return elm.cp_offset + elm.data.u_atom->data().length(); |
| 2754 } else { | 2805 } else { |
| 2755 return elm.cp_offset + 1; | 2806 return elm.cp_offset + 1; |
| 2756 } | 2807 } |
| 2757 } | 2808 } |
| 2758 | 2809 |
| 2759 | 2810 |
| 2811 bool TextNode::SkipPass(int int_pass, bool ignore_case) { | |
| 2812 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); | |
| 2813 if (ignore_case) { | |
| 2814 return pass == SIMPLE_CHARACTER_MATCH; | |
| 2815 } else { | |
| 2816 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; | |
| 2817 } | |
| 2818 } | |
| 2819 | |
| 2820 | |
| 2760 // This generates the code to match a text node. A text node can contain | 2821 // This generates the code to match a text node. A text node can contain |
| 2761 // straight character sequences (possibly to be matched in a case-independent | 2822 // straight character sequences (possibly to be matched in a case-independent |
| 2762 // way) and character classes. For efficiency we do not do this in a single | 2823 // way) and character classes. For efficiency we do not do this in a single |
| 2763 // pass from left to right. Instead we pass over the text node several times, | 2824 // pass from left to right. Instead we pass over the text node several times, |
| 2764 // emitting code for some character positions every time. See the comment on | 2825 // emitting code for some character positions every time. See the comment on |
| 2765 // TextEmitPass for details. | 2826 // TextEmitPass for details. |
| 2766 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | 2827 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
| 2767 LimitResult limit_result = LimitVersions(compiler, trace); | 2828 LimitResult limit_result = LimitVersions(compiler, trace); |
| 2768 if (limit_result == DONE) return; | 2829 if (limit_result == DONE) return; |
| 2769 ASSERT(limit_result == CONTINUE); | 2830 ASSERT(limit_result == CONTINUE); |
| 2770 | 2831 |
| 2771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | 2832 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { |
| 2772 compiler->SetRegExpTooBig(); | 2833 compiler->SetRegExpTooBig(); |
| 2773 return; | 2834 return; |
| 2774 } | 2835 } |
| 2775 | 2836 |
| 2776 if (compiler->ascii()) { | 2837 if (compiler->ascii()) { |
| 2777 int dummy = 0; | 2838 int dummy = 0; |
| 2778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | 2839 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); |
| 2779 } | 2840 } |
| 2780 | 2841 |
| 2781 bool first_elt_done = false; | 2842 bool first_elt_done = false; |
| 2782 int bound_checked_to = trace->cp_offset() - 1; | 2843 int bound_checked_to = trace->cp_offset() - 1; |
| 2783 bound_checked_to += trace->bound_checked_up_to(); | 2844 bound_checked_to += trace->bound_checked_up_to(); |
| 2784 | 2845 |
| 2785 // If a character is preloaded into the current character register then | 2846 // If a character is preloaded into the current character register then |
| 2786 // check that now. | 2847 // check that now. |
| 2787 if (trace->characters_preloaded() == 1) { | 2848 if (trace->characters_preloaded() == 1) { |
| 2788 TextEmitPass(compiler, | 2849 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| 2789 CHARACTER_MATCH, | 2850 if (!SkipPass(pass, compiler->ignore_case())) { |
| 2790 true, | 2851 TextEmitPass(compiler, |
| 2791 trace, | 2852 static_cast<TextEmitPassType>(pass), |
| 2792 false, | 2853 true, |
| 2793 &bound_checked_to); | 2854 trace, |
| 2794 if (compiler->ignore_case()) { | 2855 false, |
| 2795 TextEmitPass(compiler, | 2856 &bound_checked_to); |
| 2796 CASE_CHARACTER_MATCH, | 2857 } |
| 2797 true, | |
| 2798 trace, | |
| 2799 false, | |
| 2800 &bound_checked_to); | |
| 2801 } | 2858 } |
| 2802 TextEmitPass(compiler, | |
| 2803 CHARACTER_CLASS_MATCH, | |
| 2804 true, | |
| 2805 trace, | |
| 2806 false, | |
| 2807 &bound_checked_to); | |
| 2808 first_elt_done = true; | 2859 first_elt_done = true; |
| 2809 } | 2860 } |
| 2810 | 2861 |
| 2811 TextEmitPass(compiler, | 2862 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| 2812 CHARACTER_MATCH, | 2863 if (!SkipPass(pass, compiler->ignore_case())) { |
| 2813 false, | 2864 TextEmitPass(compiler, |
| 2814 trace, | 2865 static_cast<TextEmitPassType>(pass), |
| 2815 first_elt_done, | 2866 false, |
| 2816 &bound_checked_to); | 2867 trace, |
| 2817 if (compiler->ignore_case()) { | 2868 first_elt_done, |
| 2818 TextEmitPass(compiler, | 2869 &bound_checked_to); |
| 2819 CASE_CHARACTER_MATCH, | 2870 } |
| 2820 false, | |
| 2821 trace, | |
| 2822 first_elt_done, | |
| 2823 &bound_checked_to); | |
| 2824 } | 2871 } |
| 2825 TextEmitPass(compiler, | |
| 2826 CHARACTER_CLASS_MATCH, | |
| 2827 false, | |
| 2828 trace, | |
| 2829 first_elt_done, | |
| 2830 &bound_checked_to); | |
| 2831 | 2872 |
| 2832 Trace successor_trace(*trace); | 2873 Trace successor_trace(*trace); |
| 2833 successor_trace.set_at_start(false); | 2874 successor_trace.set_at_start(false); |
| 2834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); | 2875 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); |
| 2835 RecursionCheck rc(compiler); | 2876 RecursionCheck rc(compiler); |
| 2836 on_success()->Emit(compiler, &successor_trace); | 2877 on_success()->Emit(compiler, &successor_trace); |
| 2837 } | 2878 } |
| 2838 | 2879 |
| 2839 | 2880 |
| 2840 void Trace::InvalidateCurrentCharacter() { | 2881 void Trace::InvalidateCurrentCharacter() { |
| (...skipping 2043 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 4884 EmbeddedVector<byte, 1024> codes; | 4925 EmbeddedVector<byte, 1024> codes; |
| 4885 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4926 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 4886 return compiler.Assemble(¯o_assembler, | 4927 return compiler.Assemble(¯o_assembler, |
| 4887 node, | 4928 node, |
| 4888 data->capture_count, | 4929 data->capture_count, |
| 4889 pattern); | 4930 pattern); |
| 4890 } | 4931 } |
| 4891 | 4932 |
| 4892 | 4933 |
| 4893 }} // namespace v8::internal | 4934 }} // namespace v8::internal |
| OLD | NEW |