| OLD | NEW |
| 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 1721 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1732 trace->backtrack()); | 1732 trace->backtrack()); |
| 1733 break; | 1733 break; |
| 1734 } | 1734 } |
| 1735 } | 1735 } |
| 1736 | 1736 |
| 1737 | 1737 |
| 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
| 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
| 1740 | 1740 |
| 1741 | 1741 |
| 1742 // Returns the number of characters in the equivalence class, omitting those |
| 1743 // that cannot occur in the source string because it is ASCII. |
| 1744 static int GetCaseIndependentLetters(uc16 character, |
| 1745 bool ascii_subject, |
| 1746 unibrow::uchar* letters) { |
| 1747 int length = uncanonicalize.get(character, '\0', letters); |
| 1748 // Unibrow returns 0 or 1 for characters where case independependence is |
| 1749 // trivial. |
| 1750 if (length == 0) { |
| 1751 letters[0] = character; |
| 1752 length = 1; |
| 1753 } |
| 1754 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
| 1755 return length; |
| 1756 } |
| 1757 // The standard requires that non-ASCII characters cannot have ASCII |
| 1758 // character codes in their equivalence class. |
| 1759 return 0; |
| 1760 } |
| 1761 |
| 1762 |
| 1763 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, |
| 1764 uc16 c, |
| 1765 Label* on_failure, |
| 1766 int cp_offset, |
| 1767 bool check, |
| 1768 bool preloaded) { |
| 1769 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 1770 bool bound_checked = false; |
| 1771 if (!preloaded) { |
| 1772 assembler->LoadCurrentCharacter( |
| 1773 cp_offset, |
| 1774 on_failure, |
| 1775 check); |
| 1776 bound_checked = true; |
| 1777 } |
| 1778 assembler->CheckNotCharacter(c, on_failure); |
| 1779 return bound_checked; |
| 1780 } |
| 1781 |
| 1782 |
| 1742 // Only emits non-letters (things that don't have case). Only used for case | 1783 // Only emits non-letters (things that don't have case). Only used for case |
| 1743 // independent matches. | 1784 // independent matches. |
| 1744 static inline bool EmitAtomNonLetter( | 1785 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, |
| 1745 RegExpMacroAssembler* macro_assembler, | 1786 uc16 c, |
| 1746 uc16 c, | 1787 Label* on_failure, |
| 1747 Label* on_failure, | 1788 int cp_offset, |
| 1748 int cp_offset, | 1789 bool check, |
| 1749 bool check, | 1790 bool preloaded) { |
| 1750 bool preloaded) { | 1791 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1792 bool ascii = compiler->ascii(); |
| 1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1793 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1752 int length = uncanonicalize.get(c, '\0', chars); | 1794 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1795 if (length < 1) { |
| 1796 // This can't match. Must be an ASCII subject and a non-ASCII character. |
| 1797 // We do not need to do anything since the ASCII pass already handled this. |
| 1798 return false; // Bounds not checked. |
| 1799 } |
| 1753 bool checked = false; | 1800 bool checked = false; |
| 1754 if (length <= 1) { | 1801 // We handle the length > 1 case in a later pass. |
| 1802 if (length == 1) { |
| 1803 if (ascii && c > String::kMaxAsciiCharCodeU) { |
| 1804 // Can't match - see above. |
| 1805 return false; // Bounds not checked. |
| 1806 } |
| 1755 if (!preloaded) { | 1807 if (!preloaded) { |
| 1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1808 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1757 checked = check; | 1809 checked = check; |
| 1758 } | 1810 } |
| 1759 macro_assembler->CheckNotCharacter(c, on_failure); | 1811 macro_assembler->CheckNotCharacter(c, on_failure); |
| 1760 } | 1812 } |
| 1761 return checked; | 1813 return checked; |
| 1762 } | 1814 } |
| 1763 | 1815 |
| 1764 | 1816 |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1794 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, | 1846 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, |
| 1795 diff, | 1847 diff, |
| 1796 mask, | 1848 mask, |
| 1797 on_failure); | 1849 on_failure); |
| 1798 return true; | 1850 return true; |
| 1799 } | 1851 } |
| 1800 return false; | 1852 return false; |
| 1801 } | 1853 } |
| 1802 | 1854 |
| 1803 | 1855 |
| 1856 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, |
| 1857 uc16 c, |
| 1858 Label* on_failure, |
| 1859 int cp_offset, |
| 1860 bool check, |
| 1861 bool preloaded); |
| 1862 |
| 1804 // Only emits letters (things that have case). Only used for case independent | 1863 // Only emits letters (things that have case). Only used for case independent |
| 1805 // matches. | 1864 // matches. |
| 1806 static inline bool EmitAtomLetter( | 1865 static inline bool EmitAtomLetter(RegExpCompiler* compiler, |
| 1807 RegExpMacroAssembler* macro_assembler, | 1866 uc16 c, |
| 1808 bool ascii, | 1867 Label* on_failure, |
| 1809 uc16 c, | 1868 int cp_offset, |
| 1810 Label* on_failure, | 1869 bool check, |
| 1811 int cp_offset, | 1870 bool preloaded) { |
| 1812 bool check, | 1871 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1813 bool preloaded) { | 1872 bool ascii = compiler->ascii(); |
| 1814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1873 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1815 int length = uncanonicalize.get(c, '\0', chars); | 1874 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1816 if (length <= 1) return false; | 1875 if (length <= 1) return false; |
| 1817 // We may not need to check against the end of the input string | 1876 // We may not need to check against the end of the input string |
| 1818 // if this character lies before a character that matched. | 1877 // if this character lies before a character that matched. |
| 1819 if (!preloaded) { | 1878 if (!preloaded) { |
| 1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1879 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1821 } | 1880 } |
| 1822 Label ok; | 1881 Label ok; |
| 1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1882 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| 1824 switch (length) { | 1883 switch (length) { |
| 1825 case 2: { | 1884 case 2: { |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1847 default: | 1906 default: |
| 1848 UNREACHABLE(); | 1907 UNREACHABLE(); |
| 1849 break; | 1908 break; |
| 1850 } | 1909 } |
| 1851 return true; | 1910 return true; |
| 1852 } | 1911 } |
| 1853 | 1912 |
| 1854 | 1913 |
| 1855 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 1914 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| 1856 RegExpCharacterClass* cc, | 1915 RegExpCharacterClass* cc, |
| 1916 bool ascii, |
| 1917 Label* on_failure, |
| 1857 int cp_offset, | 1918 int cp_offset, |
| 1858 Label* on_failure, | |
| 1859 bool check_offset, | 1919 bool check_offset, |
| 1860 bool ascii, | |
| 1861 bool preloaded) { | 1920 bool preloaded) { |
| 1862 if (cc->is_standard() && | 1921 if (cc->is_standard() && |
| 1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), | 1922 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
| 1864 cp_offset, | 1923 cp_offset, |
| 1865 check_offset, | 1924 check_offset, |
| 1866 on_failure)) { | 1925 on_failure)) { |
| 1867 return; | 1926 return; |
| 1868 } | 1927 } |
| 1869 | 1928 |
| 1870 ZoneList<CharacterRange>* ranges = cc->ranges(); | 1929 ZoneList<CharacterRange>* ranges = cc->ranges(); |
| (...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2231 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2290 for (int i = 0; i < characters && i < quarks.length(); i++) { |
| 2232 QuickCheckDetails::Position* pos = | 2291 QuickCheckDetails::Position* pos = |
| 2233 details->positions(characters_filled_in); | 2292 details->positions(characters_filled_in); |
| 2234 uc16 c = quarks[i]; | 2293 uc16 c = quarks[i]; |
| 2235 if (c > char_mask) { | 2294 if (c > char_mask) { |
| 2236 // If we expect a non-ASCII character from an ASCII string, | 2295 // If we expect a non-ASCII character from an ASCII string, |
| 2237 // there is no way we can match. Not even case independent | 2296 // there is no way we can match. Not even case independent |
| 2238 // matching can turn an ASCII character into non-ASCII or | 2297 // matching can turn an ASCII character into non-ASCII or |
| 2239 // vice versa. | 2298 // vice versa. |
| 2240 details->set_cannot_match(); | 2299 details->set_cannot_match(); |
| 2300 pos->determines_perfectly = false; |
| 2241 return; | 2301 return; |
| 2242 } | 2302 } |
| 2243 if (compiler->ignore_case()) { | 2303 if (compiler->ignore_case()) { |
| 2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2304 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 2245 int length = uncanonicalize.get(c, '\0', chars); | 2305 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
| 2246 if (length < 2) { | 2306 ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
| 2307 if (length == 1) { |
| 2247 // This letter has no case equivalents, so it's nice and simple | 2308 // This letter has no case equivalents, so it's nice and simple |
| 2248 // and the mask-compare will determine definitely whether we have | 2309 // and the mask-compare will determine definitely whether we have |
| 2249 // a match at this character position. | 2310 // a match at this character position. |
| 2250 pos->mask = char_mask; | 2311 pos->mask = char_mask; |
| 2251 pos->value = c; | 2312 pos->value = c; |
| 2252 pos->determines_perfectly = true; | 2313 pos->determines_perfectly = true; |
| 2253 } else { | 2314 } else { |
| 2254 uint32_t common_bits = char_mask; | 2315 uint32_t common_bits = char_mask; |
| 2255 uint32_t bits = chars[0]; | 2316 uint32_t bits = chars[0]; |
| 2256 for (int j = 1; j < length; j++) { | 2317 for (int j = 1; j < length; j++) { |
| (...skipping 24 matching lines...) Expand all Loading... |
| 2281 ASSERT(characters_filled_in <= details->characters()); | 2342 ASSERT(characters_filled_in <= details->characters()); |
| 2282 if (characters_filled_in == details->characters()) { | 2343 if (characters_filled_in == details->characters()) { |
| 2283 return; | 2344 return; |
| 2284 } | 2345 } |
| 2285 } | 2346 } |
| 2286 } else { | 2347 } else { |
| 2287 QuickCheckDetails::Position* pos = | 2348 QuickCheckDetails::Position* pos = |
| 2288 details->positions(characters_filled_in); | 2349 details->positions(characters_filled_in); |
| 2289 RegExpCharacterClass* tree = elm.data.u_char_class; | 2350 RegExpCharacterClass* tree = elm.data.u_char_class; |
| 2290 ZoneList<CharacterRange>* ranges = tree->ranges(); | 2351 ZoneList<CharacterRange>* ranges = tree->ranges(); |
| 2291 CharacterRange range = ranges->at(0); | |
| 2292 if (tree->is_negated()) { | 2352 if (tree->is_negated()) { |
| 2293 // A quick check uses multi-character mask and compare. There is no | 2353 // A quick check uses multi-character mask and compare. There is no |
| 2294 // useful way to incorporate a negative char class into this scheme | 2354 // useful way to incorporate a negative char class into this scheme |
| 2295 // so we just conservatively create a mask and value that will always | 2355 // so we just conservatively create a mask and value that will always |
| 2296 // succeed. | 2356 // succeed. |
| 2297 pos->mask = 0; | 2357 pos->mask = 0; |
| 2298 pos->value = 0; | 2358 pos->value = 0; |
| 2299 } else { | 2359 } else { |
| 2300 uint32_t differing_bits = (range.from() ^ range.to()); | 2360 int first_range = 0; |
| 2361 while (ranges->at(first_range).from() > char_mask) { |
| 2362 first_range++; |
| 2363 if (first_range == ranges->length()) { |
| 2364 details->set_cannot_match(); |
| 2365 pos->determines_perfectly = false; |
| 2366 return; |
| 2367 } |
| 2368 } |
| 2369 CharacterRange range = ranges->at(first_range); |
| 2370 uc16 from = range.from(); |
| 2371 uc16 to = range.to(); |
| 2372 if (to > char_mask) { |
| 2373 to = char_mask; |
| 2374 } |
| 2375 uint32_t differing_bits = (from ^ to); |
| 2301 // A mask and compare is only perfect if the differing bits form a | 2376 // A mask and compare is only perfect if the differing bits form a |
| 2302 // number like 00011111 with one single block of trailing 1s. | 2377 // number like 00011111 with one single block of trailing 1s. |
| 2303 if ((differing_bits & (differing_bits + 1)) == 0) { | 2378 if ((differing_bits & (differing_bits + 1)) == 0) { |
| 2304 pos->determines_perfectly = true; | 2379 pos->determines_perfectly = true; |
| 2305 } | 2380 } |
| 2306 uint32_t common_bits = ~SmearBitsRight(differing_bits); | 2381 uint32_t common_bits = ~SmearBitsRight(differing_bits); |
| 2307 uint32_t bits = (range.from() & common_bits); | 2382 uint32_t bits = (from & common_bits); |
| 2308 for (int i = 1; i < ranges->length(); i++) { | 2383 for (int i = first_range + 1; i < ranges->length(); i++) { |
| 2384 CharacterRange range = ranges->at(i); |
| 2385 uc16 from = range.from(); |
| 2386 uc16 to = range.to(); |
| 2387 if (from > char_mask) continue; |
| 2388 if (to > char_mask) to = char_mask; |
| 2309 // Here we are combining more ranges into the mask and compare | 2389 // Here we are combining more ranges into the mask and compare |
| 2310 // value. With each new range the mask becomes more sparse and | 2390 // value. With each new range the mask becomes more sparse and |
| 2311 // so the chances of a false positive rise. A character class | 2391 // so the chances of a false positive rise. A character class |
| 2312 // with multiple ranges is assumed never to be equivalent to a | 2392 // with multiple ranges is assumed never to be equivalent to a |
| 2313 // mask and compare operation. | 2393 // mask and compare operation. |
| 2314 pos->determines_perfectly = false; | 2394 pos->determines_perfectly = false; |
| 2315 CharacterRange range = ranges->at(i); | 2395 uint32_t new_common_bits = (from ^ to); |
| 2316 uint32_t new_common_bits = (range.from() ^ range.to()); | |
| 2317 new_common_bits = ~SmearBitsRight(new_common_bits); | 2396 new_common_bits = ~SmearBitsRight(new_common_bits); |
| 2318 common_bits &= new_common_bits; | 2397 common_bits &= new_common_bits; |
| 2319 bits &= new_common_bits; | 2398 bits &= new_common_bits; |
| 2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits; | 2399 uint32_t differing_bits = (from & common_bits) ^ bits; |
| 2321 common_bits ^= differing_bits; | 2400 common_bits ^= differing_bits; |
| 2322 bits &= common_bits; | 2401 bits &= common_bits; |
| 2323 } | 2402 } |
| 2324 pos->mask = common_bits; | 2403 pos->mask = common_bits; |
| 2325 pos->value = bits; | 2404 pos->value = bits; |
| 2326 } | 2405 } |
| 2327 characters_filled_in++; | 2406 characters_filled_in++; |
| 2328 ASSERT(characters_filled_in <= details->characters()); | 2407 ASSERT(characters_filled_in <= details->characters()); |
| 2329 if (characters_filled_in == details->characters()) { | 2408 if (characters_filled_in == details->characters()) { |
| 2330 return; | 2409 return; |
| (...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2612 return; | 2691 return; |
| 2613 case AT_NON_BOUNDARY: | 2692 case AT_NON_BOUNDARY: |
| 2614 case AT_BOUNDARY: | 2693 case AT_BOUNDARY: |
| 2615 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2694 EmitBoundaryCheck(type_, compiler, on_success(), trace); |
| 2616 return; | 2695 return; |
| 2617 } | 2696 } |
| 2618 on_success()->Emit(compiler, trace); | 2697 on_success()->Emit(compiler, trace); |
| 2619 } | 2698 } |
| 2620 | 2699 |
| 2621 | 2700 |
| 2701 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
| 2702 if (quick_check == NULL) return false; |
| 2703 if (offset >= quick_check->characters()) return false; |
| 2704 return quick_check->positions(offset)->determines_perfectly; |
| 2705 } |
| 2706 |
| 2707 |
| 2708 static void UpdateBoundsCheck(int index, int* checked_up_to) { |
| 2709 if (index > *checked_up_to) { |
| 2710 *checked_up_to = index; |
| 2711 } |
| 2712 } |
| 2713 |
| 2714 |
| 2622 // We call this repeatedly to generate code for each pass over the text node. | 2715 // We call this repeatedly to generate code for each pass over the text node. |
| 2623 // The passes are in increasing order of difficulty because we hope one | 2716 // The passes are in increasing order of difficulty because we hope one |
| 2624 // of the first passes will fail in which case we are saved the work of the | 2717 // of the first passes will fail in which case we are saved the work of the |
| 2625 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ | 2718 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ |
| 2626 // we will check the '%' in the first pass, the case independent 'a' in the | 2719 // we will check the '%' in the first pass, the case independent 'a' in the |
| 2627 // second pass and the character class in the last pass. | 2720 // second pass and the character class in the last pass. |
| 2628 // | 2721 // |
| 2629 // The passes are done from right to left, so for example to test for /bar/ | 2722 // The passes are done from right to left, so for example to test for /bar/ |
| 2630 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 | 2723 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 |
| 2631 // and then a 'b' with offset 0. This means we can avoid the end-of-input | 2724 // and then a 'b' with offset 0. This means we can avoid the end-of-input |
| (...skipping 24 matching lines...) Expand all Loading... |
| 2656 int* checked_up_to) { | 2749 int* checked_up_to) { |
| 2657 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2750 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2658 bool ascii = compiler->ascii(); | 2751 bool ascii = compiler->ascii(); |
| 2659 Label* backtrack = trace->backtrack(); | 2752 Label* backtrack = trace->backtrack(); |
| 2660 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 2753 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
| 2661 int element_count = elms_->length(); | 2754 int element_count = elms_->length(); |
| 2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 2755 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
| 2663 TextElement elm = elms_->at(i); | 2756 TextElement elm = elms_->at(i); |
| 2664 int cp_offset = trace->cp_offset() + elm.cp_offset; | 2757 int cp_offset = trace->cp_offset() + elm.cp_offset; |
| 2665 if (elm.type == TextElement::ATOM) { | 2758 if (elm.type == TextElement::ATOM) { |
| 2666 if (pass == NON_ASCII_MATCH || | 2759 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| 2667 pass == CHARACTER_MATCH || | 2760 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
| 2668 pass == CASE_CHARACTER_MATCH) { | 2761 if (first_element_checked && i == 0 && j == 0) continue; |
| 2669 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2762 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; |
| 2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { | 2763 EmitCharacterFunction* emit_function = NULL; |
| 2671 bool bound_checked = true; // Most ops will check their bounds. | 2764 switch (pass) { |
| 2672 if (first_element_checked && i == 0 && j == 0) continue; | 2765 case NON_ASCII_MATCH: |
| 2673 if (pass == NON_ASCII_MATCH) { | |
| 2674 ASSERT(ascii); | 2766 ASSERT(ascii); |
| 2675 if (quarks[j] > String::kMaxAsciiCharCode) { | 2767 if (quarks[j] > String::kMaxAsciiCharCode) { |
| 2676 assembler->GoTo(backtrack); | 2768 assembler->GoTo(backtrack); |
| 2677 return; | 2769 return; |
| 2678 } | 2770 } |
| 2679 } else { | 2771 break; |
| 2680 if (quick_check != NULL && | 2772 case NON_LETTER_CHARACTER_MATCH: |
| 2681 elm.cp_offset + j < quick_check->characters() && | 2773 emit_function = &EmitAtomNonLetter; |
| 2682 quick_check->positions(elm.cp_offset + j)-> | 2774 break; |
| 2683 determines_perfectly) { | 2775 case SIMPLE_CHARACTER_MATCH: |
| 2684 continue; | 2776 emit_function = &EmitSimpleCharacter; |
| 2685 } | 2777 break; |
| 2686 if (pass == CHARACTER_MATCH) { | 2778 case CASE_CHARACTER_MATCH: |
| 2687 if (compiler->ignore_case()) { | 2779 emit_function = &EmitAtomLetter; |
| 2688 bound_checked = EmitAtomNonLetter( | 2780 break; |
| 2689 assembler, | 2781 default: |
| 2690 quarks[j], | 2782 break; |
| 2691 backtrack, | 2783 } |
| 2692 cp_offset + j, | 2784 if (emit_function != NULL) { |
| 2693 *checked_up_to < cp_offset + j, | 2785 bool bound_checked = emit_function(compiler, |
| 2694 preloaded); | |
| 2695 } else { | |
| 2696 if (!preloaded) { | |
| 2697 assembler->LoadCurrentCharacter( | |
| 2698 cp_offset + j, | |
| 2699 backtrack, | |
| 2700 *checked_up_to < cp_offset + j); | |
| 2701 } | |
| 2702 assembler->CheckNotCharacter(quarks[j], backtrack); | |
| 2703 } | |
| 2704 } else { | |
| 2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH); | |
| 2706 ASSERT(compiler->ignore_case()); | |
| 2707 bound_checked = EmitAtomLetter(assembler, | |
| 2708 compiler->ascii(), | |
| 2709 quarks[j], | 2786 quarks[j], |
| 2710 backtrack, | 2787 backtrack, |
| 2711 cp_offset + j, | 2788 cp_offset + j, |
| 2712 *checked_up_to < cp_offset + j, | 2789 *checked_up_to < cp_offset + j, |
| 2713 preloaded); | 2790 preloaded); |
| 2714 } | 2791 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
| 2715 if (bound_checked) { | |
| 2716 if (cp_offset + j > *checked_up_to) { | |
| 2717 *checked_up_to = cp_offset + j; | |
| 2718 } | |
| 2719 } | |
| 2720 } | |
| 2721 } | 2792 } |
| 2722 } | 2793 } |
| 2723 } else { | 2794 } else { |
| 2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); | 2795 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| 2725 if (first_element_checked && i == 0) continue; | |
| 2726 if (quick_check != NULL && | |
| 2727 elm.cp_offset < quick_check->characters() && | |
| 2728 quick_check->positions(elm.cp_offset)->determines_perfectly) { | |
| 2729 continue; | |
| 2730 } | |
| 2731 if (pass == CHARACTER_CLASS_MATCH) { | 2796 if (pass == CHARACTER_CLASS_MATCH) { |
| 2797 if (first_element_checked && i == 0) continue; |
| 2798 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; |
| 2732 RegExpCharacterClass* cc = elm.data.u_char_class; | 2799 RegExpCharacterClass* cc = elm.data.u_char_class; |
| 2733 EmitCharClass(assembler, | 2800 EmitCharClass(assembler, |
| 2734 cc, | 2801 cc, |
| 2802 ascii, |
| 2803 backtrack, |
| 2735 cp_offset, | 2804 cp_offset, |
| 2736 backtrack, | |
| 2737 *checked_up_to < cp_offset, | 2805 *checked_up_to < cp_offset, |
| 2738 ascii, | |
| 2739 preloaded); | 2806 preloaded); |
| 2740 if (cp_offset > *checked_up_to) { | 2807 UpdateBoundsCheck(cp_offset, checked_up_to); |
| 2741 *checked_up_to = cp_offset; | |
| 2742 } | |
| 2743 } | 2808 } |
| 2744 } | 2809 } |
| 2745 } | 2810 } |
| 2746 } | 2811 } |
| 2747 | 2812 |
| 2748 | 2813 |
| 2749 int TextNode::Length() { | 2814 int TextNode::Length() { |
| 2750 TextElement elm = elms_->last(); | 2815 TextElement elm = elms_->last(); |
| 2751 ASSERT(elm.cp_offset >= 0); | 2816 ASSERT(elm.cp_offset >= 0); |
| 2752 if (elm.type == TextElement::ATOM) { | 2817 if (elm.type == TextElement::ATOM) { |
| 2753 return elm.cp_offset + elm.data.u_atom->data().length(); | 2818 return elm.cp_offset + elm.data.u_atom->data().length(); |
| 2754 } else { | 2819 } else { |
| 2755 return elm.cp_offset + 1; | 2820 return elm.cp_offset + 1; |
| 2756 } | 2821 } |
| 2757 } | 2822 } |
| 2758 | 2823 |
| 2759 | 2824 |
| 2825 bool TextNode::SkipPass(int int_pass, bool ignore_case) { |
| 2826 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); |
| 2827 if (ignore_case) { |
| 2828 return pass == SIMPLE_CHARACTER_MATCH; |
| 2829 } else { |
| 2830 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; |
| 2831 } |
| 2832 } |
| 2833 |
| 2834 |
| 2760 // This generates the code to match a text node. A text node can contain | 2835 // This generates the code to match a text node. A text node can contain |
| 2761 // straight character sequences (possibly to be matched in a case-independent | 2836 // straight character sequences (possibly to be matched in a case-independent |
| 2762 // way) and character classes. For efficiency we do not do this in a single | 2837 // way) and character classes. For efficiency we do not do this in a single |
| 2763 // pass from left to right. Instead we pass over the text node several times, | 2838 // pass from left to right. Instead we pass over the text node several times, |
| 2764 // emitting code for some character positions every time. See the comment on | 2839 // emitting code for some character positions every time. See the comment on |
| 2765 // TextEmitPass for details. | 2840 // TextEmitPass for details. |
| 2766 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | 2841 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
| 2767 LimitResult limit_result = LimitVersions(compiler, trace); | 2842 LimitResult limit_result = LimitVersions(compiler, trace); |
| 2768 if (limit_result == DONE) return; | 2843 if (limit_result == DONE) return; |
| 2769 ASSERT(limit_result == CONTINUE); | 2844 ASSERT(limit_result == CONTINUE); |
| 2770 | 2845 |
| 2771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | 2846 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { |
| 2772 compiler->SetRegExpTooBig(); | 2847 compiler->SetRegExpTooBig(); |
| 2773 return; | 2848 return; |
| 2774 } | 2849 } |
| 2775 | 2850 |
| 2776 if (compiler->ascii()) { | 2851 if (compiler->ascii()) { |
| 2777 int dummy = 0; | 2852 int dummy = 0; |
| 2778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | 2853 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); |
| 2779 } | 2854 } |
| 2780 | 2855 |
| 2781 bool first_elt_done = false; | 2856 bool first_elt_done = false; |
| 2782 int bound_checked_to = trace->cp_offset() - 1; | 2857 int bound_checked_to = trace->cp_offset() - 1; |
| 2783 bound_checked_to += trace->bound_checked_up_to(); | 2858 bound_checked_to += trace->bound_checked_up_to(); |
| 2784 | 2859 |
| 2785 // If a character is preloaded into the current character register then | 2860 // If a character is preloaded into the current character register then |
| 2786 // check that now. | 2861 // check that now. |
| 2787 if (trace->characters_preloaded() == 1) { | 2862 if (trace->characters_preloaded() == 1) { |
| 2788 TextEmitPass(compiler, | 2863 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| 2789 CHARACTER_MATCH, | 2864 if (!SkipPass(pass, compiler->ignore_case())) { |
| 2790 true, | 2865 TextEmitPass(compiler, |
| 2791 trace, | 2866 static_cast<TextEmitPassType>(pass), |
| 2792 false, | 2867 true, |
| 2793 &bound_checked_to); | 2868 trace, |
| 2794 if (compiler->ignore_case()) { | 2869 false, |
| 2795 TextEmitPass(compiler, | 2870 &bound_checked_to); |
| 2796 CASE_CHARACTER_MATCH, | 2871 } |
| 2797 true, | |
| 2798 trace, | |
| 2799 false, | |
| 2800 &bound_checked_to); | |
| 2801 } | 2872 } |
| 2802 TextEmitPass(compiler, | |
| 2803 CHARACTER_CLASS_MATCH, | |
| 2804 true, | |
| 2805 trace, | |
| 2806 false, | |
| 2807 &bound_checked_to); | |
| 2808 first_elt_done = true; | 2873 first_elt_done = true; |
| 2809 } | 2874 } |
| 2810 | 2875 |
| 2811 TextEmitPass(compiler, | 2876 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| 2812 CHARACTER_MATCH, | 2877 if (!SkipPass(pass, compiler->ignore_case())) { |
| 2813 false, | 2878 TextEmitPass(compiler, |
| 2814 trace, | 2879 static_cast<TextEmitPassType>(pass), |
| 2815 first_elt_done, | 2880 false, |
| 2816 &bound_checked_to); | 2881 trace, |
| 2817 if (compiler->ignore_case()) { | 2882 first_elt_done, |
| 2818 TextEmitPass(compiler, | 2883 &bound_checked_to); |
| 2819 CASE_CHARACTER_MATCH, | 2884 } |
| 2820 false, | |
| 2821 trace, | |
| 2822 first_elt_done, | |
| 2823 &bound_checked_to); | |
| 2824 } | 2885 } |
| 2825 TextEmitPass(compiler, | |
| 2826 CHARACTER_CLASS_MATCH, | |
| 2827 false, | |
| 2828 trace, | |
| 2829 first_elt_done, | |
| 2830 &bound_checked_to); | |
| 2831 | 2886 |
| 2832 Trace successor_trace(*trace); | 2887 Trace successor_trace(*trace); |
| 2833 successor_trace.set_at_start(false); | 2888 successor_trace.set_at_start(false); |
| 2834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); | 2889 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); |
| 2835 RecursionCheck rc(compiler); | 2890 RecursionCheck rc(compiler); |
| 2836 on_success()->Emit(compiler, &successor_trace); | 2891 on_success()->Emit(compiler, &successor_trace); |
| 2837 } | 2892 } |
| 2838 | 2893 |
| 2839 | 2894 |
| 2840 void Trace::InvalidateCurrentCharacter() { | 2895 void Trace::InvalidateCurrentCharacter() { |
| (...skipping 2043 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4884 EmbeddedVector<byte, 1024> codes; | 4939 EmbeddedVector<byte, 1024> codes; |
| 4885 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4940 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 4886 return compiler.Assemble(¯o_assembler, | 4941 return compiler.Assemble(¯o_assembler, |
| 4887 node, | 4942 node, |
| 4888 data->capture_count, | 4943 data->capture_count, |
| 4889 pattern); | 4944 pattern); |
| 4890 } | 4945 } |
| 4891 | 4946 |
| 4892 | 4947 |
| 4893 }} // namespace v8::internal | 4948 }} // namespace v8::internal |
| OLD | NEW |