Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: src/jsregexp.cc

Issue 21450: Irregexp:... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1721 matching lines...) Expand 10 before | Expand all | Expand 10 after
1732 trace->backtrack()); 1732 trace->backtrack());
1733 break; 1733 break;
1734 } 1734 }
1735 } 1735 }
1736 1736
1737 1737
1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; 1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; 1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1740 1740
1741 1741
1742 // Returns the number of characters in the equivalence class, omitting those
1743 // that cannot occur in the source string because it is ASCII.
1744 static int GetCaseIndependentLetters(uc16 character,
1745 bool ascii_subject,
1746 unibrow::uchar* letters) {
1747 int length = uncanonicalize.get(character, '\0', letters);
1748 // Unibrow returns 0 or 1 for characters where case independependence is
1749 // trivial.
1750 if (length == 0) {
1751 letters[0] = character;
1752 length = 1;
1753 }
1754 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1755 return length;
1756 }
1757 // The standard requires that non-ASCII characters cannot have ASCII
1758 // character codes in their equivalence class.
1759 return 0;
1760 }
1761
1762
1742 // Only emits non-letters (things that don't have case). Only used for case 1763 // Only emits non-letters (things that don't have case). Only used for case
1743 // independent matches. 1764 // independent matches.
1744 static inline bool EmitAtomNonLetter( 1765 static inline bool EmitAtomNonLetter(
1745 RegExpMacroAssembler* macro_assembler, 1766 RegExpMacroAssembler* macro_assembler,
1746 uc16 c, 1767 uc16 c,
1768 bool ascii,
1747 Label* on_failure, 1769 Label* on_failure,
1748 int cp_offset, 1770 int cp_offset,
1749 bool check, 1771 bool check,
1750 bool preloaded) { 1772 bool preloaded) {
1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1773 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1752 int length = uncanonicalize.get(c, '\0', chars); 1774 int length = GetCaseIndependentLetters(c, ascii, chars);
1775 if (length < 1) {
1776 // This can't match. Must be an ASCII subject and a non-ASCII character.
1777 // We do not need to do anything since the ASCII pass already handled this.
1778 return false; // Bounds not checked.
1779 }
1753 bool checked = false; 1780 bool checked = false;
1754 if (length <= 1) { 1781 // We handle the length > 1 case in a later pass.
1782 if (length == 1) {
1783 if (ascii && c > String::kMaxAsciiCharCodeU) {
1784 // Can't match - see above.
1785 return false; // Bounds not checked.
1786 }
1755 if (!preloaded) { 1787 if (!preloaded) {
1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1788 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1757 checked = check; 1789 checked = check;
1758 } 1790 }
1759 macro_assembler->CheckNotCharacter(c, on_failure); 1791 macro_assembler->CheckNotCharacter(c, on_failure);
1760 } 1792 }
1761 return checked; 1793 return checked;
1762 } 1794 }
1763 1795
1764 1796
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
1798 return true; 1830 return true;
1799 } 1831 }
1800 return false; 1832 return false;
1801 } 1833 }
1802 1834
1803 1835
1804 // Only emits letters (things that have case). Only used for case independent 1836 // Only emits letters (things that have case). Only used for case independent
1805 // matches. 1837 // matches.
1806 static inline bool EmitAtomLetter( 1838 static inline bool EmitAtomLetter(
1807 RegExpMacroAssembler* macro_assembler, 1839 RegExpMacroAssembler* macro_assembler,
1840 uc16 c,
1808 bool ascii, 1841 bool ascii,
1809 uc16 c,
1810 Label* on_failure, 1842 Label* on_failure,
1811 int cp_offset, 1843 int cp_offset,
1812 bool check, 1844 bool check,
1813 bool preloaded) { 1845 bool preloaded) {
1814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1846 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1815 int length = uncanonicalize.get(c, '\0', chars); 1847 int length = GetCaseIndependentLetters(c, ascii, chars);
1816 if (length <= 1) return false; 1848 if (length <= 1) return false;
1817 // We may not need to check against the end of the input string 1849 // We may not need to check against the end of the input string
1818 // if this character lies before a character that matched. 1850 // if this character lies before a character that matched.
1819 if (!preloaded) { 1851 if (!preloaded) {
1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1852 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1821 } 1853 }
1822 Label ok; 1854 Label ok;
1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1855 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1824 switch (length) { 1856 switch (length) {
1825 case 2: { 1857 case 2: {
(...skipping 21 matching lines...) Expand all
1847 default: 1879 default:
1848 UNREACHABLE(); 1880 UNREACHABLE();
1849 break; 1881 break;
1850 } 1882 }
1851 return true; 1883 return true;
1852 } 1884 }
1853 1885
1854 1886
1855 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1887 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1856 RegExpCharacterClass* cc, 1888 RegExpCharacterClass* cc,
1889 bool ascii,
1890 Label* on_failure,
1857 int cp_offset, 1891 int cp_offset,
1858 Label* on_failure,
1859 bool check_offset, 1892 bool check_offset,
1860 bool ascii,
1861 bool preloaded) { 1893 bool preloaded) {
1862 if (cc->is_standard() && 1894 if (cc->is_standard() &&
1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1895 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1864 cp_offset, 1896 cp_offset,
1865 check_offset, 1897 check_offset,
1866 on_failure)) { 1898 on_failure)) {
1867 return; 1899 return;
1868 } 1900 }
1869 1901
1870 ZoneList<CharacterRange>* ranges = cc->ranges(); 1902 ZoneList<CharacterRange>* ranges = cc->ranges();
(...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after
2231 for (int i = 0; i < characters && i < quarks.length(); i++) { 2263 for (int i = 0; i < characters && i < quarks.length(); i++) {
2232 QuickCheckDetails::Position* pos = 2264 QuickCheckDetails::Position* pos =
2233 details->positions(characters_filled_in); 2265 details->positions(characters_filled_in);
2234 uc16 c = quarks[i]; 2266 uc16 c = quarks[i];
2235 if (c > char_mask) { 2267 if (c > char_mask) {
2236 // If we expect a non-ASCII character from an ASCII string, 2268 // If we expect a non-ASCII character from an ASCII string,
2237 // there is no way we can match. Not even case independent 2269 // there is no way we can match. Not even case independent
2238 // matching can turn an ASCII character into non-ASCII or 2270 // matching can turn an ASCII character into non-ASCII or
2239 // vice versa. 2271 // vice versa.
2240 details->set_cannot_match(); 2272 details->set_cannot_match();
2273 pos->determines_perfectly = false;
Lasse Reichstein 2009/02/18 15:20:44 Should this be set to false? If the remaining case
Erik Corry 2009/02/18 16:04:24 Determines perfectly indicates that this particula
2241 return; 2274 return;
2242 } 2275 }
2243 if (compiler->ignore_case()) { 2276 if (compiler->ignore_case()) {
2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2277 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2245 int length = uncanonicalize.get(c, '\0', chars); 2278 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
2246 if (length < 2) { 2279 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2280 if (length == 1) {
2247 // This letter has no case equivalents, so it's nice and simple 2281 // This letter has no case equivalents, so it's nice and simple
2248 // and the mask-compare will determine definitely whether we have 2282 // and the mask-compare will determine definitely whether we have
2249 // a match at this character position. 2283 // a match at this character position.
2250 pos->mask = char_mask; 2284 pos->mask = char_mask;
2251 pos->value = c; 2285 pos->value = c;
2252 pos->determines_perfectly = true; 2286 pos->determines_perfectly = true;
2253 } else { 2287 } else {
2254 uint32_t common_bits = char_mask; 2288 uint32_t common_bits = char_mask;
2255 uint32_t bits = chars[0]; 2289 uint32_t bits = chars[0];
2256 for (int j = 1; j < length; j++) { 2290 for (int j = 1; j < length; j++) {
(...skipping 24 matching lines...) Expand all
2281 ASSERT(characters_filled_in <= details->characters()); 2315 ASSERT(characters_filled_in <= details->characters());
2282 if (characters_filled_in == details->characters()) { 2316 if (characters_filled_in == details->characters()) {
2283 return; 2317 return;
2284 } 2318 }
2285 } 2319 }
2286 } else { 2320 } else {
2287 QuickCheckDetails::Position* pos = 2321 QuickCheckDetails::Position* pos =
2288 details->positions(characters_filled_in); 2322 details->positions(characters_filled_in);
2289 RegExpCharacterClass* tree = elm.data.u_char_class; 2323 RegExpCharacterClass* tree = elm.data.u_char_class;
2290 ZoneList<CharacterRange>* ranges = tree->ranges(); 2324 ZoneList<CharacterRange>* ranges = tree->ranges();
2291 CharacterRange range = ranges->at(0);
2292 if (tree->is_negated()) { 2325 if (tree->is_negated()) {
2293 // A quick check uses multi-character mask and compare. There is no 2326 // A quick check uses multi-character mask and compare. There is no
2294 // useful way to incorporate a negative char class into this scheme 2327 // useful way to incorporate a negative char class into this scheme
2295 // so we just conservatively create a mask and value that will always 2328 // so we just conservatively create a mask and value that will always
2296 // succeed. 2329 // succeed.
Lasse Reichstein 2009/02/18 15:20:44 This comment should be changed to just "not yet ha
Erik Corry 2009/02/18 16:04:24 I don't think that char classes like [^\u0000-ac-\
2297 pos->mask = 0; 2330 pos->mask = 0;
2298 pos->value = 0; 2331 pos->value = 0;
2299 } else { 2332 } else {
2300 uint32_t differing_bits = (range.from() ^ range.to()); 2333 int first_range = 0;
2334 while (ranges->at(first_range).from() > char_mask) {
2335 first_range++;
2336 if (first_range == ranges->length()) {
2337 details->set_cannot_match();
2338 pos->determines_perfectly = false;
2339 return;
2340 }
2341 }
2342 CharacterRange range = ranges->at(first_range);
2343 uc16 from = range.from();
2344 uc16 to = range.to();
2345 if (to > char_mask) {
2346 to = char_mask;
2347 }
2348 uint32_t differing_bits = (from ^ to);
2301 // A mask and compare is only perfect if the differing bits form a 2349 // A mask and compare is only perfect if the differing bits form a
2302 // number like 00011111 with one single block of trailing 1s. 2350 // number like 00011111 with one single block of trailing 1s.
2303 if ((differing_bits & (differing_bits + 1)) == 0) { 2351 if ((differing_bits & (differing_bits + 1)) == 0) {
2304 pos->determines_perfectly = true; 2352 pos->determines_perfectly = true;
2305 } 2353 }
2306 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2354 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2307 uint32_t bits = (range.from() & common_bits); 2355 uint32_t bits = (from & common_bits);
2308 for (int i = 1; i < ranges->length(); i++) { 2356 for (int i = first_range + 1; i < ranges->length(); i++) {
2357 CharacterRange range = ranges->at(i);
2358 uc16 from = range.from();
2359 uc16 to = range.to();
2360 if (from > char_mask) continue;
2361 if (to > char_mask) to = char_mask;
2309 // Here we are combining more ranges into the mask and compare 2362 // Here we are combining more ranges into the mask and compare
2310 // value. With each new range the mask becomes more sparse and 2363 // value. With each new range the mask becomes more sparse and
2311 // so the chances of a false positive rise. A character class 2364 // so the chances of a false positive rise. A character class
2312 // with multiple ranges is assumed never to be equivalent to a 2365 // with multiple ranges is assumed never to be equivalent to a
2313 // mask and compare operation. 2366 // mask and compare operation.
2314 pos->determines_perfectly = false; 2367 pos->determines_perfectly = false;
2315 CharacterRange range = ranges->at(i); 2368 uint32_t new_common_bits = (from ^ to);
2316 uint32_t new_common_bits = (range.from() ^ range.to());
2317 new_common_bits = ~SmearBitsRight(new_common_bits); 2369 new_common_bits = ~SmearBitsRight(new_common_bits);
2318 common_bits &= new_common_bits; 2370 common_bits &= new_common_bits;
2319 bits &= new_common_bits; 2371 bits &= new_common_bits;
2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits; 2372 uint32_t differing_bits = (from & common_bits) ^ bits;
2321 common_bits ^= differing_bits; 2373 common_bits ^= differing_bits;
2322 bits &= common_bits; 2374 bits &= common_bits;
2323 } 2375 }
2324 pos->mask = common_bits; 2376 pos->mask = common_bits;
2325 pos->value = bits; 2377 pos->value = bits;
2326 } 2378 }
2327 characters_filled_in++; 2379 characters_filled_in++;
2328 ASSERT(characters_filled_in <= details->characters()); 2380 ASSERT(characters_filled_in <= details->characters());
2329 if (characters_filled_in == details->characters()) { 2381 if (characters_filled_in == details->characters()) {
2330 return; 2382 return;
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after
2612 return; 2664 return;
2613 case AT_NON_BOUNDARY: 2665 case AT_NON_BOUNDARY:
2614 case AT_BOUNDARY: 2666 case AT_BOUNDARY:
2615 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2667 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2616 return; 2668 return;
2617 } 2669 }
2618 on_success()->Emit(compiler, trace); 2670 on_success()->Emit(compiler, trace);
2619 } 2671 }
2620 2672
2621 2673
2674 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2675 if (quick_check == NULL) return false;
2676 if (offset >= quick_check->characters()) return false;
2677 return quick_check->positions(offset)->determines_perfectly;
Lasse Reichstein 2009/02/18 15:20:44 Just checking: If we find that a position determin
Erik Corry 2009/02/18 16:04:24 See above.
2678 }
2679
2680
2681 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2682 if (index > *checked_up_to) {
2683 *checked_up_to = index;
2684 }
2685 }
2686
2687
2622 // We call this repeatedly to generate code for each pass over the text node. 2688 // We call this repeatedly to generate code for each pass over the text node.
2623 // The passes are in increasing order of difficulty because we hope one 2689 // The passes are in increasing order of difficulty because we hope one
2624 // of the first passes will fail in which case we are saved the work of the 2690 // of the first passes will fail in which case we are saved the work of the
2625 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2691 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2626 // we will check the '%' in the first pass, the case independent 'a' in the 2692 // we will check the '%' in the first pass, the case independent 'a' in the
2627 // second pass and the character class in the last pass. 2693 // second pass and the character class in the last pass.
2628 // 2694 //
2629 // The passes are done from right to left, so for example to test for /bar/ 2695 // The passes are done from right to left, so for example to test for /bar/
2630 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2696 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2631 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2697 // and then a 'b' with offset 0. This means we can avoid the end-of-input
(...skipping 14 matching lines...) Expand all
2646 // order to get to the code we are now generating. The quick check can involve 2712 // order to get to the code we are now generating. The quick check can involve
2647 // loading characters, which means we do not need to recheck the bounds 2713 // loading characters, which means we do not need to recheck the bounds
2648 // up to the limit the quick check already checked. In addition the quick 2714 // up to the limit the quick check already checked. In addition the quick
2649 // check can have involved a mask and compare operation which may simplify 2715 // check can have involved a mask and compare operation which may simplify
2650 // or obviate the need for further checks at some character positions. 2716 // or obviate the need for further checks at some character positions.
2651 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2717 void TextNode::TextEmitPass(RegExpCompiler* compiler,
2652 TextEmitPassType pass, 2718 TextEmitPassType pass,
2653 bool preloaded, 2719 bool preloaded,
2654 Trace* trace, 2720 Trace* trace,
2655 bool first_element_checked, 2721 bool first_element_checked,
2656 int* checked_up_to) { 2722 int* checked_up_to) {
Lasse Reichstein 2009/02/18 15:20:44 This function is one big glob of special cases. If
Erik Corry 2009/02/18 16:04:24 I've refactored a little more. It is now only 68
2657 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2723 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2658 bool ascii = compiler->ascii(); 2724 bool ascii = compiler->ascii();
2659 Label* backtrack = trace->backtrack(); 2725 Label* backtrack = trace->backtrack();
2660 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2726 QuickCheckDetails* quick_check = trace->quick_check_performed();
2661 int element_count = elms_->length(); 2727 int element_count = elms_->length();
2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2728 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2663 TextElement elm = elms_->at(i); 2729 TextElement elm = elms_->at(i);
2664 int cp_offset = trace->cp_offset() + elm.cp_offset; 2730 int cp_offset = trace->cp_offset() + elm.cp_offset;
2665 if (elm.type == TextElement::ATOM) { 2731 if (elm.type == TextElement::ATOM) {
2666 if (pass == NON_ASCII_MATCH || 2732 if (pass != CHARACTER_CLASS_MATCH) {
2667 pass == CHARACTER_MATCH ||
2668 pass == CASE_CHARACTER_MATCH) {
2669 Vector<const uc16> quarks = elm.data.u_atom->data(); 2733 Vector<const uc16> quarks = elm.data.u_atom->data();
2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2734 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2671 bool bound_checked = true; // Most ops will check their bounds. 2735 bool bound_checked = false;
2672 if (first_element_checked && i == 0 && j == 0) continue; 2736 if (first_element_checked && i == 0 && j == 0) continue;
2673 if (pass == NON_ASCII_MATCH) { 2737 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2674 ASSERT(ascii); 2738 switch (pass) {
2675 if (quarks[j] > String::kMaxAsciiCharCode) { 2739 case NON_ASCII_MATCH:
2676 assembler->GoTo(backtrack); 2740 ASSERT(ascii);
2677 return; 2741 if (quarks[j] > String::kMaxAsciiCharCode) {
2678 } 2742 assembler->GoTo(backtrack);
2679 } else { 2743 return;
2680 if (quick_check != NULL && 2744 }
2681 elm.cp_offset + j < quick_check->characters() && 2745 break;
2682 quick_check->positions(elm.cp_offset + j)-> 2746 case NON_LETTER_CHARACTER_MATCH:
2683 determines_perfectly) { 2747 bound_checked = EmitAtomNonLetter(assembler,
2684 continue; 2748 quarks[j],
2685 } 2749 ascii,
2686 if (pass == CHARACTER_MATCH) { 2750 backtrack,
2687 if (compiler->ignore_case()) { 2751 cp_offset + j,
2688 bound_checked = EmitAtomNonLetter( 2752 *checked_up_to < cp_offset + j,
2689 assembler, 2753 preloaded);
2690 quarks[j], 2754 break;
2755 case SIMPLE_CHARACTER_MATCH:
2756 if (!preloaded) {
2757 assembler->LoadCurrentCharacter(
2758 cp_offset + j,
2691 backtrack, 2759 backtrack,
2692 cp_offset + j, 2760 *checked_up_to < cp_offset + j);
2693 *checked_up_to < cp_offset + j, 2761 bound_checked = true;
2694 preloaded);
2695 } else {
2696 if (!preloaded) {
2697 assembler->LoadCurrentCharacter(
2698 cp_offset + j,
2699 backtrack,
2700 *checked_up_to < cp_offset + j);
2701 }
2702 assembler->CheckNotCharacter(quarks[j], backtrack);
2703 } 2762 }
2704 } else { 2763 assembler->CheckNotCharacter(quarks[j], backtrack);
2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH); 2764 break;
2706 ASSERT(compiler->ignore_case()); 2765 case CASE_CHARACTER_MATCH:
2707 bound_checked = EmitAtomLetter(assembler, 2766 bound_checked = EmitAtomLetter(assembler,
2708 compiler->ascii(),
2709 quarks[j], 2767 quarks[j],
2768 ascii,
2710 backtrack, 2769 backtrack,
2711 cp_offset + j, 2770 cp_offset + j,
2712 *checked_up_to < cp_offset + j, 2771 *checked_up_to < cp_offset + j,
2713 preloaded); 2772 preloaded);
2714 } 2773 break;
2715 if (bound_checked) { 2774 default:
2716 if (cp_offset + j > *checked_up_to) { 2775 break;
2717 *checked_up_to = cp_offset + j;
2718 }
2719 }
2720 } 2776 }
2777 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2721 } 2778 }
2722 } 2779 }
2723 } else { 2780 } else {
2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); 2781 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2725 if (first_element_checked && i == 0) continue;
2726 if (quick_check != NULL &&
2727 elm.cp_offset < quick_check->characters() &&
2728 quick_check->positions(elm.cp_offset)->determines_perfectly) {
2729 continue;
2730 }
2731 if (pass == CHARACTER_CLASS_MATCH) { 2782 if (pass == CHARACTER_CLASS_MATCH) {
2783 if (first_element_checked && i == 0) continue;
2784 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2732 RegExpCharacterClass* cc = elm.data.u_char_class; 2785 RegExpCharacterClass* cc = elm.data.u_char_class;
2733 EmitCharClass(assembler, 2786 EmitCharClass(assembler,
2734 cc, 2787 cc,
2788 ascii,
2789 backtrack,
2735 cp_offset, 2790 cp_offset,
2736 backtrack,
2737 *checked_up_to < cp_offset, 2791 *checked_up_to < cp_offset,
2738 ascii,
2739 preloaded); 2792 preloaded);
2740 if (cp_offset > *checked_up_to) { 2793 UpdateBoundsCheck(cp_offset, checked_up_to);
2741 *checked_up_to = cp_offset;
2742 }
2743 } 2794 }
2744 } 2795 }
2745 } 2796 }
2746 } 2797 }
2747 2798
2748 2799
2749 int TextNode::Length() { 2800 int TextNode::Length() {
2750 TextElement elm = elms_->last(); 2801 TextElement elm = elms_->last();
2751 ASSERT(elm.cp_offset >= 0); 2802 ASSERT(elm.cp_offset >= 0);
2752 if (elm.type == TextElement::ATOM) { 2803 if (elm.type == TextElement::ATOM) {
2753 return elm.cp_offset + elm.data.u_atom->data().length(); 2804 return elm.cp_offset + elm.data.u_atom->data().length();
2754 } else { 2805 } else {
2755 return elm.cp_offset + 1; 2806 return elm.cp_offset + 1;
2756 } 2807 }
2757 } 2808 }
2758 2809
2759 2810
2811 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2812 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2813 if (ignore_case) {
2814 return pass == SIMPLE_CHARACTER_MATCH;
2815 } else {
2816 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2817 }
2818 }
2819
2820
2760 // This generates the code to match a text node. A text node can contain 2821 // This generates the code to match a text node. A text node can contain
2761 // straight character sequences (possibly to be matched in a case-independent 2822 // straight character sequences (possibly to be matched in a case-independent
2762 // way) and character classes. For efficiency we do not do this in a single 2823 // way) and character classes. For efficiency we do not do this in a single
2763 // pass from left to right. Instead we pass over the text node several times, 2824 // pass from left to right. Instead we pass over the text node several times,
2764 // emitting code for some character positions every time. See the comment on 2825 // emitting code for some character positions every time. See the comment on
2765 // TextEmitPass for details. 2826 // TextEmitPass for details.
2766 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2827 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2767 LimitResult limit_result = LimitVersions(compiler, trace); 2828 LimitResult limit_result = LimitVersions(compiler, trace);
2768 if (limit_result == DONE) return; 2829 if (limit_result == DONE) return;
2769 ASSERT(limit_result == CONTINUE); 2830 ASSERT(limit_result == CONTINUE);
2770 2831
2771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2832 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2772 compiler->SetRegExpTooBig(); 2833 compiler->SetRegExpTooBig();
2773 return; 2834 return;
2774 } 2835 }
2775 2836
2776 if (compiler->ascii()) { 2837 if (compiler->ascii()) {
2777 int dummy = 0; 2838 int dummy = 0;
2778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 2839 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2779 } 2840 }
2780 2841
2781 bool first_elt_done = false; 2842 bool first_elt_done = false;
2782 int bound_checked_to = trace->cp_offset() - 1; 2843 int bound_checked_to = trace->cp_offset() - 1;
2783 bound_checked_to += trace->bound_checked_up_to(); 2844 bound_checked_to += trace->bound_checked_up_to();
2784 2845
2785 // If a character is preloaded into the current character register then 2846 // If a character is preloaded into the current character register then
2786 // check that now. 2847 // check that now.
2787 if (trace->characters_preloaded() == 1) { 2848 if (trace->characters_preloaded() == 1) {
2788 TextEmitPass(compiler, 2849 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2789 CHARACTER_MATCH, 2850 if (!SkipPass(pass, compiler->ignore_case())) {
2790 true, 2851 TextEmitPass(compiler,
2791 trace, 2852 static_cast<TextEmitPassType>(pass),
2792 false, 2853 true,
2793 &bound_checked_to); 2854 trace,
2794 if (compiler->ignore_case()) { 2855 false,
2795 TextEmitPass(compiler, 2856 &bound_checked_to);
2796 CASE_CHARACTER_MATCH, 2857 }
2797 true,
2798 trace,
2799 false,
2800 &bound_checked_to);
2801 } 2858 }
2802 TextEmitPass(compiler,
2803 CHARACTER_CLASS_MATCH,
2804 true,
2805 trace,
2806 false,
2807 &bound_checked_to);
2808 first_elt_done = true; 2859 first_elt_done = true;
2809 } 2860 }
2810 2861
2811 TextEmitPass(compiler, 2862 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2812 CHARACTER_MATCH, 2863 if (!SkipPass(pass, compiler->ignore_case())) {
2813 false, 2864 TextEmitPass(compiler,
2814 trace, 2865 static_cast<TextEmitPassType>(pass),
2815 first_elt_done, 2866 false,
2816 &bound_checked_to); 2867 trace,
2817 if (compiler->ignore_case()) { 2868 first_elt_done,
2818 TextEmitPass(compiler, 2869 &bound_checked_to);
2819 CASE_CHARACTER_MATCH, 2870 }
2820 false,
2821 trace,
2822 first_elt_done,
2823 &bound_checked_to);
2824 } 2871 }
2825 TextEmitPass(compiler,
2826 CHARACTER_CLASS_MATCH,
2827 false,
2828 trace,
2829 first_elt_done,
2830 &bound_checked_to);
2831 2872
2832 Trace successor_trace(*trace); 2873 Trace successor_trace(*trace);
2833 successor_trace.set_at_start(false); 2874 successor_trace.set_at_start(false);
2834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 2875 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2835 RecursionCheck rc(compiler); 2876 RecursionCheck rc(compiler);
2836 on_success()->Emit(compiler, &successor_trace); 2877 on_success()->Emit(compiler, &successor_trace);
2837 } 2878 }
2838 2879
2839 2880
2840 void Trace::InvalidateCurrentCharacter() { 2881 void Trace::InvalidateCurrentCharacter() {
(...skipping 2043 matching lines...) Expand 10 before | Expand all | Expand 10 after
4884 EmbeddedVector<byte, 1024> codes; 4925 EmbeddedVector<byte, 1024> codes;
4885 RegExpMacroAssemblerIrregexp macro_assembler(codes); 4926 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4886 return compiler.Assemble(&macro_assembler, 4927 return compiler.Assemble(&macro_assembler,
4887 node, 4928 node,
4888 data->capture_count, 4929 data->capture_count,
4889 pattern); 4930 pattern);
4890 } 4931 }
4891 4932
4892 4933
4893 }} // namespace v8::internal 4934 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698