src/jsregexp.cc - Issue 21450: Irregexp:...

Side by Side Diff: src/jsregexp.cc

Issue 21450: Irregexp:... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 1721 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1732 trace->backtrack());	1732 trace->backtrack());

1733 break;	1733 break;

1734 }	1734 }

1735 }	1735 }

1736	1736

1737	1737

1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;	1738 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;	1739 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

1740	1740

1741	1741

	1742 // Returns the number of characters in the equivalence class, omitting those

	1743 // that cannot occur in the source string because it is ASCII.

	1744 static int GetCaseIndependentLetters(uc16 character,

	1745 bool ascii_subject,

	1746 unibrow::uchar* letters) {

	1747 int length = uncanonicalize.get(character, '\0', letters);

	1748 // Unibrow returns 0 or 1 for characters where case independependence is

	1749 // trivial.

	1750 if (length == 0) {

	1751 letters[0] = character;

	1752 length = 1;

	1753 }

	1754 if (!ascii_subject \|\| character <= String::kMaxAsciiCharCode) {

	1755 return length;

	1756 }

	1757 // The standard requires that non-ASCII characters cannot have ASCII

	1758 // character codes in their equivalence class.

	1759 return 0;

	1760 }

	1761

	1762

1742 // Only emits non-letters (things that don't have case). Only used for case	1763 // Only emits non-letters (things that don't have case). Only used for case

1743 // independent matches.	1764 // independent matches.

1744 static inline bool EmitAtomNonLetter(	1765 static inline bool EmitAtomNonLetter(

1745 RegExpMacroAssembler* macro_assembler,	1766 RegExpMacroAssembler* macro_assembler,

1746 uc16 c,	1767 uc16 c,

	1768 bool ascii,

1747 Label* on_failure,	1769 Label* on_failure,

1748 int cp_offset,	1770 int cp_offset,

1749 bool check,	1771 bool check,

1750 bool preloaded) {	1772 bool preloaded) {

1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1773 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1752 int length = uncanonicalize.get(c, '\0', chars);	1774 int length = GetCaseIndependentLetters(c, ascii, chars);

	1775 if (length < 1) {

	1776 // This can't match. Must be an ASCII subject and a non-ASCII character.

	1777 // We do not need to do anything since the ASCII pass already handled this.

	1778 return false; // Bounds not checked.

	1779 }

1753 bool checked = false;	1780 bool checked = false;

1754 if (length <= 1) {	1781 // We handle the length > 1 case in a later pass.

	1782 if (length == 1) {

	1783 if (ascii && c > String::kMaxAsciiCharCodeU) {

	1784 // Can't match - see above.

	1785 return false; // Bounds not checked.

	1786 }

1755 if (!preloaded) {	1787 if (!preloaded) {

1756 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1788 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1757 checked = check;	1789 checked = check;

1758 }	1790 }

1759 macro_assembler->CheckNotCharacter(c, on_failure);	1791 macro_assembler->CheckNotCharacter(c, on_failure);

1760 }	1792 }

1761 return checked;	1793 return checked;

1762 }	1794 }

1763	1795

1764	1796

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1798 return true;	1830 return true;

1799 }	1831 }

1800 return false;	1832 return false;

1801 }	1833 }

1802	1834

1803	1835

1804 // Only emits letters (things that have case). Only used for case independent	1836 // Only emits letters (things that have case). Only used for case independent

1805 // matches.	1837 // matches.

1806 static inline bool EmitAtomLetter(	1838 static inline bool EmitAtomLetter(

1807 RegExpMacroAssembler* macro_assembler,	1839 RegExpMacroAssembler* macro_assembler,

	1840 uc16 c,

1808 bool ascii,	1841 bool ascii,

1809 uc16 c,

1810 Label* on_failure,	1842 Label* on_failure,

1811 int cp_offset,	1843 int cp_offset,

1812 bool check,	1844 bool check,

1813 bool preloaded) {	1845 bool preloaded) {

1814 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1846 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1815 int length = uncanonicalize.get(c, '\0', chars);	1847 int length = GetCaseIndependentLetters(c, ascii, chars);

1816 if (length <= 1) return false;	1848 if (length <= 1) return false;

1817 // We may not need to check against the end of the input string	1849 // We may not need to check against the end of the input string

1818 // if this character lies before a character that matched.	1850 // if this character lies before a character that matched.

1819 if (!preloaded) {	1851 if (!preloaded) {

1820 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1852 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1821 }	1853 }

1822 Label ok;	1854 Label ok;

1823 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);	1855 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1824 switch (length) {	1856 switch (length) {

1825 case 2: {	1857 case 2: {

(...skipping 21 matching lines...) Expand all Loading...
1847 default:	1879 default:

1848 UNREACHABLE();	1880 UNREACHABLE();

1849 break;	1881 break;

1850 }	1882 }

1851 return true;	1883 return true;

1852 }	1884 }

1853	1885

1854	1886

1855 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,	1887 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

1856 RegExpCharacterClass* cc,	1888 RegExpCharacterClass* cc,

	1889 bool ascii,

	1890 Label* on_failure,

1857 int cp_offset,	1891 int cp_offset,

1858 Label* on_failure,

1859 bool check_offset,	1892 bool check_offset,

1860 bool ascii,

1861 bool preloaded) {	1893 bool preloaded) {

1862 if (cc->is_standard() &&	1894 if (cc->is_standard() &&

1863 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),	1895 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

1864 cp_offset,	1896 cp_offset,

1865 check_offset,	1897 check_offset,

1866 on_failure)) {	1898 on_failure)) {

1867 return;	1899 return;

1868 }	1900 }

1869	1901

1870 ZoneList<CharacterRange>* ranges = cc->ranges();	1902 ZoneList<CharacterRange>* ranges = cc->ranges();

(...skipping 360 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2231 for (int i = 0; i < characters && i < quarks.length(); i++) {	2263 for (int i = 0; i < characters && i < quarks.length(); i++) {

2232 QuickCheckDetails::Position* pos =	2264 QuickCheckDetails::Position* pos =

2233 details->positions(characters_filled_in);	2265 details->positions(characters_filled_in);

2234 uc16 c = quarks[i];	2266 uc16 c = quarks[i];

2235 if (c > char_mask) {	2267 if (c > char_mask) {

2236 // If we expect a non-ASCII character from an ASCII string,	2268 // If we expect a non-ASCII character from an ASCII string,

2237 // there is no way we can match. Not even case independent	2269 // there is no way we can match. Not even case independent

2238 // matching can turn an ASCII character into non-ASCII or	2270 // matching can turn an ASCII character into non-ASCII or

2239 // vice versa.	2271 // vice versa.

2240 details->set_cannot_match();	2272 details->set_cannot_match();

	2273 pos->determines_perfectly = false;
	Lasse Reichstein 2009/02/18 15:20:44 Should this be set to false? If the remaining case Should this be set to false? If the remaining cases of a choice node are actually determined perfectly, we should match them. I.e., treat a cannot-match result as if it wasn't there at all. Erik Corry 2009/02/18 16:04:24 Determines perfectly indicates that this particula Show quoted text On 2009/02/18 15:20:44, Lasse Reichstein wrote: > Should this be set to false? If the remaining cases of a choice node are > actually determined perfectly, we should match them. > I.e., treat a cannot-match result as if it wasn't there at all. Determines perfectly indicates that this particular character position has been checked completely by the quick check. The implication is that if we get past the quick check then we don't need to look at this character position again. So 'false' would be wrong here.
2241 return;	2274 return;

2242 }	2275 }

2243 if (compiler->ignore_case()) {	2276 if (compiler->ignore_case()) {

2244 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2277 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2245 int length = uncanonicalize.get(c, '\0', chars);	2278 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);

2246 if (length < 2) {	2279 ASSERT(length != 0); // Can only happen if c > char_mask (see above).

	2280 if (length == 1) {

2247 // This letter has no case equivalents, so it's nice and simple	2281 // This letter has no case equivalents, so it's nice and simple

2248 // and the mask-compare will determine definitely whether we have	2282 // and the mask-compare will determine definitely whether we have

2249 // a match at this character position.	2283 // a match at this character position.

2250 pos->mask = char_mask;	2284 pos->mask = char_mask;

2251 pos->value = c;	2285 pos->value = c;

2252 pos->determines_perfectly = true;	2286 pos->determines_perfectly = true;

2253 } else {	2287 } else {

2254 uint32_t common_bits = char_mask;	2288 uint32_t common_bits = char_mask;

2255 uint32_t bits = chars[0];	2289 uint32_t bits = chars[0];

2256 for (int j = 1; j < length; j++) {	2290 for (int j = 1; j < length; j++) {

(...skipping 24 matching lines...) Expand all Loading...
2281 ASSERT(characters_filled_in <= details->characters());	2315 ASSERT(characters_filled_in <= details->characters());

2282 if (characters_filled_in == details->characters()) {	2316 if (characters_filled_in == details->characters()) {

2283 return;	2317 return;

2284 }	2318 }

2285 }	2319 }

2286 } else {	2320 } else {

2287 QuickCheckDetails::Position* pos =	2321 QuickCheckDetails::Position* pos =

2288 details->positions(characters_filled_in);	2322 details->positions(characters_filled_in);

2289 RegExpCharacterClass* tree = elm.data.u_char_class;	2323 RegExpCharacterClass* tree = elm.data.u_char_class;

2290 ZoneList<CharacterRange>* ranges = tree->ranges();	2324 ZoneList<CharacterRange>* ranges = tree->ranges();

2291 CharacterRange range = ranges->at(0);

2292 if (tree->is_negated()) {	2325 if (tree->is_negated()) {

2293 // A quick check uses multi-character mask and compare. There is no	2326 // A quick check uses multi-character mask and compare. There is no

2294 // useful way to incorporate a negative char class into this scheme	2327 // useful way to incorporate a negative char class into this scheme

2295 // so we just conservatively create a mask and value that will always	2328 // so we just conservatively create a mask and value that will always

2296 // succeed.	2329 // succeed.
	Lasse Reichstein 2009/02/18 15:20:44 This comment should be changed to just "not yet ha This comment should be changed to just "not yet handled". It would be fairly easy to normalize the character class representation to be non-negated and treat it like a positive class. Erik Corry 2009/02/18 16:04:24 I don't think that char classes like [^\u0000-ac-\ Show quoted text On 2009/02/18 15:20:44, Lasse Reichstein wrote: > This comment should be changed to just "not yet handled". > It would be fairly easy to normalize the character class representation to be > non-negated and treat it like a positive class. I don't think that char classes like [^\u0000-ac-\uffff] are common enough to warrant spending time on, ever.
2297 pos->mask = 0;	2330 pos->mask = 0;

2298 pos->value = 0;	2331 pos->value = 0;

2299 } else {	2332 } else {

2300 uint32_t differing_bits = (range.from() ^ range.to());	2333 int first_range = 0;

	2334 while (ranges->at(first_range).from() > char_mask) {

	2335 first_range++;

	2336 if (first_range == ranges->length()) {

	2337 details->set_cannot_match();

	2338 pos->determines_perfectly = false;

	2339 return;

	2340 }

	2341 }

	2342 CharacterRange range = ranges->at(first_range);

	2343 uc16 from = range.from();

	2344 uc16 to = range.to();

	2345 if (to > char_mask) {

	2346 to = char_mask;

	2347 }

	2348 uint32_t differing_bits = (from ^ to);

2301 // A mask and compare is only perfect if the differing bits form a	2349 // A mask and compare is only perfect if the differing bits form a

2302 // number like 00011111 with one single block of trailing 1s.	2350 // number like 00011111 with one single block of trailing 1s.

2303 if ((differing_bits & (differing_bits + 1)) == 0) {	2351 if ((differing_bits & (differing_bits + 1)) == 0) {

2304 pos->determines_perfectly = true;	2352 pos->determines_perfectly = true;

2305 }	2353 }

2306 uint32_t common_bits = ~SmearBitsRight(differing_bits);	2354 uint32_t common_bits = ~SmearBitsRight(differing_bits);

2307 uint32_t bits = (range.from() & common_bits);	2355 uint32_t bits = (from & common_bits);

2308 for (int i = 1; i < ranges->length(); i++) {	2356 for (int i = first_range + 1; i < ranges->length(); i++) {

	2357 CharacterRange range = ranges->at(i);

	2358 uc16 from = range.from();

	2359 uc16 to = range.to();

	2360 if (from > char_mask) continue;

	2361 if (to > char_mask) to = char_mask;

2309 // Here we are combining more ranges into the mask and compare	2362 // Here we are combining more ranges into the mask and compare

2310 // value. With each new range the mask becomes more sparse and	2363 // value. With each new range the mask becomes more sparse and

2311 // so the chances of a false positive rise. A character class	2364 // so the chances of a false positive rise. A character class

2312 // with multiple ranges is assumed never to be equivalent to a	2365 // with multiple ranges is assumed never to be equivalent to a

2313 // mask and compare operation.	2366 // mask and compare operation.

2314 pos->determines_perfectly = false;	2367 pos->determines_perfectly = false;

2315 CharacterRange range = ranges->at(i);	2368 uint32_t new_common_bits = (from ^ to);

2316 uint32_t new_common_bits = (range.from() ^ range.to());

2317 new_common_bits = ~SmearBitsRight(new_common_bits);	2369 new_common_bits = ~SmearBitsRight(new_common_bits);

2318 common_bits &= new_common_bits;	2370 common_bits &= new_common_bits;

2319 bits &= new_common_bits;	2371 bits &= new_common_bits;

2320 uint32_t differing_bits = (range.from() & common_bits) ^ bits;	2372 uint32_t differing_bits = (from & common_bits) ^ bits;

2321 common_bits ^= differing_bits;	2373 common_bits ^= differing_bits;

2322 bits &= common_bits;	2374 bits &= common_bits;

2323 }	2375 }

2324 pos->mask = common_bits;	2376 pos->mask = common_bits;

2325 pos->value = bits;	2377 pos->value = bits;

2326 }	2378 }

2327 characters_filled_in++;	2379 characters_filled_in++;

2328 ASSERT(characters_filled_in <= details->characters());	2380 ASSERT(characters_filled_in <= details->characters());

2329 if (characters_filled_in == details->characters()) {	2381 if (characters_filled_in == details->characters()) {

2330 return;	2382 return;

(...skipping 281 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2612 return;	2664 return;

2613 case AT_NON_BOUNDARY:	2665 case AT_NON_BOUNDARY:

2614 case AT_BOUNDARY:	2666 case AT_BOUNDARY:

2615 EmitBoundaryCheck(type_, compiler, on_success(), trace);	2667 EmitBoundaryCheck(type_, compiler, on_success(), trace);

2616 return;	2668 return;

2617 }	2669 }

2618 on_success()->Emit(compiler, trace);	2670 on_success()->Emit(compiler, trace);

2619 }	2671 }

2620	2672

2621	2673

	2674 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {

	2675 if (quick_check == NULL) return false;

	2676 if (offset >= quick_check->characters()) return false;

	2677 return quick_check->positions(offset)->determines_perfectly;
	Lasse Reichstein 2009/02/18 15:20:44 Just checking: If we find that a position determin Just checking: If we find that a position determines the branch perfectly, will the position always be tested? Or is it possible that, e.g., both the first and second character perfectly determines the branch, and only the first is actually checked? Erik Corry 2009/02/18 16:04:24 See above. Show quoted text On 2009/02/18 15:20:44, Lasse Reichstein wrote: > Just checking: If we find that a position determines the branch perfectly, will > the position always be tested? Or is it possible that, e.g., both the first and > second character perfectly determines the branch, and only the first is actually > checked? See above.
	2678 }

	2679

	2680

	2681 static void UpdateBoundsCheck(int index, int* checked_up_to) {

	2682 if (index > *checked_up_to) {

	2683 *checked_up_to = index;

	2684 }

	2685 }

	2686

	2687

2622 // We call this repeatedly to generate code for each pass over the text node.	2688 // We call this repeatedly to generate code for each pass over the text node.

2623 // The passes are in increasing order of difficulty because we hope one	2689 // The passes are in increasing order of difficulty because we hope one

2624 // of the first passes will fail in which case we are saved the work of the	2690 // of the first passes will fail in which case we are saved the work of the

2625 // later passes. for example for the case independent regexp /%[asdfghjkl]a/	2691 // later passes. for example for the case independent regexp /%[asdfghjkl]a/

2626 // we will check the '%' in the first pass, the case independent 'a' in the	2692 // we will check the '%' in the first pass, the case independent 'a' in the

2627 // second pass and the character class in the last pass.	2693 // second pass and the character class in the last pass.

2628 //	2694 //

2629 // The passes are done from right to left, so for example to test for /bar/	2695 // The passes are done from right to left, so for example to test for /bar/

2630 // we will first test for an 'r' with offset 2, then an 'a' with offset 1	2696 // we will first test for an 'r' with offset 2, then an 'a' with offset 1

2631 // and then a 'b' with offset 0. This means we can avoid the end-of-input	2697 // and then a 'b' with offset 0. This means we can avoid the end-of-input

(...skipping 14 matching lines...) Expand all Loading...
2646 // order to get to the code we are now generating. The quick check can involve	2712 // order to get to the code we are now generating. The quick check can involve

2647 // loading characters, which means we do not need to recheck the bounds	2713 // loading characters, which means we do not need to recheck the bounds

2648 // up to the limit the quick check already checked. In addition the quick	2714 // up to the limit the quick check already checked. In addition the quick

2649 // check can have involved a mask and compare operation which may simplify	2715 // check can have involved a mask and compare operation which may simplify

2650 // or obviate the need for further checks at some character positions.	2716 // or obviate the need for further checks at some character positions.

2651 void TextNode::TextEmitPass(RegExpCompiler* compiler,	2717 void TextNode::TextEmitPass(RegExpCompiler* compiler,

2652 TextEmitPassType pass,	2718 TextEmitPassType pass,

2653 bool preloaded,	2719 bool preloaded,

2654 Trace* trace,	2720 Trace* trace,

2655 bool first_element_checked,	2721 bool first_element_checked,

2656 int* checked_up_to) {	2722 int* checked_up_to) {
	Lasse Reichstein 2009/02/18 15:20:44 This function is one big glob of special cases. If This function is one big glob of special cases. If the different passes has something in common, please refactor it to, e.g., a template function or pass objects with virtual methods or something else that avoids putting all the cases in one monolithic function. Erik Corry 2009/02/18 16:04:24 I've refactored a little more. It is now only 68 Show quoted text On 2009/02/18 15:20:44, Lasse Reichstein wrote: > This function is one big glob of special cases. > If the different passes has something in common, please refactor it to, e.g., a > template function or pass objects with virtual methods or something else that > avoids putting all the cases in one monolithic function. I've refactored a little more. It is now only 68 lines with 30 lines of comments to explain what it is doing. I feel that making it much smaller will either cause code duplication or unnecessary abstraction boilerplace elsewhere.
2657 RegExpMacroAssembler* assembler = compiler->macro_assembler();	2723 RegExpMacroAssembler* assembler = compiler->macro_assembler();

2658 bool ascii = compiler->ascii();	2724 bool ascii = compiler->ascii();

2659 Label* backtrack = trace->backtrack();	2725 Label* backtrack = trace->backtrack();

2660 QuickCheckDetails* quick_check = trace->quick_check_performed();	2726 QuickCheckDetails* quick_check = trace->quick_check_performed();

2661 int element_count = elms_->length();	2727 int element_count = elms_->length();

2662 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {	2728 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

2663 TextElement elm = elms_->at(i);	2729 TextElement elm = elms_->at(i);

2664 int cp_offset = trace->cp_offset() + elm.cp_offset;	2730 int cp_offset = trace->cp_offset() + elm.cp_offset;

2665 if (elm.type == TextElement::ATOM) {	2731 if (elm.type == TextElement::ATOM) {

2666 if (pass == NON_ASCII_MATCH \|\|	2732 if (pass != CHARACTER_CLASS_MATCH) {

2667 pass == CHARACTER_MATCH \|\|

2668 pass == CASE_CHARACTER_MATCH) {

2669 Vector<const uc16> quarks = elm.data.u_atom->data();	2733 Vector<const uc16> quarks = elm.data.u_atom->data();

2670 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {	2734 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

2671 bool bound_checked = true; // Most ops will check their bounds.	2735 bool bound_checked = false;

2672 if (first_element_checked && i == 0 && j == 0) continue;	2736 if (first_element_checked && i == 0 && j == 0) continue;

2673 if (pass == NON_ASCII_MATCH) {	2737 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;

2674 ASSERT(ascii);	2738 switch (pass) {

2675 if (quarks[j] > String::kMaxAsciiCharCode) {	2739 case NON_ASCII_MATCH:

2676 assembler->GoTo(backtrack);	2740 ASSERT(ascii);

2677 return;	2741 if (quarks[j] > String::kMaxAsciiCharCode) {

2678 }	2742 assembler->GoTo(backtrack);

2679 } else {	2743 return;

2680 if (quick_check != NULL &&	2744 }

2681 elm.cp_offset + j < quick_check->characters() &&	2745 break;

2682 quick_check->positions(elm.cp_offset + j)->	2746 case NON_LETTER_CHARACTER_MATCH:

2683 determines_perfectly) {	2747 bound_checked = EmitAtomNonLetter(assembler,

2684 continue;	2748 quarks[j],

2685 }	2749 ascii,

2686 if (pass == CHARACTER_MATCH) {	2750 backtrack,

2687 if (compiler->ignore_case()) {	2751 cp_offset + j,

2688 bound_checked = EmitAtomNonLetter(	2752 *checked_up_to < cp_offset + j,

2689 assembler,	2753 preloaded);

2690 quarks[j],	2754 break;

	2755 case SIMPLE_CHARACTER_MATCH:

	2756 if (!preloaded) {

	2757 assembler->LoadCurrentCharacter(

	2758 cp_offset + j,

2691 backtrack,	2759 backtrack,

2692 cp_offset + j,	2760 *checked_up_to < cp_offset + j);

2693 *checked_up_to < cp_offset + j,	2761 bound_checked = true;

2694 preloaded);

2695 } else {

2696 if (!preloaded) {

2697 assembler->LoadCurrentCharacter(

2698 cp_offset + j,

2699 backtrack,

2700 *checked_up_to < cp_offset + j);

2701 }

2702 assembler->CheckNotCharacter(quarks[j], backtrack);

2703 }	2762 }

2704 } else {	2763 assembler->CheckNotCharacter(quarks[j], backtrack);

2705 ASSERT_EQ(pass, CASE_CHARACTER_MATCH);	2764 break;

2706 ASSERT(compiler->ignore_case());	2765 case CASE_CHARACTER_MATCH:

2707 bound_checked = EmitAtomLetter(assembler,	2766 bound_checked = EmitAtomLetter(assembler,

2708 compiler->ascii(),

2709 quarks[j],	2767 quarks[j],

	2768 ascii,

2710 backtrack,	2769 backtrack,

2711 cp_offset + j,	2770 cp_offset + j,

2712 *checked_up_to < cp_offset + j,	2771 *checked_up_to < cp_offset + j,

2713 preloaded);	2772 preloaded);

2714 }	2773 break;

2715 if (bound_checked) {	2774 default:

2716 if (cp_offset + j > *checked_up_to) {	2775 break;

2717 *checked_up_to = cp_offset + j;

2718 }

2719 }

2720 }	2776 }

	2777 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

2721 }	2778 }

2722 }	2779 }

2723 } else {	2780 } else {

2724 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);	2781 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

2725 if (first_element_checked && i == 0) continue;

2726 if (quick_check != NULL &&

2727 elm.cp_offset < quick_check->characters() &&

2728 quick_check->positions(elm.cp_offset)->determines_perfectly) {

2729 continue;

2730 }

2731 if (pass == CHARACTER_CLASS_MATCH) {	2782 if (pass == CHARACTER_CLASS_MATCH) {

	2783 if (first_element_checked && i == 0) continue;

	2784 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;

2732 RegExpCharacterClass* cc = elm.data.u_char_class;	2785 RegExpCharacterClass* cc = elm.data.u_char_class;

2733 EmitCharClass(assembler,	2786 EmitCharClass(assembler,

2734 cc,	2787 cc,

	2788 ascii,

	2789 backtrack,

2735 cp_offset,	2790 cp_offset,

2736 backtrack,

2737 *checked_up_to < cp_offset,	2791 *checked_up_to < cp_offset,

2738 ascii,

2739 preloaded);	2792 preloaded);

2740 if (cp_offset > *checked_up_to) {	2793 UpdateBoundsCheck(cp_offset, checked_up_to);

2741 *checked_up_to = cp_offset;

2742 }

2743 }	2794 }

2744 }	2795 }

2745 }	2796 }

2746 }	2797 }

2747	2798

2748	2799

2749 int TextNode::Length() {	2800 int TextNode::Length() {

2750 TextElement elm = elms_->last();	2801 TextElement elm = elms_->last();

2751 ASSERT(elm.cp_offset >= 0);	2802 ASSERT(elm.cp_offset >= 0);

2752 if (elm.type == TextElement::ATOM) {	2803 if (elm.type == TextElement::ATOM) {

2753 return elm.cp_offset + elm.data.u_atom->data().length();	2804 return elm.cp_offset + elm.data.u_atom->data().length();

2754 } else {	2805 } else {

2755 return elm.cp_offset + 1;	2806 return elm.cp_offset + 1;

2756 }	2807 }

2757 }	2808 }

2758	2809

2759	2810

	2811 bool TextNode::SkipPass(int int_pass, bool ignore_case) {

	2812 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);

	2813 if (ignore_case) {

	2814 return pass == SIMPLE_CHARACTER_MATCH;

	2815 } else {

	2816 return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;

	2817 }

	2818 }

	2819

	2820

2760 // This generates the code to match a text node. A text node can contain	2821 // This generates the code to match a text node. A text node can contain

2761 // straight character sequences (possibly to be matched in a case-independent	2822 // straight character sequences (possibly to be matched in a case-independent

2762 // way) and character classes. For efficiency we do not do this in a single	2823 // way) and character classes. For efficiency we do not do this in a single

2763 // pass from left to right. Instead we pass over the text node several times,	2824 // pass from left to right. Instead we pass over the text node several times,

2764 // emitting code for some character positions every time. See the comment on	2825 // emitting code for some character positions every time. See the comment on

2765 // TextEmitPass for details.	2826 // TextEmitPass for details.

2766 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {	2827 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

2767 LimitResult limit_result = LimitVersions(compiler, trace);	2828 LimitResult limit_result = LimitVersions(compiler, trace);

2768 if (limit_result == DONE) return;	2829 if (limit_result == DONE) return;

2769 ASSERT(limit_result == CONTINUE);	2830 ASSERT(limit_result == CONTINUE);

2770	2831

2771 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {	2832 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

2772 compiler->SetRegExpTooBig();	2833 compiler->SetRegExpTooBig();

2773 return;	2834 return;

2774 }	2835 }

2775	2836

2776 if (compiler->ascii()) {	2837 if (compiler->ascii()) {

2777 int dummy = 0;	2838 int dummy = 0;

2778 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);	2839 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);

2779 }	2840 }

2780	2841

2781 bool first_elt_done = false;	2842 bool first_elt_done = false;

2782 int bound_checked_to = trace->cp_offset() - 1;	2843 int bound_checked_to = trace->cp_offset() - 1;

2783 bound_checked_to += trace->bound_checked_up_to();	2844 bound_checked_to += trace->bound_checked_up_to();

2784	2845

2785 // If a character is preloaded into the current character register then	2846 // If a character is preloaded into the current character register then

2786 // check that now.	2847 // check that now.

2787 if (trace->characters_preloaded() == 1) {	2848 if (trace->characters_preloaded() == 1) {

2788 TextEmitPass(compiler,	2849 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

2789 CHARACTER_MATCH,	2850 if (!SkipPass(pass, compiler->ignore_case())) {

2790 true,	2851 TextEmitPass(compiler,

2791 trace,	2852 static_cast<TextEmitPassType>(pass),

2792 false,	2853 true,

2793 &bound_checked_to);	2854 trace,

2794 if (compiler->ignore_case()) {	2855 false,

2795 TextEmitPass(compiler,	2856 &bound_checked_to);

2796 CASE_CHARACTER_MATCH,	2857 }

2797 true,

2798 trace,

2799 false,

2800 &bound_checked_to);

2801 }	2858 }

2802 TextEmitPass(compiler,

2803 CHARACTER_CLASS_MATCH,

2804 true,

2805 trace,

2806 false,

2807 &bound_checked_to);

2808 first_elt_done = true;	2859 first_elt_done = true;

2809 }	2860 }

2810	2861

2811 TextEmitPass(compiler,	2862 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

2812 CHARACTER_MATCH,	2863 if (!SkipPass(pass, compiler->ignore_case())) {

2813 false,	2864 TextEmitPass(compiler,

2814 trace,	2865 static_cast<TextEmitPassType>(pass),

2815 first_elt_done,	2866 false,

2816 &bound_checked_to);	2867 trace,

2817 if (compiler->ignore_case()) {	2868 first_elt_done,

2818 TextEmitPass(compiler,	2869 &bound_checked_to);

2819 CASE_CHARACTER_MATCH,	2870 }

2820 false,

2821 trace,

2822 first_elt_done,

2823 &bound_checked_to);

2824 }	2871 }

2825 TextEmitPass(compiler,

2826 CHARACTER_CLASS_MATCH,

2827 false,

2828 trace,

2829 first_elt_done,

2830 &bound_checked_to);

2831	2872

2832 Trace successor_trace(*trace);	2873 Trace successor_trace(*trace);

2833 successor_trace.set_at_start(false);	2874 successor_trace.set_at_start(false);

2834 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);	2875 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);

2835 RecursionCheck rc(compiler);	2876 RecursionCheck rc(compiler);

2836 on_success()->Emit(compiler, &successor_trace);	2877 on_success()->Emit(compiler, &successor_trace);

2837 }	2878 }

2838	2879

2839	2880

2840 void Trace::InvalidateCurrentCharacter() {	2881 void Trace::InvalidateCurrentCharacter() {

(...skipping 2043 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4884 EmbeddedVector<byte, 1024> codes;	4925 EmbeddedVector<byte, 1024> codes;

4885 RegExpMacroAssemblerIrregexp macro_assembler(codes);	4926 RegExpMacroAssemblerIrregexp macro_assembler(codes);

4886 return compiler.Assemble(&macro_assembler,	4927 return compiler.Assemble(&macro_assembler,

4887 node,	4928 node,

4888 data->capture_count,	4929 data->capture_count,

4889 pattern);	4930 pattern);

4890 }	4931 }

4891	4932

4892	4933

4893 }} // namespace v8::internal	4934 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/objects.h » ('j') | no next file with comments »