Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 1717 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1728 trace->backtrack()); | 1728 trace->backtrack()); |
| 1729 break; | 1729 break; |
| 1730 } | 1730 } |
| 1731 } | 1731 } |
| 1732 | 1732 |
| 1733 | 1733 |
| 1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
| 1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
| 1736 | 1736 |
| 1737 | 1737 |
| 1738 // Returns the number of characters in the equivalence class, omitting those | |
| 1739 // that cannot occur in the source string because it is ASCII. | |
| 1740 static int GetCaseIndependentLetters(uc16 character, | |
| 1741 bool ascii_subject, | |
| 1742 unibrow::uchar* letters) { | |
| 1743 int length = uncanonicalize.get(character, '\0', letters); | |
| 1744 if (length < 2) { | |
| 1745 // Unibrow returns 0 or 1 for characters where case independependence is | |
| 1746 // trivial. | |
| 1747 if (ascii_subject && character > String::kMaxAsciiCharCode) { | |
| 1748 return 0; // It's impossible to find a match here. | |
| 1749 } | |
| 1750 letters[0] = character; | |
| 1751 return 1; | |
| 1752 } | |
| 1753 if (!ascii_subject) { | |
| 1754 return length; | |
| 1755 } | |
| 1756 int characters = 0; | |
| 1757 for (int i = 0; i < length; i++) { | |
| 1758 if (letters[i] <= String::kMaxAsciiCharCodeU) { | |
| 1759 letters[characters++] = letters[i]; | |
| 1760 } | |
| 1761 } | |
| 1762 return characters; | |
| 1763 } | |
|
Lasse Reichstein
2009/02/06 15:50:56
If uncanonicalize implements Canonicalize^-1 o Can
| |
| 1764 | |
| 1765 | |
| 1738 // Only emits non-letters (things that don't have case). Only used for case | 1766 // Only emits non-letters (things that don't have case). Only used for case |
| 1739 // independent matches. | 1767 // independent matches. |
| 1740 static inline bool EmitAtomNonLetter( | 1768 static inline bool EmitAtomNonLetter( |
| 1741 RegExpMacroAssembler* macro_assembler, | 1769 RegExpMacroAssembler* macro_assembler, |
| 1742 uc16 c, | 1770 uc16 c, |
| 1771 bool ascii, | |
| 1743 Label* on_failure, | 1772 Label* on_failure, |
| 1744 int cp_offset, | 1773 int cp_offset, |
| 1745 bool check, | 1774 bool check, |
| 1746 bool preloaded) { | 1775 bool preloaded) { |
| 1747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1776 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1748 int length = uncanonicalize.get(c, '\0', chars); | 1777 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1778 if (length < 1) { | |
| 1779 // Can't match. | |
| 1780 macro_assembler->GoTo(on_failure); | |
| 1781 return false; // Bounds not checked. | |
| 1782 } | |
| 1749 bool checked = false; | 1783 bool checked = false; |
| 1750 if (length <= 1) { | 1784 // We handle the length > 1 case in another pass. |
| 1785 if (length == 1) { | |
|
Lasse Reichstein
2009/02/06 15:50:56
What about the length == 0 case?
Could we emit a
| |
| 1786 if (ascii && c > String::kMaxAsciiCharCodeU) { | |
|
Lasse Reichstein
2009/02/06 15:50:56
Wasn't this checked in line 1747, so length will a
| |
| 1787 // Can't match. | |
| 1788 macro_assembler->GoTo(on_failure); | |
| 1789 return false; // Bounds not checked. | |
| 1790 } | |
| 1751 if (!preloaded) { | 1791 if (!preloaded) { |
| 1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1792 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1753 checked = check; | 1793 checked = check; |
| 1754 } | 1794 } |
| 1755 macro_assembler->CheckNotCharacter(c, on_failure); | 1795 macro_assembler->CheckNotCharacter(c, on_failure); |
| 1756 } | 1796 } |
| 1757 return checked; | 1797 return checked; |
| 1758 } | 1798 } |
| 1759 | 1799 |
| 1760 | 1800 |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1801 // matches. | 1841 // matches. |
| 1802 static inline bool EmitAtomLetter( | 1842 static inline bool EmitAtomLetter( |
| 1803 RegExpMacroAssembler* macro_assembler, | 1843 RegExpMacroAssembler* macro_assembler, |
| 1804 bool ascii, | 1844 bool ascii, |
| 1805 uc16 c, | 1845 uc16 c, |
| 1806 Label* on_failure, | 1846 Label* on_failure, |
| 1807 int cp_offset, | 1847 int cp_offset, |
| 1808 bool check, | 1848 bool check, |
| 1809 bool preloaded) { | 1849 bool preloaded) { |
| 1810 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1850 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1811 int length = uncanonicalize.get(c, '\0', chars); | 1851 int length = GetCaseIndependentLetters(c, ascii, chars); |
| 1812 if (length <= 1) return false; | 1852 if (length <= 1) return false; |
| 1813 // We may not need to check against the end of the input string | 1853 // We may not need to check against the end of the input string |
| 1814 // if this character lies before a character that matched. | 1854 // if this character lies before a character that matched. |
| 1815 if (!preloaded) { | 1855 if (!preloaded) { |
| 1816 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1856 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1817 } | 1857 } |
| 1818 Label ok; | 1858 Label ok; |
| 1819 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1859 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| 1820 switch (length) { | 1860 switch (length) { |
| 1821 case 2: { | 1861 case 2: { |
| (...skipping 401 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2223 for (int k = 0; k < elms_->length(); k++) { | 2263 for (int k = 0; k < elms_->length(); k++) { |
| 2224 TextElement elm = elms_->at(k); | 2264 TextElement elm = elms_->at(k); |
| 2225 if (elm.type == TextElement::ATOM) { | 2265 if (elm.type == TextElement::ATOM) { |
| 2226 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2266 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| 2227 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2267 for (int i = 0; i < characters && i < quarks.length(); i++) { |
| 2228 QuickCheckDetails::Position* pos = | 2268 QuickCheckDetails::Position* pos = |
| 2229 details->positions(characters_filled_in); | 2269 details->positions(characters_filled_in); |
| 2230 if (compiler->ignore_case()) { | 2270 if (compiler->ignore_case()) { |
| 2231 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2271 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 2232 uc16 c = quarks[i]; | 2272 uc16 c = quarks[i]; |
| 2233 int length = uncanonicalize.get(c, '\0', chars); | 2273 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
| 2234 if (length < 2) { | 2274 if (length == 0) { |
| 2275 details->set_cannot_match(); | |
| 2276 pos->determines_perfectly = false; | |
| 2277 return; | |
| 2278 } | |
| 2279 if (length == 1) { | |
| 2235 // This letter has no case equivalents, so it's nice and simple | 2280 // This letter has no case equivalents, so it's nice and simple |
| 2236 // and the mask-compare will determine definitely whether we have | 2281 // and the mask-compare will determine definitely whether we have |
| 2237 // a match at this character position. | 2282 // a match at this character position. |
| 2238 pos->mask = char_mask; | 2283 pos->mask = char_mask; |
| 2239 pos->value = c; | 2284 pos->value = c; |
| 2240 pos->determines_perfectly = true; | 2285 pos->determines_perfectly = true; |
| 2241 } else { | 2286 } else { |
| 2242 uint32_t common_bits = char_mask; | 2287 uint32_t common_bits = char_mask; |
| 2243 uint32_t bits = chars[0]; | 2288 uint32_t bits = chars[0]; |
| 2244 for (int j = 1; j < length; j++) { | 2289 for (int j = 1; j < length; j++) { |
| 2245 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); | 2290 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); |
| 2246 common_bits ^= differing_bits; | 2291 common_bits ^= differing_bits; |
| 2247 bits &= common_bits; | 2292 bits &= common_bits; |
| 2248 } | 2293 } |
| 2249 // If length is 2 and common bits has only one zero in it then | 2294 // If length is 2 and common bits has only one zero in it then |
| 2250 // our mask and compare instruction will determine definitely | 2295 // our mask and compare instruction will determine definitely |
| 2251 // whether we have a match at this character position. Otherwise | 2296 // whether we have a match at this character position. Otherwise |
| 2252 // it can only be an approximate check. | 2297 // it can only be an approximate check. |
| 2253 uint32_t one_zero = (common_bits | ~char_mask); | 2298 uint32_t one_zero = (common_bits | ~char_mask); |
| 2254 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { | 2299 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { |
| 2255 pos->determines_perfectly = true; | 2300 pos->determines_perfectly = true; |
| 2256 } | 2301 } |
| 2257 pos->mask = common_bits; | 2302 pos->mask = common_bits; |
| 2258 pos->value = bits; | 2303 pos->value = bits; |
| 2259 } | 2304 } |
| 2260 } else { | 2305 } else { |
| 2261 // Don't ignore case. Nice simple case where the mask-compare will | 2306 // Don't ignore case. Nice simple case where the mask-compare will |
| 2262 // determine definitely whether we have a match at this character | 2307 // determine definitely whether we have a match at this character |
| 2263 // position. | 2308 // position. |
| 2309 if (quarks[i] > char_mask) { | |
| 2310 pos->determines_perfectly = false; | |
| 2311 details->set_cannot_match(); | |
| 2312 return; | |
| 2313 } | |
| 2264 pos->mask = char_mask; | 2314 pos->mask = char_mask; |
| 2265 pos->value = quarks[i]; | 2315 pos->value = quarks[i]; |
| 2266 pos->determines_perfectly = true; | 2316 pos->determines_perfectly = true; |
| 2267 } | 2317 } |
| 2268 characters_filled_in++; | 2318 characters_filled_in++; |
| 2269 ASSERT(characters_filled_in <= details->characters()); | 2319 ASSERT(characters_filled_in <= details->characters()); |
| 2270 if (characters_filled_in == details->characters()) { | 2320 if (characters_filled_in == details->characters()) { |
| 2271 return; | 2321 return; |
| 2272 } | 2322 } |
| 2273 } | 2323 } |
| 2274 } else { | 2324 } else { |
| 2275 QuickCheckDetails::Position* pos = | 2325 QuickCheckDetails::Position* pos = |
| 2276 details->positions(characters_filled_in); | 2326 details->positions(characters_filled_in); |
| 2277 RegExpCharacterClass* tree = elm.data.u_char_class; | 2327 RegExpCharacterClass* tree = elm.data.u_char_class; |
| 2278 ZoneList<CharacterRange>* ranges = tree->ranges(); | 2328 ZoneList<CharacterRange>* ranges = tree->ranges(); |
| 2279 CharacterRange range = ranges->at(0); | |
| 2280 if (tree->is_negated()) { | 2329 if (tree->is_negated()) { |
| 2281 // A quick check uses multi-character mask and compare. There is no | 2330 // A quick check uses multi-character mask and compare. There is no |
| 2282 // useful way to incorporate a negative char class into this scheme | 2331 // useful way to incorporate a negative char class into this scheme |
| 2283 // so we just conservatively create a mask and value that will always | 2332 // so we just conservatively create a mask and value that will always |
| 2284 // succeed. | 2333 // succeed. |
|
Lasse Reichstein
2009/02/06 15:50:56
The comment is a little too pessimistic. A negated
| |
| 2285 pos->mask = 0; | 2334 pos->mask = 0; |
| 2286 pos->value = 0; | 2335 pos->value = 0; |
| 2287 } else { | 2336 } else { |
| 2288 uint32_t differing_bits = (range.from() ^ range.to()); | 2337 int first_range = 0; |
| 2338 while (ranges->at(first_range).from() > char_mask) { | |
| 2339 first_range++; | |
| 2340 if (first_range == ranges->length()) { | |
| 2341 details->set_cannot_match(); | |
| 2342 pos->determines_perfectly = false; | |
| 2343 return; | |
| 2344 } | |
| 2345 } | |
| 2346 CharacterRange range = ranges->at(first_range); | |
| 2347 uc16 from = range.from(); | |
| 2348 uc16 to = range.to(); | |
| 2349 if (to > char_mask) { | |
| 2350 to = char_mask; | |
| 2351 } | |
| 2352 uint32_t differing_bits = (from ^ to); | |
| 2289 // A mask and compare is only perfect if the differing bits form a | 2353 // A mask and compare is only perfect if the differing bits form a |
| 2290 // number like 00011111 with one single block of trailing 1s. | 2354 // number like 00011111 with one single block of trailing 1s. |
| 2291 if ((differing_bits & (differing_bits + 1)) == 0) { | 2355 if ((differing_bits & (differing_bits + 1)) == 0) { |
| 2292 pos->determines_perfectly = true; | 2356 pos->determines_perfectly = true; |
| 2293 } | 2357 } |
| 2294 uint32_t common_bits = ~SmearBitsRight(differing_bits); | 2358 uint32_t common_bits = ~SmearBitsRight(differing_bits); |
| 2295 uint32_t bits = (range.from() & common_bits); | 2359 uint32_t bits = (from & common_bits); |
| 2296 for (int i = 1; i < ranges->length(); i++) { | 2360 for (int i = first_range + 1; i < ranges->length(); i++) { |
| 2361 CharacterRange range = ranges->at(i); | |
| 2362 uc16 from = range.from(); | |
| 2363 uc16 to = range.to(); | |
| 2364 if (from > char_mask) continue; | |
| 2365 if (to > char_mask) to = char_mask; | |
| 2297 // Here we are combining more ranges into the mask and compare | 2366 // Here we are combining more ranges into the mask and compare |
| 2298 // value. With each new range the mask becomes more sparse and | 2367 // value. With each new range the mask becomes more sparse and |
| 2299 // so the chances of a false positive rise. A character class | 2368 // so the chances of a false positive rise. A character class |
| 2300 // with multiple ranges is assumed never to be equivalent to a | 2369 // with multiple ranges is assumed never to be equivalent to a |
| 2301 // mask and compare operation. | 2370 // mask and compare operation. |
| 2302 pos->determines_perfectly = false; | 2371 pos->determines_perfectly = false; |
| 2303 CharacterRange range = ranges->at(i); | 2372 uint32_t new_common_bits = (from ^ to); |
| 2304 uint32_t new_common_bits = (range.from() ^ range.to()); | |
| 2305 new_common_bits = ~SmearBitsRight(new_common_bits); | 2373 new_common_bits = ~SmearBitsRight(new_common_bits); |
| 2306 common_bits &= new_common_bits; | 2374 common_bits &= new_common_bits; |
| 2307 bits &= new_common_bits; | 2375 bits &= new_common_bits; |
| 2308 uint32_t differing_bits = (range.from() & common_bits) ^ bits; | 2376 uint32_t differing_bits = (from & common_bits) ^ bits; |
| 2309 common_bits ^= differing_bits; | 2377 common_bits ^= differing_bits; |
| 2310 bits &= common_bits; | 2378 bits &= common_bits; |
| 2311 } | 2379 } |
| 2312 pos->mask = common_bits; | 2380 pos->mask = common_bits; |
| 2313 pos->value = bits; | 2381 pos->value = bits; |
| 2314 } | 2382 } |
| 2315 characters_filled_in++; | 2383 characters_filled_in++; |
| 2316 ASSERT(characters_filled_in <= details->characters()); | 2384 ASSERT(characters_filled_in <= details->characters()); |
| 2317 if (characters_filled_in == details->characters()) { | 2385 if (characters_filled_in == details->characters()) { |
| 2318 return; | 2386 return; |
| (...skipping 347 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2666 if (pass == NON_ASCII_MATCH) { | 2734 if (pass == NON_ASCII_MATCH) { |
| 2667 ASSERT(ascii); | 2735 ASSERT(ascii); |
| 2668 if (quarks[j] > String::kMaxAsciiCharCode) { | 2736 if (quarks[j] > String::kMaxAsciiCharCode) { |
| 2669 assembler->GoTo(backtrack); | 2737 assembler->GoTo(backtrack); |
| 2670 return; | 2738 return; |
| 2671 } | 2739 } |
| 2672 } else if (pass == CHARACTER_MATCH) { | 2740 } else if (pass == CHARACTER_MATCH) { |
| 2673 if (compiler->ignore_case()) { | 2741 if (compiler->ignore_case()) { |
| 2674 bound_checked = EmitAtomNonLetter(assembler, | 2742 bound_checked = EmitAtomNonLetter(assembler, |
| 2675 quarks[j], | 2743 quarks[j], |
| 2744 ascii, | |
| 2676 backtrack, | 2745 backtrack, |
| 2677 cp_offset + j, | 2746 cp_offset + j, |
| 2678 *checked_up_to < cp_offset + j, | 2747 *checked_up_to < cp_offset + j, |
| 2679 preloaded); | 2748 preloaded); |
| 2680 } else { | 2749 } else { |
| 2681 if (!preloaded) { | 2750 if (!preloaded) { |
| 2682 assembler->LoadCurrentCharacter(cp_offset + j, | 2751 assembler->LoadCurrentCharacter(cp_offset + j, |
| 2683 backtrack, | 2752 backtrack, |
| 2684 *checked_up_to < cp_offset + j); | 2753 *checked_up_to < cp_offset + j); |
| 2685 } | 2754 } |
| (...skipping 2181 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 4867 EmbeddedVector<byte, 1024> codes; | 4936 EmbeddedVector<byte, 1024> codes; |
| 4868 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4937 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 4869 return compiler.Assemble(¯o_assembler, | 4938 return compiler.Assemble(¯o_assembler, |
| 4870 node, | 4939 node, |
| 4871 data->capture_count, | 4940 data->capture_count, |
| 4872 pattern); | 4941 pattern); |
| 4873 } | 4942 } |
| 4874 | 4943 |
| 4875 | 4944 |
| 4876 }} // namespace v8::internal | 4945 }} // namespace v8::internal |
| OLD | NEW |