Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(241)

Side by Side Diff: src/jsregexp.cc

Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1717 matching lines...) Expand 10 before | Expand all | Expand 10 after
1728 trace->backtrack()); 1728 trace->backtrack());
1729 break; 1729 break;
1730 } 1730 }
1731 } 1731 }
1732 1732
1733 1733
1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; 1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; 1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1736 1736
1737 1737
1738 // Returns the number of characters in the equivalence class, omitting those
1739 // that cannot occur in the source string because it is ASCII.
1740 static int GetCaseIndependentLetters(uc16 character,
1741 bool ascii_subject,
1742 unibrow::uchar* letters) {
1743 int length = uncanonicalize.get(character, '\0', letters);
1744 if (length < 2) {
1745 // Unibrow returns 0 or 1 for characters where case independependence is
1746 // trivial.
1747 if (ascii_subject && character > String::kMaxAsciiCharCode) {
1748 return 0; // It's impossible to find a match here.
1749 }
1750 letters[0] = character;
1751 return 1;
1752 }
1753 if (!ascii_subject) {
1754 return length;
1755 }
1756 int characters = 0;
1757 for (int i = 0; i < length; i++) {
1758 if (letters[i] <= String::kMaxAsciiCharCodeU) {
1759 letters[characters++] = letters[i];
1760 }
1761 }
1762 return characters;
1763 }
Lasse Reichstein 2009/02/06 15:50:56 If uncanonicalize implements Canonicalize^-1 o Can
1764
1765
1738 // Only emits non-letters (things that don't have case). Only used for case 1766 // Only emits non-letters (things that don't have case). Only used for case
1739 // independent matches. 1767 // independent matches.
1740 static inline bool EmitAtomNonLetter( 1768 static inline bool EmitAtomNonLetter(
1741 RegExpMacroAssembler* macro_assembler, 1769 RegExpMacroAssembler* macro_assembler,
1742 uc16 c, 1770 uc16 c,
1771 bool ascii,
1743 Label* on_failure, 1772 Label* on_failure,
1744 int cp_offset, 1773 int cp_offset,
1745 bool check, 1774 bool check,
1746 bool preloaded) { 1775 bool preloaded) {
1747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1776 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1748 int length = uncanonicalize.get(c, '\0', chars); 1777 int length = GetCaseIndependentLetters(c, ascii, chars);
1778 if (length < 1) {
1779 // Can't match.
1780 macro_assembler->GoTo(on_failure);
1781 return false; // Bounds not checked.
1782 }
1749 bool checked = false; 1783 bool checked = false;
1750 if (length <= 1) { 1784 // We handle the length > 1 case in another pass.
1785 if (length == 1) {
Lasse Reichstein 2009/02/06 15:50:56 What about the length == 0 case? Could we emit a
1786 if (ascii && c > String::kMaxAsciiCharCodeU) {
Lasse Reichstein 2009/02/06 15:50:56 Wasn't this checked in line 1747, so length will a
1787 // Can't match.
1788 macro_assembler->GoTo(on_failure);
1789 return false; // Bounds not checked.
1790 }
1751 if (!preloaded) { 1791 if (!preloaded) {
1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1792 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1753 checked = check; 1793 checked = check;
1754 } 1794 }
1755 macro_assembler->CheckNotCharacter(c, on_failure); 1795 macro_assembler->CheckNotCharacter(c, on_failure);
1756 } 1796 }
1757 return checked; 1797 return checked;
1758 } 1798 }
1759 1799
1760 1800
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
1801 // matches. 1841 // matches.
1802 static inline bool EmitAtomLetter( 1842 static inline bool EmitAtomLetter(
1803 RegExpMacroAssembler* macro_assembler, 1843 RegExpMacroAssembler* macro_assembler,
1804 bool ascii, 1844 bool ascii,
1805 uc16 c, 1845 uc16 c,
1806 Label* on_failure, 1846 Label* on_failure,
1807 int cp_offset, 1847 int cp_offset,
1808 bool check, 1848 bool check,
1809 bool preloaded) { 1849 bool preloaded) {
1810 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1850 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1811 int length = uncanonicalize.get(c, '\0', chars); 1851 int length = GetCaseIndependentLetters(c, ascii, chars);
1812 if (length <= 1) return false; 1852 if (length <= 1) return false;
1813 // We may not need to check against the end of the input string 1853 // We may not need to check against the end of the input string
1814 // if this character lies before a character that matched. 1854 // if this character lies before a character that matched.
1815 if (!preloaded) { 1855 if (!preloaded) {
1816 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1856 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1817 } 1857 }
1818 Label ok; 1858 Label ok;
1819 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1859 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1820 switch (length) { 1860 switch (length) {
1821 case 2: { 1861 case 2: {
(...skipping 401 matching lines...) Expand 10 before | Expand all | Expand 10 after
2223 for (int k = 0; k < elms_->length(); k++) { 2263 for (int k = 0; k < elms_->length(); k++) {
2224 TextElement elm = elms_->at(k); 2264 TextElement elm = elms_->at(k);
2225 if (elm.type == TextElement::ATOM) { 2265 if (elm.type == TextElement::ATOM) {
2226 Vector<const uc16> quarks = elm.data.u_atom->data(); 2266 Vector<const uc16> quarks = elm.data.u_atom->data();
2227 for (int i = 0; i < characters && i < quarks.length(); i++) { 2267 for (int i = 0; i < characters && i < quarks.length(); i++) {
2228 QuickCheckDetails::Position* pos = 2268 QuickCheckDetails::Position* pos =
2229 details->positions(characters_filled_in); 2269 details->positions(characters_filled_in);
2230 if (compiler->ignore_case()) { 2270 if (compiler->ignore_case()) {
2231 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2271 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2232 uc16 c = quarks[i]; 2272 uc16 c = quarks[i];
2233 int length = uncanonicalize.get(c, '\0', chars); 2273 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
2234 if (length < 2) { 2274 if (length == 0) {
2275 details->set_cannot_match();
2276 pos->determines_perfectly = false;
2277 return;
2278 }
2279 if (length == 1) {
2235 // This letter has no case equivalents, so it's nice and simple 2280 // This letter has no case equivalents, so it's nice and simple
2236 // and the mask-compare will determine definitely whether we have 2281 // and the mask-compare will determine definitely whether we have
2237 // a match at this character position. 2282 // a match at this character position.
2238 pos->mask = char_mask; 2283 pos->mask = char_mask;
2239 pos->value = c; 2284 pos->value = c;
2240 pos->determines_perfectly = true; 2285 pos->determines_perfectly = true;
2241 } else { 2286 } else {
2242 uint32_t common_bits = char_mask; 2287 uint32_t common_bits = char_mask;
2243 uint32_t bits = chars[0]; 2288 uint32_t bits = chars[0];
2244 for (int j = 1; j < length; j++) { 2289 for (int j = 1; j < length; j++) {
2245 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2290 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2246 common_bits ^= differing_bits; 2291 common_bits ^= differing_bits;
2247 bits &= common_bits; 2292 bits &= common_bits;
2248 } 2293 }
2249 // If length is 2 and common bits has only one zero in it then 2294 // If length is 2 and common bits has only one zero in it then
2250 // our mask and compare instruction will determine definitely 2295 // our mask and compare instruction will determine definitely
2251 // whether we have a match at this character position. Otherwise 2296 // whether we have a match at this character position. Otherwise
2252 // it can only be an approximate check. 2297 // it can only be an approximate check.
2253 uint32_t one_zero = (common_bits | ~char_mask); 2298 uint32_t one_zero = (common_bits | ~char_mask);
2254 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2299 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2255 pos->determines_perfectly = true; 2300 pos->determines_perfectly = true;
2256 } 2301 }
2257 pos->mask = common_bits; 2302 pos->mask = common_bits;
2258 pos->value = bits; 2303 pos->value = bits;
2259 } 2304 }
2260 } else { 2305 } else {
2261 // Don't ignore case. Nice simple case where the mask-compare will 2306 // Don't ignore case. Nice simple case where the mask-compare will
2262 // determine definitely whether we have a match at this character 2307 // determine definitely whether we have a match at this character
2263 // position. 2308 // position.
2309 if (quarks[i] > char_mask) {
2310 pos->determines_perfectly = false;
2311 details->set_cannot_match();
2312 return;
2313 }
2264 pos->mask = char_mask; 2314 pos->mask = char_mask;
2265 pos->value = quarks[i]; 2315 pos->value = quarks[i];
2266 pos->determines_perfectly = true; 2316 pos->determines_perfectly = true;
2267 } 2317 }
2268 characters_filled_in++; 2318 characters_filled_in++;
2269 ASSERT(characters_filled_in <= details->characters()); 2319 ASSERT(characters_filled_in <= details->characters());
2270 if (characters_filled_in == details->characters()) { 2320 if (characters_filled_in == details->characters()) {
2271 return; 2321 return;
2272 } 2322 }
2273 } 2323 }
2274 } else { 2324 } else {
2275 QuickCheckDetails::Position* pos = 2325 QuickCheckDetails::Position* pos =
2276 details->positions(characters_filled_in); 2326 details->positions(characters_filled_in);
2277 RegExpCharacterClass* tree = elm.data.u_char_class; 2327 RegExpCharacterClass* tree = elm.data.u_char_class;
2278 ZoneList<CharacterRange>* ranges = tree->ranges(); 2328 ZoneList<CharacterRange>* ranges = tree->ranges();
2279 CharacterRange range = ranges->at(0);
2280 if (tree->is_negated()) { 2329 if (tree->is_negated()) {
2281 // A quick check uses multi-character mask and compare. There is no 2330 // A quick check uses multi-character mask and compare. There is no
2282 // useful way to incorporate a negative char class into this scheme 2331 // useful way to incorporate a negative char class into this scheme
2283 // so we just conservatively create a mask and value that will always 2332 // so we just conservatively create a mask and value that will always
2284 // succeed. 2333 // succeed.
Lasse Reichstein 2009/02/06 15:50:56 The comment is a little too pessimistic. A negated
2285 pos->mask = 0; 2334 pos->mask = 0;
2286 pos->value = 0; 2335 pos->value = 0;
2287 } else { 2336 } else {
2288 uint32_t differing_bits = (range.from() ^ range.to()); 2337 int first_range = 0;
2338 while (ranges->at(first_range).from() > char_mask) {
2339 first_range++;
2340 if (first_range == ranges->length()) {
2341 details->set_cannot_match();
2342 pos->determines_perfectly = false;
2343 return;
2344 }
2345 }
2346 CharacterRange range = ranges->at(first_range);
2347 uc16 from = range.from();
2348 uc16 to = range.to();
2349 if (to > char_mask) {
2350 to = char_mask;
2351 }
2352 uint32_t differing_bits = (from ^ to);
2289 // A mask and compare is only perfect if the differing bits form a 2353 // A mask and compare is only perfect if the differing bits form a
2290 // number like 00011111 with one single block of trailing 1s. 2354 // number like 00011111 with one single block of trailing 1s.
2291 if ((differing_bits & (differing_bits + 1)) == 0) { 2355 if ((differing_bits & (differing_bits + 1)) == 0) {
2292 pos->determines_perfectly = true; 2356 pos->determines_perfectly = true;
2293 } 2357 }
2294 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2358 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2295 uint32_t bits = (range.from() & common_bits); 2359 uint32_t bits = (from & common_bits);
2296 for (int i = 1; i < ranges->length(); i++) { 2360 for (int i = first_range + 1; i < ranges->length(); i++) {
2361 CharacterRange range = ranges->at(i);
2362 uc16 from = range.from();
2363 uc16 to = range.to();
2364 if (from > char_mask) continue;
2365 if (to > char_mask) to = char_mask;
2297 // Here we are combining more ranges into the mask and compare 2366 // Here we are combining more ranges into the mask and compare
2298 // value. With each new range the mask becomes more sparse and 2367 // value. With each new range the mask becomes more sparse and
2299 // so the chances of a false positive rise. A character class 2368 // so the chances of a false positive rise. A character class
2300 // with multiple ranges is assumed never to be equivalent to a 2369 // with multiple ranges is assumed never to be equivalent to a
2301 // mask and compare operation. 2370 // mask and compare operation.
2302 pos->determines_perfectly = false; 2371 pos->determines_perfectly = false;
2303 CharacterRange range = ranges->at(i); 2372 uint32_t new_common_bits = (from ^ to);
2304 uint32_t new_common_bits = (range.from() ^ range.to());
2305 new_common_bits = ~SmearBitsRight(new_common_bits); 2373 new_common_bits = ~SmearBitsRight(new_common_bits);
2306 common_bits &= new_common_bits; 2374 common_bits &= new_common_bits;
2307 bits &= new_common_bits; 2375 bits &= new_common_bits;
2308 uint32_t differing_bits = (range.from() & common_bits) ^ bits; 2376 uint32_t differing_bits = (from & common_bits) ^ bits;
2309 common_bits ^= differing_bits; 2377 common_bits ^= differing_bits;
2310 bits &= common_bits; 2378 bits &= common_bits;
2311 } 2379 }
2312 pos->mask = common_bits; 2380 pos->mask = common_bits;
2313 pos->value = bits; 2381 pos->value = bits;
2314 } 2382 }
2315 characters_filled_in++; 2383 characters_filled_in++;
2316 ASSERT(characters_filled_in <= details->characters()); 2384 ASSERT(characters_filled_in <= details->characters());
2317 if (characters_filled_in == details->characters()) { 2385 if (characters_filled_in == details->characters()) {
2318 return; 2386 return;
(...skipping 347 matching lines...) Expand 10 before | Expand all | Expand 10 after
2666 if (pass == NON_ASCII_MATCH) { 2734 if (pass == NON_ASCII_MATCH) {
2667 ASSERT(ascii); 2735 ASSERT(ascii);
2668 if (quarks[j] > String::kMaxAsciiCharCode) { 2736 if (quarks[j] > String::kMaxAsciiCharCode) {
2669 assembler->GoTo(backtrack); 2737 assembler->GoTo(backtrack);
2670 return; 2738 return;
2671 } 2739 }
2672 } else if (pass == CHARACTER_MATCH) { 2740 } else if (pass == CHARACTER_MATCH) {
2673 if (compiler->ignore_case()) { 2741 if (compiler->ignore_case()) {
2674 bound_checked = EmitAtomNonLetter(assembler, 2742 bound_checked = EmitAtomNonLetter(assembler,
2675 quarks[j], 2743 quarks[j],
2744 ascii,
2676 backtrack, 2745 backtrack,
2677 cp_offset + j, 2746 cp_offset + j,
2678 *checked_up_to < cp_offset + j, 2747 *checked_up_to < cp_offset + j,
2679 preloaded); 2748 preloaded);
2680 } else { 2749 } else {
2681 if (!preloaded) { 2750 if (!preloaded) {
2682 assembler->LoadCurrentCharacter(cp_offset + j, 2751 assembler->LoadCurrentCharacter(cp_offset + j,
2683 backtrack, 2752 backtrack,
2684 *checked_up_to < cp_offset + j); 2753 *checked_up_to < cp_offset + j);
2685 } 2754 }
(...skipping 2181 matching lines...) Expand 10 before | Expand all | Expand 10 after
4867 EmbeddedVector<byte, 1024> codes; 4936 EmbeddedVector<byte, 1024> codes;
4868 RegExpMacroAssemblerIrregexp macro_assembler(codes); 4937 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4869 return compiler.Assemble(&macro_assembler, 4938 return compiler.Assemble(&macro_assembler,
4870 node, 4939 node,
4871 data->capture_count, 4940 data->capture_count,
4872 pattern); 4941 pattern);
4873 } 4942 }
4874 4943
4875 4944
4876 }} // namespace v8::internal 4945 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698