src/jsregexp.cc - Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings.

Side by Side Diff: src/jsregexp.cc

Issue 21126: Fix some glitches around unicode regexps matching ascii subject strings. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 1717 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1728 trace->backtrack());	1728 trace->backtrack());

1729 break;	1729 break;

1730 }	1730 }

1731 }	1731 }

1732	1732

1733	1733

1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;	1734 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;	1735 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

1736	1736

1737	1737

	1738 // Returns the number of characters in the equivalence class, omitting those

	1739 // that cannot occur in the source string because it is ASCII.

	1740 static int GetCaseIndependentLetters(uc16 character,

	1741 bool ascii_subject,

	1742 unibrow::uchar* letters) {

	1743 int length = uncanonicalize.get(character, '\0', letters);

	1744 if (length < 2) {

	1745 // Unibrow returns 0 or 1 for characters where case independependence is

	1746 // trivial.

	1747 if (ascii_subject && character > String::kMaxAsciiCharCode) {

	1748 return 0; // It's impossible to find a match here.

	1749 }

	1750 letters[0] = character;

	1751 return 1;

	1752 }

	1753 if (!ascii_subject) {

	1754 return length;

	1755 }

	1756 int characters = 0;

	1757 for (int i = 0; i < length; i++) {

	1758 if (letters[i] <= String::kMaxAsciiCharCodeU) {

	1759 letters[characters++] = letters[i];

	1760 }

	1761 }

	1762 return characters;

	1763 }
	Lasse Reichstein 2009/02/06 15:50:56 If uncanonicalize implements Canonicalize^-1 o Can If uncanonicalize implements Canonicalize^-1 o Canonicalize (where Canonicalize is according to the specification), then the last loop is not necessary. If "character" is ASCII, so is every letter in "letters", and if "character" is not ASCII, neither are any letter in "letters". I.e., the loop can be replaced by: if (character <= String::kMaxAsciiCharCode) { return length; } return 0;
	1764

	1765

1738 // Only emits non-letters (things that don't have case). Only used for case	1766 // Only emits non-letters (things that don't have case). Only used for case

1739 // independent matches.	1767 // independent matches.

1740 static inline bool EmitAtomNonLetter(	1768 static inline bool EmitAtomNonLetter(

1741 RegExpMacroAssembler* macro_assembler,	1769 RegExpMacroAssembler* macro_assembler,

1742 uc16 c,	1770 uc16 c,

	1771 bool ascii,

1743 Label* on_failure,	1772 Label* on_failure,

1744 int cp_offset,	1773 int cp_offset,

1745 bool check,	1774 bool check,

1746 bool preloaded) {	1775 bool preloaded) {

1747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1776 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1748 int length = uncanonicalize.get(c, '\0', chars);	1777 int length = GetCaseIndependentLetters(c, ascii, chars);

	1778 if (length < 1) {

	1779 // Can't match.

	1780 macro_assembler->GoTo(on_failure);

	1781 return false; // Bounds not checked.

	1782 }

1749 bool checked = false;	1783 bool checked = false;

1750 if (length <= 1) {	1784 // We handle the length > 1 case in another pass.

	1785 if (length == 1) {
	Lasse Reichstein 2009/02/06 15:50:56 What about the length == 0 case? Could we emit a What about the length == 0 case? Could we emit a fail or backtrack immediately in that case? Is is that handled by the quick check details so we never get here?
	1786 if (ascii && c > String::kMaxAsciiCharCodeU) {
	Lasse Reichstein 2009/02/06 15:50:56 Wasn't this checked in line 1747, so length will a Wasn't this checked in line 1747, so length will always be 0 if this test would be successful?
	1787 // Can't match.

	1788 macro_assembler->GoTo(on_failure);

	1789 return false; // Bounds not checked.

	1790 }

1751 if (!preloaded) {	1791 if (!preloaded) {

1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1792 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1753 checked = check;	1793 checked = check;

1754 }	1794 }

1755 macro_assembler->CheckNotCharacter(c, on_failure);	1795 macro_assembler->CheckNotCharacter(c, on_failure);

1756 }	1796 }

1757 return checked;	1797 return checked;

1758 }	1798 }

1759	1799

1760	1800

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1801 // matches.	1841 // matches.

1802 static inline bool EmitAtomLetter(	1842 static inline bool EmitAtomLetter(

1803 RegExpMacroAssembler* macro_assembler,	1843 RegExpMacroAssembler* macro_assembler,

1804 bool ascii,	1844 bool ascii,

1805 uc16 c,	1845 uc16 c,

1806 Label* on_failure,	1846 Label* on_failure,

1807 int cp_offset,	1847 int cp_offset,

1808 bool check,	1848 bool check,

1809 bool preloaded) {	1849 bool preloaded) {

1810 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1850 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1811 int length = uncanonicalize.get(c, '\0', chars);	1851 int length = GetCaseIndependentLetters(c, ascii, chars);

1812 if (length <= 1) return false;	1852 if (length <= 1) return false;

1813 // We may not need to check against the end of the input string	1853 // We may not need to check against the end of the input string

1814 // if this character lies before a character that matched.	1854 // if this character lies before a character that matched.

1815 if (!preloaded) {	1855 if (!preloaded) {

1816 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1856 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1817 }	1857 }

1818 Label ok;	1858 Label ok;

1819 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);	1859 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1820 switch (length) {	1860 switch (length) {

1821 case 2: {	1861 case 2: {

(...skipping 401 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2223 for (int k = 0; k < elms_->length(); k++) {	2263 for (int k = 0; k < elms_->length(); k++) {

2224 TextElement elm = elms_->at(k);	2264 TextElement elm = elms_->at(k);

2225 if (elm.type == TextElement::ATOM) {	2265 if (elm.type == TextElement::ATOM) {

2226 Vector<const uc16> quarks = elm.data.u_atom->data();	2266 Vector<const uc16> quarks = elm.data.u_atom->data();

2227 for (int i = 0; i < characters && i < quarks.length(); i++) {	2267 for (int i = 0; i < characters && i < quarks.length(); i++) {

2228 QuickCheckDetails::Position* pos =	2268 QuickCheckDetails::Position* pos =

2229 details->positions(characters_filled_in);	2269 details->positions(characters_filled_in);

2230 if (compiler->ignore_case()) {	2270 if (compiler->ignore_case()) {

2231 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2271 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2232 uc16 c = quarks[i];	2272 uc16 c = quarks[i];

2233 int length = uncanonicalize.get(c, '\0', chars);	2273 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);

2234 if (length < 2) {	2274 if (length == 0) {

	2275 details->set_cannot_match();

	2276 pos->determines_perfectly = false;

	2277 return;

	2278 }

	2279 if (length == 1) {

2235 // This letter has no case equivalents, so it's nice and simple	2280 // This letter has no case equivalents, so it's nice and simple

2236 // and the mask-compare will determine definitely whether we have	2281 // and the mask-compare will determine definitely whether we have

2237 // a match at this character position.	2282 // a match at this character position.

2238 pos->mask = char_mask;	2283 pos->mask = char_mask;

2239 pos->value = c;	2284 pos->value = c;

2240 pos->determines_perfectly = true;	2285 pos->determines_perfectly = true;

2241 } else {	2286 } else {

2242 uint32_t common_bits = char_mask;	2287 uint32_t common_bits = char_mask;

2243 uint32_t bits = chars[0];	2288 uint32_t bits = chars[0];

2244 for (int j = 1; j < length; j++) {	2289 for (int j = 1; j < length; j++) {

2245 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);	2290 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);

2246 common_bits ^= differing_bits;	2291 common_bits ^= differing_bits;

2247 bits &= common_bits;	2292 bits &= common_bits;

2248 }	2293 }

2249 // If length is 2 and common bits has only one zero in it then	2294 // If length is 2 and common bits has only one zero in it then

2250 // our mask and compare instruction will determine definitely	2295 // our mask and compare instruction will determine definitely

2251 // whether we have a match at this character position. Otherwise	2296 // whether we have a match at this character position. Otherwise

2252 // it can only be an approximate check.	2297 // it can only be an approximate check.

2253 uint32_t one_zero = (common_bits \| ~char_mask);	2298 uint32_t one_zero = (common_bits \| ~char_mask);

2254 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {	2299 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {

2255 pos->determines_perfectly = true;	2300 pos->determines_perfectly = true;

2256 }	2301 }

2257 pos->mask = common_bits;	2302 pos->mask = common_bits;

2258 pos->value = bits;	2303 pos->value = bits;

2259 }	2304 }

2260 } else {	2305 } else {

2261 // Don't ignore case. Nice simple case where the mask-compare will	2306 // Don't ignore case. Nice simple case where the mask-compare will

2262 // determine definitely whether we have a match at this character	2307 // determine definitely whether we have a match at this character

2263 // position.	2308 // position.

	2309 if (quarks[i] > char_mask) {

	2310 pos->determines_perfectly = false;

	2311 details->set_cannot_match();

	2312 return;

	2313 }

2264 pos->mask = char_mask;	2314 pos->mask = char_mask;

2265 pos->value = quarks[i];	2315 pos->value = quarks[i];

2266 pos->determines_perfectly = true;	2316 pos->determines_perfectly = true;

2267 }	2317 }

2268 characters_filled_in++;	2318 characters_filled_in++;

2269 ASSERT(characters_filled_in <= details->characters());	2319 ASSERT(characters_filled_in <= details->characters());

2270 if (characters_filled_in == details->characters()) {	2320 if (characters_filled_in == details->characters()) {

2271 return;	2321 return;

2272 }	2322 }

2273 }	2323 }

2274 } else {	2324 } else {

2275 QuickCheckDetails::Position* pos =	2325 QuickCheckDetails::Position* pos =

2276 details->positions(characters_filled_in);	2326 details->positions(characters_filled_in);

2277 RegExpCharacterClass* tree = elm.data.u_char_class;	2327 RegExpCharacterClass* tree = elm.data.u_char_class;

2278 ZoneList<CharacterRange>* ranges = tree->ranges();	2328 ZoneList<CharacterRange>* ranges = tree->ranges();

2279 CharacterRange range = ranges->at(0);

2280 if (tree->is_negated()) {	2329 if (tree->is_negated()) {

2281 // A quick check uses multi-character mask and compare. There is no	2330 // A quick check uses multi-character mask and compare. There is no

2282 // useful way to incorporate a negative char class into this scheme	2331 // useful way to incorporate a negative char class into this scheme

2283 // so we just conservatively create a mask and value that will always	2332 // so we just conservatively create a mask and value that will always

2284 // succeed.	2333 // succeed.
	Lasse Reichstein 2009/02/06 15:50:56 The comment is a little too pessimistic. A negated The comment is a little too pessimistic. A negated character class is completely equivalent to a non-negated one with at most one more range in its representation. If nothing else, we could canonicalize character classes to only sorted non-overlapping positive ranges.
2285 pos->mask = 0;	2334 pos->mask = 0;

2286 pos->value = 0;	2335 pos->value = 0;

2287 } else {	2336 } else {

2288 uint32_t differing_bits = (range.from() ^ range.to());	2337 int first_range = 0;

	2338 while (ranges->at(first_range).from() > char_mask) {

	2339 first_range++;

	2340 if (first_range == ranges->length()) {

	2341 details->set_cannot_match();

	2342 pos->determines_perfectly = false;

	2343 return;

	2344 }

	2345 }

	2346 CharacterRange range = ranges->at(first_range);

	2347 uc16 from = range.from();

	2348 uc16 to = range.to();

	2349 if (to > char_mask) {

	2350 to = char_mask;

	2351 }

	2352 uint32_t differing_bits = (from ^ to);

2289 // A mask and compare is only perfect if the differing bits form a	2353 // A mask and compare is only perfect if the differing bits form a

2290 // number like 00011111 with one single block of trailing 1s.	2354 // number like 00011111 with one single block of trailing 1s.

2291 if ((differing_bits & (differing_bits + 1)) == 0) {	2355 if ((differing_bits & (differing_bits + 1)) == 0) {

2292 pos->determines_perfectly = true;	2356 pos->determines_perfectly = true;

2293 }	2357 }

2294 uint32_t common_bits = ~SmearBitsRight(differing_bits);	2358 uint32_t common_bits = ~SmearBitsRight(differing_bits);

2295 uint32_t bits = (range.from() & common_bits);	2359 uint32_t bits = (from & common_bits);

2296 for (int i = 1; i < ranges->length(); i++) {	2360 for (int i = first_range + 1; i < ranges->length(); i++) {

	2361 CharacterRange range = ranges->at(i);

	2362 uc16 from = range.from();

	2363 uc16 to = range.to();

	2364 if (from > char_mask) continue;

	2365 if (to > char_mask) to = char_mask;

2297 // Here we are combining more ranges into the mask and compare	2366 // Here we are combining more ranges into the mask and compare

2298 // value. With each new range the mask becomes more sparse and	2367 // value. With each new range the mask becomes more sparse and

2299 // so the chances of a false positive rise. A character class	2368 // so the chances of a false positive rise. A character class

2300 // with multiple ranges is assumed never to be equivalent to a	2369 // with multiple ranges is assumed never to be equivalent to a

2301 // mask and compare operation.	2370 // mask and compare operation.

2302 pos->determines_perfectly = false;	2371 pos->determines_perfectly = false;

2303 CharacterRange range = ranges->at(i);	2372 uint32_t new_common_bits = (from ^ to);

2304 uint32_t new_common_bits = (range.from() ^ range.to());

2305 new_common_bits = ~SmearBitsRight(new_common_bits);	2373 new_common_bits = ~SmearBitsRight(new_common_bits);

2306 common_bits &= new_common_bits;	2374 common_bits &= new_common_bits;

2307 bits &= new_common_bits;	2375 bits &= new_common_bits;

2308 uint32_t differing_bits = (range.from() & common_bits) ^ bits;	2376 uint32_t differing_bits = (from & common_bits) ^ bits;

2309 common_bits ^= differing_bits;	2377 common_bits ^= differing_bits;

2310 bits &= common_bits;	2378 bits &= common_bits;

2311 }	2379 }

2312 pos->mask = common_bits;	2380 pos->mask = common_bits;

2313 pos->value = bits;	2381 pos->value = bits;

2314 }	2382 }

2315 characters_filled_in++;	2383 characters_filled_in++;

2316 ASSERT(characters_filled_in <= details->characters());	2384 ASSERT(characters_filled_in <= details->characters());

2317 if (characters_filled_in == details->characters()) {	2385 if (characters_filled_in == details->characters()) {

2318 return;	2386 return;

(...skipping 347 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2666 if (pass == NON_ASCII_MATCH) {	2734 if (pass == NON_ASCII_MATCH) {

2667 ASSERT(ascii);	2735 ASSERT(ascii);

2668 if (quarks[j] > String::kMaxAsciiCharCode) {	2736 if (quarks[j] > String::kMaxAsciiCharCode) {

2669 assembler->GoTo(backtrack);	2737 assembler->GoTo(backtrack);

2670 return;	2738 return;

2671 }	2739 }

2672 } else if (pass == CHARACTER_MATCH) {	2740 } else if (pass == CHARACTER_MATCH) {

2673 if (compiler->ignore_case()) {	2741 if (compiler->ignore_case()) {

2674 bound_checked = EmitAtomNonLetter(assembler,	2742 bound_checked = EmitAtomNonLetter(assembler,

2675 quarks[j],	2743 quarks[j],

	2744 ascii,

2676 backtrack,	2745 backtrack,

2677 cp_offset + j,	2746 cp_offset + j,

2678 *checked_up_to < cp_offset + j,	2747 *checked_up_to < cp_offset + j,

2679 preloaded);	2748 preloaded);

2680 } else {	2749 } else {

2681 if (!preloaded) {	2750 if (!preloaded) {

2682 assembler->LoadCurrentCharacter(cp_offset + j,	2751 assembler->LoadCurrentCharacter(cp_offset + j,

2683 backtrack,	2752 backtrack,

2684 *checked_up_to < cp_offset + j);	2753 *checked_up_to < cp_offset + j);

2685 }	2754 }

(...skipping 2181 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4867 EmbeddedVector<byte, 1024> codes;	4936 EmbeddedVector<byte, 1024> codes;

4868 RegExpMacroAssemblerIrregexp macro_assembler(codes);	4937 RegExpMacroAssemblerIrregexp macro_assembler(codes);

4869 return compiler.Assemble(&macro_assembler,	4938 return compiler.Assemble(&macro_assembler,

4870 node,	4939 node,

4871 data->capture_count,	4940 data->capture_count,

4872 pattern);	4941 pattern);

4873 }	4942 }

4874	4943

4875	4944

4876 }} // namespace v8::internal	4945 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | src/objects.h » ('j') | src/objects.h » ('J')