Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 1235) |
| +++ src/jsregexp.cc (working copy) |
| @@ -1735,19 +1735,59 @@ |
| static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
| +// Returns the number of characters in the equivalence class, omitting those |
| +// that cannot occur in the source string because it is ASCII. |
| +static int GetCaseIndependentLetters(uc16 character, |
| + bool ascii_subject, |
| + unibrow::uchar* letters) { |
| + int length = uncanonicalize.get(character, '\0', letters); |
| + if (length < 2) { |
| + // Unibrow returns 0 or 1 for characters where case independependence is |
| + // trivial. |
| + if (ascii_subject && character > String::kMaxAsciiCharCode) { |
| + return 0; // It's impossible to find a match here. |
| + } |
| + letters[0] = character; |
| + return 1; |
| + } |
| + if (!ascii_subject) { |
| + return length; |
| + } |
| + int characters = 0; |
| + for (int i = 0; i < length; i++) { |
| + if (letters[i] <= String::kMaxAsciiCharCodeU) { |
| + letters[characters++] = letters[i]; |
| + } |
| + } |
| + return characters; |
| +} |
|
Lasse Reichstein
2009/02/06 15:50:56
If uncanonicalize implements Canonicalize^-1 o Can
|
| + |
| + |
| // Only emits non-letters (things that don't have case). Only used for case |
| // independent matches. |
| static inline bool EmitAtomNonLetter( |
| RegExpMacroAssembler* macro_assembler, |
| uc16 c, |
| + bool ascii, |
| Label* on_failure, |
| int cp_offset, |
| bool check, |
| bool preloaded) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| + int length = GetCaseIndependentLetters(c, ascii, chars); |
| + if (length < 1) { |
| + // Can't match. |
| + macro_assembler->GoTo(on_failure); |
| + return false; // Bounds not checked. |
| + } |
| bool checked = false; |
| - if (length <= 1) { |
| + // We handle the length > 1 case in another pass. |
| + if (length == 1) { |
|
Lasse Reichstein
2009/02/06 15:50:56
What about the length == 0 case?
Could we emit a
|
| + if (ascii && c > String::kMaxAsciiCharCodeU) { |
|
Lasse Reichstein
2009/02/06 15:50:56
Wasn't this checked in line 1747, so length will a
|
| + // Can't match. |
| + macro_assembler->GoTo(on_failure); |
| + return false; // Bounds not checked. |
| + } |
| if (!preloaded) { |
| macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| checked = check; |
| @@ -1808,7 +1848,7 @@ |
| bool check, |
| bool preloaded) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| + int length = GetCaseIndependentLetters(c, ascii, chars); |
| if (length <= 1) return false; |
| // We may not need to check against the end of the input string |
| // if this character lies before a character that matched. |
| @@ -2230,8 +2270,13 @@ |
| if (compiler->ignore_case()) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| uc16 c = quarks[i]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| - if (length < 2) { |
| + int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
| + if (length == 0) { |
| + details->set_cannot_match(); |
| + pos->determines_perfectly = false; |
| + return; |
| + } |
| + if (length == 1) { |
| // This letter has no case equivalents, so it's nice and simple |
| // and the mask-compare will determine definitely whether we have |
| // a match at this character position. |
| @@ -2261,6 +2306,11 @@ |
| // Don't ignore case. Nice simple case where the mask-compare will |
| // determine definitely whether we have a match at this character |
| // position. |
| + if (quarks[i] > char_mask) { |
| + pos->determines_perfectly = false; |
| + details->set_cannot_match(); |
| + return; |
| + } |
| pos->mask = char_mask; |
| pos->value = quarks[i]; |
| pos->determines_perfectly = true; |
| @@ -2276,7 +2326,6 @@ |
| details->positions(characters_filled_in); |
| RegExpCharacterClass* tree = elm.data.u_char_class; |
| ZoneList<CharacterRange>* ranges = tree->ranges(); |
| - CharacterRange range = ranges->at(0); |
| if (tree->is_negated()) { |
| // A quick check uses multi-character mask and compare. There is no |
| // useful way to incorporate a negative char class into this scheme |
| @@ -2285,27 +2334,46 @@ |
| pos->mask = 0; |
| pos->value = 0; |
| } else { |
| - uint32_t differing_bits = (range.from() ^ range.to()); |
| + int first_range = 0; |
| + while (ranges->at(first_range).from() > char_mask) { |
| + first_range++; |
| + if (first_range == ranges->length()) { |
| + details->set_cannot_match(); |
| + pos->determines_perfectly = false; |
| + return; |
| + } |
| + } |
| + CharacterRange range = ranges->at(first_range); |
| + uc16 from = range.from(); |
| + uc16 to = range.to(); |
| + if (to > char_mask) { |
| + to = char_mask; |
| + } |
| + uint32_t differing_bits = (from ^ to); |
| // A mask and compare is only perfect if the differing bits form a |
| // number like 00011111 with one single block of trailing 1s. |
| if ((differing_bits & (differing_bits + 1)) == 0) { |
| pos->determines_perfectly = true; |
| } |
| uint32_t common_bits = ~SmearBitsRight(differing_bits); |
| - uint32_t bits = (range.from() & common_bits); |
| - for (int i = 1; i < ranges->length(); i++) { |
| + uint32_t bits = (from & common_bits); |
| + for (int i = first_range + 1; i < ranges->length(); i++) { |
| + CharacterRange range = ranges->at(i); |
| + uc16 from = range.from(); |
| + uc16 to = range.to(); |
| + if (from > char_mask) continue; |
| + if (to > char_mask) to = char_mask; |
| // Here we are combining more ranges into the mask and compare |
| // value. With each new range the mask becomes more sparse and |
| // so the chances of a false positive rise. A character class |
| // with multiple ranges is assumed never to be equivalent to a |
| // mask and compare operation. |
| pos->determines_perfectly = false; |
| - CharacterRange range = ranges->at(i); |
| - uint32_t new_common_bits = (range.from() ^ range.to()); |
| + uint32_t new_common_bits = (from ^ to); |
| new_common_bits = ~SmearBitsRight(new_common_bits); |
| common_bits &= new_common_bits; |
| bits &= new_common_bits; |
| - uint32_t differing_bits = (range.from() & common_bits) ^ bits; |
| + uint32_t differing_bits = (from & common_bits) ^ bits; |
| common_bits ^= differing_bits; |
| bits &= common_bits; |
| } |
| @@ -2673,6 +2741,7 @@ |
| if (compiler->ignore_case()) { |
| bound_checked = EmitAtomNonLetter(assembler, |
| quarks[j], |
| + ascii, |
| backtrack, |
| cp_offset + j, |
| *checked_up_to < cp_offset + j, |