Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 1275) |
| +++ src/jsregexp.cc (working copy) |
| @@ -1739,19 +1739,51 @@ |
| static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
| +// Returns the number of characters in the equivalence class, omitting those |
| +// that cannot occur in the source string because it is ASCII. |
| +static int GetCaseIndependentLetters(uc16 character, |
| + bool ascii_subject, |
| + unibrow::uchar* letters) { |
| + int length = uncanonicalize.get(character, '\0', letters); |
| + // Unibrow returns 0 or 1 for characters where case independependence is |
| + // trivial. |
| + if (length == 0) { |
| + letters[0] = character; |
| + length = 1; |
| + } |
| + if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
| + return length; |
| + } |
| + // The standard requires that non-ASCII characters cannot have ASCII |
| + // character codes in their equivalence class. |
| + return 0; |
| +} |
| + |
| + |
| // Only emits non-letters (things that don't have case). Only used for case |
| // independent matches. |
| static inline bool EmitAtomNonLetter( |
| RegExpMacroAssembler* macro_assembler, |
| uc16 c, |
| + bool ascii, |
| Label* on_failure, |
| int cp_offset, |
| bool check, |
| bool preloaded) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| + int length = GetCaseIndependentLetters(c, ascii, chars); |
| + if (length < 1) { |
| + // This can't match. Must be an ASCII subject and a non-ASCII character. |
| + // We do not need to do anything since the ASCII pass already handled this. |
| + return false; // Bounds not checked. |
| + } |
| bool checked = false; |
| - if (length <= 1) { |
| + // We handle the length > 1 case in a later pass. |
| + if (length == 1) { |
| + if (ascii && c > String::kMaxAsciiCharCodeU) { |
| + // Can't match - see above. |
| + return false; // Bounds not checked. |
| + } |
| if (!preloaded) { |
| macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| checked = check; |
| @@ -1805,14 +1837,14 @@ |
| // matches. |
| static inline bool EmitAtomLetter( |
| RegExpMacroAssembler* macro_assembler, |
| + uc16 c, |
| bool ascii, |
| - uc16 c, |
| Label* on_failure, |
| int cp_offset, |
| bool check, |
| bool preloaded) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| + int length = GetCaseIndependentLetters(c, ascii, chars); |
| if (length <= 1) return false; |
| // We may not need to check against the end of the input string |
| // if this character lies before a character that matched. |
| @@ -1854,10 +1886,10 @@ |
| static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| RegExpCharacterClass* cc, |
| + bool ascii, |
| + Label* on_failure, |
| int cp_offset, |
| - Label* on_failure, |
| bool check_offset, |
| - bool ascii, |
| bool preloaded) { |
| if (cc->is_standard() && |
| macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
| @@ -2238,12 +2270,14 @@ |
| // matching can turn an ASCII character into non-ASCII or |
| // vice versa. |
| details->set_cannot_match(); |
| + pos->determines_perfectly = false; |
|
Lasse Reichstein
2009/02/18 15:20:44
Should this be set to false? If the remaining case
Erik Corry
2009/02/18 16:04:24
Determines perfectly indicates that this particula
|
| return; |
| } |
| if (compiler->ignore_case()) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = uncanonicalize.get(c, '\0', chars); |
| - if (length < 2) { |
| + int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
| + ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
| + if (length == 1) { |
| // This letter has no case equivalents, so it's nice and simple |
| // and the mask-compare will determine definitely whether we have |
| // a match at this character position. |
| @@ -2288,7 +2322,6 @@ |
| details->positions(characters_filled_in); |
| RegExpCharacterClass* tree = elm.data.u_char_class; |
| ZoneList<CharacterRange>* ranges = tree->ranges(); |
| - CharacterRange range = ranges->at(0); |
| if (tree->is_negated()) { |
| // A quick check uses multi-character mask and compare. There is no |
| // useful way to incorporate a negative char class into this scheme |
| @@ -2297,27 +2330,46 @@ |
| pos->mask = 0; |
| pos->value = 0; |
| } else { |
| - uint32_t differing_bits = (range.from() ^ range.to()); |
| + int first_range = 0; |
| + while (ranges->at(first_range).from() > char_mask) { |
| + first_range++; |
| + if (first_range == ranges->length()) { |
| + details->set_cannot_match(); |
| + pos->determines_perfectly = false; |
| + return; |
| + } |
| + } |
| + CharacterRange range = ranges->at(first_range); |
| + uc16 from = range.from(); |
| + uc16 to = range.to(); |
| + if (to > char_mask) { |
| + to = char_mask; |
| + } |
| + uint32_t differing_bits = (from ^ to); |
| // A mask and compare is only perfect if the differing bits form a |
| // number like 00011111 with one single block of trailing 1s. |
| if ((differing_bits & (differing_bits + 1)) == 0) { |
| pos->determines_perfectly = true; |
| } |
| uint32_t common_bits = ~SmearBitsRight(differing_bits); |
| - uint32_t bits = (range.from() & common_bits); |
| - for (int i = 1; i < ranges->length(); i++) { |
| + uint32_t bits = (from & common_bits); |
| + for (int i = first_range + 1; i < ranges->length(); i++) { |
| + CharacterRange range = ranges->at(i); |
| + uc16 from = range.from(); |
| + uc16 to = range.to(); |
| + if (from > char_mask) continue; |
| + if (to > char_mask) to = char_mask; |
| // Here we are combining more ranges into the mask and compare |
| // value. With each new range the mask becomes more sparse and |
| // so the chances of a false positive rise. A character class |
| // with multiple ranges is assumed never to be equivalent to a |
| // mask and compare operation. |
| pos->determines_perfectly = false; |
| - CharacterRange range = ranges->at(i); |
| - uint32_t new_common_bits = (range.from() ^ range.to()); |
| + uint32_t new_common_bits = (from ^ to); |
| new_common_bits = ~SmearBitsRight(new_common_bits); |
| common_bits &= new_common_bits; |
| bits &= new_common_bits; |
| - uint32_t differing_bits = (range.from() & common_bits) ^ bits; |
| + uint32_t differing_bits = (from & common_bits) ^ bits; |
| common_bits ^= differing_bits; |
| bits &= common_bits; |
| } |
| @@ -2619,6 +2671,20 @@ |
| } |
| +static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
| + if (quick_check == NULL) return false; |
| + if (offset >= quick_check->characters()) return false; |
| + return quick_check->positions(offset)->determines_perfectly; |
|
Lasse Reichstein
2009/02/18 15:20:44
Just checking: If we find that a position determin
Erik Corry
2009/02/18 16:04:24
See above.
|
| +} |
| + |
| + |
| +static void UpdateBoundsCheck(int index, int* checked_up_to) { |
| + if (index > *checked_up_to) { |
| + *checked_up_to = index; |
| + } |
| +} |
| + |
| + |
| // We call this repeatedly to generate code for each pass over the text node. |
| // The passes are in increasing order of difficulty because we hope one |
| // of the first passes will fail in which case we are saved the work of the |
| @@ -2663,83 +2729,68 @@ |
| TextElement elm = elms_->at(i); |
| int cp_offset = trace->cp_offset() + elm.cp_offset; |
| if (elm.type == TextElement::ATOM) { |
| - if (pass == NON_ASCII_MATCH || |
| - pass == CHARACTER_MATCH || |
| - pass == CASE_CHARACTER_MATCH) { |
| + if (pass != CHARACTER_CLASS_MATCH) { |
| Vector<const uc16> quarks = elm.data.u_atom->data(); |
| for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
| - bool bound_checked = true; // Most ops will check their bounds. |
| + bool bound_checked = false; |
| if (first_element_checked && i == 0 && j == 0) continue; |
| - if (pass == NON_ASCII_MATCH) { |
| - ASSERT(ascii); |
| - if (quarks[j] > String::kMaxAsciiCharCode) { |
| - assembler->GoTo(backtrack); |
| - return; |
| - } |
| - } else { |
| - if (quick_check != NULL && |
| - elm.cp_offset + j < quick_check->characters() && |
| - quick_check->positions(elm.cp_offset + j)-> |
| - determines_perfectly) { |
| - continue; |
| - } |
| - if (pass == CHARACTER_MATCH) { |
| - if (compiler->ignore_case()) { |
| - bound_checked = EmitAtomNonLetter( |
| - assembler, |
| - quarks[j], |
| + if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; |
| + switch (pass) { |
| + case NON_ASCII_MATCH: |
| + ASSERT(ascii); |
| + if (quarks[j] > String::kMaxAsciiCharCode) { |
| + assembler->GoTo(backtrack); |
| + return; |
| + } |
| + break; |
| + case NON_LETTER_CHARACTER_MATCH: |
| + bound_checked = EmitAtomNonLetter(assembler, |
| + quarks[j], |
| + ascii, |
| + backtrack, |
| + cp_offset + j, |
| + *checked_up_to < cp_offset + j, |
| + preloaded); |
| + break; |
| + case SIMPLE_CHARACTER_MATCH: |
| + if (!preloaded) { |
| + assembler->LoadCurrentCharacter( |
| + cp_offset + j, |
| backtrack, |
| - cp_offset + j, |
| - *checked_up_to < cp_offset + j, |
| - preloaded); |
| - } else { |
| - if (!preloaded) { |
| - assembler->LoadCurrentCharacter( |
| - cp_offset + j, |
| - backtrack, |
| - *checked_up_to < cp_offset + j); |
| - } |
| - assembler->CheckNotCharacter(quarks[j], backtrack); |
| + *checked_up_to < cp_offset + j); |
| + bound_checked = true; |
| } |
| - } else { |
| - ASSERT_EQ(pass, CASE_CHARACTER_MATCH); |
| - ASSERT(compiler->ignore_case()); |
| + assembler->CheckNotCharacter(quarks[j], backtrack); |
| + break; |
| + case CASE_CHARACTER_MATCH: |
| bound_checked = EmitAtomLetter(assembler, |
| - compiler->ascii(), |
| quarks[j], |
| + ascii, |
| backtrack, |
| cp_offset + j, |
| *checked_up_to < cp_offset + j, |
| preloaded); |
| - } |
| - if (bound_checked) { |
| - if (cp_offset + j > *checked_up_to) { |
| - *checked_up_to = cp_offset + j; |
| - } |
| - } |
| + break; |
| + default: |
| + break; |
| } |
| + if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
| } |
| } |
| } else { |
| ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| - if (first_element_checked && i == 0) continue; |
| - if (quick_check != NULL && |
| - elm.cp_offset < quick_check->characters() && |
| - quick_check->positions(elm.cp_offset)->determines_perfectly) { |
| - continue; |
| - } |
| if (pass == CHARACTER_CLASS_MATCH) { |
| + if (first_element_checked && i == 0) continue; |
| + if (DeterminedAlready(quick_check, elm.cp_offset)) continue; |
| RegExpCharacterClass* cc = elm.data.u_char_class; |
| EmitCharClass(assembler, |
| cc, |
| + ascii, |
| + backtrack, |
| cp_offset, |
| - backtrack, |
| *checked_up_to < cp_offset, |
| - ascii, |
| preloaded); |
| - if (cp_offset > *checked_up_to) { |
| - *checked_up_to = cp_offset; |
| - } |
| + UpdateBoundsCheck(cp_offset, checked_up_to); |
| } |
| } |
| } |
| @@ -2757,6 +2808,16 @@ |
| } |
| +bool TextNode::SkipPass(int int_pass, bool ignore_case) { |
| + TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); |
| + if (ignore_case) { |
| + return pass == SIMPLE_CHARACTER_MATCH; |
| + } else { |
| + return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; |
| + } |
| +} |
| + |
| + |
| // This generates the code to match a text node. A text node can contain |
| // straight character sequences (possibly to be matched in a case-independent |
| // way) and character classes. For efficiency we do not do this in a single |
| @@ -2785,50 +2846,30 @@ |
| // If a character is preloaded into the current character register then |
| // check that now. |
| if (trace->characters_preloaded() == 1) { |
| - TextEmitPass(compiler, |
| - CHARACTER_MATCH, |
| - true, |
| - trace, |
| - false, |
| - &bound_checked_to); |
| - if (compiler->ignore_case()) { |
| + for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| + if (!SkipPass(pass, compiler->ignore_case())) { |
| + TextEmitPass(compiler, |
| + static_cast<TextEmitPassType>(pass), |
| + true, |
| + trace, |
| + false, |
| + &bound_checked_to); |
| + } |
| + } |
| + first_elt_done = true; |
| + } |
| + |
| + for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| + if (!SkipPass(pass, compiler->ignore_case())) { |
| TextEmitPass(compiler, |
| - CASE_CHARACTER_MATCH, |
| - true, |
| + static_cast<TextEmitPassType>(pass), |
| + false, |
| trace, |
| - false, |
| + first_elt_done, |
| &bound_checked_to); |
| } |
| - TextEmitPass(compiler, |
| - CHARACTER_CLASS_MATCH, |
| - true, |
| - trace, |
| - false, |
| - &bound_checked_to); |
| - first_elt_done = true; |
| } |
| - TextEmitPass(compiler, |
| - CHARACTER_MATCH, |
| - false, |
| - trace, |
| - first_elt_done, |
| - &bound_checked_to); |
| - if (compiler->ignore_case()) { |
| - TextEmitPass(compiler, |
| - CASE_CHARACTER_MATCH, |
| - false, |
| - trace, |
| - first_elt_done, |
| - &bound_checked_to); |
| - } |
| - TextEmitPass(compiler, |
| - CHARACTER_CLASS_MATCH, |
| - false, |
| - trace, |
| - first_elt_done, |
| - &bound_checked_to); |
| - |
| Trace successor_trace(*trace); |
| successor_trace.set_at_start(false); |
| successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); |