Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 1275) |
+++ src/jsregexp.cc (working copy) |
@@ -1739,19 +1739,51 @@ |
static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
+// Returns the number of characters in the equivalence class, omitting those |
+// that cannot occur in the source string because it is ASCII. |
+static int GetCaseIndependentLetters(uc16 character, |
+ bool ascii_subject, |
+ unibrow::uchar* letters) { |
+ int length = uncanonicalize.get(character, '\0', letters); |
+ // Unibrow returns 0 or 1 for characters where case independependence is |
+ // trivial. |
+ if (length == 0) { |
+ letters[0] = character; |
+ length = 1; |
+ } |
+ if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
+ return length; |
+ } |
+ // The standard requires that non-ASCII characters cannot have ASCII |
+ // character codes in their equivalence class. |
+ return 0; |
+} |
+ |
+ |
// Only emits non-letters (things that don't have case). Only used for case |
// independent matches. |
static inline bool EmitAtomNonLetter( |
RegExpMacroAssembler* macro_assembler, |
uc16 c, |
+ bool ascii, |
Label* on_failure, |
int cp_offset, |
bool check, |
bool preloaded) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- int length = uncanonicalize.get(c, '\0', chars); |
+ int length = GetCaseIndependentLetters(c, ascii, chars); |
+ if (length < 1) { |
+ // This can't match. Must be an ASCII subject and a non-ASCII character. |
+ // We do not need to do anything since the ASCII pass already handled this. |
+ return false; // Bounds not checked. |
+ } |
bool checked = false; |
- if (length <= 1) { |
+ // We handle the length > 1 case in a later pass. |
+ if (length == 1) { |
+ if (ascii && c > String::kMaxAsciiCharCodeU) { |
+ // Can't match - see above. |
+ return false; // Bounds not checked. |
+ } |
if (!preloaded) { |
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
checked = check; |
@@ -1805,14 +1837,14 @@ |
// matches. |
static inline bool EmitAtomLetter( |
RegExpMacroAssembler* macro_assembler, |
+ uc16 c, |
bool ascii, |
- uc16 c, |
Label* on_failure, |
int cp_offset, |
bool check, |
bool preloaded) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- int length = uncanonicalize.get(c, '\0', chars); |
+ int length = GetCaseIndependentLetters(c, ascii, chars); |
if (length <= 1) return false; |
// We may not need to check against the end of the input string |
// if this character lies before a character that matched. |
@@ -1854,10 +1886,10 @@ |
static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
RegExpCharacterClass* cc, |
+ bool ascii, |
+ Label* on_failure, |
int cp_offset, |
- Label* on_failure, |
bool check_offset, |
- bool ascii, |
bool preloaded) { |
if (cc->is_standard() && |
macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), |
@@ -2238,12 +2270,14 @@ |
// matching can turn an ASCII character into non-ASCII or |
// vice versa. |
details->set_cannot_match(); |
+ pos->determines_perfectly = false; |
Lasse Reichstein
2009/02/18 15:20:44
Should this be set to false? If the remaining case
Erik Corry
2009/02/18 16:04:24
Determines perfectly indicates that this particula
|
return; |
} |
if (compiler->ignore_case()) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- int length = uncanonicalize.get(c, '\0', chars); |
- if (length < 2) { |
+ int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); |
+ ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
+ if (length == 1) { |
// This letter has no case equivalents, so it's nice and simple |
// and the mask-compare will determine definitely whether we have |
// a match at this character position. |
@@ -2288,7 +2322,6 @@ |
details->positions(characters_filled_in); |
RegExpCharacterClass* tree = elm.data.u_char_class; |
ZoneList<CharacterRange>* ranges = tree->ranges(); |
- CharacterRange range = ranges->at(0); |
if (tree->is_negated()) { |
// A quick check uses multi-character mask and compare. There is no |
// useful way to incorporate a negative char class into this scheme |
@@ -2297,27 +2330,46 @@ |
pos->mask = 0; |
pos->value = 0; |
} else { |
- uint32_t differing_bits = (range.from() ^ range.to()); |
+ int first_range = 0; |
+ while (ranges->at(first_range).from() > char_mask) { |
+ first_range++; |
+ if (first_range == ranges->length()) { |
+ details->set_cannot_match(); |
+ pos->determines_perfectly = false; |
+ return; |
+ } |
+ } |
+ CharacterRange range = ranges->at(first_range); |
+ uc16 from = range.from(); |
+ uc16 to = range.to(); |
+ if (to > char_mask) { |
+ to = char_mask; |
+ } |
+ uint32_t differing_bits = (from ^ to); |
// A mask and compare is only perfect if the differing bits form a |
// number like 00011111 with one single block of trailing 1s. |
if ((differing_bits & (differing_bits + 1)) == 0) { |
pos->determines_perfectly = true; |
} |
uint32_t common_bits = ~SmearBitsRight(differing_bits); |
- uint32_t bits = (range.from() & common_bits); |
- for (int i = 1; i < ranges->length(); i++) { |
+ uint32_t bits = (from & common_bits); |
+ for (int i = first_range + 1; i < ranges->length(); i++) { |
+ CharacterRange range = ranges->at(i); |
+ uc16 from = range.from(); |
+ uc16 to = range.to(); |
+ if (from > char_mask) continue; |
+ if (to > char_mask) to = char_mask; |
// Here we are combining more ranges into the mask and compare |
// value. With each new range the mask becomes more sparse and |
// so the chances of a false positive rise. A character class |
// with multiple ranges is assumed never to be equivalent to a |
// mask and compare operation. |
pos->determines_perfectly = false; |
- CharacterRange range = ranges->at(i); |
- uint32_t new_common_bits = (range.from() ^ range.to()); |
+ uint32_t new_common_bits = (from ^ to); |
new_common_bits = ~SmearBitsRight(new_common_bits); |
common_bits &= new_common_bits; |
bits &= new_common_bits; |
- uint32_t differing_bits = (range.from() & common_bits) ^ bits; |
+ uint32_t differing_bits = (from & common_bits) ^ bits; |
common_bits ^= differing_bits; |
bits &= common_bits; |
} |
@@ -2619,6 +2671,20 @@ |
} |
+static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
+ if (quick_check == NULL) return false; |
+ if (offset >= quick_check->characters()) return false; |
+ return quick_check->positions(offset)->determines_perfectly; |
Lasse Reichstein
2009/02/18 15:20:44
Just checking: If we find that a position determin
Erik Corry
2009/02/18 16:04:24
See above.
|
+} |
+ |
+ |
+static void UpdateBoundsCheck(int index, int* checked_up_to) { |
+ if (index > *checked_up_to) { |
+ *checked_up_to = index; |
+ } |
+} |
+ |
+ |
// We call this repeatedly to generate code for each pass over the text node. |
// The passes are in increasing order of difficulty because we hope one |
// of the first passes will fail in which case we are saved the work of the |
@@ -2663,83 +2729,68 @@ |
TextElement elm = elms_->at(i); |
int cp_offset = trace->cp_offset() + elm.cp_offset; |
if (elm.type == TextElement::ATOM) { |
- if (pass == NON_ASCII_MATCH || |
- pass == CHARACTER_MATCH || |
- pass == CASE_CHARACTER_MATCH) { |
+ if (pass != CHARACTER_CLASS_MATCH) { |
Vector<const uc16> quarks = elm.data.u_atom->data(); |
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
- bool bound_checked = true; // Most ops will check their bounds. |
+ bool bound_checked = false; |
if (first_element_checked && i == 0 && j == 0) continue; |
- if (pass == NON_ASCII_MATCH) { |
- ASSERT(ascii); |
- if (quarks[j] > String::kMaxAsciiCharCode) { |
- assembler->GoTo(backtrack); |
- return; |
- } |
- } else { |
- if (quick_check != NULL && |
- elm.cp_offset + j < quick_check->characters() && |
- quick_check->positions(elm.cp_offset + j)-> |
- determines_perfectly) { |
- continue; |
- } |
- if (pass == CHARACTER_MATCH) { |
- if (compiler->ignore_case()) { |
- bound_checked = EmitAtomNonLetter( |
- assembler, |
- quarks[j], |
+ if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; |
+ switch (pass) { |
+ case NON_ASCII_MATCH: |
+ ASSERT(ascii); |
+ if (quarks[j] > String::kMaxAsciiCharCode) { |
+ assembler->GoTo(backtrack); |
+ return; |
+ } |
+ break; |
+ case NON_LETTER_CHARACTER_MATCH: |
+ bound_checked = EmitAtomNonLetter(assembler, |
+ quarks[j], |
+ ascii, |
+ backtrack, |
+ cp_offset + j, |
+ *checked_up_to < cp_offset + j, |
+ preloaded); |
+ break; |
+ case SIMPLE_CHARACTER_MATCH: |
+ if (!preloaded) { |
+ assembler->LoadCurrentCharacter( |
+ cp_offset + j, |
backtrack, |
- cp_offset + j, |
- *checked_up_to < cp_offset + j, |
- preloaded); |
- } else { |
- if (!preloaded) { |
- assembler->LoadCurrentCharacter( |
- cp_offset + j, |
- backtrack, |
- *checked_up_to < cp_offset + j); |
- } |
- assembler->CheckNotCharacter(quarks[j], backtrack); |
+ *checked_up_to < cp_offset + j); |
+ bound_checked = true; |
} |
- } else { |
- ASSERT_EQ(pass, CASE_CHARACTER_MATCH); |
- ASSERT(compiler->ignore_case()); |
+ assembler->CheckNotCharacter(quarks[j], backtrack); |
+ break; |
+ case CASE_CHARACTER_MATCH: |
bound_checked = EmitAtomLetter(assembler, |
- compiler->ascii(), |
quarks[j], |
+ ascii, |
backtrack, |
cp_offset + j, |
*checked_up_to < cp_offset + j, |
preloaded); |
- } |
- if (bound_checked) { |
- if (cp_offset + j > *checked_up_to) { |
- *checked_up_to = cp_offset + j; |
- } |
- } |
+ break; |
+ default: |
+ break; |
} |
+ if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
} |
} |
} else { |
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
- if (first_element_checked && i == 0) continue; |
- if (quick_check != NULL && |
- elm.cp_offset < quick_check->characters() && |
- quick_check->positions(elm.cp_offset)->determines_perfectly) { |
- continue; |
- } |
if (pass == CHARACTER_CLASS_MATCH) { |
+ if (first_element_checked && i == 0) continue; |
+ if (DeterminedAlready(quick_check, elm.cp_offset)) continue; |
RegExpCharacterClass* cc = elm.data.u_char_class; |
EmitCharClass(assembler, |
cc, |
+ ascii, |
+ backtrack, |
cp_offset, |
- backtrack, |
*checked_up_to < cp_offset, |
- ascii, |
preloaded); |
- if (cp_offset > *checked_up_to) { |
- *checked_up_to = cp_offset; |
- } |
+ UpdateBoundsCheck(cp_offset, checked_up_to); |
} |
} |
} |
@@ -2757,6 +2808,16 @@ |
} |
+bool TextNode::SkipPass(int int_pass, bool ignore_case) { |
+ TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); |
+ if (ignore_case) { |
+ return pass == SIMPLE_CHARACTER_MATCH; |
+ } else { |
+ return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; |
+ } |
+} |
+ |
+ |
// This generates the code to match a text node. A text node can contain |
// straight character sequences (possibly to be matched in a case-independent |
// way) and character classes. For efficiency we do not do this in a single |
@@ -2785,50 +2846,30 @@ |
// If a character is preloaded into the current character register then |
// check that now. |
if (trace->characters_preloaded() == 1) { |
- TextEmitPass(compiler, |
- CHARACTER_MATCH, |
- true, |
- trace, |
- false, |
- &bound_checked_to); |
- if (compiler->ignore_case()) { |
+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
+ if (!SkipPass(pass, compiler->ignore_case())) { |
+ TextEmitPass(compiler, |
+ static_cast<TextEmitPassType>(pass), |
+ true, |
+ trace, |
+ false, |
+ &bound_checked_to); |
+ } |
+ } |
+ first_elt_done = true; |
+ } |
+ |
+ for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
+ if (!SkipPass(pass, compiler->ignore_case())) { |
TextEmitPass(compiler, |
- CASE_CHARACTER_MATCH, |
- true, |
+ static_cast<TextEmitPassType>(pass), |
+ false, |
trace, |
- false, |
+ first_elt_done, |
&bound_checked_to); |
} |
- TextEmitPass(compiler, |
- CHARACTER_CLASS_MATCH, |
- true, |
- trace, |
- false, |
- &bound_checked_to); |
- first_elt_done = true; |
} |
- TextEmitPass(compiler, |
- CHARACTER_MATCH, |
- false, |
- trace, |
- first_elt_done, |
- &bound_checked_to); |
- if (compiler->ignore_case()) { |
- TextEmitPass(compiler, |
- CASE_CHARACTER_MATCH, |
- false, |
- trace, |
- first_elt_done, |
- &bound_checked_to); |
- } |
- TextEmitPass(compiler, |
- CHARACTER_CLASS_MATCH, |
- false, |
- trace, |
- first_elt_done, |
- &bound_checked_to); |
- |
Trace successor_trace(*trace); |
successor_trace.set_at_start(false); |
successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); |