Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 1004) |
+++ src/jsregexp.cc (working copy) |
@@ -214,25 +214,15 @@ |
vector_ = static_offsets_vector_; |
} |
} |
- |
- |
inline ~OffsetsVector() { |
if (offsets_vector_length_ > kStaticOffsetsVectorSize) { |
DeleteArray(vector_); |
vector_ = NULL; |
} |
} |
+ inline int* vector() { return vector_; } |
+ inline int length() { return offsets_vector_length_; } |
- |
- inline int* vector() { |
- return vector_; |
- } |
- |
- |
- inline int length() { |
- return offsets_vector_length_; |
- } |
- |
private: |
int* vector_; |
int offsets_vector_length_; |
@@ -803,6 +793,11 @@ |
} |
#endif |
LOG(RegExpExecEvent(regexp, previous_index, subject)); |
+ |
+ if (!subject->IsFlat(StringShape(*subject))) { |
+ FlattenString(subject); |
+ } |
+ |
return IrregexpExecOnce(irregexp, |
num_captures, |
subject, |
@@ -837,11 +832,12 @@ |
subject->Flatten(shape); |
} |
- do { |
+ while (true) { |
if (previous_index > subject->length() || previous_index < 0) { |
// Per ECMA-262 15.10.6.2, if the previous index is greater than the |
// string length, there is no match. |
matches = Factory::null_value(); |
+ return result; |
} else { |
#ifdef DEBUG |
if (FLAG_trace_regexp_bytecodes) { |
@@ -865,17 +861,12 @@ |
if (offsets.vector()[0] == offsets.vector()[1]) { |
previous_index++; |
} |
+ } else if (matches->IsNull()) { |
+ return result; |
+ } else { |
+ return matches; |
} |
} |
- } while (matches->IsJSArray()); |
- |
- // If we exited the loop with an exception, throw it. |
- if (matches->IsNull()) { |
- // Exited loop normally. |
- return result; |
- } else { |
- // Exited loop with the exception in matches. |
- return matches; |
} |
} |
@@ -886,14 +877,11 @@ |
int previous_index, |
int* offsets_vector, |
int offsets_vector_length) { |
+ ASSERT(subject->IsFlat(StringShape(*subject))); |
bool rc; |
int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value(); |
- if (!subject->IsFlat(StringShape(*subject))) { |
- FlattenString(subject); |
- } |
- |
switch (tag) { |
case RegExpMacroAssembler::kIA32Implementation: { |
#ifndef ARM |
@@ -997,9 +985,9 @@ |
Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1)); |
// The captures come in (start, end+1) pairs. |
- for (int i = 0; i < 2 * (num_captures+1); i += 2) { |
+ for (int i = 0; i < 2 * (num_captures + 1); i += 2) { |
array->set(i, Smi::FromInt(offsets_vector[i])); |
- array->set(i+1, Smi::FromInt(offsets_vector[i+1])); |
+ array->set(i + 1, Smi::FromInt(offsets_vector[i + 1])); |
} |
return Factory::NewJSArrayWithElements(array); |
} |
@@ -1344,25 +1332,26 @@ |
} |
-void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* macro, |
+void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* assembler, |
int max_register, |
OutSet& affected_registers) { |
for (int reg = 0; reg <= max_register; reg++) { |
- if (affected_registers.Get(reg)) macro->PushRegister(reg); |
+ if (affected_registers.Get(reg)) assembler->PushRegister(reg); |
} |
} |
-void GenerationVariant::RestoreAffectedRegisters(RegExpMacroAssembler* macro, |
- int max_register, |
- OutSet& affected_registers) { |
+void GenerationVariant::RestoreAffectedRegisters( |
+ RegExpMacroAssembler* assembler, |
+ int max_register, |
+ OutSet& affected_registers) { |
for (int reg = max_register; reg >= 0; reg--) { |
- if (affected_registers.Get(reg)) macro->PopRegister(reg); |
+ if (affected_registers.Get(reg)) assembler->PopRegister(reg); |
} |
} |
-void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* macro, |
+void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* assembler, |
int max_register, |
OutSet& affected_registers) { |
for (int reg = 0; reg <= max_register; reg++) { |
@@ -1410,13 +1399,13 @@ |
} |
} |
if (store_position != -1) { |
- macro->WriteCurrentPositionToRegister(reg, store_position); |
+ assembler->WriteCurrentPositionToRegister(reg, store_position); |
} else { |
if (absolute) { |
- macro->SetRegister(reg, value); |
+ assembler->SetRegister(reg, value); |
} else { |
if (value != 0) { |
- macro->AdvanceRegister(reg, value); |
+ assembler->AdvanceRegister(reg, value); |
} |
} |
} |
@@ -1428,14 +1417,19 @@ |
// nodes. It normalises the state of the code generator to ensure we can |
// generate generic code. |
bool GenerationVariant::Flush(RegExpCompiler* compiler, RegExpNode* successor) { |
- RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
- ASSERT(actions_ != NULL || cp_offset_ != 0 || backtrack() != NULL); |
+ ASSERT(actions_ != NULL || |
+ cp_offset_ != 0 || |
+ backtrack() != NULL || |
+ characters_preloaded_ != 0 || |
+ quick_check_performed_.characters() != 0); |
if (actions_ == NULL && backtrack() == NULL) { |
// Here we just have some deferred cp advances to fix and we are back to |
- // a normal situation. |
- macro->AdvanceCurrentPosition(cp_offset_); |
+ // a normal situation. We may also have to forget some information gained |
+ // through a quick check that was already performed. |
+ if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); |
// Create a new trivial state and generate the node with that. |
GenerationVariant new_state; |
return successor->Emit(compiler, &new_state); |
@@ -1444,50 +1438,50 @@ |
// Generate deferred actions here along with code to undo them again. |
OutSet affected_registers; |
int max_register = FindAffectedRegisters(&affected_registers); |
- PushAffectedRegisters(macro, max_register, affected_registers); |
- PerformDeferredActions(macro, max_register, affected_registers); |
+ PushAffectedRegisters(assembler, max_register, affected_registers); |
+ PerformDeferredActions(assembler, max_register, affected_registers); |
if (backtrack() != NULL) { |
// Here we have a concrete backtrack location. These are set up by choice |
// nodes and so they indicate that we have a deferred save of the current |
// position which we may need to emit here. |
- macro->PushCurrentPosition(); |
+ assembler->PushCurrentPosition(); |
} |
if (cp_offset_ != 0) { |
- macro->AdvanceCurrentPosition(cp_offset_); |
+ assembler->AdvanceCurrentPosition(cp_offset_); |
} |
// Create a new trivial state and generate the node with that. |
Label undo; |
- macro->PushBacktrack(&undo); |
+ assembler->PushBacktrack(&undo); |
GenerationVariant new_state; |
bool ok = successor->Emit(compiler, &new_state); |
// On backtrack we need to restore state. |
- macro->Bind(&undo); |
+ assembler->Bind(&undo); |
if (!ok) return false; |
if (backtrack() != NULL) { |
- macro->PopCurrentPosition(); |
+ assembler->PopCurrentPosition(); |
} |
- RestoreAffectedRegisters(macro, max_register, affected_registers); |
+ RestoreAffectedRegisters(assembler, max_register, affected_registers); |
if (backtrack() == NULL) { |
- macro->Backtrack(); |
+ assembler->Backtrack(); |
} else { |
- macro->GoTo(backtrack()); |
+ assembler->GoTo(backtrack()); |
} |
return true; |
} |
-void EndNode::EmitInfoChecks(RegExpMacroAssembler* macro, |
+void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, |
GenerationVariant* variant) { |
if (info()->at_end) { |
Label succeed; |
// LoadCurrentCharacter will go to the label if we are at the end of the |
// input string. |
- macro->LoadCurrentCharacter(0, &succeed); |
- macro->GoTo(variant->backtrack()); |
- macro->Bind(&succeed); |
+ assembler->LoadCurrentCharacter(0, &succeed); |
+ assembler->GoTo(variant->backtrack()); |
+ assembler->Bind(&succeed); |
} |
} |
@@ -1497,16 +1491,16 @@ |
if (!variant->is_trivial()) { |
return variant->Flush(compiler, this); |
} |
- RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
if (!label()->is_bound()) { |
- macro->Bind(label()); |
+ assembler->Bind(label()); |
} |
- EmitInfoChecks(macro, variant); |
- macro->ReadCurrentPositionFromRegister(current_position_register_); |
- macro->ReadStackPointerFromRegister(stack_pointer_register_); |
+ EmitInfoChecks(assembler, variant); |
+ assembler->ReadCurrentPositionFromRegister(current_position_register_); |
+ assembler->ReadStackPointerFromRegister(stack_pointer_register_); |
// Now that we have unwound the stack we find at the top of the stack the |
// backtrack that the BeginSubmatch node got. |
- macro->Backtrack(); |
+ assembler->Backtrack(); |
return true; |
} |
@@ -1515,18 +1509,18 @@ |
if (!variant->is_trivial()) { |
return variant->Flush(compiler, this); |
} |
- RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
if (!label()->is_bound()) { |
- macro->Bind(label()); |
+ assembler->Bind(label()); |
} |
switch (action_) { |
case ACCEPT: |
- EmitInfoChecks(macro, variant); |
- macro->Succeed(); |
+ EmitInfoChecks(assembler, variant); |
+ assembler->Succeed(); |
return true; |
case BACKTRACK: |
ASSERT(!info()->at_end); |
- macro->GoTo(variant->backtrack()); |
+ assembler->GoTo(variant->backtrack()); |
return true; |
case NEGATIVE_SUBMATCH_SUCCESS: |
// This case is handled in a different virtual method. |
@@ -1629,30 +1623,26 @@ |
static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
-static inline void EmitAtomNonLetters( |
+// Only emits non-letters (things that don't have case). Only used for case |
+// independent matches. |
+static inline bool EmitAtomNonLetter( |
RegExpMacroAssembler* macro_assembler, |
- TextElement elm, |
- Vector<const uc16> quarks, |
+ uc16 c, |
Label* on_failure, |
int cp_offset, |
- bool check_offset) { |
+ bool check, |
+ bool preloaded) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- // It is vital that this loop is backwards due to the unchecked character |
- // load below. |
- for (int i = quarks.length() - 1; i >= 0; i--) { |
- uc16 c = quarks[i]; |
- int length = uncanonicalize.get(c, '\0', chars); |
- if (length <= 1) { |
- if (check_offset && i == quarks.length() - 1) { |
- macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
- } else { |
- // Here we don't need to check against the end of the input string |
- // since this character lies before a character that matched. |
- macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
- } |
- macro_assembler->CheckNotCharacter(c, on_failure); |
+ int length = uncanonicalize.get(c, '\0', chars); |
+ bool checked = false; |
+ if (length <= 1) { |
+ if (!preloaded) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
+ checked = check; |
} |
+ macro_assembler->CheckNotCharacter(c, on_failure); |
} |
+ return checked; |
} |
@@ -1666,7 +1656,8 @@ |
// If c1 and c2 differ only by one bit. |
// Ecma262UnCanonicalize always gives the highest number last. |
ASSERT(c2 > c1); |
- macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure); |
+ uc16 mask = String::kMaxUC16CharCode ^ exor; |
+ macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); |
return true; |
} |
ASSERT(c2 > c1); |
@@ -1676,65 +1667,63 @@ |
// subtract the difference from the found character, then do the or |
// trick. We avoid the theoretical case where negative numbers are |
// involved in order to simplify code generation. |
- macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff, |
- diff, |
- on_failure); |
+ uc16 mask = String::kMaxUC16CharCode ^ diff; |
+ macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, |
+ diff, |
+ mask, |
+ on_failure); |
return true; |
} |
return false; |
} |
-static inline void EmitAtomLetters( |
+// Only emits letters (things that have case). Only used for case independent |
+// matches. |
+static inline bool EmitAtomLetter( |
RegExpMacroAssembler* macro_assembler, |
- TextElement elm, |
- Vector<const uc16> quarks, |
+ uc16 c, |
Label* on_failure, |
int cp_offset, |
- bool check_offset) { |
+ bool check, |
+ bool preloaded) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
- // It is vital that this loop is backwards due to the unchecked character |
- // load below. |
- for (int i = quarks.length() - 1; i >= 0; i--) { |
- uc16 c = quarks[i]; |
- int length = uncanonicalize.get(c, '\0', chars); |
- if (length <= 1) continue; |
- if (check_offset && i == quarks.length() - 1) { |
- macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
- } else { |
- // Here we don't need to check against the end of the input string |
- // since this character lies before a character that matched. |
- macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
- } |
- Label ok; |
- ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
- switch (length) { |
- case 2: { |
- if (ShortCutEmitCharacterPair(macro_assembler, |
- chars[0], |
- chars[1], |
- on_failure)) { |
- } else { |
- macro_assembler->CheckCharacter(chars[0], &ok); |
- macro_assembler->CheckNotCharacter(chars[1], on_failure); |
- macro_assembler->Bind(&ok); |
- } |
- break; |
- } |
- case 4: |
- macro_assembler->CheckCharacter(chars[3], &ok); |
- // Fall through! |
- case 3: |
+ int length = uncanonicalize.get(c, '\0', chars); |
+ if (length <= 1) return false; |
+ // We may not need to check against the end of the input string |
+ // if this character lies before a character that matched. |
+ if (!preloaded) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
+ } |
+ Label ok; |
+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
+ switch (length) { |
+ case 2: { |
+ if (ShortCutEmitCharacterPair(macro_assembler, |
+ chars[0], |
+ chars[1], |
+ on_failure)) { |
+ } else { |
macro_assembler->CheckCharacter(chars[0], &ok); |
- macro_assembler->CheckCharacter(chars[1], &ok); |
- macro_assembler->CheckNotCharacter(chars[2], on_failure); |
+ macro_assembler->CheckNotCharacter(chars[1], on_failure); |
macro_assembler->Bind(&ok); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
+ } |
+ break; |
} |
+ case 4: |
+ macro_assembler->CheckCharacter(chars[3], &ok); |
+ // Fall through! |
+ case 3: |
+ macro_assembler->CheckCharacter(chars[0], &ok); |
+ macro_assembler->CheckCharacter(chars[1], &ok); |
+ macro_assembler->CheckNotCharacter(chars[2], on_failure); |
+ macro_assembler->Bind(&ok); |
+ break; |
+ default: |
+ UNREACHABLE(); |
+ break; |
} |
+ return true; |
} |
@@ -1743,7 +1732,8 @@ |
int cp_offset, |
Label* on_failure, |
bool check_offset, |
- bool ascii) { |
+ bool ascii, |
+ bool preloaded) { |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
int max_char; |
if (ascii) { |
@@ -1789,15 +1779,11 @@ |
return; |
} |
- if (check_offset) { |
- macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
- } else { |
- // Here we don't need to check against the end of the input string |
- // since this character lies before a character that matched. |
- macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); |
+ if (!preloaded) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); |
} |
- for (int i = 0; i <= last_valid_range; i++) { |
+ for (int i = 0; i < last_valid_range; i++) { |
CharacterRange& range = ranges->at(i); |
Label next_range; |
uc16 from = range.from(); |
@@ -1858,6 +1844,10 @@ |
} |
+RegExpNode::~RegExpNode() { |
+} |
+ |
+ |
RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, |
GenerationVariant* variant) { |
// TODO(erikcorry): Implement support. |
@@ -1908,115 +1898,573 @@ |
} |
-// This generates the code to match a text node. A text node can contain |
-// straight character sequences (possibly to be matched in a case-independent |
-// way) and character classes. In order to be most efficient we test for the |
-// simple things first and then move on to the more complicated things. The |
-// simplest thing is a non-letter or a letter if we are matching case. The |
-// next-most simple thing is a case-independent letter. The least simple is |
-// a character class. Another optimization is that we test the last one first. |
-// If that succeeds we don't need to test for the end of the string when we |
-// load other characters. |
-bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
- RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
- Label *backtrack = variant->backtrack(); |
- LimitResult limit_result = LimitVersions(compiler, variant); |
- if (limit_result == FAIL) return false; |
- if (limit_result == DONE) return true; |
- ASSERT(limit_result == CONTINUE); |
+int ActionNode::EatsAtLeast(int recursion_depth) { |
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
+ if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! |
+ return on_success()->EatsAtLeast(recursion_depth + 1); |
+} |
- int element_count = elms_->length(); |
- ASSERT(element_count != 0); |
- if (info()->at_end) { |
- macro_assembler->GoTo(backtrack); |
- return true; |
+ |
+int TextNode::EatsAtLeast(int recursion_depth) { |
+ int answer = Length(); |
+ if (answer >= 4) return answer; |
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; |
+ return answer + on_success()->EatsAtLeast(recursion_depth + 1); |
+} |
+ |
+ |
+ |
+int ChoiceNode::EatsAtLeastHelper(int recursion_depth, int start_from_node) { |
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
+ int min = 100; |
+ int choice_count = alternatives_->length(); |
+ for (int i = start_from_node; i < choice_count; i++) { |
+ RegExpNode* node = alternatives_->at(i).node(); |
+ int node_eats_at_least = node->EatsAtLeast(recursion_depth + 1); |
+ if (node_eats_at_least < min) min = node_eats_at_least; |
} |
- // First check for non-ASCII text. |
- // TODO(plesner): We should do this at node level. |
+ return min; |
+} |
+ |
+ |
+int LoopChoiceNode::EatsAtLeast(int recursion_depth) { |
+ return 0; |
+} |
+ |
+ |
+int ChoiceNode::EatsAtLeast(int recursion_depth) { |
+ return EatsAtLeastHelper(recursion_depth, 0); |
+} |
+ |
+ |
+// Takes the left-most 1-bit and smears it out, setting all bits to its right. |
+static inline uint32_t SmearBitsRight(uint32_t v) { |
+ v |= v >> 1; |
+ v |= v >> 2; |
+ v |= v >> 4; |
+ v |= v >> 8; |
+ v |= v >> 16; |
+ return v; |
+} |
+ |
+ |
+bool QuickCheckDetails::Rationalize(bool asc) { |
+ bool found_useful_op = false; |
+ uint32_t char_mask; |
+ if (asc) { |
+ char_mask = String::kMaxAsciiCharCode; |
+ } else { |
+ char_mask = String::kMaxUC16CharCode; |
+ } |
+ mask_ = 0; |
+ value_ = 0; |
+ int char_shift = 0; |
+ for (int i = 0; i < characters_; i++) { |
+ Position* pos = &positions_[i]; |
+ if ((pos->mask & String::kMaxAsciiCharCode) != 0) { |
+ found_useful_op = true; |
+ } |
+ mask_ |= (pos->mask & char_mask) << char_shift; |
+ value_ |= (pos->value & char_mask) << char_shift; |
+ char_shift += asc ? 8 : 16; |
+ } |
+ return found_useful_op; |
+} |
+ |
+ |
+bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
+ GenerationVariant* variant, |
+ bool preload_has_checked_bounds, |
+ Label* on_possible_success, |
+ QuickCheckDetails* details, |
+ bool fall_through_on_failure) { |
+ if (details->characters() == 0) return false; |
+ GetQuickCheckDetails(details, compiler, 0); |
+ if (!details->Rationalize(compiler->ascii())) return false; |
+ uint32_t mask = details->mask(); |
+ uint32_t value = details->value(); |
+ |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
+ |
+ if (variant->characters_preloaded() != details->characters()) { |
+ assembler->LoadCurrentCharacter(variant->cp_offset(), |
+ variant->backtrack(), |
+ !preload_has_checked_bounds, |
+ details->characters()); |
+ } |
+ |
+ |
+ bool need_mask = true; |
+ |
+ if (details->characters() == 1) { |
+ // If number of characters preloaded is 1 then we used a byte or 16 bit |
+ // load so the value is already masked down. |
+ uint32_t char_mask; |
+ if (compiler->ascii()) { |
+ char_mask = String::kMaxAsciiCharCode; |
+ } else { |
+ char_mask = String::kMaxUC16CharCode; |
+ } |
+ if ((mask & char_mask) == char_mask) need_mask = false; |
+ } else { |
+ // For 2-character preloads in ASCII mode we also use a 16 bit load with |
+ // zero extend. |
+ if (details->characters() == 2 && compiler->ascii()) { |
+ if ((mask & 0xffff) == 0xffff) need_mask = false; |
+ } else { |
+ if (mask == 0xffffffff) need_mask = false; |
+ } |
+ } |
+ |
+ if (fall_through_on_failure) { |
+ if (need_mask) { |
+ assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); |
+ } else { |
+ assembler->CheckCharacter(value, on_possible_success); |
+ } |
+ } else { |
+ if (need_mask) { |
+ assembler->CheckNotCharacterAfterAnd(value, mask, variant->backtrack()); |
+ } else { |
+ assembler->CheckNotCharacter(value, variant->backtrack()); |
+ } |
+ } |
+ return true; |
+} |
+ |
+ |
+// Here is the meat of GetQuickCheckDetails (see also the comment on the |
+// super-class in the .h file). |
+// |
+// We iterate along the text object, building up for each character a |
+// mask and value that can be used to test for a quick failure to match. |
+// The masks and values for the positions will be combined into a single |
+// machine word for the current character width in order to be used in |
+// generating a quick check. |
+void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
+ RegExpCompiler* compiler, |
+ int characters_filled_in) { |
+ ASSERT(characters_filled_in < details->characters()); |
+ int characters = details->characters(); |
+ int char_mask; |
+ int char_shift; |
if (compiler->ascii()) { |
- for (int i = element_count - 1; i >= 0; i--) { |
- TextElement elm = elms_->at(i); |
- if (elm.type == TextElement::ATOM) { |
- Vector<const uc16> quarks = elm.data.u_atom->data(); |
- for (int j = quarks.length() - 1; j >= 0; j--) { |
- if (quarks[j] > String::kMaxAsciiCharCode) { |
- macro_assembler->GoTo(backtrack); |
- return true; |
+ char_mask = String::kMaxAsciiCharCode; |
+ char_shift = 8; |
+ } else { |
+ char_mask = String::kMaxUC16CharCode; |
+ char_shift = 16; |
+ } |
+ for (int k = 0; k < elms_->length(); k++) { |
+ TextElement elm = elms_->at(k); |
+ if (elm.type == TextElement::ATOM) { |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ for (int i = 0; i < characters && i < quarks.length(); i++) { |
+ QuickCheckDetails::Position* pos = |
+ details->positions(characters_filled_in); |
+ if (compiler->ignore_case()) { |
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ uc16 c = quarks[i]; |
+ int length = uncanonicalize.get(c, '\0', chars); |
+ if (length < 2) { |
+ // This letter has no case equivalents, so it's nice and simple |
+ // and the mask-compare will determine definitely whether we have |
+ // a match at this character position. |
+ pos->mask = char_mask; |
+ pos->value = c; |
+ pos->determines_perfectly = true; |
+ } else { |
+ uint32_t common_bits = char_mask; |
+ uint32_t bits = chars[0]; |
+ for (int j = 1; j < length; j++) { |
+ uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); |
+ common_bits ^= differing_bits; |
+ bits &= common_bits; |
+ } |
+ // If length is 2 and common bits has only one zero in it then |
+ // our mask and compare instruction will determine definitely |
+ // whether we have a match at this character position. Otherwise |
+ // it can only be an approximate check. |
+ uint32_t one_zero = (common_bits | ~char_mask); |
+ if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { |
+ pos->determines_perfectly = true; |
+ } |
+ pos->mask = common_bits; |
+ pos->value = bits; |
} |
+ } else { |
+ // Don't ignore case. Nice simple case where the mask-compare will |
+ // determine definitely whether we have a match at this character |
+ // position. |
+ pos->mask = char_mask; |
+ pos->value = quarks[i]; |
+ pos->determines_perfectly = true; |
} |
+ characters_filled_in++; |
+ ASSERT(characters_filled_in <= details->characters()); |
+ if (characters_filled_in == details->characters()) { |
+ return; |
+ } |
+ } |
+ } else { |
+ QuickCheckDetails::Position* pos = |
+ details->positions(characters_filled_in); |
+ RegExpCharacterClass* tree = elm.data.u_char_class; |
+ ZoneList<CharacterRange>* ranges = tree->ranges(); |
+ CharacterRange range = ranges->at(0); |
+ if (tree->is_negated()) { |
+ // A quick check uses multi-character mask and compare. There is no |
+ // useful way to incorporate a negative char class into this scheme |
+ // so we just conservatively create a mask and value that will always |
+ // succeed. |
+ pos->mask = 0; |
+ pos->value = 0; |
} else { |
- ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
+ uint32_t differing_bits = (range.from() ^ range.to()); |
+ // A mask and compare is only perfect if the differing bits form a |
+ // number like 00011111 with one single block of trailing 1s. |
+ if ((differing_bits & (differing_bits + 1)) == 0) { |
+ pos->determines_perfectly = true; |
+ } |
+ uint32_t common_bits = ~SmearBitsRight(differing_bits); |
+ uint32_t bits = (range.from() & common_bits); |
+ for (int i = 1; i < ranges->length(); i++) { |
+ // Here we are combining more ranges into the mask and compare |
+ // value. With each new range the mask becomes more sparse and |
+ // so the chances of a false positive rise. A character class |
+ // with multiple ranges is assumed never to be equivalent to a |
+ // mask and compare operation. |
+ pos->determines_perfectly = false; |
+ CharacterRange range = ranges->at(i); |
+ uint32_t new_common_bits = (range.from() ^ range.to()); |
+ new_common_bits = ~SmearBitsRight(new_common_bits); |
+ common_bits &= new_common_bits; |
+ bits &= new_common_bits; |
+ uint32_t differing_bits = (range.from() & common_bits) ^ bits; |
+ common_bits ^= differing_bits; |
+ bits &= common_bits; |
+ } |
+ pos->mask = common_bits; |
+ pos->value = bits; |
} |
+ characters_filled_in++; |
+ ASSERT(characters_filled_in <= details->characters()); |
+ if (characters_filled_in == details->characters()) { |
+ return; |
+ } |
} |
} |
- // Second, handle straight character matches. |
- int checked_up_to = -1; |
- for (int i = element_count - 1; i >= 0; i--) { |
+ ASSERT(characters_filled_in != details->characters()); |
+ on_success()-> GetQuickCheckDetails(details, compiler, characters_filled_in); |
+} |
+ |
+ |
+void QuickCheckDetails::Clear() { |
+ for (int i = 0; i < characters_; i++) { |
+ positions_[i].mask = 0; |
+ positions_[i].value = 0; |
+ positions_[i].determines_perfectly = false; |
+ } |
+ characters_ = 0; |
+} |
+ |
+ |
+void QuickCheckDetails::Advance(int by, bool ascii) { |
+ ASSERT(by > 0); |
+ if (by >= characters_) { |
+ Clear(); |
+ return; |
+ } |
+ for (int i = 0; i < characters_ - by; i++) { |
+ positions_[i] = positions_[by + i]; |
+ } |
+ for (int i = characters_ - by; i < characters_; i++) { |
+ positions_[i].mask = 0; |
+ positions_[i].value = 0; |
+ positions_[i].determines_perfectly = false; |
+ } |
+ characters_ -= by; |
+ // We could change mask_ and value_ here but we would never advance unless |
+ // they had already been used in a check and they won't be used again because |
+ // it would gain us nothing. So there's no point. |
+} |
+ |
+ |
+void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { |
+ ASSERT(characters_ == other->characters_); |
+ for (int i = from_index; i < characters_; i++) { |
+ QuickCheckDetails::Position* pos = positions(i); |
+ QuickCheckDetails::Position* other_pos = other->positions(i); |
+ if (pos->mask != other_pos->mask || |
+ pos->value != other_pos->value || |
+ !other_pos->determines_perfectly) { |
+ // Our mask-compare operation will be approximate unless we have the |
+ // exact same operation on both sides of the alternation. |
+ pos->determines_perfectly = false; |
+ } |
+ pos->mask &= other_pos->mask; |
+ pos->value &= pos->mask; |
+ other_pos->value &= pos->mask; |
+ uc16 differing_bits = (pos->value ^ other_pos->value); |
+ pos->mask &= ~differing_bits; |
+ pos->value &= pos->mask; |
+ } |
+} |
+ |
+ |
+void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
+ RegExpCompiler* compiler, |
+ int characters_filled_in) { |
+ int choice_count = alternatives_->length(); |
+ ASSERT(choice_count > 0); |
+ alternatives_->at(0).node()->GetQuickCheckDetails(details, |
+ compiler, |
+ characters_filled_in); |
+ for (int i = 1; i < choice_count; i++) { |
+ QuickCheckDetails new_details(details->characters()); |
+ RegExpNode* node = alternatives_->at(i).node(); |
+ node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in); |
+ // Here we merge the quick match details of the two branches. |
+ details->Merge(&new_details, characters_filled_in); |
+ } |
+} |
+ |
+ |
+// We call this repeatedly to generate code for each pass over the text node. |
+// The passes are in increasing order of difficulty because we hope one |
+// of the first passes will fail in which case we are saved the work of the |
+// later passes. for example for the case independent regexp /%[asdfghjkl]a/ |
+// we will check the '%' in the first pass, the case independent 'a' in the |
+// second pass and the character class in the last pass. |
+// |
+// The passes are done from right to left, so for example to test for /bar/ |
+// we will first test for an 'r' with offset 2, then an 'a' with offset 1 |
+// and then a 'b' with offset 0. This means we can avoid the end-of-input |
+// bounds check most of the time. In the example we only need to check for |
+// end-of-input when loading the putative 'r'. |
+// |
+// A slight complication involves the fact that the first character may already |
+// be fetched into a register by the previous node. In this case we want to |
+// do the test for that character first. We do this in separate passes. The |
+// 'preloaded' argument indicates that we are doing such a 'pass'. If such a |
+// pass has been performed then subsequent passes will have true in |
+// first_element_checked to indicate that that character does not need to be |
+// checked again. |
+// |
+// In addition to all this we are passed a GenerationVariant, which can |
+// contain an AlternativeGeneration object. In this AlternativeGeneration |
+// object we can see details of any quick check that was already passed in |
+// order to get to the code we are now generating. The quick check can involve |
+// loading characters, which means we do not need to recheck the bounds |
+// up to the limit the quick check already checked. In addition the quick |
+// check can have involved a mask and compare operation which may simplify |
+// or obviate the need for further checks at some character positions. |
+void TextNode::TextEmitPass(RegExpCompiler* compiler, |
+ TextEmitPassType pass, |
+ bool preloaded, |
+ GenerationVariant* variant, |
+ bool first_element_checked, |
+ int* checked_up_to) { |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
+ bool ascii = compiler->ascii(); |
+ Label* backtrack = variant->backtrack(); |
+ QuickCheckDetails* quick_check = variant->quick_check_performed(); |
+ int element_count = elms_->length(); |
+ for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
- ASSERT(elm.cp_offset >= 0); |
int cp_offset = variant->cp_offset() + elm.cp_offset; |
if (elm.type == TextElement::ATOM) { |
- Vector<const uc16> quarks = elm.data.u_atom->data(); |
- int last_cp_offset = cp_offset + quarks.length(); |
- if (compiler->ignore_case()) { |
- EmitAtomNonLetters(macro_assembler, |
- elm, |
- quarks, |
- backtrack, |
- cp_offset, |
- checked_up_to < last_cp_offset); |
- } else { |
- macro_assembler->CheckCharacters(quarks, |
- cp_offset, |
- backtrack, |
- checked_up_to < last_cp_offset); |
+ if (pass == NON_ASCII_MATCH || |
+ pass == CHARACTER_MATCH || |
+ pass == CASE_CHARACTER_MATCH) { |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
+ bool bound_checked = true; // Most ops will check their bounds. |
+ if (first_element_checked && i == 0 && j == 0) continue; |
+ if (quick_check != NULL && |
+ elm.cp_offset + j < quick_check->characters() && |
+ quick_check->positions(elm.cp_offset + j)->determines_perfectly) { |
+ continue; |
+ } |
+ if (pass == NON_ASCII_MATCH) { |
+ ASSERT(ascii); |
+ if (quarks[j] > String::kMaxAsciiCharCode) { |
+ assembler->GoTo(backtrack); |
+ return; |
+ } |
+ } else if (pass == CHARACTER_MATCH) { |
+ if (compiler->ignore_case()) { |
+ bound_checked = EmitAtomNonLetter(assembler, |
+ quarks[j], |
+ backtrack, |
+ cp_offset + j, |
+ *checked_up_to < cp_offset + j, |
+ preloaded); |
+ } else { |
+ if (!preloaded) { |
+ assembler->LoadCurrentCharacter(cp_offset + j, |
+ backtrack, |
+ *checked_up_to < cp_offset + j); |
+ } |
+ assembler->CheckNotCharacter(quarks[j], backtrack); |
+ } |
+ } else { |
+ ASSERT_EQ(pass, CASE_CHARACTER_MATCH); |
+ ASSERT(compiler->ignore_case()); |
+ bound_checked = EmitAtomLetter(assembler, |
+ quarks[j], |
+ backtrack, |
+ cp_offset + j, |
+ *checked_up_to < cp_offset + j, |
+ preloaded); |
+ } |
+ if (pass != NON_ASCII_MATCH && bound_checked) { |
+ if (cp_offset + j > *checked_up_to) { |
+ *checked_up_to = cp_offset + j; |
+ } |
+ } |
+ } |
} |
- if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
} else { |
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
- } |
- } |
- // Third, handle case independent letter matches if any. |
- if (compiler->ignore_case()) { |
- for (int i = element_count - 1; i >= 0; i--) { |
- TextElement elm = elms_->at(i); |
- int cp_offset = variant->cp_offset() + elm.cp_offset; |
- if (elm.type == TextElement::ATOM) { |
- Vector<const uc16> quarks = elm.data.u_atom->data(); |
- int last_cp_offset = cp_offset + quarks.length(); |
- EmitAtomLetters(macro_assembler, |
- elm, |
- quarks, |
- backtrack, |
- cp_offset, |
- checked_up_to < last_cp_offset); |
- if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
+ if (first_element_checked && i == 0) continue; |
+ if (quick_check != NULL && |
+ elm.cp_offset < quick_check->characters() && |
+ quick_check->positions(elm.cp_offset)->determines_perfectly) { |
+ continue; |
} |
+ if (pass == CHARACTER_CLASS_MATCH) { |
+ RegExpCharacterClass* cc = elm.data.u_char_class; |
+ EmitCharClass(assembler, |
+ cc, |
+ cp_offset, |
+ backtrack, |
+ *checked_up_to < cp_offset, |
+ ascii, |
+ preloaded); |
+ if (cp_offset > *checked_up_to) { |
+ *checked_up_to = cp_offset; |
+ } |
+ } |
} |
} |
- // If the fast character matches passed then do the character classes. |
- for (int i = element_count - 1; i >= 0; i--) { |
- TextElement elm = elms_->at(i); |
- int cp_offset = variant->cp_offset() + elm.cp_offset; |
- if (elm.type == TextElement::CHAR_CLASS) { |
- RegExpCharacterClass* cc = elm.data.u_char_class; |
- EmitCharClass(macro_assembler, |
- cc, |
- cp_offset, |
- backtrack, |
- checked_up_to < cp_offset, |
- compiler->ascii()); |
- if (cp_offset > checked_up_to) checked_up_to = cp_offset; |
+} |
+ |
+ |
+int TextNode::Length() { |
+ TextElement elm = elms_->last(); |
+ ASSERT(elm.cp_offset >= 0); |
+ if (elm.type == TextElement::ATOM) { |
+ return elm.cp_offset + elm.data.u_atom->data().length(); |
+ } else { |
+ return elm.cp_offset + 1; |
+ } |
+} |
+ |
+ |
+// This generates the code to match a text node. A text node can contain |
+// straight character sequences (possibly to be matched in a case-independent |
+// way) and character classes. For efficiency we do not do this in a single |
+// pass from left to right. Instead we pass over the text node several times, |
+// emitting code for some character positions every time. See the comment on |
+// TextEmitPass for details. |
+bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
+ LimitResult limit_result = LimitVersions(compiler, variant); |
+ if (limit_result == FAIL) return false; |
+ if (limit_result == DONE) return true; |
+ ASSERT(limit_result == CONTINUE); |
+ |
+ if (info()->follows_word_interest || |
+ info()->follows_newline_interest || |
+ info()->follows_start_interest) { |
+ return false; |
+ } |
+ |
+ if (info()->at_end) { |
+ compiler->macro_assembler()->GoTo(variant->backtrack()); |
+ return true; |
+ } |
+ |
+ if (compiler->ascii()) { |
+ int dummy = 0; |
+ TextEmitPass(compiler, NON_ASCII_MATCH, false, variant, false, &dummy); |
+ } |
+ |
+ bool first_elt_done = false; |
+ int bound_checked_to = variant->cp_offset() - 1; |
+ QuickCheckDetails* quick_check = variant->quick_check_performed(); |
+ if (quick_check != NULL) { |
+ bound_checked_to += quick_check->characters(); |
+ } |
+ |
+ // If a character is preloaded into the current character register then |
+ // check that now. |
+ if (variant->characters_preloaded() == 1) { |
+ TextEmitPass(compiler, |
+ CHARACTER_MATCH, |
+ true, |
+ variant, |
+ false, |
+ &bound_checked_to); |
+ if (compiler->ignore_case()) { |
+ TextEmitPass(compiler, |
+ CASE_CHARACTER_MATCH, |
+ true, |
+ variant, |
+ false, |
+ &bound_checked_to); |
} |
+ TextEmitPass(compiler, |
+ CHARACTER_CLASS_MATCH, |
+ true, |
+ variant, |
+ false, |
+ &bound_checked_to); |
+ first_elt_done = true; |
} |
- GenerationVariant new_variant(*variant); |
- new_variant.set_cp_offset(checked_up_to + 1); |
+ TextEmitPass(compiler, |
+ CHARACTER_MATCH, |
+ false, |
+ variant, |
+ first_elt_done, |
+ &bound_checked_to); |
+ if (compiler->ignore_case()) { |
+ TextEmitPass(compiler, |
+ CASE_CHARACTER_MATCH, |
+ false, |
+ variant, |
+ first_elt_done, |
+ &bound_checked_to); |
+ } |
+ TextEmitPass(compiler, |
+ CHARACTER_CLASS_MATCH, |
+ false, |
+ variant, |
+ first_elt_done, |
+ &bound_checked_to); |
+ |
+ GenerationVariant successor_variant(*variant); |
+ successor_variant.AdvanceVariant(Length(), compiler->ascii()); |
RecursionCheck rc(compiler); |
- return on_success()->Emit(compiler, &new_variant); |
+ return on_success()->Emit(compiler, &successor_variant); |
} |
+void GenerationVariant::AdvanceVariant(int by, bool ascii) { |
+ ASSERT(by > 0); |
+ // We don't have an instruction for shifting the current character register |
+ // down or for using a shifted value for anything so lets just forget that |
+ // we preloaded any characters into it. |
+ characters_preloaded_ = 0; |
+ // Adjust the offsets of the quick check performed information. This |
+ // information is used to find out what we already determined about the |
+ // characters by means of mask and compare. |
+ quick_check_performed_.Advance(by, ascii); |
+ cp_offset_ += by; |
+} |
+ |
+ |
void TextNode::MakeCaseIndependent() { |
int element_count = elms_->length(); |
for (int i = 0; i < element_count; i++) { |
@@ -2110,6 +2558,156 @@ |
} |
+int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
+ int start_from_node) { |
+ bool ascii = compiler->ascii(); |
+ int preload_characters = ChoiceNode::EatsAtLeastHelper(0, start_from_node); |
+#ifdef CAN_READ_UNALIGNED |
+ if (ascii) { |
+ if (preload_characters > 4) preload_characters = 4; |
+ // We can't preload 3 characters because there is no machine instruction |
+ // to do that. We can't just load 4 because we could be reading |
+ // beyond the end of the string, which could cause a memory fault. |
+ if (preload_characters == 3) preload_characters = 2; |
+ } else { |
+ if (preload_characters > 2) preload_characters = 2; |
+ } |
+#else |
+ if (preload_characters > 1) preload_characters = 1; |
+#endif |
+ return preload_characters; |
+} |
+ |
+ |
+// This class is used when generating the alternatives in a choice node. It |
+// records the way the alternative is being code generated. |
+class AlternativeGeneration: public Malloced { |
+ public: |
+ AlternativeGeneration() |
+ : possible_success(), |
+ expects_preload(false), |
+ after(), |
+ quick_check_details() { } |
+ Label possible_success; |
+ bool expects_preload; |
+ Label after; |
+ QuickCheckDetails quick_check_details; |
+}; |
+ |
+ |
+// Creates a list of AlternativeGenerations. If the list has a reasonable |
+// size then it is on the stack, otherwise the excess is on the heap. |
+class AlternativeGenerationList { |
+ public: |
+ explicit AlternativeGenerationList(int count) |
+ : alt_gens_(count) { |
+ for (int i = 0; i < count && i < kAFew; i++) { |
+ alt_gens_.Add(a_few_alt_gens_ + i); |
+ } |
+ for (int i = kAFew; i < count; i++) { |
+ alt_gens_.Add(new AlternativeGeneration()); |
+ } |
+ } |
+ ~AlternativeGenerationList() { |
+ for (int i = 0; i < alt_gens_.length(); i++) { |
+ alt_gens_[i]->possible_success.Unuse(); |
+ alt_gens_[i]->after.Unuse(); |
+ } |
+ for (int i = kAFew; i < alt_gens_.length(); i++) { |
+ delete alt_gens_[i]; |
+ alt_gens_[i] = NULL; |
+ } |
+ } |
+ |
+ AlternativeGeneration* at(int i) { |
+ return alt_gens_[i]; |
+ } |
+ private: |
+ static const int kAFew = 10; |
+ ZoneList<AlternativeGeneration*> alt_gens_; |
+ AlternativeGeneration a_few_alt_gens_[kAFew]; |
+}; |
+ |
+ |
+/* Code generation for choice nodes. |
+ * |
+ * We generate quick checks that do a mask and compare to eliminate a |
+ * choice. If the quick check succeeds then it jumps to the continuation to |
+ * do slow checks and check subsequent nodes. If it fails (the common case) |
+ * it falls through to the next choice. |
+ * |
+ * Here is the desired flow graph. Nodes directly below each other imply |
+ * fallthrough. Alternatives 1 and 2 have quick checks. Alternative |
+ * 3 doesn't have a quick check so we have to call the slow check. |
+ * Nodes are marked Qn for quick checks and Sn for slow checks. The entire |
+ * regexp continuation is generated directly after the Sn node, up to the |
+ * next GoTo if we decide to reuse some already generated code. Some |
+ * nodes expect preload_characters to be preloaded into the current |
+ * character register. R nodes do this preloading. Vertices are marked |
+ * F for failures and S for success (possible success in the case of quick |
+ * nodes). L, V, < and > are used as arrow heads. |
+ * |
+ * ----------> R |
+ * | |
+ * V |
+ * Q1 -----> S1 |
+ * | S / |
+ * F| / |
+ * | F/ |
+ * | / |
+ * | R |
+ * | / |
+ * V L |
+ * Q2 -----> S2 |
+ * | S / |
+ * F| / |
+ * | F/ |
+ * | / |
+ * | R |
+ * | / |
+ * V L |
+ * S3 |
+ * | |
+ * F| |
+ * | |
+ * R |
+ * | |
+ * backtrack V |
+ * <----------Q4 |
+ * \ F | |
+ * \ |S |
+ * \ F V |
+ * \-----S4 |
+ * |
+ * For greedy loops we reverse our expectation and expect to match rather |
+ * than fail. Therefore we want the loop code to look like this (U is the |
+ * unwind code that steps back in the greedy loop). The following alternatives |
+ * look the same as above. |
+ * _____ |
+ * / \ |
+ * V | |
+ * ----------> S1 | |
+ * /| | |
+ * / |S | |
+ * F/ \_____/ |
+ * / |
+ * |<----------- |
+ * | \ |
+ * V \ |
+ * Q2 ---> S2 \ |
+ * | S / | |
+ * F| / | |
+ * | F/ | |
+ * | / | |
+ * | R | |
+ * | / | |
+ * F VL | |
+ * <------U | |
+ * back |S | |
+ * \______________/ |
+ */ |
+ |
+ |
bool ChoiceNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
int choice_count = alternatives_->length(); |
@@ -2136,7 +2734,8 @@ |
int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); |
bool greedy_loop = false; |
Label greedy_loop_label; |
- GenerationVariant counter_backtrack_variant(&greedy_loop_label); |
+ GenerationVariant counter_backtrack_variant; |
+ counter_backtrack_variant.set_backtrack(&greedy_loop_label); |
if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { |
// Here we have special handling for greedy loops containing only text nodes |
// and other simple nodes. These are handled by pushing the current |
@@ -2150,7 +2749,8 @@ |
macro_assembler->PushCurrentPosition(); |
current_variant = &counter_backtrack_variant; |
Label greedy_match_failed; |
- GenerationVariant greedy_match_variant(&greedy_match_failed); |
+ GenerationVariant greedy_match_variant; |
+ greedy_match_variant.set_backtrack(&greedy_match_failed); |
Label loop_label; |
macro_assembler->Bind(&loop_label); |
greedy_match_variant.set_stop_node(this); |
@@ -2167,32 +2767,70 @@ |
Label second_choice; // For use in greedy matches. |
macro_assembler->Bind(&second_choice); |
+ int first_normal_choice = greedy_loop ? 1 : 0; |
+ |
+ int preload_characters = CalculatePreloadCharacters(compiler, |
+ first_normal_choice); |
+ bool preload_is_current = false; |
+ bool preload_has_checked_bounds = false; |
+ |
+ AlternativeGenerationList alt_gens(choice_count); |
+ |
// For now we just call all choices one after the other. The idea ultimately |
// is to use the Dispatch table to try only the relevant ones. |
- for (int i = greedy_loop ? 1 : 0; i < choice_count - 1; i++) { |
+ for (int i = first_normal_choice; i < choice_count; i++) { |
GuardedAlternative alternative = alternatives_->at(i); |
- Label after; |
+ AlternativeGeneration* alt_gen(alt_gens.at(i)); |
+ alt_gen->quick_check_details.set_characters(preload_characters); |
ZoneList<Guard*>* guards = alternative.guards(); |
int guard_count = (guards == NULL) ? 0 : guards->length(); |
+ |
GenerationVariant new_variant(*current_variant); |
- new_variant.set_backtrack(&after); |
- for (int j = 0; j < guard_count; j++) { |
- GenerateGuard(macro_assembler, guards->at(j), &new_variant); |
+ new_variant.set_characters_preloaded(preload_is_current ? |
+ preload_characters : |
+ 0); |
+ new_variant.quick_check_performed()->Clear(); |
+ alt_gen->expects_preload = preload_is_current; |
+ bool generate_full_check_inline = false; |
+ if (alternative.node()->EmitQuickCheck(compiler, |
+ &new_variant, |
+ preload_has_checked_bounds, |
+ &alt_gen->possible_success, |
+ &alt_gen->quick_check_details, |
+ i < choice_count - 1)) { |
+ // Quick check was generated for this choice. |
+ preload_is_current = true; |
+ preload_has_checked_bounds = true; |
+ // On the last choice in the ChoiceNode we generated the quick |
+ // check to fall through on possible success. So now we need to |
+ // generate the full check inline. |
+ if (i == choice_count - 1) { |
+ macro_assembler->Bind(&alt_gen->possible_success); |
+ new_variant.set_quick_check_performed(&alt_gen->quick_check_details); |
+ generate_full_check_inline = true; |
+ } |
+ } else { |
+ // No quick check was generated. Put the full code here. |
+ if (i < choice_count - 1) { |
+ new_variant.set_backtrack(&alt_gen->after); |
+ } |
+ generate_full_check_inline = true; |
} |
- if (!alternative.node()->Emit(compiler, &new_variant)) { |
- after.Unuse(); |
- return false; |
+ if (generate_full_check_inline) { |
+ if (preload_is_current) { |
+ new_variant.set_characters_preloaded(preload_characters); |
+ } |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), &new_variant); |
+ } |
+ if (!alternative.node()->Emit(compiler, &new_variant)) { |
+ greedy_loop_label.Unuse(); |
+ return false; |
+ } |
+ preload_is_current = false; |
} |
- macro_assembler->Bind(&after); |
+ macro_assembler->Bind(&alt_gen->after); |
} |
- GuardedAlternative alternative = alternatives_->at(choice_count - 1); |
- ZoneList<Guard*>* guards = alternative.guards(); |
- int guard_count = (guards == NULL) ? 0 : guards->length(); |
- for (int j = 0; j < guard_count; j++) { |
- GenerateGuard(macro_assembler, guards->at(j), current_variant); |
- } |
- bool ok = alternative.node()->Emit(compiler, current_variant); |
- if (!ok) return false; |
if (greedy_loop) { |
macro_assembler->Bind(&greedy_loop_label); |
// If we have unwound to the bottom then backtrack. |
@@ -2201,12 +2839,68 @@ |
macro_assembler->AdvanceCurrentPosition(-text_length); |
macro_assembler->GoTo(&second_choice); |
} |
+ // At this point we need to generate slow checks for the alternatives where |
+ // the quick check was inlined. We can recognize these because the associated |
+ // label was bound. |
+ for (int i = first_normal_choice; i < choice_count - 1; i++) { |
+ AlternativeGeneration* alt_gen = alt_gens.at(i); |
+ if (!EmitOutOfLineContinuation(compiler, |
+ current_variant, |
+ alternatives_->at(i), |
+ alt_gen, |
+ preload_characters, |
+ alt_gens.at(i + 1)->expects_preload)) { |
+ return false; |
+ } |
+ } |
return true; |
} |
+bool ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, |
+ GenerationVariant* variant, |
+ GuardedAlternative alternative, |
+ AlternativeGeneration* alt_gen, |
+ int preload_characters, |
+ bool next_expects_preload) { |
+ if (!alt_gen->possible_success.is_linked()) return true; |
+ |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
+ macro_assembler->Bind(&alt_gen->possible_success); |
+ GenerationVariant out_of_line_variant(*variant); |
+ out_of_line_variant.set_characters_preloaded(preload_characters); |
+ out_of_line_variant.set_quick_check_performed(&alt_gen->quick_check_details); |
+ ZoneList<Guard*>* guards = alternative.guards(); |
+ int guard_count = (guards == NULL) ? 0 : guards->length(); |
+ if (next_expects_preload) { |
+ Label reload_current_char; |
+ out_of_line_variant.set_backtrack(&reload_current_char); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), &out_of_line_variant); |
+ } |
+ bool ok = alternative.node()->Emit(compiler, &out_of_line_variant); |
+ macro_assembler->Bind(&reload_current_char); |
+ // Reload the current character, since the next quick check expects that. |
+ // We don't need to check bounds here because we only get into this |
+ // code through a quick check which already did the checked load. |
+ macro_assembler->LoadCurrentCharacter(variant->cp_offset(), |
+ NULL, |
+ false, |
+ preload_characters); |
+ macro_assembler->GoTo(&(alt_gen->after)); |
+ return ok; |
+ } else { |
+ out_of_line_variant.set_backtrack(&(alt_gen->after)); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), &out_of_line_variant); |
+ } |
+ return alternative.node()->Emit(compiler, &out_of_line_variant); |
+ } |
+} |
+ |
+ |
bool ActionNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
- RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
LimitResult limit_result = LimitVersions(compiler, variant); |
if (limit_result == DONE) return true; |
if (limit_result == FAIL) return false; |
@@ -2238,9 +2932,9 @@ |
} |
case BEGIN_SUBMATCH: |
if (!variant->is_trivial()) return variant->Flush(compiler, this); |
- macro->WriteCurrentPositionToRegister( |
+ assembler->WriteCurrentPositionToRegister( |
data_.u_submatch.current_position_register, 0); |
- macro->WriteStackPointerToRegister( |
+ assembler->WriteStackPointerToRegister( |
data_.u_submatch.stack_pointer_register); |
return on_success()->Emit(compiler, variant); |
case POSITIVE_SUBMATCH_SUCCESS: |
@@ -2255,13 +2949,13 @@ |
Label at_end; |
// Load current character jumps to the label if we are beyond the string |
// end. |
- macro->LoadCurrentCharacter(0, &at_end); |
- macro->GoTo(variant->backtrack()); |
- macro->Bind(&at_end); |
+ assembler->LoadCurrentCharacter(0, &at_end); |
+ assembler->GoTo(variant->backtrack()); |
+ assembler->Bind(&at_end); |
} |
- macro->ReadCurrentPositionFromRegister( |
+ assembler->ReadCurrentPositionFromRegister( |
data_.u_submatch.current_position_register); |
- macro->ReadStackPointerFromRegister( |
+ assembler->ReadStackPointerFromRegister( |
data_.u_submatch.stack_pointer_register); |
return on_success()->Emit(compiler, variant); |
default: |
@@ -2273,7 +2967,7 @@ |
bool BackReferenceNode::Emit(RegExpCompiler* compiler, |
GenerationVariant* variant) { |
- RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
if (!variant->is_trivial()) { |
return variant->Flush(compiler, this); |
} |
@@ -2289,12 +2983,15 @@ |
if (info()->at_end) { |
// If we are constrained to match at the end of the input then succeed |
// iff the back reference is empty. |
- macro->CheckNotRegistersEqual(start_reg_, end_reg_, variant->backtrack()); |
+ assembler->CheckNotRegistersEqual(start_reg_, |
+ end_reg_, |
+ variant->backtrack()); |
} else { |
if (compiler->ignore_case()) { |
- macro->CheckNotBackReferenceIgnoreCase(start_reg_, variant->backtrack()); |
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_, |
+ variant->backtrack()); |
} else { |
- macro->CheckNotBackReference(start_reg_, variant->backtrack()); |
+ assembler->CheckNotBackReference(start_reg_, variant->backtrack()); |
} |
} |
return on_success()->Emit(compiler, variant); |