Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 927) |
+++ src/jsregexp.cc (working copy) |
@@ -242,7 +242,8 @@ |
RegExpEngine::Compile(&parse_result, |
&node, |
flags.is_ignore_case(), |
- flags.is_multiline()); |
+ flags.is_multiline(), |
+ pattern); |
if (irregexp_data.is_null()) { |
if (FLAG_disable_jscre) { |
UNIMPLEMENTED(); |
@@ -858,8 +859,156 @@ |
// ------------------------------------------------------------------- |
// Implmentation of the Irregexp regular expression engine. |
+// |
+// The Irregexp regular expression engine is intended to be a complete |
+// implementation of ECMAScript regular expressions. It generates either |
+// bytecodes or native code. |
+// The Irregexp regexp engine is structured in three steps. |
+// 1) The parser generates an abstract syntax tree. See ast.cc. |
+// 2) From the AST a node network is created. The nodes are all |
+// subclasses of RegExpNode. The nodes represent states when |
+// executing a regular expression. Several optimizations are |
+// performed on the node network. |
+// 3) From the nodes we generate either byte codes or native code |
+// that can actually execute the regular expression (perform |
+// the search). The code generation step is described in more |
+// detail below. |
+// Code generation. |
+// |
+// The nodes are divided into four main categories. |
+// * Choice nodes |
+// These represent places where the regular expression can |
+// match in more than one way. For example on entry to an |
+// alternation (foo|bar) or a repetition (*, +, ? or {}). |
+// * Action nodes |
+// These represent places where some action should be |
+// performed. Examples include recording the current position |
+// in the input string to a register (in order to implement |
+// captures) or other actions on register for example in order |
+// to implement the counters needed for {} repetitions. |
+// * Matching nodes |
+// These attempt to match some element part of the input string. |
+// Examples of elements include character classes, plain strings |
+// or back references. |
+// * End nodes |
+// These are used to implement the actions required on finding |
+// a successful match or failing to find a match. |
+// |
+// The code generated (whether as byte codes or native code) maintains |
+// some state as it runs. This consists of the following elements: |
+// |
+// * The capture registers. Used for string captures. |
+// * Other registers. Used for counters etc. |
+// * The current position. |
+// * The stack of backtracking information. Used when a matching node |
+// fails to find a match and needs to try an alternative. |
+// |
+// Conceptual regular expression execution model: |
+// |
+// There is a simple conceptual model of regular expression execution |
+// which will be presented first. The actual code generated is a more |
+// efficient simulation of the simple conceptual model: |
+// |
+// * Choice nodes are implemented as follows: |
+// For each choice except the last { |
+// push current position |
+// push backtrack code location |
+// <generate code to test for choice> |
+// backtrack code location: |
+// pop current position |
+// } |
+// <generate code to test for last choice> |
+// |
+// * Actions nodes are generated as follows |
+// <push affected registers on backtrack stack> |
+// <generate code to perform action> |
+// push backtrack code location |
+// <generate code to test for following nodes> |
+// backtrack code location: |
+// <pop affected registers to restore their state> |
+// <pop backtrack location from stack and go to it> |
+// |
+// * Matching nodes are generated as follows: |
+// if input string matches at current position |
+// update current position |
+// <generate code to test for following nodes> |
+// else |
+// <pop backtrack location from stack and go to it> |
+// |
+// Thus it can be seen that the current position is saved and restored |
+// by the choice nodes, whereas the registers are saved and restored by |
+// by the action nodes that manipulate them. |
+// |
+// The other interesting aspect of this model is that nodes are generated |
+// at the point where they are needed by a recursive call to Emit(). If |
+// the node has already been code generated then the Emit() call will |
+// generate a jump to the previously generated code instead. In order to |
+// limit recursion it is possible for the Emit() function to put the node |
+// on a work list for later generation and instead generate a jump. The |
+// destination of the jump is resolved later when the code is generated. |
+// |
+// Actual regular expression code generation. |
+// |
+// Code generation is actually more complicated than the above. In order |
+// to improve the efficiency of the generated code some optimizations are |
+// performed |
+// |
+// * Choice nodes have 1-character lookahead. |
+// A choice node looks at the following character and eliminates some of |
+// the choices immediately based on that character. This is not yet |
+// implemented. |
+// * Simple greedy loops store reduced backtracking information. |
+// A quantifier like /.*foo/m will greedily match the whole input. It will |
+// then need to backtrack to a point where it can match "foo". The naive |
+// implementation of this would push each character position onto the |
+// backtracking stack, then pop them off one by one. This would use space |
+// proportional to the length of the input string. However since the "." |
+// can only match in one way and always has a constant length (in this case |
+// of 1) it suffices to store the current position on the top of the stack |
+// once. Matching now becomes merely incrementing the current position and |
+// backtracking becomes decrementing the current position and checking the |
+// result against the stored current position. This is faster and saves |
+// space. |
+// * The current state is virtualized. |
+// This is used to defer expensive operations until it is clear that they |
+// are needed and to generate code for a node more than once, allowing |
+// specialized an efficient versions of the code to be created. This is |
+// explained in the section below. |
+// |
+// Execution state virtualization. |
+// |
+// Instead of emitting code, nodes that manipulate the state can record their |
+// manipulation in an object called the GenerationVariant. The |
+// GenerationVariant object can record a current position offset, an |
+// optional backtrack code location on the top of the virtualized backtrack |
+// stack and some register changes. When a node is to be emitted it can flush |
+// the GenerationVariant or update it. Flushing the GenerationVariant |
+// will emit code to bring the actual state into line with the virtual state. |
+// Avoiding flushing the state can postpone some work (eg updates of capture |
+// registers). Postponing work can save time when executing the regular |
+// expression since it may be found that the work never has to be done as a |
+// failure to match can occur. In addition it is much faster to jump to a |
+// known backtrack code location than it is to pop an unknown backtrack |
+// location from the stack and jump there. |
+// |
+// The virtual state found in the GenerationVariant affects code generation. |
+// For example the virtual state contains the difference between the actual |
+// current position and the virtual current position, and matching code needs |
+// to use this offset to attempt a match in the correct location of the input |
+// string. Therefore code generated for a non-trivial GenerationVariant is |
+// specialized to that GenerationVariant. The code generator therefore |
+// has the ability to generate code for each node several times. In order to |
+// limit the size of the generated code there is an arbitrary limit on how |
+// many specialized sets of code may be generated for a given node. If the |
+// limit is reached, the GenerationVariant is flushed and a generic version of |
+// the code for a node is emitted. This is subsequently used for that node. |
+// The code emitted for non-generic GenerationVariants is not recorded in the |
+// node and so it cannot currently be reused in the event that code generation |
+// is requested for an identical GenerationVariant. |
Christian Plesner Hansen
2008/12/08 08:54:19
Most excellent!
|
+ |
+ |
void RegExpTree::AppendToText(RegExpText* text) { |
UNREACHABLE(); |
} |
@@ -914,7 +1063,8 @@ |
Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler, |
RegExpNode* start, |
- int capture_count); |
+ int capture_count, |
+ Handle<String> pattern); |
inline void AddWork(RegExpNode* node) { work_list_->Add(node); } |
@@ -924,7 +1074,6 @@ |
RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } |
EndNode* accept() { return accept_; } |
- EndNode* backtrack() { return backtrack_; } |
static const int kMaxRecursion = 100; |
inline int recursion_depth() { return recursion_depth_; } |
@@ -935,7 +1084,6 @@ |
private: |
EndNode* accept_; |
- EndNode* backtrack_; |
int next_register_; |
List<RegExpNode*>* work_list_; |
int recursion_depth_; |
@@ -944,6 +1092,17 @@ |
}; |
+class RecursionCheck { |
+ public: |
+ explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
+ compiler->IncrementRecursionDepth(); |
+ } |
+ ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
+ private: |
+ RegExpCompiler* compiler_; |
+}; |
+ |
+ |
// Attempts to compile the regexp using an Irregexp code generator. Returns |
// a fixed array or a null handle depending on whether it succeeded. |
RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) |
@@ -952,14 +1111,14 @@ |
recursion_depth_(0), |
ignore_case_(ignore_case) { |
accept_ = new EndNode(EndNode::ACCEPT); |
- backtrack_ = new EndNode(EndNode::BACKTRACK); |
} |
Handle<FixedArray> RegExpCompiler::Assemble( |
RegExpMacroAssembler* macro_assembler, |
RegExpNode* start, |
- int capture_count) { |
+ int capture_count, |
+ Handle<String> pattern) { |
#ifdef DEBUG |
if (FLAG_trace_regexp_assembler) |
macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); |
@@ -969,19 +1128,19 @@ |
List <RegExpNode*> work_list(0); |
work_list_ = &work_list; |
Label fail; |
- macro_assembler_->PushBacktrack(&fail); |
- if (!start->GoTo(this)) { |
+ macro_assembler->PushBacktrack(&fail); |
+ GenerationVariant generic_variant; |
+ if (!start->Emit(this, &generic_variant)) { |
fail.Unuse(); |
return Handle<FixedArray>::null(); |
} |
+ macro_assembler_->Bind(&fail); |
+ macro_assembler_->Fail(); |
while (!work_list.is_empty()) { |
- if (!work_list.RemoveLast()->GoTo(this)) { |
- fail.Unuse(); |
+ if (!work_list.RemoveLast()->Emit(this, &generic_variant)) { |
return Handle<FixedArray>::null(); |
} |
} |
- macro_assembler_->Bind(&fail); |
- macro_assembler_->Fail(); |
Handle<FixedArray> array = |
Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength); |
array->set(RegExpImpl::kIrregexpImplementationIndex, |
@@ -990,7 +1149,7 @@ |
Smi::FromInt(next_register_)); |
array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex, |
Smi::FromInt(capture_count)); |
- Handle<Object> code = macro_assembler_->GetCode(); |
+ Handle<Object> code = macro_assembler_->GetCode(pattern); |
array->set(RegExpImpl::kIrregexpCodeIndex, *code); |
work_list_ = NULL; |
#ifdef DEBUG |
@@ -1002,71 +1161,217 @@ |
} |
-bool RegExpNode::GoTo(RegExpCompiler* compiler) { |
- // TODO(erikcorry): Implement support. |
- if (info_.follows_word_interest || |
- info_.follows_newline_interest || |
- info_.follows_start_interest) { |
- return false; |
+bool GenerationVariant::mentions_reg(int reg) { |
+ for (DeferredAction* action = actions_; |
+ action != NULL; |
+ action = action->next()) { |
+ if (reg == action->reg()) return true; |
} |
- if (label_.is_bound()) { |
- compiler->macro_assembler()->GoTo(&label_); |
- return true; |
- } else { |
- if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) { |
- compiler->macro_assembler()->GoTo(&label_); |
- compiler->AddWork(this); |
- return true; |
+ return false; |
+} |
+ |
+ |
+int GenerationVariant::FindAffectedRegisters(OutSet* affected_registers) { |
+ int max_register = -1; |
+ for (DeferredAction* action = actions_; |
+ action != NULL; |
+ action = action->next()) { |
+ affected_registers->Set(action->reg()); |
+ if (action->reg() > max_register) max_register = action->reg(); |
+ } |
+ return max_register; |
+} |
+ |
+ |
+void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* macro, |
+ int max_register, |
+ OutSet& affected_registers) { |
+ for (int reg = 0; reg <= max_register; reg++) { |
+ if (affected_registers.Get(reg)) macro->PushRegister(reg); |
+ } |
+} |
+ |
+ |
+void GenerationVariant::RestoreAffectedRegisters(RegExpMacroAssembler* macro, |
+ int max_register, |
+ OutSet& affected_registers) { |
+ for (int reg = max_register; reg >= 0; reg--) { |
+ if (affected_registers.Get(reg)) macro->PopRegister(reg); |
+ } |
+} |
+ |
+ |
+void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* macro, |
+ int max_register, |
Christian Plesner Hansen
2008/12/08 08:54:19
Indentation off.
|
+ OutSet& affected_registers) { |
+ for (int reg = 0; reg <= max_register; reg++) { |
+ if (!affected_registers.Get(reg)) { |
+ continue; |
+ } |
+ int value = 0; |
+ bool absolute = false; |
+ int store_position = -1; |
+ // This is a little tricky because we are scanning the actions in reverse |
+ // historical order (newest first). |
+ for (DeferredAction* action = actions_; |
+ action != NULL; |
+ action = action->next()) { |
+ if (action->reg() == reg) { |
+ switch (action->type()) { |
+ case ActionNode::SET_REGISTER: { |
+ GenerationVariant::DeferredSetRegister* psr = |
+ static_cast<GenerationVariant::DeferredSetRegister*>(action); |
+ value += psr->value(); |
+ absolute = true; |
+ ASSERT_EQ(store_position, -1); |
+ break; |
+ } |
+ case ActionNode::INCREMENT_REGISTER: |
+ if (!absolute) { |
+ value++; |
+ } |
+ ASSERT_EQ(store_position, -1); |
+ break; |
+ case ActionNode::STORE_POSITION: { |
+ GenerationVariant::DeferredCapture* pc = |
+ static_cast<GenerationVariant::DeferredCapture*>(action); |
+ if (store_position == -1) { |
+ store_position = pc->cp_offset(); |
+ } |
+ ASSERT(!absolute); |
+ ASSERT_EQ(value, 0); |
+ break; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ break; |
+ } |
+ } |
+ } |
+ if (store_position != -1) { |
+ macro->WriteCurrentPositionToRegister(reg, store_position); |
} else { |
- compiler->IncrementRecursionDepth(); |
- bool how_it_went = Emit(compiler); |
- compiler->DecrementRecursionDepth(); |
- return how_it_went; |
+ if (absolute) { |
+ macro->SetRegister(reg, value); |
+ } else { |
+ if (value != 0) { |
+ macro->AdvanceRegister(reg, value); |
+ } |
+ } |
} |
} |
} |
-// EndNodes are special. Because they can be very common and they are very |
-// short we normally inline them. That is, if we are asked to emit a GoTo |
-// we just emit the entire node. Since they don't have successors this |
-// works. |
-bool EndNode::GoTo(RegExpCompiler* compiler) { |
- if (info()->follows_word_interest || |
- info()->follows_newline_interest || |
- info()->follows_start_interest) { |
- return false; |
+// This is called as we come into a loop choice node and some other tricky |
+// nodes. It normalises the state of the code generator to ensure we can |
+// generate generic code. |
+bool GenerationVariant::Flush(RegExpCompiler* compiler, RegExpNode* successor) { |
+ RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ |
+ ASSERT(actions_ != NULL || cp_offset_ != 0 || backtrack() != NULL); |
+ |
+ if (actions_ == NULL && backtrack() == NULL) { |
+ // Here we just have some deferred cp advances to fix and we are back to |
+ // a normal situation. |
+ macro->AdvanceCurrentPosition(cp_offset_); |
+ // Create a new trivial state and generate the node with that. |
+ GenerationVariant new_state; |
+ return successor->Emit(compiler, &new_state); |
} |
- return Emit(compiler); |
+ |
+ // Generate deferred actions here along with code to undo them again. |
+ OutSet affected_registers; |
+ int max_register = FindAffectedRegisters(&affected_registers); |
+ PushAffectedRegisters(macro, max_register, affected_registers); |
+ PerformDeferredActions(macro, max_register, affected_registers); |
+ if (backtrack() != NULL) { |
+ // Here we have a concrete backtrack location. These are set up by choice |
+ // nodes and so they indicate that we have a deferred save of the current |
+ // position which we may need to emit here. |
+ macro->PushCurrentPosition(); |
+ } |
+ if (cp_offset_ != 0) { |
+ macro->AdvanceCurrentPosition(cp_offset_); |
+ } |
+ |
+ // Create a new trivial state and generate the node with that. |
Christian Plesner Hansen
2008/12/08 08:54:19
trivial -> empty?
|
+ Label undo; |
+ macro->PushBacktrack(&undo); |
+ GenerationVariant new_state; |
+ bool ok = successor->Emit(compiler, &new_state); |
+ |
+ // On backtrack we need to restore state. |
+ macro->Bind(&undo); |
+ if (!ok) return false; |
+ if (backtrack() != NULL) { |
+ macro->PopCurrentPosition(); |
+ } |
+ RestoreAffectedRegisters(macro, max_register, affected_registers); |
+ if (backtrack() == NULL) { |
+ macro->Backtrack(); |
+ } else { |
+ macro->GoTo(backtrack()); |
+ } |
+ |
+ return true; |
} |
-Label* RegExpNode::label() { |
- return &label_; |
+void EndNode::EmitInfoChecks(RegExpMacroAssembler* macro, |
+ GenerationVariant* variant) { |
+ if (info()->at_end) { |
+ Label succeed; |
+ // LoadCurrentCharacter will go to the label if we are at the end of the |
+ // input string. |
+ macro->LoadCurrentCharacter(0, &succeed); |
+ macro->GoTo(variant->backtrack()); |
+ macro->Bind(&succeed); |
+ } |
} |
-bool EndNode::Emit(RegExpCompiler* compiler) { |
+bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, |
+ GenerationVariant* variant) { |
+ if (!variant->is_trivial()) { |
+ return variant->Flush(compiler, this); |
+ } |
RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ if (!label()->is_bound()) { |
+ macro->Bind(label()); |
+ } |
+ EmitInfoChecks(macro, variant); |
+ macro->ReadCurrentPositionFromRegister(current_position_register_); |
+ macro->ReadStackPointerFromRegister(stack_pointer_register_); |
+ // Now that we have unwound the stack we find at the top of the stack the |
+ // backtrack that the BeginSubmatch node got. |
+ macro->Backtrack(); |
+ return true; |
+} |
+ |
+ |
+bool EndNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
+ if (!variant->is_trivial()) { |
+ return variant->Flush(compiler, this); |
+ } |
+ RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ if (!label()->is_bound()) { |
+ macro->Bind(label()); |
+ } |
switch (action_) { |
case ACCEPT: |
- if (!label()->is_bound()) Bind(macro); |
- if (info()->at_end) { |
- Label succeed; |
- // LoadCurrentCharacter will go to the label if we are at the end of the |
- // input string. |
- macro->LoadCurrentCharacter(0, &succeed); |
- macro->Backtrack(); |
- macro->Bind(&succeed); |
- } |
+ EmitInfoChecks(macro, variant); |
macro->Succeed(); |
return true; |
case BACKTRACK: |
- if (!label()->is_bound()) Bind(macro); |
ASSERT(!info()->at_end); |
- macro->Backtrack(); |
+ macro->GoTo(variant->backtrack()); |
return true; |
+ case NEGATIVE_SUBMATCH_SUCCESS: |
+ // This case is handled in a different virtual method. |
+ UNREACHABLE(); |
} |
+ UNIMPLEMENTED(); |
return false; |
} |
@@ -1078,10 +1383,10 @@ |
} |
-ActionNode* ActionNode::StoreRegister(int reg, |
- int val, |
- RegExpNode* on_success) { |
- ActionNode* result = new ActionNode(STORE_REGISTER, on_success); |
+ActionNode* ActionNode::SetRegister(int reg, |
+ int val, |
+ RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(SET_REGISTER, on_success); |
result->data_.u_store_register.reg = reg; |
result->data_.u_store_register.value = val; |
return result; |
@@ -1102,13 +1407,6 @@ |
} |
-ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { |
- ActionNode* result = new ActionNode(RESTORE_POSITION, on_success); |
- result->data_.u_position_register.reg = reg; |
- return result; |
-} |
- |
- |
ActionNode* ActionNode::BeginSubmatch(int stack_reg, |
int position_reg, |
RegExpNode* on_success) { |
@@ -1119,17 +1417,12 @@ |
} |
-ActionNode* ActionNode::EscapeSubmatch(int stack_reg, |
- bool restore_position, |
- int position_reg, |
- RegExpNode* on_success) { |
- ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success); |
+ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, |
+ int position_reg, |
+ RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); |
result->data_.u_submatch.stack_pointer_register = stack_reg; |
- if (restore_position) { |
- result->data_.u_submatch.current_position_register = position_reg; |
- } else { |
- result->data_.u_submatch.current_position_register = -1; |
- } |
+ result->data_.u_submatch.current_position_register = position_reg; |
return result; |
} |
@@ -1147,14 +1440,20 @@ |
void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, |
- Guard* guard, |
- Label* on_failure) { |
+ Guard* guard, |
Christian Plesner Hansen
2008/12/08 08:54:19
Indentation.
|
+ GenerationVariant* variant) { |
switch (guard->op()) { |
case Guard::LT: |
- macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure); |
+ ASSERT(!variant->mentions_reg(guard->reg())); |
+ macro_assembler->IfRegisterGE(guard->reg(), |
+ guard->value(), |
+ variant->backtrack()); |
break; |
case Guard::GEQ: |
- macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure); |
+ ASSERT(!variant->mentions_reg(guard->reg())); |
+ macro_assembler->IfRegisterLT(guard->reg(), |
+ guard->value(), |
+ variant->backtrack()); |
break; |
} |
} |
@@ -1169,13 +1468,22 @@ |
TextElement elm, |
Vector<const uc16> quarks, |
Label* on_failure, |
- int cp_offset) { |
+ int cp_offset, |
+ bool check_offset) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ // It is vital that this loop is backwards due to the unchecked character |
+ // load below. |
for (int i = quarks.length() - 1; i >= 0; i--) { |
uc16 c = quarks[i]; |
int length = uncanonicalize.get(c, '\0', chars); |
if (length <= 1) { |
- macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ if (check_offset && i == quarks.length() - 1) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ } else { |
+ // Here we don't need to check against the end of the input string |
+ // since this character lies before a character that matched. |
+ macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
+ } |
macro_assembler->CheckNotCharacter(c, on_failure); |
} |
} |
@@ -1216,13 +1524,22 @@ |
TextElement elm, |
Vector<const uc16> quarks, |
Label* on_failure, |
- int cp_offset) { |
+ int cp_offset, |
+ bool check_offset) { |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ // It is vital that this loop is backwards due to the unchecked character |
+ // load below. |
for (int i = quarks.length() - 1; i >= 0; i--) { |
uc16 c = quarks[i]; |
int length = uncanonicalize.get(c, '\0', chars); |
if (length <= 1) continue; |
- macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ if (check_offset && i == quarks.length() - 1) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ } else { |
+ // Here we don't need to check against the end of the input string |
+ // since this character lies before a character that matched. |
+ macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
+ } |
Label ok; |
ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
switch (length) { |
@@ -1259,10 +1576,8 @@ |
static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
RegExpCharacterClass* cc, |
int cp_offset, |
- Label* on_failure) { |
- macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
- cp_offset++; |
- |
+ Label* on_failure, |
+ bool check_offset) { |
ZoneList<CharacterRange>* ranges = cc->ranges(); |
Label success; |
@@ -1279,6 +1594,27 @@ |
return; |
} |
+ if (range_count == 1 && |
+ !cc->is_negated() && |
+ ranges->at(0).from() == 0 && |
Christian Plesner Hansen
2008/12/08 08:54:19
Maybe add an IsEverything method on CharacterRange
|
+ ranges->at(0).to() == 0xffff) { |
+ // This is a common case hit by non-anchored expressions. |
+ // TODO(erikcorry): We should have a macro assembler instruction that just |
+ // checks for end of string without loading the character. |
+ if (check_offset) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
+ } |
+ return; |
+ } |
+ |
+ if (check_offset) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
+ } else { |
+ // Here we don't need to check against the end of the input string |
+ // since this character lies before a character that matched. |
+ macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); |
+ } |
+ |
for (int i = 0; i < range_count - 1; i++) { |
CharacterRange& range = ranges->at(i); |
Label next_range; |
@@ -1333,73 +1669,143 @@ |
} |
+RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, |
+ GenerationVariant* variant) { |
+ // TODO(erikcorry): Implement support. |
+ if (info_.follows_word_interest || |
+ info_.follows_newline_interest || |
+ info_.follows_start_interest) { |
+ return FAIL; |
+ } |
-bool TextNode::Emit(RegExpCompiler* compiler) { |
+ // If we are generating a greedy loop then don't stop and don't reuse code. |
+ if (variant->stop_node() != NULL) { |
+ return CONTINUE; |
+ } |
+ |
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
- Bind(macro_assembler); |
+ if (variant->is_trivial()) { |
+ if (label_.is_bound()) { |
+ // We are being asked to generate a generic version, but that's already |
+ // been done so just go to it. |
+ macro_assembler->GoTo(&label_); |
+ return DONE; |
+ } |
+ if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { |
+ // To avoid too deep recursion we push the node to the work queue and just |
+ // generate a goto here. |
+ compiler->AddWork(this); |
+ macro_assembler->GoTo(&label_); |
+ return DONE; |
+ } |
+ // Generate generic version of the node and bind the label for later use. |
+ macro_assembler->Bind(&label_); |
+ return CONTINUE; |
+ } |
+ |
+ // We are being asked to make a non-generic version. Keep track of how many |
+ // non-generic versions we generate so as not to overdo it. |
+ variants_generated_++; |
+ if (variants_generated_ < kMaxVariantsGenerated && |
+ compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { |
+ return CONTINUE; |
+ } |
+ |
+ // If we get here there have been too many variants generated or recursion |
+ // is too deep. Time to switch to a generic version. The code for |
+ // generic versions above can handle deep recursion properly. |
+ bool ok = variant->Flush(compiler, this); |
+ return ok ? DONE : FAIL; |
+} |
+ |
+ |
+// This generates the code to match a text node. A text node can contain |
+// straight character sequences (possibly to be matched in a case-independent |
+// way) and character classes. In order to be most efficient we test for the |
+// simple things first and then move on to the more complicated things. The |
+// simplest thing is a non-letter or a letter if we are matching case. The |
+// next-most simple thing is a case-independent letter. The least simple is |
+// a character class. Another optimization is that we test the last one first. |
+// If that succeeds we don't need to test for the end of the string when we |
+// load other characters. |
+bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
+ Label *backtrack = variant->backtrack(); |
+ LimitResult limit_result = LimitVersions(compiler, variant); |
+ if (limit_result == FAIL) return false; |
+ if (limit_result == DONE) return true; |
+ ASSERT(limit_result == CONTINUE); |
+ |
int element_count = elms_->length(); |
ASSERT(element_count != 0); |
- int cp_offset = 0; |
if (info()->at_end) { |
- macro_assembler->Backtrack(); |
+ macro_assembler->GoTo(backtrack); |
return true; |
} |
// First, handle straight character matches. |
- for (int i = 0; i < element_count; i++) { |
+ int checked_up_to = -1; |
+ for (int i = element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
+ ASSERT(elm.cp_offset >= 0); |
+ int cp_offset = variant->cp_offset() + elm.cp_offset; |
if (elm.type == TextElement::ATOM) { |
Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ int last_cp_offset = cp_offset + quarks.length(); |
if (compiler->ignore_case()) { |
EmitAtomNonLetters(macro_assembler, |
elm, |
quarks, |
- on_failure_->label(), |
- cp_offset); |
+ backtrack, |
+ cp_offset, |
+ checked_up_to < last_cp_offset); |
} else { |
macro_assembler->CheckCharacters(quarks, |
cp_offset, |
- on_failure_->label()); |
+ backtrack, |
+ checked_up_to < last_cp_offset); |
} |
- cp_offset += quarks.length(); |
+ if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
} else { |
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
- cp_offset++; |
} |
} |
// Second, handle case independent letter matches if any. |
if (compiler->ignore_case()) { |
- cp_offset = 0; |
- for (int i = 0; i < element_count; i++) { |
+ for (int i = element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
+ int cp_offset = variant->cp_offset() + elm.cp_offset; |
if (elm.type == TextElement::ATOM) { |
Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ int last_cp_offset = cp_offset + quarks.length(); |
EmitAtomLetters(macro_assembler, |
elm, |
quarks, |
- on_failure_->label(), |
- cp_offset); |
- cp_offset += quarks.length(); |
- } else { |
- cp_offset++; |
+ backtrack, |
+ cp_offset, |
+ checked_up_to < last_cp_offset); |
+ if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
} |
} |
} |
// If the fast character matches passed then do the character classes. |
- cp_offset = 0; |
- for (int i = 0; i < element_count; i++) { |
+ for (int i = element_count - 1; i >= 0; i--) { |
TextElement elm = elms_->at(i); |
+ int cp_offset = variant->cp_offset() + elm.cp_offset; |
if (elm.type == TextElement::CHAR_CLASS) { |
RegExpCharacterClass* cc = elm.data.u_char_class; |
- EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label()); |
- cp_offset++; |
- } else { |
- cp_offset += elm.data.u_atom->data().length(); |
+ EmitCharClass(macro_assembler, |
+ cc, |
+ cp_offset, |
+ backtrack, |
+ checked_up_to < cp_offset); |
+ if (cp_offset > checked_up_to) checked_up_to = cp_offset; |
} |
} |
- compiler->AddWork(on_failure_); |
- macro_assembler->AdvanceCurrentPosition(cp_offset); |
- return on_success()->GoTo(compiler); |
+ GenerationVariant new_variant(*variant); |
+ new_variant.set_cp_offset(checked_up_to + 1); |
+ RecursionCheck rc(compiler); |
+ return on_success()->Emit(compiler, &new_variant); |
} |
@@ -1419,141 +1825,257 @@ |
} |
-bool ChoiceNode::Emit(RegExpCompiler* compiler) { |
+int TextNode::GreedyLoopTextLength() { |
+ TextElement elm = elms_->at(elms_->length() - 1); |
+ if (elm.type == TextElement::CHAR_CLASS) { |
+ return elm.cp_offset + 1; |
+ } else { |
+ return elm.cp_offset + elm.data.u_atom->data().length(); |
+ } |
+} |
+ |
+ |
+// Finds the fixed match length of a sequence of nodes that goes from |
+// this alternative and back to this choice node. If there are variable |
+// length nodes or other complications in the way then return a sentinel |
+// value indicating that a greedy loop cannot be constructed. |
+int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) { |
+ int length = 0; |
+ RegExpNode* node = alternative->node(); |
+ // Later we will generate code for all these text nodes using recursion |
+ // so we have to limit the max number. |
+ int recursion_depth = 0; |
+ while (node != this) { |
+ if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { |
+ return kNodeIsTooComplexForGreedyLoops; |
+ } |
+ NodeInfo* info = node->info(); |
+ if (info->follows_word_interest || |
+ info->follows_newline_interest || |
+ info->follows_start_interest) { |
+ return kNodeIsTooComplexForGreedyLoops; |
+ } |
+ int node_length = node->GreedyLoopTextLength(); |
+ if (node_length == kNodeIsTooComplexForGreedyLoops) { |
+ return kNodeIsTooComplexForGreedyLoops; |
+ } |
+ length += node_length; |
+ SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); |
+ node = seq_node->on_success(); |
+ } |
+ return length; |
+} |
+ |
+ |
+bool LoopChoiceNode::Emit(RegExpCompiler* compiler, |
+ GenerationVariant* variant) { |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
+ if (variant->stop_node() == this) { |
+ int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); |
+ ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); |
+ // Update the counter-based backtracking info on the stack. This is an |
+ // optimization for greedy loops (see below). |
+ ASSERT(variant->cp_offset() == text_length); |
+ macro_assembler->AdvanceCurrentPosition(text_length); |
+ macro_assembler->GoTo(variant->loop_label()); |
+ return true; |
+ } |
+ ASSERT(variant->stop_node() == NULL); |
+ if (!variant->is_trivial()) { |
+ return variant->Flush(compiler, this); |
+ } |
+ return ChoiceNode::Emit(compiler, variant); |
+} |
+ |
+ |
+bool ChoiceNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
int choice_count = alternatives_->length(); |
- RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
- Bind(macro_assembler); |
+#ifdef DEBUG |
+ for (int i = 0; i < choice_count - 1; i++) { |
+ GuardedAlternative alternative = alternatives_->at(i); |
+ ZoneList<Guard*>* guards = alternative.guards(); |
+ int guard_count = (guards == NULL) ? 0 : guards->length(); |
+ for (int j = 0; j < guard_count; j++) { |
+ ASSERT(!variant->mentions_reg(guards->at(j)->reg())); |
+ } |
+ } |
+#endif |
+ |
+ LimitResult limit_result = LimitVersions(compiler, variant); |
+ if (limit_result == DONE) return true; |
+ if (limit_result == FAIL) return false; |
+ ASSERT(limit_result == CONTINUE); |
+ |
+ RecursionCheck rc(compiler); |
+ |
+ GenerationVariant* current_variant = variant; |
+ |
+ int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); |
+ bool greedy_loop = false; |
+ Label greedy_loop_label; |
+ GenerationVariant counter_backtrack_variant(&greedy_loop_label); |
+ if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { |
+ // Here we have special handling for greedy loops containing only text nodes |
+ // and other simple nodes. These are handled by pushing the current |
+ // position on the stack and then incrementing the current position each |
+ // time around the switch. On backtrack we decrement the current position |
+ // and check it against the pushed value. This avoids pushing backtrack |
+ // information for each iteration of the loop, which could take up a lot of |
+ // space. |
+ greedy_loop = true; |
+ ASSERT(variant->stop_node() == NULL); |
+ macro_assembler->PushCurrentPosition(); |
+ current_variant = &counter_backtrack_variant; |
+ Label greedy_match_failed; |
+ GenerationVariant greedy_match_variant(&greedy_match_failed); |
+ Label loop_label; |
+ macro_assembler->Bind(&loop_label); |
+ greedy_match_variant.set_stop_node(this); |
+ greedy_match_variant.set_loop_label(&loop_label); |
+ bool ok = alternatives_->at(0).node()->Emit(compiler, |
+ &greedy_match_variant); |
+ macro_assembler->Bind(&greedy_match_failed); |
+ if (!ok) { |
+ greedy_loop_label.Unuse(); |
+ return false; |
+ } |
+ } |
+ |
+ Label second_choice; // For use in greedy matches. |
+ macro_assembler->Bind(&second_choice); |
+ |
// For now we just call all choices one after the other. The idea ultimately |
// is to use the Dispatch table to try only the relevant ones. |
- for (int i = 0; i < choice_count - 1; i++) { |
+ for (int i = greedy_loop ? 1 : 0; i < choice_count - 1; i++) { |
GuardedAlternative alternative = alternatives_->at(i); |
Label after; |
- Label after_no_pop_cp; |
ZoneList<Guard*>* guards = alternative.guards(); |
- if (guards != NULL) { |
- int guard_count = guards->length(); |
- for (int j = 0; j < guard_count; j++) { |
- GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp); |
- } |
+ int guard_count = (guards == NULL) ? 0 : guards->length(); |
+ GenerationVariant new_variant(*current_variant); |
+ new_variant.set_backtrack(&after); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), &new_variant); |
} |
- macro_assembler->PushCurrentPosition(); |
- macro_assembler->PushBacktrack(&after); |
- if (!alternative.node()->GoTo(compiler)) { |
+ if (!alternative.node()->Emit(compiler, &new_variant)) { |
after.Unuse(); |
- after_no_pop_cp.Unuse(); |
return false; |
} |
macro_assembler->Bind(&after); |
- macro_assembler->PopCurrentPosition(); |
- macro_assembler->Bind(&after_no_pop_cp); |
} |
GuardedAlternative alternative = alternatives_->at(choice_count - 1); |
ZoneList<Guard*>* guards = alternative.guards(); |
- if (guards != NULL) { |
- int guard_count = guards->length(); |
- for (int j = 0; j < guard_count; j++) { |
- GenerateGuard(macro_assembler, guards->at(j), on_failure_->label()); |
- } |
+ int guard_count = (guards == NULL) ? 0 : guards->length(); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), current_variant); |
} |
- if (!on_failure_->IsBacktrack()) { |
- ASSERT_NOT_NULL(on_failure_ -> label()); |
- macro_assembler->PushBacktrack(on_failure_->label()); |
- compiler->AddWork(on_failure_); |
+ bool ok = alternative.node()->Emit(compiler, current_variant); |
+ if (!ok) return false; |
+ if (greedy_loop) { |
+ macro_assembler->Bind(&greedy_loop_label); |
+ // If we have unwound to the bottom then backtrack. |
+ macro_assembler->CheckGreedyLoop(variant->backtrack()); |
+ // Otherwise try the second priority at an earlier position. |
+ macro_assembler->AdvanceCurrentPosition(-text_length); |
+ macro_assembler->GoTo(&second_choice); |
} |
- if (!alternative.node()->GoTo(compiler)) { |
- return false; |
- } |
return true; |
} |
-bool ActionNode::Emit(RegExpCompiler* compiler) { |
+bool ActionNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
RegExpMacroAssembler* macro = compiler->macro_assembler(); |
- Bind(macro); |
+ LimitResult limit_result = LimitVersions(compiler, variant); |
+ if (limit_result == DONE) return true; |
+ if (limit_result == FAIL) return false; |
+ ASSERT(limit_result == CONTINUE); |
+ |
+ RecursionCheck rc(compiler); |
+ |
switch (type_) { |
- case STORE_REGISTER: |
- macro->SetRegister(data_.u_store_register.reg, |
- data_.u_store_register.value); |
- break; |
+ case STORE_POSITION: { |
+ GenerationVariant::DeferredCapture |
+ new_capture(data_.u_position_register.reg, variant); |
+ GenerationVariant new_variant = *variant; |
+ new_variant.add_action(&new_capture); |
+ return on_success()->Emit(compiler, &new_variant); |
+ } |
case INCREMENT_REGISTER: { |
- Label undo; |
- macro->PushBacktrack(&undo); |
- macro->AdvanceRegister(data_.u_increment_register.reg, 1); |
- bool ok = on_success()->GoTo(compiler); |
- if (!ok) { |
- undo.Unuse(); |
- return false; |
- } |
- macro->Bind(&undo); |
- macro->AdvanceRegister(data_.u_increment_register.reg, -1); |
- macro->Backtrack(); |
- break; |
+ GenerationVariant::DeferredIncrementRegister |
+ new_increment(data_.u_increment_register.reg); |
+ GenerationVariant new_variant = *variant; |
+ new_variant.add_action(&new_increment); |
+ return on_success()->Emit(compiler, &new_variant); |
} |
- case STORE_POSITION: { |
- Label undo; |
- macro->PushRegister(data_.u_position_register.reg); |
- macro->PushBacktrack(&undo); |
- macro->WriteCurrentPositionToRegister(data_.u_position_register.reg); |
- bool ok = on_success()->GoTo(compiler); |
- if (!ok) { |
- undo.Unuse(); |
- return false; |
- } |
- macro->Bind(&undo); |
- macro->PopRegister(data_.u_position_register.reg); |
- macro->Backtrack(); |
- break; |
+ case SET_REGISTER: { |
+ GenerationVariant::DeferredSetRegister |
+ new_set(data_.u_store_register.reg, data_.u_store_register.value); |
+ GenerationVariant new_variant = *variant; |
+ new_variant.add_action(&new_set); |
+ return on_success()->Emit(compiler, &new_variant); |
} |
- case RESTORE_POSITION: |
- macro->ReadCurrentPositionFromRegister( |
- data_.u_position_register.reg); |
- break; |
case BEGIN_SUBMATCH: |
+ if (!variant->is_trivial()) return variant->Flush(compiler, this); |
macro->WriteCurrentPositionToRegister( |
- data_.u_submatch.current_position_register); |
+ data_.u_submatch.current_position_register, 0); |
macro->WriteStackPointerToRegister( |
data_.u_submatch.stack_pointer_register); |
- break; |
- case ESCAPE_SUBMATCH: |
+ return on_success()->Emit(compiler, variant); |
+ case POSITIVE_SUBMATCH_SUCCESS: |
+ if (!variant->is_trivial()) return variant->Flush(compiler, this); |
+ // TODO(erikcorry): Implement support. |
+ if (info()->follows_word_interest || |
+ info()->follows_newline_interest || |
+ info()->follows_start_interest) { |
+ return false; |
+ } |
if (info()->at_end) { |
Label at_end; |
// Load current character jumps to the label if we are beyond the string |
// end. |
macro->LoadCurrentCharacter(0, &at_end); |
- macro->Backtrack(); |
+ macro->GoTo(variant->backtrack()); |
macro->Bind(&at_end); |
} |
- if (data_.u_submatch.current_position_register != -1) { |
- macro->ReadCurrentPositionFromRegister( |
- data_.u_submatch.current_position_register); |
- } |
+ macro->ReadCurrentPositionFromRegister( |
+ data_.u_submatch.current_position_register); |
macro->ReadStackPointerFromRegister( |
data_.u_submatch.stack_pointer_register); |
- break; |
+ return on_success()->Emit(compiler, variant); |
default: |
UNREACHABLE(); |
return false; |
} |
- return on_success()->GoTo(compiler); |
} |
-bool BackReferenceNode::Emit(RegExpCompiler* compiler) { |
+bool BackReferenceNode::Emit(RegExpCompiler* compiler, |
+ GenerationVariant* variant) { |
RegExpMacroAssembler* macro = compiler->macro_assembler(); |
- Bind(macro); |
+ if (!variant->is_trivial()) { |
+ return variant->Flush(compiler, this); |
+ } |
+ |
+ LimitResult limit_result = LimitVersions(compiler, variant); |
+ if (limit_result == DONE) return true; |
+ if (limit_result == FAIL) return false; |
+ ASSERT(limit_result == CONTINUE); |
+ |
+ RecursionCheck rc(compiler); |
+ |
ASSERT_EQ(start_reg_ + 1, end_reg_); |
if (info()->at_end) { |
// If we are constrained to match at the end of the input then succeed |
// iff the back reference is empty. |
- macro->CheckNotRegistersEqual(start_reg_, end_reg_, on_failure_->label()); |
+ macro->CheckNotRegistersEqual(start_reg_, end_reg_, variant->backtrack()); |
} else { |
if (compiler->ignore_case()) { |
- macro->CheckNotBackReferenceIgnoreCase(start_reg_, on_failure_->label()); |
+ macro->CheckNotBackReferenceIgnoreCase(start_reg_, variant->backtrack()); |
} else { |
- macro->CheckNotBackReference(start_reg_, on_failure_->label()); |
+ macro->CheckNotBackReference(start_reg_, variant->backtrack()); |
} |
} |
- return on_success()->GoTo(compiler); |
+ return on_success()->Emit(compiler, variant); |
} |
@@ -1571,9 +2093,9 @@ |
stream_(&alloc_) { } |
void PrintNode(const char* label, RegExpNode* node); |
void Visit(RegExpNode* node); |
- void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure); |
void PrintAttributes(RegExpNode* from); |
StringStream* stream() { return &stream_; } |
+ void PrintOnFailure(RegExpNode* from, RegExpNode* to); |
#define DECLARE_VISIT(Type) \ |
virtual void Visit##Type(Type##Node* that); |
FOR_EACH_NODE_TYPE(DECLARE_VISIT) |
@@ -1615,7 +2137,6 @@ |
void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { |
- if (on_failure->IsBacktrack()) return; |
stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); |
Visit(on_failure); |
} |
@@ -1740,7 +2261,6 @@ |
PrintAttributes(that); |
TableEntryBodyPrinter body_printer(stream(), that); |
that->GetTable(ignore_case_)->ForEach(&body_printer); |
- PrintOnFailure(that, that->on_failure()); |
} else { |
stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); |
for (int i = 0; i < that->alternatives()->length(); i++) { |
@@ -1785,7 +2305,6 @@ |
PrintAttributes(that); |
stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
Visit(that->on_success()); |
- PrintOnFailure(that, that->on_failure()); |
} |
@@ -1797,7 +2316,6 @@ |
PrintAttributes(that); |
stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
Visit(that->on_success()); |
- PrintOnFailure(that, that->on_failure()); |
} |
@@ -1810,7 +2328,7 @@ |
void DotPrinter::VisitAction(ActionNode* that) { |
stream()->Add(" n%p [", that); |
switch (that->type_) { |
- case ActionNode::STORE_REGISTER: |
+ case ActionNode::SET_REGISTER: |
stream()->Add("label=\"$%i:=%i\", shape=octagon", |
that->data_.u_store_register.reg, |
that->data_.u_store_register.value); |
@@ -1823,22 +2341,19 @@ |
stream()->Add("label=\"$%i:=$pos\", shape=octagon", |
that->data_.u_position_register.reg); |
break; |
- case ActionNode::RESTORE_POSITION: |
- stream()->Add("label=\"$pos:=$%i\", shape=octagon", |
- that->data_.u_position_register.reg); |
- break; |
case ActionNode::BEGIN_SUBMATCH: |
stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", |
that->data_.u_submatch.current_position_register); |
break; |
- case ActionNode::ESCAPE_SUBMATCH: |
+ case ActionNode::POSITIVE_SUBMATCH_SUCCESS: |
stream()->Add("label=\"escape\", shape=septagon"); |
break; |
} |
stream()->Add("];\n"); |
PrintAttributes(that); |
- stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
- Visit(that->on_success()); |
+ RegExpNode* successor = that->on_success(); |
+ stream()->Add(" n%p -> n%p;\n", that, successor); |
+ Visit(successor); |
} |
@@ -1895,40 +2410,35 @@ |
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
elms->Add(TextElement::Atom(this)); |
- return new TextNode(elms, on_success, on_failure); |
+ return new TextNode(elms, on_success); |
} |
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
- return new TextNode(elements(), on_success, on_failure); |
+ RegExpNode* on_success) { |
+ return new TextNode(elements(), on_success); |
} |
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
elms->Add(TextElement::CharClass(this)); |
- return new TextNode(elms, on_success, on_failure); |
+ return new TextNode(elms, on_success); |
} |
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
int length = alternatives->length(); |
- ChoiceNode* result = new ChoiceNode(length, on_failure); |
+ ChoiceNode* result = new ChoiceNode(length); |
for (int i = 0; i < length; i++) { |
GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, |
- on_success, |
- on_failure)); |
+ on_success)); |
result->AddAlternative(alternative); |
} |
return result; |
@@ -1936,15 +2446,13 @@ |
RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
return ToNode(min(), |
max(), |
is_greedy(), |
body(), |
compiler, |
- on_success, |
- on_failure); |
+ on_success); |
} |
@@ -1953,8 +2461,7 @@ |
bool is_greedy, |
RegExpTree* body, |
RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
// x{f, t} becomes this: |
// |
// (r++)<-. |
@@ -1972,11 +2479,11 @@ |
bool has_max = max < RegExpQuantifier::kInfinity; |
bool needs_counter = has_min || has_max; |
int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1; |
- ChoiceNode* center = new ChoiceNode(2, on_failure); |
+ ChoiceNode* center = new LoopChoiceNode(2); |
RegExpNode* loop_return = needs_counter |
? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) |
: static_cast<RegExpNode*>(center); |
- RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure); |
+ RegExpNode* body_node = body->ToNode(compiler, loop_return); |
GuardedAlternative body_alt(body_node); |
if (has_max) { |
Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); |
@@ -1995,7 +2502,7 @@ |
center->AddAlternative(body_alt); |
} |
if (needs_counter) { |
- return ActionNode::StoreRegister(reg_ctr, 0, center); |
+ return ActionNode::SetRegister(reg_ctr, 0, center); |
} else { |
return center; |
} |
@@ -2003,8 +2510,7 @@ |
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
NodeInfo info; |
switch (type()) { |
case START_OF_LINE: |
@@ -2028,108 +2534,85 @@ |
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
return new BackReferenceNode(RegExpCapture::StartRegister(index()), |
RegExpCapture::EndRegister(index()), |
- on_success, |
- on_failure); |
+ on_success); |
} |
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
return on_success; |
} |
RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
int stack_pointer_register = compiler->AllocateRegister(); |
int position_register = compiler->AllocateRegister(); |
+ RegExpNode* success; |
if (is_positive()) { |
- // begin submatch scope |
- // $reg = $pos |
- // if [body] |
- // then |
- // $pos = $reg |
- // escape submatch scope (drop all backtracks created in scope) |
- // succeed |
- // else |
- // end submatch scope (nothing to clean up, just exit the scope) |
- // fail |
return ActionNode::BeginSubmatch( |
stack_pointer_register, |
position_register, |
body()->ToNode( |
compiler, |
- ActionNode::EscapeSubmatch( |
- stack_pointer_register, |
- true, // Also restore input position. |
- position_register, |
- on_success), |
- on_failure)); |
+ ActionNode::PositiveSubmatchSuccess(stack_pointer_register, |
+ position_register, |
+ on_success))); |
} else { |
- // begin submatch scope |
- // try |
- // first if (body) |
- // then |
- // escape submatch scope |
- // fail |
- // else |
- // backtrack |
- // second |
- // end submatch scope |
- // restore current position |
- // succeed |
- ChoiceNode* try_node = |
- new ChoiceNode(1, ActionNode::RestorePosition(position_register, |
- on_success)); |
- RegExpNode* body_node = body()->ToNode( |
- compiler, |
- ActionNode::EscapeSubmatch(stack_pointer_register, |
- false, // Don't also restore position |
- 0, // Unused arguments. |
- on_failure), |
- compiler->backtrack()); |
- GuardedAlternative body_alt(body_node); |
- try_node->AddAlternative(body_alt); |
+ // We use a ChoiceNode for a negative lookahead because it has most of |
+ // the characteristics we need. It has the body of the lookahead as its |
+ // first alternative and the expression after the lookahead of the second |
+ // alternative. If the first alternative succeeds then the |
+ // NegativeSubmatchSuccess will unwind the stack including everything the |
+ // choice node set up and backtrack. If the first alternative fails then |
+ // the second alternative is tried, which is exactly the desired result |
+ // for a negative lookahead. In the case where the dispatch table |
+ // determines that the first alternative cannot match we will save time |
+ // by not trying it. Things are not quite so well-optimized if the |
+ // dispatch table determines that the second alternative cannot match. |
+ // In this case we could optimize by immediately backtracking. |
+ ChoiceNode* choice_node = new ChoiceNode(2); |
+ GuardedAlternative body_alt( |
+ body()->ToNode( |
+ compiler, |
+ success = new NegativeSubmatchSuccess(stack_pointer_register, |
+ position_register))); |
+ choice_node->AddAlternative(body_alt); |
+ choice_node->AddAlternative(GuardedAlternative(on_success)); |
return ActionNode::BeginSubmatch(stack_pointer_register, |
position_register, |
- try_node); |
+ choice_node); |
} |
} |
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
- return ToNode(body(), index(), compiler, on_success, on_failure); |
+ RegExpNode* on_success) { |
+ return ToNode(body(), index(), compiler, on_success); |
} |
RegExpNode* RegExpCapture::ToNode(RegExpTree* body, |
int index, |
RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
int start_reg = RegExpCapture::StartRegister(index); |
int end_reg = RegExpCapture::EndRegister(index); |
RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success); |
- RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure); |
+ RegExpNode* body_node = body->ToNode(compiler, store_end); |
return ActionNode::StorePosition(start_reg, body_node); |
} |
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, |
- RegExpNode* on_success, |
- RegExpNode* on_failure) { |
+ RegExpNode* on_success) { |
ZoneList<RegExpTree*>* children = nodes(); |
RegExpNode* current = on_success; |
for (int i = children->length() - 1; i >= 0; i--) { |
- current = children->at(i)->ToNode(compiler, current, on_failure); |
+ current = children->at(i)->ToNode(compiler, current); |
} |
return current; |
} |
@@ -2400,9 +2883,7 @@ |
full_info.AddFromPreceding(info); |
bool cloned = false; |
ActionNode* action = EnsureSibling(this, &full_info, &cloned); |
- if (cloned && type_ != ESCAPE_SUBMATCH) { |
- action->set_on_success(action->on_success()->PropagateForward(info)); |
- } |
+ action->set_on_success(action->on_success()->PropagateForward(info)); |
return action; |
} |
@@ -2421,9 +2902,6 @@ |
alternative.set_node(alternative.node()->PropagateForward(info)); |
choice->alternatives()->Add(alternative); |
} |
- if (!choice->on_failure_->IsBacktrack()) { |
- choice->on_failure_ = choice->on_failure_->PropagateForward(info); |
- } |
} |
return choice; |
} |
@@ -2624,12 +3102,29 @@ |
} |
+void TextNode::CalculateOffsets() { |
+ int element_count = elements()->length(); |
+ // Set up the offsets of the elements relative to the start. This is a fixed |
+ // quantity since a TextNode can only contain fixed-width things. |
+ int cp_offset = 0; |
+ for (int i = 0; i < element_count; i++) { |
+ TextElement& elm = elements()->at(i); |
+ elm.cp_offset = cp_offset; |
+ if (elm.type == TextElement::ATOM) { |
+ cp_offset += elm.data.u_atom->data().length(); |
+ } else { |
+ cp_offset++; |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ } |
+ } |
+} |
+ |
+ |
void Analysis::VisitText(TextNode* that) { |
if (ignore_case_) { |
that->MakeCaseIndependent(); |
} |
EnsureAnalyzed(that->on_success()); |
- EnsureAnalyzed(that->on_failure()); |
NodeInfo* info = that->info(); |
NodeInfo* next_info = that->on_success()->info(); |
// If the following node is interested in what it follows then this |
@@ -2637,14 +3132,16 @@ |
info->determine_newline = next_info->follows_newline_interest; |
info->determine_word = next_info->follows_word_interest; |
info->determine_start = next_info->follows_start_interest; |
+ that->CalculateOffsets(); |
} |
void Analysis::VisitAction(ActionNode* that) { |
- EnsureAnalyzed(that->on_success()); |
+ RegExpNode* target = that->on_success(); |
+ EnsureAnalyzed(target); |
// If the next node is interested in what it follows then this node |
// has to be interested too so it can pass the information on. |
- that->info()->AddFromFollowing(that->on_success()->info()); |
+ that->info()->AddFromFollowing(target->info()); |
} |
@@ -2657,13 +3154,11 @@ |
// this node also, so it can pass it on. |
info->AddFromFollowing(node->info()); |
} |
- EnsureAnalyzed(that->on_failure()); |
} |
void Analysis::VisitBackReference(BackReferenceNode* that) { |
EnsureAnalyzed(that->on_success()); |
- EnsureAnalyzed(that->on_failure()); |
} |
@@ -2746,7 +3241,7 @@ |
} else { |
// If this character class contains both word and non-word |
// characters we need to split it into two. |
- ChoiceNode* result = new ChoiceNode(2, on_failure()); |
+ ChoiceNode* result = new ChoiceNode(2); |
// Welcome to the family, son! |
result->set_siblings(this->siblings()); |
*result->info() = *this->info(); |
@@ -2754,16 +3249,14 @@ |
result->info()->AddAssumptions(info); |
RegExpNode* word_node |
= new TextNode(new RegExpCharacterClass(word, false), |
- on_success(), |
- on_failure()); |
+ on_success()); |
word_node->info()->determine_word = true; |
word_node->info()->does_determine_word = true; |
word_node->info()->is_word = NodeInfo::TRUE; |
result->alternatives()->Add(GuardedAlternative(word_node)); |
RegExpNode* non_word_node |
= new TextNode(new RegExpCharacterClass(non_word, false), |
- on_success(), |
- on_failure()); |
+ on_success()); |
non_word_node->info()->determine_word = true; |
non_word_node->info()->does_determine_word = true; |
non_word_node->info()->is_word = NodeInfo::FALSE; |
@@ -2974,21 +3467,22 @@ |
void DispatchTableConstructor::VisitAction(ActionNode* that) { |
- that->on_success()->Accept(this); |
+ RegExpNode* target = that->on_success(); |
+ target->Accept(this); |
} |
Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, |
RegExpNode** node_return, |
bool ignore_case, |
- bool is_multiline) { |
+ bool is_multiline, |
+ Handle<String> pattern) { |
RegExpCompiler compiler(input->capture_count, ignore_case); |
// Wrap the body of the regexp in capture #0. |
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, |
0, |
&compiler, |
- compiler.accept(), |
- compiler.backtrack()); |
+ compiler.accept()); |
// Add a .*? at the beginning, outside the body capture. |
// Note: We could choose to not add this if the regexp is anchored at |
// the start of the input but I'm not sure how best to do that and |
@@ -2999,8 +3493,7 @@ |
false, |
new RegExpCharacterClass('*'), |
&compiler, |
- captured_body, |
- compiler.backtrack()); |
+ captured_body); |
if (node_return != NULL) *node_return = node; |
Analysis analysis(ignore_case); |
analysis.EnsureAnalyzed(node); |
@@ -3024,14 +3517,16 @@ |
(input->capture_count + 1) * 2); |
return compiler.Assemble(¯o_assembler, |
node, |
- input->capture_count); |
+ input->capture_count, |
+ pattern); |
#endif |
} |
EmbeddedVector<byte, 1024> codes; |
RegExpMacroAssemblerIrregexp macro_assembler(codes); |
return compiler.Assemble(¯o_assembler, |
node, |
- input->capture_count); |
+ input->capture_count, |
+ pattern); |
} |