Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 927) |
| +++ src/jsregexp.cc (working copy) |
| @@ -242,7 +242,8 @@ |
| RegExpEngine::Compile(&parse_result, |
| &node, |
| flags.is_ignore_case(), |
| - flags.is_multiline()); |
| + flags.is_multiline(), |
| + pattern); |
| if (irregexp_data.is_null()) { |
| if (FLAG_disable_jscre) { |
| UNIMPLEMENTED(); |
| @@ -858,8 +859,156 @@ |
| // ------------------------------------------------------------------- |
| // Implmentation of the Irregexp regular expression engine. |
| +// |
| +// The Irregexp regular expression engine is intended to be a complete |
| +// implementation of ECMAScript regular expressions. It generates either |
| +// bytecodes or native code. |
| +// The Irregexp regexp engine is structured in three steps. |
| +// 1) The parser generates an abstract syntax tree. See ast.cc. |
| +// 2) From the AST a node network is created. The nodes are all |
| +// subclasses of RegExpNode. The nodes represent states when |
| +// executing a regular expression. Several optimizations are |
| +// performed on the node network. |
| +// 3) From the nodes we generate either byte codes or native code |
| +// that can actually execute the regular expression (perform |
| +// the search). The code generation step is described in more |
| +// detail below. |
| +// Code generation. |
| +// |
| +// The nodes are divided into four main categories. |
| +// * Choice nodes |
| +// These represent places where the regular expression can |
| +// match in more than one way. For example on entry to an |
| +// alternation (foo|bar) or a repetition (*, +, ? or {}). |
| +// * Action nodes |
| +// These represent places where some action should be |
| +// performed. Examples include recording the current position |
| +// in the input string to a register (in order to implement |
| +// captures) or other actions on register for example in order |
| +// to implement the counters needed for {} repetitions. |
| +// * Matching nodes |
| +// These attempt to match some element part of the input string. |
| +// Examples of elements include character classes, plain strings |
| +// or back references. |
| +// * End nodes |
| +// These are used to implement the actions required on finding |
| +// a successful match or failing to find a match. |
| +// |
| +// The code generated (whether as byte codes or native code) maintains |
| +// some state as it runs. This consists of the following elements: |
| +// |
| +// * The capture registers. Used for string captures. |
| +// * Other registers. Used for counters etc. |
| +// * The current position. |
| +// * The stack of backtracking information. Used when a matching node |
| +// fails to find a match and needs to try an alternative. |
| +// |
| +// Conceptual regular expression execution model: |
| +// |
| +// There is a simple conceptual model of regular expression execution |
| +// which will be presented first. The actual code generated is a more |
| +// efficient simulation of the simple conceptual model: |
| +// |
| +// * Choice nodes are implemented as follows: |
| +// For each choice except the last { |
| +// push current position |
| +// push backtrack code location |
| +// <generate code to test for choice> |
| +// backtrack code location: |
| +// pop current position |
| +// } |
| +// <generate code to test for last choice> |
| +// |
| +// * Actions nodes are generated as follows |
| +// <push affected registers on backtrack stack> |
| +// <generate code to perform action> |
| +// push backtrack code location |
| +// <generate code to test for following nodes> |
| +// backtrack code location: |
| +// <pop affected registers to restore their state> |
| +// <pop backtrack location from stack and go to it> |
| +// |
| +// * Matching nodes are generated as follows: |
| +// if input string matches at current position |
| +// update current position |
| +// <generate code to test for following nodes> |
| +// else |
| +// <pop backtrack location from stack and go to it> |
| +// |
| +// Thus it can be seen that the current position is saved and restored |
| +// by the choice nodes, whereas the registers are saved and restored by |
| +// by the action nodes that manipulate them. |
| +// |
| +// The other interesting aspect of this model is that nodes are generated |
| +// at the point where they are needed by a recursive call to Emit(). If |
| +// the node has already been code generated then the Emit() call will |
| +// generate a jump to the previously generated code instead. In order to |
| +// limit recursion it is possible for the Emit() function to put the node |
| +// on a work list for later generation and instead generate a jump. The |
| +// destination of the jump is resolved later when the code is generated. |
| +// |
| +// Actual regular expression code generation. |
| +// |
| +// Code generation is actually more complicated than the above. In order |
| +// to improve the efficiency of the generated code some optimizations are |
| +// performed |
| +// |
| +// * Choice nodes have 1-character lookahead. |
| +// A choice node looks at the following character and eliminates some of |
| +// the choices immediately based on that character. This is not yet |
| +// implemented. |
| +// * Simple greedy loops store reduced backtracking information. |
| +// A quantifier like /.*foo/m will greedily match the whole input. It will |
| +// then need to backtrack to a point where it can match "foo". The naive |
| +// implementation of this would push each character position onto the |
| +// backtracking stack, then pop them off one by one. This would use space |
| +// proportional to the length of the input string. However since the "." |
| +// can only match in one way and always has a constant length (in this case |
| +// of 1) it suffices to store the current position on the top of the stack |
| +// once. Matching now becomes merely incrementing the current position and |
| +// backtracking becomes decrementing the current position and checking the |
| +// result against the stored current position. This is faster and saves |
| +// space. |
| +// * The current state is virtualized. |
| +// This is used to defer expensive operations until it is clear that they |
| +// are needed and to generate code for a node more than once, allowing |
| +// specialized an efficient versions of the code to be created. This is |
| +// explained in the section below. |
| +// |
| +// Execution state virtualization. |
| +// |
| +// Instead of emitting code, nodes that manipulate the state can record their |
| +// manipulation in an object called the GenerationVariant. The |
| +// GenerationVariant object can record a current position offset, an |
| +// optional backtrack code location on the top of the virtualized backtrack |
| +// stack and some register changes. When a node is to be emitted it can flush |
| +// the GenerationVariant or update it. Flushing the GenerationVariant |
| +// will emit code to bring the actual state into line with the virtual state. |
| +// Avoiding flushing the state can postpone some work (eg updates of capture |
| +// registers). Postponing work can save time when executing the regular |
| +// expression since it may be found that the work never has to be done as a |
| +// failure to match can occur. In addition it is much faster to jump to a |
| +// known backtrack code location than it is to pop an unknown backtrack |
| +// location from the stack and jump there. |
| +// |
| +// The virtual state found in the GenerationVariant affects code generation. |
| +// For example the virtual state contains the difference between the actual |
| +// current position and the virtual current position, and matching code needs |
| +// to use this offset to attempt a match in the correct location of the input |
| +// string. Therefore code generated for a non-trivial GenerationVariant is |
| +// specialized to that GenerationVariant. The code generator therefore |
| +// has the ability to generate code for each node several times. In order to |
| +// limit the size of the generated code there is an arbitrary limit on how |
| +// many specialized sets of code may be generated for a given node. If the |
| +// limit is reached, the GenerationVariant is flushed and a generic version of |
| +// the code for a node is emitted. This is subsequently used for that node. |
| +// The code emitted for non-generic GenerationVariants is not recorded in the |
| +// node and so it cannot currently be reused in the event that code generation |
| +// is requested for an identical GenerationVariant. |
|
Christian Plesner Hansen
2008/12/08 08:54:19
Most excellent!
|
| + |
| + |
| void RegExpTree::AppendToText(RegExpText* text) { |
| UNREACHABLE(); |
| } |
| @@ -914,7 +1063,8 @@ |
| Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler, |
| RegExpNode* start, |
| - int capture_count); |
| + int capture_count, |
| + Handle<String> pattern); |
| inline void AddWork(RegExpNode* node) { work_list_->Add(node); } |
| @@ -924,7 +1074,6 @@ |
| RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } |
| EndNode* accept() { return accept_; } |
| - EndNode* backtrack() { return backtrack_; } |
| static const int kMaxRecursion = 100; |
| inline int recursion_depth() { return recursion_depth_; } |
| @@ -935,7 +1084,6 @@ |
| private: |
| EndNode* accept_; |
| - EndNode* backtrack_; |
| int next_register_; |
| List<RegExpNode*>* work_list_; |
| int recursion_depth_; |
| @@ -944,6 +1092,17 @@ |
| }; |
| +class RecursionCheck { |
| + public: |
| + explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
| + compiler->IncrementRecursionDepth(); |
| + } |
| + ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
| + private: |
| + RegExpCompiler* compiler_; |
| +}; |
| + |
| + |
| // Attempts to compile the regexp using an Irregexp code generator. Returns |
| // a fixed array or a null handle depending on whether it succeeded. |
| RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) |
| @@ -952,14 +1111,14 @@ |
| recursion_depth_(0), |
| ignore_case_(ignore_case) { |
| accept_ = new EndNode(EndNode::ACCEPT); |
| - backtrack_ = new EndNode(EndNode::BACKTRACK); |
| } |
| Handle<FixedArray> RegExpCompiler::Assemble( |
| RegExpMacroAssembler* macro_assembler, |
| RegExpNode* start, |
| - int capture_count) { |
| + int capture_count, |
| + Handle<String> pattern) { |
| #ifdef DEBUG |
| if (FLAG_trace_regexp_assembler) |
| macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); |
| @@ -969,19 +1128,19 @@ |
| List <RegExpNode*> work_list(0); |
| work_list_ = &work_list; |
| Label fail; |
| - macro_assembler_->PushBacktrack(&fail); |
| - if (!start->GoTo(this)) { |
| + macro_assembler->PushBacktrack(&fail); |
| + GenerationVariant generic_variant; |
| + if (!start->Emit(this, &generic_variant)) { |
| fail.Unuse(); |
| return Handle<FixedArray>::null(); |
| } |
| + macro_assembler_->Bind(&fail); |
| + macro_assembler_->Fail(); |
| while (!work_list.is_empty()) { |
| - if (!work_list.RemoveLast()->GoTo(this)) { |
| - fail.Unuse(); |
| + if (!work_list.RemoveLast()->Emit(this, &generic_variant)) { |
| return Handle<FixedArray>::null(); |
| } |
| } |
| - macro_assembler_->Bind(&fail); |
| - macro_assembler_->Fail(); |
| Handle<FixedArray> array = |
| Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength); |
| array->set(RegExpImpl::kIrregexpImplementationIndex, |
| @@ -990,7 +1149,7 @@ |
| Smi::FromInt(next_register_)); |
| array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex, |
| Smi::FromInt(capture_count)); |
| - Handle<Object> code = macro_assembler_->GetCode(); |
| + Handle<Object> code = macro_assembler_->GetCode(pattern); |
| array->set(RegExpImpl::kIrregexpCodeIndex, *code); |
| work_list_ = NULL; |
| #ifdef DEBUG |
| @@ -1002,71 +1161,217 @@ |
| } |
| -bool RegExpNode::GoTo(RegExpCompiler* compiler) { |
| - // TODO(erikcorry): Implement support. |
| - if (info_.follows_word_interest || |
| - info_.follows_newline_interest || |
| - info_.follows_start_interest) { |
| - return false; |
| +bool GenerationVariant::mentions_reg(int reg) { |
| + for (DeferredAction* action = actions_; |
| + action != NULL; |
| + action = action->next()) { |
| + if (reg == action->reg()) return true; |
| } |
| - if (label_.is_bound()) { |
| - compiler->macro_assembler()->GoTo(&label_); |
| - return true; |
| - } else { |
| - if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) { |
| - compiler->macro_assembler()->GoTo(&label_); |
| - compiler->AddWork(this); |
| - return true; |
| + return false; |
| +} |
| + |
| + |
| +int GenerationVariant::FindAffectedRegisters(OutSet* affected_registers) { |
| + int max_register = -1; |
| + for (DeferredAction* action = actions_; |
| + action != NULL; |
| + action = action->next()) { |
| + affected_registers->Set(action->reg()); |
| + if (action->reg() > max_register) max_register = action->reg(); |
| + } |
| + return max_register; |
| +} |
| + |
| + |
| +void GenerationVariant::PushAffectedRegisters(RegExpMacroAssembler* macro, |
| + int max_register, |
| + OutSet& affected_registers) { |
| + for (int reg = 0; reg <= max_register; reg++) { |
| + if (affected_registers.Get(reg)) macro->PushRegister(reg); |
| + } |
| +} |
| + |
| + |
| +void GenerationVariant::RestoreAffectedRegisters(RegExpMacroAssembler* macro, |
| + int max_register, |
| + OutSet& affected_registers) { |
| + for (int reg = max_register; reg >= 0; reg--) { |
| + if (affected_registers.Get(reg)) macro->PopRegister(reg); |
| + } |
| +} |
| + |
| + |
| +void GenerationVariant::PerformDeferredActions(RegExpMacroAssembler* macro, |
| + int max_register, |
|
Christian Plesner Hansen
2008/12/08 08:54:19
Indentation off.
|
| + OutSet& affected_registers) { |
| + for (int reg = 0; reg <= max_register; reg++) { |
| + if (!affected_registers.Get(reg)) { |
| + continue; |
| + } |
| + int value = 0; |
| + bool absolute = false; |
| + int store_position = -1; |
| + // This is a little tricky because we are scanning the actions in reverse |
| + // historical order (newest first). |
| + for (DeferredAction* action = actions_; |
| + action != NULL; |
| + action = action->next()) { |
| + if (action->reg() == reg) { |
| + switch (action->type()) { |
| + case ActionNode::SET_REGISTER: { |
| + GenerationVariant::DeferredSetRegister* psr = |
| + static_cast<GenerationVariant::DeferredSetRegister*>(action); |
| + value += psr->value(); |
| + absolute = true; |
| + ASSERT_EQ(store_position, -1); |
| + break; |
| + } |
| + case ActionNode::INCREMENT_REGISTER: |
| + if (!absolute) { |
| + value++; |
| + } |
| + ASSERT_EQ(store_position, -1); |
| + break; |
| + case ActionNode::STORE_POSITION: { |
| + GenerationVariant::DeferredCapture* pc = |
| + static_cast<GenerationVariant::DeferredCapture*>(action); |
| + if (store_position == -1) { |
| + store_position = pc->cp_offset(); |
| + } |
| + ASSERT(!absolute); |
| + ASSERT_EQ(value, 0); |
| + break; |
| + } |
| + default: |
| + UNREACHABLE(); |
| + break; |
| + } |
| + } |
| + } |
| + if (store_position != -1) { |
| + macro->WriteCurrentPositionToRegister(reg, store_position); |
| } else { |
| - compiler->IncrementRecursionDepth(); |
| - bool how_it_went = Emit(compiler); |
| - compiler->DecrementRecursionDepth(); |
| - return how_it_went; |
| + if (absolute) { |
| + macro->SetRegister(reg, value); |
| + } else { |
| + if (value != 0) { |
| + macro->AdvanceRegister(reg, value); |
| + } |
| + } |
| } |
| } |
| } |
| -// EndNodes are special. Because they can be very common and they are very |
| -// short we normally inline them. That is, if we are asked to emit a GoTo |
| -// we just emit the entire node. Since they don't have successors this |
| -// works. |
| -bool EndNode::GoTo(RegExpCompiler* compiler) { |
| - if (info()->follows_word_interest || |
| - info()->follows_newline_interest || |
| - info()->follows_start_interest) { |
| - return false; |
| +// This is called as we come into a loop choice node and some other tricky |
| +// nodes. It normalises the state of the code generator to ensure we can |
| +// generate generic code. |
| +bool GenerationVariant::Flush(RegExpCompiler* compiler, RegExpNode* successor) { |
| + RegExpMacroAssembler* macro = compiler->macro_assembler(); |
| + |
| + ASSERT(actions_ != NULL || cp_offset_ != 0 || backtrack() != NULL); |
| + |
| + if (actions_ == NULL && backtrack() == NULL) { |
| + // Here we just have some deferred cp advances to fix and we are back to |
| + // a normal situation. |
| + macro->AdvanceCurrentPosition(cp_offset_); |
| + // Create a new trivial state and generate the node with that. |
| + GenerationVariant new_state; |
| + return successor->Emit(compiler, &new_state); |
| } |
| - return Emit(compiler); |
| + |
| + // Generate deferred actions here along with code to undo them again. |
| + OutSet affected_registers; |
| + int max_register = FindAffectedRegisters(&affected_registers); |
| + PushAffectedRegisters(macro, max_register, affected_registers); |
| + PerformDeferredActions(macro, max_register, affected_registers); |
| + if (backtrack() != NULL) { |
| + // Here we have a concrete backtrack location. These are set up by choice |
| + // nodes and so they indicate that we have a deferred save of the current |
| + // position which we may need to emit here. |
| + macro->PushCurrentPosition(); |
| + } |
| + if (cp_offset_ != 0) { |
| + macro->AdvanceCurrentPosition(cp_offset_); |
| + } |
| + |
| + // Create a new trivial state and generate the node with that. |
|
Christian Plesner Hansen
2008/12/08 08:54:19
trivial -> empty?
|
| + Label undo; |
| + macro->PushBacktrack(&undo); |
| + GenerationVariant new_state; |
| + bool ok = successor->Emit(compiler, &new_state); |
| + |
| + // On backtrack we need to restore state. |
| + macro->Bind(&undo); |
| + if (!ok) return false; |
| + if (backtrack() != NULL) { |
| + macro->PopCurrentPosition(); |
| + } |
| + RestoreAffectedRegisters(macro, max_register, affected_registers); |
| + if (backtrack() == NULL) { |
| + macro->Backtrack(); |
| + } else { |
| + macro->GoTo(backtrack()); |
| + } |
| + |
| + return true; |
| } |
| -Label* RegExpNode::label() { |
| - return &label_; |
| +void EndNode::EmitInfoChecks(RegExpMacroAssembler* macro, |
| + GenerationVariant* variant) { |
| + if (info()->at_end) { |
| + Label succeed; |
| + // LoadCurrentCharacter will go to the label if we are at the end of the |
| + // input string. |
| + macro->LoadCurrentCharacter(0, &succeed); |
| + macro->GoTo(variant->backtrack()); |
| + macro->Bind(&succeed); |
| + } |
| } |
| -bool EndNode::Emit(RegExpCompiler* compiler) { |
| +bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, |
| + GenerationVariant* variant) { |
| + if (!variant->is_trivial()) { |
| + return variant->Flush(compiler, this); |
| + } |
| RegExpMacroAssembler* macro = compiler->macro_assembler(); |
| + if (!label()->is_bound()) { |
| + macro->Bind(label()); |
| + } |
| + EmitInfoChecks(macro, variant); |
| + macro->ReadCurrentPositionFromRegister(current_position_register_); |
| + macro->ReadStackPointerFromRegister(stack_pointer_register_); |
| + // Now that we have unwound the stack we find at the top of the stack the |
| + // backtrack that the BeginSubmatch node got. |
| + macro->Backtrack(); |
| + return true; |
| +} |
| + |
| + |
| +bool EndNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
| + if (!variant->is_trivial()) { |
| + return variant->Flush(compiler, this); |
| + } |
| + RegExpMacroAssembler* macro = compiler->macro_assembler(); |
| + if (!label()->is_bound()) { |
| + macro->Bind(label()); |
| + } |
| switch (action_) { |
| case ACCEPT: |
| - if (!label()->is_bound()) Bind(macro); |
| - if (info()->at_end) { |
| - Label succeed; |
| - // LoadCurrentCharacter will go to the label if we are at the end of the |
| - // input string. |
| - macro->LoadCurrentCharacter(0, &succeed); |
| - macro->Backtrack(); |
| - macro->Bind(&succeed); |
| - } |
| + EmitInfoChecks(macro, variant); |
| macro->Succeed(); |
| return true; |
| case BACKTRACK: |
| - if (!label()->is_bound()) Bind(macro); |
| ASSERT(!info()->at_end); |
| - macro->Backtrack(); |
| + macro->GoTo(variant->backtrack()); |
| return true; |
| + case NEGATIVE_SUBMATCH_SUCCESS: |
| + // This case is handled in a different virtual method. |
| + UNREACHABLE(); |
| } |
| + UNIMPLEMENTED(); |
| return false; |
| } |
| @@ -1078,10 +1383,10 @@ |
| } |
| -ActionNode* ActionNode::StoreRegister(int reg, |
| - int val, |
| - RegExpNode* on_success) { |
| - ActionNode* result = new ActionNode(STORE_REGISTER, on_success); |
| +ActionNode* ActionNode::SetRegister(int reg, |
| + int val, |
| + RegExpNode* on_success) { |
| + ActionNode* result = new ActionNode(SET_REGISTER, on_success); |
| result->data_.u_store_register.reg = reg; |
| result->data_.u_store_register.value = val; |
| return result; |
| @@ -1102,13 +1407,6 @@ |
| } |
| -ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { |
| - ActionNode* result = new ActionNode(RESTORE_POSITION, on_success); |
| - result->data_.u_position_register.reg = reg; |
| - return result; |
| -} |
| - |
| - |
| ActionNode* ActionNode::BeginSubmatch(int stack_reg, |
| int position_reg, |
| RegExpNode* on_success) { |
| @@ -1119,17 +1417,12 @@ |
| } |
| -ActionNode* ActionNode::EscapeSubmatch(int stack_reg, |
| - bool restore_position, |
| - int position_reg, |
| - RegExpNode* on_success) { |
| - ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success); |
| +ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, |
| + int position_reg, |
| + RegExpNode* on_success) { |
| + ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); |
| result->data_.u_submatch.stack_pointer_register = stack_reg; |
| - if (restore_position) { |
| - result->data_.u_submatch.current_position_register = position_reg; |
| - } else { |
| - result->data_.u_submatch.current_position_register = -1; |
| - } |
| + result->data_.u_submatch.current_position_register = position_reg; |
| return result; |
| } |
| @@ -1147,14 +1440,20 @@ |
| void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, |
| - Guard* guard, |
| - Label* on_failure) { |
| + Guard* guard, |
|
Christian Plesner Hansen
2008/12/08 08:54:19
Indentation.
|
| + GenerationVariant* variant) { |
| switch (guard->op()) { |
| case Guard::LT: |
| - macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure); |
| + ASSERT(!variant->mentions_reg(guard->reg())); |
| + macro_assembler->IfRegisterGE(guard->reg(), |
| + guard->value(), |
| + variant->backtrack()); |
| break; |
| case Guard::GEQ: |
| - macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure); |
| + ASSERT(!variant->mentions_reg(guard->reg())); |
| + macro_assembler->IfRegisterLT(guard->reg(), |
| + guard->value(), |
| + variant->backtrack()); |
| break; |
| } |
| } |
| @@ -1169,13 +1468,22 @@ |
| TextElement elm, |
| Vector<const uc16> quarks, |
| Label* on_failure, |
| - int cp_offset) { |
| + int cp_offset, |
| + bool check_offset) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| + // It is vital that this loop is backwards due to the unchecked character |
| + // load below. |
| for (int i = quarks.length() - 1; i >= 0; i--) { |
| uc16 c = quarks[i]; |
| int length = uncanonicalize.get(c, '\0', chars); |
| if (length <= 1) { |
| - macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
| + if (check_offset && i == quarks.length() - 1) { |
| + macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
| + } else { |
| + // Here we don't need to check against the end of the input string |
| + // since this character lies before a character that matched. |
| + macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
| + } |
| macro_assembler->CheckNotCharacter(c, on_failure); |
| } |
| } |
| @@ -1216,13 +1524,22 @@ |
| TextElement elm, |
| Vector<const uc16> quarks, |
| Label* on_failure, |
| - int cp_offset) { |
| + int cp_offset, |
| + bool check_offset) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| + // It is vital that this loop is backwards due to the unchecked character |
| + // load below. |
| for (int i = quarks.length() - 1; i >= 0; i--) { |
| uc16 c = quarks[i]; |
| int length = uncanonicalize.get(c, '\0', chars); |
| if (length <= 1) continue; |
| - macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
| + if (check_offset && i == quarks.length() - 1) { |
| + macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
| + } else { |
| + // Here we don't need to check against the end of the input string |
| + // since this character lies before a character that matched. |
| + macro_assembler->LoadCurrentCharacterUnchecked(cp_offset + i); |
| + } |
| Label ok; |
| ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| switch (length) { |
| @@ -1259,10 +1576,8 @@ |
| static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| RegExpCharacterClass* cc, |
| int cp_offset, |
| - Label* on_failure) { |
| - macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
| - cp_offset++; |
| - |
| + Label* on_failure, |
| + bool check_offset) { |
| ZoneList<CharacterRange>* ranges = cc->ranges(); |
| Label success; |
| @@ -1279,6 +1594,27 @@ |
| return; |
| } |
| + if (range_count == 1 && |
| + !cc->is_negated() && |
| + ranges->at(0).from() == 0 && |
|
Christian Plesner Hansen
2008/12/08 08:54:19
Maybe add an IsEverything method on CharacterRange
|
| + ranges->at(0).to() == 0xffff) { |
| + // This is a common case hit by non-anchored expressions. |
| + // TODO(erikcorry): We should have a macro assembler instruction that just |
| + // checks for end of string without loading the character. |
| + if (check_offset) { |
| + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
| + } |
| + return; |
| + } |
| + |
| + if (check_offset) { |
| + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
| + } else { |
| + // Here we don't need to check against the end of the input string |
| + // since this character lies before a character that matched. |
| + macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); |
| + } |
| + |
| for (int i = 0; i < range_count - 1; i++) { |
| CharacterRange& range = ranges->at(i); |
| Label next_range; |
| @@ -1333,73 +1669,143 @@ |
| } |
| +RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, |
| + GenerationVariant* variant) { |
| + // TODO(erikcorry): Implement support. |
| + if (info_.follows_word_interest || |
| + info_.follows_newline_interest || |
| + info_.follows_start_interest) { |
| + return FAIL; |
| + } |
| -bool TextNode::Emit(RegExpCompiler* compiler) { |
| + // If we are generating a greedy loop then don't stop and don't reuse code. |
| + if (variant->stop_node() != NULL) { |
| + return CONTINUE; |
| + } |
| + |
| RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| - Bind(macro_assembler); |
| + if (variant->is_trivial()) { |
| + if (label_.is_bound()) { |
| + // We are being asked to generate a generic version, but that's already |
| + // been done so just go to it. |
| + macro_assembler->GoTo(&label_); |
| + return DONE; |
| + } |
| + if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { |
| + // To avoid too deep recursion we push the node to the work queue and just |
| + // generate a goto here. |
| + compiler->AddWork(this); |
| + macro_assembler->GoTo(&label_); |
| + return DONE; |
| + } |
| + // Generate generic version of the node and bind the label for later use. |
| + macro_assembler->Bind(&label_); |
| + return CONTINUE; |
| + } |
| + |
| + // We are being asked to make a non-generic version. Keep track of how many |
| + // non-generic versions we generate so as not to overdo it. |
| + variants_generated_++; |
| + if (variants_generated_ < kMaxVariantsGenerated && |
| + compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { |
| + return CONTINUE; |
| + } |
| + |
| + // If we get here there have been too many variants generated or recursion |
| + // is too deep. Time to switch to a generic version. The code for |
| + // generic versions above can handle deep recursion properly. |
| + bool ok = variant->Flush(compiler, this); |
| + return ok ? DONE : FAIL; |
| +} |
| + |
| + |
| +// This generates the code to match a text node. A text node can contain |
| +// straight character sequences (possibly to be matched in a case-independent |
| +// way) and character classes. In order to be most efficient we test for the |
| +// simple things first and then move on to the more complicated things. The |
| +// simplest thing is a non-letter or a letter if we are matching case. The |
| +// next-most simple thing is a case-independent letter. The least simple is |
| +// a character class. Another optimization is that we test the last one first. |
| +// If that succeeds we don't need to test for the end of the string when we |
| +// load other characters. |
| +bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
| + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| + Label *backtrack = variant->backtrack(); |
| + LimitResult limit_result = LimitVersions(compiler, variant); |
| + if (limit_result == FAIL) return false; |
| + if (limit_result == DONE) return true; |
| + ASSERT(limit_result == CONTINUE); |
| + |
| int element_count = elms_->length(); |
| ASSERT(element_count != 0); |
| - int cp_offset = 0; |
| if (info()->at_end) { |
| - macro_assembler->Backtrack(); |
| + macro_assembler->GoTo(backtrack); |
| return true; |
| } |
| // First, handle straight character matches. |
| - for (int i = 0; i < element_count; i++) { |
| + int checked_up_to = -1; |
| + for (int i = element_count - 1; i >= 0; i--) { |
| TextElement elm = elms_->at(i); |
| + ASSERT(elm.cp_offset >= 0); |
| + int cp_offset = variant->cp_offset() + elm.cp_offset; |
| if (elm.type == TextElement::ATOM) { |
| Vector<const uc16> quarks = elm.data.u_atom->data(); |
| + int last_cp_offset = cp_offset + quarks.length(); |
| if (compiler->ignore_case()) { |
| EmitAtomNonLetters(macro_assembler, |
| elm, |
| quarks, |
| - on_failure_->label(), |
| - cp_offset); |
| + backtrack, |
| + cp_offset, |
| + checked_up_to < last_cp_offset); |
| } else { |
| macro_assembler->CheckCharacters(quarks, |
| cp_offset, |
| - on_failure_->label()); |
| + backtrack, |
| + checked_up_to < last_cp_offset); |
| } |
| - cp_offset += quarks.length(); |
| + if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
| } else { |
| ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| - cp_offset++; |
| } |
| } |
| // Second, handle case independent letter matches if any. |
| if (compiler->ignore_case()) { |
| - cp_offset = 0; |
| - for (int i = 0; i < element_count; i++) { |
| + for (int i = element_count - 1; i >= 0; i--) { |
| TextElement elm = elms_->at(i); |
| + int cp_offset = variant->cp_offset() + elm.cp_offset; |
| if (elm.type == TextElement::ATOM) { |
| Vector<const uc16> quarks = elm.data.u_atom->data(); |
| + int last_cp_offset = cp_offset + quarks.length(); |
| EmitAtomLetters(macro_assembler, |
| elm, |
| quarks, |
| - on_failure_->label(), |
| - cp_offset); |
| - cp_offset += quarks.length(); |
| - } else { |
| - cp_offset++; |
| + backtrack, |
| + cp_offset, |
| + checked_up_to < last_cp_offset); |
| + if (last_cp_offset > checked_up_to) checked_up_to = last_cp_offset - 1; |
| } |
| } |
| } |
| // If the fast character matches passed then do the character classes. |
| - cp_offset = 0; |
| - for (int i = 0; i < element_count; i++) { |
| + for (int i = element_count - 1; i >= 0; i--) { |
| TextElement elm = elms_->at(i); |
| + int cp_offset = variant->cp_offset() + elm.cp_offset; |
| if (elm.type == TextElement::CHAR_CLASS) { |
| RegExpCharacterClass* cc = elm.data.u_char_class; |
| - EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label()); |
| - cp_offset++; |
| - } else { |
| - cp_offset += elm.data.u_atom->data().length(); |
| + EmitCharClass(macro_assembler, |
| + cc, |
| + cp_offset, |
| + backtrack, |
| + checked_up_to < cp_offset); |
| + if (cp_offset > checked_up_to) checked_up_to = cp_offset; |
| } |
| } |
| - compiler->AddWork(on_failure_); |
| - macro_assembler->AdvanceCurrentPosition(cp_offset); |
| - return on_success()->GoTo(compiler); |
| + GenerationVariant new_variant(*variant); |
| + new_variant.set_cp_offset(checked_up_to + 1); |
| + RecursionCheck rc(compiler); |
| + return on_success()->Emit(compiler, &new_variant); |
| } |
| @@ -1419,141 +1825,257 @@ |
| } |
| -bool ChoiceNode::Emit(RegExpCompiler* compiler) { |
| +int TextNode::GreedyLoopTextLength() { |
| + TextElement elm = elms_->at(elms_->length() - 1); |
| + if (elm.type == TextElement::CHAR_CLASS) { |
| + return elm.cp_offset + 1; |
| + } else { |
| + return elm.cp_offset + elm.data.u_atom->data().length(); |
| + } |
| +} |
| + |
| + |
| +// Finds the fixed match length of a sequence of nodes that goes from |
| +// this alternative and back to this choice node. If there are variable |
| +// length nodes or other complications in the way then return a sentinel |
| +// value indicating that a greedy loop cannot be constructed. |
| +int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) { |
| + int length = 0; |
| + RegExpNode* node = alternative->node(); |
| + // Later we will generate code for all these text nodes using recursion |
| + // so we have to limit the max number. |
| + int recursion_depth = 0; |
| + while (node != this) { |
| + if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { |
| + return kNodeIsTooComplexForGreedyLoops; |
| + } |
| + NodeInfo* info = node->info(); |
| + if (info->follows_word_interest || |
| + info->follows_newline_interest || |
| + info->follows_start_interest) { |
| + return kNodeIsTooComplexForGreedyLoops; |
| + } |
| + int node_length = node->GreedyLoopTextLength(); |
| + if (node_length == kNodeIsTooComplexForGreedyLoops) { |
| + return kNodeIsTooComplexForGreedyLoops; |
| + } |
| + length += node_length; |
| + SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); |
| + node = seq_node->on_success(); |
| + } |
| + return length; |
| +} |
| + |
| + |
| +bool LoopChoiceNode::Emit(RegExpCompiler* compiler, |
| + GenerationVariant* variant) { |
| + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| + if (variant->stop_node() == this) { |
| + int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); |
| + ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); |
| + // Update the counter-based backtracking info on the stack. This is an |
| + // optimization for greedy loops (see below). |
| + ASSERT(variant->cp_offset() == text_length); |
| + macro_assembler->AdvanceCurrentPosition(text_length); |
| + macro_assembler->GoTo(variant->loop_label()); |
| + return true; |
| + } |
| + ASSERT(variant->stop_node() == NULL); |
| + if (!variant->is_trivial()) { |
| + return variant->Flush(compiler, this); |
| + } |
| + return ChoiceNode::Emit(compiler, variant); |
| +} |
| + |
| + |
| +bool ChoiceNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
| + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| int choice_count = alternatives_->length(); |
| - RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| - Bind(macro_assembler); |
| +#ifdef DEBUG |
| + for (int i = 0; i < choice_count - 1; i++) { |
| + GuardedAlternative alternative = alternatives_->at(i); |
| + ZoneList<Guard*>* guards = alternative.guards(); |
| + int guard_count = (guards == NULL) ? 0 : guards->length(); |
| + for (int j = 0; j < guard_count; j++) { |
| + ASSERT(!variant->mentions_reg(guards->at(j)->reg())); |
| + } |
| + } |
| +#endif |
| + |
| + LimitResult limit_result = LimitVersions(compiler, variant); |
| + if (limit_result == DONE) return true; |
| + if (limit_result == FAIL) return false; |
| + ASSERT(limit_result == CONTINUE); |
| + |
| + RecursionCheck rc(compiler); |
| + |
| + GenerationVariant* current_variant = variant; |
| + |
| + int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); |
| + bool greedy_loop = false; |
| + Label greedy_loop_label; |
| + GenerationVariant counter_backtrack_variant(&greedy_loop_label); |
| + if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { |
| + // Here we have special handling for greedy loops containing only text nodes |
| + // and other simple nodes. These are handled by pushing the current |
| + // position on the stack and then incrementing the current position each |
| + // time around the switch. On backtrack we decrement the current position |
| + // and check it against the pushed value. This avoids pushing backtrack |
| + // information for each iteration of the loop, which could take up a lot of |
| + // space. |
| + greedy_loop = true; |
| + ASSERT(variant->stop_node() == NULL); |
| + macro_assembler->PushCurrentPosition(); |
| + current_variant = &counter_backtrack_variant; |
| + Label greedy_match_failed; |
| + GenerationVariant greedy_match_variant(&greedy_match_failed); |
| + Label loop_label; |
| + macro_assembler->Bind(&loop_label); |
| + greedy_match_variant.set_stop_node(this); |
| + greedy_match_variant.set_loop_label(&loop_label); |
| + bool ok = alternatives_->at(0).node()->Emit(compiler, |
| + &greedy_match_variant); |
| + macro_assembler->Bind(&greedy_match_failed); |
| + if (!ok) { |
| + greedy_loop_label.Unuse(); |
| + return false; |
| + } |
| + } |
| + |
| + Label second_choice; // For use in greedy matches. |
| + macro_assembler->Bind(&second_choice); |
| + |
| // For now we just call all choices one after the other. The idea ultimately |
| // is to use the Dispatch table to try only the relevant ones. |
| - for (int i = 0; i < choice_count - 1; i++) { |
| + for (int i = greedy_loop ? 1 : 0; i < choice_count - 1; i++) { |
| GuardedAlternative alternative = alternatives_->at(i); |
| Label after; |
| - Label after_no_pop_cp; |
| ZoneList<Guard*>* guards = alternative.guards(); |
| - if (guards != NULL) { |
| - int guard_count = guards->length(); |
| - for (int j = 0; j < guard_count; j++) { |
| - GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp); |
| - } |
| + int guard_count = (guards == NULL) ? 0 : guards->length(); |
| + GenerationVariant new_variant(*current_variant); |
| + new_variant.set_backtrack(&after); |
| + for (int j = 0; j < guard_count; j++) { |
| + GenerateGuard(macro_assembler, guards->at(j), &new_variant); |
| } |
| - macro_assembler->PushCurrentPosition(); |
| - macro_assembler->PushBacktrack(&after); |
| - if (!alternative.node()->GoTo(compiler)) { |
| + if (!alternative.node()->Emit(compiler, &new_variant)) { |
| after.Unuse(); |
| - after_no_pop_cp.Unuse(); |
| return false; |
| } |
| macro_assembler->Bind(&after); |
| - macro_assembler->PopCurrentPosition(); |
| - macro_assembler->Bind(&after_no_pop_cp); |
| } |
| GuardedAlternative alternative = alternatives_->at(choice_count - 1); |
| ZoneList<Guard*>* guards = alternative.guards(); |
| - if (guards != NULL) { |
| - int guard_count = guards->length(); |
| - for (int j = 0; j < guard_count; j++) { |
| - GenerateGuard(macro_assembler, guards->at(j), on_failure_->label()); |
| - } |
| + int guard_count = (guards == NULL) ? 0 : guards->length(); |
| + for (int j = 0; j < guard_count; j++) { |
| + GenerateGuard(macro_assembler, guards->at(j), current_variant); |
| } |
| - if (!on_failure_->IsBacktrack()) { |
| - ASSERT_NOT_NULL(on_failure_ -> label()); |
| - macro_assembler->PushBacktrack(on_failure_->label()); |
| - compiler->AddWork(on_failure_); |
| + bool ok = alternative.node()->Emit(compiler, current_variant); |
| + if (!ok) return false; |
| + if (greedy_loop) { |
| + macro_assembler->Bind(&greedy_loop_label); |
| + // If we have unwound to the bottom then backtrack. |
| + macro_assembler->CheckGreedyLoop(variant->backtrack()); |
| + // Otherwise try the second priority at an earlier position. |
| + macro_assembler->AdvanceCurrentPosition(-text_length); |
| + macro_assembler->GoTo(&second_choice); |
| } |
| - if (!alternative.node()->GoTo(compiler)) { |
| - return false; |
| - } |
| return true; |
| } |
| -bool ActionNode::Emit(RegExpCompiler* compiler) { |
| +bool ActionNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { |
| RegExpMacroAssembler* macro = compiler->macro_assembler(); |
| - Bind(macro); |
| + LimitResult limit_result = LimitVersions(compiler, variant); |
| + if (limit_result == DONE) return true; |
| + if (limit_result == FAIL) return false; |
| + ASSERT(limit_result == CONTINUE); |
| + |
| + RecursionCheck rc(compiler); |
| + |
| switch (type_) { |
| - case STORE_REGISTER: |
| - macro->SetRegister(data_.u_store_register.reg, |
| - data_.u_store_register.value); |
| - break; |
| + case STORE_POSITION: { |
| + GenerationVariant::DeferredCapture |
| + new_capture(data_.u_position_register.reg, variant); |
| + GenerationVariant new_variant = *variant; |
| + new_variant.add_action(&new_capture); |
| + return on_success()->Emit(compiler, &new_variant); |
| + } |
| case INCREMENT_REGISTER: { |
| - Label undo; |
| - macro->PushBacktrack(&undo); |
| - macro->AdvanceRegister(data_.u_increment_register.reg, 1); |
| - bool ok = on_success()->GoTo(compiler); |
| - if (!ok) { |
| - undo.Unuse(); |
| - return false; |
| - } |
| - macro->Bind(&undo); |
| - macro->AdvanceRegister(data_.u_increment_register.reg, -1); |
| - macro->Backtrack(); |
| - break; |
| + GenerationVariant::DeferredIncrementRegister |
| + new_increment(data_.u_increment_register.reg); |
| + GenerationVariant new_variant = *variant; |
| + new_variant.add_action(&new_increment); |
| + return on_success()->Emit(compiler, &new_variant); |
| } |
| - case STORE_POSITION: { |
| - Label undo; |
| - macro->PushRegister(data_.u_position_register.reg); |
| - macro->PushBacktrack(&undo); |
| - macro->WriteCurrentPositionToRegister(data_.u_position_register.reg); |
| - bool ok = on_success()->GoTo(compiler); |
| - if (!ok) { |
| - undo.Unuse(); |
| - return false; |
| - } |
| - macro->Bind(&undo); |
| - macro->PopRegister(data_.u_position_register.reg); |
| - macro->Backtrack(); |
| - break; |
| + case SET_REGISTER: { |
| + GenerationVariant::DeferredSetRegister |
| + new_set(data_.u_store_register.reg, data_.u_store_register.value); |
| + GenerationVariant new_variant = *variant; |
| + new_variant.add_action(&new_set); |
| + return on_success()->Emit(compiler, &new_variant); |
| } |
| - case RESTORE_POSITION: |
| - macro->ReadCurrentPositionFromRegister( |
| - data_.u_position_register.reg); |
| - break; |
| case BEGIN_SUBMATCH: |
| + if (!variant->is_trivial()) return variant->Flush(compiler, this); |
| macro->WriteCurrentPositionToRegister( |
| - data_.u_submatch.current_position_register); |
| + data_.u_submatch.current_position_register, 0); |
| macro->WriteStackPointerToRegister( |
| data_.u_submatch.stack_pointer_register); |
| - break; |
| - case ESCAPE_SUBMATCH: |
| + return on_success()->Emit(compiler, variant); |
| + case POSITIVE_SUBMATCH_SUCCESS: |
| + if (!variant->is_trivial()) return variant->Flush(compiler, this); |
| + // TODO(erikcorry): Implement support. |
| + if (info()->follows_word_interest || |
| + info()->follows_newline_interest || |
| + info()->follows_start_interest) { |
| + return false; |
| + } |
| if (info()->at_end) { |
| Label at_end; |
| // Load current character jumps to the label if we are beyond the string |
| // end. |
| macro->LoadCurrentCharacter(0, &at_end); |
| - macro->Backtrack(); |
| + macro->GoTo(variant->backtrack()); |
| macro->Bind(&at_end); |
| } |
| - if (data_.u_submatch.current_position_register != -1) { |
| - macro->ReadCurrentPositionFromRegister( |
| - data_.u_submatch.current_position_register); |
| - } |
| + macro->ReadCurrentPositionFromRegister( |
| + data_.u_submatch.current_position_register); |
| macro->ReadStackPointerFromRegister( |
| data_.u_submatch.stack_pointer_register); |
| - break; |
| + return on_success()->Emit(compiler, variant); |
| default: |
| UNREACHABLE(); |
| return false; |
| } |
| - return on_success()->GoTo(compiler); |
| } |
| -bool BackReferenceNode::Emit(RegExpCompiler* compiler) { |
| +bool BackReferenceNode::Emit(RegExpCompiler* compiler, |
| + GenerationVariant* variant) { |
| RegExpMacroAssembler* macro = compiler->macro_assembler(); |
| - Bind(macro); |
| + if (!variant->is_trivial()) { |
| + return variant->Flush(compiler, this); |
| + } |
| + |
| + LimitResult limit_result = LimitVersions(compiler, variant); |
| + if (limit_result == DONE) return true; |
| + if (limit_result == FAIL) return false; |
| + ASSERT(limit_result == CONTINUE); |
| + |
| + RecursionCheck rc(compiler); |
| + |
| ASSERT_EQ(start_reg_ + 1, end_reg_); |
| if (info()->at_end) { |
| // If we are constrained to match at the end of the input then succeed |
| // iff the back reference is empty. |
| - macro->CheckNotRegistersEqual(start_reg_, end_reg_, on_failure_->label()); |
| + macro->CheckNotRegistersEqual(start_reg_, end_reg_, variant->backtrack()); |
| } else { |
| if (compiler->ignore_case()) { |
| - macro->CheckNotBackReferenceIgnoreCase(start_reg_, on_failure_->label()); |
| + macro->CheckNotBackReferenceIgnoreCase(start_reg_, variant->backtrack()); |
| } else { |
| - macro->CheckNotBackReference(start_reg_, on_failure_->label()); |
| + macro->CheckNotBackReference(start_reg_, variant->backtrack()); |
| } |
| } |
| - return on_success()->GoTo(compiler); |
| + return on_success()->Emit(compiler, variant); |
| } |
| @@ -1571,9 +2093,9 @@ |
| stream_(&alloc_) { } |
| void PrintNode(const char* label, RegExpNode* node); |
| void Visit(RegExpNode* node); |
| - void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure); |
| void PrintAttributes(RegExpNode* from); |
| StringStream* stream() { return &stream_; } |
| + void PrintOnFailure(RegExpNode* from, RegExpNode* to); |
| #define DECLARE_VISIT(Type) \ |
| virtual void Visit##Type(Type##Node* that); |
| FOR_EACH_NODE_TYPE(DECLARE_VISIT) |
| @@ -1615,7 +2137,6 @@ |
| void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { |
| - if (on_failure->IsBacktrack()) return; |
| stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); |
| Visit(on_failure); |
| } |
| @@ -1740,7 +2261,6 @@ |
| PrintAttributes(that); |
| TableEntryBodyPrinter body_printer(stream(), that); |
| that->GetTable(ignore_case_)->ForEach(&body_printer); |
| - PrintOnFailure(that, that->on_failure()); |
| } else { |
| stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); |
| for (int i = 0; i < that->alternatives()->length(); i++) { |
| @@ -1785,7 +2305,6 @@ |
| PrintAttributes(that); |
| stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
| Visit(that->on_success()); |
| - PrintOnFailure(that, that->on_failure()); |
| } |
| @@ -1797,7 +2316,6 @@ |
| PrintAttributes(that); |
| stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
| Visit(that->on_success()); |
| - PrintOnFailure(that, that->on_failure()); |
| } |
| @@ -1810,7 +2328,7 @@ |
| void DotPrinter::VisitAction(ActionNode* that) { |
| stream()->Add(" n%p [", that); |
| switch (that->type_) { |
| - case ActionNode::STORE_REGISTER: |
| + case ActionNode::SET_REGISTER: |
| stream()->Add("label=\"$%i:=%i\", shape=octagon", |
| that->data_.u_store_register.reg, |
| that->data_.u_store_register.value); |
| @@ -1823,22 +2341,19 @@ |
| stream()->Add("label=\"$%i:=$pos\", shape=octagon", |
| that->data_.u_position_register.reg); |
| break; |
| - case ActionNode::RESTORE_POSITION: |
| - stream()->Add("label=\"$pos:=$%i\", shape=octagon", |
| - that->data_.u_position_register.reg); |
| - break; |
| case ActionNode::BEGIN_SUBMATCH: |
| stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", |
| that->data_.u_submatch.current_position_register); |
| break; |
| - case ActionNode::ESCAPE_SUBMATCH: |
| + case ActionNode::POSITIVE_SUBMATCH_SUCCESS: |
| stream()->Add("label=\"escape\", shape=septagon"); |
| break; |
| } |
| stream()->Add("];\n"); |
| PrintAttributes(that); |
| - stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
| - Visit(that->on_success()); |
| + RegExpNode* successor = that->on_success(); |
| + stream()->Add(" n%p -> n%p;\n", that, successor); |
| + Visit(successor); |
| } |
| @@ -1895,40 +2410,35 @@ |
| RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
| elms->Add(TextElement::Atom(this)); |
| - return new TextNode(elms, on_success, on_failure); |
| + return new TextNode(elms, on_success); |
| } |
| RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| - return new TextNode(elements(), on_success, on_failure); |
| + RegExpNode* on_success) { |
| + return new TextNode(elements(), on_success); |
| } |
| RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
| elms->Add(TextElement::CharClass(this)); |
| - return new TextNode(elms, on_success, on_failure); |
| + return new TextNode(elms, on_success); |
| } |
| RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
| int length = alternatives->length(); |
| - ChoiceNode* result = new ChoiceNode(length, on_failure); |
| + ChoiceNode* result = new ChoiceNode(length); |
| for (int i = 0; i < length; i++) { |
| GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, |
| - on_success, |
| - on_failure)); |
| + on_success)); |
| result->AddAlternative(alternative); |
| } |
| return result; |
| @@ -1936,15 +2446,13 @@ |
| RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| return ToNode(min(), |
| max(), |
| is_greedy(), |
| body(), |
| compiler, |
| - on_success, |
| - on_failure); |
| + on_success); |
| } |
| @@ -1953,8 +2461,7 @@ |
| bool is_greedy, |
| RegExpTree* body, |
| RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| // x{f, t} becomes this: |
| // |
| // (r++)<-. |
| @@ -1972,11 +2479,11 @@ |
| bool has_max = max < RegExpQuantifier::kInfinity; |
| bool needs_counter = has_min || has_max; |
| int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1; |
| - ChoiceNode* center = new ChoiceNode(2, on_failure); |
| + ChoiceNode* center = new LoopChoiceNode(2); |
| RegExpNode* loop_return = needs_counter |
| ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) |
| : static_cast<RegExpNode*>(center); |
| - RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure); |
| + RegExpNode* body_node = body->ToNode(compiler, loop_return); |
| GuardedAlternative body_alt(body_node); |
| if (has_max) { |
| Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); |
| @@ -1995,7 +2502,7 @@ |
| center->AddAlternative(body_alt); |
| } |
| if (needs_counter) { |
| - return ActionNode::StoreRegister(reg_ctr, 0, center); |
| + return ActionNode::SetRegister(reg_ctr, 0, center); |
| } else { |
| return center; |
| } |
| @@ -2003,8 +2510,7 @@ |
| RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| NodeInfo info; |
| switch (type()) { |
| case START_OF_LINE: |
| @@ -2028,108 +2534,85 @@ |
| RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| return new BackReferenceNode(RegExpCapture::StartRegister(index()), |
| RegExpCapture::EndRegister(index()), |
| - on_success, |
| - on_failure); |
| + on_success); |
| } |
| RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| return on_success; |
| } |
| RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| int stack_pointer_register = compiler->AllocateRegister(); |
| int position_register = compiler->AllocateRegister(); |
| + RegExpNode* success; |
| if (is_positive()) { |
| - // begin submatch scope |
| - // $reg = $pos |
| - // if [body] |
| - // then |
| - // $pos = $reg |
| - // escape submatch scope (drop all backtracks created in scope) |
| - // succeed |
| - // else |
| - // end submatch scope (nothing to clean up, just exit the scope) |
| - // fail |
| return ActionNode::BeginSubmatch( |
| stack_pointer_register, |
| position_register, |
| body()->ToNode( |
| compiler, |
| - ActionNode::EscapeSubmatch( |
| - stack_pointer_register, |
| - true, // Also restore input position. |
| - position_register, |
| - on_success), |
| - on_failure)); |
| + ActionNode::PositiveSubmatchSuccess(stack_pointer_register, |
| + position_register, |
| + on_success))); |
| } else { |
| - // begin submatch scope |
| - // try |
| - // first if (body) |
| - // then |
| - // escape submatch scope |
| - // fail |
| - // else |
| - // backtrack |
| - // second |
| - // end submatch scope |
| - // restore current position |
| - // succeed |
| - ChoiceNode* try_node = |
| - new ChoiceNode(1, ActionNode::RestorePosition(position_register, |
| - on_success)); |
| - RegExpNode* body_node = body()->ToNode( |
| - compiler, |
| - ActionNode::EscapeSubmatch(stack_pointer_register, |
| - false, // Don't also restore position |
| - 0, // Unused arguments. |
| - on_failure), |
| - compiler->backtrack()); |
| - GuardedAlternative body_alt(body_node); |
| - try_node->AddAlternative(body_alt); |
| + // We use a ChoiceNode for a negative lookahead because it has most of |
| + // the characteristics we need. It has the body of the lookahead as its |
| + // first alternative and the expression after the lookahead of the second |
| + // alternative. If the first alternative succeeds then the |
| + // NegativeSubmatchSuccess will unwind the stack including everything the |
| + // choice node set up and backtrack. If the first alternative fails then |
| + // the second alternative is tried, which is exactly the desired result |
| + // for a negative lookahead. In the case where the dispatch table |
| + // determines that the first alternative cannot match we will save time |
| + // by not trying it. Things are not quite so well-optimized if the |
| + // dispatch table determines that the second alternative cannot match. |
| + // In this case we could optimize by immediately backtracking. |
| + ChoiceNode* choice_node = new ChoiceNode(2); |
| + GuardedAlternative body_alt( |
| + body()->ToNode( |
| + compiler, |
| + success = new NegativeSubmatchSuccess(stack_pointer_register, |
| + position_register))); |
| + choice_node->AddAlternative(body_alt); |
| + choice_node->AddAlternative(GuardedAlternative(on_success)); |
| return ActionNode::BeginSubmatch(stack_pointer_register, |
| position_register, |
| - try_node); |
| + choice_node); |
| } |
| } |
| RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| - return ToNode(body(), index(), compiler, on_success, on_failure); |
| + RegExpNode* on_success) { |
| + return ToNode(body(), index(), compiler, on_success); |
| } |
| RegExpNode* RegExpCapture::ToNode(RegExpTree* body, |
| int index, |
| RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| int start_reg = RegExpCapture::StartRegister(index); |
| int end_reg = RegExpCapture::EndRegister(index); |
| RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success); |
| - RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure); |
| + RegExpNode* body_node = body->ToNode(compiler, store_end); |
| return ActionNode::StorePosition(start_reg, body_node); |
| } |
| RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, |
| - RegExpNode* on_success, |
| - RegExpNode* on_failure) { |
| + RegExpNode* on_success) { |
| ZoneList<RegExpTree*>* children = nodes(); |
| RegExpNode* current = on_success; |
| for (int i = children->length() - 1; i >= 0; i--) { |
| - current = children->at(i)->ToNode(compiler, current, on_failure); |
| + current = children->at(i)->ToNode(compiler, current); |
| } |
| return current; |
| } |
| @@ -2400,9 +2883,7 @@ |
| full_info.AddFromPreceding(info); |
| bool cloned = false; |
| ActionNode* action = EnsureSibling(this, &full_info, &cloned); |
| - if (cloned && type_ != ESCAPE_SUBMATCH) { |
| - action->set_on_success(action->on_success()->PropagateForward(info)); |
| - } |
| + action->set_on_success(action->on_success()->PropagateForward(info)); |
| return action; |
| } |
| @@ -2421,9 +2902,6 @@ |
| alternative.set_node(alternative.node()->PropagateForward(info)); |
| choice->alternatives()->Add(alternative); |
| } |
| - if (!choice->on_failure_->IsBacktrack()) { |
| - choice->on_failure_ = choice->on_failure_->PropagateForward(info); |
| - } |
| } |
| return choice; |
| } |
| @@ -2624,12 +3102,29 @@ |
| } |
| +void TextNode::CalculateOffsets() { |
| + int element_count = elements()->length(); |
| + // Set up the offsets of the elements relative to the start. This is a fixed |
| + // quantity since a TextNode can only contain fixed-width things. |
| + int cp_offset = 0; |
| + for (int i = 0; i < element_count; i++) { |
| + TextElement& elm = elements()->at(i); |
| + elm.cp_offset = cp_offset; |
| + if (elm.type == TextElement::ATOM) { |
| + cp_offset += elm.data.u_atom->data().length(); |
| + } else { |
| + cp_offset++; |
| + Vector<const uc16> quarks = elm.data.u_atom->data(); |
| + } |
| + } |
| +} |
| + |
| + |
| void Analysis::VisitText(TextNode* that) { |
| if (ignore_case_) { |
| that->MakeCaseIndependent(); |
| } |
| EnsureAnalyzed(that->on_success()); |
| - EnsureAnalyzed(that->on_failure()); |
| NodeInfo* info = that->info(); |
| NodeInfo* next_info = that->on_success()->info(); |
| // If the following node is interested in what it follows then this |
| @@ -2637,14 +3132,16 @@ |
| info->determine_newline = next_info->follows_newline_interest; |
| info->determine_word = next_info->follows_word_interest; |
| info->determine_start = next_info->follows_start_interest; |
| + that->CalculateOffsets(); |
| } |
| void Analysis::VisitAction(ActionNode* that) { |
| - EnsureAnalyzed(that->on_success()); |
| + RegExpNode* target = that->on_success(); |
| + EnsureAnalyzed(target); |
| // If the next node is interested in what it follows then this node |
| // has to be interested too so it can pass the information on. |
| - that->info()->AddFromFollowing(that->on_success()->info()); |
| + that->info()->AddFromFollowing(target->info()); |
| } |
| @@ -2657,13 +3154,11 @@ |
| // this node also, so it can pass it on. |
| info->AddFromFollowing(node->info()); |
| } |
| - EnsureAnalyzed(that->on_failure()); |
| } |
| void Analysis::VisitBackReference(BackReferenceNode* that) { |
| EnsureAnalyzed(that->on_success()); |
| - EnsureAnalyzed(that->on_failure()); |
| } |
| @@ -2746,7 +3241,7 @@ |
| } else { |
| // If this character class contains both word and non-word |
| // characters we need to split it into two. |
| - ChoiceNode* result = new ChoiceNode(2, on_failure()); |
| + ChoiceNode* result = new ChoiceNode(2); |
| // Welcome to the family, son! |
| result->set_siblings(this->siblings()); |
| *result->info() = *this->info(); |
| @@ -2754,16 +3249,14 @@ |
| result->info()->AddAssumptions(info); |
| RegExpNode* word_node |
| = new TextNode(new RegExpCharacterClass(word, false), |
| - on_success(), |
| - on_failure()); |
| + on_success()); |
| word_node->info()->determine_word = true; |
| word_node->info()->does_determine_word = true; |
| word_node->info()->is_word = NodeInfo::TRUE; |
| result->alternatives()->Add(GuardedAlternative(word_node)); |
| RegExpNode* non_word_node |
| = new TextNode(new RegExpCharacterClass(non_word, false), |
| - on_success(), |
| - on_failure()); |
| + on_success()); |
| non_word_node->info()->determine_word = true; |
| non_word_node->info()->does_determine_word = true; |
| non_word_node->info()->is_word = NodeInfo::FALSE; |
| @@ -2974,21 +3467,22 @@ |
| void DispatchTableConstructor::VisitAction(ActionNode* that) { |
| - that->on_success()->Accept(this); |
| + RegExpNode* target = that->on_success(); |
| + target->Accept(this); |
| } |
| Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, |
| RegExpNode** node_return, |
| bool ignore_case, |
| - bool is_multiline) { |
| + bool is_multiline, |
| + Handle<String> pattern) { |
| RegExpCompiler compiler(input->capture_count, ignore_case); |
| // Wrap the body of the regexp in capture #0. |
| RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, |
| 0, |
| &compiler, |
| - compiler.accept(), |
| - compiler.backtrack()); |
| + compiler.accept()); |
| // Add a .*? at the beginning, outside the body capture. |
| // Note: We could choose to not add this if the regexp is anchored at |
| // the start of the input but I'm not sure how best to do that and |
| @@ -2999,8 +3493,7 @@ |
| false, |
| new RegExpCharacterClass('*'), |
| &compiler, |
| - captured_body, |
| - compiler.backtrack()); |
| + captured_body); |
| if (node_return != NULL) *node_return = node; |
| Analysis analysis(ignore_case); |
| analysis.EnsureAnalyzed(node); |
| @@ -3024,14 +3517,16 @@ |
| (input->capture_count + 1) * 2); |
| return compiler.Assemble(¯o_assembler, |
| node, |
| - input->capture_count); |
| + input->capture_count, |
| + pattern); |
| #endif |
| } |
| EmbeddedVector<byte, 1024> codes; |
| RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| return compiler.Assemble(¯o_assembler, |
| node, |
| - input->capture_count); |
| + input->capture_count, |
| + pattern); |
| } |