Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 1094) |
+++ src/jsregexp.cc (working copy) |
@@ -1522,18 +1522,6 @@ |
} |
-void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, Trace* trace) { |
- if (info()->at_end) { |
- Label succeed; |
- // LoadCurrentCharacter will go to the label if we are at the end of the |
- // input string. |
- assembler->LoadCurrentCharacter(0, &succeed); |
- assembler->GoTo(trace->backtrack()); |
- assembler->Bind(&succeed); |
- } |
-} |
- |
- |
bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { |
if (!trace->is_trivial()) { |
return trace->Flush(compiler, this); |
@@ -1542,7 +1530,6 @@ |
if (!label()->is_bound()) { |
assembler->Bind(label()); |
} |
- EmitInfoChecks(assembler, trace); |
assembler->ReadCurrentPositionFromRegister(current_position_register_); |
assembler->ReadStackPointerFromRegister(stack_pointer_register_); |
// Now that we have unwound the stack we find at the top of the stack the |
@@ -1562,11 +1549,9 @@ |
} |
switch (action_) { |
case ACCEPT: |
- EmitInfoChecks(assembler, trace); |
assembler->Succeed(); |
return true; |
case BACKTRACK: |
- ASSERT(!info()->at_end); |
assembler->GoTo(trace->backtrack()); |
return true; |
case NEGATIVE_SUBMATCH_SUCCESS: |
@@ -1935,13 +1920,6 @@ |
RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, |
Trace* trace) { |
- // TODO(erikcorry): Implement support. |
- if (info_.follows_word_interest || |
- info_.follows_newline_interest || |
- info_.follows_start_interest) { |
- return FAIL; |
- } |
- |
// If we are generating a greedy loop then don't stop and don't reuse code. |
if (trace->stop_node() != NULL) { |
return CONTINUE; |
@@ -1990,6 +1968,19 @@ |
} |
+int AssertionNode::EatsAtLeast(int recursion_depth) { |
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
+ return on_success()->EatsAtLeast(recursion_depth + 1); |
+} |
+ |
+ |
+int BackReferenceNode::EatsAtLeast(int recursion_depth) { |
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
+ return on_success()->EatsAtLeast(recursion_depth + 1); |
+} |
+ |
+ |
+ |
int TextNode::EatsAtLeast(int recursion_depth) { |
int answer = Length(); |
if (answer >= 4) return answer; |
@@ -2257,7 +2248,7 @@ |
void QuickCheckDetails::Advance(int by, bool ascii) { |
- ASSERT(by > 0); |
+ ASSERT(by >= 0); |
if (by >= characters_) { |
Clear(); |
return; |
@@ -2342,6 +2333,116 @@ |
} |
+// Check for [0-9A-Z_a-z]. |
+static void EmitWordCheck(RegExpMacroAssembler* assembler, |
+ Label* word, |
+ Label* non_word, |
+ bool fall_through_on_word) { |
+ assembler->CheckCharacterGT('z', non_word); |
+ assembler->CheckCharacterLT('0', non_word); |
+ assembler->CheckCharacterGT('a' - 1, word); |
+ assembler->CheckCharacterLT('9' + 1, word); |
+ assembler->CheckCharacterLT('A', non_word); |
+ assembler->CheckCharacterLT('Z' + 1, word); |
+ if (fall_through_on_word) { |
+ assembler->CheckNotCharacter('_', non_word); |
+ } else { |
+ assembler->CheckCharacter('_', word); |
+ } |
+} |
+ |
+ |
+bool AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
+ RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
+ switch (type_) { |
+ case AT_END: { |
+ Label ok; |
+ assembler->LoadCurrentCharacter(trace->cp_offset(), &ok); |
+ assembler->GoTo(trace->backtrack()); |
+ assembler->Bind(&ok); |
+ break; |
+ } |
+ case AT_START: |
+ assembler->CheckNotAtStart(trace->backtrack()); |
+ break; |
+ case AFTER_NEWLINE: { |
+ // Check whether we are either at the start or immediately after a |
+ // newline. |
+ Trace new_trace(*trace); |
+ // Invalidate the current character register. |
+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii()); |
+ |
+ Label ok; |
+ if (new_trace.cp_offset() == 0) { |
+ assembler->CheckAtStart(&ok); |
+ } |
+ assembler->LoadCurrentCharacter(-1, new_trace.backtrack(), false); |
Christian Plesner Hansen
2009/01/19 10:28:59
Maybe a comment about why this works when you're a
|
+ // Newline means \n, \r, 0x2028 or 0x2029. |
+ if (!compiler->ascii()) { |
+ assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
+ } |
+ assembler->CheckCharacter('\n', &ok); |
+ assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
+ assembler->Bind(&ok); |
+ return on_success()->Emit(compiler, &new_trace); |
+ } |
+ case AT_NON_BOUNDARY: |
+ case AT_BOUNDARY: { |
+ Label before_non_word; |
Christian Plesner Hansen
2009/01/19 10:28:59
Maybe you could split these huge cases out into se
|
+ Label before_word; |
+ if (trace->characters_preloaded() != 1) { |
+ assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); |
+ } |
+ // Fall through on non-word. |
+ EmitWordCheck(assembler, &before_word, &before_non_word, false); |
+ |
+ // Invalidate the current character register. |
+ Trace new_trace(*trace); |
+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii()); |
+ |
+ Label ok; |
+ Label* boundary; |
+ Label* not_boundary; |
+ if (type_ == AT_BOUNDARY) { |
+ boundary = &ok; |
+ not_boundary = new_trace.backtrack(); |
+ } else { |
+ not_boundary = &ok; |
+ boundary = new_trace.backtrack(); |
+ } |
+ |
+ // Next character is not a word character. |
+ assembler->Bind(&before_non_word); |
+ if (new_trace.cp_offset() == 0) { |
+ assembler->CheckAtStart(not_boundary); |
+ } |
+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, |
Christian Plesner Hansen
2009/01/19 10:28:59
What happens here if we're right at the beginning
|
+ &ok, // Unused dummy label in this call. |
+ false); |
+ // Fall through on non-word. |
+ EmitWordCheck(assembler, boundary, not_boundary, false); |
+ assembler->GoTo(not_boundary); |
+ |
+ // Next character is a word character. |
+ assembler->Bind(&before_word); |
+ if (new_trace.cp_offset() == 0) { |
+ assembler->CheckAtStart(boundary); |
+ } |
+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, |
+ &ok, // Unused dummy label in this call. |
+ false); |
+ bool fall_through_on_word = (type_ == AT_NON_BOUNDARY); |
+ EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); |
+ |
+ assembler->Bind(&ok); |
+ |
+ return on_success()->Emit(compiler, &new_trace); |
+ } |
+ } |
+ return on_success()->Emit(compiler, trace); |
+} |
+ |
+ |
// We call this repeatedly to generate code for each pass over the text node. |
// The passes are in increasing order of difficulty because we hope one |
// of the first passes will fail in which case we are saved the work of the |
@@ -2487,17 +2588,6 @@ |
if (limit_result == DONE) return true; |
ASSERT(limit_result == CONTINUE); |
- if (info()->follows_word_interest || |
- info()->follows_newline_interest || |
- info()->follows_start_interest) { |
- return false; |
- } |
- |
- if (info()->at_end) { |
- compiler->macro_assembler()->GoTo(trace->backtrack()); |
- return true; |
- } |
- |
if (compiler->ascii()) { |
int dummy = 0; |
TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); |
@@ -2562,7 +2652,9 @@ |
void Trace::AdvanceCurrentPositionInTrace(int by, bool ascii) { |
- ASSERT(by > 0); |
+ // We can call this with by == 0. In that case it just means that the |
Christian Plesner Hansen
2009/01/19 10:28:59
Maybe split invalidation out into a separate metho
|
+ // current position register has been invalidated. |
+ |
// We don't have an instruction for shifting the current character register |
// down or for using a shifted value for anything so lets just forget that |
// we preloaded any characters into it. |
@@ -2616,12 +2708,6 @@ |
if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { |
return kNodeIsTooComplexForGreedyLoops; |
} |
- NodeInfo* info = node->info(); |
- if (info->follows_word_interest || |
- info->follows_newline_interest || |
- info->follows_start_interest) { |
- return kNodeIsTooComplexForGreedyLoops; |
- } |
int node_length = node->GreedyLoopTextLength(); |
if (node_length == kNodeIsTooComplexForGreedyLoops) { |
return kNodeIsTooComplexForGreedyLoops; |
@@ -3096,20 +3182,6 @@ |
} |
case POSITIVE_SUBMATCH_SUCCESS: |
if (!trace->is_trivial()) return trace->Flush(compiler, this); |
- // TODO(erikcorry): Implement support. |
- if (info()->follows_word_interest || |
- info()->follows_newline_interest || |
- info()->follows_start_interest) { |
- return false; |
- } |
- if (info()->at_end) { |
- Label at_end; |
- // Load current character jumps to the label if we are beyond the string |
- // end. |
- assembler->LoadCurrentCharacter(0, &at_end); |
- assembler->GoTo(trace->backtrack()); |
- assembler->Bind(&at_end); |
- } |
assembler->ReadCurrentPositionFromRegister( |
data_.u_submatch.current_position_register); |
assembler->ReadStackPointerFromRegister( |
@@ -3136,19 +3208,11 @@ |
RecursionCheck rc(compiler); |
ASSERT_EQ(start_reg_ + 1, end_reg_); |
- if (info()->at_end) { |
- // If we are constrained to match at the end of the input then succeed |
- // iff the back reference is empty. |
- assembler->CheckNotRegistersEqual(start_reg_, |
- end_reg_, |
- trace->backtrack()); |
+ if (compiler->ignore_case()) { |
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_, |
+ trace->backtrack()); |
} else { |
- if (compiler->ignore_case()) { |
- assembler->CheckNotBackReferenceIgnoreCase(start_reg_, |
- trace->backtrack()); |
- } else { |
- assembler->CheckNotBackReference(start_reg_, trace->backtrack()); |
- } |
+ assembler->CheckNotBackReference(start_reg_, trace->backtrack()); |
} |
return on_success()->Emit(compiler, trace); |
} |
@@ -3389,6 +3453,33 @@ |
} |
+void DotPrinter::VisitAssertion(AssertionNode* that) { |
+ stream()->Add(" n%p [", that); |
+ switch (that->type()) { |
+ case AssertionNode::AT_END: |
+ stream()->Add("label=\"$\", shape=septagon"); |
+ break; |
+ case AssertionNode::AT_START: |
+ stream()->Add("label=\"^\", shape=septagon"); |
+ break; |
+ case AssertionNode::AT_BOUNDARY: |
+ stream()->Add("label=\"\\b\", shape=septagon"); |
+ break; |
+ case AssertionNode::AT_NON_BOUNDARY: |
+ stream()->Add("label=\"\\B\", shape=septagon"); |
+ break; |
+ case AssertionNode::AFTER_NEWLINE: |
+ stream()->Add("label=\"(?<=\\n)\", shape=septagon"); |
+ break; |
+ } |
+ stream()->Add("];\n"); |
+ PrintAttributes(that); |
+ RegExpNode* successor = that->on_success(); |
+ stream()->Add(" n%p -> n%p;\n", that, successor); |
+ Visit(successor); |
+} |
+ |
+ |
void DotPrinter::VisitAction(ActionNode* that) { |
stream()->Add(" n%p [", that); |
switch (that->type_) { |
@@ -3749,22 +3840,50 @@ |
NodeInfo info; |
switch (type()) { |
case START_OF_LINE: |
- info.follows_newline_interest = true; |
- break; |
+ return AssertionNode::AfterNewline(on_success); |
case START_OF_INPUT: |
- info.follows_start_interest = true; |
- break; |
- case BOUNDARY: case NON_BOUNDARY: |
- info.follows_word_interest = true; |
- break; |
+ return AssertionNode::AtStart(on_success); |
+ case BOUNDARY: |
+ return AssertionNode::AtBoundary(on_success); |
+ case NON_BOUNDARY: |
+ return AssertionNode::AtNonBoundary(on_success); |
case END_OF_INPUT: |
- info.at_end = true; |
- break; |
+ return AssertionNode::AtEnd(on_success); |
case END_OF_LINE: |
- // This is wrong but has the effect of making the compiler abort. |
- info.at_end = true; |
+ // Compile $ in multiline regexps as an alternation with a positive |
+ // lookahead in one side and an end-of-input on the other side. |
+ // We need two registers for the lookahead. |
+ int stack_pointer_register = compiler->AllocateRegister(); |
+ int position_register = compiler->AllocateRegister(); |
+ // The ChoiceNode to distinguish between a newline and end-of-input. |
+ ChoiceNode* result = new ChoiceNode(2); |
+ // Create a newline atom. |
+ ZoneList<CharacterRange>* newline_ranges = |
+ new ZoneList<CharacterRange>(3); |
+ CharacterRange::AddClassEscape('n', newline_ranges); |
+ RegExpCharacterClass* newline_atom = |
+ new RegExpCharacterClass(newline_ranges, false); |
Christian Plesner Hansen
2009/01/19 10:28:59
You should be able to just use "new RegExpCharacte
|
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
+ elms->Add(TextElement::CharClass(newline_atom)); |
+ TextNode* newline_matcher = new TextNode( |
Christian Plesner Hansen
2009/01/19 10:28:59
You should be able to use "new TextNode(newline_at
|
+ elms, |
+ ActionNode::PositiveSubmatchSuccess(stack_pointer_register, |
+ position_register, |
+ on_success)); |
+ // Create an end-of-input matcher. |
+ RegExpNode* end_of_line = ActionNode::BeginSubmatch( |
+ stack_pointer_register, |
+ position_register, |
+ newline_matcher); |
+ // Add the two alternatives to the ChoiceNode. |
+ GuardedAlternative eol_alternative(end_of_line); |
+ result->AddAlternative(eol_alternative); |
+ GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); |
+ result->AddAlternative(end_alternative); |
+ return result; |
} |
- return on_success->PropagateForward(&info); |
+ UNREACHABLE(); |
Christian Plesner Hansen
2009/01/19 10:28:59
Maybe have this in the 'default' case, that way th
|
+ return on_success; |
} |
@@ -3911,6 +4030,13 @@ |
case '*': |
ranges->Add(CharacterRange::Everything()); |
break; |
+ // This is the set of characters matched by the $ and ^ symbols |
+ // in multiline mode. |
+ case 'n': |
+ AddClass(kLineTerminatorRanges, |
+ kLineTerminatorRangeCount, |
+ ranges); |
+ break; |
default: |
UNREACHABLE(); |
} |
@@ -4096,62 +4222,6 @@ |
} |
-RegExpNode* ActionNode::PropagateForward(NodeInfo* info) { |
Christian Plesner Hansen
2009/01/19 10:28:59
O joy, we can get rid of all this cra..., I mean s
|
- NodeInfo full_info(*this->info()); |
- full_info.AddFromPreceding(info); |
- bool cloned = false; |
- ActionNode* action = EnsureSibling(this, &full_info, &cloned); |
- action->set_on_success(action->on_success()->PropagateForward(info)); |
- return action; |
-} |
- |
- |
-RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) { |
- NodeInfo full_info(*this->info()); |
- full_info.AddFromPreceding(info); |
- bool cloned = false; |
- ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned); |
- if (cloned) { |
- ZoneList<GuardedAlternative>* old_alternatives = alternatives(); |
- int count = old_alternatives->length(); |
- choice->alternatives_ = new ZoneList<GuardedAlternative>(count); |
- for (int i = 0; i < count; i++) { |
- GuardedAlternative alternative = old_alternatives->at(i); |
- alternative.set_node(alternative.node()->PropagateForward(info)); |
- choice->alternatives()->Add(alternative); |
- } |
- } |
- return choice; |
-} |
- |
- |
-RegExpNode* EndNode::PropagateForward(NodeInfo* info) { |
- return PropagateToEndpoint(this, info); |
-} |
- |
- |
-RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) { |
- NodeInfo full_info(*this->info()); |
- full_info.AddFromPreceding(info); |
- bool cloned = false; |
- BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned); |
- if (cloned) { |
- // TODO(erikcorry): A back reference has to have two successors (by default |
- // the same node). The first is used if the back reference matches a non- |
- // empty back reference, the second if it matches an empty one. This |
- // doesn't matter for at_end, which is the only one implemented right now, |
- // but it will matter for other pieces of info. |
- back_ref->set_on_success(back_ref->on_success()->PropagateForward(info)); |
- } |
- return back_ref; |
-} |
- |
- |
-RegExpNode* TextNode::PropagateForward(NodeInfo* info) { |
- return PropagateToEndpoint(this, info); |
-} |
- |
- |
// ------------------------------------------------------------------- |
// Splay tree |
@@ -4389,6 +4459,11 @@ |
} |
+void Analysis::VisitAssertion(AssertionNode* that) { |
+ EnsureAnalyzed(that->on_success()); |
+} |
+ |
+ |
// ------------------------------------------------------------------- |
// Dispatch table construction |
@@ -4441,7 +4516,13 @@ |
} |
+void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { |
+ RegExpNode* target = that->on_success(); |
+ target->Accept(this); |
+} |
+ |
+ |
static int CompareRangeByFrom(const CharacterRange* a, |
const CharacterRange* b) { |
return Compare<uc16>(a->from(), b->from()); |
@@ -4527,10 +4608,6 @@ |
NodeInfo info = *node->info(); |
- if (is_multiline && !FLAG_attempt_multiline_irregexp) { |
- return Handle<FixedArray>::null(); |
- } |
- |
if (FLAG_irregexp_native) { |
#ifdef ARM |
// Unimplemented, fall-through to bytecode implementation. |