Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(383)

Unified Diff: src/jsregexp.cc

Issue 18193: Add support for \b and ^ and $ in multiline mode, completing Irregexp... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 1094)
+++ src/jsregexp.cc (working copy)
@@ -1522,18 +1522,6 @@
}
-void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, Trace* trace) {
- if (info()->at_end) {
- Label succeed;
- // LoadCurrentCharacter will go to the label if we are at the end of the
- // input string.
- assembler->LoadCurrentCharacter(0, &succeed);
- assembler->GoTo(trace->backtrack());
- assembler->Bind(&succeed);
- }
-}
-
-
bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
if (!trace->is_trivial()) {
return trace->Flush(compiler, this);
@@ -1542,7 +1530,6 @@
if (!label()->is_bound()) {
assembler->Bind(label());
}
- EmitInfoChecks(assembler, trace);
assembler->ReadCurrentPositionFromRegister(current_position_register_);
assembler->ReadStackPointerFromRegister(stack_pointer_register_);
// Now that we have unwound the stack we find at the top of the stack the
@@ -1562,11 +1549,9 @@
}
switch (action_) {
case ACCEPT:
- EmitInfoChecks(assembler, trace);
assembler->Succeed();
return true;
case BACKTRACK:
- ASSERT(!info()->at_end);
assembler->GoTo(trace->backtrack());
return true;
case NEGATIVE_SUBMATCH_SUCCESS:
@@ -1935,13 +1920,6 @@
RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
Trace* trace) {
- // TODO(erikcorry): Implement support.
- if (info_.follows_word_interest ||
- info_.follows_newline_interest ||
- info_.follows_start_interest) {
- return FAIL;
- }
-
// If we are generating a greedy loop then don't stop and don't reuse code.
if (trace->stop_node() != NULL) {
return CONTINUE;
@@ -1990,6 +1968,19 @@
}
+int AssertionNode::EatsAtLeast(int recursion_depth) {
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
+ return on_success()->EatsAtLeast(recursion_depth + 1);
+}
+
+
+int BackReferenceNode::EatsAtLeast(int recursion_depth) {
+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
+ return on_success()->EatsAtLeast(recursion_depth + 1);
+}
+
+
+
int TextNode::EatsAtLeast(int recursion_depth) {
int answer = Length();
if (answer >= 4) return answer;
@@ -2257,7 +2248,7 @@
void QuickCheckDetails::Advance(int by, bool ascii) {
- ASSERT(by > 0);
+ ASSERT(by >= 0);
if (by >= characters_) {
Clear();
return;
@@ -2342,6 +2333,116 @@
}
+// Check for [0-9A-Z_a-z].
+static void EmitWordCheck(RegExpMacroAssembler* assembler,
+ Label* word,
+ Label* non_word,
+ bool fall_through_on_word) {
+ assembler->CheckCharacterGT('z', non_word);
+ assembler->CheckCharacterLT('0', non_word);
+ assembler->CheckCharacterGT('a' - 1, word);
+ assembler->CheckCharacterLT('9' + 1, word);
+ assembler->CheckCharacterLT('A', non_word);
+ assembler->CheckCharacterLT('Z' + 1, word);
+ if (fall_through_on_word) {
+ assembler->CheckNotCharacter('_', non_word);
+ } else {
+ assembler->CheckCharacter('_', word);
+ }
+}
+
+
+bool AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+ RegExpMacroAssembler* assembler = compiler->macro_assembler();
+ switch (type_) {
+ case AT_END: {
+ Label ok;
+ assembler->LoadCurrentCharacter(trace->cp_offset(), &ok);
+ assembler->GoTo(trace->backtrack());
+ assembler->Bind(&ok);
+ break;
+ }
+ case AT_START:
+ assembler->CheckNotAtStart(trace->backtrack());
+ break;
+ case AFTER_NEWLINE: {
+ // Check whether we are either at the start or immediately after a
+ // newline.
+ Trace new_trace(*trace);
+ // Invalidate the current character register.
+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii());
+
+ Label ok;
+ if (new_trace.cp_offset() == 0) {
+ assembler->CheckAtStart(&ok);
+ }
+ assembler->LoadCurrentCharacter(-1, new_trace.backtrack(), false);
Christian Plesner Hansen 2009/01/19 10:28:59 Maybe a comment about why this works when you're a
+ // Newline means \n, \r, 0x2028 or 0x2029.
+ if (!compiler->ascii()) {
+ assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
+ }
+ assembler->CheckCharacter('\n', &ok);
+ assembler->CheckNotCharacter('\r', new_trace.backtrack());
+ assembler->Bind(&ok);
+ return on_success()->Emit(compiler, &new_trace);
+ }
+ case AT_NON_BOUNDARY:
+ case AT_BOUNDARY: {
+ Label before_non_word;
Christian Plesner Hansen 2009/01/19 10:28:59 Maybe you could split these huge cases out into se
+ Label before_word;
+ if (trace->characters_preloaded() != 1) {
+ assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
+ }
+ // Fall through on non-word.
+ EmitWordCheck(assembler, &before_word, &before_non_word, false);
+
+ // Invalidate the current character register.
+ Trace new_trace(*trace);
+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii());
+
+ Label ok;
+ Label* boundary;
+ Label* not_boundary;
+ if (type_ == AT_BOUNDARY) {
+ boundary = &ok;
+ not_boundary = new_trace.backtrack();
+ } else {
+ not_boundary = &ok;
+ boundary = new_trace.backtrack();
+ }
+
+ // Next character is not a word character.
+ assembler->Bind(&before_non_word);
+ if (new_trace.cp_offset() == 0) {
+ assembler->CheckAtStart(not_boundary);
+ }
+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
Christian Plesner Hansen 2009/01/19 10:28:59 What happens here if we're right at the beginning
+ &ok, // Unused dummy label in this call.
+ false);
+ // Fall through on non-word.
+ EmitWordCheck(assembler, boundary, not_boundary, false);
+ assembler->GoTo(not_boundary);
+
+ // Next character is a word character.
+ assembler->Bind(&before_word);
+ if (new_trace.cp_offset() == 0) {
+ assembler->CheckAtStart(boundary);
+ }
+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
+ &ok, // Unused dummy label in this call.
+ false);
+ bool fall_through_on_word = (type_ == AT_NON_BOUNDARY);
+ EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
+
+ assembler->Bind(&ok);
+
+ return on_success()->Emit(compiler, &new_trace);
+ }
+ }
+ return on_success()->Emit(compiler, trace);
+}
+
+
// We call this repeatedly to generate code for each pass over the text node.
// The passes are in increasing order of difficulty because we hope one
// of the first passes will fail in which case we are saved the work of the
@@ -2487,17 +2588,6 @@
if (limit_result == DONE) return true;
ASSERT(limit_result == CONTINUE);
- if (info()->follows_word_interest ||
- info()->follows_newline_interest ||
- info()->follows_start_interest) {
- return false;
- }
-
- if (info()->at_end) {
- compiler->macro_assembler()->GoTo(trace->backtrack());
- return true;
- }
-
if (compiler->ascii()) {
int dummy = 0;
TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
@@ -2562,7 +2652,9 @@
void Trace::AdvanceCurrentPositionInTrace(int by, bool ascii) {
- ASSERT(by > 0);
+ // We can call this with by == 0. In that case it just means that the
Christian Plesner Hansen 2009/01/19 10:28:59 Maybe split invalidation out into a separate metho
+ // current position register has been invalidated.
+
// We don't have an instruction for shifting the current character register
// down or for using a shifted value for anything so lets just forget that
// we preloaded any characters into it.
@@ -2616,12 +2708,6 @@
if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
return kNodeIsTooComplexForGreedyLoops;
}
- NodeInfo* info = node->info();
- if (info->follows_word_interest ||
- info->follows_newline_interest ||
- info->follows_start_interest) {
- return kNodeIsTooComplexForGreedyLoops;
- }
int node_length = node->GreedyLoopTextLength();
if (node_length == kNodeIsTooComplexForGreedyLoops) {
return kNodeIsTooComplexForGreedyLoops;
@@ -3096,20 +3182,6 @@
}
case POSITIVE_SUBMATCH_SUCCESS:
if (!trace->is_trivial()) return trace->Flush(compiler, this);
- // TODO(erikcorry): Implement support.
- if (info()->follows_word_interest ||
- info()->follows_newline_interest ||
- info()->follows_start_interest) {
- return false;
- }
- if (info()->at_end) {
- Label at_end;
- // Load current character jumps to the label if we are beyond the string
- // end.
- assembler->LoadCurrentCharacter(0, &at_end);
- assembler->GoTo(trace->backtrack());
- assembler->Bind(&at_end);
- }
assembler->ReadCurrentPositionFromRegister(
data_.u_submatch.current_position_register);
assembler->ReadStackPointerFromRegister(
@@ -3136,19 +3208,11 @@
RecursionCheck rc(compiler);
ASSERT_EQ(start_reg_ + 1, end_reg_);
- if (info()->at_end) {
- // If we are constrained to match at the end of the input then succeed
- // iff the back reference is empty.
- assembler->CheckNotRegistersEqual(start_reg_,
- end_reg_,
- trace->backtrack());
+ if (compiler->ignore_case()) {
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
+ trace->backtrack());
} else {
- if (compiler->ignore_case()) {
- assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
- trace->backtrack());
- } else {
- assembler->CheckNotBackReference(start_reg_, trace->backtrack());
- }
+ assembler->CheckNotBackReference(start_reg_, trace->backtrack());
}
return on_success()->Emit(compiler, trace);
}
@@ -3389,6 +3453,33 @@
}
+void DotPrinter::VisitAssertion(AssertionNode* that) {
+ stream()->Add(" n%p [", that);
+ switch (that->type()) {
+ case AssertionNode::AT_END:
+ stream()->Add("label=\"$\", shape=septagon");
+ break;
+ case AssertionNode::AT_START:
+ stream()->Add("label=\"^\", shape=septagon");
+ break;
+ case AssertionNode::AT_BOUNDARY:
+ stream()->Add("label=\"\\b\", shape=septagon");
+ break;
+ case AssertionNode::AT_NON_BOUNDARY:
+ stream()->Add("label=\"\\B\", shape=septagon");
+ break;
+ case AssertionNode::AFTER_NEWLINE:
+ stream()->Add("label=\"(?<=\\n)\", shape=septagon");
+ break;
+ }
+ stream()->Add("];\n");
+ PrintAttributes(that);
+ RegExpNode* successor = that->on_success();
+ stream()->Add(" n%p -> n%p;\n", that, successor);
+ Visit(successor);
+}
+
+
void DotPrinter::VisitAction(ActionNode* that) {
stream()->Add(" n%p [", that);
switch (that->type_) {
@@ -3749,22 +3840,50 @@
NodeInfo info;
switch (type()) {
case START_OF_LINE:
- info.follows_newline_interest = true;
- break;
+ return AssertionNode::AfterNewline(on_success);
case START_OF_INPUT:
- info.follows_start_interest = true;
- break;
- case BOUNDARY: case NON_BOUNDARY:
- info.follows_word_interest = true;
- break;
+ return AssertionNode::AtStart(on_success);
+ case BOUNDARY:
+ return AssertionNode::AtBoundary(on_success);
+ case NON_BOUNDARY:
+ return AssertionNode::AtNonBoundary(on_success);
case END_OF_INPUT:
- info.at_end = true;
- break;
+ return AssertionNode::AtEnd(on_success);
case END_OF_LINE:
- // This is wrong but has the effect of making the compiler abort.
- info.at_end = true;
+ // Compile $ in multiline regexps as an alternation with a positive
+ // lookahead in one side and an end-of-input on the other side.
+ // We need two registers for the lookahead.
+ int stack_pointer_register = compiler->AllocateRegister();
+ int position_register = compiler->AllocateRegister();
+ // The ChoiceNode to distinguish between a newline and end-of-input.
+ ChoiceNode* result = new ChoiceNode(2);
+ // Create a newline atom.
+ ZoneList<CharacterRange>* newline_ranges =
+ new ZoneList<CharacterRange>(3);
+ CharacterRange::AddClassEscape('n', newline_ranges);
+ RegExpCharacterClass* newline_atom =
+ new RegExpCharacterClass(newline_ranges, false);
Christian Plesner Hansen 2009/01/19 10:28:59 You should be able to just use "new RegExpCharacte
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
+ elms->Add(TextElement::CharClass(newline_atom));
+ TextNode* newline_matcher = new TextNode(
Christian Plesner Hansen 2009/01/19 10:28:59 You should be able to use "new TextNode(newline_at
+ elms,
+ ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
+ position_register,
+ on_success));
+ // Create an end-of-input matcher.
+ RegExpNode* end_of_line = ActionNode::BeginSubmatch(
+ stack_pointer_register,
+ position_register,
+ newline_matcher);
+ // Add the two alternatives to the ChoiceNode.
+ GuardedAlternative eol_alternative(end_of_line);
+ result->AddAlternative(eol_alternative);
+ GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
+ result->AddAlternative(end_alternative);
+ return result;
}
- return on_success->PropagateForward(&info);
+ UNREACHABLE();
Christian Plesner Hansen 2009/01/19 10:28:59 Maybe have this in the 'default' case, that way th
+ return on_success;
}
@@ -3911,6 +4030,13 @@
case '*':
ranges->Add(CharacterRange::Everything());
break;
+ // This is the set of characters matched by the $ and ^ symbols
+ // in multiline mode.
+ case 'n':
+ AddClass(kLineTerminatorRanges,
+ kLineTerminatorRangeCount,
+ ranges);
+ break;
default:
UNREACHABLE();
}
@@ -4096,62 +4222,6 @@
}
-RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
Christian Plesner Hansen 2009/01/19 10:28:59 O joy, we can get rid of all this cra..., I mean s
- NodeInfo full_info(*this->info());
- full_info.AddFromPreceding(info);
- bool cloned = false;
- ActionNode* action = EnsureSibling(this, &full_info, &cloned);
- action->set_on_success(action->on_success()->PropagateForward(info));
- return action;
-}
-
-
-RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
- NodeInfo full_info(*this->info());
- full_info.AddFromPreceding(info);
- bool cloned = false;
- ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
- if (cloned) {
- ZoneList<GuardedAlternative>* old_alternatives = alternatives();
- int count = old_alternatives->length();
- choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
- for (int i = 0; i < count; i++) {
- GuardedAlternative alternative = old_alternatives->at(i);
- alternative.set_node(alternative.node()->PropagateForward(info));
- choice->alternatives()->Add(alternative);
- }
- }
- return choice;
-}
-
-
-RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
- return PropagateToEndpoint(this, info);
-}
-
-
-RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
- NodeInfo full_info(*this->info());
- full_info.AddFromPreceding(info);
- bool cloned = false;
- BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
- if (cloned) {
- // TODO(erikcorry): A back reference has to have two successors (by default
- // the same node). The first is used if the back reference matches a non-
- // empty back reference, the second if it matches an empty one. This
- // doesn't matter for at_end, which is the only one implemented right now,
- // but it will matter for other pieces of info.
- back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
- }
- return back_ref;
-}
-
-
-RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
- return PropagateToEndpoint(this, info);
-}
-
-
// -------------------------------------------------------------------
// Splay tree
@@ -4389,6 +4459,11 @@
}
+void Analysis::VisitAssertion(AssertionNode* that) {
+ EnsureAnalyzed(that->on_success());
+}
+
+
// -------------------------------------------------------------------
// Dispatch table construction
@@ -4441,7 +4516,13 @@
}
+void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
+ RegExpNode* target = that->on_success();
+ target->Accept(this);
+}
+
+
static int CompareRangeByFrom(const CharacterRange* a,
const CharacterRange* b) {
return Compare<uc16>(a->from(), b->from());
@@ -4527,10 +4608,6 @@
NodeInfo info = *node->info();
- if (is_multiline && !FLAG_attempt_multiline_irregexp) {
- return Handle<FixedArray>::null();
- }
-
if (FLAG_irregexp_native) {
#ifdef ARM
// Unimplemented, fall-through to bytecode implementation.

Powered by Google App Engine
This is Rietveld 408576698