src/jsregexp.cc - Issue 18193: Add support for \b and ^ and $ in multiline mode, completing Irregexp...

Unified Diff: src/jsregexp.cc

Issue 18193: Add support for \b and ^ and $ in multiline mode, completing Irregexp... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 1094)

+++ src/jsregexp.cc (working copy)

@@ -1522,18 +1522,6 @@

}

-void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, Trace* trace) {

- if (info()->at_end) {

- Label succeed;

- // LoadCurrentCharacter will go to the label if we are at the end of the

- // input string.

- assembler->LoadCurrentCharacter(0, &succeed);

- assembler->GoTo(trace->backtrack());

- assembler->Bind(&succeed);

- }

bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {

if (!trace->is_trivial()) {

return trace->Flush(compiler, this);

@@ -1542,7 +1530,6 @@

if (!label()->is_bound()) {

assembler->Bind(label());

}

- EmitInfoChecks(assembler, trace);

assembler->ReadCurrentPositionFromRegister(current_position_register_);

assembler->ReadStackPointerFromRegister(stack_pointer_register_);

// Now that we have unwound the stack we find at the top of the stack the

@@ -1562,11 +1549,9 @@

}

switch (action_) {

case ACCEPT:

- EmitInfoChecks(assembler, trace);

assembler->Succeed();

return true;

case BACKTRACK:

- ASSERT(!info()->at_end);

assembler->GoTo(trace->backtrack());

return true;

case NEGATIVE_SUBMATCH_SUCCESS:

@@ -1935,13 +1920,6 @@

RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,

Trace* trace) {

- // TODO(erikcorry): Implement support.

- if (info_.follows_word_interest ||

- info_.follows_newline_interest ||

- info_.follows_start_interest) {

- return FAIL;

- }

// If we are generating a greedy loop then don't stop and don't reuse code.

if (trace->stop_node() != NULL) {

return CONTINUE;

@@ -1990,6 +1968,19 @@

}

+int AssertionNode::EatsAtLeast(int recursion_depth) {

+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;

+ return on_success()->EatsAtLeast(recursion_depth + 1);

+int BackReferenceNode::EatsAtLeast(int recursion_depth) {

+ if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;

+ return on_success()->EatsAtLeast(recursion_depth + 1);

int TextNode::EatsAtLeast(int recursion_depth) {

int answer = Length();

if (answer >= 4) return answer;

@@ -2257,7 +2248,7 @@

void QuickCheckDetails::Advance(int by, bool ascii) {

- ASSERT(by > 0);

+ ASSERT(by >= 0);

if (by >= characters_) {

Clear();

return;

@@ -2342,6 +2333,116 @@

}

+// Check for [0-9A-Z_a-z].

+static void EmitWordCheck(RegExpMacroAssembler* assembler,

+ Label* word,

+ Label* non_word,

+ bool fall_through_on_word) {

+ assembler->CheckCharacterGT('z', non_word);

+ assembler->CheckCharacterLT('0', non_word);

+ assembler->CheckCharacterGT('a' - 1, word);

+ assembler->CheckCharacterLT('9' + 1, word);

+ assembler->CheckCharacterLT('A', non_word);

+ assembler->CheckCharacterLT('Z' + 1, word);

+ if (fall_through_on_word) {

+ assembler->CheckNotCharacter('_', non_word);

+ } else {

+ assembler->CheckCharacter('_', word);

+ }

+bool AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

+ RegExpMacroAssembler* assembler = compiler->macro_assembler();

+ switch (type_) {

+ case AT_END: {

+ Label ok;

+ assembler->LoadCurrentCharacter(trace->cp_offset(), &ok);

+ assembler->GoTo(trace->backtrack());

+ assembler->Bind(&ok);

+ break;

+ }

+ case AT_START:

+ assembler->CheckNotAtStart(trace->backtrack());

+ break;

+ case AFTER_NEWLINE: {

+ // Check whether we are either at the start or immediately after a

+ // newline.

+ Trace new_trace(*trace);

+ // Invalidate the current character register.

+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii());

+ Label ok;

+ if (new_trace.cp_offset() == 0) {

+ assembler->CheckAtStart(&ok);

+ }

+ assembler->LoadCurrentCharacter(-1, new_trace.backtrack(), false);

Christian Plesner Hansen 2009/01/19 10:28:59 Maybe a comment about why this works when you're a

+ // Newline means \n, \r, 0x2028 or 0x2029.

+ if (!compiler->ascii()) {

+ assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);

+ }

+ assembler->CheckCharacter('\n', &ok);

+ assembler->CheckNotCharacter('\r', new_trace.backtrack());

+ assembler->Bind(&ok);

+ return on_success()->Emit(compiler, &new_trace);

+ }

+ case AT_NON_BOUNDARY:

+ case AT_BOUNDARY: {

+ Label before_non_word;

Christian Plesner Hansen 2009/01/19 10:28:59 Maybe you could split these huge cases out into se

+ Label before_word;

+ if (trace->characters_preloaded() != 1) {

+ assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);

+ }

+ // Fall through on non-word.

+ EmitWordCheck(assembler, &before_word, &before_non_word, false);

+ // Invalidate the current character register.

+ Trace new_trace(*trace);

+ new_trace.AdvanceCurrentPositionInTrace(0, compiler->ascii());

+ Label ok;

+ Label* boundary;

+ Label* not_boundary;

+ if (type_ == AT_BOUNDARY) {

+ boundary = &ok;

+ not_boundary = new_trace.backtrack();

+ } else {

+ not_boundary = &ok;

+ boundary = new_trace.backtrack();

+ }

+ // Next character is not a word character.

+ assembler->Bind(&before_non_word);

+ if (new_trace.cp_offset() == 0) {

+ assembler->CheckAtStart(not_boundary);

+ }

+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,

Christian Plesner Hansen 2009/01/19 10:28:59 What happens here if we're right at the beginning

+ &ok, // Unused dummy label in this call.

+ false);

+ // Fall through on non-word.

+ EmitWordCheck(assembler, boundary, not_boundary, false);

+ assembler->GoTo(not_boundary);

+ // Next character is a word character.

+ assembler->Bind(&before_word);

+ if (new_trace.cp_offset() == 0) {

+ assembler->CheckAtStart(boundary);

+ }

+ assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,

+ &ok, // Unused dummy label in this call.

+ false);

+ bool fall_through_on_word = (type_ == AT_NON_BOUNDARY);

+ EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);

+ assembler->Bind(&ok);

+ return on_success()->Emit(compiler, &new_trace);

+ }

+ return on_success()->Emit(compiler, trace);

// We call this repeatedly to generate code for each pass over the text node.

// The passes are in increasing order of difficulty because we hope one

// of the first passes will fail in which case we are saved the work of the

@@ -2487,17 +2588,6 @@

if (limit_result == DONE) return true;

ASSERT(limit_result == CONTINUE);

- if (info()->follows_word_interest ||

- info()->follows_newline_interest ||

- info()->follows_start_interest) {

- return false;

- }

- if (info()->at_end) {

- compiler->macro_assembler()->GoTo(trace->backtrack());

- return true;

- }

if (compiler->ascii()) {

int dummy = 0;

TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);

@@ -2562,7 +2652,9 @@

void Trace::AdvanceCurrentPositionInTrace(int by, bool ascii) {

- ASSERT(by > 0);

+ // We can call this with by == 0. In that case it just means that the

Christian Plesner Hansen 2009/01/19 10:28:59 Maybe split invalidation out into a separate metho

+ // current position register has been invalidated.

// We don't have an instruction for shifting the current character register

// down or for using a shifted value for anything so lets just forget that

// we preloaded any characters into it.

@@ -2616,12 +2708,6 @@

if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {

return kNodeIsTooComplexForGreedyLoops;

}

- NodeInfo* info = node->info();

- if (info->follows_word_interest ||

- info->follows_newline_interest ||

- info->follows_start_interest) {

- return kNodeIsTooComplexForGreedyLoops;

- }

int node_length = node->GreedyLoopTextLength();

if (node_length == kNodeIsTooComplexForGreedyLoops) {

return kNodeIsTooComplexForGreedyLoops;

@@ -3096,20 +3182,6 @@

}

case POSITIVE_SUBMATCH_SUCCESS:

if (!trace->is_trivial()) return trace->Flush(compiler, this);

- // TODO(erikcorry): Implement support.

- if (info()->follows_word_interest ||

- info()->follows_newline_interest ||

- info()->follows_start_interest) {

- return false;

- }

- if (info()->at_end) {

- Label at_end;

- // Load current character jumps to the label if we are beyond the string

- // end.

- assembler->LoadCurrentCharacter(0, &at_end);

- assembler->GoTo(trace->backtrack());

- assembler->Bind(&at_end);

- }

assembler->ReadCurrentPositionFromRegister(

data_.u_submatch.current_position_register);

assembler->ReadStackPointerFromRegister(

@@ -3136,19 +3208,11 @@

RecursionCheck rc(compiler);

ASSERT_EQ(start_reg_ + 1, end_reg_);

- if (info()->at_end) {

- // If we are constrained to match at the end of the input then succeed

- // iff the back reference is empty.

- assembler->CheckNotRegistersEqual(start_reg_,

- end_reg_,

- trace->backtrack());

+ if (compiler->ignore_case()) {

+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_,

+ trace->backtrack());

} else {

- if (compiler->ignore_case()) {

- assembler->CheckNotBackReferenceIgnoreCase(start_reg_,

- trace->backtrack());

- } else {

- assembler->CheckNotBackReference(start_reg_, trace->backtrack());

- }

+ assembler->CheckNotBackReference(start_reg_, trace->backtrack());

}

return on_success()->Emit(compiler, trace);

}

@@ -3389,6 +3453,33 @@

}

+void DotPrinter::VisitAssertion(AssertionNode* that) {

+ stream()->Add(" n%p [", that);

+ switch (that->type()) {

+ case AssertionNode::AT_END:

+ stream()->Add("label=\"$\", shape=septagon");

+ break;

+ case AssertionNode::AT_START:

+ stream()->Add("label=\"^\", shape=septagon");

+ break;

+ case AssertionNode::AT_BOUNDARY:

+ stream()->Add("label=\"\\b\", shape=septagon");

+ break;

+ case AssertionNode::AT_NON_BOUNDARY:

+ stream()->Add("label=\"\\B\", shape=septagon");

+ break;

+ case AssertionNode::AFTER_NEWLINE:

+ stream()->Add("label=\"(?<=\\n)\", shape=septagon");

+ break;

+ }

+ stream()->Add("];\n");

+ PrintAttributes(that);

+ RegExpNode* successor = that->on_success();

+ stream()->Add(" n%p -> n%p;\n", that, successor);

+ Visit(successor);

void DotPrinter::VisitAction(ActionNode* that) {

stream()->Add(" n%p [", that);

switch (that->type_) {

@@ -3749,22 +3840,50 @@

NodeInfo info;

switch (type()) {

case START_OF_LINE:

- info.follows_newline_interest = true;

- break;

+ return AssertionNode::AfterNewline(on_success);

case START_OF_INPUT:

- info.follows_start_interest = true;

- break;

- case BOUNDARY: case NON_BOUNDARY:

- info.follows_word_interest = true;

- break;

+ return AssertionNode::AtStart(on_success);

+ case BOUNDARY:

+ return AssertionNode::AtBoundary(on_success);

+ case NON_BOUNDARY:

+ return AssertionNode::AtNonBoundary(on_success);

case END_OF_INPUT:

- info.at_end = true;

- break;

+ return AssertionNode::AtEnd(on_success);

case END_OF_LINE:

- // This is wrong but has the effect of making the compiler abort.

- info.at_end = true;

+ // Compile $ in multiline regexps as an alternation with a positive

+ // lookahead in one side and an end-of-input on the other side.

+ // We need two registers for the lookahead.

+ int stack_pointer_register = compiler->AllocateRegister();

+ int position_register = compiler->AllocateRegister();

+ // The ChoiceNode to distinguish between a newline and end-of-input.

+ ChoiceNode* result = new ChoiceNode(2);

+ // Create a newline atom.

+ ZoneList<CharacterRange>* newline_ranges =

+ new ZoneList<CharacterRange>(3);

+ CharacterRange::AddClassEscape('n', newline_ranges);

+ RegExpCharacterClass* newline_atom =

+ new RegExpCharacterClass(newline_ranges, false);

Christian Plesner Hansen 2009/01/19 10:28:59 You should be able to just use "new RegExpCharacte

+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

+ elms->Add(TextElement::CharClass(newline_atom));

+ TextNode* newline_matcher = new TextNode(

Christian Plesner Hansen 2009/01/19 10:28:59 You should be able to use "new TextNode(newline_at

+ elms,

+ ActionNode::PositiveSubmatchSuccess(stack_pointer_register,

+ position_register,

+ on_success));

+ // Create an end-of-input matcher.

+ RegExpNode* end_of_line = ActionNode::BeginSubmatch(

+ stack_pointer_register,

+ position_register,

+ newline_matcher);

+ // Add the two alternatives to the ChoiceNode.

+ GuardedAlternative eol_alternative(end_of_line);

+ result->AddAlternative(eol_alternative);

+ GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));

+ result->AddAlternative(end_alternative);

+ return result;

}

- return on_success->PropagateForward(&info);

+ UNREACHABLE();

Christian Plesner Hansen 2009/01/19 10:28:59 Maybe have this in the 'default' case, that way th

+ return on_success;

}

@@ -3911,6 +4030,13 @@

case '*':

ranges->Add(CharacterRange::Everything());

break;

+ // This is the set of characters matched by the $ and ^ symbols

+ // in multiline mode.

+ case 'n':

+ AddClass(kLineTerminatorRanges,

+ kLineTerminatorRangeCount,

+ ranges);

+ break;

default:

UNREACHABLE();

}

@@ -4096,62 +4222,6 @@

}

-RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {

Christian Plesner Hansen 2009/01/19 10:28:59 O joy, we can get rid of all this cra..., I mean s

- NodeInfo full_info(*this->info());

- full_info.AddFromPreceding(info);

- bool cloned = false;

- ActionNode* action = EnsureSibling(this, &full_info, &cloned);

- action->set_on_success(action->on_success()->PropagateForward(info));

- return action;

-RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {

- NodeInfo full_info(*this->info());

- full_info.AddFromPreceding(info);

- bool cloned = false;

- ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);

- if (cloned) {

- ZoneList<GuardedAlternative>* old_alternatives = alternatives();

- int count = old_alternatives->length();

- choice->alternatives_ = new ZoneList<GuardedAlternative>(count);

- for (int i = 0; i < count; i++) {

- GuardedAlternative alternative = old_alternatives->at(i);

- alternative.set_node(alternative.node()->PropagateForward(info));

- choice->alternatives()->Add(alternative);

- }

- return choice;

-RegExpNode* EndNode::PropagateForward(NodeInfo* info) {

- return PropagateToEndpoint(this, info);

-RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {

- NodeInfo full_info(*this->info());

- full_info.AddFromPreceding(info);

- bool cloned = false;

- BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);

- if (cloned) {

- // TODO(erikcorry): A back reference has to have two successors (by default

- // the same node). The first is used if the back reference matches a non-

- // empty back reference, the second if it matches an empty one. This

- // doesn't matter for at_end, which is the only one implemented right now,

- // but it will matter for other pieces of info.

- back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));

- }

- return back_ref;

-RegExpNode* TextNode::PropagateForward(NodeInfo* info) {

- return PropagateToEndpoint(this, info);

// -------------------------------------------------------------------

// Splay tree

@@ -4389,6 +4459,11 @@

}

+void Analysis::VisitAssertion(AssertionNode* that) {

+ EnsureAnalyzed(that->on_success());

// -------------------------------------------------------------------

// Dispatch table construction

@@ -4441,7 +4516,13 @@

}

+void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {

+ RegExpNode* target = that->on_success();

+ target->Accept(this);

static int CompareRangeByFrom(const CharacterRange* a,

const CharacterRange* b) {

return Compare<uc16>(a->from(), b->from());

@@ -4527,10 +4608,6 @@

NodeInfo info = *node->info();

- if (is_multiline && !FLAG_attempt_multiline_irregexp) {

- return Handle<FixedArray>::null();

- }

if (FLAG_irregexp_native) {

#ifdef ARM

// Unimplemented, fall-through to bytecode implementation.

« src/flag-definitions.h ('K') | « src/jsregexp.h ('k') | src/regexp-macro-assembler.h » ('j') | test/mozilla/mozilla.status » ('J')