Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(66)

Unified Diff: src/regexp/jsregexp.cc

Issue 1418963009: Experimental support for RegExp lookbehind. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fixed test cases Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/regexp/jsregexp.cc
diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc
index 225ad73c4e27c499196ec0504b3eab9b3e5c355f..a41ab90b1e295d508c0f87abbfe166836e1aab97 100644
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@@ -1224,7 +1224,8 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
int value = 0;
bool absolute = false;
bool clear = false;
- int store_position = -1;
+ static const int kNoStore = kMinInt;
+ int store_position = kNoStore;
// This is a little tricky because we are scanning the actions in reverse
// historical order (newest first).
for (DeferredAction* action = actions_;
@@ -1245,7 +1246,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
// we can set undo_action to IGNORE if we know there is no value to
// restore.
undo_action = RESTORE;
- DCHECK_EQ(store_position, -1);
+ DCHECK_EQ(store_position, kNoStore);
DCHECK(!clear);
break;
}
@@ -1253,14 +1254,14 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
if (!absolute) {
value++;
}
- DCHECK_EQ(store_position, -1);
+ DCHECK_EQ(store_position, kNoStore);
DCHECK(!clear);
undo_action = RESTORE;
break;
case ActionNode::STORE_POSITION: {
Trace::DeferredCapture* pc =
static_cast<Trace::DeferredCapture*>(action);
- if (!clear && store_position == -1) {
+ if (!clear && store_position == kNoStore) {
store_position = pc->cp_offset();
}
@@ -1284,7 +1285,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
// Since we're scanning in reverse order, if we've already
// set the position we have to ignore historically earlier
// clearing operations.
- if (store_position == -1) {
+ if (store_position == kNoStore) {
clear = true;
}
undo_action = RESTORE;
@@ -1315,7 +1316,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
}
// Perform the chronologically last action (or accumulated increment)
// for the register.
- if (store_position != -1) {
+ if (store_position != kNoStore) {
assembler->WriteCurrentPositionToRegister(reg, store_position);
} else if (clear) {
assembler->ClearRegisters(reg, reg);
@@ -2313,6 +2314,7 @@ void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
int BackReferenceNode::EatsAtLeast(int still_to_find,
int budget,
bool not_at_start) {
+ if (read_backward()) return 0;
if (budget <= 0) return 0;
return on_success()->EatsAtLeast(still_to_find,
budget - 1,
@@ -2323,6 +2325,7 @@ int BackReferenceNode::EatsAtLeast(int still_to_find,
int TextNode::EatsAtLeast(int still_to_find,
int budget,
bool not_at_start) {
+ if (read_backward()) return 0;
int answer = Length();
if (answer >= still_to_find) return answer;
if (budget <= 0) return answer;
@@ -2526,8 +2529,8 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
} else {
char_mask = String::kMaxUtf16CodeUnit;
}
- for (int k = 0; k < elms_->length(); k++) {
- TextElement elm = elms_->at(k);
+ for (int k = 0; k < elements()->length(); k++) {
+ TextElement elm = elements()->at(k);
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
for (int i = 0; i < characters && i < quarks.length(); i++) {
@@ -2678,7 +2681,6 @@ void QuickCheckDetails::Clear() {
void QuickCheckDetails::Advance(int by, bool one_byte) {
- DCHECK(by >= 0);
if (by >= characters_) {
Clear();
return;
@@ -2780,9 +2782,9 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
if (depth < 0) return this;
DCHECK(!info()->visited);
VisitMarker marker(info());
- int element_count = elms_->length();
+ int element_count = elements()->length();
for (int i = 0; i < element_count; i++) {
- TextElement elm = elms_->at(i);
+ TextElement elm = elements()->at(i);
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
for (int j = 0; j < quarks.length(); j++) {
@@ -3146,9 +3148,9 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
return;
}
if (trace->at_start() == Trace::UNKNOWN) {
- assembler->CheckNotAtStart(trace->backtrack());
+ assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
Trace at_start_trace = *trace;
- at_start_trace.set_at_start(true);
+ at_start_trace.set_at_start(Trace::TRUE_VALUE);
on_success()->Emit(compiler, &at_start_trace);
return;
}
@@ -3221,10 +3223,11 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
bool one_byte = compiler->one_byte();
Label* backtrack = trace->backtrack();
QuickCheckDetails* quick_check = trace->quick_check_performed();
- int element_count = elms_->length();
+ int element_count = elements()->length();
+ int backward_offset = read_backward() ? -Length() : 0;
for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
- TextElement elm = elms_->at(i);
- int cp_offset = trace->cp_offset() + elm.cp_offset();
+ TextElement elm = elements()->at(i);
+ int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
@@ -3252,13 +3255,10 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
break;
}
if (emit_function != NULL) {
- bool bound_checked = emit_function(isolate,
- compiler,
- quarks[j],
- backtrack,
- cp_offset + j,
- *checked_up_to < cp_offset + j,
- preloaded);
+ bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
+ bool bound_checked =
+ emit_function(isolate, compiler, quarks[j], backtrack,
+ cp_offset + j, bounds_check, preloaded);
if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
}
}
@@ -3268,8 +3268,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
if (first_element_checked && i == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
RegExpCharacterClass* cc = elm.char_class();
+ bool bounds_check = *checked_up_to < cp_offset || read_backward();
EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
- *checked_up_to < cp_offset, preloaded, zone());
+ bounds_check, preloaded, zone());
UpdateBoundsCheck(cp_offset, checked_up_to);
}
}
@@ -3278,7 +3279,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
int TextNode::Length() {
- TextElement elm = elms_->last();
+ TextElement elm = elements()->last();
DCHECK(elm.cp_offset() >= 0);
return elm.cp_offset() + elm.length();
}
@@ -3347,8 +3348,11 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
}
Trace successor_trace(*trace);
- successor_trace.set_at_start(false);
- successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
+ // If we advance backward, we may end up at the start.
+ successor_trace.AdvanceCurrentPositionInTrace(
+ read_backward() ? -Length() : Length(), compiler);
+ successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
+ : Trace::FALSE_VALUE);
RecursionCheck rc(compiler);
on_success()->Emit(compiler, &successor_trace);
}
@@ -3360,7 +3364,6 @@ void Trace::InvalidateCurrentCharacter() {
void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
- DCHECK(by > 0);
// We don't have an instruction for shifting the current character register
// down or for using a shifted value for anything so lets just forget that
// we preloaded any characters into it.
@@ -3379,9 +3382,9 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
- int element_count = elms_->length();
+ int element_count = elements()->length();
for (int i = 0; i < element_count; i++) {
- TextElement elm = elms_->at(i);
+ TextElement elm = elements()->at(i);
if (elm.text_type() == TextElement::CHAR_CLASS) {
RegExpCharacterClass* cc = elm.char_class();
// None of the standard character classes is different in the case
@@ -3398,15 +3401,14 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
int TextNode::GreedyLoopTextLength() {
- TextElement elm = elms_->at(elms_->length() - 1);
- return elm.cp_offset() + elm.length();
+ return read_backward() ? kNodeIsTooComplexForGreedyLoops : Length();
erikcorry 2015/11/11 15:39:04 This is a case where perhaps we do need a little o
Yang 2015/11/12 08:25:35 Done.
}
RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
erikcorry 2015/11/11 15:39:04 Not quite sure, but this optimization may have to
Yang 2015/11/12 08:25:35 I haven't found a test case where this would becom
RegExpCompiler* compiler) {
- if (elms_->length() != 1) return NULL;
- TextElement elm = elms_->at(0);
+ if (elements()->length() != 1) return NULL;
+ TextElement elm = elements()->at(0);
if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
RegExpCharacterClass* node = elm.char_class();
ZoneList<CharacterRange>* ranges = node->ranges(zone());
@@ -3881,7 +3883,7 @@ void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
GreedyLoopState::GreedyLoopState(bool not_at_start) {
counter_backtrack_trace_.set_backtrack(&label_);
- if (not_at_start) counter_backtrack_trace_.set_at_start(false);
+ if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
}
@@ -4008,7 +4010,7 @@ Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
macro_assembler->PushCurrentPosition();
Label greedy_match_failed;
Trace greedy_match_trace;
- if (not_at_start()) greedy_match_trace.set_at_start(false);
+ if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
greedy_match_trace.set_backtrack(&greedy_match_failed);
Label loop_label;
macro_assembler->Bind(&loop_label);
@@ -4354,11 +4356,14 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (compiler->ignore_case()) {
- assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
trace->backtrack());
} else {
- assembler->CheckNotBackReference(start_reg_, trace->backtrack());
+ assembler->CheckNotBackReference(start_reg_, read_backward(),
+ trace->backtrack());
}
+ // We are going to advance backward, so we may end up at the start.
+ if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
on_success()->Emit(compiler, trace);
}
@@ -4719,13 +4724,14 @@ RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ZoneList<TextElement>* elms =
new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
elms->Add(TextElement::Atom(this), compiler->zone());
- return new(compiler->zone()) TextNode(elms, on_success);
+ return new (compiler->zone()) TextNode(elms, read_backward(), on_success);
}
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return new(compiler->zone()) TextNode(elements(), on_success);
+ return new (compiler->zone())
+ TextNode(elements(), read_backward(), on_success);
}
@@ -4822,7 +4828,7 @@ bool RegExpCharacterClass::is_standard(Zone* zone) {
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return new(compiler->zone()) TextNode(this, on_success);
+ return new (compiler->zone()) TextNode(this, read_backward(), on_success);
}
@@ -4967,8 +4973,8 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
}
}
}
- RegExpAtom* prefix =
- new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
+ RegExpAtom* prefix = new (zone) RegExpAtom(
+ atom->data().SubVector(0, prefix_length), read_direction());
ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
pair->Add(prefix, zone);
ZoneList<RegExpTree*>* suffixes =
@@ -4981,12 +4987,14 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
suffixes->Add(new (zone) RegExpEmpty(), zone);
} else {
RegExpTree* suffix = new (zone) RegExpAtom(
- old_atom->data().SubVector(prefix_length, old_atom->length()));
+ old_atom->data().SubVector(prefix_length, old_atom->length()),
+ read_direction());
suffixes->Add(suffix, zone);
}
}
- pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
- alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
+ pair->Add(new (zone) RegExpDisjunction(suffixes, read_direction()), zone);
+ alternatives->at(write_posn++) =
+ new (zone) RegExpAlternative(pair, read_direction());
} else {
// Just copy any non-worthwhile alternatives.
for (int j = first_with_prefix; j < i; j++) {
@@ -5040,7 +5048,7 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
}
alternatives->at(write_posn++) =
- new (zone) RegExpCharacterClass(ranges, false);
+ new (zone) RegExpCharacterClass(ranges, false, read_direction());
} else {
// Just copy any trivial alternatives.
for (int j = first_in_run; j < i; j++) {
@@ -5294,14 +5302,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ZoneList<CharacterRange>* newline_ranges =
new(zone) ZoneList<CharacterRange>(3, zone);
CharacterRange::AddClassEscape('n', newline_ranges, zone);
- RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
- TextNode* newline_matcher = new(zone) TextNode(
- newline_atom,
- ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
- position_register,
- 0, // No captures inside.
- -1, // Ignored if no captures.
- on_success));
+ RegExpCharacterClass* newline_atom =
+ new (zone) RegExpCharacterClass('n', RegExpTree::READ_FORWARD);
erikcorry 2015/11/11 15:39:04 Yes :-) Do we have a test of $ and ^ inside lookb
Yang 2015/11/12 08:25:34 Added a couple of tests.
+ TextNode* newline_matcher = new (zone) TextNode(
+ newline_atom, false, ActionNode::PositiveSubmatchSuccess(
+ stack_pointer_register, position_register,
+ 0, // No captures inside.
+ -1, // Ignored if no captures.
+ on_success));
// Create an end-of-input matcher.
RegExpNode* end_of_line = ActionNode::BeginSubmatch(
stack_pointer_register,
@@ -5323,10 +5331,9 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
- return new(compiler->zone())
- BackReferenceNode(RegExpCapture::StartRegister(index()),
- RegExpCapture::EndRegister(index()),
- on_success);
+ return new (compiler->zone()) BackReferenceNode(
+ RegExpCapture::StartRegister(index()),
+ RegExpCapture::EndRegister(index()), read_backward(), on_success);
}
@@ -5336,8 +5343,8 @@ RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
}
-RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
- RegExpNode* on_success) {
+RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success) {
int stack_pointer_register = compiler->AllocateRegister();
int position_register = compiler->AllocateRegister();
@@ -5347,7 +5354,6 @@ RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
int register_start =
register_of_first_capture + capture_from_ * registers_per_capture;
- RegExpNode* success;
if (is_positive()) {
RegExpNode* node = ActionNode::BeginSubmatch(
stack_pointer_register,
@@ -5374,13 +5380,9 @@ RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
Zone* zone = compiler->zone();
GuardedAlternative body_alt(
- body()->ToNode(
- compiler,
- success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
- position_register,
- register_count,
- register_start,
- zone)));
+ body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(
+ stack_pointer_register, position_register,
+ register_count, register_start, zone)));
ChoiceNode* choice_node =
new(zone) NegativeLookaheadChoiceNode(body_alt,
GuardedAlternative(on_success),
@@ -5402,8 +5404,10 @@ RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
int index,
RegExpCompiler* compiler,
RegExpNode* on_success) {
+ DCHECK_NOT_NULL(body);
int start_reg = RegExpCapture::StartRegister(index);
int end_reg = RegExpCapture::EndRegister(index);
+ if (body->read_backward()) std::swap(start_reg, end_reg);
RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
RegExpNode* body_node = body->ToNode(compiler, store_end);
return ActionNode::StorePosition(start_reg, true, body_node);
@@ -5414,8 +5418,14 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* children = nodes();
RegExpNode* current = on_success;
- for (int i = children->length() - 1; i >= 0; i--) {
- current = children->at(i)->ToNode(compiler, current);
+ if (read_backward()) {
+ for (int i = 0; i < children->length(); i++) {
+ current = children->at(i)->ToNode(compiler, current);
+ }
+ } else {
+ for (int i = children->length() - 1; i >= 0; i--) {
+ current = children->at(i)->ToNode(compiler, current);
+ }
}
return current;
}
@@ -6291,22 +6301,19 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
if (!is_start_anchored && !is_sticky) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
- RegExpNode* loop_node =
- RegExpQuantifier::ToNode(0,
- RegExpTree::kInfinity,
- false,
- new(zone) RegExpCharacterClass('*'),
- &compiler,
- captured_body,
- data->contains_anchor);
+ RegExpNode* loop_node = RegExpQuantifier::ToNode(
+ 0, RegExpTree::kInfinity, false,
+ new (zone) RegExpCharacterClass('*', RegExpTree::READ_FORWARD),
+ &compiler, captured_body, data->contains_anchor);
if (data->contains_anchor) {
// Unroll loop once, to take care of the case that might start
// at the start of input.
ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
- first_step_node->AddAlternative(GuardedAlternative(
- new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
+ first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
+ new (zone) RegExpCharacterClass('*', RegExpTree::READ_FORWARD), false,
+ loop_node)));
node = first_step_node;
} else {
node = loop_node;

Powered by Google App Engine
This is Rietveld 408576698