Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1713)

Unified Diff: src/jsregexp.cc

Issue 8188: Some new regexp infrastructure. (Closed)
Patch Set: Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/jsregexp.cc
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index de03b6e21589510c242ca1830549541ec6a85200..404198411fdf4ad789619f0ddf333ebbfff9671c 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -27,6 +27,7 @@
#include "v8.h"
+#include "ast.h"
#include "execution.h"
#include "factory.h"
#include "jsregexp.h"
@@ -35,6 +36,8 @@
#include "runtime.h"
#include "top.h"
#include "compilation-cache.h"
+#include "string-stream.h"
+#include <set>
namespace v8 { namespace internal {
@@ -538,4 +541,478 @@ ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {
return ByteArray::cast(value->get(INTERNAL_INDEX));
}
+
+// -------------------------------------------------------------------
+// New regular expression engine
+
+
+template <typename Char> class ExecutionState;
+
+
+template <typename Char>
+class DotPrinter {
+ public:
+ DotPrinter() : stream_(&alloc_) { }
+ void PrintNode(RegExpNode<Char>* node);
+ void Visit(RegExpNode<Char>* node);
+ StringStream* stream() { return &stream_; }
+ private:
+ HeapStringAllocator alloc_;
+ StringStream stream_;
+ std::set<RegExpNode<Char>*> seen_;
+};
+
+
+template <typename Char>
+class RegExpCompiler: public RegExpVisitor {
+ public:
+ RegExpCompiler() { }
+ RegExpNode<Char>* Compile(RegExpTree* tree, RegExpNode<Char>* rest) {
+ return static_cast<RegExpNode<Char>*>(tree->Accept(this, rest));
+ }
+#define MAKE_CASE(Name) virtual void* Visit##Name(RegExp##Name*, void*);
+ FOR_EACH_REG_EXP_NODE_TYPE(MAKE_CASE)
+#undef MAKE_CASE
+};
+
+
+template <typename Char>
+class RegExpNode: public ZoneObject {
+ public:
+ virtual ~RegExpNode() { }
+ virtual void EmitDot(DotPrinter<Char>* out);
+ virtual bool Step(ExecutionState<Char>* state) = 0;
+};
+
+
+template <typename Char>
+class SeqRegExpNode: public RegExpNode<Char> {
+ public:
+ SeqRegExpNode(RegExpNode<Char>* next) : next_(next) { }
+ RegExpNode<Char>* next() { return next_; }
+ private:
+ RegExpNode<Char>* next_;
+};
+
+
+template <typename Char>
+class EndNode: public RegExpNode<Char> {
+ public:
+ virtual void EmitDot(DotPrinter<Char>* out);
+ virtual bool Step(ExecutionState<Char>* state);
+};
+
+
+template <typename Char>
+class AtomNode: public SeqRegExpNode<Char> {
+ public:
+ AtomNode(Vector<const uc16> data, RegExpNode<Char>* next)
+ : SeqRegExpNode<Char>(next),
+ data_(data) { }
+ virtual void EmitDot(DotPrinter<Char>* out);
+ virtual bool Step(ExecutionState<Char>* state);
+ Vector<const uc16> data() { return data_; }
+ private:
+ Vector<const uc16> data_;
+};
+
+
+template <typename Char>
+class CharacterClassNode: public SeqRegExpNode<Char> {
+ public:
+ CharacterClassNode(ZoneList<CharacterRange>* ranges, RegExpNode<Char>* next)
+ : SeqRegExpNode<Char>(next),
+ ranges_(ranges) { }
+ virtual void EmitDot(DotPrinter<Char>* out);
+ virtual bool Step(ExecutionState<Char>* state);
+ ZoneList<CharacterRange>* ranges() { return ranges_; }
+ private:
+ ZoneList<CharacterRange>* ranges_;
+};
+
+
+template <typename Char>
+class ChoiceNode: public RegExpNode<Char> {
+ public:
+ ChoiceNode(ZoneList<RegExpNode<Char>*>* choices) : choices_(choices) { }
+ virtual void EmitDot(DotPrinter<Char>* out);
+ virtual bool Step(ExecutionState<Char>* state);
+ ZoneList<RegExpNode<Char>*>* choices() { return choices_; }
+ private:
+ ZoneList<RegExpNode<Char>*>* choices_;
+};
+
+
+// -------------------------------------------------------------------
+// Dot/dotty output
Erik Corry 2008/10/27 14:58:44 Cool idea :-)
+
+
+template <typename Char>
+void DotPrinter<Char>::PrintNode(RegExpNode<Char>* node) {
+ stream()->Add("digraph G {\n");
+ Visit(node);
+ stream()->Add("}\n");
+ printf("%s", *(stream()->ToCString()));
+}
+
+
+template <typename Char>
+void DotPrinter<Char>::Visit(RegExpNode<Char>* node) {
+ if (seen_.find(node) != seen_.end())
+ return;
+ seen_.insert(node);
+ node->EmitDot(this);
+}
+
+
+template <typename Char>
+void RegExpNode<Char>::EmitDot(DotPrinter<Char>* out) {
+ UNIMPLEMENTED();
+}
+
+
+template <typename Char>
+void ChoiceNode<Char>::EmitDot(DotPrinter<Char>* out) {
+ out->stream()->Add("n%p [label=\"?\"];\n", this);
+ for (int i = 0; i < choices()->length(); i++) {
+ out->stream()->Add("n%p -> n%p [label=\"%i\"];\n", this, choices()->at(i), i);
+ out->Visit(choices()->at(i));
+ }
+}
+
+
+template <typename Char>
+void AtomNode<Char>::EmitDot(DotPrinter<Char>* out) {
+ out->stream()->Add("n%p [label=\"'%w'\"];\n", this, data());
+ out->stream()->Add("n%p -> n%p;\n", this, this->next());
+ out->Visit(this->next());
+}
+
+
+template <typename Char>
+void EndNode<Char>::EmitDot(DotPrinter<Char>* out) {
+ out->stream()->Add("n%p [style=bold, label=\"done\"];\n", this);
+}
+
+
+template <typename Char>
+void CharacterClassNode<Char>::EmitDot(DotPrinter<Char>* out) {
+ out->stream()->Add("n%p [label=\"[...]\"];\n", this);
+ out->stream()->Add("n%p -> n%p;\n", this, this->next());
+ out->Visit(this->next());
+}
+
+
+// -------------------------------------------------------------------
+// Tree to graph conversion
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitAtom(RegExpAtom* that, void* rest) {
+ return new AtomNode<Char>(that->data(),
+ static_cast<RegExpNode<Char>*>(rest));
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitCharacterClass(RegExpCharacterClass* that,
+ void* rest) {
+ return new CharacterClassNode<Char>(that->ranges(),
+ static_cast<RegExpNode<Char>*>(rest));
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitDisjunction(RegExpDisjunction* that,
+ void* rest_ptr) {
+ RegExpNode<Char>* rest = static_cast<RegExpNode<Char>*>(rest_ptr);
+ ZoneList<RegExpTree*>* children = that->nodes();
+ int length = children->length();
+ ZoneList<RegExpNode<Char>*>* choices
+ = new ZoneList<RegExpNode<Char>*>(length);
+ for (int i = 0; i < length; i++)
+ choices->Add(Compile(children->at(i), rest));
+ return new ChoiceNode<Char>(choices);
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitQuantifier(RegExpQuantifier* that,
+ void* rest_ptr) {
+ RegExpNode<Char>* rest = static_cast<RegExpNode<Char>*>(rest_ptr);
+ if (that->max() >= RegExpQuantifier::kInfinity) {
+ // Don't try to count the number of iterations if the max it too
+ // large.
+ if (that->min() != 0) {
+ UNIMPLEMENTED();
+ }
+ ZoneList<RegExpNode<Char>*>* loop_choices
+ = new ZoneList<RegExpNode<Char>*>(2);
+ RegExpNode<Char>* loop_node = new ChoiceNode<Char>(loop_choices);
+ RegExpNode<Char>* body_node = Compile(that->body(), loop_node);
+ if (that->is_greedy()) {
+ loop_choices->Add(body_node);
+ loop_choices->Add(rest);
+ } else {
+ loop_choices->Add(rest);
+ loop_choices->Add(body_node);
+ }
+ return loop_node;
+ } else {
+ UNIMPLEMENTED();
+ return NULL;
+ }
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitAssertion(RegExpAssertion* that,
+ void* rest) {
+ UNIMPLEMENTED();
+ return NULL;
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitCapture(RegExpCapture* that, void* rest) {
+ UNIMPLEMENTED();
+ return NULL;
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitLookahead(RegExpLookahead* that,
+ void* rest) {
+ UNIMPLEMENTED();
+ return NULL;
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitEmpty(RegExpEmpty* that, void* rest) {
+ return rest;
+}
+
+
+template <typename Char>
+void* RegExpCompiler<Char>::VisitAlternative(RegExpAlternative* that,
+ void* rest) {
+ ZoneList<RegExpTree*>* children = that->nodes();
+ RegExpNode<Char>* current = static_cast<RegExpNode<Char>*>(rest);
+ for (int i = children->length() - 1; i >= 0; i--) {
+ current = Compile(children->at(i), current);
+ }
+ return current;
+}
+
+
+// -------------------------------------------------------------------
+// Execution
+
+
+template <typename Char>
+class ExecutionState {
+ public:
+ ExecutionState(RegExpNode<Char>* start, Vector<Char> input)
+ : current_(start),
+ input_(input),
+ pos_(0),
+ backtrack_stack_(8),
+ is_done_(false) { }
+
+ class BacktrackState {
+ public:
+ BacktrackState(ChoiceNode<Char>* choice, int next, int pos)
+ : choice_(choice),
+ next_(next),
+ pos_(pos) { }
+ ChoiceNode<Char>* choice() { return choice_; }
+ int next() { return next_; }
+ void set_next(int value) { next_ = value; }
+ int pos() { return pos_; }
+ private:
+ ChoiceNode<Char>* choice_;
+ int next_;
+ int pos_;
+ };
+
+ // Execute a single step, returning true if it succeeded
+ inline bool Step() { return current()->Step(this); }
+
+ // Stores the given choice node and the execution state on the
+ // backtrack stack.
+ void SaveBacktrack(ChoiceNode<Char>* choice);
+
+ // Reverts to the next unused backtrack if there is one. Returns
+ // false exactly if there was no backtrack to restore.
+ bool Backtrack();
+
+ Char current_char() { return input()[pos()]; }
+
+ void Advance(int delta, RegExpNode<Char>* next) {
+ pos_ += delta;
+ current_ = next;
+ }
+
+ bool AtEnd() { return pos_ >= input_.length(); }
+
+ bool is_done() { return is_done_; }
+ void set_done() { is_done_ = true; }
+
+ List<BacktrackState>* backtrack_stack() { return &backtrack_stack_; }
+ RegExpNode<Char>* current() { return current_; }
+ void set_current(RegExpNode<Char>* value) { current_ = value; }
+ Vector<Char> input() { return input_; }
+ int pos() { return pos_; }
+ private:
+ RegExpNode<Char>* current_;
+ Vector<Char> input_;
+ int pos_;
+ List<BacktrackState> backtrack_stack_;
+ bool is_done_;
+};
+
+
+template <typename Char>
+void ExecutionState<Char>::SaveBacktrack(ChoiceNode<Char>* choice) {
+ ASSERT(choice->choices()->length() > 1);
+ if (FLAG_trace_regexps) {
+ PrintF("Setting up backtrack on level %i for choice %p\n",
+ backtrack_stack()->length(),
+ choice);
+ }
+ backtrack_stack()->Add(BacktrackState(choice, 1, pos_));
+}
+
+
+template <typename Char>
+bool ExecutionState<Char>::Backtrack() {
+ if (backtrack_stack()->is_empty()) return false;
+ BacktrackState& top = backtrack_stack()->at(backtrack_stack()->length() - 1);
+ ZoneList<RegExpNode<Char>*>* choices = top.choice()->choices();
+ int next_index = top.next();
+ current_ = choices->at(next_index);
+ pos_ = top.pos();
+ if (FLAG_trace_regexps) {
+ PrintF("Backtracking to %p[%i] on level %i\n",
+ top.choice(),
+ next_index,
+ backtrack_stack()->length() - 1);
+ }
+ if (next_index == choices->length() - 1) {
+ if (FLAG_trace_regexps)
+ PrintF("Popping backtrack on level %i\n",
Lasse Reichstein 2008/10/27 13:12:58 Move to same line as "if" or wrap in squiggly brac
+ backtrack_stack()->length() - 1);
+ // If this was the last alternative we're done with this backtrack
+ // state and can pop it off the stack.
+ backtrack_stack()->RemoveLast();
+ } else {
+ if (FLAG_trace_regexps)
+ PrintF("Advancing backtrack on level %i\n",
+ backtrack_stack()->length() - 1);
+ // Otherwise we set the next choice to visit if this one fails.
+ top.set_next(next_index + 1);
+ }
+ return true;
+}
+
+
+template <typename Char>
+bool ChoiceNode<Char>::Step(ExecutionState<Char>* state) {
+ state->SaveBacktrack(this);
+ state->set_current(this->choices()->at(0));
+ return true;
+}
+
+
+template <typename Char>
+bool AtomNode<Char>::Step(ExecutionState<Char>* state) {
+ Vector<const uc16> data = this->data();
+ int length = data.length();
+ Vector<Char> input = state->input();
+ int p = state->pos();
+ if (p + length > input.length())
+ return false;
Lasse Reichstein 2008/10/27 13:12:58 Same line as "if" or squiggly braces. There are s
+ for (int i = 0; i < length; i++, p++) {
+ if (data[i] != input[p])
+ return false;
+ }
+ state->Advance(length, this->next());
+ return true;
+}
+
+
+template <typename Char>
+bool CharacterClassNode<Char>::Step(ExecutionState<Char>* state) {
+ if (state->AtEnd()) return false;
+ ZoneList<CharacterRange>* ranges = this->ranges();
+ unsigned current = state->current_char();
+ for (int i = 0; i < ranges->length(); i++) {
+ CharacterRange& range = ranges->at(i);
+ if (range.is_special()) {
+ switch (range.from()) {
+ case '.':
Lasse Reichstein 2008/10/27 13:12:58 Is just the "." in the initial implicit ".*?"? Oth
Christian Plesner Hansen 2008/10/27 18:57:02 This is the general '.'. However, I don't expect
+ state->Advance(1, this->next());
+ return true;
+ default:
+ UNIMPLEMENTED();
+ }
+ } else {
+ if (range.from() <= current && current <= range.to()) {
+ state->Advance(1, this->next());
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+template <typename Char>
+bool EndNode<Char>::Step(ExecutionState<Char>* state) {
+ state->set_done();
+ return false;
+}
+
+
+template <typename Char>
+bool RegExpEngine::Execute(RegExpNode<Char>* start, Vector<Char> input) {
+ ExecutionState<Char> state(start, input);
+ if (FLAG_trace_regexps) {
+ PrintF("Beginning regexp execution\n");
+ }
+ while (true) {
+ if (!state.Step() && (state.is_done() || !state.Backtrack()))
+ break;
Lasse Reichstein 2008/10/27 13:12:58 Is there a reason for splitting the if and the whi
Christian Plesner Hansen 2008/10/27 18:57:02 No, I've fixed it.
+ }
+ if (FLAG_trace_regexps) {
+ PrintF("Matching %s\n", state.is_done() ? "succeeded" : "failed");
+ }
+ return state.is_done();
+}
+
+
+template <typename Char>
+RegExpNode<Char>* RegExpEngine::Compile(RegExpTree* regexp) {
+ RegExpNode<Char>* end = new EndNode<Char>();
+ RegExpCompiler<Char> compiler;
+ return compiler.Compile(regexp, end);
+}
+
+
+template
+RegExpNode<const char>* RegExpEngine::Compile<const char>(RegExpTree* regexp);
+
+template
+RegExpNode<const uc16>* RegExpEngine::Compile<const uc16>(RegExpTree* regexp);
+
+template
+bool RegExpEngine::Execute<const char>(RegExpNode<const char>* start,
+ Vector<const char> input);
+
+template
+bool RegExpEngine::Execute<const uc32>(RegExpNode<const uc32>* start,
+ Vector<const uc32> input);
Lasse Reichstein 2008/10/27 13:12:58 Should input really be uc32?
Christian Plesner Hansen 2008/10/27 18:57:02 No. It was getting late...
+
+
}} // namespace v8::internal

Powered by Google App Engine
This is Rietveld 408576698