runtime/vm/regexp.h - Issue 539153002: Port and integrate the irregexp engine from V8

Unified Diff: runtime/vm/regexp.h

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: runtime/vm/regexp.h

diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h

index e3855bedec07c0cff86e4b38a1b44af1f13b6c57..79708abb6aa808ed073a0410786a2a4263ce3830 100644

--- a/runtime/vm/regexp.h

+++ b/runtime/vm/regexp.h

@@ -6,7 +6,9 @@

#define VM_REGEXP_H_

#include "vm/assembler.h"

+#include "vm/intermediate_language.h"

#include "vm/object.h"

+#include "vm/regexp_assembler.h"

namespace dart {

@@ -239,7 +241,7 @@ class RegExpNode: public ZoneAllocated {

bool EmitQuickCheck(RegExpCompiler* compiler,

Trace* trace,

bool preload_has_checked_bounds,

- Label* on_possible_success,

+ BlockLabel* on_possible_success,

QuickCheckDetails* details_return,

bool fall_through_on_failure);

// For a given number of characters this returns a mask and a value. The

@@ -299,7 +301,7 @@ class RegExpNode: public ZoneAllocated {

if (offset == 0) set_bm_info(not_at_start, bm);

}

- Label* label() { return &label_; }

+ BlockLabel* label() { return &label_; }

// If non-generic code is generated for a node (i.e. the node is not at the

// start of the trace) then it cannot be reused. This variable sets a limit

// on how often we allow that to happen before we insist on starting a new

@@ -319,13 +321,15 @@ class RegExpNode: public ZoneAllocated {

enum LimitResult { DONE, CONTINUE };

RegExpNode* replacement_;

+ LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);

void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {

bm_info_[not_at_start ? 1 : 0] = bm;

}

private:

static const intptr_t kFirstCharBudget = 10;

- Label label_;

+ BlockLabel label_;

NodeInfo info_;

// This variable keeps track of how many times code has been generated for

// this node (in different traces). We don't keep track of where the

@@ -535,7 +539,7 @@ class GuardedAlternative {

class OutSet: public ZoneAllocated {

public:

OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }

- OutSet* Extend(unsigned value, Zone* zone);

+ OutSet* Extend(unsigned value, Isolate* isolate);

bool Get(unsigned value) const;

static const unsigned kFirstLimit = 32;

@@ -543,7 +547,7 @@ class OutSet: public ZoneAllocated {

// Destructively set a value in this set. In most cases you want

// to use Extend instead to ensure that only one instance exists

// that contains the same values.

- void Set(unsigned value, Zone* zone);

+ void Set(unsigned value, Isolate* isolate);

// The successors are a list of sets that contain the same values

// as this set and the one more value that is not present in this

@@ -573,8 +577,8 @@ class DispatchTable : public ZoneAllocated {

uint16_t from() { return from_; }

uint16_t to() { return to_; }

void set_to(uint16_t value) { to_ = value; }

- void AddValue(intptr_t value, Zone* zone) {

- out_set_ = out_set_->Extend(value, zone);

+ void AddValue(intptr_t value, Isolate* isolate) {

+ out_set_ = out_set_->Extend(value, isolate);

}

OutSet* out_set() { return out_set_; }

private:

@@ -986,6 +990,149 @@ class QuickCheckDetails {

};

+// Improve the speed that we scan for an initial point where a non-anchored

+// regexp can match by using a Boyer-Moore-like table. This is done by

+// identifying non-greedy non-capturing loops in the nodes that eat any

+// character one at a time. For example in the middle of the regexp

+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly

+// inserted at the start of any non-anchored regexp.

+//

+// When we have found such a loop we look ahead in the nodes to find the set of

+// characters that can come at given distances. For example for the regexp

+// /.?foo/ we know that there are at least 3 characters ahead of us, and the

+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in

+// the lookahead info where the set of characters is reasonably constrained. In

+// our example this is from index 1 to 2 (0 is not constrained). We can now

+// look 3 characters ahead and if we don't find one of [f, o] (the union of

+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).

+//

+// For Unicode input strings we do the same, but modulo 128.

+//

+// We also look at the first string fed to the regexp and use that to get a hint

+// of the character frequencies in the inputs. This affects the assessment of

+// whether the set of characters is 'reasonably constrained'.

+//

+// We also have another lookahead mechanism (called quick check in the code),

+// which uses a wide load of multiple characters followed by a mask and compare

+// to determine whether a match is possible at this point.

+enum ContainedInLattice {

+ kNotYet = 0,

+ kLatticeIn = 1,

+ kLatticeOut = 2,

+ kLatticeUnknown = 3 // Can also mean both in and out.

+};

+inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {

+ return static_cast<ContainedInLattice>(a | b);

+ContainedInLattice AddRange(ContainedInLattice a,

+ const int* ranges,

+ intptr_t ranges_size,

+ Interval new_range);

+class BoyerMoorePositionInfo : public ZoneAllocated {

+ public:

+ explicit BoyerMoorePositionInfo(Isolate* isolate)

+ : map_(new(isolate) ZoneGrowableArray<bool>(kMapSize)),

+ map_count_(0),

+ w_(kNotYet),

+ s_(kNotYet),

+ d_(kNotYet),

+ surrogate_(kNotYet) {

+ for (intptr_t i = 0; i < kMapSize; i++) {

+ map_->Add(false);

+ }

+ bool& at(intptr_t i) { return map_->At(i); }

+ static const intptr_t kMapSize = 128;

+ static const intptr_t kMask = kMapSize - 1;

+ intptr_t map_count() const { return map_count_; }

+ void Set(intptr_t character);

+ void SetInterval(const Interval& interval);

+ void SetAll();

+ bool is_non_word() { return w_ == kLatticeOut; }

+ bool is_word() { return w_ == kLatticeIn; }

+ private:

+ ZoneGrowableArray<bool>* map_;

+ intptr_t map_count_; // Number of set bits in the map.

+ ContainedInLattice w_; // The \w character class.

+ ContainedInLattice s_; // The \s character class.

+ ContainedInLattice d_; // The \d character class.

+ ContainedInLattice surrogate_; // Surrogate UTF-16 code units.

+};

+class BoyerMooreLookahead : public ZoneAllocated{

+ public:

+ BoyerMooreLookahead(intptr_t length, RegExpCompiler* compiler,

+ Isolate* Isolate);

+ intptr_t length() { return length_; }

+ intptr_t max_char() { return max_char_; }

+ RegExpCompiler* compiler() { return compiler_; }

+ intptr_t Count(intptr_t map_number) {

+ return bitmaps_->At(map_number)->map_count();

+ }

+ BoyerMoorePositionInfo* at(intptr_t i) { return bitmaps_->At(i); }

+ void Set(intptr_t map_number, intptr_t character) {

+ if (character > max_char_) return;

+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number);

+ info->Set(character);

+ }

+ void SetInterval(intptr_t map_number, const Interval& interval) {

+ if (interval.from() > max_char_) return;

+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number);

+ if (interval.to() > max_char_) {

+ info->SetInterval(Interval(interval.from(), max_char_));

+ } else {

+ info->SetInterval(interval);

+ }

+ void SetAll(intptr_t map_number) {

+ bitmaps_->At(map_number)->SetAll();

+ }

+ void SetRest(intptr_t from_map) {

+ for (intptr_t i = from_map; i < length_; i++) SetAll(i);

+ }

+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);

+ private:

+ // This is the value obtained by EatsAtLeast. If we do not have at least this

+ // many characters left in the sample string then the match is bound to fail.

+ // Therefore it is OK to read a character this far ahead of the current match

+ // point.

+ intptr_t length_;

+ RegExpCompiler* compiler_;

+ // 0x7f for ASCII, 0xffff for UTF-16.

+ intptr_t max_char_;

+ ZoneGrowableArray<BoyerMoorePositionInfo*>* bitmaps_;

+ intptr_t GetSkipTable(intptr_t min_lookahead,

+ intptr_t max_lookahead,

ricow1 2014/09/05 05:49:58 indentation

jgruber1 2014/09/05 06:17:25 Done.

+ TypedData* boolean_skip_table);

+ bool FindWorthwhileInterval(int* from, int* to);

+ intptr_t FindBestInterval(

+ intptr_t max_number_of_chars,

+ intptr_t old_biggest_points,

+ int* from, int* to);

+};

// There are many ways to generate code for a node. This class encapsulates

// the current way we should be generating. In other words it encapsulates

// the current state of the code generator. The effect of this is that we

@@ -1101,8 +1248,8 @@ class Trace {

void set_at_start(bool at_start) {

at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;

}

- Label* backtrack() { return backtrack_; }

- Label* loop_label() { return loop_label_; }

+ BlockLabel* backtrack() { return backtrack_; }

+ BlockLabel* loop_label() { return loop_label_; }

RegExpNode* stop_node() { return stop_node_; }

intptr_t characters_preloaded() { return characters_preloaded_; }

intptr_t bound_checked_up_to() { return bound_checked_up_to_; }

@@ -1120,9 +1267,9 @@ class Trace {

new_action->next_ = actions_;

actions_ = new_action;

}

- void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }

+ void set_backtrack(BlockLabel* backtrack) { backtrack_ = backtrack; }

void set_stop_node(RegExpNode* node) { stop_node_ = node; }

- void set_loop_label(Label* label) { loop_label_ = label; }

+ void set_loop_label(BlockLabel* label) { loop_label_ = label; }

void set_characters_preloaded(intptr_t count) {

characters_preloaded_ = count;

}

@@ -1135,22 +1282,22 @@ class Trace {

void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler* compiler);

private:

- intptr_t FindAffectedRegisters(OutSet* affected_registers, Zone* zone);

+ intptr_t FindAffectedRegisters(OutSet* affected_registers, Isolate* isolate);

void PerformDeferredActions(RegExpMacroAssembler* macro,

intptr_t max_register,

const OutSet& affected_registers,

OutSet* registers_to_pop,

OutSet* registers_to_clear,

- Zone* zone);

+ Isolate* isolate);

void RestoreAffectedRegisters(RegExpMacroAssembler* macro,

intptr_t max_register,

const OutSet& registers_to_pop,

const OutSet& registers_to_clear);

intptr_t cp_offset_;

DeferredAction* actions_;

- Label* backtrack_;

+ BlockLabel* backtrack_;

RegExpNode* stop_node_;

- Label* loop_label_;

+ BlockLabel* loop_label_;

intptr_t characters_preloaded_;

intptr_t bound_checked_up_to_;

QuickCheckDetails quick_check_performed_;

@@ -1235,13 +1382,14 @@ class RegExpEngine: public AllStatic {

public:

struct CompilationResult {

explicit CompilationResult(const char* error_message)

- : error_message(error_message),

- num_registers(0) {}

- CompilationResult(Object* code, intptr_t registers)

- : error_message(NULL),

- num_registers(registers) {}

+ : function(Function::Handle()),

+ error_message(error_message) {}

+ explicit CompilationResult(const Function& function)

+ : function(function),

+ error_message(NULL) {}

+ const Function& function;

const char* error_message;

- intptr_t num_registers;

};

static CompilationResult Compile(RegExpCompileData* input,

@@ -1252,6 +1400,10 @@ class RegExpEngine: public AllStatic {

const String& sample_subject,

bool is_ascii);

+ static RawJSRegExp* New(const String& pattern,

+ bool multi_line,

+ bool ignore_case);

static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);

};

« runtime/vm/parser.h ('K') | « runtime/vm/raw_object.h ('k') | runtime/vm/regexp.cc » ('j') | runtime/vm/regexp_assembler.cc » ('J')