Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(260)

Unified Diff: runtime/vm/regexp.h

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Port remaining V8 regexp tests and fix exposed bugs. Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: runtime/vm/regexp.h
diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h
index 2b2f6e9c2f643e6f78972030f65dcacf6838f6e6..6551433f182e01b7088753e567cb29982b3eb901 100644
--- a/runtime/vm/regexp.h
+++ b/runtime/vm/regexp.h
@@ -6,7 +6,9 @@
#define VM_REGEXP_H_
#include "vm/assembler.h"
+#include "vm/intermediate_language.h"
#include "vm/object.h"
+#include "vm/regexp_assembler.h"
namespace dart {
@@ -239,7 +241,7 @@ class RegExpNode: public ZoneAllocated {
bool EmitQuickCheck(RegExpCompiler* compiler,
Trace* trace,
bool preload_has_checked_bounds,
- Label* on_possible_success,
+ BlockLabel* on_possible_success,
QuickCheckDetails* details_return,
bool fall_through_on_failure);
// For a given number of characters this returns a mask and a value. The
@@ -299,7 +301,7 @@ class RegExpNode: public ZoneAllocated {
if (offset == 0) set_bm_info(not_at_start, bm);
}
- Label* label() { return &label_; }
+ BlockLabel* label() { return &label_; }
// If non-generic code is generated for a node (i.e. the node is not at the
// start of the trace) then it cannot be reused. This variable sets a limit
// on how often we allow that to happen before we insist on starting a new
@@ -319,13 +321,15 @@ class RegExpNode: public ZoneAllocated {
enum LimitResult { DONE, CONTINUE };
RegExpNode* replacement_;
+ LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
+
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
bm_info_[not_at_start ? 1 : 0] = bm;
}
private:
static const intptr_t kFirstCharBudget = 10;
- Label label_;
+ BlockLabel label_;
NodeInfo info_;
// This variable keeps track of how many times code has been generated for
// this node (in different traces). We don't keep track of where the
@@ -535,7 +539,7 @@ class GuardedAlternative {
class OutSet: public ZoneAllocated {
public:
OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
- OutSet* Extend(unsigned value, Zone* zone);
+ OutSet* Extend(unsigned value, Isolate* isolate);
bool Get(unsigned value) const;
static const unsigned kFirstLimit = 32;
@@ -543,7 +547,7 @@ class OutSet: public ZoneAllocated {
// Destructively set a value in this set. In most cases you want
// to use Extend instead to ensure that only one instance exists
// that contains the same values.
- void Set(unsigned value, Zone* zone);
+ void Set(unsigned value, Isolate* isolate);
// The successors are a list of sets that contain the same values
// as this set and the one more value that is not present in this
@@ -573,8 +577,8 @@ class DispatchTable : public ZoneAllocated {
uint16_t from() { return from_; }
uint16_t to() { return to_; }
void set_to(uint16_t value) { to_ = value; }
- void AddValue(intptr_t value, Zone* zone) {
- out_set_ = out_set_->Extend(value, zone);
+ void AddValue(intptr_t value, Isolate* isolate) {
+ out_set_ = out_set_->Extend(value, isolate);
}
OutSet* out_set() { return out_set_; }
private:
@@ -632,7 +636,6 @@ class ChoiceNode: public RegExpNode {
ZoneGrowableArray<GuardedAlternative>* alternatives() {
return alternatives_;
}
- DispatchTable* GetTable(bool ignore_case);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget,
bool not_at_start);
@@ -986,6 +989,149 @@ class QuickCheckDetails {
};
+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time. For example in the middle of the regexp
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
+// inserted at the start of any non-anchored regexp.
+//
+// When we have found such a loop we look ahead in the nodes to find the set of
+// characters that can come at given distances. For example for the regexp
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
+// the lookahead info where the set of characters is reasonably constrained. In
+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
+//
+// For Unicode input strings we do the same, but modulo 128.
+//
+// We also look at the first string fed to the regexp and use that to get a hint
+// of the character frequencies in the inputs. This affects the assessment of
+// whether the set of characters is 'reasonably constrained'.
+//
+// We also have another lookahead mechanism (called quick check in the code),
+// which uses a wide load of multiple characters followed by a mask and compare
+// to determine whether a match is possible at this point.
+enum ContainedInLattice {
+ kNotYet = 0,
+ kLatticeIn = 1,
+ kLatticeOut = 2,
+ kLatticeUnknown = 3 // Can also mean both in and out.
+};
+
+
+inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
+ return static_cast<ContainedInLattice>(a | b);
+}
+
+
+ContainedInLattice AddRange(ContainedInLattice a,
+ const int* ranges,
+ intptr_t ranges_size,
+ Interval new_range);
+
+
+class BoyerMoorePositionInfo : public ZoneAllocated {
+ public:
+ explicit BoyerMoorePositionInfo(Isolate* isolate)
+ : map_(new(isolate) ZoneGrowableArray<bool>(kMapSize)),
+ map_count_(0),
+ w_(kNotYet),
+ s_(kNotYet),
+ d_(kNotYet),
+ surrogate_(kNotYet) {
+ for (intptr_t i = 0; i < kMapSize; i++) {
+ map_->Add(false);
+ }
+ }
+
+ bool& at(intptr_t i) { return (*map_)[i]; }
+
+ static const intptr_t kMapSize = 128;
+ static const intptr_t kMask = kMapSize - 1;
+
+ intptr_t map_count() const { return map_count_; }
+
+ void Set(intptr_t character);
+ void SetInterval(const Interval& interval);
+ void SetAll();
+ bool is_non_word() { return w_ == kLatticeOut; }
+ bool is_word() { return w_ == kLatticeIn; }
+
+ private:
+ ZoneGrowableArray<bool>* map_;
+ intptr_t map_count_; // Number of set bits in the map.
+ ContainedInLattice w_; // The \w character class.
+ ContainedInLattice s_; // The \s character class.
+ ContainedInLattice d_; // The \d character class.
+ ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
+};
+
+
+class BoyerMooreLookahead : public ZoneAllocated{
+ public:
+ BoyerMooreLookahead(intptr_t length, RegExpCompiler* compiler,
+ Isolate* Isolate);
+
+ intptr_t length() { return length_; }
+ intptr_t max_char() { return max_char_; }
+ RegExpCompiler* compiler() { return compiler_; }
+
+ intptr_t Count(intptr_t map_number) {
+ return bitmaps_->At(map_number)->map_count();
+ }
+
+ BoyerMoorePositionInfo* at(intptr_t i) { return bitmaps_->At(i); }
+
+ void Set(intptr_t map_number, intptr_t character) {
+ if (character > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number);
+ info->Set(character);
+ }
+
+ void SetInterval(intptr_t map_number, const Interval& interval) {
+ if (interval.from() > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number);
+ if (interval.to() > max_char_) {
+ info->SetInterval(Interval(interval.from(), max_char_));
+ } else {
+ info->SetInterval(interval);
+ }
+ }
+
+ void SetAll(intptr_t map_number) {
+ bitmaps_->At(map_number)->SetAll();
+ }
+
+ void SetRest(intptr_t from_map) {
+ for (intptr_t i = from_map; i < length_; i++) SetAll(i);
+ }
+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:
+ // This is the value obtained by EatsAtLeast. If we do not have at least this
+ // many characters left in the sample string then the match is bound to fail.
+ // Therefore it is OK to read a character this far ahead of the current match
+ // point.
+ intptr_t length_;
+ RegExpCompiler* compiler_;
+ // 0x7f for ASCII, 0xffff for UTF-16.
+ intptr_t max_char_;
+ ZoneGrowableArray<BoyerMoorePositionInfo*>* bitmaps_;
+
+ intptr_t GetSkipTable(intptr_t min_lookahead,
+ intptr_t max_lookahead,
+ TypedData* boolean_skip_table);
+ bool FindWorthwhileInterval(intptr_t* from, intptr_t* to);
+ intptr_t FindBestInterval(
+ intptr_t max_number_of_chars,
+ intptr_t old_biggest_points,
+ intptr_t* from, intptr_t* to);
+};
+
+
// There are many ways to generate code for a node. This class encapsulates
// the current way we should be generating. In other words it encapsulates
// the current state of the code generator. The effect of this is that we
@@ -1101,8 +1247,8 @@ class Trace {
void set_at_start(bool at_start) {
at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
}
- Label* backtrack() { return backtrack_; }
- Label* loop_label() { return loop_label_; }
+ BlockLabel* backtrack() { return backtrack_; }
+ BlockLabel* loop_label() { return loop_label_; }
RegExpNode* stop_node() { return stop_node_; }
intptr_t characters_preloaded() { return characters_preloaded_; }
intptr_t bound_checked_up_to() { return bound_checked_up_to_; }
@@ -1120,9 +1266,9 @@ class Trace {
new_action->next_ = actions_;
actions_ = new_action;
}
- void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
+ void set_backtrack(BlockLabel* backtrack) { backtrack_ = backtrack; }
void set_stop_node(RegExpNode* node) { stop_node_ = node; }
- void set_loop_label(Label* label) { loop_label_ = label; }
+ void set_loop_label(BlockLabel* label) { loop_label_ = label; }
void set_characters_preloaded(intptr_t count) {
characters_preloaded_ = count;
}
@@ -1135,22 +1281,22 @@ class Trace {
void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler* compiler);
private:
- intptr_t FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
+ intptr_t FindAffectedRegisters(OutSet* affected_registers, Isolate* isolate);
void PerformDeferredActions(RegExpMacroAssembler* macro,
intptr_t max_register,
const OutSet& affected_registers,
OutSet* registers_to_pop,
OutSet* registers_to_clear,
- Zone* zone);
+ Isolate* isolate);
void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
intptr_t max_register,
const OutSet& registers_to_pop,
const OutSet& registers_to_clear);
intptr_t cp_offset_;
DeferredAction* actions_;
- Label* backtrack_;
+ BlockLabel* backtrack_;
RegExpNode* stop_node_;
- Label* loop_label_;
+ BlockLabel* loop_label_;
intptr_t characters_preloaded_;
intptr_t bound_checked_up_to_;
QuickCheckDetails quick_check_performed_;
@@ -1214,7 +1360,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
};
-struct RegExpCompileData {
+struct RegExpCompileData : public ZoneAllocated {
RegExpCompileData()
: tree(NULL),
node(NULL),
@@ -1235,22 +1381,38 @@ class RegExpEngine: public AllStatic {
public:
struct CompilationResult {
explicit CompilationResult(const char* error_message)
- : error_message(error_message),
- num_registers(0) {}
- CompilationResult(Object* code, intptr_t registers)
- : error_message(NULL),
- num_registers(registers) {}
+ : macro_assembler(NULL),
+ graph_entry(NULL),
+ num_blocks(-1),
+ num_stack_locals(-1),
+ error_message(error_message) {}
+ explicit CompilationResult(IRRegExpMacroAssembler* macro_assembler,
Vyacheslav Egorov (Google) 2014/10/01 20:13:21 No need to mark multiple arguments constructor "ex
jgruber1 2014/10/03 18:59:51 Done.
+ GraphEntryInstr* graph_entry,
+ intptr_t num_blocks,
+ intptr_t num_stack_locals)
+ : macro_assembler(macro_assembler),
+ graph_entry(graph_entry),
+ num_blocks(num_blocks),
+ num_stack_locals(num_stack_locals),
+ error_message(NULL) {}
+
+ IRRegExpMacroAssembler* macro_assembler;
+ GraphEntryInstr* graph_entry;
+ const intptr_t num_blocks;
+ const intptr_t num_stack_locals;
+
const char* error_message;
- intptr_t num_registers;
};
- static CompilationResult Compile(RegExpCompileData* input,
- bool ignore_case,
- bool global,
- bool multiline,
- const String& pattern,
- const String& sample_subject,
- bool is_ascii);
+ static CompilationResult Compile(
+ RegExpCompileData* input,
+ const ParsedFunction* parsed_function,
+ ZoneGrowableArray<const ICData*>* ic_data_array);
+
+ static RawJSRegExp* New(Isolate* isolate,
+ const String& pattern,
+ bool multi_line,
+ bool ignore_case);
static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
};

Powered by Google App Engine
This is Rietveld 408576698