Index: runtime/vm/regexp.h |
diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h |
index 2b2f6e9c2f643e6f78972030f65dcacf6838f6e6..6551433f182e01b7088753e567cb29982b3eb901 100644 |
--- a/runtime/vm/regexp.h |
+++ b/runtime/vm/regexp.h |
@@ -6,7 +6,9 @@ |
#define VM_REGEXP_H_ |
#include "vm/assembler.h" |
+#include "vm/intermediate_language.h" |
#include "vm/object.h" |
+#include "vm/regexp_assembler.h" |
namespace dart { |
@@ -239,7 +241,7 @@ class RegExpNode: public ZoneAllocated { |
bool EmitQuickCheck(RegExpCompiler* compiler, |
Trace* trace, |
bool preload_has_checked_bounds, |
- Label* on_possible_success, |
+ BlockLabel* on_possible_success, |
QuickCheckDetails* details_return, |
bool fall_through_on_failure); |
// For a given number of characters this returns a mask and a value. The |
@@ -299,7 +301,7 @@ class RegExpNode: public ZoneAllocated { |
if (offset == 0) set_bm_info(not_at_start, bm); |
} |
- Label* label() { return &label_; } |
+ BlockLabel* label() { return &label_; } |
// If non-generic code is generated for a node (i.e. the node is not at the |
// start of the trace) then it cannot be reused. This variable sets a limit |
// on how often we allow that to happen before we insist on starting a new |
@@ -319,13 +321,15 @@ class RegExpNode: public ZoneAllocated { |
enum LimitResult { DONE, CONTINUE }; |
RegExpNode* replacement_; |
+ LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); |
+ |
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { |
bm_info_[not_at_start ? 1 : 0] = bm; |
} |
private: |
static const intptr_t kFirstCharBudget = 10; |
- Label label_; |
+ BlockLabel label_; |
NodeInfo info_; |
// This variable keeps track of how many times code has been generated for |
// this node (in different traces). We don't keep track of where the |
@@ -535,7 +539,7 @@ class GuardedAlternative { |
class OutSet: public ZoneAllocated { |
public: |
OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } |
- OutSet* Extend(unsigned value, Zone* zone); |
+ OutSet* Extend(unsigned value, Isolate* isolate); |
bool Get(unsigned value) const; |
static const unsigned kFirstLimit = 32; |
@@ -543,7 +547,7 @@ class OutSet: public ZoneAllocated { |
// Destructively set a value in this set. In most cases you want |
// to use Extend instead to ensure that only one instance exists |
// that contains the same values. |
- void Set(unsigned value, Zone* zone); |
+ void Set(unsigned value, Isolate* isolate); |
// The successors are a list of sets that contain the same values |
// as this set and the one more value that is not present in this |
@@ -573,8 +577,8 @@ class DispatchTable : public ZoneAllocated { |
uint16_t from() { return from_; } |
uint16_t to() { return to_; } |
void set_to(uint16_t value) { to_ = value; } |
- void AddValue(intptr_t value, Zone* zone) { |
- out_set_ = out_set_->Extend(value, zone); |
+ void AddValue(intptr_t value, Isolate* isolate) { |
+ out_set_ = out_set_->Extend(value, isolate); |
} |
OutSet* out_set() { return out_set_; } |
private: |
@@ -632,7 +636,6 @@ class ChoiceNode: public RegExpNode { |
ZoneGrowableArray<GuardedAlternative>* alternatives() { |
return alternatives_; |
} |
- DispatchTable* GetTable(bool ignore_case); |
virtual void Emit(RegExpCompiler* compiler, Trace* trace); |
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, |
bool not_at_start); |
@@ -986,6 +989,149 @@ class QuickCheckDetails { |
}; |
+// Improve the speed that we scan for an initial point where a non-anchored |
+// regexp can match by using a Boyer-Moore-like table. This is done by |
+// identifying non-greedy non-capturing loops in the nodes that eat any |
+// character one at a time. For example in the middle of the regexp |
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
+// inserted at the start of any non-anchored regexp. |
+// |
+// When we have found such a loop we look ahead in the nodes to find the set of |
+// characters that can come at given distances. For example for the regexp |
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the |
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
+// the lookahead info where the set of characters is reasonably constrained. In |
+// our example this is from index 1 to 2 (0 is not constrained). We can now |
+// look 3 characters ahead and if we don't find one of [f, o] (the union of |
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
+// |
+// For Unicode input strings we do the same, but modulo 128. |
+// |
+// We also look at the first string fed to the regexp and use that to get a hint |
+// of the character frequencies in the inputs. This affects the assessment of |
+// whether the set of characters is 'reasonably constrained'. |
+// |
+// We also have another lookahead mechanism (called quick check in the code), |
+// which uses a wide load of multiple characters followed by a mask and compare |
+// to determine whether a match is possible at this point. |
+enum ContainedInLattice { |
+ kNotYet = 0, |
+ kLatticeIn = 1, |
+ kLatticeOut = 2, |
+ kLatticeUnknown = 3 // Can also mean both in and out. |
+}; |
+ |
+ |
+inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { |
+ return static_cast<ContainedInLattice>(a | b); |
+} |
+ |
+ |
+ContainedInLattice AddRange(ContainedInLattice a, |
+ const int* ranges, |
+ intptr_t ranges_size, |
+ Interval new_range); |
+ |
+ |
+class BoyerMoorePositionInfo : public ZoneAllocated { |
+ public: |
+ explicit BoyerMoorePositionInfo(Isolate* isolate) |
+ : map_(new(isolate) ZoneGrowableArray<bool>(kMapSize)), |
+ map_count_(0), |
+ w_(kNotYet), |
+ s_(kNotYet), |
+ d_(kNotYet), |
+ surrogate_(kNotYet) { |
+ for (intptr_t i = 0; i < kMapSize; i++) { |
+ map_->Add(false); |
+ } |
+ } |
+ |
+ bool& at(intptr_t i) { return (*map_)[i]; } |
+ |
+ static const intptr_t kMapSize = 128; |
+ static const intptr_t kMask = kMapSize - 1; |
+ |
+ intptr_t map_count() const { return map_count_; } |
+ |
+ void Set(intptr_t character); |
+ void SetInterval(const Interval& interval); |
+ void SetAll(); |
+ bool is_non_word() { return w_ == kLatticeOut; } |
+ bool is_word() { return w_ == kLatticeIn; } |
+ |
+ private: |
+ ZoneGrowableArray<bool>* map_; |
+ intptr_t map_count_; // Number of set bits in the map. |
+ ContainedInLattice w_; // The \w character class. |
+ ContainedInLattice s_; // The \s character class. |
+ ContainedInLattice d_; // The \d character class. |
+ ContainedInLattice surrogate_; // Surrogate UTF-16 code units. |
+}; |
+ |
+ |
+class BoyerMooreLookahead : public ZoneAllocated{ |
+ public: |
+ BoyerMooreLookahead(intptr_t length, RegExpCompiler* compiler, |
+ Isolate* Isolate); |
+ |
+ intptr_t length() { return length_; } |
+ intptr_t max_char() { return max_char_; } |
+ RegExpCompiler* compiler() { return compiler_; } |
+ |
+ intptr_t Count(intptr_t map_number) { |
+ return bitmaps_->At(map_number)->map_count(); |
+ } |
+ |
+ BoyerMoorePositionInfo* at(intptr_t i) { return bitmaps_->At(i); } |
+ |
+ void Set(intptr_t map_number, intptr_t character) { |
+ if (character > max_char_) return; |
+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number); |
+ info->Set(character); |
+ } |
+ |
+ void SetInterval(intptr_t map_number, const Interval& interval) { |
+ if (interval.from() > max_char_) return; |
+ BoyerMoorePositionInfo* info = bitmaps_->At(map_number); |
+ if (interval.to() > max_char_) { |
+ info->SetInterval(Interval(interval.from(), max_char_)); |
+ } else { |
+ info->SetInterval(interval); |
+ } |
+ } |
+ |
+ void SetAll(intptr_t map_number) { |
+ bitmaps_->At(map_number)->SetAll(); |
+ } |
+ |
+ void SetRest(intptr_t from_map) { |
+ for (intptr_t i = from_map; i < length_; i++) SetAll(i); |
+ } |
+ bool EmitSkipInstructions(RegExpMacroAssembler* masm); |
+ |
+ private: |
+ // This is the value obtained by EatsAtLeast. If we do not have at least this |
+ // many characters left in the sample string then the match is bound to fail. |
+ // Therefore it is OK to read a character this far ahead of the current match |
+ // point. |
+ intptr_t length_; |
+ RegExpCompiler* compiler_; |
+ // 0x7f for ASCII, 0xffff for UTF-16. |
+ intptr_t max_char_; |
+ ZoneGrowableArray<BoyerMoorePositionInfo*>* bitmaps_; |
+ |
+ intptr_t GetSkipTable(intptr_t min_lookahead, |
+ intptr_t max_lookahead, |
+ TypedData* boolean_skip_table); |
+ bool FindWorthwhileInterval(intptr_t* from, intptr_t* to); |
+ intptr_t FindBestInterval( |
+ intptr_t max_number_of_chars, |
+ intptr_t old_biggest_points, |
+ intptr_t* from, intptr_t* to); |
+}; |
+ |
+ |
// There are many ways to generate code for a node. This class encapsulates |
// the current way we should be generating. In other words it encapsulates |
// the current state of the code generator. The effect of this is that we |
@@ -1101,8 +1247,8 @@ class Trace { |
void set_at_start(bool at_start) { |
at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE; |
} |
- Label* backtrack() { return backtrack_; } |
- Label* loop_label() { return loop_label_; } |
+ BlockLabel* backtrack() { return backtrack_; } |
+ BlockLabel* loop_label() { return loop_label_; } |
RegExpNode* stop_node() { return stop_node_; } |
intptr_t characters_preloaded() { return characters_preloaded_; } |
intptr_t bound_checked_up_to() { return bound_checked_up_to_; } |
@@ -1120,9 +1266,9 @@ class Trace { |
new_action->next_ = actions_; |
actions_ = new_action; |
} |
- void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } |
+ void set_backtrack(BlockLabel* backtrack) { backtrack_ = backtrack; } |
void set_stop_node(RegExpNode* node) { stop_node_ = node; } |
- void set_loop_label(Label* label) { loop_label_ = label; } |
+ void set_loop_label(BlockLabel* label) { loop_label_ = label; } |
void set_characters_preloaded(intptr_t count) { |
characters_preloaded_ = count; |
} |
@@ -1135,22 +1281,22 @@ class Trace { |
void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler* compiler); |
private: |
- intptr_t FindAffectedRegisters(OutSet* affected_registers, Zone* zone); |
+ intptr_t FindAffectedRegisters(OutSet* affected_registers, Isolate* isolate); |
void PerformDeferredActions(RegExpMacroAssembler* macro, |
intptr_t max_register, |
const OutSet& affected_registers, |
OutSet* registers_to_pop, |
OutSet* registers_to_clear, |
- Zone* zone); |
+ Isolate* isolate); |
void RestoreAffectedRegisters(RegExpMacroAssembler* macro, |
intptr_t max_register, |
const OutSet& registers_to_pop, |
const OutSet& registers_to_clear); |
intptr_t cp_offset_; |
DeferredAction* actions_; |
- Label* backtrack_; |
+ BlockLabel* backtrack_; |
RegExpNode* stop_node_; |
- Label* loop_label_; |
+ BlockLabel* loop_label_; |
intptr_t characters_preloaded_; |
intptr_t bound_checked_up_to_; |
QuickCheckDetails quick_check_performed_; |
@@ -1214,7 +1360,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) |
}; |
-struct RegExpCompileData { |
+struct RegExpCompileData : public ZoneAllocated { |
RegExpCompileData() |
: tree(NULL), |
node(NULL), |
@@ -1235,22 +1381,38 @@ class RegExpEngine: public AllStatic { |
public: |
struct CompilationResult { |
explicit CompilationResult(const char* error_message) |
- : error_message(error_message), |
- num_registers(0) {} |
- CompilationResult(Object* code, intptr_t registers) |
- : error_message(NULL), |
- num_registers(registers) {} |
+ : macro_assembler(NULL), |
+ graph_entry(NULL), |
+ num_blocks(-1), |
+ num_stack_locals(-1), |
+ error_message(error_message) {} |
+ explicit CompilationResult(IRRegExpMacroAssembler* macro_assembler, |
Vyacheslav Egorov (Google)
2014/10/01 20:13:21
No need to mark multiple arguments constructor "ex
jgruber1
2014/10/03 18:59:51
Done.
|
+ GraphEntryInstr* graph_entry, |
+ intptr_t num_blocks, |
+ intptr_t num_stack_locals) |
+ : macro_assembler(macro_assembler), |
+ graph_entry(graph_entry), |
+ num_blocks(num_blocks), |
+ num_stack_locals(num_stack_locals), |
+ error_message(NULL) {} |
+ |
+ IRRegExpMacroAssembler* macro_assembler; |
+ GraphEntryInstr* graph_entry; |
+ const intptr_t num_blocks; |
+ const intptr_t num_stack_locals; |
+ |
const char* error_message; |
- intptr_t num_registers; |
}; |
- static CompilationResult Compile(RegExpCompileData* input, |
- bool ignore_case, |
- bool global, |
- bool multiline, |
- const String& pattern, |
- const String& sample_subject, |
- bool is_ascii); |
+ static CompilationResult Compile( |
+ RegExpCompileData* input, |
+ const ParsedFunction* parsed_function, |
+ ZoneGrowableArray<const ICData*>* ic_data_array); |
+ |
+ static RawJSRegExp* New(Isolate* isolate, |
+ const String& pattern, |
+ bool multi_line, |
+ bool ignore_case); |
static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); |
}; |