Chromium Code Reviews| Index: runtime/vm/regexp.h |
| diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h |
| index e3855bedec07c0cff86e4b38a1b44af1f13b6c57..79708abb6aa808ed073a0410786a2a4263ce3830 100644 |
| --- a/runtime/vm/regexp.h |
| +++ b/runtime/vm/regexp.h |
| @@ -6,7 +6,9 @@ |
| #define VM_REGEXP_H_ |
| #include "vm/assembler.h" |
| +#include "vm/intermediate_language.h" |
| #include "vm/object.h" |
| +#include "vm/regexp_assembler.h" |
| namespace dart { |
| @@ -239,7 +241,7 @@ class RegExpNode: public ZoneAllocated { |
| bool EmitQuickCheck(RegExpCompiler* compiler, |
| Trace* trace, |
| bool preload_has_checked_bounds, |
| - Label* on_possible_success, |
| + BlockLabel* on_possible_success, |
| QuickCheckDetails* details_return, |
| bool fall_through_on_failure); |
| // For a given number of characters this returns a mask and a value. The |
| @@ -299,7 +301,7 @@ class RegExpNode: public ZoneAllocated { |
| if (offset == 0) set_bm_info(not_at_start, bm); |
| } |
| - Label* label() { return &label_; } |
| + BlockLabel* label() { return &label_; } |
| // If non-generic code is generated for a node (i.e. the node is not at the |
| // start of the trace) then it cannot be reused. This variable sets a limit |
| // on how often we allow that to happen before we insist on starting a new |
| @@ -319,13 +321,15 @@ class RegExpNode: public ZoneAllocated { |
| enum LimitResult { DONE, CONTINUE }; |
| RegExpNode* replacement_; |
| + LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); |
| + |
| void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { |
| bm_info_[not_at_start ? 1 : 0] = bm; |
| } |
| private: |
| static const intptr_t kFirstCharBudget = 10; |
| - Label label_; |
| + BlockLabel label_; |
| NodeInfo info_; |
| // This variable keeps track of how many times code has been generated for |
| // this node (in different traces). We don't keep track of where the |
| @@ -535,7 +539,7 @@ class GuardedAlternative { |
| class OutSet: public ZoneAllocated { |
| public: |
| OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } |
| - OutSet* Extend(unsigned value, Zone* zone); |
| + OutSet* Extend(unsigned value, Isolate* isolate); |
| bool Get(unsigned value) const; |
| static const unsigned kFirstLimit = 32; |
| @@ -543,7 +547,7 @@ class OutSet: public ZoneAllocated { |
| // Destructively set a value in this set. In most cases you want |
| // to use Extend instead to ensure that only one instance exists |
| // that contains the same values. |
| - void Set(unsigned value, Zone* zone); |
| + void Set(unsigned value, Isolate* isolate); |
| // The successors are a list of sets that contain the same values |
| // as this set and the one more value that is not present in this |
| @@ -573,8 +577,8 @@ class DispatchTable : public ZoneAllocated { |
| uint16_t from() { return from_; } |
| uint16_t to() { return to_; } |
| void set_to(uint16_t value) { to_ = value; } |
| - void AddValue(intptr_t value, Zone* zone) { |
| - out_set_ = out_set_->Extend(value, zone); |
| + void AddValue(intptr_t value, Isolate* isolate) { |
| + out_set_ = out_set_->Extend(value, isolate); |
| } |
| OutSet* out_set() { return out_set_; } |
| private: |
| @@ -986,6 +990,149 @@ class QuickCheckDetails { |
| }; |
| +// Improve the speed that we scan for an initial point where a non-anchored |
| +// regexp can match by using a Boyer-Moore-like table. This is done by |
| +// identifying non-greedy non-capturing loops in the nodes that eat any |
| +// character one at a time. For example in the middle of the regexp |
| +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
| +// inserted at the start of any non-anchored regexp. |
| +// |
| +// When we have found such a loop we look ahead in the nodes to find the set of |
| +// characters that can come at given distances. For example for the regexp |
| +// /.?foo/ we know that there are at least 3 characters ahead of us, and the |
| +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
| +// the lookahead info where the set of characters is reasonably constrained. In |
| +// our example this is from index 1 to 2 (0 is not constrained). We can now |
| +// look 3 characters ahead and if we don't find one of [f, o] (the union of |
| +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
| +// |
| +// For Unicode input strings we do the same, but modulo 128. |
| +// |
| +// We also look at the first string fed to the regexp and use that to get a hint |
| +// of the character frequencies in the inputs. This affects the assessment of |
| +// whether the set of characters is 'reasonably constrained'. |
| +// |
| +// We also have another lookahead mechanism (called quick check in the code), |
| +// which uses a wide load of multiple characters followed by a mask and compare |
| +// to determine whether a match is possible at this point. |
| +enum ContainedInLattice { |
| + kNotYet = 0, |
| + kLatticeIn = 1, |
| + kLatticeOut = 2, |
| + kLatticeUnknown = 3 // Can also mean both in and out. |
| +}; |
| + |
| + |
| +inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { |
| + return static_cast<ContainedInLattice>(a | b); |
| +} |
| + |
| + |
| +ContainedInLattice AddRange(ContainedInLattice a, |
| + const int* ranges, |
| + intptr_t ranges_size, |
| + Interval new_range); |
| + |
| + |
| +class BoyerMoorePositionInfo : public ZoneAllocated { |
| + public: |
| + explicit BoyerMoorePositionInfo(Isolate* isolate) |
| + : map_(new(isolate) ZoneGrowableArray<bool>(kMapSize)), |
| + map_count_(0), |
| + w_(kNotYet), |
| + s_(kNotYet), |
| + d_(kNotYet), |
| + surrogate_(kNotYet) { |
| + for (intptr_t i = 0; i < kMapSize; i++) { |
| + map_->Add(false); |
| + } |
| + } |
| + |
| + bool& at(intptr_t i) { return map_->At(i); } |
| + |
| + static const intptr_t kMapSize = 128; |
| + static const intptr_t kMask = kMapSize - 1; |
| + |
| + intptr_t map_count() const { return map_count_; } |
| + |
| + void Set(intptr_t character); |
| + void SetInterval(const Interval& interval); |
| + void SetAll(); |
| + bool is_non_word() { return w_ == kLatticeOut; } |
| + bool is_word() { return w_ == kLatticeIn; } |
| + |
| + private: |
| + ZoneGrowableArray<bool>* map_; |
| + intptr_t map_count_; // Number of set bits in the map. |
| + ContainedInLattice w_; // The \w character class. |
| + ContainedInLattice s_; // The \s character class. |
| + ContainedInLattice d_; // The \d character class. |
| + ContainedInLattice surrogate_; // Surrogate UTF-16 code units. |
| +}; |
| + |
| + |
| +class BoyerMooreLookahead : public ZoneAllocated{ |
| + public: |
| + BoyerMooreLookahead(intptr_t length, RegExpCompiler* compiler, |
| + Isolate* Isolate); |
| + |
| + intptr_t length() { return length_; } |
| + intptr_t max_char() { return max_char_; } |
| + RegExpCompiler* compiler() { return compiler_; } |
| + |
| + intptr_t Count(intptr_t map_number) { |
| + return bitmaps_->At(map_number)->map_count(); |
| + } |
| + |
| + BoyerMoorePositionInfo* at(intptr_t i) { return bitmaps_->At(i); } |
| + |
| + void Set(intptr_t map_number, intptr_t character) { |
| + if (character > max_char_) return; |
| + BoyerMoorePositionInfo* info = bitmaps_->At(map_number); |
| + info->Set(character); |
| + } |
| + |
| + void SetInterval(intptr_t map_number, const Interval& interval) { |
| + if (interval.from() > max_char_) return; |
| + BoyerMoorePositionInfo* info = bitmaps_->At(map_number); |
| + if (interval.to() > max_char_) { |
| + info->SetInterval(Interval(interval.from(), max_char_)); |
| + } else { |
| + info->SetInterval(interval); |
| + } |
| + } |
| + |
| + void SetAll(intptr_t map_number) { |
| + bitmaps_->At(map_number)->SetAll(); |
| + } |
| + |
| + void SetRest(intptr_t from_map) { |
| + for (intptr_t i = from_map; i < length_; i++) SetAll(i); |
| + } |
| + bool EmitSkipInstructions(RegExpMacroAssembler* masm); |
| + |
| + private: |
| + // This is the value obtained by EatsAtLeast. If we do not have at least this |
| + // many characters left in the sample string then the match is bound to fail. |
| + // Therefore it is OK to read a character this far ahead of the current match |
| + // point. |
| + intptr_t length_; |
| + RegExpCompiler* compiler_; |
| + // 0x7f for ASCII, 0xffff for UTF-16. |
| + intptr_t max_char_; |
| + ZoneGrowableArray<BoyerMoorePositionInfo*>* bitmaps_; |
| + |
| + intptr_t GetSkipTable(intptr_t min_lookahead, |
| + intptr_t max_lookahead, |
|
ricow1
2014/09/05 05:49:58
indentation
jgruber1
2014/09/05 06:17:25
Done.
|
| + TypedData* boolean_skip_table); |
| + bool FindWorthwhileInterval(int* from, int* to); |
| + intptr_t FindBestInterval( |
| + intptr_t max_number_of_chars, |
| + intptr_t old_biggest_points, |
| + int* from, int* to); |
| +}; |
| + |
| + |
| // There are many ways to generate code for a node. This class encapsulates |
| // the current way we should be generating. In other words it encapsulates |
| // the current state of the code generator. The effect of this is that we |
| @@ -1101,8 +1248,8 @@ class Trace { |
| void set_at_start(bool at_start) { |
| at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE; |
| } |
| - Label* backtrack() { return backtrack_; } |
| - Label* loop_label() { return loop_label_; } |
| + BlockLabel* backtrack() { return backtrack_; } |
| + BlockLabel* loop_label() { return loop_label_; } |
| RegExpNode* stop_node() { return stop_node_; } |
| intptr_t characters_preloaded() { return characters_preloaded_; } |
| intptr_t bound_checked_up_to() { return bound_checked_up_to_; } |
| @@ -1120,9 +1267,9 @@ class Trace { |
| new_action->next_ = actions_; |
| actions_ = new_action; |
| } |
| - void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } |
| + void set_backtrack(BlockLabel* backtrack) { backtrack_ = backtrack; } |
| void set_stop_node(RegExpNode* node) { stop_node_ = node; } |
| - void set_loop_label(Label* label) { loop_label_ = label; } |
| + void set_loop_label(BlockLabel* label) { loop_label_ = label; } |
| void set_characters_preloaded(intptr_t count) { |
| characters_preloaded_ = count; |
| } |
| @@ -1135,22 +1282,22 @@ class Trace { |
| void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler* compiler); |
| private: |
| - intptr_t FindAffectedRegisters(OutSet* affected_registers, Zone* zone); |
| + intptr_t FindAffectedRegisters(OutSet* affected_registers, Isolate* isolate); |
| void PerformDeferredActions(RegExpMacroAssembler* macro, |
| intptr_t max_register, |
| const OutSet& affected_registers, |
| OutSet* registers_to_pop, |
| OutSet* registers_to_clear, |
| - Zone* zone); |
| + Isolate* isolate); |
| void RestoreAffectedRegisters(RegExpMacroAssembler* macro, |
| intptr_t max_register, |
| const OutSet& registers_to_pop, |
| const OutSet& registers_to_clear); |
| intptr_t cp_offset_; |
| DeferredAction* actions_; |
| - Label* backtrack_; |
| + BlockLabel* backtrack_; |
| RegExpNode* stop_node_; |
| - Label* loop_label_; |
| + BlockLabel* loop_label_; |
| intptr_t characters_preloaded_; |
| intptr_t bound_checked_up_to_; |
| QuickCheckDetails quick_check_performed_; |
| @@ -1235,13 +1382,14 @@ class RegExpEngine: public AllStatic { |
| public: |
| struct CompilationResult { |
| explicit CompilationResult(const char* error_message) |
| - : error_message(error_message), |
| - num_registers(0) {} |
| - CompilationResult(Object* code, intptr_t registers) |
| - : error_message(NULL), |
| - num_registers(registers) {} |
| + : function(Function::Handle()), |
| + error_message(error_message) {} |
| + explicit CompilationResult(const Function& function) |
| + : function(function), |
| + error_message(NULL) {} |
| + |
| + const Function& function; |
| const char* error_message; |
| - intptr_t num_registers; |
| }; |
| static CompilationResult Compile(RegExpCompileData* input, |
| @@ -1252,6 +1400,10 @@ class RegExpEngine: public AllStatic { |
| const String& sample_subject, |
| bool is_ascii); |
| + static RawJSRegExp* New(const String& pattern, |
| + bool multi_line, |
| + bool ignore_case); |
| + |
| static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); |
| }; |