| Index: runtime/vm/regexp.h
|
| diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h
|
| index 5a22c5f687e3b3fc44a3e3a709f8f2a829bcf26c..636d48fa7b7b9d3dbd5afd1056263642e233f7fa 100644
|
| --- a/runtime/vm/regexp.h
|
| +++ b/runtime/vm/regexp.h
|
| @@ -6,7 +6,9 @@
|
| #define VM_REGEXP_H_
|
|
|
| #include "vm/assembler.h"
|
| +#include "vm/intermediate_language.h"
|
| #include "vm/object.h"
|
| +#include "vm/regexp_assembler.h"
|
|
|
| namespace dart {
|
|
|
| @@ -239,7 +241,7 @@ class RegExpNode: public ZoneAllocated {
|
| bool EmitQuickCheck(RegExpCompiler* compiler,
|
| Trace* trace,
|
| bool preload_has_checked_bounds,
|
| - Label* on_possible_success,
|
| + BlockLabel* on_possible_success,
|
| QuickCheckDetails* details_return,
|
| bool fall_through_on_failure);
|
| // For a given number of characters this returns a mask and a value. The
|
| @@ -299,7 +301,7 @@ class RegExpNode: public ZoneAllocated {
|
| if (offset == 0) set_bm_info(not_at_start, bm);
|
| }
|
|
|
| - Label* label() { return &label_; }
|
| + BlockLabel* label() { return &label_; }
|
| // If non-generic code is generated for a node (i.e. the node is not at the
|
| // start of the trace) then it cannot be reused. This variable sets a limit
|
| // on how often we allow that to happen before we insist on starting a new
|
| @@ -319,13 +321,15 @@ class RegExpNode: public ZoneAllocated {
|
| enum LimitResult { DONE, CONTINUE };
|
| RegExpNode* replacement_;
|
|
|
| + LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
|
| +
|
| void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
|
| bm_info_[not_at_start ? 1 : 0] = bm;
|
| }
|
|
|
| private:
|
| static const intptr_t kFirstCharBudget = 10;
|
| - Label label_;
|
| + BlockLabel label_;
|
| NodeInfo info_;
|
| // This variable keeps track of how many times code has been generated for
|
| // this node (in different traces). We don't keep track of where the
|
| @@ -535,7 +539,7 @@ class GuardedAlternative {
|
| class OutSet: public ZoneAllocated {
|
| public:
|
| OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
|
| - OutSet* Extend(unsigned value, Zone* zone);
|
| + OutSet* Extend(unsigned value, Isolate* isolate);
|
| bool Get(unsigned value) const;
|
| static const unsigned kFirstLimit = 32;
|
|
|
| @@ -543,7 +547,7 @@ class OutSet: public ZoneAllocated {
|
| // Destructively set a value in this set. In most cases you want
|
| // to use Extend instead to ensure that only one instance exists
|
| // that contains the same values.
|
| - void Set(unsigned value, Zone* zone);
|
| + void Set(unsigned value, Isolate* isolate);
|
|
|
| // The successors are a list of sets that contain the same values
|
| // as this set and the one more value that is not present in this
|
| @@ -573,8 +577,8 @@ class DispatchTable : public ZoneAllocated {
|
| uint16_t from() { return from_; }
|
| uint16_t to() { return to_; }
|
| void set_to(uint16_t value) { to_ = value; }
|
| - void AddValue(intptr_t value, Zone* zone) {
|
| - out_set_ = out_set_->Extend(value, zone);
|
| + void AddValue(intptr_t value, Isolate* isolate) {
|
| + out_set_ = out_set_->Extend(value, isolate);
|
| }
|
| OutSet* out_set() { return out_set_; }
|
| private:
|
| @@ -632,7 +636,6 @@ class ChoiceNode: public RegExpNode {
|
| ZoneGrowableArray<GuardedAlternative>* alternatives() {
|
| return alternatives_;
|
| }
|
| - DispatchTable* GetTable(bool ignore_case);
|
| virtual void Emit(RegExpCompiler* compiler, Trace* trace);
|
| virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget,
|
| bool not_at_start);
|
| @@ -986,6 +989,149 @@ class QuickCheckDetails {
|
| };
|
|
|
|
|
| +// Improve the speed that we scan for an initial point where a non-anchored
|
| +// regexp can match by using a Boyer-Moore-like table. This is done by
|
| +// identifying non-greedy non-capturing loops in the nodes that eat any
|
| +// character one at a time. For example in the middle of the regexp
|
| +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
|
| +// inserted at the start of any non-anchored regexp.
|
| +//
|
| +// When we have found such a loop we look ahead in the nodes to find the set of
|
| +// characters that can come at given distances. For example for the regexp
|
| +// /.?foo/ we know that there are at least 3 characters ahead of us, and the
|
| +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
|
| +// the lookahead info where the set of characters is reasonably constrained. In
|
| +// our example this is from index 1 to 2 (0 is not constrained). We can now
|
| +// look 3 characters ahead and if we don't find one of [f, o] (the union of
|
| +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
|
| +//
|
| +// For Unicode input strings we do the same, but modulo 128.
|
| +//
|
| +// We also look at the first string fed to the regexp and use that to get a hint
|
| +// of the character frequencies in the inputs. This affects the assessment of
|
| +// whether the set of characters is 'reasonably constrained'.
|
| +//
|
| +// We also have another lookahead mechanism (called quick check in the code),
|
| +// which uses a wide load of multiple characters followed by a mask and compare
|
| +// to determine whether a match is possible at this point.
|
| +enum ContainedInLattice {
|
| + kNotYet = 0,
|
| + kLatticeIn = 1,
|
| + kLatticeOut = 2,
|
| + kLatticeUnknown = 3 // Can also mean both in and out.
|
| +};
|
| +
|
| +
|
| +inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
|
| + return static_cast<ContainedInLattice>(a | b);
|
| +}
|
| +
|
| +
|
| +ContainedInLattice AddRange(ContainedInLattice a,
|
| + const int* ranges,
|
| + intptr_t ranges_size,
|
| + Interval new_range);
|
| +
|
| +
|
| +class BoyerMoorePositionInfo : public ZoneAllocated {
|
| + public:
|
| + explicit BoyerMoorePositionInfo(Isolate* isolate)
|
| + : map_(new(isolate) ZoneGrowableArray<bool>(kMapSize)),
|
| + map_count_(0),
|
| + w_(kNotYet),
|
| + s_(kNotYet),
|
| + d_(kNotYet),
|
| + surrogate_(kNotYet) {
|
| + for (intptr_t i = 0; i < kMapSize; i++) {
|
| + map_->Add(false);
|
| + }
|
| + }
|
| +
|
| + bool& at(intptr_t i) { return (*map_)[i]; }
|
| +
|
| + static const intptr_t kMapSize = 128;
|
| + static const intptr_t kMask = kMapSize - 1;
|
| +
|
| + intptr_t map_count() const { return map_count_; }
|
| +
|
| + void Set(intptr_t character);
|
| + void SetInterval(const Interval& interval);
|
| + void SetAll();
|
| + bool is_non_word() { return w_ == kLatticeOut; }
|
| + bool is_word() { return w_ == kLatticeIn; }
|
| +
|
| + private:
|
| + ZoneGrowableArray<bool>* map_;
|
| + intptr_t map_count_; // Number of set bits in the map.
|
| + ContainedInLattice w_; // The \w character class.
|
| + ContainedInLattice s_; // The \s character class.
|
| + ContainedInLattice d_; // The \d character class.
|
| + ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
|
| +};
|
| +
|
| +
|
| +class BoyerMooreLookahead : public ZoneAllocated{
|
| + public:
|
| + BoyerMooreLookahead(intptr_t length, RegExpCompiler* compiler,
|
| + Isolate* Isolate);
|
| +
|
| + intptr_t length() { return length_; }
|
| + intptr_t max_char() { return max_char_; }
|
| + RegExpCompiler* compiler() { return compiler_; }
|
| +
|
| + intptr_t Count(intptr_t map_number) {
|
| + return bitmaps_->At(map_number)->map_count();
|
| + }
|
| +
|
| + BoyerMoorePositionInfo* at(intptr_t i) { return bitmaps_->At(i); }
|
| +
|
| + void Set(intptr_t map_number, intptr_t character) {
|
| + if (character > max_char_) return;
|
| + BoyerMoorePositionInfo* info = bitmaps_->At(map_number);
|
| + info->Set(character);
|
| + }
|
| +
|
| + void SetInterval(intptr_t map_number, const Interval& interval) {
|
| + if (interval.from() > max_char_) return;
|
| + BoyerMoorePositionInfo* info = bitmaps_->At(map_number);
|
| + if (interval.to() > max_char_) {
|
| + info->SetInterval(Interval(interval.from(), max_char_));
|
| + } else {
|
| + info->SetInterval(interval);
|
| + }
|
| + }
|
| +
|
| + void SetAll(intptr_t map_number) {
|
| + bitmaps_->At(map_number)->SetAll();
|
| + }
|
| +
|
| + void SetRest(intptr_t from_map) {
|
| + for (intptr_t i = from_map; i < length_; i++) SetAll(i);
|
| + }
|
| + bool EmitSkipInstructions(RegExpMacroAssembler* masm);
|
| +
|
| + private:
|
| + // This is the value obtained by EatsAtLeast. If we do not have at least this
|
| + // many characters left in the sample string then the match is bound to fail.
|
| + // Therefore it is OK to read a character this far ahead of the current match
|
| + // point.
|
| + intptr_t length_;
|
| + RegExpCompiler* compiler_;
|
| + // 0x7f for ASCII, 0xffff for UTF-16.
|
| + intptr_t max_char_;
|
| + ZoneGrowableArray<BoyerMoorePositionInfo*>* bitmaps_;
|
| +
|
| + intptr_t GetSkipTable(intptr_t min_lookahead,
|
| + intptr_t max_lookahead,
|
| + TypedData* boolean_skip_table);
|
| + bool FindWorthwhileInterval(intptr_t* from, intptr_t* to);
|
| + intptr_t FindBestInterval(
|
| + intptr_t max_number_of_chars,
|
| + intptr_t old_biggest_points,
|
| + intptr_t* from, intptr_t* to);
|
| +};
|
| +
|
| +
|
| // There are many ways to generate code for a node. This class encapsulates
|
| // the current way we should be generating. In other words it encapsulates
|
| // the current state of the code generator. The effect of this is that we
|
| @@ -1101,8 +1247,8 @@ class Trace {
|
| void set_at_start(bool at_start) {
|
| at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
|
| }
|
| - Label* backtrack() { return backtrack_; }
|
| - Label* loop_label() { return loop_label_; }
|
| + BlockLabel* backtrack() { return backtrack_; }
|
| + BlockLabel* loop_label() { return loop_label_; }
|
| RegExpNode* stop_node() { return stop_node_; }
|
| intptr_t characters_preloaded() { return characters_preloaded_; }
|
| intptr_t bound_checked_up_to() { return bound_checked_up_to_; }
|
| @@ -1120,9 +1266,9 @@ class Trace {
|
| new_action->next_ = actions_;
|
| actions_ = new_action;
|
| }
|
| - void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
|
| + void set_backtrack(BlockLabel* backtrack) { backtrack_ = backtrack; }
|
| void set_stop_node(RegExpNode* node) { stop_node_ = node; }
|
| - void set_loop_label(Label* label) { loop_label_ = label; }
|
| + void set_loop_label(BlockLabel* label) { loop_label_ = label; }
|
| void set_characters_preloaded(intptr_t count) {
|
| characters_preloaded_ = count;
|
| }
|
| @@ -1135,22 +1281,22 @@ class Trace {
|
| void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler* compiler);
|
|
|
| private:
|
| - intptr_t FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
|
| + intptr_t FindAffectedRegisters(OutSet* affected_registers, Isolate* isolate);
|
| void PerformDeferredActions(RegExpMacroAssembler* macro,
|
| intptr_t max_register,
|
| const OutSet& affected_registers,
|
| OutSet* registers_to_pop,
|
| OutSet* registers_to_clear,
|
| - Zone* zone);
|
| + Isolate* isolate);
|
| void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
|
| intptr_t max_register,
|
| const OutSet& registers_to_pop,
|
| const OutSet& registers_to_clear);
|
| intptr_t cp_offset_;
|
| DeferredAction* actions_;
|
| - Label* backtrack_;
|
| + BlockLabel* backtrack_;
|
| RegExpNode* stop_node_;
|
| - Label* loop_label_;
|
| + BlockLabel* loop_label_;
|
| intptr_t characters_preloaded_;
|
| intptr_t bound_checked_up_to_;
|
| QuickCheckDetails quick_check_performed_;
|
| @@ -1214,7 +1360,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
|
| };
|
|
|
|
|
| -struct RegExpCompileData {
|
| +struct RegExpCompileData : public ZoneAllocated {
|
| RegExpCompileData()
|
| : tree(NULL),
|
| node(NULL),
|
| @@ -1235,22 +1381,38 @@ class RegExpEngine: public AllStatic {
|
| public:
|
| struct CompilationResult {
|
| explicit CompilationResult(const char* error_message)
|
| - : error_message(error_message),
|
| - num_registers(0) {}
|
| - CompilationResult(Object* code, intptr_t registers)
|
| - : error_message(NULL),
|
| - num_registers(registers) {}
|
| + : macro_assembler(NULL),
|
| + graph_entry(NULL),
|
| + num_blocks(-1),
|
| + num_stack_locals(-1),
|
| + error_message(error_message) {}
|
| + explicit CompilationResult(IRRegExpMacroAssembler* macro_assembler,
|
| + GraphEntryInstr* graph_entry,
|
| + intptr_t num_blocks,
|
| + intptr_t num_stack_locals)
|
| + : macro_assembler(macro_assembler),
|
| + graph_entry(graph_entry),
|
| + num_blocks(num_blocks),
|
| + num_stack_locals(num_stack_locals),
|
| + error_message(NULL) {}
|
| +
|
| + IRRegExpMacroAssembler* macro_assembler;
|
| + GraphEntryInstr* graph_entry;
|
| + const intptr_t num_blocks;
|
| + const intptr_t num_stack_locals;
|
| +
|
| const char* error_message;
|
| - intptr_t num_registers;
|
| };
|
|
|
| - static CompilationResult Compile(RegExpCompileData* input,
|
| - bool ignore_case,
|
| - bool global,
|
| - bool multiline,
|
| - const String& pattern,
|
| - const String& sample_subject,
|
| - bool is_ascii);
|
| + static CompilationResult Compile(
|
| + RegExpCompileData* input,
|
| + const ParsedFunction* parsed_function,
|
| + ZoneGrowableArray<const ICData*>* ic_data_array);
|
| +
|
| + static RawJSRegExp* New(Isolate* isolate,
|
| + const String& pattern,
|
| + bool multi_line,
|
| + bool ignore_case);
|
|
|
| static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
|
| };
|
|
|