Index: src/jsregexp.h |
=================================================================== |
--- src/jsregexp.h (revision 11190) |
+++ src/jsregexp.h (working copy) |
@@ -190,8 +190,10 @@ |
static String* last_ascii_string_; |
static String* two_byte_cached_string_; |
- static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii); |
- static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii); |
+ static bool CompileIrregexp( |
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); |
+ static inline bool EnsureCompiledIrregexp( |
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); |
// Set the subject cache. The previous string buffer is not deleted, so the |
@@ -419,6 +421,90 @@ |
}; |
+// Improve the speed that we scan for an initial point where a non-anchored |
+// regexp can match by using a Boyer-Moore-like table. This is done by |
+// identifying non-greedy non-capturing loops in the nodes that eat any |
+// character one at a time. For example in the middle of the regexp |
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
+// inserted at the start of any non-anchored regexp. |
+// |
+// When we have found such a loop we look ahead in the nodes to find the set of |
+// characters that can come at given distances. For example for the regexp |
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the |
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
+// the lookahead info where the set of characters is reasonably constrained. In |
+// our example this is from index 1 to 2 (0 is not constrained). We can now |
+// look 3 characters ahead and if we don't find one of [f, o] (the union of |
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
+// |
+// For Unicode input strings we do the same, but modulo 128. |
+// |
+// We also look at the first string fed to the regexp and use that to get a hint |
+// of the character frequencies in the inputs. This affects the assessment of |
+// whether the set of characters is 'reasonably constrained'. |
+// |
+// We also have another lookahead mechanism (called quick check in the code), |
+// which uses a wide load of multiple characters followed by a mask and compare |
+// to determine whether a match is possible at this point. |
+class BoyerMooreLookahead { |
+ public: |
+ BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler); |
+ |
+ int length() { return length_; } |
+ int max_char() { return max_char_; } |
+ RegExpCompiler* compiler() { return compiler_; } |
+ |
+ static const int kTooManyCharacters = 32; |
+ |
+ int Count(int map_number) { |
+ ZoneList<bool>* map = bitmaps_->at(map_number); |
+ if (map == NULL) return map_length_; |
+ int count = 0; |
+ for (int i = 0; i < map_length_; i++) { |
+ if (map->at(i)) count++; |
+ } |
+ return count; |
+ } |
+ |
+ void Set(int map_number, int character) { |
+ if (character > max_char_) return; |
+ ZoneList<bool>* map = bitmaps_->at(map_number); |
+ if (map == NULL) return; |
+ map->at(character & (map_length_ - 1)) = true; |
+ } |
+ |
+ void SetAll(int map_number) { |
+ bitmaps_->at(map_number) = NULL; |
+ } |
+ |
+ void SetRest(int from_map) { |
+ for (int i = from_map; i < length_; i++) SetAll(i); |
+ } |
+ bool EmitSkipInstructions(RegExpMacroAssembler* masm); |
+ |
+ private: |
+ // This is the value obtained by EatsAtLeast. If we do not have at least this |
+ // many characters left in the sample string then the match is bound to fail. |
+ // Therefore it is OK to read a character this far ahead of the current match |
+ // point. |
+ int length_; |
+ // We conservatively consider all character values modulo this length. For |
+ // ASCII there is no loss of precision, since this has a value of 128. |
+ int map_length_; |
+ RegExpCompiler* compiler_; |
+ // 0x7f for ASCII, 0xffff for UTF-16. |
+ int max_char_; |
+ ZoneList<ZoneList<bool>*>* bitmaps_; |
+ |
+ int GetSkipTable(int min_lookahead, |
+ int max_lookahead, |
+ Handle<ByteArray> boolean_skip_table); |
+ bool FindWorthwhileInterval(int* from, int* to); |
+ int FindBestInterval( |
+ int max_number_of_chars, int old_biggest_points, int* from, int* to); |
+}; |
+ |
+ |
#define FOR_EACH_NODE_TYPE(VISIT) \ |
VISIT(End) \ |
VISIT(Action) \ |
@@ -635,6 +721,22 @@ |
bool not_at_start) = 0; |
static const int kNodeIsTooComplexForGreedyLoops = -1; |
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } |
+ // Only returns the successor for a text node of length 1 that matches any |
+ // character and that has no guards on it. |
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( |
+ RegExpCompiler* compiler) { |
+ return NULL; |
+ } |
+ |
+ // Collects information on the possible code units (mod 128) that can match if |
+ // we look forward. This is used for a Boyer-Moore-like string searching |
+ // implementation. TODO(erikcorry): This should share more code with |
+ // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
+ UNREACHABLE(); |
+ } |
+ |
Label* label() { return &label_; } |
// If non-generic code is generated for a node (i.e. the node is not at the |
// start of the trace) then it cannot be reused. This variable sets a limit |
@@ -747,6 +849,10 @@ |
: on_success_(on_success) { } |
RegExpNode* on_success() { return on_success_; } |
void set_on_success(RegExpNode* node) { on_success_ = node; } |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
+ on_success_->FillInBMInfo(offset, bm, not_at_start); |
+ } |
private: |
RegExpNode* on_success_; |
}; |
@@ -793,6 +899,8 @@ |
return on_success()->GetQuickCheckDetails( |
details, compiler, filled_in, not_at_start); |
} |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start); |
Type type() { return type_; } |
// TODO(erikcorry): We should allow some action nodes in greedy loops. |
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } |
@@ -860,6 +968,10 @@ |
ZoneList<TextElement>* elements() { return elms_; } |
void MakeCaseIndependent(bool is_ascii); |
virtual int GreedyLoopTextLength(); |
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( |
+ RegExpCompiler* compiler); |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start); |
virtual TextNode* Clone() { |
TextNode* result = new TextNode(*this); |
result->CalculateOffsets(); |
@@ -928,6 +1040,8 @@ |
RegExpCompiler* compiler, |
int filled_in, |
bool not_at_start); |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start); |
virtual int ComputeFirstCharacterSet(int budget); |
virtual AssertionNode* Clone() { return new AssertionNode(*this); } |
AssertionNodeType type() { return type_; } |
@@ -961,6 +1075,12 @@ |
bool not_at_start) { |
return; |
} |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
+ // Working out the set of characters that a backreference can match is too |
+ // hard, so we just say that any character can match. |
+ bm->SetRest(offset); |
+ } |
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } |
virtual int ComputeFirstCharacterSet(int budget); |
@@ -986,6 +1106,11 @@ |
// Returning 0 from EatsAtLeast should ensure we never get here. |
UNREACHABLE(); |
} |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
+ // Returning 0 from EatsAtLeast should ensure we never get here. |
+ UNREACHABLE(); |
+ } |
virtual EndNode* Clone() { return new EndNode(*this); } |
private: |
Action action_; |
@@ -1071,6 +1196,8 @@ |
RegExpCompiler* compiler, |
int characters_filled_in, |
bool not_at_start); |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start); |
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); } |
bool being_calculated() { return being_calculated_; } |
@@ -1089,7 +1216,7 @@ |
void GenerateGuard(RegExpMacroAssembler* macro_assembler, |
Guard* guard, |
Trace* trace); |
- int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start); |
+ int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); |
void EmitOutOfLineContinuation(RegExpCompiler* compiler, |
Trace* trace, |
GuardedAlternative alternative, |
@@ -1119,6 +1246,10 @@ |
RegExpCompiler* compiler, |
int characters_filled_in, |
bool not_at_start); |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
+ alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); |
+ } |
// For a negative lookahead we don't emit the quick check for the |
// alternative that is expected to fail. This is because quick check code |
// starts by loading enough characters for the alternative that takes fewest |
@@ -1146,6 +1277,8 @@ |
RegExpCompiler* compiler, |
int characters_filled_in, |
bool not_at_start); |
+ virtual void FillInBMInfo( |
+ int offset, BoyerMooreLookahead* bm, bool not_at_start); |
virtual int ComputeFirstCharacterSet(int budget); |
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } |
RegExpNode* loop_node() { return loop_node_; } |
@@ -1458,6 +1591,7 @@ |
bool ignore_case, |
bool multiline, |
Handle<String> pattern, |
+ Handle<String> sample_subject, |
bool is_ascii); |
static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); |