| Index: src/jsregexp.h | 
| =================================================================== | 
| --- src/jsregexp.h	(revision 11190) | 
| +++ src/jsregexp.h	(working copy) | 
| @@ -190,8 +190,10 @@ | 
| static String* last_ascii_string_; | 
| static String* two_byte_cached_string_; | 
|  | 
| -  static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii); | 
| -  static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii); | 
| +  static bool CompileIrregexp( | 
| +      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 
| +  static inline bool EnsureCompiledIrregexp( | 
| +      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 
|  | 
|  | 
| // Set the subject cache.  The previous string buffer is not deleted, so the | 
| @@ -419,6 +421,90 @@ | 
| }; | 
|  | 
|  | 
| +// Improve the speed that we scan for an initial point where a non-anchored | 
| +// regexp can match by using a Boyer-Moore-like table. This is done by | 
| +// identifying non-greedy non-capturing loops in the nodes that eat any | 
| +// character one at a time.  For example in the middle of the regexp | 
| +// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly | 
| +// inserted at the start of any non-anchored regexp. | 
| +// | 
| +// When we have found such a loop we look ahead in the nodes to find the set of | 
| +// characters that can come at given distances. For example for the regexp | 
| +// /.?foo/ we know that there are at least 3 characters ahead of us, and the | 
| +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in | 
| +// the lookahead info where the set of characters is reasonably constrained. In | 
| +// our example this is from index 1 to 2 (0 is not constrained). We can now | 
| +// look 3 characters ahead and if we don't find one of [f, o] (the union of | 
| +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). | 
| +// | 
| +// For Unicode input strings we do the same, but modulo 128. | 
| +// | 
| +// We also look at the first string fed to the regexp and use that to get a hint | 
| +// of the character frequencies in the inputs. This affects the assessment of | 
| +// whether the set of characters is 'reasonably constrained'. | 
| +// | 
| +// We also have another lookahead mechanism (called quick check in the code), | 
| +// which uses a wide load of multiple characters followed by a mask and compare | 
| +// to determine whether a match is possible at this point. | 
| +class BoyerMooreLookahead { | 
| + public: | 
| +  BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler); | 
| + | 
| +  int length() { return length_; } | 
| +  int max_char() { return max_char_; } | 
| +  RegExpCompiler* compiler() { return compiler_; } | 
| + | 
| +  static const int kTooManyCharacters = 32; | 
| + | 
| +  int Count(int map_number) { | 
| +    ZoneList<bool>* map = bitmaps_->at(map_number); | 
| +    if (map == NULL) return map_length_; | 
| +    int count = 0; | 
| +    for (int i = 0; i < map_length_; i++) { | 
| +      if (map->at(i)) count++; | 
| +    } | 
| +    return count; | 
| +  } | 
| + | 
| +  void Set(int map_number, int character) { | 
| +    if (character > max_char_) return; | 
| +    ZoneList<bool>* map = bitmaps_->at(map_number); | 
| +    if (map == NULL) return; | 
| +    map->at(character & (map_length_ - 1)) = true; | 
| +  } | 
| + | 
| +  void SetAll(int map_number) { | 
| +    bitmaps_->at(map_number) = NULL; | 
| +  } | 
| + | 
| +  void SetRest(int from_map) { | 
| +    for (int i = from_map; i < length_; i++) SetAll(i); | 
| +  } | 
| +  bool EmitSkipInstructions(RegExpMacroAssembler* masm); | 
| + | 
| + private: | 
| +  // This is the value obtained by EatsAtLeast.  If we do not have at least this | 
| +  // many characters left in the sample string then the match is bound to fail. | 
| +  // Therefore it is OK to read a character this far ahead of the current match | 
| +  // point. | 
| +  int length_; | 
| +  // We conservatively consider all character values modulo this length.  For | 
| +  // ASCII there is no loss of precision, since this has a value of 128. | 
| +  int map_length_; | 
| +  RegExpCompiler* compiler_; | 
| +  // 0x7f for ASCII, 0xffff for UTF-16. | 
| +  int max_char_; | 
| +  ZoneList<ZoneList<bool>*>* bitmaps_; | 
| + | 
| +  int GetSkipTable(int min_lookahead, | 
| +                   int max_lookahead, | 
| +                   Handle<ByteArray> boolean_skip_table); | 
| +  bool FindWorthwhileInterval(int* from, int* to); | 
| +  int FindBestInterval( | 
| +    int max_number_of_chars, int old_biggest_points, int* from, int* to); | 
| +}; | 
| + | 
| + | 
| #define FOR_EACH_NODE_TYPE(VISIT)                                    \ | 
| VISIT(End)                                                         \ | 
| VISIT(Action)                                                      \ | 
| @@ -635,6 +721,22 @@ | 
| bool not_at_start) = 0; | 
| static const int kNodeIsTooComplexForGreedyLoops = -1; | 
| virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } | 
| +  // Only returns the successor for a text node of length 1 that matches any | 
| +  // character and that has no guards on it. | 
| +  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( | 
| +      RegExpCompiler* compiler) { | 
| +    return NULL; | 
| +  } | 
| + | 
| +  // Collects information on the possible code units (mod 128) that can match if | 
| +  // we look forward.  This is used for a Boyer-Moore-like string searching | 
| +  // implementation.  TODO(erikcorry):  This should share more code with | 
| +  // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| +    UNREACHABLE(); | 
| +  } | 
| + | 
| Label* label() { return &label_; } | 
| // If non-generic code is generated for a node (i.e. the node is not at the | 
| // start of the trace) then it cannot be reused.  This variable sets a limit | 
| @@ -747,6 +849,10 @@ | 
| : on_success_(on_success) { } | 
| RegExpNode* on_success() { return on_success_; } | 
| void set_on_success(RegExpNode* node) { on_success_ = node; } | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| +    on_success_->FillInBMInfo(offset, bm, not_at_start); | 
| +  } | 
| private: | 
| RegExpNode* on_success_; | 
| }; | 
| @@ -793,6 +899,8 @@ | 
| return on_success()->GetQuickCheckDetails( | 
| details, compiler, filled_in, not_at_start); | 
| } | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start); | 
| Type type() { return type_; } | 
| // TODO(erikcorry): We should allow some action nodes in greedy loops. | 
| virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } | 
| @@ -860,6 +968,10 @@ | 
| ZoneList<TextElement>* elements() { return elms_; } | 
| void MakeCaseIndependent(bool is_ascii); | 
| virtual int GreedyLoopTextLength(); | 
| +  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( | 
| +      RegExpCompiler* compiler); | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start); | 
| virtual TextNode* Clone() { | 
| TextNode* result = new TextNode(*this); | 
| result->CalculateOffsets(); | 
| @@ -928,6 +1040,8 @@ | 
| RegExpCompiler* compiler, | 
| int filled_in, | 
| bool not_at_start); | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start); | 
| virtual int ComputeFirstCharacterSet(int budget); | 
| virtual AssertionNode* Clone() { return new AssertionNode(*this); } | 
| AssertionNodeType type() { return type_; } | 
| @@ -961,6 +1075,12 @@ | 
| bool not_at_start) { | 
| return; | 
| } | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| +    // Working out the set of characters that a backreference can match is too | 
| +    // hard, so we just say that any character can match. | 
| +    bm->SetRest(offset); | 
| +  } | 
| virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } | 
| virtual int ComputeFirstCharacterSet(int budget); | 
|  | 
| @@ -986,6 +1106,11 @@ | 
| // Returning 0 from EatsAtLeast should ensure we never get here. | 
| UNREACHABLE(); | 
| } | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| +    // Returning 0 from EatsAtLeast should ensure we never get here. | 
| +    UNREACHABLE(); | 
| +  } | 
| virtual EndNode* Clone() { return new EndNode(*this); } | 
| private: | 
| Action action_; | 
| @@ -1071,6 +1196,8 @@ | 
| RegExpCompiler* compiler, | 
| int characters_filled_in, | 
| bool not_at_start); | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start); | 
| virtual ChoiceNode* Clone() { return new ChoiceNode(*this); } | 
|  | 
| bool being_calculated() { return being_calculated_; } | 
| @@ -1089,7 +1216,7 @@ | 
| void GenerateGuard(RegExpMacroAssembler* macro_assembler, | 
| Guard* guard, | 
| Trace* trace); | 
| -  int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start); | 
| +  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); | 
| void EmitOutOfLineContinuation(RegExpCompiler* compiler, | 
| Trace* trace, | 
| GuardedAlternative alternative, | 
| @@ -1119,6 +1246,10 @@ | 
| RegExpCompiler* compiler, | 
| int characters_filled_in, | 
| bool not_at_start); | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| +    alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); | 
| +  } | 
| // For a negative lookahead we don't emit the quick check for the | 
| // alternative that is expected to fail.  This is because quick check code | 
| // starts by loading enough characters for the alternative that takes fewest | 
| @@ -1146,6 +1277,8 @@ | 
| RegExpCompiler* compiler, | 
| int characters_filled_in, | 
| bool not_at_start); | 
| +  virtual void FillInBMInfo( | 
| +      int offset, BoyerMooreLookahead* bm, bool not_at_start); | 
| virtual int ComputeFirstCharacterSet(int budget); | 
| virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } | 
| RegExpNode* loop_node() { return loop_node_; } | 
| @@ -1458,6 +1591,7 @@ | 
| bool ignore_case, | 
| bool multiline, | 
| Handle<String> pattern, | 
| +                                   Handle<String> sample_subject, | 
| bool is_ascii); | 
|  | 
| static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); | 
|  |