src/jsregexp.h - Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored

Unified Diff: src/jsregexp.h

Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.h

===================================================================

--- src/jsregexp.h (revision 11190)

+++ src/jsregexp.h (working copy)

@@ -190,8 +190,10 @@

static String* last_ascii_string_;

static String* two_byte_cached_string_;

- static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii);

- static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii);

+ static bool CompileIrregexp(

+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);

+ static inline bool EnsureCompiledIrregexp(

+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);

// Set the subject cache. The previous string buffer is not deleted, so the

@@ -419,6 +421,90 @@

};

+// Improve the speed that we scan for an initial point where a non-anchored

+// regexp can match by using a Boyer-Moore-like table. This is done by

+// identifying non-greedy non-capturing loops in the nodes that eat any

+// character one at a time. For example in the middle of the regexp

+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly

+// inserted at the start of any non-anchored regexp.

+//

+// When we have found such a loop we look ahead in the nodes to find the set of

+// characters that can come at given distances. For example for the regexp

+// /.?foo/ we know that there are at least 3 characters ahead of us, and the

+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in

+// the lookahead info where the set of characters is reasonably constrained. In

+// our example this is from index 1 to 2 (0 is not constrained). We can now

+// look 3 characters ahead and if we don't find one of [f, o] (the union of

+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).

+//

+// For Unicode input strings we do the same, but modulo 128.

+//

+// We also look at the first string fed to the regexp and use that to get a hint

+// of the character frequencies in the inputs. This affects the assessment of

+// whether the set of characters is 'reasonably constrained'.

+//

+// We also have another lookahead mechanism (called quick check in the code),

+// which uses a wide load of multiple characters followed by a mask and compare

+// to determine whether a match is possible at this point.

+class BoyerMooreLookahead {

+ public:

+ BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler);

+ int length() { return length_; }

+ int max_char() { return max_char_; }

+ RegExpCompiler* compiler() { return compiler_; }

+ static const int kTooManyCharacters = 32;

+ int Count(int map_number) {

+ ZoneList<bool>* map = bitmaps_->at(map_number);

+ if (map == NULL) return map_length_;

+ int count = 0;

+ for (int i = 0; i < map_length_; i++) {

+ if (map->at(i)) count++;

+ }

+ return count;

+ }

+ void Set(int map_number, int character) {

+ if (character > max_char_) return;

+ ZoneList<bool>* map = bitmaps_->at(map_number);

+ if (map == NULL) return;

+ map->at(character & (map_length_ - 1)) = true;

+ }

+ void SetAll(int map_number) {

+ bitmaps_->at(map_number) = NULL;

+ }

+ void SetRest(int from_map) {

+ for (int i = from_map; i < length_; i++) SetAll(i);

+ }

+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);

+ private:

+ // This is the value obtained by EatsAtLeast. If we do not have at least this

+ // many characters left in the sample string then the match is bound to fail.

+ // Therefore it is OK to read a character this far ahead of the current match

+ // point.

+ int length_;

+ // We conservatively consider all character values modulo this length. For

+ // ASCII there is no loss of precision, since this has a value of 128.

+ int map_length_;

+ RegExpCompiler* compiler_;

+ // 0x7f for ASCII, 0xffff for UTF-16.

+ int max_char_;

+ ZoneList<ZoneList<bool>*>* bitmaps_;

+ int GetSkipTable(int min_lookahead,

+ int max_lookahead,

+ Handle<ByteArray> boolean_skip_table);

+ bool FindWorthwhileInterval(int* from, int* to);

+ int FindBestInterval(

+ int max_number_of_chars, int old_biggest_points, int* from, int* to);

+};

#define FOR_EACH_NODE_TYPE(VISIT) \

VISIT(End) \

VISIT(Action) \

@@ -635,6 +721,22 @@

bool not_at_start) = 0;

static const int kNodeIsTooComplexForGreedyLoops = -1;

virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }

+ // Only returns the successor for a text node of length 1 that matches any

+ // character and that has no guards on it.

+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(

+ RegExpCompiler* compiler) {

+ return NULL;

+ }

+ // Collects information on the possible code units (mod 128) that can match if

+ // we look forward. This is used for a Boyer-Moore-like string searching

+ // implementation. TODO(erikcorry): This should share more code with

+ // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet.

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ UNREACHABLE();

+ }

Label* label() { return &label_; }

// If non-generic code is generated for a node (i.e. the node is not at the

// start of the trace) then it cannot be reused. This variable sets a limit

@@ -747,6 +849,10 @@

: on_success_(on_success) { }

RegExpNode* on_success() { return on_success_; }

void set_on_success(RegExpNode* node) { on_success_ = node; }

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ on_success_->FillInBMInfo(offset, bm, not_at_start);

+ }

private:

RegExpNode* on_success_;

};

@@ -793,6 +899,8 @@

return on_success()->GetQuickCheckDetails(

details, compiler, filled_in, not_at_start);

}

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

Type type() { return type_; }

// TODO(erikcorry): We should allow some action nodes in greedy loops.

virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }

@@ -860,6 +968,10 @@

ZoneList<TextElement>* elements() { return elms_; }

void MakeCaseIndependent(bool is_ascii);

virtual int GreedyLoopTextLength();

+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(

+ RegExpCompiler* compiler);

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

virtual TextNode* Clone() {

TextNode* result = new TextNode(*this);

result->CalculateOffsets();

@@ -928,6 +1040,8 @@

RegExpCompiler* compiler,

int filled_in,

bool not_at_start);

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

virtual int ComputeFirstCharacterSet(int budget);

virtual AssertionNode* Clone() { return new AssertionNode(*this); }

AssertionNodeType type() { return type_; }

@@ -961,6 +1075,12 @@

bool not_at_start) {

return;

}

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ // Working out the set of characters that a backreference can match is too

+ // hard, so we just say that any character can match.

+ bm->SetRest(offset);

+ }

virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }

virtual int ComputeFirstCharacterSet(int budget);

@@ -986,6 +1106,11 @@

// Returning 0 from EatsAtLeast should ensure we never get here.

UNREACHABLE();

}

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ // Returning 0 from EatsAtLeast should ensure we never get here.

+ UNREACHABLE();

+ }

virtual EndNode* Clone() { return new EndNode(*this); }

private:

Action action_;

@@ -1071,6 +1196,8 @@

RegExpCompiler* compiler,

int characters_filled_in,

bool not_at_start);

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }

bool being_calculated() { return being_calculated_; }

@@ -1089,7 +1216,7 @@

void GenerateGuard(RegExpMacroAssembler* macro_assembler,

Guard* guard,

Trace* trace);

- int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start);

+ int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);

void EmitOutOfLineContinuation(RegExpCompiler* compiler,

Trace* trace,

GuardedAlternative alternative,

@@ -1119,6 +1246,10 @@

RegExpCompiler* compiler,

int characters_filled_in,

bool not_at_start);

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);

+ }

// For a negative lookahead we don't emit the quick check for the

// alternative that is expected to fail. This is because quick check code

// starts by loading enough characters for the alternative that takes fewest

@@ -1146,6 +1277,8 @@

RegExpCompiler* compiler,

int characters_filled_in,

bool not_at_start);

+ virtual void FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start);

virtual int ComputeFirstCharacterSet(int budget);

virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }

RegExpNode* loop_node() { return loop_node_; }

@@ -1458,6 +1591,7 @@

bool ignore_case,

bool multiline,

Handle<String> pattern,

+ Handle<String> sample_subject,

bool is_ascii);

static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);

« no previous file with comments | « src/ia32/regexp-macro-assembler-ia32.cc ('k') | src/jsregexp.cc » ('j') | src/jsregexp.cc » ('J')