src/jsregexp.h - Issue 507051: Attempt to make \b\w+ faster. Slight performance increase on, e.g., string unpacking.

Unified Diff: src/jsregexp.h

Issue 507051: Attempt to make \b\w+ faster. Slight performance increase on, e.g., string unpacking. (Closed)

Patch Set: Addressed review comments. Created 10 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/jsregexp.h

diff --git a/src/jsregexp.h b/src/jsregexp.h

index 5ce5f29f78718632588e4a5df1e8c6c9e6914886..70fa0d6d4f822a62f1205bfc259a63d7db54c26a 100644

--- a/src/jsregexp.h

+++ b/src/jsregexp.h

@@ -167,6 +167,57 @@ class RegExpImpl {

};

+// Represents the location of one element relative to the intersection of

+// two sets. Corresponds to the four areas of a Venn diagram.

+enum ElementInSetsRelation {

+ kInsideNone = 0,

+ kInsideFirst = 1,

+ kInsideSecond = 2,

+ kInsideBoth = 3

+};

+// Represents the relation of two sets.

+// Sets can be either disjoint, partially or fully overlapping, or equal.

+class SetRelation BASE_EMBEDDED {

+ public:

+ // Relation is represented by a bit saying whether there are elements in

+ // one set that is not in the other, and a bit saying that there are elements

+ // that are in both sets.

+ // Location of an element. Corresponds to the internal areas of

+ // a Venn diagram.

+ enum {

+ kInFirst = 1 << kInsideFirst,

+ kInSecond = 1 << kInsideSecond,

+ kInBoth = 1 << kInsideBoth

+ };

+ SetRelation() : bits_(0) {}

+ ~SetRelation() {}

+ // Add the existence of objects in a particular

+ void SetElementsInFirstSet() { bits_ |= kInFirst; }

+ void SetElementsInSecondSet() { bits_ |= kInSecond; }

+ void SetElementsInBothSets() { bits_ |= kInBoth; }

+ // Check the currently known relation of the sets (common functions only,

+ // for other combinations, use value() to get the bits and check them

+ // manually).

+ // Sets are completely disjoint.

+ bool Disjoint() { return (bits_ & kInBoth) == 0; }

+ // Sets are equal.

+ bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }

+ // First set contains second.

+ bool Contains() { return (bits_ & kInSecond) == 0; }

+ // Second set contains first.

+ bool ContainedIn() { return (bits_ & kInFirst) == 0; }

+ bool NonTrivialIntersection() {

+ return (bits_ == (kInFirst | kInSecond | kInBoth));

+ }

+ int value() { return bits_; }

+ private:

+ int bits_;

+};

class CharacterRange {

public:

CharacterRange() : from_(0), to_(0) { }

@@ -198,7 +249,39 @@ class CharacterRange {

Vector<const uc16> overlay,

ZoneList<CharacterRange>** included,

ZoneList<CharacterRange>** excluded);

+ // Whether a range list is in canonical form: Ranges ordered by from value,

+ // and ranges non-overlapping and non-adjacent.

+ static bool IsCanonical(ZoneList<CharacterRange>* ranges);

+ // Convert range list to canonical form. The characters covered by the ranges

+ // will still be the same, but no character is in more than one range, and

+ // adjacent ranges are merged. The resulting list may be shorter than the

+ // original, but cannot be longer.

+ static void Canonicalize(ZoneList<CharacterRange>* ranges);

+ // Check how the set of characters defined by a CharacterRange list relates

+ // to the set of word characters. List must be in canonical form.

+ static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);

+ // Takes two character range lists (representing character sets) in canonical

+ // form and merges them.

+ // The characters that are only covered by the first set are added to

+ // first_set_only_out. the characters that are only in the second set are

+ // added to second_set_only_out, and the characters that are in both are

+ // added to both_sets_out.

+ // The pointers to first_set_only_out, second_set_only_out and both_sets_out

+ // should be to empty lists, but they need not be distinct, and may be NULL.

+ // If NULL, the characters are dropped, and if two arguments are the same

+ // pointer, the result is the union of the two sets that would be created

+ // if the pointers had been distinct.

+ // This way, the Merge function can compute all the usual set operations:

+ // union (all three out-sets are equal), intersection (only both_sets_out is

+ // non-NULL), and set difference (only first_set is non-NULL).

+ static void Merge(ZoneList<CharacterRange>* first_set,

+ ZoneList<CharacterRange>* second_set,

+ ZoneList<CharacterRange>* first_set_only_out,

+ ZoneList<CharacterRange>* second_set_only_out,

+ ZoneList<CharacterRange>* both_sets_out);

+ // Negate the contents of a character range in canonical form.

+ static void Negate(ZoneList<CharacterRange>* src,

+ ZoneList<CharacterRange>* dst);

static const int kRangeCanonicalizeMax = 0x346;

static const int kStartMarker = (1 << 24);

static const int kPayloadMask = (1 << 24) - 1;

@@ -472,7 +555,7 @@ class QuickCheckDetails {

class RegExpNode: public ZoneObject {

public:

- RegExpNode() : trace_count_(0) { }

+ RegExpNode() : first_character_set_(NULL), trace_count_(0) { }

virtual ~RegExpNode();

virtual void Accept(NodeVisitor* visitor) = 0;

// Generates a goto to this node or actually generates the code at this point.

@@ -523,8 +606,29 @@ class RegExpNode: public ZoneObject {

SiblingList* siblings() { return &siblings_; }

void set_siblings(SiblingList* other) { siblings_ = *other; }

+ // Return the set of possible next characters recognized by the regexp

+ // (or a safe subset, potentially the set of all characters).

+ ZoneList<CharacterRange>* FirstCharacterSet();

+ // Compute (if possible within the budget of traversed nodes) the

+ // possible first characters of the input matched by this node and

+ // its continuation. Returns the remaining budget after the computation.

+ // If the budget is spent, the result is negative, and the cached

+ // first_character_set_ value isn't set.

+ virtual int ComputeFirstCharacterSet(int budget);

+ // Get and set the cached first character set value.

+ ZoneList<CharacterRange>* first_character_set() {

+ return first_character_set_;

+ }

+ void set_first_character_set(ZoneList<CharacterRange>* character_set) {

+ first_character_set_ = character_set;

+ }

protected:

enum LimitResult { DONE, CONTINUE };

+ static const int kComputeFirstCharacterSetFail = -1;

LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);

// Returns a sibling of this node whose interests and assumptions

@@ -545,9 +649,11 @@ class RegExpNode: public ZoneObject {

virtual RegExpNode* Clone() = 0;

private:

+ static const int kFirstCharBudget = 10;

Label label_;

NodeInfo info_;

SiblingList siblings_;

+ ZoneList<CharacterRange>* first_character_set_;

// This variable keeps track of how many times code has been generated for

// this node (in different traces). We don't keep track of where the

// generated code is located unless the code is generated at the start of

@@ -638,7 +744,7 @@ class ActionNode: public SeqRegExpNode {

// TODO(erikcorry): We should allow some action nodes in greedy loops.

virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }

virtual ActionNode* Clone() { return new ActionNode(*this); }

+ virtual int ComputeFirstCharacterSet(int budget);

private:

union {

struct {

@@ -704,7 +810,7 @@ class TextNode: public SeqRegExpNode {

return result;

}

void CalculateOffsets();

+ virtual int ComputeFirstCharacterSet(int budget);

private:

enum TextEmitPassType {

NON_ASCII_MATCH, // Check for characters that can't match.

@@ -734,7 +840,12 @@ class AssertionNode: public SeqRegExpNode {

AT_START,

AT_BOUNDARY,

AT_NON_BOUNDARY,

- AFTER_NEWLINE

+ AFTER_NEWLINE,

+ // Types not directly expressible in regexp syntax.

+ // Used for modifying a boundary node if its following character is

+ // known to be word and/or non-word.

+ AFTER_NONWORD_CHARACTER,

+ AFTER_WORD_CHARACTER

};

static AssertionNode* AtEnd(RegExpNode* on_success) {

return new AssertionNode(AT_END, on_success);

@@ -758,8 +869,10 @@ class AssertionNode: public SeqRegExpNode {

RegExpCompiler* compiler,

int filled_in,

bool not_at_start);

+ virtual int ComputeFirstCharacterSet(int budget);

virtual AssertionNode* Clone() { return new AssertionNode(*this); }

AssertionNodeType type() { return type_; }

+ void set_type(AssertionNodeType type) { type_ = type; }

private:

AssertionNode(AssertionNodeType t, RegExpNode* on_success)

: SeqRegExpNode(on_success), type_(t) { }

@@ -787,7 +900,7 @@ class BackReferenceNode: public SeqRegExpNode {

return;

}

virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }

+ virtual int ComputeFirstCharacterSet(int budget);

private:

int start_reg_;

int end_reg_;

@@ -809,7 +922,6 @@ class EndNode: public RegExpNode {

UNREACHABLE();

}

virtual EndNode* Clone() { return new EndNode(*this); }

private:

Action action_;

};

@@ -943,6 +1055,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {

// characters, but on a negative lookahead the negative branch did not take

// part in that calculation (EatsAtLeast) so the assumptions don't hold.

virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }

+ virtual int ComputeFirstCharacterSet(int budget);

};

@@ -961,6 +1074,7 @@ class LoopChoiceNode: public ChoiceNode {

RegExpCompiler* compiler,

int characters_filled_in,

bool not_at_start);

+ virtual int ComputeFirstCharacterSet(int budget);

virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }

RegExpNode* loop_node() { return loop_node_; }

RegExpNode* continue_node() { return continue_node_; }

@@ -1116,7 +1230,7 @@ class Trace {

void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }

void set_stop_node(RegExpNode* node) { stop_node_ = node; }

void set_loop_label(Label* label) { loop_label_ = label; }

- void set_characters_preloaded(int cpre) { characters_preloaded_ = cpre; }

+ void set_characters_preloaded(int count) { characters_preloaded_ = count; }

void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }

void set_flush_budget(int to) { flush_budget_ = to; }

void set_quick_check_performed(QuickCheckDetails* d) {

« no previous file with comments | « src/ia32/regexp-macro-assembler-ia32.cc ('k') | src/jsregexp.cc » ('j') | no next file with comments »