| Index: src/jsregexp.h
|
| diff --git a/src/jsregexp.h b/src/jsregexp.h
|
| index 5ce5f29f78718632588e4a5df1e8c6c9e6914886..70fa0d6d4f822a62f1205bfc259a63d7db54c26a 100644
|
| --- a/src/jsregexp.h
|
| +++ b/src/jsregexp.h
|
| @@ -167,6 +167,57 @@ class RegExpImpl {
|
| };
|
|
|
|
|
| +// Represents the location of one element relative to the intersection of
|
| +// two sets. Corresponds to the four areas of a Venn diagram.
|
| +enum ElementInSetsRelation {
|
| + kInsideNone = 0,
|
| + kInsideFirst = 1,
|
| + kInsideSecond = 2,
|
| + kInsideBoth = 3
|
| +};
|
| +
|
| +
|
| +// Represents the relation of two sets.
|
| +// Sets can be either disjoint, partially or fully overlapping, or equal.
|
| +class SetRelation BASE_EMBEDDED {
|
| + public:
|
| + // Relation is represented by a bit saying whether there are elements in
|
| + // one set that is not in the other, and a bit saying that there are elements
|
| + // that are in both sets.
|
| +
|
| + // Location of an element. Corresponds to the internal areas of
|
| + // a Venn diagram.
|
| + enum {
|
| + kInFirst = 1 << kInsideFirst,
|
| + kInSecond = 1 << kInsideSecond,
|
| + kInBoth = 1 << kInsideBoth
|
| + };
|
| + SetRelation() : bits_(0) {}
|
| + ~SetRelation() {}
|
| + // Add the existence of objects in a particular
|
| + void SetElementsInFirstSet() { bits_ |= kInFirst; }
|
| + void SetElementsInSecondSet() { bits_ |= kInSecond; }
|
| + void SetElementsInBothSets() { bits_ |= kInBoth; }
|
| + // Check the currently known relation of the sets (common functions only,
|
| + // for other combinations, use value() to get the bits and check them
|
| + // manually).
|
| + // Sets are completely disjoint.
|
| + bool Disjoint() { return (bits_ & kInBoth) == 0; }
|
| + // Sets are equal.
|
| + bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
|
| + // First set contains second.
|
| + bool Contains() { return (bits_ & kInSecond) == 0; }
|
| + // Second set contains first.
|
| + bool ContainedIn() { return (bits_ & kInFirst) == 0; }
|
| + bool NonTrivialIntersection() {
|
| + return (bits_ == (kInFirst | kInSecond | kInBoth));
|
| + }
|
| + int value() { return bits_; }
|
| + private:
|
| + int bits_;
|
| +};
|
| +
|
| +
|
| class CharacterRange {
|
| public:
|
| CharacterRange() : from_(0), to_(0) { }
|
| @@ -198,7 +249,39 @@ class CharacterRange {
|
| Vector<const uc16> overlay,
|
| ZoneList<CharacterRange>** included,
|
| ZoneList<CharacterRange>** excluded);
|
| -
|
| + // Whether a range list is in canonical form: Ranges ordered by from value,
|
| + // and ranges non-overlapping and non-adjacent.
|
| + static bool IsCanonical(ZoneList<CharacterRange>* ranges);
|
| + // Convert range list to canonical form. The characters covered by the ranges
|
| + // will still be the same, but no character is in more than one range, and
|
| + // adjacent ranges are merged. The resulting list may be shorter than the
|
| + // original, but cannot be longer.
|
| + static void Canonicalize(ZoneList<CharacterRange>* ranges);
|
| + // Check how the set of characters defined by a CharacterRange list relates
|
| + // to the set of word characters. List must be in canonical form.
|
| + static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
|
| + // Takes two character range lists (representing character sets) in canonical
|
| + // form and merges them.
|
| + // The characters that are only covered by the first set are added to
|
| + // first_set_only_out. the characters that are only in the second set are
|
| + // added to second_set_only_out, and the characters that are in both are
|
| + // added to both_sets_out.
|
| + // The pointers to first_set_only_out, second_set_only_out and both_sets_out
|
| + // should be to empty lists, but they need not be distinct, and may be NULL.
|
| + // If NULL, the characters are dropped, and if two arguments are the same
|
| + // pointer, the result is the union of the two sets that would be created
|
| + // if the pointers had been distinct.
|
| + // This way, the Merge function can compute all the usual set operations:
|
| + // union (all three out-sets are equal), intersection (only both_sets_out is
|
| + // non-NULL), and set difference (only first_set is non-NULL).
|
| + static void Merge(ZoneList<CharacterRange>* first_set,
|
| + ZoneList<CharacterRange>* second_set,
|
| + ZoneList<CharacterRange>* first_set_only_out,
|
| + ZoneList<CharacterRange>* second_set_only_out,
|
| + ZoneList<CharacterRange>* both_sets_out);
|
| + // Negate the contents of a character range in canonical form.
|
| + static void Negate(ZoneList<CharacterRange>* src,
|
| + ZoneList<CharacterRange>* dst);
|
| static const int kRangeCanonicalizeMax = 0x346;
|
| static const int kStartMarker = (1 << 24);
|
| static const int kPayloadMask = (1 << 24) - 1;
|
| @@ -472,7 +555,7 @@ class QuickCheckDetails {
|
|
|
| class RegExpNode: public ZoneObject {
|
| public:
|
| - RegExpNode() : trace_count_(0) { }
|
| + RegExpNode() : first_character_set_(NULL), trace_count_(0) { }
|
| virtual ~RegExpNode();
|
| virtual void Accept(NodeVisitor* visitor) = 0;
|
| // Generates a goto to this node or actually generates the code at this point.
|
| @@ -523,8 +606,29 @@ class RegExpNode: public ZoneObject {
|
| SiblingList* siblings() { return &siblings_; }
|
| void set_siblings(SiblingList* other) { siblings_ = *other; }
|
|
|
| + // Return the set of possible next characters recognized by the regexp
|
| + // (or a safe subset, potentially the set of all characters).
|
| + ZoneList<CharacterRange>* FirstCharacterSet();
|
| +
|
| + // Compute (if possible within the budget of traversed nodes) the
|
| + // possible first characters of the input matched by this node and
|
| + // its continuation. Returns the remaining budget after the computation.
|
| + // If the budget is spent, the result is negative, and the cached
|
| + // first_character_set_ value isn't set.
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| +
|
| + // Get and set the cached first character set value.
|
| + ZoneList<CharacterRange>* first_character_set() {
|
| + return first_character_set_;
|
| + }
|
| + void set_first_character_set(ZoneList<CharacterRange>* character_set) {
|
| + first_character_set_ = character_set;
|
| + }
|
| +
|
| protected:
|
| enum LimitResult { DONE, CONTINUE };
|
| + static const int kComputeFirstCharacterSetFail = -1;
|
| +
|
| LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
|
|
|
| // Returns a sibling of this node whose interests and assumptions
|
| @@ -545,9 +649,11 @@ class RegExpNode: public ZoneObject {
|
| virtual RegExpNode* Clone() = 0;
|
|
|
| private:
|
| + static const int kFirstCharBudget = 10;
|
| Label label_;
|
| NodeInfo info_;
|
| SiblingList siblings_;
|
| + ZoneList<CharacterRange>* first_character_set_;
|
| // This variable keeps track of how many times code has been generated for
|
| // this node (in different traces). We don't keep track of where the
|
| // generated code is located unless the code is generated at the start of
|
| @@ -638,7 +744,7 @@ class ActionNode: public SeqRegExpNode {
|
| // TODO(erikcorry): We should allow some action nodes in greedy loops.
|
| virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
|
| virtual ActionNode* Clone() { return new ActionNode(*this); }
|
| -
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| private:
|
| union {
|
| struct {
|
| @@ -704,7 +810,7 @@ class TextNode: public SeqRegExpNode {
|
| return result;
|
| }
|
| void CalculateOffsets();
|
| -
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| private:
|
| enum TextEmitPassType {
|
| NON_ASCII_MATCH, // Check for characters that can't match.
|
| @@ -734,7 +840,12 @@ class AssertionNode: public SeqRegExpNode {
|
| AT_START,
|
| AT_BOUNDARY,
|
| AT_NON_BOUNDARY,
|
| - AFTER_NEWLINE
|
| + AFTER_NEWLINE,
|
| + // Types not directly expressible in regexp syntax.
|
| + // Used for modifying a boundary node if its following character is
|
| + // known to be word and/or non-word.
|
| + AFTER_NONWORD_CHARACTER,
|
| + AFTER_WORD_CHARACTER
|
| };
|
| static AssertionNode* AtEnd(RegExpNode* on_success) {
|
| return new AssertionNode(AT_END, on_success);
|
| @@ -758,8 +869,10 @@ class AssertionNode: public SeqRegExpNode {
|
| RegExpCompiler* compiler,
|
| int filled_in,
|
| bool not_at_start);
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| virtual AssertionNode* Clone() { return new AssertionNode(*this); }
|
| AssertionNodeType type() { return type_; }
|
| + void set_type(AssertionNodeType type) { type_ = type; }
|
| private:
|
| AssertionNode(AssertionNodeType t, RegExpNode* on_success)
|
| : SeqRegExpNode(on_success), type_(t) { }
|
| @@ -787,7 +900,7 @@ class BackReferenceNode: public SeqRegExpNode {
|
| return;
|
| }
|
| virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
|
| -
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| private:
|
| int start_reg_;
|
| int end_reg_;
|
| @@ -809,7 +922,6 @@ class EndNode: public RegExpNode {
|
| UNREACHABLE();
|
| }
|
| virtual EndNode* Clone() { return new EndNode(*this); }
|
| -
|
| private:
|
| Action action_;
|
| };
|
| @@ -943,6 +1055,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
|
| // characters, but on a negative lookahead the negative branch did not take
|
| // part in that calculation (EatsAtLeast) so the assumptions don't hold.
|
| virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| };
|
|
|
|
|
| @@ -961,6 +1074,7 @@ class LoopChoiceNode: public ChoiceNode {
|
| RegExpCompiler* compiler,
|
| int characters_filled_in,
|
| bool not_at_start);
|
| + virtual int ComputeFirstCharacterSet(int budget);
|
| virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
|
| RegExpNode* loop_node() { return loop_node_; }
|
| RegExpNode* continue_node() { return continue_node_; }
|
| @@ -1116,7 +1230,7 @@ class Trace {
|
| void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
|
| void set_stop_node(RegExpNode* node) { stop_node_ = node; }
|
| void set_loop_label(Label* label) { loop_label_ = label; }
|
| - void set_characters_preloaded(int cpre) { characters_preloaded_ = cpre; }
|
| + void set_characters_preloaded(int count) { characters_preloaded_ = count; }
|
| void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
|
| void set_flush_budget(int to) { flush_budget_ = to; }
|
| void set_quick_check_performed(QuickCheckDetails* d) {
|
|
|