Index: src/regexp/regexp-ast.h |
diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..f87778596ad44dedb0280b6c84743f931cdce954 |
--- /dev/null |
+++ b/src/regexp/regexp-ast.h |
@@ -0,0 +1,496 @@ |
+// Copyright 2016 the V8 project authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#ifndef V8_REGEXP_REGEXP_AST_H_ |
+#define V8_REGEXP_REGEXP_AST_H_ |
+ |
+#include "src/utils.h" |
+#include "src/zone.h" |
+ |
+namespace v8 { |
+namespace internal { |
+ |
+#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ |
+ VISIT(Disjunction) \ |
+ VISIT(Alternative) \ |
+ VISIT(Assertion) \ |
+ VISIT(CharacterClass) \ |
+ VISIT(Atom) \ |
+ VISIT(Quantifier) \ |
+ VISIT(Capture) \ |
+ VISIT(Lookaround) \ |
+ VISIT(BackReference) \ |
+ VISIT(Empty) \ |
+ VISIT(Text) |
+ |
+ |
+#define FORWARD_DECLARE(Name) class RegExp##Name; |
+FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) |
+#undef FORWARD_DECLARE |
+ |
+class RegExpCompiler; |
+class RegExpNode; |
+class RegExpTree; |
+ |
+ |
+class RegExpVisitor BASE_EMBEDDED { |
+ public: |
+ virtual ~RegExpVisitor() {} |
+#define MAKE_CASE(Name) \ |
+ virtual void* Visit##Name(RegExp##Name*, void* data) = 0; |
+ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) |
+#undef MAKE_CASE |
+}; |
+ |
+ |
+// A simple closed interval. |
+class Interval { |
+ public: |
+ Interval() : from_(kNone), to_(kNone) {} |
+ Interval(int from, int to) : from_(from), to_(to) {} |
+ Interval Union(Interval that) { |
+ if (that.from_ == kNone) |
+ return *this; |
+ else if (from_ == kNone) |
+ return that; |
+ else |
+ return Interval(Min(from_, that.from_), Max(to_, that.to_)); |
+ } |
+ bool Contains(int value) { return (from_ <= value) && (value <= to_); } |
+ bool is_empty() { return from_ == kNone; } |
+ int from() const { return from_; } |
+ int to() const { return to_; } |
+ static Interval Empty() { return Interval(); } |
+ static const int kNone = -1; |
+ |
+ private: |
+ int from_; |
+ int to_; |
+}; |
+ |
+ |
+// Represents code units in the range from from_ to to_, both ends are |
+// inclusive. |
+class CharacterRange { |
+ public: |
+ CharacterRange() : from_(0), to_(0) {} |
+ // For compatibility with the CHECK_OK macro |
+ CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT |
+ CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} |
+ static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, |
+ Zone* zone); |
+ static Vector<const int> GetWordBounds(); |
+ static inline CharacterRange Singleton(uc16 value) { |
+ return CharacterRange(value, value); |
+ } |
+ static inline CharacterRange Range(uc16 from, uc16 to) { |
+ DCHECK(from <= to); |
+ return CharacterRange(from, to); |
+ } |
+ static inline CharacterRange Everything() { |
+ return CharacterRange(0, 0xFFFF); |
+ } |
+ bool Contains(uc16 i) { return from_ <= i && i <= to_; } |
+ uc16 from() const { return from_; } |
+ void set_from(uc16 value) { from_ = value; } |
+ uc16 to() const { return to_; } |
+ void set_to(uc16 value) { to_ = value; } |
+ bool is_valid() { return from_ <= to_; } |
+ bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } |
+ bool IsSingleton() { return (from_ == to_); } |
+ void AddCaseEquivalents(Isolate* isolate, Zone* zone, |
+ ZoneList<CharacterRange>* ranges, bool is_one_byte); |
+ static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay, |
+ ZoneList<CharacterRange>** included, |
+ ZoneList<CharacterRange>** excluded, Zone* zone); |
+ // Whether a range list is in canonical form: Ranges ordered by from value, |
+ // and ranges non-overlapping and non-adjacent. |
+ static bool IsCanonical(ZoneList<CharacterRange>* ranges); |
+ // Convert range list to canonical form. The characters covered by the ranges |
+ // will still be the same, but no character is in more than one range, and |
+ // adjacent ranges are merged. The resulting list may be shorter than the |
+ // original, but cannot be longer. |
+ static void Canonicalize(ZoneList<CharacterRange>* ranges); |
+ // Negate the contents of a character range in canonical form. |
+ static void Negate(ZoneList<CharacterRange>* src, |
+ ZoneList<CharacterRange>* dst, Zone* zone); |
+ static const int kStartMarker = (1 << 24); |
+ static const int kPayloadMask = (1 << 24) - 1; |
+ |
+ private: |
+ uc16 from_; |
+ uc16 to_; |
+}; |
+ |
+ |
+class CharacterSet final BASE_EMBEDDED { |
+ public: |
+ explicit CharacterSet(uc16 standard_set_type) |
+ : ranges_(NULL), standard_set_type_(standard_set_type) {} |
+ explicit CharacterSet(ZoneList<CharacterRange>* ranges) |
+ : ranges_(ranges), standard_set_type_(0) {} |
+ ZoneList<CharacterRange>* ranges(Zone* zone); |
+ uc16 standard_set_type() { return standard_set_type_; } |
+ void set_standard_set_type(uc16 special_set_type) { |
+ standard_set_type_ = special_set_type; |
+ } |
+ bool is_standard() { return standard_set_type_ != 0; } |
+ void Canonicalize(); |
+ |
+ private: |
+ ZoneList<CharacterRange>* ranges_; |
+ // If non-zero, the value represents a standard set (e.g., all whitespace |
+ // characters) without having to expand the ranges. |
+ uc16 standard_set_type_; |
+}; |
+ |
+ |
+class TextElement final BASE_EMBEDDED { |
+ public: |
+ enum TextType { ATOM, CHAR_CLASS }; |
+ |
+ static TextElement Atom(RegExpAtom* atom); |
+ static TextElement CharClass(RegExpCharacterClass* char_class); |
+ |
+ int cp_offset() const { return cp_offset_; } |
+ void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } |
+ int length() const; |
+ |
+ TextType text_type() const { return text_type_; } |
+ |
+ RegExpTree* tree() const { return tree_; } |
+ |
+ RegExpAtom* atom() const { |
+ DCHECK(text_type() == ATOM); |
+ return reinterpret_cast<RegExpAtom*>(tree()); |
+ } |
+ |
+ RegExpCharacterClass* char_class() const { |
+ DCHECK(text_type() == CHAR_CLASS); |
+ return reinterpret_cast<RegExpCharacterClass*>(tree()); |
+ } |
+ |
+ private: |
+ TextElement(TextType text_type, RegExpTree* tree) |
+ : cp_offset_(-1), text_type_(text_type), tree_(tree) {} |
+ |
+ int cp_offset_; |
+ TextType text_type_; |
+ RegExpTree* tree_; |
+}; |
+ |
+ |
+class RegExpTree : public ZoneObject { |
+ public: |
+ static const int kInfinity = kMaxInt; |
+ virtual ~RegExpTree() {} |
+ virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; |
+ virtual RegExpNode* ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success) = 0; |
+ virtual bool IsTextElement() { return false; } |
+ virtual bool IsAnchoredAtStart() { return false; } |
+ virtual bool IsAnchoredAtEnd() { return false; } |
+ virtual int min_match() = 0; |
+ virtual int max_match() = 0; |
+ // Returns the interval of registers used for captures within this |
+ // expression. |
+ virtual Interval CaptureRegisters() { return Interval::Empty(); } |
+ virtual void AppendToText(RegExpText* text, Zone* zone); |
+ std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT |
+#define MAKE_ASTYPE(Name) \ |
+ virtual RegExp##Name* As##Name(); \ |
+ virtual bool Is##Name(); |
+ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) |
+#undef MAKE_ASTYPE |
+}; |
+ |
+ |
+class RegExpDisjunction final : public RegExpTree { |
+ public: |
+ explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpDisjunction* AsDisjunction() override; |
+ Interval CaptureRegisters() override; |
+ bool IsDisjunction() override; |
+ bool IsAnchoredAtStart() override; |
+ bool IsAnchoredAtEnd() override; |
+ int min_match() override { return min_match_; } |
+ int max_match() override { return max_match_; } |
+ ZoneList<RegExpTree*>* alternatives() { return alternatives_; } |
+ |
+ private: |
+ bool SortConsecutiveAtoms(RegExpCompiler* compiler); |
+ void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); |
+ void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); |
+ ZoneList<RegExpTree*>* alternatives_; |
+ int min_match_; |
+ int max_match_; |
+}; |
+ |
+ |
+class RegExpAlternative final : public RegExpTree { |
+ public: |
+ explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpAlternative* AsAlternative() override; |
+ Interval CaptureRegisters() override; |
+ bool IsAlternative() override; |
+ bool IsAnchoredAtStart() override; |
+ bool IsAnchoredAtEnd() override; |
+ int min_match() override { return min_match_; } |
+ int max_match() override { return max_match_; } |
+ ZoneList<RegExpTree*>* nodes() { return nodes_; } |
+ |
+ private: |
+ ZoneList<RegExpTree*>* nodes_; |
+ int min_match_; |
+ int max_match_; |
+}; |
+ |
+ |
+class RegExpAssertion final : public RegExpTree { |
+ public: |
+ enum AssertionType { |
+ START_OF_LINE, |
+ START_OF_INPUT, |
+ END_OF_LINE, |
+ END_OF_INPUT, |
+ BOUNDARY, |
+ NON_BOUNDARY |
+ }; |
+ explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpAssertion* AsAssertion() override; |
+ bool IsAssertion() override; |
+ bool IsAnchoredAtStart() override; |
+ bool IsAnchoredAtEnd() override; |
+ int min_match() override { return 0; } |
+ int max_match() override { return 0; } |
+ AssertionType assertion_type() { return assertion_type_; } |
+ |
+ private: |
+ AssertionType assertion_type_; |
+}; |
+ |
+ |
+class RegExpCharacterClass final : public RegExpTree { |
+ public: |
+ RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) |
+ : set_(ranges), is_negated_(is_negated) {} |
+ explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpCharacterClass* AsCharacterClass() override; |
+ bool IsCharacterClass() override; |
+ bool IsTextElement() override { return true; } |
+ int min_match() override { return 1; } |
+ int max_match() override { return 1; } |
+ void AppendToText(RegExpText* text, Zone* zone) override; |
+ CharacterSet character_set() { return set_; } |
+ // TODO(lrn): Remove need for complex version if is_standard that |
+ // recognizes a mangled standard set and just do { return set_.is_special(); } |
+ bool is_standard(Zone* zone); |
+ // Returns a value representing the standard character set if is_standard() |
+ // returns true. |
+ // Currently used values are: |
+ // s : unicode whitespace |
+ // S : unicode non-whitespace |
+ // w : ASCII word character (digit, letter, underscore) |
+ // W : non-ASCII word character |
+ // d : ASCII digit |
+ // D : non-ASCII digit |
+ // . : non-unicode non-newline |
+ // * : All characters |
+ uc16 standard_type() { return set_.standard_set_type(); } |
+ ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } |
+ bool is_negated() { return is_negated_; } |
+ |
+ private: |
+ CharacterSet set_; |
+ bool is_negated_; |
+}; |
+ |
+ |
+class RegExpAtom final : public RegExpTree { |
+ public: |
+ explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpAtom* AsAtom() override; |
+ bool IsAtom() override; |
+ bool IsTextElement() override { return true; } |
+ int min_match() override { return data_.length(); } |
+ int max_match() override { return data_.length(); } |
+ void AppendToText(RegExpText* text, Zone* zone) override; |
+ Vector<const uc16> data() { return data_; } |
+ int length() { return data_.length(); } |
+ |
+ private: |
+ Vector<const uc16> data_; |
+}; |
+ |
+ |
+class RegExpText final : public RegExpTree { |
+ public: |
+ explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpText* AsText() override; |
+ bool IsText() override; |
+ bool IsTextElement() override { return true; } |
+ int min_match() override { return length_; } |
+ int max_match() override { return length_; } |
+ void AppendToText(RegExpText* text, Zone* zone) override; |
+ void AddElement(TextElement elm, Zone* zone) { |
+ elements_.Add(elm, zone); |
+ length_ += elm.length(); |
+ } |
+ ZoneList<TextElement>* elements() { return &elements_; } |
+ |
+ private: |
+ ZoneList<TextElement> elements_; |
+ int length_; |
+}; |
+ |
+ |
+class RegExpQuantifier final : public RegExpTree { |
+ public: |
+ enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; |
+ RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) |
+ : body_(body), |
+ min_(min), |
+ max_(max), |
+ min_match_(min * body->min_match()), |
+ quantifier_type_(type) { |
+ if (max > 0 && body->max_match() > kInfinity / max) { |
+ max_match_ = kInfinity; |
+ } else { |
+ max_match_ = max * body->max_match(); |
+ } |
+ } |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, |
+ RegExpCompiler* compiler, RegExpNode* on_success, |
+ bool not_at_start = false); |
+ RegExpQuantifier* AsQuantifier() override; |
+ Interval CaptureRegisters() override; |
+ bool IsQuantifier() override; |
+ int min_match() override { return min_match_; } |
+ int max_match() override { return max_match_; } |
+ int min() { return min_; } |
+ int max() { return max_; } |
+ bool is_possessive() { return quantifier_type_ == POSSESSIVE; } |
+ bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } |
+ bool is_greedy() { return quantifier_type_ == GREEDY; } |
+ RegExpTree* body() { return body_; } |
+ |
+ private: |
+ RegExpTree* body_; |
+ int min_; |
+ int max_; |
+ int min_match_; |
+ int max_match_; |
+ QuantifierType quantifier_type_; |
+}; |
+ |
+ |
+class RegExpCapture final : public RegExpTree { |
+ public: |
+ explicit RegExpCapture(int index) : body_(NULL), index_(index) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ static RegExpNode* ToNode(RegExpTree* body, int index, |
+ RegExpCompiler* compiler, RegExpNode* on_success); |
+ RegExpCapture* AsCapture() override; |
+ bool IsAnchoredAtStart() override; |
+ bool IsAnchoredAtEnd() override; |
+ Interval CaptureRegisters() override; |
+ bool IsCapture() override; |
+ int min_match() override { return body_->min_match(); } |
+ int max_match() override { return body_->max_match(); } |
+ RegExpTree* body() { return body_; } |
+ void set_body(RegExpTree* body) { body_ = body; } |
+ int index() { return index_; } |
+ static int StartRegister(int index) { return index * 2; } |
+ static int EndRegister(int index) { return index * 2 + 1; } |
+ |
+ private: |
+ RegExpTree* body_; |
+ int index_; |
+}; |
+ |
+ |
+class RegExpLookaround final : public RegExpTree { |
+ public: |
+ enum Type { LOOKAHEAD, LOOKBEHIND }; |
+ |
+ RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, |
+ int capture_from, Type type) |
+ : body_(body), |
+ is_positive_(is_positive), |
+ capture_count_(capture_count), |
+ capture_from_(capture_from), |
+ type_(type) {} |
+ |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpLookaround* AsLookaround() override; |
+ Interval CaptureRegisters() override; |
+ bool IsLookaround() override; |
+ bool IsAnchoredAtStart() override; |
+ int min_match() override { return 0; } |
+ int max_match() override { return 0; } |
+ RegExpTree* body() { return body_; } |
+ bool is_positive() { return is_positive_; } |
+ int capture_count() { return capture_count_; } |
+ int capture_from() { return capture_from_; } |
+ Type type() { return type_; } |
+ |
+ private: |
+ RegExpTree* body_; |
+ bool is_positive_; |
+ int capture_count_; |
+ int capture_from_; |
+ Type type_; |
+}; |
+ |
+ |
+class RegExpBackReference final : public RegExpTree { |
+ public: |
+ explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpBackReference* AsBackReference() override; |
+ bool IsBackReference() override; |
+ int min_match() override { return 0; } |
+ // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite |
+ // recursion, we give up. Ignorance is bliss. |
+ int max_match() override { return kInfinity; } |
+ int index() { return capture_->index(); } |
+ RegExpCapture* capture() { return capture_; } |
+ |
+ private: |
+ RegExpCapture* capture_; |
+}; |
+ |
+ |
+class RegExpEmpty final : public RegExpTree { |
+ public: |
+ RegExpEmpty() {} |
+ void* Accept(RegExpVisitor* visitor, void* data) override; |
+ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
+ RegExpEmpty* AsEmpty() override; |
+ bool IsEmpty() override; |
+ int min_match() override { return 0; } |
+ int max_match() override { return 0; } |
+}; |
+ |
+} // namespace internal |
+} // namespace v8 |
+ |
+#endif // V8_REGEXP_REGEXP_AST_H_ |