| Index: src/regexp/regexp-ast.h
|
| diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..f87778596ad44dedb0280b6c84743f931cdce954
|
| --- /dev/null
|
| +++ b/src/regexp/regexp-ast.h
|
| @@ -0,0 +1,496 @@
|
| +// Copyright 2016 the V8 project authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#ifndef V8_REGEXP_REGEXP_AST_H_
|
| +#define V8_REGEXP_REGEXP_AST_H_
|
| +
|
| +#include "src/utils.h"
|
| +#include "src/zone.h"
|
| +
|
| +namespace v8 {
|
| +namespace internal {
|
| +
|
| +#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
|
| + VISIT(Disjunction) \
|
| + VISIT(Alternative) \
|
| + VISIT(Assertion) \
|
| + VISIT(CharacterClass) \
|
| + VISIT(Atom) \
|
| + VISIT(Quantifier) \
|
| + VISIT(Capture) \
|
| + VISIT(Lookaround) \
|
| + VISIT(BackReference) \
|
| + VISIT(Empty) \
|
| + VISIT(Text)
|
| +
|
| +
|
| +#define FORWARD_DECLARE(Name) class RegExp##Name;
|
| +FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
|
| +#undef FORWARD_DECLARE
|
| +
|
| +class RegExpCompiler;
|
| +class RegExpNode;
|
| +class RegExpTree;
|
| +
|
| +
|
| +class RegExpVisitor BASE_EMBEDDED {
|
| + public:
|
| + virtual ~RegExpVisitor() {}
|
| +#define MAKE_CASE(Name) \
|
| + virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
|
| + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
|
| +#undef MAKE_CASE
|
| +};
|
| +
|
| +
|
| +// A simple closed interval.
|
| +class Interval {
|
| + public:
|
| + Interval() : from_(kNone), to_(kNone) {}
|
| + Interval(int from, int to) : from_(from), to_(to) {}
|
| + Interval Union(Interval that) {
|
| + if (that.from_ == kNone)
|
| + return *this;
|
| + else if (from_ == kNone)
|
| + return that;
|
| + else
|
| + return Interval(Min(from_, that.from_), Max(to_, that.to_));
|
| + }
|
| + bool Contains(int value) { return (from_ <= value) && (value <= to_); }
|
| + bool is_empty() { return from_ == kNone; }
|
| + int from() const { return from_; }
|
| + int to() const { return to_; }
|
| + static Interval Empty() { return Interval(); }
|
| + static const int kNone = -1;
|
| +
|
| + private:
|
| + int from_;
|
| + int to_;
|
| +};
|
| +
|
| +
|
| +// Represents code units in the range from from_ to to_, both ends are
|
| +// inclusive.
|
| +class CharacterRange {
|
| + public:
|
| + CharacterRange() : from_(0), to_(0) {}
|
| + // For compatibility with the CHECK_OK macro
|
| + CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
|
| + CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
|
| + static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
|
| + Zone* zone);
|
| + static Vector<const int> GetWordBounds();
|
| + static inline CharacterRange Singleton(uc16 value) {
|
| + return CharacterRange(value, value);
|
| + }
|
| + static inline CharacterRange Range(uc16 from, uc16 to) {
|
| + DCHECK(from <= to);
|
| + return CharacterRange(from, to);
|
| + }
|
| + static inline CharacterRange Everything() {
|
| + return CharacterRange(0, 0xFFFF);
|
| + }
|
| + bool Contains(uc16 i) { return from_ <= i && i <= to_; }
|
| + uc16 from() const { return from_; }
|
| + void set_from(uc16 value) { from_ = value; }
|
| + uc16 to() const { return to_; }
|
| + void set_to(uc16 value) { to_ = value; }
|
| + bool is_valid() { return from_ <= to_; }
|
| + bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
|
| + bool IsSingleton() { return (from_ == to_); }
|
| + void AddCaseEquivalents(Isolate* isolate, Zone* zone,
|
| + ZoneList<CharacterRange>* ranges, bool is_one_byte);
|
| + static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
|
| + ZoneList<CharacterRange>** included,
|
| + ZoneList<CharacterRange>** excluded, Zone* zone);
|
| + // Whether a range list is in canonical form: Ranges ordered by from value,
|
| + // and ranges non-overlapping and non-adjacent.
|
| + static bool IsCanonical(ZoneList<CharacterRange>* ranges);
|
| + // Convert range list to canonical form. The characters covered by the ranges
|
| + // will still be the same, but no character is in more than one range, and
|
| + // adjacent ranges are merged. The resulting list may be shorter than the
|
| + // original, but cannot be longer.
|
| + static void Canonicalize(ZoneList<CharacterRange>* ranges);
|
| + // Negate the contents of a character range in canonical form.
|
| + static void Negate(ZoneList<CharacterRange>* src,
|
| + ZoneList<CharacterRange>* dst, Zone* zone);
|
| + static const int kStartMarker = (1 << 24);
|
| + static const int kPayloadMask = (1 << 24) - 1;
|
| +
|
| + private:
|
| + uc16 from_;
|
| + uc16 to_;
|
| +};
|
| +
|
| +
|
| +class CharacterSet final BASE_EMBEDDED {
|
| + public:
|
| + explicit CharacterSet(uc16 standard_set_type)
|
| + : ranges_(NULL), standard_set_type_(standard_set_type) {}
|
| + explicit CharacterSet(ZoneList<CharacterRange>* ranges)
|
| + : ranges_(ranges), standard_set_type_(0) {}
|
| + ZoneList<CharacterRange>* ranges(Zone* zone);
|
| + uc16 standard_set_type() { return standard_set_type_; }
|
| + void set_standard_set_type(uc16 special_set_type) {
|
| + standard_set_type_ = special_set_type;
|
| + }
|
| + bool is_standard() { return standard_set_type_ != 0; }
|
| + void Canonicalize();
|
| +
|
| + private:
|
| + ZoneList<CharacterRange>* ranges_;
|
| + // If non-zero, the value represents a standard set (e.g., all whitespace
|
| + // characters) without having to expand the ranges.
|
| + uc16 standard_set_type_;
|
| +};
|
| +
|
| +
|
| +class TextElement final BASE_EMBEDDED {
|
| + public:
|
| + enum TextType { ATOM, CHAR_CLASS };
|
| +
|
| + static TextElement Atom(RegExpAtom* atom);
|
| + static TextElement CharClass(RegExpCharacterClass* char_class);
|
| +
|
| + int cp_offset() const { return cp_offset_; }
|
| + void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
|
| + int length() const;
|
| +
|
| + TextType text_type() const { return text_type_; }
|
| +
|
| + RegExpTree* tree() const { return tree_; }
|
| +
|
| + RegExpAtom* atom() const {
|
| + DCHECK(text_type() == ATOM);
|
| + return reinterpret_cast<RegExpAtom*>(tree());
|
| + }
|
| +
|
| + RegExpCharacterClass* char_class() const {
|
| + DCHECK(text_type() == CHAR_CLASS);
|
| + return reinterpret_cast<RegExpCharacterClass*>(tree());
|
| + }
|
| +
|
| + private:
|
| + TextElement(TextType text_type, RegExpTree* tree)
|
| + : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
|
| +
|
| + int cp_offset_;
|
| + TextType text_type_;
|
| + RegExpTree* tree_;
|
| +};
|
| +
|
| +
|
| +class RegExpTree : public ZoneObject {
|
| + public:
|
| + static const int kInfinity = kMaxInt;
|
| + virtual ~RegExpTree() {}
|
| + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
|
| + virtual RegExpNode* ToNode(RegExpCompiler* compiler,
|
| + RegExpNode* on_success) = 0;
|
| + virtual bool IsTextElement() { return false; }
|
| + virtual bool IsAnchoredAtStart() { return false; }
|
| + virtual bool IsAnchoredAtEnd() { return false; }
|
| + virtual int min_match() = 0;
|
| + virtual int max_match() = 0;
|
| + // Returns the interval of registers used for captures within this
|
| + // expression.
|
| + virtual Interval CaptureRegisters() { return Interval::Empty(); }
|
| + virtual void AppendToText(RegExpText* text, Zone* zone);
|
| + std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT
|
| +#define MAKE_ASTYPE(Name) \
|
| + virtual RegExp##Name* As##Name(); \
|
| + virtual bool Is##Name();
|
| + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
|
| +#undef MAKE_ASTYPE
|
| +};
|
| +
|
| +
|
| +class RegExpDisjunction final : public RegExpTree {
|
| + public:
|
| + explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpDisjunction* AsDisjunction() override;
|
| + Interval CaptureRegisters() override;
|
| + bool IsDisjunction() override;
|
| + bool IsAnchoredAtStart() override;
|
| + bool IsAnchoredAtEnd() override;
|
| + int min_match() override { return min_match_; }
|
| + int max_match() override { return max_match_; }
|
| + ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
|
| +
|
| + private:
|
| + bool SortConsecutiveAtoms(RegExpCompiler* compiler);
|
| + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
|
| + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
|
| + ZoneList<RegExpTree*>* alternatives_;
|
| + int min_match_;
|
| + int max_match_;
|
| +};
|
| +
|
| +
|
| +class RegExpAlternative final : public RegExpTree {
|
| + public:
|
| + explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpAlternative* AsAlternative() override;
|
| + Interval CaptureRegisters() override;
|
| + bool IsAlternative() override;
|
| + bool IsAnchoredAtStart() override;
|
| + bool IsAnchoredAtEnd() override;
|
| + int min_match() override { return min_match_; }
|
| + int max_match() override { return max_match_; }
|
| + ZoneList<RegExpTree*>* nodes() { return nodes_; }
|
| +
|
| + private:
|
| + ZoneList<RegExpTree*>* nodes_;
|
| + int min_match_;
|
| + int max_match_;
|
| +};
|
| +
|
| +
|
| +class RegExpAssertion final : public RegExpTree {
|
| + public:
|
| + enum AssertionType {
|
| + START_OF_LINE,
|
| + START_OF_INPUT,
|
| + END_OF_LINE,
|
| + END_OF_INPUT,
|
| + BOUNDARY,
|
| + NON_BOUNDARY
|
| + };
|
| + explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpAssertion* AsAssertion() override;
|
| + bool IsAssertion() override;
|
| + bool IsAnchoredAtStart() override;
|
| + bool IsAnchoredAtEnd() override;
|
| + int min_match() override { return 0; }
|
| + int max_match() override { return 0; }
|
| + AssertionType assertion_type() { return assertion_type_; }
|
| +
|
| + private:
|
| + AssertionType assertion_type_;
|
| +};
|
| +
|
| +
|
| +class RegExpCharacterClass final : public RegExpTree {
|
| + public:
|
| + RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
|
| + : set_(ranges), is_negated_(is_negated) {}
|
| + explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpCharacterClass* AsCharacterClass() override;
|
| + bool IsCharacterClass() override;
|
| + bool IsTextElement() override { return true; }
|
| + int min_match() override { return 1; }
|
| + int max_match() override { return 1; }
|
| + void AppendToText(RegExpText* text, Zone* zone) override;
|
| + CharacterSet character_set() { return set_; }
|
| + // TODO(lrn): Remove need for complex version if is_standard that
|
| + // recognizes a mangled standard set and just do { return set_.is_special(); }
|
| + bool is_standard(Zone* zone);
|
| + // Returns a value representing the standard character set if is_standard()
|
| + // returns true.
|
| + // Currently used values are:
|
| + // s : unicode whitespace
|
| + // S : unicode non-whitespace
|
| + // w : ASCII word character (digit, letter, underscore)
|
| + // W : non-ASCII word character
|
| + // d : ASCII digit
|
| + // D : non-ASCII digit
|
| + // . : non-unicode non-newline
|
| + // * : All characters
|
| + uc16 standard_type() { return set_.standard_set_type(); }
|
| + ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
| + bool is_negated() { return is_negated_; }
|
| +
|
| + private:
|
| + CharacterSet set_;
|
| + bool is_negated_;
|
| +};
|
| +
|
| +
|
| +class RegExpAtom final : public RegExpTree {
|
| + public:
|
| + explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpAtom* AsAtom() override;
|
| + bool IsAtom() override;
|
| + bool IsTextElement() override { return true; }
|
| + int min_match() override { return data_.length(); }
|
| + int max_match() override { return data_.length(); }
|
| + void AppendToText(RegExpText* text, Zone* zone) override;
|
| + Vector<const uc16> data() { return data_; }
|
| + int length() { return data_.length(); }
|
| +
|
| + private:
|
| + Vector<const uc16> data_;
|
| +};
|
| +
|
| +
|
| +class RegExpText final : public RegExpTree {
|
| + public:
|
| + explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpText* AsText() override;
|
| + bool IsText() override;
|
| + bool IsTextElement() override { return true; }
|
| + int min_match() override { return length_; }
|
| + int max_match() override { return length_; }
|
| + void AppendToText(RegExpText* text, Zone* zone) override;
|
| + void AddElement(TextElement elm, Zone* zone) {
|
| + elements_.Add(elm, zone);
|
| + length_ += elm.length();
|
| + }
|
| + ZoneList<TextElement>* elements() { return &elements_; }
|
| +
|
| + private:
|
| + ZoneList<TextElement> elements_;
|
| + int length_;
|
| +};
|
| +
|
| +
|
| +class RegExpQuantifier final : public RegExpTree {
|
| + public:
|
| + enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
|
| + RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
|
| + : body_(body),
|
| + min_(min),
|
| + max_(max),
|
| + min_match_(min * body->min_match()),
|
| + quantifier_type_(type) {
|
| + if (max > 0 && body->max_match() > kInfinity / max) {
|
| + max_match_ = kInfinity;
|
| + } else {
|
| + max_match_ = max * body->max_match();
|
| + }
|
| + }
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
|
| + RegExpCompiler* compiler, RegExpNode* on_success,
|
| + bool not_at_start = false);
|
| + RegExpQuantifier* AsQuantifier() override;
|
| + Interval CaptureRegisters() override;
|
| + bool IsQuantifier() override;
|
| + int min_match() override { return min_match_; }
|
| + int max_match() override { return max_match_; }
|
| + int min() { return min_; }
|
| + int max() { return max_; }
|
| + bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
|
| + bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
|
| + bool is_greedy() { return quantifier_type_ == GREEDY; }
|
| + RegExpTree* body() { return body_; }
|
| +
|
| + private:
|
| + RegExpTree* body_;
|
| + int min_;
|
| + int max_;
|
| + int min_match_;
|
| + int max_match_;
|
| + QuantifierType quantifier_type_;
|
| +};
|
| +
|
| +
|
| +class RegExpCapture final : public RegExpTree {
|
| + public:
|
| + explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + static RegExpNode* ToNode(RegExpTree* body, int index,
|
| + RegExpCompiler* compiler, RegExpNode* on_success);
|
| + RegExpCapture* AsCapture() override;
|
| + bool IsAnchoredAtStart() override;
|
| + bool IsAnchoredAtEnd() override;
|
| + Interval CaptureRegisters() override;
|
| + bool IsCapture() override;
|
| + int min_match() override { return body_->min_match(); }
|
| + int max_match() override { return body_->max_match(); }
|
| + RegExpTree* body() { return body_; }
|
| + void set_body(RegExpTree* body) { body_ = body; }
|
| + int index() { return index_; }
|
| + static int StartRegister(int index) { return index * 2; }
|
| + static int EndRegister(int index) { return index * 2 + 1; }
|
| +
|
| + private:
|
| + RegExpTree* body_;
|
| + int index_;
|
| +};
|
| +
|
| +
|
| +class RegExpLookaround final : public RegExpTree {
|
| + public:
|
| + enum Type { LOOKAHEAD, LOOKBEHIND };
|
| +
|
| + RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
|
| + int capture_from, Type type)
|
| + : body_(body),
|
| + is_positive_(is_positive),
|
| + capture_count_(capture_count),
|
| + capture_from_(capture_from),
|
| + type_(type) {}
|
| +
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpLookaround* AsLookaround() override;
|
| + Interval CaptureRegisters() override;
|
| + bool IsLookaround() override;
|
| + bool IsAnchoredAtStart() override;
|
| + int min_match() override { return 0; }
|
| + int max_match() override { return 0; }
|
| + RegExpTree* body() { return body_; }
|
| + bool is_positive() { return is_positive_; }
|
| + int capture_count() { return capture_count_; }
|
| + int capture_from() { return capture_from_; }
|
| + Type type() { return type_; }
|
| +
|
| + private:
|
| + RegExpTree* body_;
|
| + bool is_positive_;
|
| + int capture_count_;
|
| + int capture_from_;
|
| + Type type_;
|
| +};
|
| +
|
| +
|
| +class RegExpBackReference final : public RegExpTree {
|
| + public:
|
| + explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpBackReference* AsBackReference() override;
|
| + bool IsBackReference() override;
|
| + int min_match() override { return 0; }
|
| + // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
|
| + // recursion, we give up. Ignorance is bliss.
|
| + int max_match() override { return kInfinity; }
|
| + int index() { return capture_->index(); }
|
| + RegExpCapture* capture() { return capture_; }
|
| +
|
| + private:
|
| + RegExpCapture* capture_;
|
| +};
|
| +
|
| +
|
| +class RegExpEmpty final : public RegExpTree {
|
| + public:
|
| + RegExpEmpty() {}
|
| + void* Accept(RegExpVisitor* visitor, void* data) override;
|
| + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
| + RegExpEmpty* AsEmpty() override;
|
| + bool IsEmpty() override;
|
| + int min_match() override { return 0; }
|
| + int max_match() override { return 0; }
|
| +};
|
| +
|
| +} // namespace internal
|
| +} // namespace v8
|
| +
|
| +#endif // V8_REGEXP_REGEXP_AST_H_
|
|
|