OLD | NEW |
(Empty) | |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #ifndef V8_REGEXP_REGEXP_AST_H_ |
| 6 #define V8_REGEXP_REGEXP_AST_H_ |
| 7 |
| 8 #include "src/utils.h" |
| 9 #include "src/zone.h" |
| 10 |
| 11 namespace v8 { |
| 12 namespace internal { |
| 13 |
| 14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ |
| 15 VISIT(Disjunction) \ |
| 16 VISIT(Alternative) \ |
| 17 VISIT(Assertion) \ |
| 18 VISIT(CharacterClass) \ |
| 19 VISIT(Atom) \ |
| 20 VISIT(Quantifier) \ |
| 21 VISIT(Capture) \ |
| 22 VISIT(Lookaround) \ |
| 23 VISIT(BackReference) \ |
| 24 VISIT(Empty) \ |
| 25 VISIT(Text) |
| 26 |
| 27 |
| 28 #define FORWARD_DECLARE(Name) class RegExp##Name; |
| 29 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) |
| 30 #undef FORWARD_DECLARE |
| 31 |
| 32 class RegExpCompiler; |
| 33 class RegExpNode; |
| 34 class RegExpTree; |
| 35 |
| 36 |
| 37 class RegExpVisitor BASE_EMBEDDED { |
| 38 public: |
| 39 virtual ~RegExpVisitor() {} |
| 40 #define MAKE_CASE(Name) \ |
| 41 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; |
| 42 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) |
| 43 #undef MAKE_CASE |
| 44 }; |
| 45 |
| 46 |
| 47 // A simple closed interval. |
| 48 class Interval { |
| 49 public: |
| 50 Interval() : from_(kNone), to_(kNone) {} |
| 51 Interval(int from, int to) : from_(from), to_(to) {} |
| 52 Interval Union(Interval that) { |
| 53 if (that.from_ == kNone) |
| 54 return *this; |
| 55 else if (from_ == kNone) |
| 56 return that; |
| 57 else |
| 58 return Interval(Min(from_, that.from_), Max(to_, that.to_)); |
| 59 } |
| 60 bool Contains(int value) { return (from_ <= value) && (value <= to_); } |
| 61 bool is_empty() { return from_ == kNone; } |
| 62 int from() const { return from_; } |
| 63 int to() const { return to_; } |
| 64 static Interval Empty() { return Interval(); } |
| 65 static const int kNone = -1; |
| 66 |
| 67 private: |
| 68 int from_; |
| 69 int to_; |
| 70 }; |
| 71 |
| 72 |
| 73 // Represents code units in the range from from_ to to_, both ends are |
| 74 // inclusive. |
| 75 class CharacterRange { |
| 76 public: |
| 77 CharacterRange() : from_(0), to_(0) {} |
| 78 // For compatibility with the CHECK_OK macro |
| 79 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT |
| 80 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} |
| 81 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, |
| 82 Zone* zone); |
| 83 static Vector<const int> GetWordBounds(); |
| 84 static inline CharacterRange Singleton(uc16 value) { |
| 85 return CharacterRange(value, value); |
| 86 } |
| 87 static inline CharacterRange Range(uc16 from, uc16 to) { |
| 88 DCHECK(from <= to); |
| 89 return CharacterRange(from, to); |
| 90 } |
| 91 static inline CharacterRange Everything() { |
| 92 return CharacterRange(0, 0xFFFF); |
| 93 } |
| 94 bool Contains(uc16 i) { return from_ <= i && i <= to_; } |
| 95 uc16 from() const { return from_; } |
| 96 void set_from(uc16 value) { from_ = value; } |
| 97 uc16 to() const { return to_; } |
| 98 void set_to(uc16 value) { to_ = value; } |
| 99 bool is_valid() { return from_ <= to_; } |
| 100 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } |
| 101 bool IsSingleton() { return (from_ == to_); } |
| 102 void AddCaseEquivalents(Isolate* isolate, Zone* zone, |
| 103 ZoneList<CharacterRange>* ranges, bool is_one_byte); |
| 104 static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay, |
| 105 ZoneList<CharacterRange>** included, |
| 106 ZoneList<CharacterRange>** excluded, Zone* zone); |
| 107 // Whether a range list is in canonical form: Ranges ordered by from value, |
| 108 // and ranges non-overlapping and non-adjacent. |
| 109 static bool IsCanonical(ZoneList<CharacterRange>* ranges); |
| 110 // Convert range list to canonical form. The characters covered by the ranges |
| 111 // will still be the same, but no character is in more than one range, and |
| 112 // adjacent ranges are merged. The resulting list may be shorter than the |
| 113 // original, but cannot be longer. |
| 114 static void Canonicalize(ZoneList<CharacterRange>* ranges); |
| 115 // Negate the contents of a character range in canonical form. |
| 116 static void Negate(ZoneList<CharacterRange>* src, |
| 117 ZoneList<CharacterRange>* dst, Zone* zone); |
| 118 static const int kStartMarker = (1 << 24); |
| 119 static const int kPayloadMask = (1 << 24) - 1; |
| 120 |
| 121 private: |
| 122 uc16 from_; |
| 123 uc16 to_; |
| 124 }; |
| 125 |
| 126 |
| 127 class CharacterSet final BASE_EMBEDDED { |
| 128 public: |
| 129 explicit CharacterSet(uc16 standard_set_type) |
| 130 : ranges_(NULL), standard_set_type_(standard_set_type) {} |
| 131 explicit CharacterSet(ZoneList<CharacterRange>* ranges) |
| 132 : ranges_(ranges), standard_set_type_(0) {} |
| 133 ZoneList<CharacterRange>* ranges(Zone* zone); |
| 134 uc16 standard_set_type() { return standard_set_type_; } |
| 135 void set_standard_set_type(uc16 special_set_type) { |
| 136 standard_set_type_ = special_set_type; |
| 137 } |
| 138 bool is_standard() { return standard_set_type_ != 0; } |
| 139 void Canonicalize(); |
| 140 |
| 141 private: |
| 142 ZoneList<CharacterRange>* ranges_; |
| 143 // If non-zero, the value represents a standard set (e.g., all whitespace |
| 144 // characters) without having to expand the ranges. |
| 145 uc16 standard_set_type_; |
| 146 }; |
| 147 |
| 148 |
| 149 class TextElement final BASE_EMBEDDED { |
| 150 public: |
| 151 enum TextType { ATOM, CHAR_CLASS }; |
| 152 |
| 153 static TextElement Atom(RegExpAtom* atom); |
| 154 static TextElement CharClass(RegExpCharacterClass* char_class); |
| 155 |
| 156 int cp_offset() const { return cp_offset_; } |
| 157 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } |
| 158 int length() const; |
| 159 |
| 160 TextType text_type() const { return text_type_; } |
| 161 |
| 162 RegExpTree* tree() const { return tree_; } |
| 163 |
| 164 RegExpAtom* atom() const { |
| 165 DCHECK(text_type() == ATOM); |
| 166 return reinterpret_cast<RegExpAtom*>(tree()); |
| 167 } |
| 168 |
| 169 RegExpCharacterClass* char_class() const { |
| 170 DCHECK(text_type() == CHAR_CLASS); |
| 171 return reinterpret_cast<RegExpCharacterClass*>(tree()); |
| 172 } |
| 173 |
| 174 private: |
| 175 TextElement(TextType text_type, RegExpTree* tree) |
| 176 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} |
| 177 |
| 178 int cp_offset_; |
| 179 TextType text_type_; |
| 180 RegExpTree* tree_; |
| 181 }; |
| 182 |
| 183 |
| 184 class RegExpTree : public ZoneObject { |
| 185 public: |
| 186 static const int kInfinity = kMaxInt; |
| 187 virtual ~RegExpTree() {} |
| 188 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; |
| 189 virtual RegExpNode* ToNode(RegExpCompiler* compiler, |
| 190 RegExpNode* on_success) = 0; |
| 191 virtual bool IsTextElement() { return false; } |
| 192 virtual bool IsAnchoredAtStart() { return false; } |
| 193 virtual bool IsAnchoredAtEnd() { return false; } |
| 194 virtual int min_match() = 0; |
| 195 virtual int max_match() = 0; |
| 196 // Returns the interval of registers used for captures within this |
| 197 // expression. |
| 198 virtual Interval CaptureRegisters() { return Interval::Empty(); } |
| 199 virtual void AppendToText(RegExpText* text, Zone* zone); |
| 200 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT |
| 201 #define MAKE_ASTYPE(Name) \ |
| 202 virtual RegExp##Name* As##Name(); \ |
| 203 virtual bool Is##Name(); |
| 204 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) |
| 205 #undef MAKE_ASTYPE |
| 206 }; |
| 207 |
| 208 |
| 209 class RegExpDisjunction final : public RegExpTree { |
| 210 public: |
| 211 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); |
| 212 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 213 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 214 RegExpDisjunction* AsDisjunction() override; |
| 215 Interval CaptureRegisters() override; |
| 216 bool IsDisjunction() override; |
| 217 bool IsAnchoredAtStart() override; |
| 218 bool IsAnchoredAtEnd() override; |
| 219 int min_match() override { return min_match_; } |
| 220 int max_match() override { return max_match_; } |
| 221 ZoneList<RegExpTree*>* alternatives() { return alternatives_; } |
| 222 |
| 223 private: |
| 224 bool SortConsecutiveAtoms(RegExpCompiler* compiler); |
| 225 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); |
| 226 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); |
| 227 ZoneList<RegExpTree*>* alternatives_; |
| 228 int min_match_; |
| 229 int max_match_; |
| 230 }; |
| 231 |
| 232 |
| 233 class RegExpAlternative final : public RegExpTree { |
| 234 public: |
| 235 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); |
| 236 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 237 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 238 RegExpAlternative* AsAlternative() override; |
| 239 Interval CaptureRegisters() override; |
| 240 bool IsAlternative() override; |
| 241 bool IsAnchoredAtStart() override; |
| 242 bool IsAnchoredAtEnd() override; |
| 243 int min_match() override { return min_match_; } |
| 244 int max_match() override { return max_match_; } |
| 245 ZoneList<RegExpTree*>* nodes() { return nodes_; } |
| 246 |
| 247 private: |
| 248 ZoneList<RegExpTree*>* nodes_; |
| 249 int min_match_; |
| 250 int max_match_; |
| 251 }; |
| 252 |
| 253 |
| 254 class RegExpAssertion final : public RegExpTree { |
| 255 public: |
| 256 enum AssertionType { |
| 257 START_OF_LINE, |
| 258 START_OF_INPUT, |
| 259 END_OF_LINE, |
| 260 END_OF_INPUT, |
| 261 BOUNDARY, |
| 262 NON_BOUNDARY |
| 263 }; |
| 264 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} |
| 265 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 266 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 267 RegExpAssertion* AsAssertion() override; |
| 268 bool IsAssertion() override; |
| 269 bool IsAnchoredAtStart() override; |
| 270 bool IsAnchoredAtEnd() override; |
| 271 int min_match() override { return 0; } |
| 272 int max_match() override { return 0; } |
| 273 AssertionType assertion_type() { return assertion_type_; } |
| 274 |
| 275 private: |
| 276 AssertionType assertion_type_; |
| 277 }; |
| 278 |
| 279 |
| 280 class RegExpCharacterClass final : public RegExpTree { |
| 281 public: |
| 282 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) |
| 283 : set_(ranges), is_negated_(is_negated) {} |
| 284 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} |
| 285 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 286 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 287 RegExpCharacterClass* AsCharacterClass() override; |
| 288 bool IsCharacterClass() override; |
| 289 bool IsTextElement() override { return true; } |
| 290 int min_match() override { return 1; } |
| 291 int max_match() override { return 1; } |
| 292 void AppendToText(RegExpText* text, Zone* zone) override; |
| 293 CharacterSet character_set() { return set_; } |
| 294 // TODO(lrn): Remove need for complex version if is_standard that |
| 295 // recognizes a mangled standard set and just do { return set_.is_special(); } |
| 296 bool is_standard(Zone* zone); |
| 297 // Returns a value representing the standard character set if is_standard() |
| 298 // returns true. |
| 299 // Currently used values are: |
| 300 // s : unicode whitespace |
| 301 // S : unicode non-whitespace |
| 302 // w : ASCII word character (digit, letter, underscore) |
| 303 // W : non-ASCII word character |
| 304 // d : ASCII digit |
| 305 // D : non-ASCII digit |
| 306 // . : non-unicode non-newline |
| 307 // * : All characters |
| 308 uc16 standard_type() { return set_.standard_set_type(); } |
| 309 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } |
| 310 bool is_negated() { return is_negated_; } |
| 311 |
| 312 private: |
| 313 CharacterSet set_; |
| 314 bool is_negated_; |
| 315 }; |
| 316 |
| 317 |
| 318 class RegExpAtom final : public RegExpTree { |
| 319 public: |
| 320 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} |
| 321 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 322 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 323 RegExpAtom* AsAtom() override; |
| 324 bool IsAtom() override; |
| 325 bool IsTextElement() override { return true; } |
| 326 int min_match() override { return data_.length(); } |
| 327 int max_match() override { return data_.length(); } |
| 328 void AppendToText(RegExpText* text, Zone* zone) override; |
| 329 Vector<const uc16> data() { return data_; } |
| 330 int length() { return data_.length(); } |
| 331 |
| 332 private: |
| 333 Vector<const uc16> data_; |
| 334 }; |
| 335 |
| 336 |
| 337 class RegExpText final : public RegExpTree { |
| 338 public: |
| 339 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} |
| 340 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 341 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 342 RegExpText* AsText() override; |
| 343 bool IsText() override; |
| 344 bool IsTextElement() override { return true; } |
| 345 int min_match() override { return length_; } |
| 346 int max_match() override { return length_; } |
| 347 void AppendToText(RegExpText* text, Zone* zone) override; |
| 348 void AddElement(TextElement elm, Zone* zone) { |
| 349 elements_.Add(elm, zone); |
| 350 length_ += elm.length(); |
| 351 } |
| 352 ZoneList<TextElement>* elements() { return &elements_; } |
| 353 |
| 354 private: |
| 355 ZoneList<TextElement> elements_; |
| 356 int length_; |
| 357 }; |
| 358 |
| 359 |
| 360 class RegExpQuantifier final : public RegExpTree { |
| 361 public: |
| 362 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; |
| 363 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) |
| 364 : body_(body), |
| 365 min_(min), |
| 366 max_(max), |
| 367 min_match_(min * body->min_match()), |
| 368 quantifier_type_(type) { |
| 369 if (max > 0 && body->max_match() > kInfinity / max) { |
| 370 max_match_ = kInfinity; |
| 371 } else { |
| 372 max_match_ = max * body->max_match(); |
| 373 } |
| 374 } |
| 375 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 376 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 377 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, |
| 378 RegExpCompiler* compiler, RegExpNode* on_success, |
| 379 bool not_at_start = false); |
| 380 RegExpQuantifier* AsQuantifier() override; |
| 381 Interval CaptureRegisters() override; |
| 382 bool IsQuantifier() override; |
| 383 int min_match() override { return min_match_; } |
| 384 int max_match() override { return max_match_; } |
| 385 int min() { return min_; } |
| 386 int max() { return max_; } |
| 387 bool is_possessive() { return quantifier_type_ == POSSESSIVE; } |
| 388 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } |
| 389 bool is_greedy() { return quantifier_type_ == GREEDY; } |
| 390 RegExpTree* body() { return body_; } |
| 391 |
| 392 private: |
| 393 RegExpTree* body_; |
| 394 int min_; |
| 395 int max_; |
| 396 int min_match_; |
| 397 int max_match_; |
| 398 QuantifierType quantifier_type_; |
| 399 }; |
| 400 |
| 401 |
| 402 class RegExpCapture final : public RegExpTree { |
| 403 public: |
| 404 explicit RegExpCapture(int index) : body_(NULL), index_(index) {} |
| 405 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 406 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 407 static RegExpNode* ToNode(RegExpTree* body, int index, |
| 408 RegExpCompiler* compiler, RegExpNode* on_success); |
| 409 RegExpCapture* AsCapture() override; |
| 410 bool IsAnchoredAtStart() override; |
| 411 bool IsAnchoredAtEnd() override; |
| 412 Interval CaptureRegisters() override; |
| 413 bool IsCapture() override; |
| 414 int min_match() override { return body_->min_match(); } |
| 415 int max_match() override { return body_->max_match(); } |
| 416 RegExpTree* body() { return body_; } |
| 417 void set_body(RegExpTree* body) { body_ = body; } |
| 418 int index() { return index_; } |
| 419 static int StartRegister(int index) { return index * 2; } |
| 420 static int EndRegister(int index) { return index * 2 + 1; } |
| 421 |
| 422 private: |
| 423 RegExpTree* body_; |
| 424 int index_; |
| 425 }; |
| 426 |
| 427 |
| 428 class RegExpLookaround final : public RegExpTree { |
| 429 public: |
| 430 enum Type { LOOKAHEAD, LOOKBEHIND }; |
| 431 |
| 432 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, |
| 433 int capture_from, Type type) |
| 434 : body_(body), |
| 435 is_positive_(is_positive), |
| 436 capture_count_(capture_count), |
| 437 capture_from_(capture_from), |
| 438 type_(type) {} |
| 439 |
| 440 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 441 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 442 RegExpLookaround* AsLookaround() override; |
| 443 Interval CaptureRegisters() override; |
| 444 bool IsLookaround() override; |
| 445 bool IsAnchoredAtStart() override; |
| 446 int min_match() override { return 0; } |
| 447 int max_match() override { return 0; } |
| 448 RegExpTree* body() { return body_; } |
| 449 bool is_positive() { return is_positive_; } |
| 450 int capture_count() { return capture_count_; } |
| 451 int capture_from() { return capture_from_; } |
| 452 Type type() { return type_; } |
| 453 |
| 454 private: |
| 455 RegExpTree* body_; |
| 456 bool is_positive_; |
| 457 int capture_count_; |
| 458 int capture_from_; |
| 459 Type type_; |
| 460 }; |
| 461 |
| 462 |
| 463 class RegExpBackReference final : public RegExpTree { |
| 464 public: |
| 465 explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {} |
| 466 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 467 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 468 RegExpBackReference* AsBackReference() override; |
| 469 bool IsBackReference() override; |
| 470 int min_match() override { return 0; } |
| 471 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite |
| 472 // recursion, we give up. Ignorance is bliss. |
| 473 int max_match() override { return kInfinity; } |
| 474 int index() { return capture_->index(); } |
| 475 RegExpCapture* capture() { return capture_; } |
| 476 |
| 477 private: |
| 478 RegExpCapture* capture_; |
| 479 }; |
| 480 |
| 481 |
| 482 class RegExpEmpty final : public RegExpTree { |
| 483 public: |
| 484 RegExpEmpty() {} |
| 485 void* Accept(RegExpVisitor* visitor, void* data) override; |
| 486 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; |
| 487 RegExpEmpty* AsEmpty() override; |
| 488 bool IsEmpty() override; |
| 489 int min_match() override { return 0; } |
| 490 int max_match() override { return 0; } |
| 491 }; |
| 492 |
| 493 } // namespace internal |
| 494 } // namespace v8 |
| 495 |
| 496 #endif // V8_REGEXP_REGEXP_AST_H_ |
OLD | NEW |