Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(683)

Side by Side Diff: src/regexp/regexp-ast.h

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: more tests Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/jsregexp.cc ('k') | src/regexp/regexp-ast.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef V8_REGEXP_REGEXP_AST_H_ 5 #ifndef V8_REGEXP_REGEXP_AST_H_
6 #define V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_
7 7
8 #include "src/objects.h"
8 #include "src/utils.h" 9 #include "src/utils.h"
9 #include "src/zone.h" 10 #include "src/zone.h"
10 11
11 namespace v8 { 12 namespace v8 {
12 namespace internal { 13 namespace internal {
13 14
14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 15 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
15 VISIT(Disjunction) \ 16 VISIT(Disjunction) \
16 VISIT(Alternative) \ 17 VISIT(Alternative) \
17 VISIT(Assertion) \ 18 VISIT(Assertion) \
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
70 }; 71 };
71 72
72 73
73 // Represents code units in the range from from_ to to_, both ends are 74 // Represents code units in the range from from_ to to_, both ends are
74 // inclusive. 75 // inclusive.
75 class CharacterRange { 76 class CharacterRange {
76 public: 77 public:
77 CharacterRange() : from_(0), to_(0) {} 78 CharacterRange() : from_(0), to_(0) {}
78 // For compatibility with the CHECK_OK macro 79 // For compatibility with the CHECK_OK macro
79 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 80 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
80 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} 81 CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
81 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, 82 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
82 Zone* zone); 83 Zone* zone);
83 static Vector<const int> GetWordBounds(); 84 static Vector<const int> GetWordBounds();
84 static inline CharacterRange Singleton(uc16 value) { 85 static inline CharacterRange Singleton(uc32 value) {
85 return CharacterRange(value, value); 86 return CharacterRange(value, value);
86 } 87 }
87 static inline CharacterRange Range(uc16 from, uc16 to) { 88 static inline CharacterRange Range(uc32 from, uc32 to) {
88 DCHECK(from <= to); 89 DCHECK(0 <= from && to <= String::kMaxCodePoint);
90 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
89 return CharacterRange(from, to); 91 return CharacterRange(from, to);
90 } 92 }
91 static inline CharacterRange Everything() { 93 static inline CharacterRange Everything() {
92 return CharacterRange(0, 0xFFFF); 94 return CharacterRange(0, String::kMaxCodePoint);
93 } 95 }
94 bool Contains(uc16 i) { return from_ <= i && i <= to_; } 96 static inline ZoneList<CharacterRange>* List(Zone* zone,
95 uc16 from() const { return from_; } 97 CharacterRange range) {
96 void set_from(uc16 value) { from_ = value; } 98 ZoneList<CharacterRange>* list =
97 uc16 to() const { return to_; } 99 new (zone) ZoneList<CharacterRange>(1, zone);
98 void set_to(uc16 value) { to_ = value; } 100 list->Add(range, zone);
101 return list;
102 }
103 bool Contains(uc32 i) { return from_ <= i && i <= to_; }
104 uc32 from() const { return from_; }
105 void set_from(uc32 value) { from_ = value; }
106 uc32 to() const { return to_; }
107 void set_to(uc32 value) { to_ = value; }
99 bool is_valid() { return from_ <= to_; } 108 bool is_valid() { return from_ <= to_; }
100 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 109 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
101 bool IsSingleton() { return (from_ == to_); } 110 bool IsSingleton() { return (from_ == to_); }
102 void AddCaseEquivalents(Isolate* isolate, Zone* zone, 111 void AddCaseEquivalents(Isolate* isolate, Zone* zone,
103 ZoneList<CharacterRange>* ranges, bool is_one_byte); 112 ZoneList<CharacterRange>* ranges, bool is_one_byte);
104 static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
105 ZoneList<CharacterRange>** included,
106 ZoneList<CharacterRange>** excluded, Zone* zone);
107 // Whether a range list is in canonical form: Ranges ordered by from value, 113 // Whether a range list is in canonical form: Ranges ordered by from value,
108 // and ranges non-overlapping and non-adjacent. 114 // and ranges non-overlapping and non-adjacent.
109 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 115 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
110 // Convert range list to canonical form. The characters covered by the ranges 116 // Convert range list to canonical form. The characters covered by the ranges
111 // will still be the same, but no character is in more than one range, and 117 // will still be the same, but no character is in more than one range, and
112 // adjacent ranges are merged. The resulting list may be shorter than the 118 // adjacent ranges are merged. The resulting list may be shorter than the
113 // original, but cannot be longer. 119 // original, but cannot be longer.
114 static void Canonicalize(ZoneList<CharacterRange>* ranges); 120 static void Canonicalize(ZoneList<CharacterRange>* ranges);
115 // Negate the contents of a character range in canonical form. 121 // Negate the contents of a character range in canonical form.
116 static void Negate(ZoneList<CharacterRange>* src, 122 static void Negate(ZoneList<CharacterRange>* src,
117 ZoneList<CharacterRange>* dst, Zone* zone); 123 ZoneList<CharacterRange>* dst, Zone* zone);
118 static const int kStartMarker = (1 << 24); 124 static const int kStartMarker = (1 << 24);
119 static const int kPayloadMask = (1 << 24) - 1; 125 static const int kPayloadMask = (1 << 24) - 1;
120 126
121 private: 127 private:
122 uc16 from_; 128 uc32 from_;
123 uc16 to_; 129 uc32 to_;
124 }; 130 };
125 131
126 132
127 class CharacterSet final BASE_EMBEDDED { 133 class CharacterSet final BASE_EMBEDDED {
128 public: 134 public:
129 explicit CharacterSet(uc16 standard_set_type) 135 explicit CharacterSet(uc16 standard_set_type)
130 : ranges_(NULL), standard_set_type_(standard_set_type) {} 136 : ranges_(NULL), standard_set_type_(standard_set_type) {}
131 explicit CharacterSet(ZoneList<CharacterRange>* ranges) 137 explicit CharacterSet(ZoneList<CharacterRange>* ranges)
132 : ranges_(ranges), standard_set_type_(0) {} 138 : ranges_(ranges), standard_set_type_(0) {}
133 ZoneList<CharacterRange>* ranges(Zone* zone); 139 ZoneList<CharacterRange>* ranges(Zone* zone);
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
280 class RegExpCharacterClass final : public RegExpTree { 286 class RegExpCharacterClass final : public RegExpTree {
281 public: 287 public:
282 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) 288 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
283 : set_(ranges), is_negated_(is_negated) {} 289 : set_(ranges), is_negated_(is_negated) {}
284 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} 290 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
285 void* Accept(RegExpVisitor* visitor, void* data) override; 291 void* Accept(RegExpVisitor* visitor, void* data) override;
286 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 292 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
287 RegExpCharacterClass* AsCharacterClass() override; 293 RegExpCharacterClass* AsCharacterClass() override;
288 bool IsCharacterClass() override; 294 bool IsCharacterClass() override;
289 bool IsTextElement() override { return true; } 295 bool IsTextElement() override { return true; }
296 bool NeedsDesugaringForUnicode(Zone* zone);
290 int min_match() override { return 1; } 297 int min_match() override { return 1; }
291 int max_match() override { return 1; } 298 int max_match() override { return 1; }
292 void AppendToText(RegExpText* text, Zone* zone) override; 299 void AppendToText(RegExpText* text, Zone* zone) override;
293 CharacterSet character_set() { return set_; } 300 CharacterSet character_set() { return set_; }
294 // TODO(lrn): Remove need for complex version if is_standard that 301 // TODO(lrn): Remove need for complex version if is_standard that
295 // recognizes a mangled standard set and just do { return set_.is_special(); } 302 // recognizes a mangled standard set and just do { return set_.is_special(); }
296 bool is_standard(Zone* zone); 303 bool is_standard(Zone* zone);
297 // Returns a value representing the standard character set if is_standard() 304 // Returns a value representing the standard character set if is_standard()
298 // returns true. 305 // returns true.
299 // Currently used values are: 306 // Currently used values are:
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after
444 bool IsLookaround() override; 451 bool IsLookaround() override;
445 bool IsAnchoredAtStart() override; 452 bool IsAnchoredAtStart() override;
446 int min_match() override { return 0; } 453 int min_match() override { return 0; }
447 int max_match() override { return 0; } 454 int max_match() override { return 0; }
448 RegExpTree* body() { return body_; } 455 RegExpTree* body() { return body_; }
449 bool is_positive() { return is_positive_; } 456 bool is_positive() { return is_positive_; }
450 int capture_count() { return capture_count_; } 457 int capture_count() { return capture_count_; }
451 int capture_from() { return capture_from_; } 458 int capture_from() { return capture_from_; }
452 Type type() { return type_; } 459 Type type() { return type_; }
453 460
461 class Builder {
462 public:
463 Builder(bool is_positive, RegExpNode* on_success,
464 int stack_pointer_register, int position_register,
465 int capture_register_count = 0, int capture_register_start = 0);
466 RegExpNode* on_match_success() { return on_match_success_; }
467 RegExpNode* ForMatch(RegExpNode* match);
468
469 private:
470 bool is_positive_;
471 RegExpNode* on_match_success_;
472 RegExpNode* on_success_;
473 int stack_pointer_register_;
474 int position_register_;
475 };
476
454 private: 477 private:
455 RegExpTree* body_; 478 RegExpTree* body_;
456 bool is_positive_; 479 bool is_positive_;
457 int capture_count_; 480 int capture_count_;
458 int capture_from_; 481 int capture_from_;
459 Type type_; 482 Type type_;
460 }; 483 };
461 484
462 485
463 class RegExpBackReference final : public RegExpTree { 486 class RegExpBackReference final : public RegExpTree {
(...skipping 23 matching lines...) Expand all
487 RegExpEmpty* AsEmpty() override; 510 RegExpEmpty* AsEmpty() override;
488 bool IsEmpty() override; 511 bool IsEmpty() override;
489 int min_match() override { return 0; } 512 int min_match() override { return 0; }
490 int max_match() override { return 0; } 513 int max_match() override { return 0; }
491 }; 514 };
492 515
493 } // namespace internal 516 } // namespace internal
494 } // namespace v8 517 } // namespace v8
495 518
496 #endif // V8_REGEXP_REGEXP_AST_H_ 519 #endif // V8_REGEXP_REGEXP_AST_H_
OLDNEW
« no previous file with comments | « src/regexp/jsregexp.cc ('k') | src/regexp/regexp-ast.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698