Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Side by Side Diff: src/regexp/regexp-ast.h

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix test compile Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/jsregexp.cc ('k') | src/regexp/regexp-ast.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_REGEXP_AST_H_
6 #define V8_REGEXP_REGEXP_AST_H_
7
8 #include "src/utils.h"
9 #include "src/zone.h"
10
11 namespace v8 {
12 namespace internal {
13
14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
15 VISIT(Disjunction) \
16 VISIT(Alternative) \
17 VISIT(Assertion) \
18 VISIT(CharacterClass) \
19 VISIT(Atom) \
20 VISIT(Quantifier) \
21 VISIT(Capture) \
22 VISIT(Lookaround) \
23 VISIT(BackReference) \
24 VISIT(Empty) \
25 VISIT(Text)
26
27
28 #define FORWARD_DECLARE(Name) class RegExp##Name;
29 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
30 #undef FORWARD_DECLARE
31
32 class RegExpCompiler;
33 class RegExpNode;
34 class RegExpTree;
35
36
37 class RegExpVisitor BASE_EMBEDDED {
38 public:
39 virtual ~RegExpVisitor() {}
40 #define MAKE_CASE(Name) \
41 virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
42 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
43 #undef MAKE_CASE
44 };
45
46
47 // A simple closed interval.
48 class Interval {
49 public:
50 Interval() : from_(kNone), to_(kNone) {}
51 Interval(int from, int to) : from_(from), to_(to) {}
52 Interval Union(Interval that) {
53 if (that.from_ == kNone)
54 return *this;
55 else if (from_ == kNone)
56 return that;
57 else
58 return Interval(Min(from_, that.from_), Max(to_, that.to_));
59 }
60 bool Contains(int value) { return (from_ <= value) && (value <= to_); }
61 bool is_empty() { return from_ == kNone; }
62 int from() const { return from_; }
63 int to() const { return to_; }
64 static Interval Empty() { return Interval(); }
65 static const int kNone = -1;
66
67 private:
68 int from_;
69 int to_;
70 };
71
72
73 // Represents code units in the range from from_ to to_, both ends are
74 // inclusive.
75 class CharacterRange {
76 public:
77 CharacterRange() : from_(0), to_(0) {}
78 // For compatibility with the CHECK_OK macro
79 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
80 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
81 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
82 Zone* zone);
83 static Vector<const int> GetWordBounds();
84 static inline CharacterRange Singleton(uc16 value) {
85 return CharacterRange(value, value);
86 }
87 static inline CharacterRange Range(uc16 from, uc16 to) {
88 DCHECK(from <= to);
89 return CharacterRange(from, to);
90 }
91 static inline CharacterRange Everything() {
92 return CharacterRange(0, 0xFFFF);
93 }
94 bool Contains(uc16 i) { return from_ <= i && i <= to_; }
95 uc16 from() const { return from_; }
96 void set_from(uc16 value) { from_ = value; }
97 uc16 to() const { return to_; }
98 void set_to(uc16 value) { to_ = value; }
99 bool is_valid() { return from_ <= to_; }
100 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
101 bool IsSingleton() { return (from_ == to_); }
102 void AddCaseEquivalents(Isolate* isolate, Zone* zone,
103 ZoneList<CharacterRange>* ranges, bool is_one_byte);
104 static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
105 ZoneList<CharacterRange>** included,
106 ZoneList<CharacterRange>** excluded, Zone* zone);
107 // Whether a range list is in canonical form: Ranges ordered by from value,
108 // and ranges non-overlapping and non-adjacent.
109 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
110 // Convert range list to canonical form. The characters covered by the ranges
111 // will still be the same, but no character is in more than one range, and
112 // adjacent ranges are merged. The resulting list may be shorter than the
113 // original, but cannot be longer.
114 static void Canonicalize(ZoneList<CharacterRange>* ranges);
115 // Negate the contents of a character range in canonical form.
116 static void Negate(ZoneList<CharacterRange>* src,
117 ZoneList<CharacterRange>* dst, Zone* zone);
118 static const int kStartMarker = (1 << 24);
119 static const int kPayloadMask = (1 << 24) - 1;
120
121 private:
122 uc16 from_;
123 uc16 to_;
124 };
125
126
127 class CharacterSet final BASE_EMBEDDED {
128 public:
129 explicit CharacterSet(uc16 standard_set_type)
130 : ranges_(NULL), standard_set_type_(standard_set_type) {}
131 explicit CharacterSet(ZoneList<CharacterRange>* ranges)
132 : ranges_(ranges), standard_set_type_(0) {}
133 ZoneList<CharacterRange>* ranges(Zone* zone);
134 uc16 standard_set_type() { return standard_set_type_; }
135 void set_standard_set_type(uc16 special_set_type) {
136 standard_set_type_ = special_set_type;
137 }
138 bool is_standard() { return standard_set_type_ != 0; }
139 void Canonicalize();
140
141 private:
142 ZoneList<CharacterRange>* ranges_;
143 // If non-zero, the value represents a standard set (e.g., all whitespace
144 // characters) without having to expand the ranges.
145 uc16 standard_set_type_;
146 };
147
148
149 class TextElement final BASE_EMBEDDED {
150 public:
151 enum TextType { ATOM, CHAR_CLASS };
152
153 static TextElement Atom(RegExpAtom* atom);
154 static TextElement CharClass(RegExpCharacterClass* char_class);
155
156 int cp_offset() const { return cp_offset_; }
157 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
158 int length() const;
159
160 TextType text_type() const { return text_type_; }
161
162 RegExpTree* tree() const { return tree_; }
163
164 RegExpAtom* atom() const {
165 DCHECK(text_type() == ATOM);
166 return reinterpret_cast<RegExpAtom*>(tree());
167 }
168
169 RegExpCharacterClass* char_class() const {
170 DCHECK(text_type() == CHAR_CLASS);
171 return reinterpret_cast<RegExpCharacterClass*>(tree());
172 }
173
174 private:
175 TextElement(TextType text_type, RegExpTree* tree)
176 : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
177
178 int cp_offset_;
179 TextType text_type_;
180 RegExpTree* tree_;
181 };
182
183
184 class RegExpTree : public ZoneObject {
185 public:
186 static const int kInfinity = kMaxInt;
187 virtual ~RegExpTree() {}
188 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
189 virtual RegExpNode* ToNode(RegExpCompiler* compiler,
190 RegExpNode* on_success) = 0;
191 virtual bool IsTextElement() { return false; }
192 virtual bool IsAnchoredAtStart() { return false; }
193 virtual bool IsAnchoredAtEnd() { return false; }
194 virtual int min_match() = 0;
195 virtual int max_match() = 0;
196 // Returns the interval of registers used for captures within this
197 // expression.
198 virtual Interval CaptureRegisters() { return Interval::Empty(); }
199 virtual void AppendToText(RegExpText* text, Zone* zone);
200 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT
201 #define MAKE_ASTYPE(Name) \
202 virtual RegExp##Name* As##Name(); \
203 virtual bool Is##Name();
204 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
205 #undef MAKE_ASTYPE
206 };
207
208
209 class RegExpDisjunction final : public RegExpTree {
210 public:
211 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
212 void* Accept(RegExpVisitor* visitor, void* data) override;
213 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
214 RegExpDisjunction* AsDisjunction() override;
215 Interval CaptureRegisters() override;
216 bool IsDisjunction() override;
217 bool IsAnchoredAtStart() override;
218 bool IsAnchoredAtEnd() override;
219 int min_match() override { return min_match_; }
220 int max_match() override { return max_match_; }
221 ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
222
223 private:
224 bool SortConsecutiveAtoms(RegExpCompiler* compiler);
225 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
226 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
227 ZoneList<RegExpTree*>* alternatives_;
228 int min_match_;
229 int max_match_;
230 };
231
232
233 class RegExpAlternative final : public RegExpTree {
234 public:
235 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
236 void* Accept(RegExpVisitor* visitor, void* data) override;
237 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
238 RegExpAlternative* AsAlternative() override;
239 Interval CaptureRegisters() override;
240 bool IsAlternative() override;
241 bool IsAnchoredAtStart() override;
242 bool IsAnchoredAtEnd() override;
243 int min_match() override { return min_match_; }
244 int max_match() override { return max_match_; }
245 ZoneList<RegExpTree*>* nodes() { return nodes_; }
246
247 private:
248 ZoneList<RegExpTree*>* nodes_;
249 int min_match_;
250 int max_match_;
251 };
252
253
254 class RegExpAssertion final : public RegExpTree {
255 public:
256 enum AssertionType {
257 START_OF_LINE,
258 START_OF_INPUT,
259 END_OF_LINE,
260 END_OF_INPUT,
261 BOUNDARY,
262 NON_BOUNDARY
263 };
264 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
265 void* Accept(RegExpVisitor* visitor, void* data) override;
266 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
267 RegExpAssertion* AsAssertion() override;
268 bool IsAssertion() override;
269 bool IsAnchoredAtStart() override;
270 bool IsAnchoredAtEnd() override;
271 int min_match() override { return 0; }
272 int max_match() override { return 0; }
273 AssertionType assertion_type() { return assertion_type_; }
274
275 private:
276 AssertionType assertion_type_;
277 };
278
279
280 class RegExpCharacterClass final : public RegExpTree {
281 public:
282 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
283 : set_(ranges), is_negated_(is_negated) {}
284 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
285 void* Accept(RegExpVisitor* visitor, void* data) override;
286 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
287 RegExpCharacterClass* AsCharacterClass() override;
288 bool IsCharacterClass() override;
289 bool IsTextElement() override { return true; }
290 int min_match() override { return 1; }
291 int max_match() override { return 1; }
292 void AppendToText(RegExpText* text, Zone* zone) override;
293 CharacterSet character_set() { return set_; }
294 // TODO(lrn): Remove need for complex version if is_standard that
295 // recognizes a mangled standard set and just do { return set_.is_special(); }
296 bool is_standard(Zone* zone);
297 // Returns a value representing the standard character set if is_standard()
298 // returns true.
299 // Currently used values are:
300 // s : unicode whitespace
301 // S : unicode non-whitespace
302 // w : ASCII word character (digit, letter, underscore)
303 // W : non-ASCII word character
304 // d : ASCII digit
305 // D : non-ASCII digit
306 // . : non-unicode non-newline
307 // * : All characters
308 uc16 standard_type() { return set_.standard_set_type(); }
309 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
310 bool is_negated() { return is_negated_; }
311
312 private:
313 CharacterSet set_;
314 bool is_negated_;
315 };
316
317
318 class RegExpAtom final : public RegExpTree {
319 public:
320 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
321 void* Accept(RegExpVisitor* visitor, void* data) override;
322 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
323 RegExpAtom* AsAtom() override;
324 bool IsAtom() override;
325 bool IsTextElement() override { return true; }
326 int min_match() override { return data_.length(); }
327 int max_match() override { return data_.length(); }
328 void AppendToText(RegExpText* text, Zone* zone) override;
329 Vector<const uc16> data() { return data_; }
330 int length() { return data_.length(); }
331
332 private:
333 Vector<const uc16> data_;
334 };
335
336
337 class RegExpText final : public RegExpTree {
338 public:
339 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
340 void* Accept(RegExpVisitor* visitor, void* data) override;
341 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
342 RegExpText* AsText() override;
343 bool IsText() override;
344 bool IsTextElement() override { return true; }
345 int min_match() override { return length_; }
346 int max_match() override { return length_; }
347 void AppendToText(RegExpText* text, Zone* zone) override;
348 void AddElement(TextElement elm, Zone* zone) {
349 elements_.Add(elm, zone);
350 length_ += elm.length();
351 }
352 ZoneList<TextElement>* elements() { return &elements_; }
353
354 private:
355 ZoneList<TextElement> elements_;
356 int length_;
357 };
358
359
360 class RegExpQuantifier final : public RegExpTree {
361 public:
362 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
363 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
364 : body_(body),
365 min_(min),
366 max_(max),
367 min_match_(min * body->min_match()),
368 quantifier_type_(type) {
369 if (max > 0 && body->max_match() > kInfinity / max) {
370 max_match_ = kInfinity;
371 } else {
372 max_match_ = max * body->max_match();
373 }
374 }
375 void* Accept(RegExpVisitor* visitor, void* data) override;
376 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
377 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
378 RegExpCompiler* compiler, RegExpNode* on_success,
379 bool not_at_start = false);
380 RegExpQuantifier* AsQuantifier() override;
381 Interval CaptureRegisters() override;
382 bool IsQuantifier() override;
383 int min_match() override { return min_match_; }
384 int max_match() override { return max_match_; }
385 int min() { return min_; }
386 int max() { return max_; }
387 bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
388 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
389 bool is_greedy() { return quantifier_type_ == GREEDY; }
390 RegExpTree* body() { return body_; }
391
392 private:
393 RegExpTree* body_;
394 int min_;
395 int max_;
396 int min_match_;
397 int max_match_;
398 QuantifierType quantifier_type_;
399 };
400
401
402 class RegExpCapture final : public RegExpTree {
403 public:
404 explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
405 void* Accept(RegExpVisitor* visitor, void* data) override;
406 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
407 static RegExpNode* ToNode(RegExpTree* body, int index,
408 RegExpCompiler* compiler, RegExpNode* on_success);
409 RegExpCapture* AsCapture() override;
410 bool IsAnchoredAtStart() override;
411 bool IsAnchoredAtEnd() override;
412 Interval CaptureRegisters() override;
413 bool IsCapture() override;
414 int min_match() override { return body_->min_match(); }
415 int max_match() override { return body_->max_match(); }
416 RegExpTree* body() { return body_; }
417 void set_body(RegExpTree* body) { body_ = body; }
418 int index() { return index_; }
419 static int StartRegister(int index) { return index * 2; }
420 static int EndRegister(int index) { return index * 2 + 1; }
421
422 private:
423 RegExpTree* body_;
424 int index_;
425 };
426
427
428 class RegExpLookaround final : public RegExpTree {
429 public:
430 enum Type { LOOKAHEAD, LOOKBEHIND };
431
432 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
433 int capture_from, Type type)
434 : body_(body),
435 is_positive_(is_positive),
436 capture_count_(capture_count),
437 capture_from_(capture_from),
438 type_(type) {}
439
440 void* Accept(RegExpVisitor* visitor, void* data) override;
441 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
442 RegExpLookaround* AsLookaround() override;
443 Interval CaptureRegisters() override;
444 bool IsLookaround() override;
445 bool IsAnchoredAtStart() override;
446 int min_match() override { return 0; }
447 int max_match() override { return 0; }
448 RegExpTree* body() { return body_; }
449 bool is_positive() { return is_positive_; }
450 int capture_count() { return capture_count_; }
451 int capture_from() { return capture_from_; }
452 Type type() { return type_; }
453
454 private:
455 RegExpTree* body_;
456 bool is_positive_;
457 int capture_count_;
458 int capture_from_;
459 Type type_;
460 };
461
462
463 class RegExpBackReference final : public RegExpTree {
464 public:
465 explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
466 void* Accept(RegExpVisitor* visitor, void* data) override;
467 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
468 RegExpBackReference* AsBackReference() override;
469 bool IsBackReference() override;
470 int min_match() override { return 0; }
471 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
472 // recursion, we give up. Ignorance is bliss.
473 int max_match() override { return kInfinity; }
474 int index() { return capture_->index(); }
475 RegExpCapture* capture() { return capture_; }
476
477 private:
478 RegExpCapture* capture_;
479 };
480
481
482 class RegExpEmpty final : public RegExpTree {
483 public:
484 RegExpEmpty() {}
485 void* Accept(RegExpVisitor* visitor, void* data) override;
486 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
487 RegExpEmpty* AsEmpty() override;
488 bool IsEmpty() override;
489 int min_match() override { return 0; }
490 int max_match() override { return 0; }
491 };
492
493 } // namespace internal
494 } // namespace v8
495
496 #endif // V8_REGEXP_REGEXP_AST_H_
OLDNEW
« no previous file with comments | « src/regexp/jsregexp.cc ('k') | src/regexp/regexp-ast.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698