Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(451)

Side by Side Diff: src/regexp/regexp-parser.h

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix test compile Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-macro-assembler.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_
7
8 #include "src/objects.h"
9 #include "src/regexp/regexp-ast.h"
10 #include "src/zone.h"
11
12 namespace v8 {
13 namespace internal {
14
15 struct RegExpCompileData;
16
17
18 // A BufferedZoneList is an automatically growing list, just like (and backed
19 // by) a ZoneList, that is optimized for the case of adding and removing
20 // a single element. The last element added is stored outside the backing list,
21 // and if no more than one element is ever added, the ZoneList isn't even
22 // allocated.
23 // Elements must not be NULL pointers.
24 template <typename T, int initial_size>
25 class BufferedZoneList {
26 public:
27 BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29 // Adds element at end of list. This element is buffered and can
30 // be read using last() or removed using RemoveLast until a new Add or until
31 // RemoveLast or GetList has been called.
32 void Add(T* value, Zone* zone) {
33 if (last_ != NULL) {
34 if (list_ == NULL) {
35 list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 }
37 list_->Add(last_, zone);
38 }
39 last_ = value;
40 }
41
42 T* last() {
43 DCHECK(last_ != NULL);
44 return last_;
45 }
46
47 T* RemoveLast() {
48 DCHECK(last_ != NULL);
49 T* result = last_;
50 if ((list_ != NULL) && (list_->length() > 0))
51 last_ = list_->RemoveLast();
52 else
53 last_ = NULL;
54 return result;
55 }
56
57 T* Get(int i) {
58 DCHECK((0 <= i) && (i < length()));
59 if (list_ == NULL) {
60 DCHECK_EQ(0, i);
61 return last_;
62 } else {
63 if (i == list_->length()) {
64 DCHECK(last_ != NULL);
65 return last_;
66 } else {
67 return list_->at(i);
68 }
69 }
70 }
71
72 void Clear() {
73 list_ = NULL;
74 last_ = NULL;
75 }
76
77 int length() {
78 int length = (list_ == NULL) ? 0 : list_->length();
79 return length + ((last_ == NULL) ? 0 : 1);
80 }
81
82 ZoneList<T*>* GetList(Zone* zone) {
83 if (list_ == NULL) {
84 list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 }
86 if (last_ != NULL) {
87 list_->Add(last_, zone);
88 last_ = NULL;
89 }
90 return list_;
91 }
92
93 private:
94 ZoneList<T*>* list_;
95 T* last_;
96 };
97
98
99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
    // Only zone() has an inline body; every other member function is merely
    // declared here and is defined outside this header.
100 class RegExpBuilder : public ZoneObject {
101 public:
    // NOTE(review): |zone| is presumably stored in zone_ and used for the
    // builder's allocations -- confirm against the definition.
102 explicit RegExpBuilder(Zone* zone);
103 void AddCharacter(uc16 character);
104 // "Adds" an empty expression. Does nothing except consume a
105 // following quantifier.
106 void AddEmpty();
107 void AddAtom(RegExpTree* tree);
108 void AddAssertion(RegExpTree* tree);
109 void NewAlternative(); // '|'
    // NOTE(review): |min| and |max| are presumably the repetition bounds of
    // the quantifier applied to the most recently added atom -- confirm.
110 void AddQuantifierToAtom(int min, int max,
111 RegExpQuantifier::QuantifierType type);
112 RegExpTree* ToRegExp();
113
114 private:
115 void FlushCharacters();
116 void FlushText();
117 void FlushTerms();
    // Accessor for the Zone pointer held in zone_.
118 Zone* zone() const { return zone_; }
119
    // Data members. Their declaration order is also their initialization
    // order, so do not reorder them without checking the constructor.
120 Zone* zone_;
121 bool pending_empty_;
122 ZoneList<uc16>* characters_;
    // Three buffered lists of RegExpTree pointers, each instantiated with an
    // initial_size of 2.
123 BufferedZoneList<RegExpTree, 2> terms_;
124 BufferedZoneList<RegExpTree, 2> text_;
125 BufferedZoneList<RegExpTree, 2> alternatives_;
126 #ifdef DEBUG
    // DEBUG builds only: an extra member holding one of the ADD_* enumerators.
    // LAST(x) expands to an assignment to that member; the expansion already
    // ends in ';'.
127 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
128 #define LAST(x) last_added_ = x;
129 #else
    // Non-DEBUG builds: LAST(x) expands to nothing.
130 #define LAST(x)
131 #endif
    // LAST is a preprocessor macro, so it is not scoped to this class. No
    // #undef is visible in this header, so it stays defined for code that
    // follows the include.
132 };
133
134
// Parser that, as its signatures suggest, reads regular expression source
// through a FlatStringReader and produces RegExpTree nodes. Only the trivial
// accessors/setters and the nested RegExpParserState have inline bodies here;
// every other member function is just declared and is defined outside this
// header.
135 class RegExpParser BASE_EMBEDDED {
136 public:
137 RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
138 bool unicode, Isolate* isolate, Zone* zone);
139
    // Static entry point taking the input reader, the two flags and a
    // RegExpCompileData out-parameter.
    // NOTE(review): presumably returns true on success and fills |*result| --
    // confirm against the definition.
140 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
141 bool multiline, bool unicode,
142 RegExpCompileData* result);
143
144 RegExpTree* ParsePattern();
145 RegExpTree* ParseDisjunction();
146 RegExpTree* ParseGroup();
147 RegExpTree* ParseCharacterClass();
148
149 // Parses a {...,...} quantifier and stores the range in the given
150 // out parameters.
151 bool ParseIntervalQuantifier(int* min_out, int* max_out);
152
153 // Parses and returns a single escaped character. The character
154 // must not be 'b' or 'B' since they are usually handled specially.
155 uc32 ParseClassCharacterEscape();
156
157 // Checks whether the following is a length-digit hexadecimal number,
158 // and sets the value if it is.
159 bool ParseHexEscape(int length, uc32* value);
160 bool ParseUnicodeEscape(uc32* value);
161 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
162
163 uc32 ParseOctalLiteral();
164
165 // Tries to parse the input as a back reference. If successful it
166 // stores the result in the output parameter and returns true. If
167 // it fails it will push back the characters read so the same characters
168 // can be reparsed.
169 bool ParseBackReferenceIndex(int* index_out);
170
171 CharacterRange ParseClassAtom(uc16* char_class);
    // NOTE(review): returns a RegExpTree* so it can be used directly in a
    // return statement of the Parse* functions; presumably the returned value
    // is NULL and failed() becomes true -- confirm against the definition.
172 RegExpTree* ReportError(Vector<const char> message);
173 void Advance();
174 void Advance(int dist);
175 void Reset(int pos);
176
177 // Reports whether the pattern might be used as a literal search string.
178 // Only use if the result of the parse is a single atom node.
179 bool simple();
180 bool contains_anchor() { return contains_anchor_; }
    // One-way setter: the flag can be raised here but there is no inline way
    // to clear it again.
181 void set_contains_anchor() { contains_anchor_ = true; }
182 int captures_started() { return captures_started_; }
    // Returns next_pos_ - 1.
    // NOTE(review): presumably the index of the character held in current_ --
    // confirm.
183 int position() { return next_pos_ - 1; }
184 bool failed() { return failed_; }
185
186 static bool IsSyntaxCharacter(uc32 c);
187
    // 1 << 16 == 65536.
188 static const int kMaxCaptures = 1 << 16;
    // 1 << 21 == 0x200000, which lies above the highest Unicode code point
    // (0x10FFFF).
    // NOTE(review): presumably the value current() reports once the input is
    // exhausted -- confirm.
189 static const uc32 kEndMarker = (1 << 21);
190
191 private:
192 enum SubexpressionType {
193 INITIAL,
194 CAPTURE, // All positive values represent captures.
195 POSITIVE_LOOKAROUND,
196 NEGATIVE_LOOKAROUND,
197 GROUPING
198 };
199
    // One entry in a singly linked stack of parser states, chained through
    // previous_state_. Constructing a state allocates a fresh RegExpBuilder
    // in |zone| for it.
200 class RegExpParserState : public ZoneObject {
201 public:
202 RegExpParserState(RegExpParserState* previous_state,
203 SubexpressionType group_type,
204 RegExpLookaround::Type lookaround_type,
205 int disjunction_capture_index, Zone* zone)
206 : previous_state_(previous_state),
207 builder_(new (zone) RegExpBuilder(zone)),
208 group_type_(group_type),
209 lookaround_type_(lookaround_type),
210 disjunction_capture_index_(disjunction_capture_index) {}
211 // Parser state of containing expression, if any.
212 RegExpParserState* previous_state() { return previous_state_; }
    // True when there is an enclosing state, i.e. previous_state_ is non-NULL.
213 bool IsSubexpression() { return previous_state_ != NULL; }
214 // RegExpBuilder building this regexp's AST.
215 RegExpBuilder* builder() { return builder_; }
216 // Type of regexp being parsed (parenthesized group or entire regexp).
217 SubexpressionType group_type() { return group_type_; }
218 // Lookahead or Lookbehind.
219 RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
220 // Index in captures array of first capture in this sub-expression, if any.
221 // Also the capture index of this sub-expression itself, if group_type
222 // is CAPTURE.
223 int capture_index() { return disjunction_capture_index_; }
224
225 // Check whether the parser is inside a capture group with the given index.
226 bool IsInsideCaptureGroup(int index);
227
228 private:
229 // Linked list implementation of stack of states.
230 RegExpParserState* previous_state_;
231 // Builder for the stored disjunction.
232 RegExpBuilder* builder_;
233 // Stored disjunction type (capture, look-ahead or grouping), if any.
234 SubexpressionType group_type_;
235 // Stored read direction.
236 RegExpLookaround::Type lookaround_type_;
237 // Stored disjunction's capture index (if any).
238 int disjunction_capture_index_;
239 };
240
241 // Return the 1-indexed RegExpCapture object, allocate if necessary.
242 RegExpCapture* GetCapture(int index);
243
244 Isolate* isolate() { return isolate_; }
245 Zone* zone() const { return zone_; }
246
247 uc32 current() { return current_; }
248 bool has_more() { return has_more_; }
    // True while next_pos_ is still inside the input, i.e. smaller than the
    // reader's length().
249 bool has_next() { return next_pos_ < in()->length(); }
250 uc32 Next();
251 FlatStringReader* in() { return in_; }
252 void ScanForCaptures();
253
    // Data members. Their declaration order is also their initialization
    // order, so do not reorder them without checking the constructor.
254 Isolate* isolate_;
255 Zone* zone_;
256 Handle<String>* error_;
257 ZoneList<RegExpCapture*>* captures_;
258 FlatStringReader* in_;
259 uc32 current_;
260 int next_pos_;
261 int captures_started_;
262 // The capture count is only valid after we have scanned for captures.
263 int capture_count_;
264 bool has_more_;
265 bool multiline_;
266 bool unicode_;
267 bool simple_;
268 bool contains_anchor_;
269 bool is_scanned_for_captures_;
270 bool failed_;
271 };
272
273 } // namespace internal
274 } // namespace v8
275
276 #endif // V8_REGEXP_REGEXP_PARSER_H_
OLDNEW
« no previous file with comments | « src/regexp/regexp-macro-assembler.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698