Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(656)

Side by Side Diff: src/regexp/regexp-parser.h

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: more tests Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-ast.cc ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_ 5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_ 6 #define V8_REGEXP_REGEXP_PARSER_H_
7 7
8 #include "src/objects.h" 8 #include "src/objects.h"
9 #include "src/regexp/regexp-ast.h" 9 #include "src/regexp/regexp-ast.h"
10 #include "src/zone.h" 10 #include "src/zone.h"
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
92 92
93 private: 93 private:
94 ZoneList<T*>* list_; 94 ZoneList<T*>* list_;
95 T* last_; 95 T* last_;
96 }; 96 };
97 97
98 98
99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 class RegExpBuilder : public ZoneObject { 100 class RegExpBuilder : public ZoneObject {
101 public: 101 public:
102 explicit RegExpBuilder(Zone* zone); 102 RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103 void AddCharacter(uc16 character); 103 void AddCharacter(uc16 character);
104 void AddUnicodeCharacter(uc32 character); 104 void AddUnicodeCharacter(uc32 character);
105 // "Adds" an empty expression. Does nothing except consume a 105 // "Adds" an empty expression. Does nothing except consume a
106 // following quantifier 106 // following quantifier
107 void AddEmpty(); 107 void AddEmpty();
108 void AddCharacterClass(RegExpCharacterClass* cc);
108 void AddAtom(RegExpTree* tree); 109 void AddAtom(RegExpTree* tree);
110 void AddTerm(RegExpTree* tree);
109 void AddAssertion(RegExpTree* tree); 111 void AddAssertion(RegExpTree* tree);
110 void NewAlternative(); // '|' 112 void NewAlternative(); // '|'
111 void AddQuantifierToAtom(int min, int max, 113 void AddQuantifierToAtom(int min, int max,
112 RegExpQuantifier::QuantifierType type); 114 RegExpQuantifier::QuantifierType type);
113 RegExpTree* ToRegExp(); 115 RegExpTree* ToRegExp();
114 116
115 private: 117 private:
118 static const uc16 kNoPendingSurrogate = 0;
119 void AddLeadSurrogate(uc16 lead_surrogate);
120 void AddTrailSurrogate(uc16 trail_surrogate);
121 void FlushPendingSurrogate();
116 void FlushCharacters(); 122 void FlushCharacters();
117 void FlushText(); 123 void FlushText();
118 void FlushTerms(); 124 void FlushTerms();
119 Zone* zone() const { return zone_; } 125 Zone* zone() const { return zone_; }
126 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
120 127
121 Zone* zone_; 128 Zone* zone_;
122 bool pending_empty_; 129 bool pending_empty_;
130 JSRegExp::Flags flags_;
123 ZoneList<uc16>* characters_; 131 ZoneList<uc16>* characters_;
132 uc16 pending_surrogate_;
124 BufferedZoneList<RegExpTree, 2> terms_; 133 BufferedZoneList<RegExpTree, 2> terms_;
125 BufferedZoneList<RegExpTree, 2> text_; 134 BufferedZoneList<RegExpTree, 2> text_;
126 BufferedZoneList<RegExpTree, 2> alternatives_; 135 BufferedZoneList<RegExpTree, 2> alternatives_;
127 #ifdef DEBUG 136 #ifdef DEBUG
128 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; 137 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
129 #define LAST(x) last_added_ = x; 138 #define LAST(x) last_added_ = x;
130 #else 139 #else
131 #define LAST(x) 140 #define LAST(x)
132 #endif 141 #endif
133 }; 142 };
134 143
135 144
136 class RegExpParser BASE_EMBEDDED { 145 class RegExpParser BASE_EMBEDDED {
137 public: 146 public:
138 RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode, 147 RegExpParser(FlatStringReader* in, Handle<String>* error,
139 bool unicode, Isolate* isolate, Zone* zone); 148 JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
140 149
141 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, 150 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
142 bool multiline, bool unicode, 151 JSRegExp::Flags flags, RegExpCompileData* result);
143 RegExpCompileData* result);
144 152
145 RegExpTree* ParsePattern(); 153 RegExpTree* ParsePattern();
146 RegExpTree* ParseDisjunction(); 154 RegExpTree* ParseDisjunction();
147 RegExpTree* ParseGroup(); 155 RegExpTree* ParseGroup();
148 RegExpTree* ParseCharacterClass(); 156 RegExpTree* ParseCharacterClass();
149 157
150 // Parses a {...,...} quantifier and stores the range in the given 158 // Parses a {...,...} quantifier and stores the range in the given
151 // out parameters. 159 // out parameters.
152 bool ParseIntervalQuantifier(int* min_out, int* max_out); 160 bool ParseIntervalQuantifier(int* min_out, int* max_out);
153 161
(...skipping 22 matching lines...) Expand all
176 void Reset(int pos); 184 void Reset(int pos);
177 185
178 // Reports whether the pattern might be used as a literal search string. 186 // Reports whether the pattern might be used as a literal search string.
179 // Only use if the result of the parse is a single atom node. 187 // Only use if the result of the parse is a single atom node.
180 bool simple(); 188 bool simple();
181 bool contains_anchor() { return contains_anchor_; } 189 bool contains_anchor() { return contains_anchor_; }
182 void set_contains_anchor() { contains_anchor_ = true; } 190 void set_contains_anchor() { contains_anchor_ = true; }
183 int captures_started() { return captures_started_; } 191 int captures_started() { return captures_started_; }
184 int position() { return next_pos_ - 1; } 192 int position() { return next_pos_ - 1; }
185 bool failed() { return failed_; } 193 bool failed() { return failed_; }
194 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
195 bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
186 196
187 static bool IsSyntaxCharacter(uc32 c); 197 static bool IsSyntaxCharacter(uc32 c);
188 198
189 static const int kMaxCaptures = 1 << 16; 199 static const int kMaxCaptures = 1 << 16;
190 static const uc32 kEndMarker = (1 << 21); 200 static const uc32 kEndMarker = (1 << 21);
191 201
192 private: 202 private:
193 enum SubexpressionType { 203 enum SubexpressionType {
194 INITIAL, 204 INITIAL,
195 CAPTURE, // All positive values represent captures. 205 CAPTURE, // All positive values represent captures.
196 POSITIVE_LOOKAROUND, 206 POSITIVE_LOOKAROUND,
197 NEGATIVE_LOOKAROUND, 207 NEGATIVE_LOOKAROUND,
198 GROUPING 208 GROUPING
199 }; 209 };
200 210
201 class RegExpParserState : public ZoneObject { 211 class RegExpParserState : public ZoneObject {
202 public: 212 public:
203 RegExpParserState(RegExpParserState* previous_state, 213 RegExpParserState(RegExpParserState* previous_state,
204 SubexpressionType group_type, 214 SubexpressionType group_type,
205 RegExpLookaround::Type lookaround_type, 215 RegExpLookaround::Type lookaround_type,
206 int disjunction_capture_index, Zone* zone) 216 int disjunction_capture_index, JSRegExp::Flags flags,
217 Zone* zone)
207 : previous_state_(previous_state), 218 : previous_state_(previous_state),
208 builder_(new (zone) RegExpBuilder(zone)), 219 builder_(new (zone) RegExpBuilder(zone, flags)),
209 group_type_(group_type), 220 group_type_(group_type),
210 lookaround_type_(lookaround_type), 221 lookaround_type_(lookaround_type),
211 disjunction_capture_index_(disjunction_capture_index) {} 222 disjunction_capture_index_(disjunction_capture_index) {}
212 // Parser state of containing expression, if any. 223 // Parser state of containing expression, if any.
213 RegExpParserState* previous_state() { return previous_state_; } 224 RegExpParserState* previous_state() { return previous_state_; }
214 bool IsSubexpression() { return previous_state_ != NULL; } 225 bool IsSubexpression() { return previous_state_ != NULL; }
215 // RegExpBuilder building this regexp's AST. 226 // RegExpBuilder building this regexp's AST.
216 RegExpBuilder* builder() { return builder_; } 227 RegExpBuilder* builder() { return builder_; }
217 // Type of regexp being parsed (parenthesized group or entire regexp). 228 // Type of regexp being parsed (parenthesized group or entire regexp).
218 SubexpressionType group_type() { return group_type_; } 229 SubexpressionType group_type() { return group_type_; }
(...skipping 23 matching lines...) Expand all
242 // Return the 1-indexed RegExpCapture object, allocate if necessary. 253 // Return the 1-indexed RegExpCapture object, allocate if necessary.
243 RegExpCapture* GetCapture(int index); 254 RegExpCapture* GetCapture(int index);
244 255
245 Isolate* isolate() { return isolate_; } 256 Isolate* isolate() { return isolate_; }
246 Zone* zone() const { return zone_; } 257 Zone* zone() const { return zone_; }
247 258
248 uc32 current() { return current_; } 259 uc32 current() { return current_; }
249 bool has_more() { return has_more_; } 260 bool has_more() { return has_more_; }
250 bool has_next() { return next_pos_ < in()->length(); } 261 bool has_next() { return next_pos_ < in()->length(); }
251 uc32 Next(); 262 uc32 Next();
263 template <bool update_position>
264 uc32 ReadNext();
252 FlatStringReader* in() { return in_; } 265 FlatStringReader* in() { return in_; }
253 void ScanForCaptures(); 266 void ScanForCaptures();
254 267
255 Isolate* isolate_; 268 Isolate* isolate_;
256 Zone* zone_; 269 Zone* zone_;
257 Handle<String>* error_; 270 Handle<String>* error_;
258 ZoneList<RegExpCapture*>* captures_; 271 ZoneList<RegExpCapture*>* captures_;
259 FlatStringReader* in_; 272 FlatStringReader* in_;
260 uc32 current_; 273 uc32 current_;
274 JSRegExp::Flags flags_;
261 int next_pos_; 275 int next_pos_;
262 int captures_started_; 276 int captures_started_;
263 // The capture count is only valid after we have scanned for captures. 277 // The capture count is only valid after we have scanned for captures.
264 int capture_count_; 278 int capture_count_;
265 bool has_more_; 279 bool has_more_;
266 bool multiline_;
267 bool unicode_;
268 bool simple_; 280 bool simple_;
269 bool contains_anchor_; 281 bool contains_anchor_;
270 bool is_scanned_for_captures_; 282 bool is_scanned_for_captures_;
271 bool failed_; 283 bool failed_;
272 }; 284 };
273 285
274 } // namespace internal 286 } // namespace internal
275 } // namespace v8 287 } // namespace v8
276 288
277 #endif // V8_REGEXP_REGEXP_PARSER_H_ 289 #endif // V8_REGEXP_REGEXP_PARSER_H_
OLD | NEW
« no previous file with comments | « src/regexp/regexp-ast.cc ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698