Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(208)

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: more comments Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/longjump.h"
6 #include "vm/object_store.h"
5 #include "vm/regexp_parser.h" 7 #include "vm/regexp_parser.h"
6 8
7 // SNIP
8
9 namespace dart { 9 namespace dart {
10 10
11 RegExpBuilder::RegExpBuilder(Zone* zone) 11 #define I isolate()
12 : zone_(zone), 12
13 // Enables possessive quantifier syntax for testing.
14 static const bool FLAG_regexp_possessive_quantifier = false;
15
16 RegExpBuilder::RegExpBuilder()
17 : isolate_(Isolate::Current()),
13 pending_empty_(false), 18 pending_empty_(false),
14 characters_(NULL), 19 characters_(NULL),
15 terms_(), 20 terms_(),
21 text_(),
16 alternatives_() 22 alternatives_()
17 #ifdef DEBUG 23 #ifdef DEBUG
18 , last_added_(ADD_NONE) 24 , last_added_(ADD_NONE)
19 #endif 25 #endif
20 {} 26 {}
21 27
22 28
23 void RegExpBuilder::FlushCharacters() { 29 void RegExpBuilder::FlushCharacters() {
24 pending_empty_ = false; 30 pending_empty_ = false;
25 if (characters_ != NULL) { 31 if (characters_ != NULL) {
26 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector()); 32 RegExpTree* atom = new(I) RegExpAtom(characters_);
27 characters_ = NULL; 33 characters_ = NULL;
28 text_.Add(atom, zone()); 34 text_.Add(atom);
29 LAST(ADD_ATOM); 35 LAST(ADD_ATOM);
30 } 36 }
31 } 37 }
32 38
33 39
34 void RegExpBuilder::FlushText() { 40 void RegExpBuilder::FlushText() {
35 FlushCharacters(); 41 FlushCharacters();
36 int num_text = text_.length(); 42 intptr_t num_text = text_.length();
37 if (num_text == 0) { 43 if (num_text == 0) {
38 return; 44 return;
39 } else if (num_text == 1) { 45 } else if (num_text == 1) {
40 terms_.Add(text_.last(), zone()); 46 terms_.Add(text_.Last());
41 } else { 47 } else {
42 RegExpText* text = new(zone()) RegExpText(zone()); 48 RegExpText* text = new(I) RegExpText();
43 for (int i = 0; i < num_text; i++) 49 for (intptr_t i = 0; i < num_text; i++)
44 text_.Get(i)->AppendToText(text, zone()); 50 text_[i]->AppendToText(text);
45 terms_.Add(text, zone()); 51 terms_.Add(text);
46 } 52 }
47 text_.Clear(); 53 text_.Clear();
48 } 54 }
49 55
50 56
51 void RegExpBuilder::AddCharacter(uc16 c) { 57 void RegExpBuilder::AddCharacter(uint16_t c) {
52 pending_empty_ = false; 58 pending_empty_ = false;
53 if (characters_ == NULL) { 59 if (characters_ == NULL) {
54 characters_ = new(zone()) ZoneList<uc16>(4, zone()); 60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);
55 } 61 }
56 characters_->Add(c, zone()); 62 characters_->Add(c);
57 LAST(ADD_CHAR); 63 LAST(ADD_CHAR);
58 } 64 }
59 65
60 66
61 void RegExpBuilder::AddEmpty() { 67 void RegExpBuilder::AddEmpty() {
62 pending_empty_ = true; 68 pending_empty_ = true;
63 } 69 }
64 70
65 71
66 void RegExpBuilder::AddAtom(RegExpTree* term) { 72 void RegExpBuilder::AddAtom(RegExpTree* term) {
67 if (term->IsEmpty()) { 73 if (term->IsEmpty()) {
68 AddEmpty(); 74 AddEmpty();
69 return; 75 return;
70 } 76 }
71 if (term->IsTextElement()) { 77 if (term->IsTextElement()) {
72 FlushCharacters(); 78 FlushCharacters();
73 text_.Add(term, zone()); 79 text_.Add(term);
74 } else { 80 } else {
75 FlushText(); 81 FlushText();
76 terms_.Add(term, zone()); 82 terms_.Add(term);
77 } 83 }
78 LAST(ADD_ATOM); 84 LAST(ADD_ATOM);
79 } 85 }
80 86
81 87
82 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
83 FlushText(); 89 FlushText();
84 terms_.Add(assert, zone()); 90 terms_.Add(assert);
85 LAST(ADD_ASSERT); 91 LAST(ADD_ASSERT);
86 } 92 }
87 93
88 94
89 void RegExpBuilder::NewAlternative() { 95 void RegExpBuilder::NewAlternative() {
90 FlushTerms(); 96 FlushTerms();
91 } 97 }
92 98
93 99
94 void RegExpBuilder::FlushTerms() { 100 void RegExpBuilder::FlushTerms() {
95 FlushText(); 101 FlushText();
96 int num_terms = terms_.length(); 102 intptr_t num_terms = terms_.length();
97 RegExpTree* alternative; 103 RegExpTree* alternative;
98 if (num_terms == 0) { 104 if (num_terms == 0) {
99 alternative = RegExpEmpty::GetInstance(); 105 alternative = RegExpEmpty::GetInstance();
100 } else if (num_terms == 1) { 106 } else if (num_terms == 1) {
101 alternative = terms_.last(); 107 alternative = terms_.Last();
102 } else { 108 } else {
103 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone())); 109 ZoneGrowableArray<RegExpTree*>* terms =
110 new(I) ZoneGrowableArray<RegExpTree*>();
111 for (intptr_t i = 0; i < terms_.length(); i++) {
112 terms->Add(terms_[i]);
113 }
114 alternative = new(I) RegExpAlternative(terms);
104 } 115 }
105 alternatives_.Add(alternative, zone()); 116 alternatives_.Add(alternative);
106 terms_.Clear(); 117 terms_.Clear();
107 LAST(ADD_NONE); 118 LAST(ADD_NONE);
108 } 119 }
109 120
110 121
111 RegExpTree* RegExpBuilder::ToRegExp() { 122 RegExpTree* RegExpBuilder::ToRegExp() {
112 FlushTerms(); 123 FlushTerms();
113 int num_alternatives = alternatives_.length(); 124 intptr_t num_alternatives = alternatives_.length();
114 if (num_alternatives == 0) { 125 if (num_alternatives == 0) {
115 return RegExpEmpty::GetInstance(); 126 return RegExpEmpty::GetInstance();
116 } 127 }
117 if (num_alternatives == 1) { 128 if (num_alternatives == 1) {
118 return alternatives_.last(); 129 return alternatives_.Last();
119 } 130 }
120 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone())); 131 ZoneGrowableArray<RegExpTree*>* alternatives =
132 new(I) ZoneGrowableArray<RegExpTree*>();
133 for (intptr_t i = 0; i < alternatives_.length(); i++) {
134 alternatives->Add(alternatives_[i]);
135 }
136 return new(I) RegExpDisjunction(alternatives);
121 } 137 }
122 138
123 139
124 void RegExpBuilder::AddQuantifierToAtom( 140 void RegExpBuilder::AddQuantifierToAtom(
125 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 141 intptr_t min,
142 intptr_t max,
143 RegExpQuantifier::QuantifierType quantifier_type) {
126 if (pending_empty_) { 144 if (pending_empty_) {
127 pending_empty_ = false; 145 pending_empty_ = false;
128 return; 146 return;
129 } 147 }
130 RegExpTree* atom; 148 RegExpTree* atom;
131 if (characters_ != NULL) { 149 if (characters_ != NULL) {
132 DCHECK(last_added_ == ADD_CHAR); 150 DEBUG_ASSERT(last_added_ == ADD_CHAR);
133 // Last atom was character. 151 // Last atom was character.
134 Vector<const uc16> char_vector = characters_->ToConstVector(); 152
135 int num_chars = char_vector.length(); 153 ZoneGrowableArray<uint16_t> *char_vector =
154 new(I) ZoneGrowableArray<uint16_t>();
155 char_vector->AddArray(*characters_);
156 intptr_t num_chars = char_vector->length();
136 if (num_chars > 1) { 157 if (num_chars > 1) {
137 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 158 ZoneGrowableArray<uint16_t> *prefix =
138 text_.Add(new(zone()) RegExpAtom(prefix), zone()); 159 new(I) ZoneGrowableArray<uint16_t>();
139 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 160 for (intptr_t i = 0; i < num_chars - 1; i++) {
161 prefix->Add(char_vector->At(i));
162 }
163 text_.Add(new(I) RegExpAtom(prefix));
164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();
165 tail->Add(char_vector->At(num_chars - 1));
166 char_vector = tail;
140 } 167 }
141 characters_ = NULL; 168 characters_ = NULL;
142 atom = new(zone()) RegExpAtom(char_vector); 169 atom = new(I) RegExpAtom(char_vector);
143 FlushText(); 170 FlushText();
144 } else if (text_.length() > 0) { 171 } else if (text_.length() > 0) {
145 DCHECK(last_added_ == ADD_ATOM); 172 DEBUG_ASSERT(last_added_ == ADD_ATOM);
146 atom = text_.RemoveLast(); 173 atom = text_.RemoveLast();
147 FlushText(); 174 FlushText();
148 } else if (terms_.length() > 0) { 175 } else if (terms_.length() > 0) {
149 DCHECK(last_added_ == ADD_ATOM); 176 DEBUG_ASSERT(last_added_ == ADD_ATOM);
150 atom = terms_.RemoveLast(); 177 atom = terms_.RemoveLast();
151 if (atom->max_match() == 0) { 178 if (atom->max_match() == 0) {
152 // Guaranteed to only match an empty string. 179 // Guaranteed to only match an empty string.
153 LAST(ADD_TERM); 180 LAST(ADD_TERM);
154 if (min == 0) { 181 if (min == 0) {
155 return; 182 return;
156 } 183 }
157 terms_.Add(atom, zone()); 184 terms_.Add(atom);
158 return; 185 return;
159 } 186 }
160 } else { 187 } else {
161 // Only call immediately after adding an atom or character! 188 // Only call immediately after adding an atom or character!
162 UNREACHABLE(); 189 UNREACHABLE();
163 return; 190 return;
164 } 191 }
165 terms_.Add( 192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));
166 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());
167 LAST(ADD_TERM); 193 LAST(ADD_TERM);
168 } 194 }
169 195
170 // SNIP 196 // ----------------------------------------------------------------------------
197 // Implementation of Parser
171 198
172 // ---------------------------------------------------------------------------- 199 RegExpParser::RegExpParser(const String& in,
173 // Regular expressions 200 String* error,
174 201 bool multiline)
175 RegExpParser::RegExpParser(FlatStringReader* in, 202 : isolate_(Isolate::Current()),
176 Handle<String>* error,
177 bool multiline,
178 Zone* zone)
179 : isolate_(zone->isolate()),
180 zone_(zone),
181 error_(error), 203 error_(error),
182 captures_(NULL), 204 captures_(NULL),
183 in_(in), 205 in_(in),
184 current_(kEndMarker), 206 current_(kEndMarker),
185 next_pos_(0), 207 next_pos_(0),
186 capture_count_(0), 208 capture_count_(0),
187 has_more_(true), 209 has_more_(true),
188 multiline_(multiline), 210 multiline_(multiline),
189 simple_(false), 211 simple_(false),
190 contains_anchor_(false), 212 contains_anchor_(false),
191 is_scanned_for_captures_(false), 213 is_scanned_for_captures_(false),
192 failed_(false) { 214 failed_(false) {
193 Advance(); 215 Advance();
194 } 216 }
195 217
196 218
197 uc32 RegExpParser::Next() { 219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {
220 Isolate* isolate = parsed_function->isolate();
221 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());
222
223 const String& pattern = String::Handle(regexp.pattern());
224 const bool multiline = regexp.is_multi_line();
225
226 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();
227 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {
228 // Parsing failures are handled in the JSRegExp factory constructor.
229 UNREACHABLE();
230 }
231
232 regexp.set_num_bracket_expressions(compile_data->capture_count);
233 if (compile_data->simple) {
234 regexp.set_is_simple();
235 } else {
236 regexp.set_is_complex();
237 }
238
239 parsed_function->SetRegExpCompileData(compile_data);
240
241 return true;
242 }
243
244
245 uint32_t RegExpParser::Next() {
198 if (has_next()) { 246 if (has_next()) {
199 return in()->Get(next_pos_); 247 return in().CharAt(next_pos_);
200 } else { 248 } else {
201 return kEndMarker; 249 return kEndMarker;
202 } 250 }
203 } 251 }
204 252
205 253
206 void RegExpParser::Advance() { 254 void RegExpParser::Advance() {
207 if (next_pos_ < in()->length()) { 255 if (next_pos_ < in().Length()) {
208 StackLimitCheck check(isolate()); 256 current_ = in().CharAt(next_pos_);
209 if (check.HasOverflowed()) { 257 next_pos_++;
210 ReportError(CStrVector(Isolate::kStackOverflowMessage));
211 } else if (zone()->excess_allocation()) {
212 ReportError(CStrVector("Regular expression too large"));
213 } else {
214 current_ = in()->Get(next_pos_);
215 next_pos_++;
216 }
217 } else { 258 } else {
218 current_ = kEndMarker; 259 current_ = kEndMarker;
219 has_more_ = false; 260 has_more_ = false;
220 } 261 }
221 } 262 }
222 263
223 264
224 void RegExpParser::Reset(int pos) { 265 void RegExpParser::Reset(intptr_t pos) {
225 next_pos_ = pos; 266 next_pos_ = pos;
226 has_more_ = (pos < in()->length()); 267 has_more_ = (pos < in().Length());
227 Advance(); 268 Advance();
228 } 269 }
229 270
230 271
231 void RegExpParser::Advance(int dist) { 272 void RegExpParser::Advance(intptr_t dist) {
232 next_pos_ += dist - 1; 273 next_pos_ += dist - 1;
233 Advance(); 274 Advance();
234 } 275 }
235 276
236 277
237 bool RegExpParser::simple() { 278 bool RegExpParser::simple() {
238 return simple_; 279 return simple_;
239 } 280 }
240 281
241 282
242 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 283 void RegExpParser::ReportError(const char* message) {
243 failed_ = true; 284 failed_ = true;
244 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 285 *error_ = String::New(message);
245 // Zip to the end to make sure the no more input is read. 286 // Zip to the end to make sure the no more input is read.
246 current_ = kEndMarker; 287 current_ = kEndMarker;
247 next_pos_ = in()->length(); 288 next_pos_ = in().Length();
248 return NULL; 289
290 const Error& error = Error::Handle(LanguageError::New(*error_));
291 Report::LongJump(error);
292 UNREACHABLE();
249 } 293 }
250 294
251 295
252 // Pattern :: 296 // Pattern ::
253 // Disjunction 297 // Disjunction
254 RegExpTree* RegExpParser::ParsePattern() { 298 RegExpTree* RegExpParser::ParsePattern() {
255 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 299 RegExpTree* result = ParseDisjunction();
256 DCHECK(!has_more()); 300 ASSERT(!has_more());
257 // If the result of parsing is a literal string atom, and it has the 301 // If the result of parsing is a literal string atom, and it has the
258 // same length as the input, then the atom is identical to the input. 302 // same length as the input, then the atom is identical to the input.
259 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { 303 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
260 simple_ = true; 304 simple_ = true;
261 } 305 }
262 return result; 306 return result;
263 } 307 }
264 308
265 309
266 // Disjunction :: 310 // Disjunction ::
267 // Alternative 311 // Alternative
268 // Alternative | Disjunction 312 // Alternative | Disjunction
269 // Alternative :: 313 // Alternative ::
270 // [empty] 314 // [empty]
271 // Term Alternative 315 // Term Alternative
272 // Term :: 316 // Term ::
273 // Assertion 317 // Assertion
274 // Atom 318 // Atom
275 // Atom Quantifier 319 // Atom Quantifier
276 RegExpTree* RegExpParser::ParseDisjunction() { 320 RegExpTree* RegExpParser::ParseDisjunction() {
277 // Used to store current state while parsing subexpressions. 321 // Used to store current state while parsing subexpressions.
278 RegExpParserState initial_state(NULL, INITIAL, 0, zone()); 322 RegExpParserState initial_state(NULL, INITIAL, 0, I);
279 RegExpParserState* stored_state = &initial_state; 323 RegExpParserState* stored_state = &initial_state;
280 // Cache the builder in a local variable for quick access. 324 // Cache the builder in a local variable for quick access.
281 RegExpBuilder* builder = initial_state.builder(); 325 RegExpBuilder* builder = initial_state.builder();
282 while (true) { 326 while (true) {
283 switch (current()) { 327 switch (current()) {
284 case kEndMarker: 328 case kEndMarker:
285 if (stored_state->IsSubexpression()) { 329 if (stored_state->IsSubexpression()) {
286 // Inside a parenthesized group when hitting end of input. 330 // Inside a parenthesized group when hitting end of input.
287 ReportError(CStrVector("Unterminated group") CHECK_FAILED); 331 ReportError("Unterminated group");
332 UNREACHABLE();
288 } 333 }
289 DCHECK_EQ(INITIAL, stored_state->group_type()); 334 ASSERT(INITIAL == stored_state->group_type());
290 // Parsing completed successfully. 335 // Parsing completed successfully.
291 return builder->ToRegExp(); 336 return builder->ToRegExp();
292 case ')': { 337 case ')': {
293 if (!stored_state->IsSubexpression()) { 338 if (!stored_state->IsSubexpression()) {
294 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); 339 ReportError("Unmatched ')'");
340 UNREACHABLE();
295 } 341 }
296 DCHECK_NE(INITIAL, stored_state->group_type()); 342 ASSERT(INITIAL != stored_state->group_type());
297 343
298 Advance(); 344 Advance();
299 // End disjunction parsing and convert builder content to new single 345 // End disjunction parsing and convert builder content to new single
300 // regexp atom. 346 // regexp atom.
301 RegExpTree* body = builder->ToRegExp(); 347 RegExpTree* body = builder->ToRegExp();
302 348
303 int end_capture_index = captures_started(); 349 intptr_t end_capture_index = captures_started();
304 350
305 int capture_index = stored_state->capture_index(); 351 intptr_t capture_index = stored_state->capture_index();
306 SubexpressionType group_type = stored_state->group_type(); 352 SubexpressionType group_type = stored_state->group_type();
307 353
308 // Restore previous state. 354 // Restore previous state.
309 stored_state = stored_state->previous_state(); 355 stored_state = stored_state->previous_state();
310 builder = stored_state->builder(); 356 builder = stored_state->builder();
311 357
312 // Build result of subexpression. 358 // Build result of subexpression.
313 if (group_type == CAPTURE) { 359 if (group_type == CAPTURE) {
314 RegExpCapture* capture = new(zone()) RegExpCapture(body, capture_index); 360 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);
315 captures_->at(capture_index - 1) = capture; 361 (*captures_)[capture_index - 1] = capture;
316 body = capture; 362 body = capture;
317 } else if (group_type != GROUPING) { 363 } else if (group_type != GROUPING) {
318 DCHECK(group_type == POSITIVE_LOOKAHEAD || 364 ASSERT(group_type == POSITIVE_LOOKAHEAD ||
319 group_type == NEGATIVE_LOOKAHEAD); 365 group_type == NEGATIVE_LOOKAHEAD);
320 bool is_positive = (group_type == POSITIVE_LOOKAHEAD); 366 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
321 body = new(zone()) RegExpLookahead(body, 367 body = new(I) RegExpLookahead(body,
322 is_positive, 368 is_positive,
323 end_capture_index - capture_index, 369 end_capture_index - capture_index,
324 capture_index); 370 capture_index);
325 } 371 }
326 builder->AddAtom(body); 372 builder->AddAtom(body);
327 // For compatability with JSC and ES3, we allow quantifiers after 373 // For compatibility with JSC and ES3, we allow quantifiers after
328 // lookaheads, and break in all cases. 374 // lookaheads, and break in all cases.
329 break; 375 break;
330 } 376 }
331 case '|': { 377 case '|': {
332 Advance(); 378 Advance();
333 builder->NewAlternative(); 379 builder->NewAlternative();
334 continue; 380 continue;
335 } 381 }
336 case '*': 382 case '*':
337 case '+': 383 case '+':
338 case '?': 384 case '?':
339 return ReportError(CStrVector("Nothing to repeat")); 385 ReportError("Nothing to repeat");
386 UNREACHABLE();
340 case '^': { 387 case '^': {
341 Advance(); 388 Advance();
342 if (multiline_) { 389 if (multiline_) {
343 builder->AddAssertion( 390 builder->AddAssertion(
344 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 391 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));
345 } else { 392 } else {
346 builder->AddAssertion( 393 builder->AddAssertion(
347 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 394 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
348 set_contains_anchor(); 395 set_contains_anchor();
349 } 396 }
350 continue; 397 continue;
351 } 398 }
352 case '$': { 399 case '$': {
353 Advance(); 400 Advance();
354 RegExpAssertion::AssertionType assertion_type = 401 RegExpAssertion::AssertionType assertion_type =
355 multiline_ ? RegExpAssertion::END_OF_LINE : 402 multiline_ ? RegExpAssertion::END_OF_LINE :
356 RegExpAssertion::END_OF_INPUT; 403 RegExpAssertion::END_OF_INPUT;
357 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type)); 404 builder->AddAssertion(new RegExpAssertion(assertion_type));
358 continue; 405 continue;
359 } 406 }
360 case '.': { 407 case '.': {
361 Advance(); 408 Advance();
362 // everything except \x0a, \x0d, \u2028 and \u2029 409 // everything except \x0a, \x0d, \u2028 and \u2029
363 ZoneList<CharacterRange>* ranges = 410 ZoneGrowableArray<CharacterRange>* ranges =
364 new(zone()) ZoneList<CharacterRange>(2, zone()); 411 new ZoneGrowableArray<CharacterRange>(2);
365 CharacterRange::AddClassEscape('.', ranges, zone()); 412 CharacterRange::AddClassEscape('.', ranges);
366 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false); 413 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
367 builder->AddAtom(atom); 414 builder->AddAtom(atom);
368 break; 415 break;
369 } 416 }
370 case '(': { 417 case '(': {
371 SubexpressionType subexpr_type = CAPTURE; 418 SubexpressionType subexpr_type = CAPTURE;
372 Advance(); 419 Advance();
373 if (current() == '?') { 420 if (current() == '?') {
374 switch (Next()) { 421 switch (Next()) {
375 case ':': 422 case ':':
376 subexpr_type = GROUPING; 423 subexpr_type = GROUPING;
377 break; 424 break;
378 case '=': 425 case '=':
379 subexpr_type = POSITIVE_LOOKAHEAD; 426 subexpr_type = POSITIVE_LOOKAHEAD;
380 break; 427 break;
381 case '!': 428 case '!':
382 subexpr_type = NEGATIVE_LOOKAHEAD; 429 subexpr_type = NEGATIVE_LOOKAHEAD;
383 break; 430 break;
384 default: 431 default:
385 ReportError(CStrVector("Invalid group") CHECK_FAILED); 432 ReportError("Invalid group");
386 break; 433 UNREACHABLE();
387 } 434 }
388 Advance(2); 435 Advance(2);
389 } else { 436 } else {
390 if (captures_ == NULL) { 437 if (captures_ == NULL) {
391 captures_ = new(zone()) ZoneList<RegExpCapture*>(2, zone()); 438 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
392 } 439 }
393 if (captures_started() >= kMaxCaptures) { 440 if (captures_started() >= kMaxCaptures) {
394 ReportError(CStrVector("Too many captures") CHECK_FAILED); 441 ReportError("Too many captures");
442 UNREACHABLE();
395 } 443 }
396 captures_->Add(NULL, zone()); 444 captures_->Add(NULL);
397 } 445 }
398 // Store current state and begin new disjunction parsing. 446 // Store current state and begin new disjunction parsing.
399 stored_state = new(zone()) RegExpParserState(stored_state, subexpr_type, 447 stored_state = new RegExpParserState(stored_state, subexpr_type,
400 captures_started(), zone()); 448 captures_started(), I);
401 builder = stored_state->builder(); 449 builder = stored_state->builder();
402 continue; 450 continue;
403 } 451 }
404 case '[': { 452 case '[': {
405 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); 453 RegExpTree* atom = ParseCharacterClass();
406 builder->AddAtom(atom); 454 builder->AddAtom(atom);
407 break; 455 break;
408 } 456 }
409 // Atom :: 457 // Atom ::
410 // \ AtomEscape 458 // \ AtomEscape
411 case '\\': 459 case '\\':
412 switch (Next()) { 460 switch (Next()) {
413 case kEndMarker: 461 case kEndMarker:
414 return ReportError(CStrVector("\\ at end of pattern")); 462 ReportError("\\ at end of pattern");
463 UNREACHABLE();
415 case 'b': 464 case 'b':
416 Advance(2); 465 Advance(2);
417 builder->AddAssertion( 466 builder->AddAssertion(
418 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); 467 new RegExpAssertion(RegExpAssertion::BOUNDARY));
419 continue; 468 continue;
420 case 'B': 469 case 'B':
421 Advance(2); 470 Advance(2);
422 builder->AddAssertion( 471 builder->AddAssertion(
423 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); 472 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
424 continue; 473 continue;
425 // AtomEscape :: 474 // AtomEscape ::
426 // CharacterClassEscape 475 // CharacterClassEscape
427 // 476 //
428 // CharacterClassEscape :: one of 477 // CharacterClassEscape :: one of
429 // d D s S w W 478 // d D s S w W
430 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { 479 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
431 uc32 c = Next(); 480 uint32_t c = Next();
432 Advance(2); 481 Advance(2);
433 ZoneList<CharacterRange>* ranges = 482 ZoneGrowableArray<CharacterRange>* ranges =
434 new(zone()) ZoneList<CharacterRange>(2, zone()); 483 new ZoneGrowableArray<CharacterRange>(2);
435 CharacterRange::AddClassEscape(c, ranges, zone()); 484 CharacterRange::AddClassEscape(c, ranges);
436 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false); 485 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
437 builder->AddAtom(atom); 486 builder->AddAtom(atom);
438 break; 487 break;
439 } 488 }
440 case '1': case '2': case '3': case '4': case '5': case '6': 489 case '1': case '2': case '3': case '4': case '5': case '6':
441 case '7': case '8': case '9': { 490 case '7': case '8': case '9': {
442 int index = 0; 491 intptr_t index = 0;
443 if (ParseBackReferenceIndex(&index)) { 492 if (ParseBackReferenceIndex(&index)) {
444 RegExpCapture* capture = NULL; 493 RegExpCapture* capture = NULL;
445 if (captures_ != NULL && index <= captures_->length()) { 494 if (captures_ != NULL && index <= captures_->length()) {
446 capture = captures_->at(index - 1); 495 capture = captures_->At(index - 1);
447 } 496 }
448 if (capture == NULL) { 497 if (capture == NULL) {
449 builder->AddEmpty(); 498 builder->AddEmpty();
450 break; 499 break;
451 } 500 }
452 RegExpTree* atom = new(zone()) RegExpBackReference(capture); 501 RegExpTree* atom = new RegExpBackReference(capture);
453 builder->AddAtom(atom); 502 builder->AddAtom(atom);
454 break; 503 break;
455 } 504 }
456 uc32 first_digit = Next(); 505 uint32_t first_digit = Next();
457 if (first_digit == '8' || first_digit == '9') { 506 if (first_digit == '8' || first_digit == '9') {
458 // Treat as identity escape 507 // Treat as identity escape
459 builder->AddCharacter(first_digit); 508 builder->AddCharacter(first_digit);
460 Advance(2); 509 Advance(2);
461 break; 510 break;
462 } 511 }
463 } 512 }
464 // FALLTHROUGH 513 // FALLTHROUGH
465 case '0': { 514 case '0': {
466 Advance(); 515 Advance();
467 uc32 octal = ParseOctalLiteral(); 516 uint32_t octal = ParseOctalLiteral();
468 builder->AddCharacter(octal); 517 builder->AddCharacter(octal);
469 break; 518 break;
470 } 519 }
471 // ControlEscape :: one of 520 // ControlEscape :: one of
472 // f n r t v 521 // f n r t v
473 case 'f': 522 case 'f':
474 Advance(2); 523 Advance(2);
475 builder->AddCharacter('\f'); 524 builder->AddCharacter('\f');
476 break; 525 break;
477 case 'n': 526 case 'n':
478 Advance(2); 527 Advance(2);
479 builder->AddCharacter('\n'); 528 builder->AddCharacter('\n');
480 break; 529 break;
481 case 'r': 530 case 'r':
482 Advance(2); 531 Advance(2);
483 builder->AddCharacter('\r'); 532 builder->AddCharacter('\r');
484 break; 533 break;
485 case 't': 534 case 't':
486 Advance(2); 535 Advance(2);
487 builder->AddCharacter('\t'); 536 builder->AddCharacter('\t');
488 break; 537 break;
489 case 'v': 538 case 'v':
490 Advance(2); 539 Advance(2);
491 builder->AddCharacter('\v'); 540 builder->AddCharacter('\v');
492 break; 541 break;
493 case 'c': { 542 case 'c': {
494 Advance(); 543 Advance();
495 uc32 controlLetter = Next(); 544 uint32_t controlLetter = Next();
496 // Special case if it is an ASCII letter. 545 // Special case if it is an ASCII letter.
497 // Convert lower case letters to uppercase. 546 // Convert lower case letters to uppercase.
498 uc32 letter = controlLetter & ~('a' ^ 'A'); 547 uint32_t letter = controlLetter & ~('a' ^ 'A');
499 if (letter < 'A' || 'Z' < letter) { 548 if (letter < 'A' || 'Z' < letter) {
500 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 549 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
501 // This is outside the specification. We match JSC in 550 // This is outside the specification. We match JSC in
502 // reading the backslash as a literal character instead 551 // reading the backslash as a literal character instead
503 // of as starting an escape. 552 // of as starting an escape.
504 builder->AddCharacter('\\'); 553 builder->AddCharacter('\\');
505 } else { 554 } else {
506 Advance(2); 555 Advance(2);
507 builder->AddCharacter(controlLetter & 0x1f); 556 builder->AddCharacter(controlLetter & 0x1f);
508 } 557 }
509 break; 558 break;
510 } 559 }
511 case 'x': { 560 case 'x': {
512 Advance(2); 561 Advance(2);
513 uc32 value; 562 uint32_t value;
514 if (ParseHexEscape(2, &value)) { 563 if (ParseHexEscape(2, &value)) {
515 builder->AddCharacter(value); 564 builder->AddCharacter(value);
516 } else { 565 } else {
517 builder->AddCharacter('x'); 566 builder->AddCharacter('x');
518 } 567 }
519 break; 568 break;
520 } 569 }
521 case 'u': { 570 case 'u': {
522 Advance(2); 571 Advance(2);
523 uc32 value; 572 uint32_t value;
524 if (ParseHexEscape(4, &value)) { 573 if (ParseHexEscape(4, &value)) {
525 builder->AddCharacter(value); 574 builder->AddCharacter(value);
526 } else { 575 } else {
527 builder->AddCharacter('u'); 576 builder->AddCharacter('u');
528 } 577 }
529 break; 578 break;
530 } 579 }
531 default: 580 default:
532 // Identity escape. 581 // Identity escape.
533 builder->AddCharacter(Next()); 582 builder->AddCharacter(Next());
534 Advance(2); 583 Advance(2);
535 break; 584 break;
536 } 585 }
537 break; 586 break;
538 case '{': { 587 case '{': {
539 int dummy; 588 intptr_t dummy;
540 if (ParseIntervalQuantifier(&dummy, &dummy)) { 589 if (ParseIntervalQuantifier(&dummy, &dummy)) {
541 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); 590 ReportError("Nothing to repeat");
591 UNREACHABLE();
542 } 592 }
543 // fallthrough 593 // fallthrough
544 } 594 }
545 default: 595 default:
546 builder->AddCharacter(current()); 596 builder->AddCharacter(current());
547 Advance(); 597 Advance();
548 break; 598 break;
549 } // end switch(current()) 599 } // end switch(current())
550 600
551 int min; 601 intptr_t min;
552 int max; 602 intptr_t max;
553 switch (current()) { 603 switch (current()) {
554 // QuantifierPrefix :: 604 // QuantifierPrefix ::
555 // * 605 // *
556 // + 606 // +
557 // ? 607 // ?
558 // { 608 // {
559 case '*': 609 case '*':
560 min = 0; 610 min = 0;
561 max = RegExpTree::kInfinity; 611 max = RegExpTree::kInfinity;
562 Advance(); 612 Advance();
563 break; 613 break;
564 case '+': 614 case '+':
565 min = 1; 615 min = 1;
566 max = RegExpTree::kInfinity; 616 max = RegExpTree::kInfinity;
567 Advance(); 617 Advance();
568 break; 618 break;
569 case '?': 619 case '?':
570 min = 0; 620 min = 0;
571 max = 1; 621 max = 1;
572 Advance(); 622 Advance();
573 break; 623 break;
574 case '{': 624 case '{':
575 if (ParseIntervalQuantifier(&min, &max)) { 625 if (ParseIntervalQuantifier(&min, &max)) {
576 if (max < min) { 626 if (max < min) {
577 ReportError(CStrVector("numbers out of order in {} quantifier.") 627 ReportError("numbers out of order in {} quantifier.");
578 CHECK_FAILED); 628 UNREACHABLE();
579 } 629 }
580 break; 630 break;
581 } else { 631 } else {
582 continue; 632 continue;
583 } 633 }
584 default: 634 default:
585 continue; 635 continue;
586 } 636 }
587 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 637 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
588 if (current() == '?') { 638 if (current() == '?') {
589 quantifier_type = RegExpQuantifier::NON_GREEDY; 639 quantifier_type = RegExpQuantifier::NON_GREEDY;
590 Advance(); 640 Advance();
591 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 641 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
592 // FLAG_regexp_possessive_quantifier is a debug-only flag. 642 // FLAG_regexp_possessive_quantifier is a debug-only flag.
593 quantifier_type = RegExpQuantifier::POSSESSIVE; 643 quantifier_type = RegExpQuantifier::POSSESSIVE;
594 Advance(); 644 Advance();
595 } 645 }
596 builder->AddQuantifierToAtom(min, max, quantifier_type); 646 builder->AddQuantifierToAtom(min, max, quantifier_type);
597 } 647 }
598 } 648 }
599 649
600 650
601 #ifdef DEBUG 651 #ifdef DEBUG
602 // Currently only used in an DCHECK. 652 // Currently only used in an ASSERT.
603 static bool IsSpecialClassEscape(uc32 c) { 653 static bool IsSpecialClassEscape(uint32_t c) {
604 switch (c) { 654 switch (c) {
605 case 'd': case 'D': 655 case 'd': case 'D':
606 case 's': case 'S': 656 case 's': case 'S':
607 case 'w': case 'W': 657 case 'w': case 'W':
608 return true; 658 return true;
609 default: 659 default:
610 return false; 660 return false;
611 } 661 }
612 } 662 }
613 #endif 663 #endif
614 664
615 665
616 // In order to know whether an escape is a backreference or not we have to scan 666 // In order to know whether an escape is a backreference or not we have to scan
617 // the entire regexp and find the number of capturing parentheses. However we 667 // the entire regexp and find the number of capturing parentheses. However we
618 // don't want to scan the regexp twice unless it is necessary. This mini-parser 668 // don't want to scan the regexp twice unless it is necessary. This mini-parser
619 // is called when needed. It can see the difference between capturing and 669 // is called when needed. It can see the difference between capturing and
620 // noncapturing parentheses and can skip character classes and backslash-escaped 670 // noncapturing parentheses and can skip character classes and backslash-escaped
621 // characters. 671 // characters.
622 void RegExpParser::ScanForCaptures() { 672 void RegExpParser::ScanForCaptures() {
623 // Start with captures started previous to current position 673 // Start with captures started previous to current position
624 int capture_count = captures_started(); 674 intptr_t capture_count = captures_started();
625 // Add count of captures after this position. 675 // Add count of captures after this position.
626 int n; 676 intptr_t n;
627 while ((n = current()) != kEndMarker) { 677 while ((n = current()) != kEndMarker) {
628 Advance(); 678 Advance();
629 switch (n) { 679 switch (n) {
630 case '\\': 680 case '\\':
631 Advance(); 681 Advance();
632 break; 682 break;
633 case '[': { 683 case '[': {
634 int c; 684 intptr_t c;
635 while ((c = current()) != kEndMarker) { 685 while ((c = current()) != kEndMarker) {
636 Advance(); 686 Advance();
637 if (c == '\\') { 687 if (c == '\\') {
638 Advance(); 688 Advance();
639 } else { 689 } else {
640 if (c == ']') break; 690 if (c == ']') break;
641 } 691 }
642 } 692 }
643 break; 693 break;
644 } 694 }
645 case '(': 695 case '(':
646 if (current() != '?') capture_count++; 696 if (current() != '?') capture_count++;
647 break; 697 break;
648 } 698 }
649 } 699 }
650 capture_count_ = capture_count; 700 capture_count_ = capture_count;
651 is_scanned_for_captures_ = true; 701 is_scanned_for_captures_ = true;
652 } 702 }
653 703
654 704
655 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { 705 static inline bool IsDecimalDigit(int32_t c) {
656 DCHECK_EQ('\\', current()); 706 return '0' <= c && c <= '9';
657 DCHECK('1' <= Next() && Next() <= '9'); 707 }
708
709
710 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
711 ASSERT('\\' == current());
712 ASSERT('1' <= Next() && Next() <= '9');
658 // Try to parse a decimal literal that is no greater than the total number 713 // Try to parse a decimal literal that is no greater than the total number
659 // of left capturing parentheses in the input. 714 // of left capturing parentheses in the input.
660 int start = position(); 715 intptr_t start = position();
661 int value = Next() - '0'; 716 intptr_t value = Next() - '0';
662 Advance(2); 717 Advance(2);
663 while (true) { 718 while (true) {
664 uc32 c = current(); 719 uint32_t c = current();
665 if (IsDecimalDigit(c)) { 720 if (IsDecimalDigit(c)) {
666 value = 10 * value + (c - '0'); 721 value = 10 * value + (c - '0');
667 if (value > kMaxCaptures) { 722 if (value > kMaxCaptures) {
668 Reset(start); 723 Reset(start);
669 return false; 724 return false;
670 } 725 }
671 Advance(); 726 Advance();
672 } else { 727 } else {
673 break; 728 break;
674 } 729 }
675 } 730 }
676 if (value > captures_started()) { 731 if (value > captures_started()) {
677 if (!is_scanned_for_captures_) { 732 if (!is_scanned_for_captures_) {
678 int saved_position = position(); 733 intptr_t saved_position = position();
679 ScanForCaptures(); 734 ScanForCaptures();
680 Reset(saved_position); 735 Reset(saved_position);
681 } 736 }
682 if (value > capture_count_) { 737 if (value > capture_count_) {
683 Reset(start); 738 Reset(start);
684 return false; 739 return false;
685 } 740 }
686 } 741 }
687 *index_out = value; 742 *index_out = value;
688 return true; 743 return true;
689 } 744 }
690 745
691 746
692 // QuantifierPrefix :: 747 // QuantifierPrefix ::
693 // { DecimalDigits } 748 // { DecimalDigits }
694 // { DecimalDigits , } 749 // { DecimalDigits , }
695 // { DecimalDigits , DecimalDigits } 750 // { DecimalDigits , DecimalDigits }
696 // 751 //
697 // Returns true if parsing succeeds, and set the min_out and max_out 752 // Returns true if parsing succeeds, and set the min_out and max_out
698 // values. Values are truncated to RegExpTree::kInfinity if they overflow. 753 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
699 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { 754 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
700 DCHECK_EQ(current(), '{'); 755 intptr_t* max_out) {
701 int start = position(); 756 ASSERT(current() == '{');
757 intptr_t start = position();
702 Advance(); 758 Advance();
703 int min = 0; 759 intptr_t min = 0;
704 if (!IsDecimalDigit(current())) { 760 if (!IsDecimalDigit(current())) {
705 Reset(start); 761 Reset(start);
706 return false; 762 return false;
707 } 763 }
708 while (IsDecimalDigit(current())) { 764 while (IsDecimalDigit(current())) {
709 int next = current() - '0'; 765 intptr_t next = current() - '0';
710 if (min > (RegExpTree::kInfinity - next) / 10) { 766 if (min > (RegExpTree::kInfinity - next) / 10) {
711 // Overflow. Skip past remaining decimal digits and return -1. 767 // Overflow. Skip past remaining decimal digits and return -1.
712 do { 768 do {
713 Advance(); 769 Advance();
714 } while (IsDecimalDigit(current())); 770 } while (IsDecimalDigit(current()));
715 min = RegExpTree::kInfinity; 771 min = RegExpTree::kInfinity;
716 break; 772 break;
717 } 773 }
718 min = 10 * min + next; 774 min = 10 * min + next;
719 Advance(); 775 Advance();
720 } 776 }
721 int max = 0; 777 intptr_t max = 0;
722 if (current() == '}') { 778 if (current() == '}') {
723 max = min; 779 max = min;
724 Advance(); 780 Advance();
725 } else if (current() == ',') { 781 } else if (current() == ',') {
726 Advance(); 782 Advance();
727 if (current() == '}') { 783 if (current() == '}') {
728 max = RegExpTree::kInfinity; 784 max = RegExpTree::kInfinity;
729 Advance(); 785 Advance();
730 } else { 786 } else {
731 while (IsDecimalDigit(current())) { 787 while (IsDecimalDigit(current())) {
732 int next = current() - '0'; 788 intptr_t next = current() - '0';
733 if (max > (RegExpTree::kInfinity - next) / 10) { 789 if (max > (RegExpTree::kInfinity - next) / 10) {
734 do { 790 do {
735 Advance(); 791 Advance();
736 } while (IsDecimalDigit(current())); 792 } while (IsDecimalDigit(current()));
737 max = RegExpTree::kInfinity; 793 max = RegExpTree::kInfinity;
738 break; 794 break;
739 } 795 }
740 max = 10 * max + next; 796 max = 10 * max + next;
741 Advance(); 797 Advance();
742 } 798 }
743 if (current() != '}') { 799 if (current() != '}') {
744 Reset(start); 800 Reset(start);
745 return false; 801 return false;
746 } 802 }
747 Advance(); 803 Advance();
748 } 804 }
749 } else { 805 } else {
750 Reset(start); 806 Reset(start);
751 return false; 807 return false;
752 } 808 }
753 *min_out = min; 809 *min_out = min;
754 *max_out = max; 810 *max_out = max;
755 return true; 811 return true;
756 } 812 }
757 813
758 814
759 uc32 RegExpParser::ParseOctalLiteral() { 815 uint32_t RegExpParser::ParseOctalLiteral() {
760 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); 816 ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
761 // For compatibility with some other browsers (not all), we parse 817 // For compatibility with some other browsers (not all), we parse
762 // up to three octal digits with a value below 256. 818 // up to three octal digits with a value below 256.
763 uc32 value = current() - '0'; 819 uint32_t value = current() - '0';
764 Advance(); 820 Advance();
765 if ('0' <= current() && current() <= '7') { 821 if ('0' <= current() && current() <= '7') {
766 value = value * 8 + current() - '0'; 822 value = value * 8 + current() - '0';
767 Advance(); 823 Advance();
768 if (value < 32 && '0' <= current() && current() <= '7') { 824 if (value < 32 && '0' <= current() && current() <= '7') {
769 value = value * 8 + current() - '0'; 825 value = value * 8 + current() - '0';
770 Advance(); 826 Advance();
771 } 827 }
772 } 828 }
773 return value; 829 return value;
774 } 830 }
775 831
776 832
777 bool RegExpParser::ParseHexEscape(int length, uc32 *value) { 833 // Returns the value (0 .. 15) of a hexadecimal character c.
778 int start = position(); 834 // If c is not a legal hexadecimal character, returns a value < 0.
779 uc32 val = 0; 835 static inline intptr_t HexValue(uint32_t c) {
836 c -= '0';
837 if (static_cast<unsigned>(c) <= 9) return c;
838 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
839 if (static_cast<unsigned>(c) <= 5) return c + 10;
840 return -1;
841 }
842
843
844 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {
845 intptr_t start = position();
846 uint32_t val = 0;
780 bool done = false; 847 bool done = false;
781 for (int i = 0; !done; i++) { 848 for (intptr_t i = 0; !done; i++) {
782 uc32 c = current(); 849 uint32_t c = current();
783 int d = HexValue(c); 850 intptr_t d = HexValue(c);
784 if (d < 0) { 851 if (d < 0) {
785 Reset(start); 852 Reset(start);
786 return false; 853 return false;
787 } 854 }
788 val = val * 16 + d; 855 val = val * 16 + d;
789 Advance(); 856 Advance();
790 if (i == length - 1) { 857 if (i == length - 1) {
791 done = true; 858 done = true;
792 } 859 }
793 } 860 }
794 *value = val; 861 *value = val;
795 return true; 862 return true;
796 } 863 }
797 864
798 865
799 uc32 RegExpParser::ParseClassCharacterEscape() { 866 uint32_t RegExpParser::ParseClassCharacterEscape() {
800 DCHECK(current() == '\\'); 867 ASSERT(current() == '\\');
801 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 868 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
802 Advance(); 869 Advance();
803 switch (current()) { 870 switch (current()) {
804 case 'b': 871 case 'b':
805 Advance(); 872 Advance();
806 return '\b'; 873 return '\b';
807 // ControlEscape :: one of 874 // ControlEscape :: one of
808 // f n r t v 875 // f n r t v
809 case 'f': 876 case 'f':
810 Advance(); 877 Advance();
811 return '\f'; 878 return '\f';
812 case 'n': 879 case 'n':
813 Advance(); 880 Advance();
814 return '\n'; 881 return '\n';
815 case 'r': 882 case 'r':
816 Advance(); 883 Advance();
817 return '\r'; 884 return '\r';
818 case 't': 885 case 't':
819 Advance(); 886 Advance();
820 return '\t'; 887 return '\t';
821 case 'v': 888 case 'v':
822 Advance(); 889 Advance();
823 return '\v'; 890 return '\v';
824 case 'c': { 891 case 'c': {
825 uc32 controlLetter = Next(); 892 uint32_t controlLetter = Next();
826 uc32 letter = controlLetter & ~('A' ^ 'a'); 893 uint32_t letter = controlLetter & ~('A' ^ 'a');
827 // For compatibility with JSC, inside a character class 894 // For compatibility with JSC, inside a character class
828 // we also accept digits and underscore as control characters. 895 // we also accept digits and underscore as control characters.
829 if ((controlLetter >= '0' && controlLetter <= '9') || 896 if ((controlLetter >= '0' && controlLetter <= '9') ||
830 controlLetter == '_' || 897 controlLetter == '_' ||
831 (letter >= 'A' && letter <= 'Z')) { 898 (letter >= 'A' && letter <= 'Z')) {
832 Advance(2); 899 Advance(2);
833 // Control letters mapped to ASCII control characters in the range 900 // Control letters mapped to ASCII control characters in the range
834 // 0x00-0x1f. 901 // 0x00-0x1f.
835 return controlLetter & 0x1f; 902 return controlLetter & 0x1f;
836 } 903 }
837 // We match JSC in reading the backslash as a literal 904 // We match JSC in reading the backslash as a literal
838 // character instead of as starting an escape. 905 // character instead of as starting an escape.
839 return '\\'; 906 return '\\';
840 } 907 }
841 case '0': case '1': case '2': case '3': case '4': case '5': 908 case '0': case '1': case '2': case '3': case '4': case '5':
842 case '6': case '7': 909 case '6': case '7':
843 // For compatibility, we interpret a decimal escape that isn't 910 // For compatibility, we interpret a decimal escape that isn't
844 // a back reference (and therefore either \0 or not valid according 911 // a back reference (and therefore either \0 or not valid according
845 // to the specification) as a 1..3 digit octal character code. 912 // to the specification) as a 1..3 digit octal character code.
846 return ParseOctalLiteral(); 913 return ParseOctalLiteral();
847 case 'x': { 914 case 'x': {
848 Advance(); 915 Advance();
849 uc32 value; 916 uint32_t value;
850 if (ParseHexEscape(2, &value)) { 917 if (ParseHexEscape(2, &value)) {
851 return value; 918 return value;
852 } 919 }
853 // If \x is not followed by a two-digit hexadecimal, treat it 920 // If \x is not followed by a two-digit hexadecimal, treat it
854 // as an identity escape. 921 // as an identity escape.
855 return 'x'; 922 return 'x';
856 } 923 }
857 case 'u': { 924 case 'u': {
858 Advance(); 925 Advance();
859 uc32 value; 926 uint32_t value;
860 if (ParseHexEscape(4, &value)) { 927 if (ParseHexEscape(4, &value)) {
861 return value; 928 return value;
862 } 929 }
863 // If \u is not followed by a four-digit hexadecimal, treat it 930 // If \u is not followed by a four-digit hexadecimal, treat it
864 // as an identity escape. 931 // as an identity escape.
865 return 'u'; 932 return 'u';
866 } 933 }
867 default: { 934 default: {
868 // Extended identity escape. We accept any character that hasn't 935 // Extended identity escape. We accept any character that hasn't
869 // been matched by a more specific case, not just the subset required 936 // been matched by a more specific case, not just the subset required
870 // by the ECMAScript specification. 937 // by the ECMAScript specification.
871 uc32 result = current(); 938 uint32_t result = current();
872 Advance(); 939 Advance();
873 return result; 940 return result;
874 } 941 }
875 } 942 }
876 return 0; 943 return 0;
877 } 944 }
878 945
879 946
880 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 947 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {
881 DCHECK_EQ(0, *char_class); 948 ASSERT(0 == *char_class);
882 uc32 first = current(); 949 uint32_t first = current();
883 if (first == '\\') { 950 if (first == '\\') {
884 switch (Next()) { 951 switch (Next()) {
885 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { 952 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
886 *char_class = Next(); 953 *char_class = Next();
887 Advance(2); 954 Advance(2);
888 return CharacterRange::Singleton(0); // Return dummy value. 955 return CharacterRange::Singleton(0); // Return dummy value.
889 } 956 }
890 case kEndMarker: 957 case kEndMarker:
891 return ReportError(CStrVector("\\ at end of pattern")); 958 ReportError("\\ at end of pattern");
959 UNREACHABLE();
892 default: 960 default:
893 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); 961 uint32_t c = ParseClassCharacterEscape();
894 return CharacterRange::Singleton(c); 962 return CharacterRange::Singleton(c);
895 } 963 }
896 } else { 964 } else {
897 Advance(); 965 Advance();
898 return CharacterRange::Singleton(first); 966 return CharacterRange::Singleton(first);
899 } 967 }
900 } 968 }
901 969
902 970
903 static const uc16 kNoCharClass = 0; 971 static const uint16_t kNoCharClass = 0;
904 972
905 // Adds range or pre-defined character class to character ranges. 973 // Adds range or pre-defined character class to character ranges.
906 // If char_class is not kInvalidClass, it's interpreted as a class 974 // If char_class is not kInvalidClass, it's interpreted as a class
907 // escape (i.e., 's' means whitespace, from '\s'). 975 // escape (i.e., 's' means whitespace, from '\s').
908 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 976 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,
909 uc16 char_class, 977 uint16_t char_class,
910 CharacterRange range, 978 CharacterRange range) {
911 Zone* zone) {
912 if (char_class != kNoCharClass) { 979 if (char_class != kNoCharClass) {
913 CharacterRange::AddClassEscape(char_class, ranges, zone); 980 CharacterRange::AddClassEscape(char_class, ranges);
914 } else { 981 } else {
915 ranges->Add(range, zone); 982 ranges->Add(range);
916 } 983 }
917 } 984 }
918 985
919 986
920 RegExpTree* RegExpParser::ParseCharacterClass() { 987 RegExpTree* RegExpParser::ParseCharacterClass() {
921 static const char* kUnterminated = "Unterminated character class"; 988 static const char* kUnterminated = "Unterminated character class";
922 static const char* kRangeOutOfOrder = "Range out of order in character class"; 989 static const char* kRangeOutOfOrder = "Range out of order in character class";
923 990
924 DCHECK_EQ(current(), '['); 991 ASSERT(current() == '[');
925 Advance(); 992 Advance();
926 bool is_negated = false; 993 bool is_negated = false;
927 if (current() == '^') { 994 if (current() == '^') {
928 is_negated = true; 995 is_negated = true;
929 Advance(); 996 Advance();
930 } 997 }
931 ZoneList<CharacterRange>* ranges = 998 ZoneGrowableArray<CharacterRange>* ranges =
932 new(zone()) ZoneList<CharacterRange>(2, zone()); 999 new(I) ZoneGrowableArray<CharacterRange>(2);
933 while (has_more() && current() != ']') { 1000 while (has_more() && current() != ']') {
934 uc16 char_class = kNoCharClass; 1001 uint16_t char_class = kNoCharClass;
935 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 1002 CharacterRange first = ParseClassAtom(&char_class);
936 if (current() == '-') { 1003 if (current() == '-') {
937 Advance(); 1004 Advance();
938 if (current() == kEndMarker) { 1005 if (current() == kEndMarker) {
939 // If we reach the end we break out of the loop and let the 1006 // If we reach the end we break out of the loop and let the
940 // following code report an error. 1007 // following code report an error.
941 break; 1008 break;
942 } else if (current() == ']') { 1009 } else if (current() == ']') {
943 AddRangeOrEscape(ranges, char_class, first, zone()); 1010 AddRangeOrEscape(ranges, char_class, first);
944 ranges->Add(CharacterRange::Singleton('-'), zone()); 1011 ranges->Add(CharacterRange::Singleton('-'));
945 break; 1012 break;
946 } 1013 }
947 uc16 char_class_2 = kNoCharClass; 1014 uint16_t char_class_2 = kNoCharClass;
948 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 1015 CharacterRange next = ParseClassAtom(&char_class_2);
949 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 1016 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
950 // Either end is an escaped character class. Treat the '-' verbatim. 1017 // Either end is an escaped character class. Treat the '-' verbatim.
951 AddRangeOrEscape(ranges, char_class, first, zone()); 1018 AddRangeOrEscape(ranges, char_class, first);
952 ranges->Add(CharacterRange::Singleton('-'), zone()); 1019 ranges->Add(CharacterRange::Singleton('-'));
953 AddRangeOrEscape(ranges, char_class_2, next, zone()); 1020 AddRangeOrEscape(ranges, char_class_2, next);
954 continue; 1021 continue;
955 } 1022 }
956 if (first.from() > next.to()) { 1023 if (first.from() > next.to()) {
957 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); 1024 ReportError(kRangeOutOfOrder);
1025 UNREACHABLE();
958 } 1026 }
959 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 1027 ranges->Add(CharacterRange::Range(first.from(), next.to()));
960 } else { 1028 } else {
961 AddRangeOrEscape(ranges, char_class, first, zone()); 1029 AddRangeOrEscape(ranges, char_class, first);
962 } 1030 }
963 } 1031 }
964 if (!has_more()) { 1032 if (!has_more()) {
965 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); 1033 ReportError(kUnterminated);
1034 UNREACHABLE();
966 } 1035 }
967 Advance(); 1036 Advance();
968 if (ranges->length() == 0) { 1037 if (ranges->length() == 0) {
969 ranges->Add(CharacterRange::Everything(), zone()); 1038 ranges->Add(CharacterRange::Everything());
970 is_negated = !is_negated; 1039 is_negated = !is_negated;
971 } 1040 }
972 return new(zone()) RegExpCharacterClass(ranges, is_negated); 1041 return new(I) RegExpCharacterClass(ranges, is_negated);
973 } 1042 }
974 1043
975 1044
976 // ---------------------------------------------------------------------------- 1045 // ----------------------------------------------------------------------------
977 // The Parser interface. 1046 // The Parser interface.
978 1047
979 bool RegExpParser::ParseRegExp(FlatStringReader* input, 1048 bool RegExpParser::ParseRegExp(const String& input,
980 bool multiline, 1049 bool multiline,
981 RegExpCompileData* result, 1050 RegExpCompileData* result) {
982 Zone* zone) { 1051 ASSERT(result != NULL);
983 DCHECK(result != NULL); 1052 LongJumpScope jump;
984 RegExpParser parser(input, &result->error, multiline, zone); 1053 RegExpParser parser(input, &result->error, multiline);
985 RegExpTree* tree = parser.ParsePattern(); 1054 if (setjmp(*jump.Set()) == 0) {
986 if (parser.failed()) { 1055 RegExpTree* tree = parser.ParsePattern();
987 DCHECK(tree == NULL); 1056 ASSERT(tree != NULL);
988 DCHECK(!result->error.is_null()); 1057 ASSERT(result->error.IsNull());
989 } else {
990 DCHECK(tree != NULL);
991 DCHECK(result->error.is_null());
992 result->tree = tree; 1058 result->tree = tree;
993 int capture_count = parser.captures_started(); 1059 intptr_t capture_count = parser.captures_started();
994 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1060 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
995 result->contains_anchor = parser.contains_anchor(); 1061 result->contains_anchor = parser.contains_anchor();
996 result->capture_count = capture_count; 1062 result->capture_count = capture_count;
1063 } else {
1064 ASSERT(!result->error.IsNull());
1065 Isolate::Current()->object_store()->clear_sticky_error();
1066
1067 // Throw a FormatException on parsing failures.
1068 const String& message = String::Handle(
1069 String::Concat(result->error, input));
1070 const Array& args = Array::Handle(Array::New(1));
1071 args.SetAt(0, message);
1072
1073 Exceptions::ThrowByType(Exceptions::kFormat, args);
997 } 1074 }
998 return !parser.failed(); 1075 return !parser.failed();
999 } 1076 }
1000 1077
1001 // SNIP
1002
1003 } // namespace dart 1078 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698