Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 2481873005: clang-format runtime/vm (Closed)
Patch Set: Merge Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/regexp_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/longjump.h" 5 #include "vm/longjump.h"
6 #include "vm/object_store.h" 6 #include "vm/object_store.h"
7 #include "vm/regexp_parser.h" 7 #include "vm/regexp_parser.h"
8 8
9 namespace dart { 9 namespace dart {
10 10
11 #define Z zone() 11 #define Z zone()
12 12
13 // Enables possessive quantifier syntax for testing. 13 // Enables possessive quantifier syntax for testing.
14 static const bool FLAG_regexp_possessive_quantifier = false; 14 static const bool FLAG_regexp_possessive_quantifier = false;
15 15
16 RegExpBuilder::RegExpBuilder() 16 RegExpBuilder::RegExpBuilder()
17 : zone_(Thread::Current()->zone()), 17 : zone_(Thread::Current()->zone()),
18 pending_empty_(false), 18 pending_empty_(false),
19 characters_(NULL), 19 characters_(NULL),
20 terms_(), 20 terms_(),
21 text_(), 21 text_(),
22 alternatives_() 22 alternatives_()
23 #ifdef DEBUG 23 #ifdef DEBUG
24 , last_added_(ADD_NONE) 24 ,
25 last_added_(ADD_NONE)
25 #endif 26 #endif
26 {} 27 {
28 }
27 29
28 30
29 void RegExpBuilder::FlushCharacters() { 31 void RegExpBuilder::FlushCharacters() {
30 pending_empty_ = false; 32 pending_empty_ = false;
31 if (characters_ != NULL) { 33 if (characters_ != NULL) {
32 RegExpTree* atom = new(Z) RegExpAtom(characters_); 34 RegExpTree* atom = new (Z) RegExpAtom(characters_);
33 characters_ = NULL; 35 characters_ = NULL;
34 text_.Add(atom); 36 text_.Add(atom);
35 LAST(ADD_ATOM); 37 LAST(ADD_ATOM);
36 } 38 }
37 } 39 }
38 40
39 41
40 void RegExpBuilder::FlushText() { 42 void RegExpBuilder::FlushText() {
41 FlushCharacters(); 43 FlushCharacters();
42 intptr_t num_text = text_.length(); 44 intptr_t num_text = text_.length();
43 if (num_text == 0) { 45 if (num_text == 0) {
44 return; 46 return;
45 } else if (num_text == 1) { 47 } else if (num_text == 1) {
46 terms_.Add(text_.Last()); 48 terms_.Add(text_.Last());
47 } else { 49 } else {
48 RegExpText* text = new(Z) RegExpText(); 50 RegExpText* text = new (Z) RegExpText();
49 for (intptr_t i = 0; i < num_text; i++) 51 for (intptr_t i = 0; i < num_text; i++)
50 text_[i]->AppendToText(text); 52 text_[i]->AppendToText(text);
51 terms_.Add(text); 53 terms_.Add(text);
52 } 54 }
53 text_.Clear(); 55 text_.Clear();
54 } 56 }
55 57
56 58
57 void RegExpBuilder::AddCharacter(uint16_t c) { 59 void RegExpBuilder::AddCharacter(uint16_t c) {
58 pending_empty_ = false; 60 pending_empty_ = false;
59 if (characters_ == NULL) { 61 if (characters_ == NULL) {
60 characters_ = new(Z) ZoneGrowableArray<uint16_t>(4); 62 characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);
61 } 63 }
62 characters_->Add(c); 64 characters_->Add(c);
63 LAST(ADD_CHAR); 65 LAST(ADD_CHAR);
64 } 66 }
65 67
66 68
67 void RegExpBuilder::AddEmpty() { 69 void RegExpBuilder::AddEmpty() {
68 pending_empty_ = true; 70 pending_empty_ = true;
69 } 71 }
70 72
(...skipping 29 matching lines...) Expand all
100 void RegExpBuilder::FlushTerms() { 102 void RegExpBuilder::FlushTerms() {
101 FlushText(); 103 FlushText();
102 intptr_t num_terms = terms_.length(); 104 intptr_t num_terms = terms_.length();
103 RegExpTree* alternative; 105 RegExpTree* alternative;
104 if (num_terms == 0) { 106 if (num_terms == 0) {
105 alternative = RegExpEmpty::GetInstance(); 107 alternative = RegExpEmpty::GetInstance();
106 } else if (num_terms == 1) { 108 } else if (num_terms == 1) {
107 alternative = terms_.Last(); 109 alternative = terms_.Last();
108 } else { 110 } else {
109 ZoneGrowableArray<RegExpTree*>* terms = 111 ZoneGrowableArray<RegExpTree*>* terms =
110 new(Z) ZoneGrowableArray<RegExpTree*>(); 112 new (Z) ZoneGrowableArray<RegExpTree*>();
111 for (intptr_t i = 0; i < terms_.length(); i++) { 113 for (intptr_t i = 0; i < terms_.length(); i++) {
112 terms->Add(terms_[i]); 114 terms->Add(terms_[i]);
113 } 115 }
114 alternative = new(Z) RegExpAlternative(terms); 116 alternative = new (Z) RegExpAlternative(terms);
115 } 117 }
116 alternatives_.Add(alternative); 118 alternatives_.Add(alternative);
117 terms_.Clear(); 119 terms_.Clear();
118 LAST(ADD_NONE); 120 LAST(ADD_NONE);
119 } 121 }
120 122
121 123
122 RegExpTree* RegExpBuilder::ToRegExp() { 124 RegExpTree* RegExpBuilder::ToRegExp() {
123 FlushTerms(); 125 FlushTerms();
124 intptr_t num_alternatives = alternatives_.length(); 126 intptr_t num_alternatives = alternatives_.length();
125 if (num_alternatives == 0) { 127 if (num_alternatives == 0) {
126 return RegExpEmpty::GetInstance(); 128 return RegExpEmpty::GetInstance();
127 } 129 }
128 if (num_alternatives == 1) { 130 if (num_alternatives == 1) {
129 return alternatives_.Last(); 131 return alternatives_.Last();
130 } 132 }
131 ZoneGrowableArray<RegExpTree*>* alternatives = 133 ZoneGrowableArray<RegExpTree*>* alternatives =
132 new(Z) ZoneGrowableArray<RegExpTree*>(); 134 new (Z) ZoneGrowableArray<RegExpTree*>();
133 for (intptr_t i = 0; i < alternatives_.length(); i++) { 135 for (intptr_t i = 0; i < alternatives_.length(); i++) {
134 alternatives->Add(alternatives_[i]); 136 alternatives->Add(alternatives_[i]);
135 } 137 }
136 return new(Z) RegExpDisjunction(alternatives); 138 return new (Z) RegExpDisjunction(alternatives);
137 } 139 }
138 140
139 141
140 void RegExpBuilder::AddQuantifierToAtom( 142 void RegExpBuilder::AddQuantifierToAtom(
141 intptr_t min, 143 intptr_t min,
142 intptr_t max, 144 intptr_t max,
143 RegExpQuantifier::QuantifierType quantifier_type) { 145 RegExpQuantifier::QuantifierType quantifier_type) {
144 if (pending_empty_) { 146 if (pending_empty_) {
145 pending_empty_ = false; 147 pending_empty_ = false;
146 return; 148 return;
147 } 149 }
148 RegExpTree* atom; 150 RegExpTree* atom;
149 if (characters_ != NULL) { 151 if (characters_ != NULL) {
150 DEBUG_ASSERT(last_added_ == ADD_CHAR); 152 DEBUG_ASSERT(last_added_ == ADD_CHAR);
151 // Last atom was character. 153 // Last atom was character.
152 154
153 ZoneGrowableArray<uint16_t> *char_vector = 155 ZoneGrowableArray<uint16_t>* char_vector =
154 new(Z) ZoneGrowableArray<uint16_t>(); 156 new (Z) ZoneGrowableArray<uint16_t>();
155 char_vector->AddArray(*characters_); 157 char_vector->AddArray(*characters_);
156 intptr_t num_chars = char_vector->length(); 158 intptr_t num_chars = char_vector->length();
157 if (num_chars > 1) { 159 if (num_chars > 1) {
158 ZoneGrowableArray<uint16_t> *prefix = 160 ZoneGrowableArray<uint16_t>* prefix =
159 new(Z) ZoneGrowableArray<uint16_t>(); 161 new (Z) ZoneGrowableArray<uint16_t>();
160 for (intptr_t i = 0; i < num_chars - 1; i++) { 162 for (intptr_t i = 0; i < num_chars - 1; i++) {
161 prefix->Add(char_vector->At(i)); 163 prefix->Add(char_vector->At(i));
162 } 164 }
163 text_.Add(new(Z) RegExpAtom(prefix)); 165 text_.Add(new (Z) RegExpAtom(prefix));
164 ZoneGrowableArray<uint16_t> *tail = new(Z) ZoneGrowableArray<uint16_t>(); 166 ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();
165 tail->Add(char_vector->At(num_chars - 1)); 167 tail->Add(char_vector->At(num_chars - 1));
166 char_vector = tail; 168 char_vector = tail;
167 } 169 }
168 characters_ = NULL; 170 characters_ = NULL;
169 atom = new(Z) RegExpAtom(char_vector); 171 atom = new (Z) RegExpAtom(char_vector);
170 FlushText(); 172 FlushText();
171 } else if (text_.length() > 0) { 173 } else if (text_.length() > 0) {
172 DEBUG_ASSERT(last_added_ == ADD_ATOM); 174 DEBUG_ASSERT(last_added_ == ADD_ATOM);
173 atom = text_.RemoveLast(); 175 atom = text_.RemoveLast();
174 FlushText(); 176 FlushText();
175 } else if (terms_.length() > 0) { 177 } else if (terms_.length() > 0) {
176 DEBUG_ASSERT(last_added_ == ADD_ATOM); 178 DEBUG_ASSERT(last_added_ == ADD_ATOM);
177 atom = terms_.RemoveLast(); 179 atom = terms_.RemoveLast();
178 if (atom->max_match() == 0) { 180 if (atom->max_match() == 0) {
179 // Guaranteed to only match an empty string. 181 // Guaranteed to only match an empty string.
180 LAST(ADD_TERM); 182 LAST(ADD_TERM);
181 if (min == 0) { 183 if (min == 0) {
182 return; 184 return;
183 } 185 }
184 terms_.Add(atom); 186 terms_.Add(atom);
185 return; 187 return;
186 } 188 }
187 } else { 189 } else {
188 // Only call immediately after adding an atom or character! 190 // Only call immediately after adding an atom or character!
189 UNREACHABLE(); 191 UNREACHABLE();
190 return; 192 return;
191 } 193 }
192 terms_.Add(new(Z) RegExpQuantifier(min, max, quantifier_type, atom)); 194 terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
193 LAST(ADD_TERM); 195 LAST(ADD_TERM);
194 } 196 }
195 197
196 // ---------------------------------------------------------------------------- 198 // ----------------------------------------------------------------------------
197 // Implementation of Parser 199 // Implementation of Parser
198 200
199 RegExpParser::RegExpParser(const String& in, 201 RegExpParser::RegExpParser(const String& in, String* error, bool multiline)
200 String* error,
201 bool multiline)
202 : zone_(Thread::Current()->zone()), 202 : zone_(Thread::Current()->zone()),
203 error_(error), 203 error_(error),
204 captures_(NULL), 204 captures_(NULL),
205 in_(in), 205 in_(in),
206 current_(kEndMarker), 206 current_(kEndMarker),
207 next_pos_(0), 207 next_pos_(0),
208 capture_count_(0), 208 capture_count_(0),
209 has_more_(true), 209 has_more_(true),
210 multiline_(multiline), 210 multiline_(multiline),
211 simple_(false), 211 simple_(false),
212 contains_anchor_(false), 212 contains_anchor_(false),
213 is_scanned_for_captures_(false), 213 is_scanned_for_captures_(false),
214 failed_(false) { 214 failed_(false) {
215 Advance(); 215 Advance();
216 } 216 }
217 217
218 218
219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) { 219 bool RegExpParser::ParseFunction(ParsedFunction* parsed_function) {
220 VMTagScope tagScope(parsed_function->thread(), 220 VMTagScope tagScope(parsed_function->thread(),
221 VMTag::kCompileParseRegExpTagId); 221 VMTag::kCompileParseRegExpTagId);
222 Zone* zone = parsed_function->zone(); 222 Zone* zone = parsed_function->zone();
223 RegExp& regexp = RegExp::Handle(parsed_function->function().regexp()); 223 RegExp& regexp = RegExp::Handle(parsed_function->function().regexp());
224 224
225 const String& pattern = String::Handle(regexp.pattern()); 225 const String& pattern = String::Handle(regexp.pattern());
226 const bool multiline = regexp.is_multi_line(); 226 const bool multiline = regexp.is_multi_line();
227 227
228 RegExpCompileData* compile_data = new(zone) RegExpCompileData(); 228 RegExpCompileData* compile_data = new (zone) RegExpCompileData();
229 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) { 229 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {
230 // Parsing failures are handled in the RegExp factory constructor. 230 // Parsing failures are handled in the RegExp factory constructor.
231 UNREACHABLE(); 231 UNREACHABLE();
232 } 232 }
233 233
234 regexp.set_num_bracket_expressions(compile_data->capture_count); 234 regexp.set_num_bracket_expressions(compile_data->capture_count);
235 if (compile_data->simple) { 235 if (compile_data->simple) {
236 regexp.set_is_simple(); 236 regexp.set_is_simple();
237 } else { 237 } else {
238 regexp.set_is_complex(); 238 regexp.set_is_complex();
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
320 // Atom 320 // Atom
321 // Atom Quantifier 321 // Atom Quantifier
322 RegExpTree* RegExpParser::ParseDisjunction() { 322 RegExpTree* RegExpParser::ParseDisjunction() {
323 // Used to store current state while parsing subexpressions. 323 // Used to store current state while parsing subexpressions.
324 RegExpParserState initial_state(NULL, INITIAL, 0, Z); 324 RegExpParserState initial_state(NULL, INITIAL, 0, Z);
325 RegExpParserState* stored_state = &initial_state; 325 RegExpParserState* stored_state = &initial_state;
326 // Cache the builder in a local variable for quick access. 326 // Cache the builder in a local variable for quick access.
327 RegExpBuilder* builder = initial_state.builder(); 327 RegExpBuilder* builder = initial_state.builder();
328 while (true) { 328 while (true) {
329 switch (current()) { 329 switch (current()) {
330 case kEndMarker: 330 case kEndMarker:
331 if (stored_state->IsSubexpression()) { 331 if (stored_state->IsSubexpression()) {
332 // Inside a parenthesized group when hitting end of input. 332 // Inside a parenthesized group when hitting end of input.
333 ReportError("Unterminated group"); 333 ReportError("Unterminated group");
334 UNREACHABLE();
335 }
336 ASSERT(INITIAL == stored_state->group_type());
337 // Parsing completed successfully.
338 return builder->ToRegExp();
339 case ')': {
340 if (!stored_state->IsSubexpression()) {
341 ReportError("Unmatched ')'");
342 UNREACHABLE();
343 }
344 ASSERT(INITIAL != stored_state->group_type());
345
346 Advance();
347 // End disjunction parsing and convert builder content to new single
348 // regexp atom.
349 RegExpTree* body = builder->ToRegExp();
350
351 intptr_t end_capture_index = captures_started();
352
353 intptr_t capture_index = stored_state->capture_index();
354 SubexpressionType group_type = stored_state->group_type();
355
356 // Restore previous state.
357 stored_state = stored_state->previous_state();
358 builder = stored_state->builder();
359
360 // Build result of subexpression.
361 if (group_type == CAPTURE) {
362 RegExpCapture* capture = new (Z) RegExpCapture(body, capture_index);
363 (*captures_)[capture_index - 1] = capture;
364 body = capture;
365 } else if (group_type != GROUPING) {
366 ASSERT(group_type == POSITIVE_LOOKAHEAD ||
367 group_type == NEGATIVE_LOOKAHEAD);
368 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
369 body = new (Z)
370 RegExpLookahead(body, is_positive,
371 end_capture_index - capture_index, capture_index);
372 }
373 builder->AddAtom(body);
374 // For compatibility with JSC and ES3, we allow quantifiers after
375 // lookaheads, and break in all cases.
376 break;
377 }
378 case '|': {
379 Advance();
380 builder->NewAlternative();
381 continue;
382 }
383 case '*':
384 case '+':
385 case '?':
386 ReportError("Nothing to repeat");
334 UNREACHABLE(); 387 UNREACHABLE();
335 } 388 case '^': {
336 ASSERT(INITIAL == stored_state->group_type()); 389 Advance();
337 // Parsing completed successfully. 390 if (multiline_) {
338 return builder->ToRegExp(); 391 builder->AddAssertion(
339 case ')': { 392 new (Z) RegExpAssertion(RegExpAssertion::START_OF_LINE));
340 if (!stored_state->IsSubexpression()) { 393 } else {
341 ReportError("Unmatched ')'"); 394 builder->AddAssertion(
342 UNREACHABLE(); 395 new (Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
343 } 396 set_contains_anchor();
344 ASSERT(INITIAL != stored_state->group_type()); 397 }
345 398 continue;
346 Advance(); 399 }
347 // End disjunction parsing and convert builder content to new single 400 case '$': {
348 // regexp atom. 401 Advance();
349 RegExpTree* body = builder->ToRegExp(); 402 RegExpAssertion::AssertionType assertion_type =
350 403 multiline_ ? RegExpAssertion::END_OF_LINE
351 intptr_t end_capture_index = captures_started(); 404 : RegExpAssertion::END_OF_INPUT;
352 405 builder->AddAssertion(new RegExpAssertion(assertion_type));
353 intptr_t capture_index = stored_state->capture_index(); 406 continue;
354 SubexpressionType group_type = stored_state->group_type(); 407 }
355 408 case '.': {
356 // Restore previous state. 409 Advance();
357 stored_state = stored_state->previous_state(); 410 // everything except \x0a, \x0d, \u2028 and \u2029
358 builder = stored_state->builder();
359
360 // Build result of subexpression.
361 if (group_type == CAPTURE) {
362 RegExpCapture* capture = new(Z) RegExpCapture(body, capture_index);
363 (*captures_)[capture_index - 1] = capture;
364 body = capture;
365 } else if (group_type != GROUPING) {
366 ASSERT(group_type == POSITIVE_LOOKAHEAD ||
367 group_type == NEGATIVE_LOOKAHEAD);
368 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
369 body = new(Z) RegExpLookahead(body,
370 is_positive,
371 end_capture_index - capture_index,
372 capture_index);
373 }
374 builder->AddAtom(body);
375 // For compatibility with JSC and ES3, we allow quantifiers after
376 // lookaheads, and break in all cases.
377 break;
378 }
379 case '|': {
380 Advance();
381 builder->NewAlternative();
382 continue;
383 }
384 case '*':
385 case '+':
386 case '?':
387 ReportError("Nothing to repeat");
388 UNREACHABLE();
389 case '^': {
390 Advance();
391 if (multiline_) {
392 builder->AddAssertion(
393 new(Z) RegExpAssertion(RegExpAssertion::START_OF_LINE));
394 } else {
395 builder->AddAssertion(
396 new(Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
397 set_contains_anchor();
398 }
399 continue;
400 }
401 case '$': {
402 Advance();
403 RegExpAssertion::AssertionType assertion_type =
404 multiline_ ? RegExpAssertion::END_OF_LINE :
405 RegExpAssertion::END_OF_INPUT;
406 builder->AddAssertion(new RegExpAssertion(assertion_type));
407 continue;
408 }
409 case '.': {
410 Advance();
411 // everything except \x0a, \x0d, \u2028 and \u2029
412 ZoneGrowableArray<CharacterRange>* ranges =
413 new ZoneGrowableArray<CharacterRange>(2);
414 CharacterRange::AddClassEscape('.', ranges);
415 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
416 builder->AddAtom(atom);
417 break;
418 }
419 case '(': {
420 SubexpressionType subexpr_type = CAPTURE;
421 Advance();
422 if (current() == '?') {
423 switch (Next()) {
424 case ':':
425 subexpr_type = GROUPING;
426 break;
427 case '=':
428 subexpr_type = POSITIVE_LOOKAHEAD;
429 break;
430 case '!':
431 subexpr_type = NEGATIVE_LOOKAHEAD;
432 break;
433 default:
434 ReportError("Invalid group");
435 UNREACHABLE();
436 }
437 Advance(2);
438 } else {
439 if (captures_ == NULL) {
440 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
441 }
442 if (captures_started() >= kMaxCaptures) {
443 ReportError("Too many captures");
444 UNREACHABLE();
445 }
446 captures_->Add(NULL);
447 }
448 // Store current state and begin new disjunction parsing.
449 stored_state = new RegExpParserState(stored_state, subexpr_type,
450 captures_started(), Z);
451 builder = stored_state->builder();
452 continue;
453 }
454 case '[': {
455 RegExpTree* atom = ParseCharacterClass();
456 builder->AddAtom(atom);
457 break;
458 }
459 // Atom ::
460 // \ AtomEscape
461 case '\\':
462 switch (Next()) {
463 case kEndMarker:
464 ReportError("\\ at end of pattern");
465 UNREACHABLE();
466 case 'b':
467 Advance(2);
468 builder->AddAssertion(
469 new RegExpAssertion(RegExpAssertion::BOUNDARY));
470 continue;
471 case 'B':
472 Advance(2);
473 builder->AddAssertion(
474 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
475 continue;
476 // AtomEscape ::
477 // CharacterClassEscape
478 //
479 // CharacterClassEscape :: one of
480 // d D s S w W
481 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
482 uint32_t c = Next();
483 Advance(2);
484 ZoneGrowableArray<CharacterRange>* ranges = 411 ZoneGrowableArray<CharacterRange>* ranges =
485 new ZoneGrowableArray<CharacterRange>(2); 412 new ZoneGrowableArray<CharacterRange>(2);
486 CharacterRange::AddClassEscape(c, ranges); 413 CharacterRange::AddClassEscape('.', ranges);
487 RegExpTree* atom = new RegExpCharacterClass(ranges, false); 414 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
488 builder->AddAtom(atom); 415 builder->AddAtom(atom);
489 break; 416 break;
490 } 417 }
491 case '1': case '2': case '3': case '4': case '5': case '6': 418 case '(': {
492 case '7': case '8': case '9': { 419 SubexpressionType subexpr_type = CAPTURE;
493 intptr_t index = 0; 420 Advance();
494 if (ParseBackReferenceIndex(&index)) { 421 if (current() == '?') {
495 RegExpCapture* capture = NULL; 422 switch (Next()) {
496 if (captures_ != NULL && index <= captures_->length()) { 423 case ':':
497 capture = captures_->At(index - 1); 424 subexpr_type = GROUPING;
498 } 425 break;
499 if (capture == NULL) { 426 case '=':
500 builder->AddEmpty(); 427 subexpr_type = POSITIVE_LOOKAHEAD;
501 break; 428 break;
502 } 429 case '!':
503 RegExpTree* atom = new RegExpBackReference(capture); 430 subexpr_type = NEGATIVE_LOOKAHEAD;
504 builder->AddAtom(atom); 431 break;
505 break; 432 default:
506 } 433 ReportError("Invalid group");
507 uint32_t first_digit = Next(); 434 UNREACHABLE();
508 if (first_digit == '8' || first_digit == '9') { 435 }
509 // Treat as identity escape
510 builder->AddCharacter(first_digit);
511 Advance(2); 436 Advance(2);
512 break;
513 }
514 }
515 // FALLTHROUGH
516 case '0': {
517 Advance();
518 uint32_t octal = ParseOctalLiteral();
519 builder->AddCharacter(octal);
520 break;
521 }
522 // ControlEscape :: one of
523 // f n r t v
524 case 'f':
525 Advance(2);
526 builder->AddCharacter('\f');
527 break;
528 case 'n':
529 Advance(2);
530 builder->AddCharacter('\n');
531 break;
532 case 'r':
533 Advance(2);
534 builder->AddCharacter('\r');
535 break;
536 case 't':
537 Advance(2);
538 builder->AddCharacter('\t');
539 break;
540 case 'v':
541 Advance(2);
542 builder->AddCharacter('\v');
543 break;
544 case 'c': {
545 Advance();
546 uint32_t controlLetter = Next();
547 // Special case if it is an ASCII letter.
548 // Convert lower case letters to uppercase.
549 uint32_t letter = controlLetter & ~('a' ^ 'A');
550 if (letter < 'A' || 'Z' < letter) {
551 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
552 // This is outside the specification. We match JSC in
553 // reading the backslash as a literal character instead
554 // of as starting an escape.
555 builder->AddCharacter('\\');
556 } else { 437 } else {
557 Advance(2); 438 if (captures_ == NULL) {
558 builder->AddCharacter(controlLetter & 0x1f); 439 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
559 } 440 }
560 break; 441 if (captures_started() >= kMaxCaptures) {
561 } 442 ReportError("Too many captures");
562 case 'x': { 443 UNREACHABLE();
563 Advance(2); 444 }
564 uint32_t value; 445 captures_->Add(NULL);
565 if (ParseHexEscape(2, &value)) { 446 }
566 builder->AddCharacter(value); 447 // Store current state and begin new disjunction parsing.
567 } else { 448 stored_state = new RegExpParserState(stored_state, subexpr_type,
568 builder->AddCharacter('x'); 449 captures_started(), Z);
569 } 450 builder = stored_state->builder();
570 break; 451 continue;
571 } 452 }
572 case 'u': { 453 case '[': {
573 Advance(2); 454 RegExpTree* atom = ParseCharacterClass();
574 uint32_t value; 455 builder->AddAtom(atom);
575 if (ParseHexEscape(4, &value)) { 456 break;
576 builder->AddCharacter(value); 457 }
577 } else { 458 // Atom ::
578 builder->AddCharacter('u'); 459 // \ AtomEscape
579 } 460 case '\\':
580 break; 461 switch (Next()) {
462 case kEndMarker:
463 ReportError("\\ at end of pattern");
464 UNREACHABLE();
465 case 'b':
466 Advance(2);
467 builder->AddAssertion(
468 new RegExpAssertion(RegExpAssertion::BOUNDARY));
469 continue;
470 case 'B':
471 Advance(2);
472 builder->AddAssertion(
473 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
474 continue;
475 // AtomEscape ::
476 // CharacterClassEscape
477 //
478 // CharacterClassEscape :: one of
479 // d D s S w W
480 case 'd':
481 case 'D':
482 case 's':
483 case 'S':
484 case 'w':
485 case 'W': {
486 uint32_t c = Next();
487 Advance(2);
488 ZoneGrowableArray<CharacterRange>* ranges =
489 new ZoneGrowableArray<CharacterRange>(2);
490 CharacterRange::AddClassEscape(c, ranges);
491 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
492 builder->AddAtom(atom);
493 break;
494 }
495 case '1':
496 case '2':
497 case '3':
498 case '4':
499 case '5':
500 case '6':
501 case '7':
502 case '8':
503 case '9': {
504 intptr_t index = 0;
505 if (ParseBackReferenceIndex(&index)) {
506 RegExpCapture* capture = NULL;
507 if (captures_ != NULL && index <= captures_->length()) {
508 capture = captures_->At(index - 1);
509 }
510 if (capture == NULL) {
511 builder->AddEmpty();
512 break;
513 }
514 RegExpTree* atom = new RegExpBackReference(capture);
515 builder->AddAtom(atom);
516 break;
517 }
518 uint32_t first_digit = Next();
519 if (first_digit == '8' || first_digit == '9') {
520 // Treat as identity escape
521 builder->AddCharacter(first_digit);
522 Advance(2);
523 break;
524 }
525 }
526 // FALLTHROUGH
527 case '0': {
528 Advance();
529 uint32_t octal = ParseOctalLiteral();
530 builder->AddCharacter(octal);
531 break;
532 }
533 // ControlEscape :: one of
534 // f n r t v
535 case 'f':
536 Advance(2);
537 builder->AddCharacter('\f');
538 break;
539 case 'n':
540 Advance(2);
541 builder->AddCharacter('\n');
542 break;
543 case 'r':
544 Advance(2);
545 builder->AddCharacter('\r');
546 break;
547 case 't':
548 Advance(2);
549 builder->AddCharacter('\t');
550 break;
551 case 'v':
552 Advance(2);
553 builder->AddCharacter('\v');
554 break;
555 case 'c': {
556 Advance();
557 uint32_t controlLetter = Next();
558 // Special case if it is an ASCII letter.
559 // Convert lower case letters to uppercase.
560 uint32_t letter = controlLetter & ~('a' ^ 'A');
561 if (letter < 'A' || 'Z' < letter) {
562 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
563 // This is outside the specification. We match JSC in
564 // reading the backslash as a literal character instead
565 // of as starting an escape.
566 builder->AddCharacter('\\');
567 } else {
568 Advance(2);
569 builder->AddCharacter(controlLetter & 0x1f);
570 }
571 break;
572 }
573 case 'x': {
574 Advance(2);
575 uint32_t value;
576 if (ParseHexEscape(2, &value)) {
577 builder->AddCharacter(value);
578 } else {
579 builder->AddCharacter('x');
580 }
581 break;
582 }
583 case 'u': {
584 Advance(2);
585 uint32_t value;
586 if (ParseHexEscape(4, &value)) {
587 builder->AddCharacter(value);
588 } else {
589 builder->AddCharacter('u');
590 }
591 break;
592 }
593 default:
594 // Identity escape.
595 builder->AddCharacter(Next());
596 Advance(2);
597 break;
598 }
599 break;
600 case '{': {
601 intptr_t dummy;
602 if (ParseIntervalQuantifier(&dummy, &dummy)) {
603 ReportError("Nothing to repeat");
604 UNREACHABLE();
605 }
606 // fallthrough
581 } 607 }
582 default: 608 default:
583 // Identity escape. 609 builder->AddCharacter(current());
584 builder->AddCharacter(Next()); 610 Advance();
585 Advance(2); 611 break;
586 break;
587 }
588 break;
589 case '{': {
590 intptr_t dummy;
591 if (ParseIntervalQuantifier(&dummy, &dummy)) {
592 ReportError("Nothing to repeat");
593 UNREACHABLE();
594 }
595 // fallthrough
596 }
597 default:
598 builder->AddCharacter(current());
599 Advance();
600 break;
601 } // end switch(current()) 612 } // end switch(current())
602 613
603 intptr_t min; 614 intptr_t min;
604 intptr_t max; 615 intptr_t max;
605 switch (current()) { 616 switch (current()) {
606 // QuantifierPrefix :: 617 // QuantifierPrefix ::
607 // * 618 // *
608 // + 619 // +
609 // ? 620 // ?
610 // { 621 // {
611 case '*': 622 case '*':
612 min = 0; 623 min = 0;
613 max = RegExpTree::kInfinity; 624 max = RegExpTree::kInfinity;
614 Advance(); 625 Advance();
615 break; 626 break;
616 case '+': 627 case '+':
617 min = 1; 628 min = 1;
618 max = RegExpTree::kInfinity; 629 max = RegExpTree::kInfinity;
619 Advance(); 630 Advance();
620 break; 631 break;
621 case '?': 632 case '?':
622 min = 0; 633 min = 0;
623 max = 1; 634 max = 1;
624 Advance(); 635 Advance();
625 break; 636 break;
626 case '{': 637 case '{':
627 if (ParseIntervalQuantifier(&min, &max)) { 638 if (ParseIntervalQuantifier(&min, &max)) {
628 if (max < min) { 639 if (max < min) {
629 ReportError("numbers out of order in {} quantifier."); 640 ReportError("numbers out of order in {} quantifier.");
630 UNREACHABLE(); 641 UNREACHABLE();
631 } 642 }
632 break; 643 break;
633 } else { 644 } else {
634 continue; 645 continue;
635 } 646 }
636 default: 647 default:
637 continue; 648 continue;
638 } 649 }
639 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 650 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
640 if (current() == '?') { 651 if (current() == '?') {
641 quantifier_type = RegExpQuantifier::NON_GREEDY; 652 quantifier_type = RegExpQuantifier::NON_GREEDY;
642 Advance(); 653 Advance();
643 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 654 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
644 // FLAG_regexp_possessive_quantifier is a debug-only flag. 655 // FLAG_regexp_possessive_quantifier is a debug-only flag.
645 quantifier_type = RegExpQuantifier::POSSESSIVE; 656 quantifier_type = RegExpQuantifier::POSSESSIVE;
646 Advance(); 657 Advance();
647 } 658 }
648 builder->AddQuantifierToAtom(min, max, quantifier_type); 659 builder->AddQuantifierToAtom(min, max, quantifier_type);
649 } 660 }
650 } 661 }
651 662
652 663
653 #ifdef DEBUG 664 #ifdef DEBUG
654 // Currently only used in an ASSERT. 665 // Currently only used in an ASSERT.
655 static bool IsSpecialClassEscape(uint32_t c) { 666 static bool IsSpecialClassEscape(uint32_t c) {
656 switch (c) { 667 switch (c) {
657 case 'd': case 'D': 668 case 'd':
658 case 's': case 'S': 669 case 'D':
659 case 'w': case 'W': 670 case 's':
671 case 'S':
672 case 'w':
673 case 'W':
660 return true; 674 return true;
661 default: 675 default:
662 return false; 676 return false;
663 } 677 }
664 } 678 }
665 #endif 679 #endif
666 680
667 681
668 // In order to know whether an escape is a backreference or not we have to scan 682 // In order to know whether an escape is a backreference or not we have to scan
669 // the entire regexp and find the number of capturing parentheses. However we 683 // the entire regexp and find the number of capturing parentheses. However we
(...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after
836 // If c is not a legal hexadecimal character, returns a value < 0. 850 // If c is not a legal hexadecimal character, returns a value < 0.
837 static inline intptr_t HexValue(uint32_t c) { 851 static inline intptr_t HexValue(uint32_t c) {
838 c -= '0'; 852 c -= '0';
839 if (static_cast<unsigned>(c) <= 9) return c; 853 if (static_cast<unsigned>(c) <= 9) return c;
840 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. 854 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
841 if (static_cast<unsigned>(c) <= 5) return c + 10; 855 if (static_cast<unsigned>(c) <= 5) return c + 10;
842 return -1; 856 return -1;
843 } 857 }
844 858
845 859
846 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) { 860 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
847 intptr_t start = position(); 861 intptr_t start = position();
848 uint32_t val = 0; 862 uint32_t val = 0;
849 bool done = false; 863 bool done = false;
850 for (intptr_t i = 0; !done; i++) { 864 for (intptr_t i = 0; !done; i++) {
851 uint32_t c = current(); 865 uint32_t c = current();
852 intptr_t d = HexValue(c); 866 intptr_t d = HexValue(c);
853 if (d < 0) { 867 if (d < 0) {
854 Reset(start); 868 Reset(start);
855 return false; 869 return false;
856 } 870 }
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
889 return '\t'; 903 return '\t';
890 case 'v': 904 case 'v':
891 Advance(); 905 Advance();
892 return '\v'; 906 return '\v';
893 case 'c': { 907 case 'c': {
894 uint32_t controlLetter = Next(); 908 uint32_t controlLetter = Next();
895 uint32_t letter = controlLetter & ~('A' ^ 'a'); 909 uint32_t letter = controlLetter & ~('A' ^ 'a');
896 // For compatibility with JSC, inside a character class 910 // For compatibility with JSC, inside a character class
897 // we also accept digits and underscore as control characters. 911 // we also accept digits and underscore as control characters.
898 if ((controlLetter >= '0' && controlLetter <= '9') || 912 if ((controlLetter >= '0' && controlLetter <= '9') ||
899 controlLetter == '_' || 913 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
900 (letter >= 'A' && letter <= 'Z')) {
901 Advance(2); 914 Advance(2);
902 // Control letters mapped to ASCII control characters in the range 915 // Control letters mapped to ASCII control characters in the range
903 // 0x00-0x1f. 916 // 0x00-0x1f.
904 return controlLetter & 0x1f; 917 return controlLetter & 0x1f;
905 } 918 }
906 // We match JSC in reading the backslash as a literal 919 // We match JSC in reading the backslash as a literal
907 // character instead of as starting an escape. 920 // character instead of as starting an escape.
908 return '\\'; 921 return '\\';
909 } 922 }
910 case '0': case '1': case '2': case '3': case '4': case '5': 923 case '0':
911 case '6': case '7': 924 case '1':
925 case '2':
926 case '3':
927 case '4':
928 case '5':
929 case '6':
930 case '7':
912 // For compatibility, we interpret a decimal escape that isn't 931 // For compatibility, we interpret a decimal escape that isn't
913 // a back reference (and therefore either \0 or not valid according 932 // a back reference (and therefore either \0 or not valid according
914 // to the specification) as a 1..3 digit octal character code. 933 // to the specification) as a 1..3 digit octal character code.
915 return ParseOctalLiteral(); 934 return ParseOctalLiteral();
916 case 'x': { 935 case 'x': {
917 Advance(); 936 Advance();
918 uint32_t value; 937 uint32_t value;
919 if (ParseHexEscape(2, &value)) { 938 if (ParseHexEscape(2, &value)) {
920 return value; 939 return value;
921 } 940 }
(...skipping 22 matching lines...) Expand all
944 } 963 }
945 return 0; 964 return 0;
946 } 965 }
947 966
948 967
949 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) { 968 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {
950 ASSERT(0 == *char_class); 969 ASSERT(0 == *char_class);
951 uint32_t first = current(); 970 uint32_t first = current();
952 if (first == '\\') { 971 if (first == '\\') {
953 switch (Next()) { 972 switch (Next()) {
954 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { 973 case 'w':
974 case 'W':
975 case 'd':
976 case 'D':
977 case 's':
978 case 'S': {
955 *char_class = Next(); 979 *char_class = Next();
956 Advance(2); 980 Advance(2);
957 return CharacterRange::Singleton(0); // Return dummy value. 981 return CharacterRange::Singleton(0); // Return dummy value.
958 } 982 }
959 case kEndMarker: 983 case kEndMarker:
960 ReportError("\\ at end of pattern"); 984 ReportError("\\ at end of pattern");
961 UNREACHABLE(); 985 UNREACHABLE();
962 default: 986 default:
963 uint32_t c = ParseClassCharacterEscape(); 987 uint32_t c = ParseClassCharacterEscape();
964 return CharacterRange::Singleton(c); 988 return CharacterRange::Singleton(c);
(...skipping 26 matching lines...) Expand all
991 static const char* kRangeOutOfOrder = "Range out of order in character class"; 1015 static const char* kRangeOutOfOrder = "Range out of order in character class";
992 1016
993 ASSERT(current() == '['); 1017 ASSERT(current() == '[');
994 Advance(); 1018 Advance();
995 bool is_negated = false; 1019 bool is_negated = false;
996 if (current() == '^') { 1020 if (current() == '^') {
997 is_negated = true; 1021 is_negated = true;
998 Advance(); 1022 Advance();
999 } 1023 }
1000 ZoneGrowableArray<CharacterRange>* ranges = 1024 ZoneGrowableArray<CharacterRange>* ranges =
1001 new(Z) ZoneGrowableArray<CharacterRange>(2); 1025 new (Z) ZoneGrowableArray<CharacterRange>(2);
1002 while (has_more() && current() != ']') { 1026 while (has_more() && current() != ']') {
1003 uint16_t char_class = kNoCharClass; 1027 uint16_t char_class = kNoCharClass;
1004 CharacterRange first = ParseClassAtom(&char_class); 1028 CharacterRange first = ParseClassAtom(&char_class);
1005 if (current() == '-') { 1029 if (current() == '-') {
1006 Advance(); 1030 Advance();
1007 if (current() == kEndMarker) { 1031 if (current() == kEndMarker) {
1008 // If we reach the end we break out of the loop and let the 1032 // If we reach the end we break out of the loop and let the
1009 // following code report an error. 1033 // following code report an error.
1010 break; 1034 break;
1011 } else if (current() == ']') { 1035 } else if (current() == ']') {
(...skipping 21 matching lines...) Expand all
1033 } 1057 }
1034 if (!has_more()) { 1058 if (!has_more()) {
1035 ReportError(kUnterminated); 1059 ReportError(kUnterminated);
1036 UNREACHABLE(); 1060 UNREACHABLE();
1037 } 1061 }
1038 Advance(); 1062 Advance();
1039 if (ranges->length() == 0) { 1063 if (ranges->length() == 0) {
1040 ranges->Add(CharacterRange::Everything()); 1064 ranges->Add(CharacterRange::Everything());
1041 is_negated = !is_negated; 1065 is_negated = !is_negated;
1042 } 1066 }
1043 return new(Z) RegExpCharacterClass(ranges, is_negated); 1067 return new (Z) RegExpCharacterClass(ranges, is_negated);
1044 } 1068 }
1045 1069
1046 1070
1047 // ---------------------------------------------------------------------------- 1071 // ----------------------------------------------------------------------------
1048 // The Parser interface. 1072 // The Parser interface.
1049 1073
1050 bool RegExpParser::ParseRegExp(const String& input, 1074 bool RegExpParser::ParseRegExp(const String& input,
1051 bool multiline, 1075 bool multiline,
1052 RegExpCompileData* result) { 1076 RegExpCompileData* result) {
1053 ASSERT(result != NULL); 1077 ASSERT(result != NULL);
1054 LongJumpScope jump; 1078 LongJumpScope jump;
1055 RegExpParser parser(input, &result->error, multiline); 1079 RegExpParser parser(input, &result->error, multiline);
1056 if (setjmp(*jump.Set()) == 0) { 1080 if (setjmp(*jump.Set()) == 0) {
1057 RegExpTree* tree = parser.ParsePattern(); 1081 RegExpTree* tree = parser.ParsePattern();
1058 ASSERT(tree != NULL); 1082 ASSERT(tree != NULL);
1059 ASSERT(result->error.IsNull()); 1083 ASSERT(result->error.IsNull());
1060 result->tree = tree; 1084 result->tree = tree;
1061 intptr_t capture_count = parser.captures_started(); 1085 intptr_t capture_count = parser.captures_started();
1062 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1086 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1063 result->contains_anchor = parser.contains_anchor(); 1087 result->contains_anchor = parser.contains_anchor();
1064 result->capture_count = capture_count; 1088 result->capture_count = capture_count;
1065 } else { 1089 } else {
1066 ASSERT(!result->error.IsNull()); 1090 ASSERT(!result->error.IsNull());
1067 Thread::Current()->clear_sticky_error(); 1091 Thread::Current()->clear_sticky_error();
1068 1092
1069 // Throw a FormatException on parsing failures. 1093 // Throw a FormatException on parsing failures.
1070 const String& message = String::Handle( 1094 const String& message =
1071 String::Concat(result->error, input)); 1095 String::Handle(String::Concat(result->error, input));
1072 const Array& args = Array::Handle(Array::New(1)); 1096 const Array& args = Array::Handle(Array::New(1));
1073 args.SetAt(0, message); 1097 args.SetAt(0, message);
1074 1098
1075 Exceptions::ThrowByType(Exceptions::kFormat, args); 1099 Exceptions::ThrowByType(Exceptions::kFormat, args);
1076 } 1100 }
1077 return !parser.failed(); 1101 return !parser.failed();
1078 } 1102 }
1079 1103
1080 } // namespace dart 1104 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/regexp_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698