Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2504)

Side by Side Diff: src/parsing/parser.cc

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix test compile Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/parsing/parser.h ('k') | src/parsing/scanner.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/parsing/parser.h" 5 #include "src/parsing/parser.h"
6 6
7 #include "src/api.h" 7 #include "src/api.h"
8 #include "src/ast/ast.h" 8 #include "src/ast/ast.h"
9 #include "src/ast/ast-expression-visitor.h" 9 #include "src/ast/ast-expression-visitor.h"
10 #include "src/ast/ast-literal-reindexer.h" 10 #include "src/ast/ast-literal-reindexer.h"
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
88 set_stack_limit(isolate_->stack_guard()->real_climit()); 88 set_stack_limit(isolate_->stack_guard()->real_climit());
89 set_unicode_cache(isolate_->unicode_cache()); 89 set_unicode_cache(isolate_->unicode_cache());
90 set_script(script); 90 set_script(script);
91 91
92 if (script->type() == Script::TYPE_NATIVE) { 92 if (script->type() == Script::TYPE_NATIVE) {
93 set_native(); 93 set_native();
94 } 94 }
95 } 95 }
96 96
97 97
98 RegExpBuilder::RegExpBuilder(Zone* zone)
99 : zone_(zone),
100 pending_empty_(false),
101 characters_(NULL),
102 terms_(),
103 alternatives_()
104 #ifdef DEBUG
105 , last_added_(ADD_NONE)
106 #endif
107 {}
108
109
110 void RegExpBuilder::FlushCharacters() {
111 pending_empty_ = false;
112 if (characters_ != NULL) {
113 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());
114 characters_ = NULL;
115 text_.Add(atom, zone());
116 LAST(ADD_ATOM);
117 }
118 }
119
120
121 void RegExpBuilder::FlushText() {
122 FlushCharacters();
123 int num_text = text_.length();
124 if (num_text == 0) {
125 return;
126 } else if (num_text == 1) {
127 terms_.Add(text_.last(), zone());
128 } else {
129 RegExpText* text = new(zone()) RegExpText(zone());
130 for (int i = 0; i < num_text; i++)
131 text_.Get(i)->AppendToText(text, zone());
132 terms_.Add(text, zone());
133 }
134 text_.Clear();
135 }
136
137
138 void RegExpBuilder::AddCharacter(uc16 c) {
139 pending_empty_ = false;
140 if (characters_ == NULL) {
141 characters_ = new(zone()) ZoneList<uc16>(4, zone());
142 }
143 characters_->Add(c, zone());
144 LAST(ADD_CHAR);
145 }
146
147
148 void RegExpBuilder::AddEmpty() {
149 pending_empty_ = true;
150 }
151
152
153 void RegExpBuilder::AddAtom(RegExpTree* term) {
154 if (term->IsEmpty()) {
155 AddEmpty();
156 return;
157 }
158 if (term->IsTextElement()) {
159 FlushCharacters();
160 text_.Add(term, zone());
161 } else {
162 FlushText();
163 terms_.Add(term, zone());
164 }
165 LAST(ADD_ATOM);
166 }
167
168
169 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
170 FlushText();
171 terms_.Add(assert, zone());
172 LAST(ADD_ASSERT);
173 }
174
175
176 void RegExpBuilder::NewAlternative() {
177 FlushTerms();
178 }
179
180
181 void RegExpBuilder::FlushTerms() {
182 FlushText();
183 int num_terms = terms_.length();
184 RegExpTree* alternative;
185 if (num_terms == 0) {
186 alternative = new (zone()) RegExpEmpty();
187 } else if (num_terms == 1) {
188 alternative = terms_.last();
189 } else {
190 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));
191 }
192 alternatives_.Add(alternative, zone());
193 terms_.Clear();
194 LAST(ADD_NONE);
195 }
196
197
198 RegExpTree* RegExpBuilder::ToRegExp() {
199 FlushTerms();
200 int num_alternatives = alternatives_.length();
201 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
202 if (num_alternatives == 1) return alternatives_.last();
203 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));
204 }
205
206
207 void RegExpBuilder::AddQuantifierToAtom(
208 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
209 if (pending_empty_) {
210 pending_empty_ = false;
211 return;
212 }
213 RegExpTree* atom;
214 if (characters_ != NULL) {
215 DCHECK(last_added_ == ADD_CHAR);
216 // Last atom was character.
217 Vector<const uc16> char_vector = characters_->ToConstVector();
218 int num_chars = char_vector.length();
219 if (num_chars > 1) {
220 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
221 text_.Add(new(zone()) RegExpAtom(prefix), zone());
222 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
223 }
224 characters_ = NULL;
225 atom = new(zone()) RegExpAtom(char_vector);
226 FlushText();
227 } else if (text_.length() > 0) {
228 DCHECK(last_added_ == ADD_ATOM);
229 atom = text_.RemoveLast();
230 FlushText();
231 } else if (terms_.length() > 0) {
232 DCHECK(last_added_ == ADD_ATOM);
233 atom = terms_.RemoveLast();
234 if (atom->max_match() == 0) {
235 // Guaranteed to only match an empty string.
236 LAST(ADD_TERM);
237 if (min == 0) {
238 return;
239 }
240 terms_.Add(atom, zone());
241 return;
242 }
243 } else {
244 // Only call immediately after adding an atom or character!
245 UNREACHABLE();
246 return;
247 }
248 terms_.Add(
249 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());
250 LAST(ADD_TERM);
251 }
252
253
254 FunctionEntry ParseData::GetFunctionEntry(int start) { 98 FunctionEntry ParseData::GetFunctionEntry(int start) {
255 // The current pre-data entry must be a FunctionEntry with the given 99 // The current pre-data entry must be a FunctionEntry with the given
256 // start position. 100 // start position.
257 if ((function_index_ + FunctionEntry::kSize <= Length()) && 101 if ((function_index_ + FunctionEntry::kSize <= Length()) &&
258 (static_cast<int>(Data()[function_index_]) == start)) { 102 (static_cast<int>(Data()[function_index_]) == start)) {
259 int index = function_index_; 103 int index = function_index_;
260 function_index_ += FunctionEntry::kSize; 104 function_index_ += FunctionEntry::kSize;
261 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize); 105 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize);
262 return FunctionEntry(subvector); 106 return FunctionEntry(subvector);
263 } 107 }
(...skipping 4927 matching lines...) Expand 10 before | Expand all | Expand 10 after
5191 for (int i = 0; i < use_counts_[feature]; ++i) { 5035 for (int i = 0; i < use_counts_[feature]; ++i) {
5192 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature)); 5036 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature));
5193 } 5037 }
5194 } 5038 }
5195 isolate->counters()->total_preparse_skipped()->Increment( 5039 isolate->counters()->total_preparse_skipped()->Increment(
5196 total_preparse_skipped_); 5040 total_preparse_skipped_);
5197 } 5041 }
5198 5042
5199 5043
5200 // ---------------------------------------------------------------------------- 5044 // ----------------------------------------------------------------------------
5201 // Regular expressions
5202
5203
5204 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
5205 bool multiline, bool unicode, Isolate* isolate,
5206 Zone* zone)
5207 : isolate_(isolate),
5208 zone_(zone),
5209 error_(error),
5210 captures_(NULL),
5211 in_(in),
5212 current_(kEndMarker),
5213 next_pos_(0),
5214 captures_started_(0),
5215 capture_count_(0),
5216 has_more_(true),
5217 multiline_(multiline),
5218 unicode_(unicode),
5219 simple_(false),
5220 contains_anchor_(false),
5221 is_scanned_for_captures_(false),
5222 failed_(false) {
5223 Advance();
5224 }
5225
5226
5227 uc32 RegExpParser::Next() {
5228 if (has_next()) {
5229 return in()->Get(next_pos_);
5230 } else {
5231 return kEndMarker;
5232 }
5233 }
5234
5235
5236 void RegExpParser::Advance() {
5237 if (next_pos_ < in()->length()) {
5238 StackLimitCheck check(isolate());
5239 if (check.HasOverflowed()) {
5240 ReportError(CStrVector(Isolate::kStackOverflowMessage));
5241 } else if (zone()->excess_allocation()) {
5242 ReportError(CStrVector("Regular expression too large"));
5243 } else {
5244 current_ = in()->Get(next_pos_);
5245 next_pos_++;
5246 }
5247 } else {
5248 current_ = kEndMarker;
5249 // Advance so that position() points to 1-after-the-last-character. This is
5250 // important so that Reset() to this position works correctly.
5251 next_pos_ = in()->length() + 1;
5252 has_more_ = false;
5253 }
5254 }
5255
5256
5257 void RegExpParser::Reset(int pos) {
5258 next_pos_ = pos;
5259 has_more_ = (pos < in()->length());
5260 Advance();
5261 }
5262
5263
5264 void RegExpParser::Advance(int dist) {
5265 next_pos_ += dist - 1;
5266 Advance();
5267 }
5268
5269
5270 bool RegExpParser::simple() {
5271 return simple_;
5272 }
5273
5274
5275 bool RegExpParser::IsSyntaxCharacter(uc32 c) {
5276 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
5277 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
5278 c == '{' || c == '}' || c == '|';
5279 }
5280
5281
5282 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
5283 failed_ = true;
5284 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
5285 // Zip to the end to make sure the no more input is read.
5286 current_ = kEndMarker;
5287 next_pos_ = in()->length();
5288 return NULL;
5289 }
5290
5291
5292 // Pattern ::
5293 // Disjunction
5294 RegExpTree* RegExpParser::ParsePattern() {
5295 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
5296 DCHECK(!has_more());
5297 // If the result of parsing is a literal string atom, and it has the
5298 // same length as the input, then the atom is identical to the input.
5299 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
5300 simple_ = true;
5301 }
5302 return result;
5303 }
5304
5305
5306 // Disjunction ::
5307 // Alternative
5308 // Alternative | Disjunction
5309 // Alternative ::
5310 // [empty]
5311 // Term Alternative
5312 // Term ::
5313 // Assertion
5314 // Atom
5315 // Atom Quantifier
5316 RegExpTree* RegExpParser::ParseDisjunction() {
5317 // Used to store current state while parsing subexpressions.
5318 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
5319 zone());
5320 RegExpParserState* state = &initial_state;
5321 // Cache the builder in a local variable for quick access.
5322 RegExpBuilder* builder = initial_state.builder();
5323 while (true) {
5324 switch (current()) {
5325 case kEndMarker:
5326 if (state->IsSubexpression()) {
5327 // Inside a parenthesized group when hitting end of input.
5328 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
5329 }
5330 DCHECK_EQ(INITIAL, state->group_type());
5331 // Parsing completed successfully.
5332 return builder->ToRegExp();
5333 case ')': {
5334 if (!state->IsSubexpression()) {
5335 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
5336 }
5337 DCHECK_NE(INITIAL, state->group_type());
5338
5339 Advance();
5340 // End disjunction parsing and convert builder content to new single
5341 // regexp atom.
5342 RegExpTree* body = builder->ToRegExp();
5343
5344 int end_capture_index = captures_started();
5345
5346 int capture_index = state->capture_index();
5347 SubexpressionType group_type = state->group_type();
5348
5349 // Build result of subexpression.
5350 if (group_type == CAPTURE) {
5351 RegExpCapture* capture = GetCapture(capture_index);
5352 capture->set_body(body);
5353 body = capture;
5354 } else if (group_type != GROUPING) {
5355 DCHECK(group_type == POSITIVE_LOOKAROUND ||
5356 group_type == NEGATIVE_LOOKAROUND);
5357 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
5358 body = new (zone()) RegExpLookaround(
5359 body, is_positive, end_capture_index - capture_index, capture_index,
5360 state->lookaround_type());
5361 }
5362
5363 // Restore previous state.
5364 state = state->previous_state();
5365 builder = state->builder();
5366
5367 builder->AddAtom(body);
5368 // For compatability with JSC and ES3, we allow quantifiers after
5369 // lookaheads, and break in all cases.
5370 break;
5371 }
5372 case '|': {
5373 Advance();
5374 builder->NewAlternative();
5375 continue;
5376 }
5377 case '*':
5378 case '+':
5379 case '?':
5380 return ReportError(CStrVector("Nothing to repeat"));
5381 case '^': {
5382 Advance();
5383 if (multiline_) {
5384 builder->AddAssertion(
5385 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
5386 } else {
5387 builder->AddAssertion(
5388 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
5389 set_contains_anchor();
5390 }
5391 continue;
5392 }
5393 case '$': {
5394 Advance();
5395 RegExpAssertion::AssertionType assertion_type =
5396 multiline_ ? RegExpAssertion::END_OF_LINE :
5397 RegExpAssertion::END_OF_INPUT;
5398 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));
5399 continue;
5400 }
5401 case '.': {
5402 Advance();
5403 // everything except \x0a, \x0d, \u2028 and \u2029
5404 ZoneList<CharacterRange>* ranges =
5405 new(zone()) ZoneList<CharacterRange>(2, zone());
5406 CharacterRange::AddClassEscape('.', ranges, zone());
5407 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
5408 builder->AddAtom(atom);
5409 break;
5410 }
5411 case '(': {
5412 SubexpressionType subexpr_type = CAPTURE;
5413 RegExpLookaround::Type lookaround_type = state->lookaround_type();
5414 Advance();
5415 if (current() == '?') {
5416 switch (Next()) {
5417 case ':':
5418 subexpr_type = GROUPING;
5419 break;
5420 case '=':
5421 lookaround_type = RegExpLookaround::LOOKAHEAD;
5422 subexpr_type = POSITIVE_LOOKAROUND;
5423 break;
5424 case '!':
5425 lookaround_type = RegExpLookaround::LOOKAHEAD;
5426 subexpr_type = NEGATIVE_LOOKAROUND;
5427 break;
5428 case '<':
5429 if (FLAG_harmony_regexp_lookbehind) {
5430 Advance();
5431 lookaround_type = RegExpLookaround::LOOKBEHIND;
5432 if (Next() == '=') {
5433 subexpr_type = POSITIVE_LOOKAROUND;
5434 break;
5435 } else if (Next() == '!') {
5436 subexpr_type = NEGATIVE_LOOKAROUND;
5437 break;
5438 }
5439 }
5440 // Fall through.
5441 default:
5442 ReportError(CStrVector("Invalid group") CHECK_FAILED);
5443 break;
5444 }
5445 Advance(2);
5446 } else {
5447 if (captures_started_ >= kMaxCaptures) {
5448 ReportError(CStrVector("Too many captures") CHECK_FAILED);
5449 }
5450 captures_started_++;
5451 }
5452 // Store current state and begin new disjunction parsing.
5453 state = new (zone()) RegExpParserState(
5454 state, subexpr_type, lookaround_type, captures_started_, zone());
5455 builder = state->builder();
5456 continue;
5457 }
5458 case '[': {
5459 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
5460 builder->AddAtom(atom);
5461 break;
5462 }
5463 // Atom ::
5464 // \ AtomEscape
5465 case '\\':
5466 switch (Next()) {
5467 case kEndMarker:
5468 return ReportError(CStrVector("\\ at end of pattern"));
5469 case 'b':
5470 Advance(2);
5471 builder->AddAssertion(
5472 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
5473 continue;
5474 case 'B':
5475 Advance(2);
5476 builder->AddAssertion(
5477 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
5478 continue;
5479 // AtomEscape ::
5480 // CharacterClassEscape
5481 //
5482 // CharacterClassEscape :: one of
5483 // d D s S w W
5484 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
5485 uc32 c = Next();
5486 Advance(2);
5487 ZoneList<CharacterRange>* ranges =
5488 new(zone()) ZoneList<CharacterRange>(2, zone());
5489 CharacterRange::AddClassEscape(c, ranges, zone());
5490 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);
5491 builder->AddAtom(atom);
5492 break;
5493 }
5494 case '1': case '2': case '3': case '4': case '5': case '6':
5495 case '7': case '8': case '9': {
5496 int index = 0;
5497 if (ParseBackReferenceIndex(&index)) {
5498 if (state->IsInsideCaptureGroup(index)) {
5499 // The back reference is inside the capture group it refers to.
5500 // Nothing can possibly have been captured yet, so we use empty
5501 // instead. This ensures that, when checking a back reference,
5502 // the capture registers of the referenced capture are either
5503 // both set or both cleared.
5504 builder->AddEmpty();
5505 } else {
5506 RegExpCapture* capture = GetCapture(index);
5507 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
5508 builder->AddAtom(atom);
5509 }
5510 break;
5511 }
5512 uc32 first_digit = Next();
5513 if (first_digit == '8' || first_digit == '9') {
5514 // If the 'u' flag is present, only syntax characters can be escaped,
5515 // no other identity escapes are allowed. If the 'u' flag is not
5516 // present, all identity escapes are allowed.
5517 if (!FLAG_harmony_unicode_regexps || !unicode_) {
5518 builder->AddCharacter(first_digit);
5519 Advance(2);
5520 } else {
5521 return ReportError(CStrVector("Invalid escape"));
5522 }
5523 break;
5524 }
5525 }
5526 // FALLTHROUGH
5527 case '0': {
5528 Advance();
5529 uc32 octal = ParseOctalLiteral();
5530 builder->AddCharacter(octal);
5531 break;
5532 }
5533 // ControlEscape :: one of
5534 // f n r t v
5535 case 'f':
5536 Advance(2);
5537 builder->AddCharacter('\f');
5538 break;
5539 case 'n':
5540 Advance(2);
5541 builder->AddCharacter('\n');
5542 break;
5543 case 'r':
5544 Advance(2);
5545 builder->AddCharacter('\r');
5546 break;
5547 case 't':
5548 Advance(2);
5549 builder->AddCharacter('\t');
5550 break;
5551 case 'v':
5552 Advance(2);
5553 builder->AddCharacter('\v');
5554 break;
5555 case 'c': {
5556 Advance();
5557 uc32 controlLetter = Next();
5558 // Special case if it is an ASCII letter.
5559 // Convert lower case letters to uppercase.
5560 uc32 letter = controlLetter & ~('a' ^ 'A');
5561 if (letter < 'A' || 'Z' < letter) {
5562 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
5563 // This is outside the specification. We match JSC in
5564 // reading the backslash as a literal character instead
5565 // of as starting an escape.
5566 builder->AddCharacter('\\');
5567 } else {
5568 Advance(2);
5569 builder->AddCharacter(controlLetter & 0x1f);
5570 }
5571 break;
5572 }
5573 case 'x': {
5574 Advance(2);
5575 uc32 value;
5576 if (ParseHexEscape(2, &value)) {
5577 builder->AddCharacter(value);
5578 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {
5579 builder->AddCharacter('x');
5580 } else {
5581 // If the 'u' flag is present, invalid escapes are not treated as
5582 // identity escapes.
5583 return ReportError(CStrVector("Invalid escape"));
5584 }
5585 break;
5586 }
5587 case 'u': {
5588 Advance(2);
5589 uc32 value;
5590 if (ParseUnicodeEscape(&value)) {
5591 builder->AddCharacter(value);
5592 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {
5593 builder->AddCharacter('u');
5594 } else {
5595 // If the 'u' flag is present, invalid escapes are not treated as
5596 // identity escapes.
5597 return ReportError(CStrVector("Invalid unicode escape"));
5598 }
5599 break;
5600 }
5601 default:
5602 Advance();
5603 // If the 'u' flag is present, only syntax characters can be escaped, no
5604 // other identity escapes are allowed. If the 'u' flag is not present,
5605 // all identity escapes are allowed.
5606 if (!FLAG_harmony_unicode_regexps || !unicode_ ||
5607 IsSyntaxCharacter(current())) {
5608 builder->AddCharacter(current());
5609 Advance();
5610 } else {
5611 return ReportError(CStrVector("Invalid escape"));
5612 }
5613 break;
5614 }
5615 break;
5616 case '{': {
5617 int dummy;
5618 if (ParseIntervalQuantifier(&dummy, &dummy)) {
5619 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
5620 }
5621 // fallthrough
5622 }
5623 default:
5624 builder->AddCharacter(current());
5625 Advance();
5626 break;
5627 } // end switch(current())
5628
5629 int min;
5630 int max;
5631 switch (current()) {
5632 // QuantifierPrefix ::
5633 // *
5634 // +
5635 // ?
5636 // {
5637 case '*':
5638 min = 0;
5639 max = RegExpTree::kInfinity;
5640 Advance();
5641 break;
5642 case '+':
5643 min = 1;
5644 max = RegExpTree::kInfinity;
5645 Advance();
5646 break;
5647 case '?':
5648 min = 0;
5649 max = 1;
5650 Advance();
5651 break;
5652 case '{':
5653 if (ParseIntervalQuantifier(&min, &max)) {
5654 if (max < min) {
5655 ReportError(CStrVector("numbers out of order in {} quantifier.")
5656 CHECK_FAILED);
5657 }
5658 break;
5659 } else {
5660 continue;
5661 }
5662 default:
5663 continue;
5664 }
5665 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
5666 if (current() == '?') {
5667 quantifier_type = RegExpQuantifier::NON_GREEDY;
5668 Advance();
5669 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
5670 // FLAG_regexp_possessive_quantifier is a debug-only flag.
5671 quantifier_type = RegExpQuantifier::POSSESSIVE;
5672 Advance();
5673 }
5674 builder->AddQuantifierToAtom(min, max, quantifier_type);
5675 }
5676 }
5677
5678
5679 #ifdef DEBUG
5680 // Currently only used in an DCHECK.
5681 static bool IsSpecialClassEscape(uc32 c) {
5682 switch (c) {
5683 case 'd': case 'D':
5684 case 's': case 'S':
5685 case 'w': case 'W':
5686 return true;
5687 default:
5688 return false;
5689 }
5690 }
5691 #endif
5692
5693
5694 // In order to know whether an escape is a backreference or not we have to scan
5695 // the entire regexp and find the number of capturing parentheses. However we
5696 // don't want to scan the regexp twice unless it is necessary. This mini-parser
5697 // is called when needed. It can see the difference between capturing and
5698 // noncapturing parentheses and can skip character classes and backslash-escaped
5699 // characters.
5700 void RegExpParser::ScanForCaptures() {
5701 // Start with captures started previous to current position
5702 int capture_count = captures_started();
5703 // Add count of captures after this position.
5704 int n;
5705 while ((n = current()) != kEndMarker) {
5706 Advance();
5707 switch (n) {
5708 case '\\':
5709 Advance();
5710 break;
5711 case '[': {
5712 int c;
5713 while ((c = current()) != kEndMarker) {
5714 Advance();
5715 if (c == '\\') {
5716 Advance();
5717 } else {
5718 if (c == ']') break;
5719 }
5720 }
5721 break;
5722 }
5723 case '(':
5724 if (current() != '?') capture_count++;
5725 break;
5726 }
5727 }
5728 capture_count_ = capture_count;
5729 is_scanned_for_captures_ = true;
5730 }
5731
5732
5733 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
5734 DCHECK_EQ('\\', current());
5735 DCHECK('1' <= Next() && Next() <= '9');
5736 // Try to parse a decimal literal that is no greater than the total number
5737 // of left capturing parentheses in the input.
5738 int start = position();
5739 int value = Next() - '0';
5740 Advance(2);
5741 while (true) {
5742 uc32 c = current();
5743 if (IsDecimalDigit(c)) {
5744 value = 10 * value + (c - '0');
5745 if (value > kMaxCaptures) {
5746 Reset(start);
5747 return false;
5748 }
5749 Advance();
5750 } else {
5751 break;
5752 }
5753 }
5754 if (value > captures_started()) {
5755 if (!is_scanned_for_captures_) {
5756 int saved_position = position();
5757 ScanForCaptures();
5758 Reset(saved_position);
5759 }
5760 if (value > capture_count_) {
5761 Reset(start);
5762 return false;
5763 }
5764 }
5765 *index_out = value;
5766 return true;
5767 }
5768
5769
5770 RegExpCapture* RegExpParser::GetCapture(int index) {
5771 // The index for the capture groups are one-based. Its index in the list is
5772 // zero-based.
5773 int know_captures =
5774 is_scanned_for_captures_ ? capture_count_ : captures_started_;
5775 DCHECK(index <= know_captures);
5776 if (captures_ == NULL) {
5777 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
5778 }
5779 while (captures_->length() < know_captures) {
5780 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
5781 }
5782 return captures_->at(index - 1);
5783 }
5784
5785
5786 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
5787 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
5788 if (s->group_type() != CAPTURE) continue;
5789 // Return true if we found the matching capture index.
5790 if (index == s->capture_index()) return true;
5791 // Abort if index is larger than what has been parsed up till this state.
5792 if (index > s->capture_index()) return false;
5793 }
5794 return false;
5795 }
5796
5797
5798 // QuantifierPrefix ::
5799 // { DecimalDigits }
5800 // { DecimalDigits , }
5801 // { DecimalDigits , DecimalDigits }
5802 //
5803 // Returns true if parsing succeeds, and set the min_out and max_out
5804 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
5805 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
5806 DCHECK_EQ(current(), '{');
5807 int start = position();
5808 Advance();
5809 int min = 0;
5810 if (!IsDecimalDigit(current())) {
5811 Reset(start);
5812 return false;
5813 }
5814 while (IsDecimalDigit(current())) {
5815 int next = current() - '0';
5816 if (min > (RegExpTree::kInfinity - next) / 10) {
5817 // Overflow. Skip past remaining decimal digits and return -1.
5818 do {
5819 Advance();
5820 } while (IsDecimalDigit(current()));
5821 min = RegExpTree::kInfinity;
5822 break;
5823 }
5824 min = 10 * min + next;
5825 Advance();
5826 }
5827 int max = 0;
5828 if (current() == '}') {
5829 max = min;
5830 Advance();
5831 } else if (current() == ',') {
5832 Advance();
5833 if (current() == '}') {
5834 max = RegExpTree::kInfinity;
5835 Advance();
5836 } else {
5837 while (IsDecimalDigit(current())) {
5838 int next = current() - '0';
5839 if (max > (RegExpTree::kInfinity - next) / 10) {
5840 do {
5841 Advance();
5842 } while (IsDecimalDigit(current()));
5843 max = RegExpTree::kInfinity;
5844 break;
5845 }
5846 max = 10 * max + next;
5847 Advance();
5848 }
5849 if (current() != '}') {
5850 Reset(start);
5851 return false;
5852 }
5853 Advance();
5854 }
5855 } else {
5856 Reset(start);
5857 return false;
5858 }
5859 *min_out = min;
5860 *max_out = max;
5861 return true;
5862 }
5863
5864
5865 uc32 RegExpParser::ParseOctalLiteral() {
5866 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
5867 // For compatibility with some other browsers (not all), we parse
5868 // up to three octal digits with a value below 256.
5869 uc32 value = current() - '0';
5870 Advance();
5871 if ('0' <= current() && current() <= '7') {
5872 value = value * 8 + current() - '0';
5873 Advance();
5874 if (value < 32 && '0' <= current() && current() <= '7') {
5875 value = value * 8 + current() - '0';
5876 Advance();
5877 }
5878 }
5879 return value;
5880 }
5881
5882
5883 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
5884 int start = position();
5885 uc32 val = 0;
5886 for (int i = 0; i < length; ++i) {
5887 uc32 c = current();
5888 int d = HexValue(c);
5889 if (d < 0) {
5890 Reset(start);
5891 return false;
5892 }
5893 val = val * 16 + d;
5894 Advance();
5895 }
5896 *value = val;
5897 return true;
5898 }
5899
5900
5901 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
5902 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
5903 // allowed). In the latter case, the number of hex digits between { } is
5904 // arbitrary. \ and u have already been read.
5905 if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) {
5906 int start = position();
5907 Advance();
5908 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
5909 if (current() == '}') {
5910 Advance();
5911 return true;
5912 }
5913 }
5914 Reset(start);
5915 return false;
5916 }
5917 // \u but no {, or \u{...} escapes not allowed.
5918 return ParseHexEscape(4, value);
5919 }
5920
5921
5922 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
5923 uc32 x = 0;
5924 int d = HexValue(current());
5925 if (d < 0) {
5926 return false;
5927 }
5928 while (d >= 0) {
5929 x = x * 16 + d;
5930 if (x > max_value) {
5931 return false;
5932 }
5933 Advance();
5934 d = HexValue(current());
5935 }
5936 *value = x;
5937 return true;
5938 }
5939
5940
5941 uc32 RegExpParser::ParseClassCharacterEscape() {
5942 DCHECK(current() == '\\');
5943 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
5944 Advance();
5945 switch (current()) {
5946 case 'b':
5947 Advance();
5948 return '\b';
5949 // ControlEscape :: one of
5950 // f n r t v
5951 case 'f':
5952 Advance();
5953 return '\f';
5954 case 'n':
5955 Advance();
5956 return '\n';
5957 case 'r':
5958 Advance();
5959 return '\r';
5960 case 't':
5961 Advance();
5962 return '\t';
5963 case 'v':
5964 Advance();
5965 return '\v';
5966 case 'c': {
5967 uc32 controlLetter = Next();
5968 uc32 letter = controlLetter & ~('A' ^ 'a');
5969 // For compatibility with JSC, inside a character class
5970 // we also accept digits and underscore as control characters.
5971 if ((controlLetter >= '0' && controlLetter <= '9') ||
5972 controlLetter == '_' ||
5973 (letter >= 'A' && letter <= 'Z')) {
5974 Advance(2);
5975 // Control letters mapped to ASCII control characters in the range
5976 // 0x00-0x1f.
5977 return controlLetter & 0x1f;
5978 }
5979 // We match JSC in reading the backslash as a literal
5980 // character instead of as starting an escape.
5981 return '\\';
5982 }
5983 case '0': case '1': case '2': case '3': case '4': case '5':
5984 case '6': case '7':
5985 // For compatibility, we interpret a decimal escape that isn't
5986 // a back reference (and therefore either \0 or not valid according
5987 // to the specification) as a 1..3 digit octal character code.
5988 return ParseOctalLiteral();
5989 case 'x': {
5990 Advance();
5991 uc32 value;
5992 if (ParseHexEscape(2, &value)) {
5993 return value;
5994 }
5995 if (!FLAG_harmony_unicode_regexps || !unicode_) {
5996 // If \x is not followed by a two-digit hexadecimal, treat it
5997 // as an identity escape.
5998 return 'x';
5999 }
6000 // If the 'u' flag is present, invalid escapes are not treated as
6001 // identity escapes.
6002 ReportError(CStrVector("Invalid escape"));
6003 return 0;
6004 }
6005 case 'u': {
6006 Advance();
6007 uc32 value;
6008 if (ParseUnicodeEscape(&value)) {
6009 return value;
6010 }
6011 if (!FLAG_harmony_unicode_regexps || !unicode_) {
6012 return 'u';
6013 }
6014 // If the 'u' flag is present, invalid escapes are not treated as
6015 // identity escapes.
6016 ReportError(CStrVector("Invalid unicode escape"));
6017 return 0;
6018 }
6019 default: {
6020 uc32 result = current();
6021 // If the 'u' flag is present, only syntax characters can be escaped, no
6022 // other identity escapes are allowed. If the 'u' flag is not present, all
6023 // identity escapes are allowed.
6024 if (!FLAG_harmony_unicode_regexps || !unicode_ ||
6025 IsSyntaxCharacter(result)) {
6026 Advance();
6027 return result;
6028 }
6029 ReportError(CStrVector("Invalid escape"));
6030 return 0;
6031 }
6032 }
6033 return 0;
6034 }
6035
6036
6037 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
6038 DCHECK_EQ(0, *char_class);
6039 uc32 first = current();
6040 if (first == '\\') {
6041 switch (Next()) {
6042 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
6043 *char_class = Next();
6044 Advance(2);
6045 return CharacterRange::Singleton(0); // Return dummy value.
6046 }
6047 case kEndMarker:
6048 return ReportError(CStrVector("\\ at end of pattern"));
6049 default:
6050 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
6051 return CharacterRange::Singleton(c);
6052 }
6053 } else {
6054 Advance();
6055 return CharacterRange::Singleton(first);
6056 }
6057 }
6058
6059
6060 static const uc16 kNoCharClass = 0;
6061
6062 // Adds range or pre-defined character class to character ranges.
6063 // If char_class is not kInvalidClass, it's interpreted as a class
6064 // escape (i.e., 's' means whitespace, from '\s').
6065 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
6066 uc16 char_class,
6067 CharacterRange range,
6068 Zone* zone) {
6069 if (char_class != kNoCharClass) {
6070 CharacterRange::AddClassEscape(char_class, ranges, zone);
6071 } else {
6072 ranges->Add(range, zone);
6073 }
6074 }
6075
6076
6077 RegExpTree* RegExpParser::ParseCharacterClass() {
6078 static const char* kUnterminated = "Unterminated character class";
6079 static const char* kRangeOutOfOrder = "Range out of order in character class";
6080
6081 DCHECK_EQ(current(), '[');
6082 Advance();
6083 bool is_negated = false;
6084 if (current() == '^') {
6085 is_negated = true;
6086 Advance();
6087 }
6088 ZoneList<CharacterRange>* ranges =
6089 new(zone()) ZoneList<CharacterRange>(2, zone());
6090 while (has_more() && current() != ']') {
6091 uc16 char_class = kNoCharClass;
6092 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
6093 if (current() == '-') {
6094 Advance();
6095 if (current() == kEndMarker) {
6096 // If we reach the end we break out of the loop and let the
6097 // following code report an error.
6098 break;
6099 } else if (current() == ']') {
6100 AddRangeOrEscape(ranges, char_class, first, zone());
6101 ranges->Add(CharacterRange::Singleton('-'), zone());
6102 break;
6103 }
6104 uc16 char_class_2 = kNoCharClass;
6105 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
6106 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
6107 // Either end is an escaped character class. Treat the '-' verbatim.
6108 AddRangeOrEscape(ranges, char_class, first, zone());
6109 ranges->Add(CharacterRange::Singleton('-'), zone());
6110 AddRangeOrEscape(ranges, char_class_2, next, zone());
6111 continue;
6112 }
6113 if (first.from() > next.to()) {
6114 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
6115 }
6116 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
6117 } else {
6118 AddRangeOrEscape(ranges, char_class, first, zone());
6119 }
6120 }
6121 if (!has_more()) {
6122 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
6123 }
6124 Advance();
6125 if (ranges->length() == 0) {
6126 ranges->Add(CharacterRange::Everything(), zone());
6127 is_negated = !is_negated;
6128 }
6129 return new (zone()) RegExpCharacterClass(ranges, is_negated);
6130 }
6131
6132
6133 // ----------------------------------------------------------------------------
6134 // The Parser interface. 5045 // The Parser interface.
6135 5046
6136 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
6137 FlatStringReader* input, bool multiline,
6138 bool unicode, RegExpCompileData* result) {
6139 DCHECK(result != NULL);
6140 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
6141 RegExpTree* tree = parser.ParsePattern();
6142 if (parser.failed()) {
6143 DCHECK(tree == NULL);
6144 DCHECK(!result->error.is_null());
6145 } else {
6146 DCHECK(tree != NULL);
6147 DCHECK(result->error.is_null());
6148 result->tree = tree;
6149 int capture_count = parser.captures_started();
6150 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
6151 result->contains_anchor = parser.contains_anchor();
6152 result->capture_count = capture_count;
6153 }
6154 return !parser.failed();
6155 }
6156
6157 5047
6158 bool Parser::ParseStatic(ParseInfo* info) { 5048 bool Parser::ParseStatic(ParseInfo* info) {
6159 Parser parser(info); 5049 Parser parser(info);
6160 if (parser.Parse(info)) { 5050 if (parser.Parse(info)) {
6161 info->set_language_mode(info->literal()->language_mode()); 5051 info->set_language_mode(info->literal()->language_mode());
6162 return true; 5052 return true;
6163 } 5053 }
6164 return false; 5054 return false;
6165 } 5055 }
6166 5056
(...skipping 393 matching lines...) Expand 10 before | Expand all | Expand 10 after
6560 auto class_literal = value->AsClassLiteral(); 5450 auto class_literal = value->AsClassLiteral();
6561 if (class_literal->raw_name() == nullptr) { 5451 if (class_literal->raw_name() == nullptr) {
6562 class_literal->set_raw_name(name); 5452 class_literal->set_raw_name(name);
6563 } 5453 }
6564 } 5454 }
6565 } 5455 }
6566 5456
6567 5457
6568 } // namespace internal 5458 } // namespace internal
6569 } // namespace v8 5459 } // namespace v8
OLDNEW
« no previous file with comments | « src/parsing/parser.h ('k') | src/parsing/scanner.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698