OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/parsing/parser.h" | 5 #include "src/parsing/parser.h" |
6 | 6 |
7 #include "src/api.h" | 7 #include "src/api.h" |
8 #include "src/ast/ast.h" | 8 #include "src/ast/ast.h" |
9 #include "src/ast/ast-expression-visitor.h" | 9 #include "src/ast/ast-expression-visitor.h" |
10 #include "src/ast/ast-literal-reindexer.h" | 10 #include "src/ast/ast-literal-reindexer.h" |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
88 set_stack_limit(isolate_->stack_guard()->real_climit()); | 88 set_stack_limit(isolate_->stack_guard()->real_climit()); |
89 set_unicode_cache(isolate_->unicode_cache()); | 89 set_unicode_cache(isolate_->unicode_cache()); |
90 set_script(script); | 90 set_script(script); |
91 | 91 |
92 if (script->type() == Script::TYPE_NATIVE) { | 92 if (script->type() == Script::TYPE_NATIVE) { |
93 set_native(); | 93 set_native(); |
94 } | 94 } |
95 } | 95 } |
96 | 96 |
97 | 97 |
98 RegExpBuilder::RegExpBuilder(Zone* zone) | |
99 : zone_(zone), | |
100 pending_empty_(false), | |
101 characters_(NULL), | |
102 terms_(), | |
103 alternatives_() | |
104 #ifdef DEBUG | |
105 , last_added_(ADD_NONE) | |
106 #endif | |
107 {} | |
108 | |
109 | |
110 void RegExpBuilder::FlushCharacters() { | |
111 pending_empty_ = false; | |
112 if (characters_ != NULL) { | |
113 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector()); | |
114 characters_ = NULL; | |
115 text_.Add(atom, zone()); | |
116 LAST(ADD_ATOM); | |
117 } | |
118 } | |
119 | |
120 | |
121 void RegExpBuilder::FlushText() { | |
122 FlushCharacters(); | |
123 int num_text = text_.length(); | |
124 if (num_text == 0) { | |
125 return; | |
126 } else if (num_text == 1) { | |
127 terms_.Add(text_.last(), zone()); | |
128 } else { | |
129 RegExpText* text = new(zone()) RegExpText(zone()); | |
130 for (int i = 0; i < num_text; i++) | |
131 text_.Get(i)->AppendToText(text, zone()); | |
132 terms_.Add(text, zone()); | |
133 } | |
134 text_.Clear(); | |
135 } | |
136 | |
137 | |
138 void RegExpBuilder::AddCharacter(uc16 c) { | |
139 pending_empty_ = false; | |
140 if (characters_ == NULL) { | |
141 characters_ = new(zone()) ZoneList<uc16>(4, zone()); | |
142 } | |
143 characters_->Add(c, zone()); | |
144 LAST(ADD_CHAR); | |
145 } | |
146 | |
147 | |
148 void RegExpBuilder::AddEmpty() { | |
149 pending_empty_ = true; | |
150 } | |
151 | |
152 | |
153 void RegExpBuilder::AddAtom(RegExpTree* term) { | |
154 if (term->IsEmpty()) { | |
155 AddEmpty(); | |
156 return; | |
157 } | |
158 if (term->IsTextElement()) { | |
159 FlushCharacters(); | |
160 text_.Add(term, zone()); | |
161 } else { | |
162 FlushText(); | |
163 terms_.Add(term, zone()); | |
164 } | |
165 LAST(ADD_ATOM); | |
166 } | |
167 | |
168 | |
169 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | |
170 FlushText(); | |
171 terms_.Add(assert, zone()); | |
172 LAST(ADD_ASSERT); | |
173 } | |
174 | |
175 | |
176 void RegExpBuilder::NewAlternative() { | |
177 FlushTerms(); | |
178 } | |
179 | |
180 | |
181 void RegExpBuilder::FlushTerms() { | |
182 FlushText(); | |
183 int num_terms = terms_.length(); | |
184 RegExpTree* alternative; | |
185 if (num_terms == 0) { | |
186 alternative = new (zone()) RegExpEmpty(); | |
187 } else if (num_terms == 1) { | |
188 alternative = terms_.last(); | |
189 } else { | |
190 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone())); | |
191 } | |
192 alternatives_.Add(alternative, zone()); | |
193 terms_.Clear(); | |
194 LAST(ADD_NONE); | |
195 } | |
196 | |
197 | |
198 RegExpTree* RegExpBuilder::ToRegExp() { | |
199 FlushTerms(); | |
200 int num_alternatives = alternatives_.length(); | |
201 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | |
202 if (num_alternatives == 1) return alternatives_.last(); | |
203 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone())); | |
204 } | |
205 | |
206 | |
207 void RegExpBuilder::AddQuantifierToAtom( | |
208 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | |
209 if (pending_empty_) { | |
210 pending_empty_ = false; | |
211 return; | |
212 } | |
213 RegExpTree* atom; | |
214 if (characters_ != NULL) { | |
215 DCHECK(last_added_ == ADD_CHAR); | |
216 // Last atom was character. | |
217 Vector<const uc16> char_vector = characters_->ToConstVector(); | |
218 int num_chars = char_vector.length(); | |
219 if (num_chars > 1) { | |
220 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); | |
221 text_.Add(new(zone()) RegExpAtom(prefix), zone()); | |
222 char_vector = char_vector.SubVector(num_chars - 1, num_chars); | |
223 } | |
224 characters_ = NULL; | |
225 atom = new(zone()) RegExpAtom(char_vector); | |
226 FlushText(); | |
227 } else if (text_.length() > 0) { | |
228 DCHECK(last_added_ == ADD_ATOM); | |
229 atom = text_.RemoveLast(); | |
230 FlushText(); | |
231 } else if (terms_.length() > 0) { | |
232 DCHECK(last_added_ == ADD_ATOM); | |
233 atom = terms_.RemoveLast(); | |
234 if (atom->max_match() == 0) { | |
235 // Guaranteed to only match an empty string. | |
236 LAST(ADD_TERM); | |
237 if (min == 0) { | |
238 return; | |
239 } | |
240 terms_.Add(atom, zone()); | |
241 return; | |
242 } | |
243 } else { | |
244 // Only call immediately after adding an atom or character! | |
245 UNREACHABLE(); | |
246 return; | |
247 } | |
248 terms_.Add( | |
249 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone()); | |
250 LAST(ADD_TERM); | |
251 } | |
252 | |
253 | |
254 FunctionEntry ParseData::GetFunctionEntry(int start) { | 98 FunctionEntry ParseData::GetFunctionEntry(int start) { |
255 // The current pre-data entry must be a FunctionEntry with the given | 99 // The current pre-data entry must be a FunctionEntry with the given |
256 // start position. | 100 // start position. |
257 if ((function_index_ + FunctionEntry::kSize <= Length()) && | 101 if ((function_index_ + FunctionEntry::kSize <= Length()) && |
258 (static_cast<int>(Data()[function_index_]) == start)) { | 102 (static_cast<int>(Data()[function_index_]) == start)) { |
259 int index = function_index_; | 103 int index = function_index_; |
260 function_index_ += FunctionEntry::kSize; | 104 function_index_ += FunctionEntry::kSize; |
261 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize); | 105 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize); |
262 return FunctionEntry(subvector); | 106 return FunctionEntry(subvector); |
263 } | 107 } |
(...skipping 4927 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5191 for (int i = 0; i < use_counts_[feature]; ++i) { | 5035 for (int i = 0; i < use_counts_[feature]; ++i) { |
5192 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature)); | 5036 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature)); |
5193 } | 5037 } |
5194 } | 5038 } |
5195 isolate->counters()->total_preparse_skipped()->Increment( | 5039 isolate->counters()->total_preparse_skipped()->Increment( |
5196 total_preparse_skipped_); | 5040 total_preparse_skipped_); |
5197 } | 5041 } |
5198 | 5042 |
5199 | 5043 |
5200 // ---------------------------------------------------------------------------- | 5044 // ---------------------------------------------------------------------------- |
5201 // Regular expressions | |
5202 | |
5203 | |
5204 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | |
5205 bool multiline, bool unicode, Isolate* isolate, | |
5206 Zone* zone) | |
5207 : isolate_(isolate), | |
5208 zone_(zone), | |
5209 error_(error), | |
5210 captures_(NULL), | |
5211 in_(in), | |
5212 current_(kEndMarker), | |
5213 next_pos_(0), | |
5214 captures_started_(0), | |
5215 capture_count_(0), | |
5216 has_more_(true), | |
5217 multiline_(multiline), | |
5218 unicode_(unicode), | |
5219 simple_(false), | |
5220 contains_anchor_(false), | |
5221 is_scanned_for_captures_(false), | |
5222 failed_(false) { | |
5223 Advance(); | |
5224 } | |
5225 | |
5226 | |
5227 uc32 RegExpParser::Next() { | |
5228 if (has_next()) { | |
5229 return in()->Get(next_pos_); | |
5230 } else { | |
5231 return kEndMarker; | |
5232 } | |
5233 } | |
5234 | |
5235 | |
5236 void RegExpParser::Advance() { | |
5237 if (next_pos_ < in()->length()) { | |
5238 StackLimitCheck check(isolate()); | |
5239 if (check.HasOverflowed()) { | |
5240 ReportError(CStrVector(Isolate::kStackOverflowMessage)); | |
5241 } else if (zone()->excess_allocation()) { | |
5242 ReportError(CStrVector("Regular expression too large")); | |
5243 } else { | |
5244 current_ = in()->Get(next_pos_); | |
5245 next_pos_++; | |
5246 } | |
5247 } else { | |
5248 current_ = kEndMarker; | |
5249 // Advance so that position() points to 1-after-the-last-character. This is | |
5250 // important so that Reset() to this position works correctly. | |
5251 next_pos_ = in()->length() + 1; | |
5252 has_more_ = false; | |
5253 } | |
5254 } | |
5255 | |
5256 | |
5257 void RegExpParser::Reset(int pos) { | |
5258 next_pos_ = pos; | |
5259 has_more_ = (pos < in()->length()); | |
5260 Advance(); | |
5261 } | |
5262 | |
5263 | |
5264 void RegExpParser::Advance(int dist) { | |
5265 next_pos_ += dist - 1; | |
5266 Advance(); | |
5267 } | |
5268 | |
5269 | |
5270 bool RegExpParser::simple() { | |
5271 return simple_; | |
5272 } | |
5273 | |
5274 | |
5275 bool RegExpParser::IsSyntaxCharacter(uc32 c) { | |
5276 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || | |
5277 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || | |
5278 c == '{' || c == '}' || c == '|'; | |
5279 } | |
5280 | |
5281 | |
5282 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { | |
5283 failed_ = true; | |
5284 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); | |
5285 // Zip to the end to make sure the no more input is read. | |
5286 current_ = kEndMarker; | |
5287 next_pos_ = in()->length(); | |
5288 return NULL; | |
5289 } | |
5290 | |
5291 | |
5292 // Pattern :: | |
5293 // Disjunction | |
5294 RegExpTree* RegExpParser::ParsePattern() { | |
5295 RegExpTree* result = ParseDisjunction(CHECK_FAILED); | |
5296 DCHECK(!has_more()); | |
5297 // If the result of parsing is a literal string atom, and it has the | |
5298 // same length as the input, then the atom is identical to the input. | |
5299 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { | |
5300 simple_ = true; | |
5301 } | |
5302 return result; | |
5303 } | |
5304 | |
5305 | |
5306 // Disjunction :: | |
5307 // Alternative | |
5308 // Alternative | Disjunction | |
5309 // Alternative :: | |
5310 // [empty] | |
5311 // Term Alternative | |
5312 // Term :: | |
5313 // Assertion | |
5314 // Atom | |
5315 // Atom Quantifier | |
5316 RegExpTree* RegExpParser::ParseDisjunction() { | |
5317 // Used to store current state while parsing subexpressions. | |
5318 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | |
5319 zone()); | |
5320 RegExpParserState* state = &initial_state; | |
5321 // Cache the builder in a local variable for quick access. | |
5322 RegExpBuilder* builder = initial_state.builder(); | |
5323 while (true) { | |
5324 switch (current()) { | |
5325 case kEndMarker: | |
5326 if (state->IsSubexpression()) { | |
5327 // Inside a parenthesized group when hitting end of input. | |
5328 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | |
5329 } | |
5330 DCHECK_EQ(INITIAL, state->group_type()); | |
5331 // Parsing completed successfully. | |
5332 return builder->ToRegExp(); | |
5333 case ')': { | |
5334 if (!state->IsSubexpression()) { | |
5335 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); | |
5336 } | |
5337 DCHECK_NE(INITIAL, state->group_type()); | |
5338 | |
5339 Advance(); | |
5340 // End disjunction parsing and convert builder content to new single | |
5341 // regexp atom. | |
5342 RegExpTree* body = builder->ToRegExp(); | |
5343 | |
5344 int end_capture_index = captures_started(); | |
5345 | |
5346 int capture_index = state->capture_index(); | |
5347 SubexpressionType group_type = state->group_type(); | |
5348 | |
5349 // Build result of subexpression. | |
5350 if (group_type == CAPTURE) { | |
5351 RegExpCapture* capture = GetCapture(capture_index); | |
5352 capture->set_body(body); | |
5353 body = capture; | |
5354 } else if (group_type != GROUPING) { | |
5355 DCHECK(group_type == POSITIVE_LOOKAROUND || | |
5356 group_type == NEGATIVE_LOOKAROUND); | |
5357 bool is_positive = (group_type == POSITIVE_LOOKAROUND); | |
5358 body = new (zone()) RegExpLookaround( | |
5359 body, is_positive, end_capture_index - capture_index, capture_index, | |
5360 state->lookaround_type()); | |
5361 } | |
5362 | |
5363 // Restore previous state. | |
5364 state = state->previous_state(); | |
5365 builder = state->builder(); | |
5366 | |
5367 builder->AddAtom(body); | |
5368 // For compatability with JSC and ES3, we allow quantifiers after | |
5369 // lookaheads, and break in all cases. | |
5370 break; | |
5371 } | |
5372 case '|': { | |
5373 Advance(); | |
5374 builder->NewAlternative(); | |
5375 continue; | |
5376 } | |
5377 case '*': | |
5378 case '+': | |
5379 case '?': | |
5380 return ReportError(CStrVector("Nothing to repeat")); | |
5381 case '^': { | |
5382 Advance(); | |
5383 if (multiline_) { | |
5384 builder->AddAssertion( | |
5385 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | |
5386 } else { | |
5387 builder->AddAssertion( | |
5388 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); | |
5389 set_contains_anchor(); | |
5390 } | |
5391 continue; | |
5392 } | |
5393 case '$': { | |
5394 Advance(); | |
5395 RegExpAssertion::AssertionType assertion_type = | |
5396 multiline_ ? RegExpAssertion::END_OF_LINE : | |
5397 RegExpAssertion::END_OF_INPUT; | |
5398 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type)); | |
5399 continue; | |
5400 } | |
5401 case '.': { | |
5402 Advance(); | |
5403 // everything except \x0a, \x0d, \u2028 and \u2029 | |
5404 ZoneList<CharacterRange>* ranges = | |
5405 new(zone()) ZoneList<CharacterRange>(2, zone()); | |
5406 CharacterRange::AddClassEscape('.', ranges, zone()); | |
5407 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false); | |
5408 builder->AddAtom(atom); | |
5409 break; | |
5410 } | |
5411 case '(': { | |
5412 SubexpressionType subexpr_type = CAPTURE; | |
5413 RegExpLookaround::Type lookaround_type = state->lookaround_type(); | |
5414 Advance(); | |
5415 if (current() == '?') { | |
5416 switch (Next()) { | |
5417 case ':': | |
5418 subexpr_type = GROUPING; | |
5419 break; | |
5420 case '=': | |
5421 lookaround_type = RegExpLookaround::LOOKAHEAD; | |
5422 subexpr_type = POSITIVE_LOOKAROUND; | |
5423 break; | |
5424 case '!': | |
5425 lookaround_type = RegExpLookaround::LOOKAHEAD; | |
5426 subexpr_type = NEGATIVE_LOOKAROUND; | |
5427 break; | |
5428 case '<': | |
5429 if (FLAG_harmony_regexp_lookbehind) { | |
5430 Advance(); | |
5431 lookaround_type = RegExpLookaround::LOOKBEHIND; | |
5432 if (Next() == '=') { | |
5433 subexpr_type = POSITIVE_LOOKAROUND; | |
5434 break; | |
5435 } else if (Next() == '!') { | |
5436 subexpr_type = NEGATIVE_LOOKAROUND; | |
5437 break; | |
5438 } | |
5439 } | |
5440 // Fall through. | |
5441 default: | |
5442 ReportError(CStrVector("Invalid group") CHECK_FAILED); | |
5443 break; | |
5444 } | |
5445 Advance(2); | |
5446 } else { | |
5447 if (captures_started_ >= kMaxCaptures) { | |
5448 ReportError(CStrVector("Too many captures") CHECK_FAILED); | |
5449 } | |
5450 captures_started_++; | |
5451 } | |
5452 // Store current state and begin new disjunction parsing. | |
5453 state = new (zone()) RegExpParserState( | |
5454 state, subexpr_type, lookaround_type, captures_started_, zone()); | |
5455 builder = state->builder(); | |
5456 continue; | |
5457 } | |
5458 case '[': { | |
5459 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); | |
5460 builder->AddAtom(atom); | |
5461 break; | |
5462 } | |
5463 // Atom :: | |
5464 // \ AtomEscape | |
5465 case '\\': | |
5466 switch (Next()) { | |
5467 case kEndMarker: | |
5468 return ReportError(CStrVector("\\ at end of pattern")); | |
5469 case 'b': | |
5470 Advance(2); | |
5471 builder->AddAssertion( | |
5472 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); | |
5473 continue; | |
5474 case 'B': | |
5475 Advance(2); | |
5476 builder->AddAssertion( | |
5477 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); | |
5478 continue; | |
5479 // AtomEscape :: | |
5480 // CharacterClassEscape | |
5481 // | |
5482 // CharacterClassEscape :: one of | |
5483 // d D s S w W | |
5484 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { | |
5485 uc32 c = Next(); | |
5486 Advance(2); | |
5487 ZoneList<CharacterRange>* ranges = | |
5488 new(zone()) ZoneList<CharacterRange>(2, zone()); | |
5489 CharacterRange::AddClassEscape(c, ranges, zone()); | |
5490 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false); | |
5491 builder->AddAtom(atom); | |
5492 break; | |
5493 } | |
5494 case '1': case '2': case '3': case '4': case '5': case '6': | |
5495 case '7': case '8': case '9': { | |
5496 int index = 0; | |
5497 if (ParseBackReferenceIndex(&index)) { | |
5498 if (state->IsInsideCaptureGroup(index)) { | |
5499 // The back reference is inside the capture group it refers to. | |
5500 // Nothing can possibly have been captured yet, so we use empty | |
5501 // instead. This ensures that, when checking a back reference, | |
5502 // the capture registers of the referenced capture are either | |
5503 // both set or both cleared. | |
5504 builder->AddEmpty(); | |
5505 } else { | |
5506 RegExpCapture* capture = GetCapture(index); | |
5507 RegExpTree* atom = new (zone()) RegExpBackReference(capture); | |
5508 builder->AddAtom(atom); | |
5509 } | |
5510 break; | |
5511 } | |
5512 uc32 first_digit = Next(); | |
5513 if (first_digit == '8' || first_digit == '9') { | |
5514 // If the 'u' flag is present, only syntax characters can be escaped, | |
5515 // no other identity escapes are allowed. If the 'u' flag is not | |
5516 // present, all identity escapes are allowed. | |
5517 if (!FLAG_harmony_unicode_regexps || !unicode_) { | |
5518 builder->AddCharacter(first_digit); | |
5519 Advance(2); | |
5520 } else { | |
5521 return ReportError(CStrVector("Invalid escape")); | |
5522 } | |
5523 break; | |
5524 } | |
5525 } | |
5526 // FALLTHROUGH | |
5527 case '0': { | |
5528 Advance(); | |
5529 uc32 octal = ParseOctalLiteral(); | |
5530 builder->AddCharacter(octal); | |
5531 break; | |
5532 } | |
5533 // ControlEscape :: one of | |
5534 // f n r t v | |
5535 case 'f': | |
5536 Advance(2); | |
5537 builder->AddCharacter('\f'); | |
5538 break; | |
5539 case 'n': | |
5540 Advance(2); | |
5541 builder->AddCharacter('\n'); | |
5542 break; | |
5543 case 'r': | |
5544 Advance(2); | |
5545 builder->AddCharacter('\r'); | |
5546 break; | |
5547 case 't': | |
5548 Advance(2); | |
5549 builder->AddCharacter('\t'); | |
5550 break; | |
5551 case 'v': | |
5552 Advance(2); | |
5553 builder->AddCharacter('\v'); | |
5554 break; | |
5555 case 'c': { | |
5556 Advance(); | |
5557 uc32 controlLetter = Next(); | |
5558 // Special case if it is an ASCII letter. | |
5559 // Convert lower case letters to uppercase. | |
5560 uc32 letter = controlLetter & ~('a' ^ 'A'); | |
5561 if (letter < 'A' || 'Z' < letter) { | |
5562 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. | |
5563 // This is outside the specification. We match JSC in | |
5564 // reading the backslash as a literal character instead | |
5565 // of as starting an escape. | |
5566 builder->AddCharacter('\\'); | |
5567 } else { | |
5568 Advance(2); | |
5569 builder->AddCharacter(controlLetter & 0x1f); | |
5570 } | |
5571 break; | |
5572 } | |
5573 case 'x': { | |
5574 Advance(2); | |
5575 uc32 value; | |
5576 if (ParseHexEscape(2, &value)) { | |
5577 builder->AddCharacter(value); | |
5578 } else if (!FLAG_harmony_unicode_regexps || !unicode_) { | |
5579 builder->AddCharacter('x'); | |
5580 } else { | |
5581 // If the 'u' flag is present, invalid escapes are not treated as | |
5582 // identity escapes. | |
5583 return ReportError(CStrVector("Invalid escape")); | |
5584 } | |
5585 break; | |
5586 } | |
5587 case 'u': { | |
5588 Advance(2); | |
5589 uc32 value; | |
5590 if (ParseUnicodeEscape(&value)) { | |
5591 builder->AddCharacter(value); | |
5592 } else if (!FLAG_harmony_unicode_regexps || !unicode_) { | |
5593 builder->AddCharacter('u'); | |
5594 } else { | |
5595 // If the 'u' flag is present, invalid escapes are not treated as | |
5596 // identity escapes. | |
5597 return ReportError(CStrVector("Invalid unicode escape")); | |
5598 } | |
5599 break; | |
5600 } | |
5601 default: | |
5602 Advance(); | |
5603 // If the 'u' flag is present, only syntax characters can be escaped, no | |
5604 // other identity escapes are allowed. If the 'u' flag is not present, | |
5605 // all identity escapes are allowed. | |
5606 if (!FLAG_harmony_unicode_regexps || !unicode_ || | |
5607 IsSyntaxCharacter(current())) { | |
5608 builder->AddCharacter(current()); | |
5609 Advance(); | |
5610 } else { | |
5611 return ReportError(CStrVector("Invalid escape")); | |
5612 } | |
5613 break; | |
5614 } | |
5615 break; | |
5616 case '{': { | |
5617 int dummy; | |
5618 if (ParseIntervalQuantifier(&dummy, &dummy)) { | |
5619 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); | |
5620 } | |
5621 // fallthrough | |
5622 } | |
5623 default: | |
5624 builder->AddCharacter(current()); | |
5625 Advance(); | |
5626 break; | |
5627 } // end switch(current()) | |
5628 | |
5629 int min; | |
5630 int max; | |
5631 switch (current()) { | |
5632 // QuantifierPrefix :: | |
5633 // * | |
5634 // + | |
5635 // ? | |
5636 // { | |
5637 case '*': | |
5638 min = 0; | |
5639 max = RegExpTree::kInfinity; | |
5640 Advance(); | |
5641 break; | |
5642 case '+': | |
5643 min = 1; | |
5644 max = RegExpTree::kInfinity; | |
5645 Advance(); | |
5646 break; | |
5647 case '?': | |
5648 min = 0; | |
5649 max = 1; | |
5650 Advance(); | |
5651 break; | |
5652 case '{': | |
5653 if (ParseIntervalQuantifier(&min, &max)) { | |
5654 if (max < min) { | |
5655 ReportError(CStrVector("numbers out of order in {} quantifier.") | |
5656 CHECK_FAILED); | |
5657 } | |
5658 break; | |
5659 } else { | |
5660 continue; | |
5661 } | |
5662 default: | |
5663 continue; | |
5664 } | |
5665 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; | |
5666 if (current() == '?') { | |
5667 quantifier_type = RegExpQuantifier::NON_GREEDY; | |
5668 Advance(); | |
5669 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { | |
5670 // FLAG_regexp_possessive_quantifier is a debug-only flag. | |
5671 quantifier_type = RegExpQuantifier::POSSESSIVE; | |
5672 Advance(); | |
5673 } | |
5674 builder->AddQuantifierToAtom(min, max, quantifier_type); | |
5675 } | |
5676 } | |
5677 | |
5678 | |
5679 #ifdef DEBUG | |
5680 // Currently only used in an DCHECK. | |
5681 static bool IsSpecialClassEscape(uc32 c) { | |
5682 switch (c) { | |
5683 case 'd': case 'D': | |
5684 case 's': case 'S': | |
5685 case 'w': case 'W': | |
5686 return true; | |
5687 default: | |
5688 return false; | |
5689 } | |
5690 } | |
5691 #endif | |
5692 | |
5693 | |
5694 // In order to know whether an escape is a backreference or not we have to scan | |
5695 // the entire regexp and find the number of capturing parentheses. However we | |
5696 // don't want to scan the regexp twice unless it is necessary. This mini-parser | |
5697 // is called when needed. It can see the difference between capturing and | |
5698 // noncapturing parentheses and can skip character classes and backslash-escaped | |
5699 // characters. | |
5700 void RegExpParser::ScanForCaptures() { | |
5701 // Start with captures started previous to current position | |
5702 int capture_count = captures_started(); | |
5703 // Add count of captures after this position. | |
5704 int n; | |
5705 while ((n = current()) != kEndMarker) { | |
5706 Advance(); | |
5707 switch (n) { | |
5708 case '\\': | |
5709 Advance(); | |
5710 break; | |
5711 case '[': { | |
5712 int c; | |
5713 while ((c = current()) != kEndMarker) { | |
5714 Advance(); | |
5715 if (c == '\\') { | |
5716 Advance(); | |
5717 } else { | |
5718 if (c == ']') break; | |
5719 } | |
5720 } | |
5721 break; | |
5722 } | |
5723 case '(': | |
5724 if (current() != '?') capture_count++; | |
5725 break; | |
5726 } | |
5727 } | |
5728 capture_count_ = capture_count; | |
5729 is_scanned_for_captures_ = true; | |
5730 } | |
5731 | |
5732 | |
5733 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { | |
5734 DCHECK_EQ('\\', current()); | |
5735 DCHECK('1' <= Next() && Next() <= '9'); | |
5736 // Try to parse a decimal literal that is no greater than the total number | |
5737 // of left capturing parentheses in the input. | |
5738 int start = position(); | |
5739 int value = Next() - '0'; | |
5740 Advance(2); | |
5741 while (true) { | |
5742 uc32 c = current(); | |
5743 if (IsDecimalDigit(c)) { | |
5744 value = 10 * value + (c - '0'); | |
5745 if (value > kMaxCaptures) { | |
5746 Reset(start); | |
5747 return false; | |
5748 } | |
5749 Advance(); | |
5750 } else { | |
5751 break; | |
5752 } | |
5753 } | |
5754 if (value > captures_started()) { | |
5755 if (!is_scanned_for_captures_) { | |
5756 int saved_position = position(); | |
5757 ScanForCaptures(); | |
5758 Reset(saved_position); | |
5759 } | |
5760 if (value > capture_count_) { | |
5761 Reset(start); | |
5762 return false; | |
5763 } | |
5764 } | |
5765 *index_out = value; | |
5766 return true; | |
5767 } | |
5768 | |
5769 | |
5770 RegExpCapture* RegExpParser::GetCapture(int index) { | |
5771 // The index for the capture groups are one-based. Its index in the list is | |
5772 // zero-based. | |
5773 int know_captures = | |
5774 is_scanned_for_captures_ ? capture_count_ : captures_started_; | |
5775 DCHECK(index <= know_captures); | |
5776 if (captures_ == NULL) { | |
5777 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); | |
5778 } | |
5779 while (captures_->length() < know_captures) { | |
5780 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); | |
5781 } | |
5782 return captures_->at(index - 1); | |
5783 } | |
5784 | |
5785 | |
5786 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { | |
5787 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { | |
5788 if (s->group_type() != CAPTURE) continue; | |
5789 // Return true if we found the matching capture index. | |
5790 if (index == s->capture_index()) return true; | |
5791 // Abort if index is larger than what has been parsed up till this state. | |
5792 if (index > s->capture_index()) return false; | |
5793 } | |
5794 return false; | |
5795 } | |
5796 | |
5797 | |
5798 // QuantifierPrefix :: | |
5799 // { DecimalDigits } | |
5800 // { DecimalDigits , } | |
5801 // { DecimalDigits , DecimalDigits } | |
5802 // | |
5803 // Returns true if parsing succeeds, and set the min_out and max_out | |
5804 // values. Values are truncated to RegExpTree::kInfinity if they overflow. | |
5805 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { | |
5806 DCHECK_EQ(current(), '{'); | |
5807 int start = position(); | |
5808 Advance(); | |
5809 int min = 0; | |
5810 if (!IsDecimalDigit(current())) { | |
5811 Reset(start); | |
5812 return false; | |
5813 } | |
5814 while (IsDecimalDigit(current())) { | |
5815 int next = current() - '0'; | |
5816 if (min > (RegExpTree::kInfinity - next) / 10) { | |
5817 // Overflow. Skip past remaining decimal digits and return -1. | |
5818 do { | |
5819 Advance(); | |
5820 } while (IsDecimalDigit(current())); | |
5821 min = RegExpTree::kInfinity; | |
5822 break; | |
5823 } | |
5824 min = 10 * min + next; | |
5825 Advance(); | |
5826 } | |
5827 int max = 0; | |
5828 if (current() == '}') { | |
5829 max = min; | |
5830 Advance(); | |
5831 } else if (current() == ',') { | |
5832 Advance(); | |
5833 if (current() == '}') { | |
5834 max = RegExpTree::kInfinity; | |
5835 Advance(); | |
5836 } else { | |
5837 while (IsDecimalDigit(current())) { | |
5838 int next = current() - '0'; | |
5839 if (max > (RegExpTree::kInfinity - next) / 10) { | |
5840 do { | |
5841 Advance(); | |
5842 } while (IsDecimalDigit(current())); | |
5843 max = RegExpTree::kInfinity; | |
5844 break; | |
5845 } | |
5846 max = 10 * max + next; | |
5847 Advance(); | |
5848 } | |
5849 if (current() != '}') { | |
5850 Reset(start); | |
5851 return false; | |
5852 } | |
5853 Advance(); | |
5854 } | |
5855 } else { | |
5856 Reset(start); | |
5857 return false; | |
5858 } | |
5859 *min_out = min; | |
5860 *max_out = max; | |
5861 return true; | |
5862 } | |
5863 | |
5864 | |
5865 uc32 RegExpParser::ParseOctalLiteral() { | |
5866 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); | |
5867 // For compatibility with some other browsers (not all), we parse | |
5868 // up to three octal digits with a value below 256. | |
5869 uc32 value = current() - '0'; | |
5870 Advance(); | |
5871 if ('0' <= current() && current() <= '7') { | |
5872 value = value * 8 + current() - '0'; | |
5873 Advance(); | |
5874 if (value < 32 && '0' <= current() && current() <= '7') { | |
5875 value = value * 8 + current() - '0'; | |
5876 Advance(); | |
5877 } | |
5878 } | |
5879 return value; | |
5880 } | |
5881 | |
5882 | |
5883 bool RegExpParser::ParseHexEscape(int length, uc32* value) { | |
5884 int start = position(); | |
5885 uc32 val = 0; | |
5886 for (int i = 0; i < length; ++i) { | |
5887 uc32 c = current(); | |
5888 int d = HexValue(c); | |
5889 if (d < 0) { | |
5890 Reset(start); | |
5891 return false; | |
5892 } | |
5893 val = val * 16 + d; | |
5894 Advance(); | |
5895 } | |
5896 *value = val; | |
5897 return true; | |
5898 } | |
5899 | |
5900 | |
5901 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | |
5902 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | |
5903 // allowed). In the latter case, the number of hex digits between { } is | |
5904 // arbitrary. \ and u have already been read. | |
5905 if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) { | |
5906 int start = position(); | |
5907 Advance(); | |
5908 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | |
5909 if (current() == '}') { | |
5910 Advance(); | |
5911 return true; | |
5912 } | |
5913 } | |
5914 Reset(start); | |
5915 return false; | |
5916 } | |
5917 // \u but no {, or \u{...} escapes not allowed. | |
5918 return ParseHexEscape(4, value); | |
5919 } | |
5920 | |
5921 | |
5922 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { | |
5923 uc32 x = 0; | |
5924 int d = HexValue(current()); | |
5925 if (d < 0) { | |
5926 return false; | |
5927 } | |
5928 while (d >= 0) { | |
5929 x = x * 16 + d; | |
5930 if (x > max_value) { | |
5931 return false; | |
5932 } | |
5933 Advance(); | |
5934 d = HexValue(current()); | |
5935 } | |
5936 *value = x; | |
5937 return true; | |
5938 } | |
5939 | |
5940 | |
5941 uc32 RegExpParser::ParseClassCharacterEscape() { | |
5942 DCHECK(current() == '\\'); | |
5943 DCHECK(has_next() && !IsSpecialClassEscape(Next())); | |
5944 Advance(); | |
5945 switch (current()) { | |
5946 case 'b': | |
5947 Advance(); | |
5948 return '\b'; | |
5949 // ControlEscape :: one of | |
5950 // f n r t v | |
5951 case 'f': | |
5952 Advance(); | |
5953 return '\f'; | |
5954 case 'n': | |
5955 Advance(); | |
5956 return '\n'; | |
5957 case 'r': | |
5958 Advance(); | |
5959 return '\r'; | |
5960 case 't': | |
5961 Advance(); | |
5962 return '\t'; | |
5963 case 'v': | |
5964 Advance(); | |
5965 return '\v'; | |
5966 case 'c': { | |
5967 uc32 controlLetter = Next(); | |
5968 uc32 letter = controlLetter & ~('A' ^ 'a'); | |
5969 // For compatibility with JSC, inside a character class | |
5970 // we also accept digits and underscore as control characters. | |
5971 if ((controlLetter >= '0' && controlLetter <= '9') || | |
5972 controlLetter == '_' || | |
5973 (letter >= 'A' && letter <= 'Z')) { | |
5974 Advance(2); | |
5975 // Control letters mapped to ASCII control characters in the range | |
5976 // 0x00-0x1f. | |
5977 return controlLetter & 0x1f; | |
5978 } | |
5979 // We match JSC in reading the backslash as a literal | |
5980 // character instead of as starting an escape. | |
5981 return '\\'; | |
5982 } | |
5983 case '0': case '1': case '2': case '3': case '4': case '5': | |
5984 case '6': case '7': | |
5985 // For compatibility, we interpret a decimal escape that isn't | |
5986 // a back reference (and therefore either \0 or not valid according | |
5987 // to the specification) as a 1..3 digit octal character code. | |
5988 return ParseOctalLiteral(); | |
5989 case 'x': { | |
5990 Advance(); | |
5991 uc32 value; | |
5992 if (ParseHexEscape(2, &value)) { | |
5993 return value; | |
5994 } | |
5995 if (!FLAG_harmony_unicode_regexps || !unicode_) { | |
5996 // If \x is not followed by a two-digit hexadecimal, treat it | |
5997 // as an identity escape. | |
5998 return 'x'; | |
5999 } | |
6000 // If the 'u' flag is present, invalid escapes are not treated as | |
6001 // identity escapes. | |
6002 ReportError(CStrVector("Invalid escape")); | |
6003 return 0; | |
6004 } | |
6005 case 'u': { | |
6006 Advance(); | |
6007 uc32 value; | |
6008 if (ParseUnicodeEscape(&value)) { | |
6009 return value; | |
6010 } | |
6011 if (!FLAG_harmony_unicode_regexps || !unicode_) { | |
6012 return 'u'; | |
6013 } | |
6014 // If the 'u' flag is present, invalid escapes are not treated as | |
6015 // identity escapes. | |
6016 ReportError(CStrVector("Invalid unicode escape")); | |
6017 return 0; | |
6018 } | |
6019 default: { | |
6020 uc32 result = current(); | |
6021 // If the 'u' flag is present, only syntax characters can be escaped, no | |
6022 // other identity escapes are allowed. If the 'u' flag is not present, all | |
6023 // identity escapes are allowed. | |
6024 if (!FLAG_harmony_unicode_regexps || !unicode_ || | |
6025 IsSyntaxCharacter(result)) { | |
6026 Advance(); | |
6027 return result; | |
6028 } | |
6029 ReportError(CStrVector("Invalid escape")); | |
6030 return 0; | |
6031 } | |
6032 } | |
6033 return 0; | |
6034 } | |
6035 | |
6036 | |
6037 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | |
6038 DCHECK_EQ(0, *char_class); | |
6039 uc32 first = current(); | |
6040 if (first == '\\') { | |
6041 switch (Next()) { | |
6042 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { | |
6043 *char_class = Next(); | |
6044 Advance(2); | |
6045 return CharacterRange::Singleton(0); // Return dummy value. | |
6046 } | |
6047 case kEndMarker: | |
6048 return ReportError(CStrVector("\\ at end of pattern")); | |
6049 default: | |
6050 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); | |
6051 return CharacterRange::Singleton(c); | |
6052 } | |
6053 } else { | |
6054 Advance(); | |
6055 return CharacterRange::Singleton(first); | |
6056 } | |
6057 } | |
6058 | |
6059 | |
6060 static const uc16 kNoCharClass = 0; | |
6061 | |
6062 // Adds range or pre-defined character class to character ranges. | |
6063 // If char_class is not kInvalidClass, it's interpreted as a class | |
6064 // escape (i.e., 's' means whitespace, from '\s'). | |
6065 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | |
6066 uc16 char_class, | |
6067 CharacterRange range, | |
6068 Zone* zone) { | |
6069 if (char_class != kNoCharClass) { | |
6070 CharacterRange::AddClassEscape(char_class, ranges, zone); | |
6071 } else { | |
6072 ranges->Add(range, zone); | |
6073 } | |
6074 } | |
6075 | |
6076 | |
6077 RegExpTree* RegExpParser::ParseCharacterClass() { | |
6078 static const char* kUnterminated = "Unterminated character class"; | |
6079 static const char* kRangeOutOfOrder = "Range out of order in character class"; | |
6080 | |
6081 DCHECK_EQ(current(), '['); | |
6082 Advance(); | |
6083 bool is_negated = false; | |
6084 if (current() == '^') { | |
6085 is_negated = true; | |
6086 Advance(); | |
6087 } | |
6088 ZoneList<CharacterRange>* ranges = | |
6089 new(zone()) ZoneList<CharacterRange>(2, zone()); | |
6090 while (has_more() && current() != ']') { | |
6091 uc16 char_class = kNoCharClass; | |
6092 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); | |
6093 if (current() == '-') { | |
6094 Advance(); | |
6095 if (current() == kEndMarker) { | |
6096 // If we reach the end we break out of the loop and let the | |
6097 // following code report an error. | |
6098 break; | |
6099 } else if (current() == ']') { | |
6100 AddRangeOrEscape(ranges, char_class, first, zone()); | |
6101 ranges->Add(CharacterRange::Singleton('-'), zone()); | |
6102 break; | |
6103 } | |
6104 uc16 char_class_2 = kNoCharClass; | |
6105 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); | |
6106 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { | |
6107 // Either end is an escaped character class. Treat the '-' verbatim. | |
6108 AddRangeOrEscape(ranges, char_class, first, zone()); | |
6109 ranges->Add(CharacterRange::Singleton('-'), zone()); | |
6110 AddRangeOrEscape(ranges, char_class_2, next, zone()); | |
6111 continue; | |
6112 } | |
6113 if (first.from() > next.to()) { | |
6114 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); | |
6115 } | |
6116 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); | |
6117 } else { | |
6118 AddRangeOrEscape(ranges, char_class, first, zone()); | |
6119 } | |
6120 } | |
6121 if (!has_more()) { | |
6122 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); | |
6123 } | |
6124 Advance(); | |
6125 if (ranges->length() == 0) { | |
6126 ranges->Add(CharacterRange::Everything(), zone()); | |
6127 is_negated = !is_negated; | |
6128 } | |
6129 return new (zone()) RegExpCharacterClass(ranges, is_negated); | |
6130 } | |
6131 | |
6132 | |
6133 // ---------------------------------------------------------------------------- | |
6134 // The Parser interface. | 5045 // The Parser interface. |
6135 | 5046 |
6136 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | |
6137 FlatStringReader* input, bool multiline, | |
6138 bool unicode, RegExpCompileData* result) { | |
6139 DCHECK(result != NULL); | |
6140 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); | |
6141 RegExpTree* tree = parser.ParsePattern(); | |
6142 if (parser.failed()) { | |
6143 DCHECK(tree == NULL); | |
6144 DCHECK(!result->error.is_null()); | |
6145 } else { | |
6146 DCHECK(tree != NULL); | |
6147 DCHECK(result->error.is_null()); | |
6148 result->tree = tree; | |
6149 int capture_count = parser.captures_started(); | |
6150 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; | |
6151 result->contains_anchor = parser.contains_anchor(); | |
6152 result->capture_count = capture_count; | |
6153 } | |
6154 return !parser.failed(); | |
6155 } | |
6156 | |
6157 | 5047 |
6158 bool Parser::ParseStatic(ParseInfo* info) { | 5048 bool Parser::ParseStatic(ParseInfo* info) { |
6159 Parser parser(info); | 5049 Parser parser(info); |
6160 if (parser.Parse(info)) { | 5050 if (parser.Parse(info)) { |
6161 info->set_language_mode(info->literal()->language_mode()); | 5051 info->set_language_mode(info->literal()->language_mode()); |
6162 return true; | 5052 return true; |
6163 } | 5053 } |
6164 return false; | 5054 return false; |
6165 } | 5055 } |
6166 | 5056 |
(...skipping 393 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6560 auto class_literal = value->AsClassLiteral(); | 5450 auto class_literal = value->AsClassLiteral(); |
6561 if (class_literal->raw_name() == nullptr) { | 5451 if (class_literal->raw_name() == nullptr) { |
6562 class_literal->set_raw_name(name); | 5452 class_literal->set_raw_name(name); |
6563 } | 5453 } |
6564 } | 5454 } |
6565 } | 5455 } |
6566 | 5456 |
6567 | 5457 |
6568 } // namespace internal | 5458 } // namespace internal |
6569 } // namespace v8 | 5459 } // namespace v8 |
OLD | NEW |