OLD | NEW |
---|---|
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 16 matching lines...) Expand all Loading... | |
27 | 27 |
28 #include "v8.h" | 28 #include "v8.h" |
29 | 29 |
30 #include "api.h" | 30 #include "api.h" |
31 #include "ast.h" | 31 #include "ast.h" |
32 #include "bootstrapper.h" | 32 #include "bootstrapper.h" |
33 #include "platform.h" | 33 #include "platform.h" |
34 #include "runtime.h" | 34 #include "runtime.h" |
35 #include "parser.h" | 35 #include "parser.h" |
36 #include "scopes.h" | 36 #include "scopes.h" |
37 #include "string-stream.h" | |
37 | 38 |
38 namespace v8 { namespace internal { | 39 namespace v8 { namespace internal { |
39 | 40 |
40 class ParserFactory; | 41 class ParserFactory; |
41 class ParserLog; | 42 class ParserLog; |
42 class TemporaryScope; | 43 class TemporaryScope; |
43 template <typename T> class ZoneListWrapper; | 44 template <typename T> class ZoneListWrapper; |
44 | 45 |
45 | 46 |
46 class Parser { | 47 class Parser { |
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
220 Handle<String> type, | 221 Handle<String> type, |
221 Vector< Handle<Object> > arguments); | 222 Vector< Handle<Object> > arguments); |
222 | 223 |
223 friend class Target; | 224 friend class Target; |
224 friend class TargetScope; | 225 friend class TargetScope; |
225 friend class LexicalScope; | 226 friend class LexicalScope; |
226 friend class TemporaryScope; | 227 friend class TemporaryScope; |
227 }; | 228 }; |
228 | 229 |
229 | 230 |
231 template <typename T, int initial_size> | |
232 class BufferedZoneList { | |
233 public: | |
234 | |
235 BufferedZoneList() : | |
Mads Ager (chromium)
2008/11/25 21:09:41
All on one line.
| |
236 list_(NULL), last_(NULL) {} | |
237 | |
238 // Adds element at end of list. This element is buffered and can | |
239 // be read using last() or removed using RemoveLast until a new Add or until | |
240 // RemoveLast or GetList has been called. | |
241 void Add(T* value) { | |
242 if (last_ != NULL) { | |
243 if (list_ == NULL) { | |
244 list_ = new ZoneList<T*>(initial_size); | |
245 } | |
246 list_->Add(last_); | |
247 } | |
248 last_ = value; | |
249 } | |
250 | |
251 T* last() { | |
252 ASSERT(last_ != NULL); | |
253 return last_; | |
254 } | |
255 | |
256 T* RemoveLast() { | |
257 ASSERT(last_ != NULL); | |
258 T* result = last_; | |
259 if (list_ != NULL && list_->length() > 0) | |
260 last_ = list_->RemoveLast(); | |
261 else | |
262 last_ = NULL; | |
263 return result; | |
264 } | |
265 | |
266 T* Get(int i) { | |
267 ASSERT(0 <= i && i < length()); | |
268 if (list_ == NULL) { | |
269 ASSERT_EQ(0, i); | |
270 return last_; | |
271 } else { | |
272 if (i == list_->length()) { | |
273 ASSERT(last_ != NULL); | |
274 return last_; | |
275 } else { | |
276 return list_->at(i); | |
277 } | |
278 } | |
279 } | |
280 | |
281 void Clear() { | |
282 list_ = NULL; | |
283 last_ = NULL; | |
284 } | |
285 | |
286 int length() { | |
287 int length = (list_ == NULL) ? 0 : list_->length(); | |
288 return length + ((last_ == NULL) ? 0 : 1); | |
289 } | |
290 | |
291 ZoneList<T*>* GetList() { | |
292 if (list_ == NULL) { | |
293 list_ = new ZoneList<T*>(initial_size); | |
294 } | |
295 if (last_ != NULL) { | |
296 list_->Add(last_); | |
297 last_ = NULL; | |
298 } | |
299 return list_; | |
300 } | |
301 | |
302 private: | |
303 ZoneList<T*>* list_; | |
304 T* last_; | |
305 }; | |
306 | |
307 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. | |
308 class RegExpBuilder { | |
309 public: | |
310 RegExpBuilder(); | |
311 void AddCharacter(uc16 character); | |
312 // "Adds" an empty expression. Does nothing except consume a | |
313 // following quantifier | |
314 void AddEmpty(); | |
315 void AddAtom(RegExpTree* tree); | |
316 void AddAssertion(RegExpTree* tree); | |
317 void NewAlternative(); // '|' | |
318 void AddQuantifierToAtom(int min, int max, bool is_greedy); | |
319 RegExpTree* ToRegExp(); | |
320 private: | |
321 void FlushCharacters(); | |
322 void FlushText(); | |
323 void FlushTerms(); | |
324 bool pending_empty_; | |
325 ZoneList<uc16>* characters_; | |
326 BufferedZoneList<RegExpTree, 2> terms_; | |
327 BufferedZoneList<RegExpTree, 2> text_; | |
328 BufferedZoneList<RegExpTree, 2> alternatives_; | |
329 #ifdef DEBUG | |
330 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; | |
331 #define LAST(x) last_added_ = x; | |
332 #else | |
333 #define LAST(x) | |
334 #endif | |
335 }; | |
336 | |
337 | |
338 RegExpBuilder::RegExpBuilder() | |
339 : pending_empty_(false), characters_(NULL), terms_(), alternatives_() | |
Mads Ager (chromium)
2008/11/25 21:09:41
Four space indent. More occurrences below.
| |
340 #ifdef DEBUG | |
341 , last_added_(ADD_NONE) | |
342 #endif | |
343 {} | |
344 | |
345 | |
346 void RegExpBuilder::FlushCharacters() { | |
347 pending_empty_ = false; | |
348 if (characters_ != NULL) { | |
349 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector()); | |
350 characters_ = NULL; | |
351 text_.Add(atom); | |
352 LAST(ADD_ATOM); | |
353 } | |
354 } | |
355 | |
356 | |
357 void RegExpBuilder::FlushText() { | |
358 FlushCharacters(); | |
359 int num_text = text_.length(); | |
360 if (num_text == 0) { | |
361 return; | |
362 } else if (num_text == 1) { | |
363 terms_.Add(text_.last()); | |
364 } else { | |
365 RegExpText* text = new RegExpText(); | |
366 for (int i = 0; i < num_text; i++) | |
367 text_.Get(i)->AppendToText(text); | |
368 terms_.Add(text); | |
369 } | |
370 text_.Clear(); | |
371 } | |
372 | |
373 | |
374 void RegExpBuilder::AddCharacter(uc16 c) { | |
375 pending_empty_ = false; | |
376 if (characters_ == NULL) { | |
377 characters_ = new ZoneList<uc16>(4); | |
378 } | |
379 characters_->Add(c); | |
380 LAST(ADD_CHAR); | |
381 } | |
382 | |
383 | |
384 void RegExpBuilder::AddEmpty() { | |
385 pending_empty_ = true; | |
386 } | |
387 | |
388 | |
389 void RegExpBuilder::AddAtom(RegExpTree* term) { | |
390 if (term->IsEmpty()) { | |
391 AddEmpty(); | |
392 return; | |
393 } | |
394 if (term->IsTextElement()) { | |
395 FlushCharacters(); | |
396 text_.Add(term); | |
397 } else { | |
398 FlushText(); | |
399 terms_.Add(term); | |
400 } | |
401 LAST(ADD_ATOM); | |
402 } | |
403 | |
404 | |
405 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | |
406 FlushText(); | |
407 terms_.Add(assert); | |
408 LAST(ADD_ASSERT); | |
409 } | |
410 | |
411 | |
412 void RegExpBuilder::NewAlternative() { | |
413 FlushTerms(); | |
414 } | |
415 | |
416 | |
417 void RegExpBuilder::FlushTerms() { | |
418 FlushText(); | |
419 int num_terms = terms_.length(); | |
420 RegExpTree* alternative; | |
421 if (num_terms == 0) { | |
422 alternative = RegExpEmpty::GetInstance(); | |
423 } else if (num_terms == 1) { | |
424 alternative = terms_.last(); | |
425 } else { | |
426 alternative = new RegExpAlternative(terms_.GetList()); | |
427 } | |
428 alternatives_.Add(alternative); | |
429 terms_.Clear(); | |
430 LAST(ADD_NONE); | |
431 } | |
432 | |
433 | |
434 RegExpTree* RegExpBuilder::ToRegExp() { | |
435 FlushTerms(); | |
436 int num_alternatives = alternatives_.length(); | |
437 if (num_alternatives == 0) { | |
438 return RegExpEmpty::GetInstance(); | |
439 } | |
440 if (num_alternatives == 1) { | |
441 return alternatives_.last(); | |
442 } | |
443 return new RegExpDisjunction(alternatives_.GetList()); | |
444 } | |
445 | |
446 | |
447 void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) { | |
448 if (pending_empty_) { | |
449 pending_empty_ = false; | |
450 return; | |
451 } | |
452 RegExpTree* atom; | |
453 if (characters_ != NULL) { | |
454 ASSERT(last_added_ == ADD_CHAR); | |
455 // Last atom was character. | |
456 Vector<const uc16> char_vector = characters_->ToConstVector(); | |
457 int num_chars = char_vector.length(); | |
458 if (num_chars > 1) { | |
459 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); | |
460 text_.Add(new RegExpAtom(prefix)); | |
461 char_vector = char_vector.SubVector(num_chars - 1, num_chars); | |
462 } | |
463 characters_ = NULL; | |
464 atom = new RegExpAtom(char_vector); | |
465 FlushText(); | |
466 } else if (text_.length() > 0) { | |
467 ASSERT(last_added_ == ADD_ATOM); | |
468 atom = text_.RemoveLast(); | |
469 FlushText(); | |
470 } else if (terms_.length() > 0) { | |
471 ASSERT(last_added_ == ADD_ATOM); | |
472 atom = terms_.RemoveLast(); | |
473 if (atom->IsLookahead() || atom->IsAssertion()) { | |
474 // Guaranteed not to match a non-empty string. | |
475 // Assertion as an atom can happen as, e.g., (?:\b) | |
476 LAST(ADD_TERM); | |
477 if (min == 0) { | |
478 return; | |
479 } | |
480 terms_.Add(atom); | |
481 return; | |
482 } | |
483 } else { | |
484 // Only call immediately after adding an atom or character! | |
485 UNREACHABLE(); | |
486 return; | |
487 } | |
488 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom)); | |
489 LAST(ADD_TERM); | |
490 } | |
491 | |
492 | |
493 class RegExpParser { | |
494 public: | |
495 RegExpParser(FlatStringReader* in, | |
496 Handle<String>* error, | |
497 bool multiline_mode); | |
498 RegExpTree* ParsePattern(bool* ok); | |
499 RegExpTree* ParseDisjunction(bool* ok); | |
500 RegExpTree* ParseGroup(bool* ok); | |
501 RegExpTree* ParseCharacterClass(bool* ok); | |
502 | |
503 // Parses a {...,...} quantifier and stores the range in the given | |
504 // out parameters. | |
505 bool ParseIntervalQuantifier(int* min_out, int* max_out); | |
506 | |
507 // Parses and returns a single escaped character. The character | |
508 // must not be 'b' or 'B' since they are usually handle specially. | |
509 uc32 ParseClassCharacterEscape(bool* ok); | |
510 | |
511 // Checks whether the following is a length-digit hexadecimal number, | |
512 // and sets the value if it is. | |
513 bool ParseHexEscape(int length, uc32* value); | |
514 | |
515 uc32 ParseControlLetterEscape(bool* ok); | |
516 uc32 ParseOctalLiteral(); | |
517 | |
518 // Tries to parse the input as a back reference. If successful it | |
519 // stores the result in the output parameter and returns true. If | |
520 // it fails it will push back the characters read so the same characters | |
521 // can be reparsed. | |
522 bool ParseBackReferenceIndex(int* index_out); | |
523 | |
524 CharacterRange ParseClassAtom(bool* is_char_class, | |
525 ZoneList<CharacterRange>* ranges, | |
526 bool* ok); | |
527 RegExpTree* ReportError(Vector<const char> message, bool* ok); | |
528 void Advance(); | |
529 void Advance(int dist); | |
530 void Reset(int pos); | |
531 | |
532 bool HasCharacterEscapes(); | |
533 | |
534 int captures_started() { return captures_ == NULL ? 0 : captures_->length(); } | |
535 int position() { return next_pos_ - 1; } | |
536 | |
537 static const uc32 kEndMarker = (1 << 21); | |
538 private: | |
539 uc32 current() { return current_; } | |
540 bool has_more() { return has_more_; } | |
541 bool has_next() { return next_pos_ < in()->length(); } | |
542 uc32 Next(); | |
543 FlatStringReader* in() { return in_; } | |
544 void ScanForCaptures(); | |
545 bool CaptureAvailable(int index); | |
546 uc32 current_; | |
547 bool has_more_; | |
548 bool multiline_mode_; | |
549 int next_pos_; | |
550 FlatStringReader* in_; | |
551 Handle<String>* error_; | |
552 bool has_character_escapes_; | |
553 bool is_scanned_for_captures_; | |
554 ZoneList<RegExpCapture*>* captures_; | |
555 int capture_count_; | |
556 }; | |
557 | |
558 | |
230 // A temporary scope stores information during parsing, just like | 559 // A temporary scope stores information during parsing, just like |
231 // a plain scope. However, temporary scopes are not kept around | 560 // a plain scope. However, temporary scopes are not kept around |
232 // after parsing or referenced by syntax trees so they can be stack- | 561 // after parsing or referenced by syntax trees so they can be stack- |
233 // allocated and hence used by the pre-parser. | 562 // allocated and hence used by the pre-parser. |
234 class TemporaryScope BASE_EMBEDDED { | 563 class TemporaryScope BASE_EMBEDDED { |
235 public: | 564 public: |
236 explicit TemporaryScope(Parser* parser); | 565 explicit TemporaryScope(Parser* parser); |
237 ~TemporaryScope(); | 566 ~TemporaryScope(); |
238 | 567 |
239 int NextMaterializedLiteralIndex() { | 568 int NextMaterializedLiteralIndex() { |
(...skipping 2917 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3157 } | 3486 } |
3158 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); | 3487 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); |
3159 args->Add(new Literal(type)); | 3488 args->Add(new Literal(type)); |
3160 args->Add(new Literal(array)); | 3489 args->Add(new Literal(array)); |
3161 return new Throw(new CallRuntime(constructor, NULL, args), | 3490 return new Throw(new CallRuntime(constructor, NULL, args), |
3162 scanner().location().beg_pos); | 3491 scanner().location().beg_pos); |
3163 } | 3492 } |
3164 | 3493 |
3165 | 3494 |
3166 // ---------------------------------------------------------------------------- | 3495 // ---------------------------------------------------------------------------- |
3496 // Regular expressions | |
3497 | |
3498 | |
3499 RegExpParser::RegExpParser(FlatStringReader* in, | |
3500 Handle<String>* error, | |
3501 bool multiline_mode) | |
3502 : current_(kEndMarker), | |
3503 has_more_(true), | |
3504 multiline_mode_(multiline_mode), | |
3505 next_pos_(0), | |
3506 in_(in), | |
3507 error_(error), | |
3508 has_character_escapes_(false), | |
3509 is_scanned_for_captures_(false), | |
3510 captures_(NULL), | |
3511 capture_count_(0) { | |
3512 Advance(1); | |
3513 } | |
3514 | |
3515 | |
3516 uc32 RegExpParser::Next() { | |
3517 if (has_next()) { | |
3518 return in()->Get(next_pos_); | |
3519 } else { | |
3520 return kEndMarker; | |
3521 } | |
3522 } | |
3523 | |
3524 | |
3525 void RegExpParser::Advance() { | |
3526 if (next_pos_ < in()->length()) { | |
3527 current_ = in()->Get(next_pos_); | |
3528 next_pos_++; | |
3529 } else { | |
3530 current_ = kEndMarker; | |
3531 has_more_ = false; | |
3532 } | |
3533 } | |
3534 | |
3535 | |
3536 void RegExpParser::Reset(int pos) { | |
3537 next_pos_ = pos; | |
3538 Advance(); | |
3539 } | |
3540 | |
3541 | |
3542 void RegExpParser::Advance(int dist) { | |
3543 for (int i = 0; i < dist; i++) | |
3544 Advance(); | |
3545 } | |
3546 | |
3547 | |
3548 // Reports whether the parsed string atoms contain any characters that were | |
3549 // escaped in the original pattern. If not, all atoms are proper substrings | |
3550 // of the original pattern. | |
3551 bool RegExpParser::HasCharacterEscapes() { | |
3552 return has_character_escapes_; | |
3553 } | |
3554 | |
3555 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { | |
3556 *ok = false; | |
3557 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); | |
3558 return NULL; | |
3559 } | |
3560 | |
3561 | |
3562 // Pattern :: | |
3563 // Disjunction | |
3564 RegExpTree* RegExpParser::ParsePattern(bool* ok) { | |
3565 RegExpTree* result = ParseDisjunction(CHECK_OK); | |
3566 if (has_more()) { | |
3567 ReportError(CStrVector("Unmatched ')'"), CHECK_OK); | |
3568 } | |
3569 return result; | |
3570 } | |
3571 | |
3572 | |
3573 bool RegExpParser::CaptureAvailable(int index) { | |
3574 if (captures_ == NULL) return false; | |
3575 if (index >= captures_->length()) return false; | |
3576 RegExpCapture* capture = captures_->at(index); | |
3577 return capture != NULL && capture->available() == CAPTURE_AVAILABLE; | |
3578 } | |
3579 | |
3580 | |
3581 // Disjunction :: | |
3582 // Alternative | |
3583 // Alternative | Disjunction | |
3584 // Alternative :: | |
3585 // [empty] | |
3586 // Term Alternative | |
3587 // Term :: | |
3588 // Assertion | |
3589 // Atom | |
3590 // Atom Quantifier | |
3591 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { | |
3592 RegExpBuilder builder; | |
3593 int capture_start_index = captures_started(); | |
3594 while (true) { | |
3595 switch (current()) { | |
3596 case kEndMarker: | |
3597 case ')': | |
3598 return builder.ToRegExp(); | |
3599 case '|': { | |
3600 Advance(); | |
3601 builder.NewAlternative(); | |
3602 int capture_new_alt_start_index = captures_started(); | |
3603 for (int i = capture_start_index; i < capture_new_alt_start_index; i++) { | |
3604 RegExpCapture* capture = captures_->at(i); | |
3605 if (capture->available() == CAPTURE_AVAILABLE) { | |
3606 capture->set_available(CAPTURE_UNREACHABLE); | |
3607 } | |
3608 } | |
3609 capture_start_index = capture_new_alt_start_index; | |
3610 continue; | |
3611 } | |
3612 case '*': | |
3613 case '+': | |
3614 case '?': | |
3615 ReportError(CStrVector("Nothing to repeat"), CHECK_OK); | |
3616 case '^': { | |
3617 Advance(); | |
3618 RegExpAssertion::Type type = | |
3619 multiline_mode_ ? RegExpAssertion::START_OF_LINE : | |
3620 RegExpAssertion::START_OF_INPUT; | |
3621 builder.AddAssertion(new RegExpAssertion(type)); | |
3622 continue; | |
3623 } | |
3624 case '$': { | |
3625 Advance(); | |
3626 RegExpAssertion::Type type = | |
3627 multiline_mode_ ? RegExpAssertion::END_OF_LINE : | |
3628 RegExpAssertion::END_OF_INPUT; | |
3629 builder.AddAssertion(new RegExpAssertion(type)); | |
3630 continue; | |
3631 } | |
3632 case '.': { | |
3633 Advance(); | |
3634 // everything except \x0a, \x0d, \u2028 and \u2029 | |
3635 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); | |
3636 CharacterRange::AddClassEscape('.', ranges); | |
3637 RegExpTree* atom = new RegExpCharacterClass(ranges, false); | |
3638 builder.AddAtom(atom); | |
3639 break; | |
3640 } | |
3641 case '(': { | |
3642 RegExpTree* atom = ParseGroup(CHECK_OK); | |
3643 builder.AddAtom(atom); | |
3644 break; | |
3645 } | |
3646 case '[': { | |
3647 RegExpTree* atom = ParseCharacterClass(CHECK_OK); | |
3648 builder.AddAtom(atom); | |
3649 break; | |
3650 } | |
3651 // Atom :: | |
3652 // \ AtomEscape | |
3653 case '\\': | |
3654 switch (Next()) { | |
3655 case kEndMarker: | |
3656 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK); | |
3657 case 'b': | |
3658 Advance(2); | |
3659 builder.AddAssertion( | |
3660 new RegExpAssertion(RegExpAssertion::BOUNDARY)); | |
3661 continue; | |
3662 case 'B': | |
3663 Advance(2); | |
3664 builder.AddAssertion( | |
3665 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); | |
3666 continue; | |
3667 // AtomEscape :: | |
3668 // CharacterClassEscape | |
3669 // | |
3670 // CharacterClassEscape :: one of | |
3671 // d D s S w W | |
3672 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { | |
3673 uc32 c = Next(); | |
3674 Advance(2); | |
3675 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); | |
3676 CharacterRange::AddClassEscape(c, ranges); | |
3677 RegExpTree* atom = new RegExpCharacterClass(ranges, false); | |
3678 builder.AddAtom(atom); | |
3679 goto has_read_atom; // Avoid setting has_character_escapes_. | |
3680 } | |
3681 case '1': case '2': case '3': case '4': case '5': case '6': | |
3682 case '7': case '8': case '9': { | |
3683 int index = 0; | |
3684 if (ParseBackReferenceIndex(&index)) { | |
3685 if (!CaptureAvailable(index - 1)) { | |
3686 // Prepare to ignore a following quantifier | |
3687 builder.AddEmpty(); | |
3688 goto has_read_atom; | |
3689 } | |
3690 RegExpCapture* capture = captures_->at(index - 1); | |
3691 RegExpTree* atom = new RegExpBackReference(capture); | |
3692 builder.AddAtom(atom); | |
3693 goto has_read_atom; // Avoid setting has_character_escapes_. | |
3694 } | |
3695 uc32 first_digit = Next(); | |
3696 if (first_digit == '8' || first_digit == '9') { | |
3697 // Treat as identity escape | |
3698 builder.AddCharacter(first_digit); | |
3699 Advance(2); | |
3700 break; | |
3701 } | |
3702 } | |
3703 // FALLTHROUGH | |
3704 case '0': { | |
3705 Advance(); | |
3706 uc32 octal = ParseOctalLiteral(); | |
3707 builder.AddCharacter(octal); | |
3708 break; | |
3709 } | |
3710 // ControlEscape :: one of | |
3711 // f n r t v | |
3712 case 'f': | |
3713 Advance(2); | |
3714 builder.AddCharacter('\f'); | |
3715 break; | |
3716 case 'n': | |
3717 Advance(2); | |
3718 builder.AddCharacter('\n'); | |
3719 break; | |
3720 case 'r': | |
3721 Advance(2); | |
3722 builder.AddCharacter('\r'); | |
3723 break; | |
3724 case 't': | |
3725 Advance(2); | |
3726 builder.AddCharacter('\t'); | |
3727 break; | |
3728 case 'v': | |
3729 Advance(2); | |
3730 builder.AddCharacter('\v'); | |
3731 break; | |
3732 case 'c': { | |
3733 Advance(2); | |
3734 uc32 control = ParseControlLetterEscape(ok); | |
3735 builder.AddCharacter(control); | |
3736 break; | |
3737 } | |
3738 case 'x': { | |
3739 Advance(2); | |
3740 uc32 value; | |
3741 if (ParseHexEscape(2, &value)) { | |
3742 builder.AddCharacter(value); | |
3743 } else { | |
3744 builder.AddCharacter('x'); | |
3745 } | |
3746 break; | |
3747 } | |
3748 case 'u': { | |
3749 Advance(2); | |
3750 uc32 value; | |
3751 if (ParseHexEscape(4, &value)) { | |
3752 builder.AddCharacter(value); | |
3753 } else { | |
3754 builder.AddCharacter('u'); | |
3755 } | |
3756 break; | |
3757 } | |
3758 default: | |
3759 // Identity escape. | |
3760 builder.AddCharacter(Next()); | |
3761 Advance(2); | |
3762 break; | |
3763 } | |
3764 has_character_escapes_ = true; | |
3765 break; | |
3766 case '{': { | |
3767 int dummy; | |
3768 if (ParseIntervalQuantifier(&dummy, &dummy)) { | |
3769 ReportError(CStrVector("Nothing to repeat"), CHECK_OK); | |
3770 } | |
3771 // fallthrough | |
3772 } | |
3773 default: | |
3774 builder.AddCharacter(current()); | |
3775 Advance(); | |
3776 break; | |
3777 } // end switch(current()) | |
3778 | |
3779 has_read_atom: | |
3780 int min; | |
3781 int max; | |
3782 switch (current()) { | |
3783 // QuantifierPrefix :: | |
3784 // * | |
3785 // + | |
3786 // ? | |
3787 // { | |
3788 case '*': | |
3789 min = 0; | |
3790 max = RegExpQuantifier::kInfinity; | |
3791 Advance(); | |
3792 break; | |
3793 case '+': | |
3794 min = 1; | |
3795 max = RegExpQuantifier::kInfinity; | |
3796 Advance(); | |
3797 break; | |
3798 case '?': | |
3799 min = 0; | |
3800 max = 1; | |
3801 Advance(); | |
3802 break; | |
3803 case '{': | |
3804 if (ParseIntervalQuantifier(&min, &max)) { | |
3805 break; | |
3806 } else { | |
3807 continue; | |
3808 } | |
3809 default: | |
3810 continue; | |
3811 } | |
3812 bool is_greedy = true; | |
3813 if (current() == '?') { | |
3814 is_greedy = false; | |
3815 Advance(); | |
3816 } | |
3817 builder.AddQuantifierToAtom(min, max, is_greedy); | |
3818 } | |
3819 } | |
3820 | |
3821 class SourceCharacter { | |
3822 public: | |
3823 static bool Is(uc32 c) { | |
3824 switch (c) { | |
3825 // case ']': case '}': | |
3826 // In spidermonkey and jsc these are treated as source characters | |
3827 // so we do too. | |
3828 case '^': case '$': case '\\': case '.': case '*': case '+': | |
3829 case '?': case '(': case ')': case '[': case '{': case '|': | |
3830 case RegExpParser::kEndMarker: | |
3831 return false; | |
3832 default: | |
3833 return true; | |
3834 } | |
3835 } | |
3836 }; | |
3837 | |
3838 | |
3839 static unibrow::Predicate<SourceCharacter> source_character; | |
3840 | |
3841 | |
3842 static inline bool IsSourceCharacter(uc32 c) { | |
3843 return source_character.get(c); | |
3844 } | |
3845 | |
3846 #ifdef DEBUG | |
3847 // Currently only used in an ASSERT. | |
3848 static bool IsSpecialClassEscape(uc32 c) { | |
3849 switch (c) { | |
3850 case 'd': case 'D': | |
3851 case 's': case 'S': | |
3852 case 'w': case 'W': | |
3853 return true; | |
3854 default: | |
3855 return false; | |
3856 } | |
3857 } | |
3858 #endif | |
3859 | |
3860 | |
3861 // In order to know whether an escape is a backreference or not we have to scan | |
3862 // the entire regexp and find the number of capturing parentheses. However we | |
3863 // don't want to scan the regexp twice unless it is necessary. This mini-parser | |
3864 // is called when needed. It can see the difference between capturing and | |
3865 // noncapturing parentheses and can skip character classes and backslash-escaped | |
3866 // characters. | |
3867 void RegExpParser::ScanForCaptures() { | |
3868 int n; | |
3869 while ((n = current()) != kEndMarker) { | |
3870 Advance(); | |
3871 switch (n) { | |
3872 case '\\': | |
3873 Advance(); | |
3874 break; | |
3875 case '[': { | |
3876 int c; | |
3877 while ((c = current()) != kEndMarker) { | |
3878 Advance(); | |
3879 if (c == '\\') { | |
3880 Advance(); | |
3881 } else { | |
3882 if (c == ']') break; | |
3883 } | |
3884 } | |
3885 break; | |
3886 } | |
3887 case '(': | |
3888 if (current() != '?') capture_count_++; | |
3889 break; | |
3890 } | |
3891 } | |
3892 is_scanned_for_captures_ = true; | |
3893 } | |
3894 | |
3895 | |
3896 bool RegExpParser::ParseBackReferenceIndex(int* index_out) { | |
3897 ASSERT_EQ('\\', current()); | |
3898 ASSERT('1' <= Next() && Next() <= '9'); | |
3899 // Try to parse a decimal literal that is no greater than the number | |
3900 // of previously encountered left capturing parentheses. | |
3901 // This is a not according the the ECMAScript specification. According to | |
3902 // that, one must accept values up to the total number of left capturing | |
3903 // parentheses in the entire input, even if they are meaningless. | |
3904 if (!is_scanned_for_captures_) { | |
3905 int saved_position = position(); | |
3906 ScanForCaptures(); | |
3907 Reset(saved_position); | |
3908 } | |
3909 if (capture_count_ == 0) return false; | |
3910 int start = position(); | |
3911 int value = Next() - '0'; | |
3912 if (value > capture_count_) return false; | |
3913 Advance(2); | |
3914 while (true) { | |
3915 uc32 c = current(); | |
3916 if (IsDecimalDigit(c)) { | |
3917 value = 10 * value + (c - '0'); | |
3918 if (value > capture_count_) { | |
3919 Reset(start); | |
3920 return false; | |
3921 } | |
3922 Advance(); | |
3923 } else { | |
3924 break; | |
3925 } | |
3926 } | |
3927 *index_out = value; | |
3928 return true; | |
3929 } | |
3930 | |
3931 | |
3932 // QuantifierPrefix :: | |
3933 // { DecimalDigits } | |
3934 // { DecimalDigits , } | |
3935 // { DecimalDigits , DecimalDigits } | |
3936 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { | |
3937 ASSERT_EQ(current(), '{'); | |
3938 int start = position(); | |
3939 Advance(); | |
3940 int min = 0; | |
3941 if (!IsDecimalDigit(current())) { | |
3942 Reset(start); | |
3943 return false; | |
3944 } | |
3945 while (IsDecimalDigit(current())) { | |
3946 min = 10 * min + (current() - '0'); | |
3947 Advance(); | |
3948 } | |
3949 int max = 0; | |
3950 if (current() == '}') { | |
3951 max = min; | |
3952 Advance(); | |
3953 } else if (current() == ',') { | |
3954 Advance(); | |
3955 if (current() == '}') { | |
3956 max = RegExpQuantifier::kInfinity; | |
3957 Advance(); | |
3958 } else { | |
3959 while (IsDecimalDigit(current())) { | |
3960 max = 10 * max + (current() - '0'); | |
3961 Advance(); | |
3962 } | |
3963 if (current() != '}') { | |
3964 Reset(start); | |
3965 return false; | |
3966 } | |
3967 Advance(); | |
3968 } | |
3969 } else { | |
3970 Reset(start); | |
3971 return false; | |
3972 } | |
3973 *min_out = min; | |
3974 *max_out = max; | |
3975 return true; | |
3976 } | |
3977 | |
3978 | |
3979 // Upper and lower case letters differ by one bit. | |
3980 STATIC_CHECK(('a' ^ 'A') == 0x20); | |
3981 | |
3982 uc32 RegExpParser::ParseControlLetterEscape(bool* ok) { | |
3983 if (!has_more()) { | |
3984 ReportError(CStrVector("\\c at end of pattern"), ok); | |
3985 return '\0'; | |
3986 } | |
3987 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters. | |
3988 if (letter < 'A' || 'Z' < letter) { | |
3989 // Non-spec error-correction: "\c" followed by non-control letter is | |
3990 // interpreted as an IdentityEscape of 'c'. | |
3991 return 'c'; | |
3992 } | |
3993 Advance(); | |
3994 return letter & 0x1f; // Remainder modulo 32, per specification. | |
3995 } | |
3996 | |
3997 | |
3998 uc32 RegExpParser::ParseOctalLiteral() { | |
3999 ASSERT('0' <= current() && current() <= '7'); | |
4000 // For compatibility with some other browsers (not all), we parse | |
4001 // up to three octal digits with a value below 256. | |
4002 uc32 value = current() - '0'; | |
4003 Advance(); | |
4004 if ('0' <= current() && current() <= '7') { | |
4005 value = value * 8 + current() - '0'; | |
4006 Advance(); | |
4007 if (value < 32 && '0' <= current() && current() <= '7') { | |
4008 value = value * 8 + current() - '0'; | |
4009 Advance(); | |
4010 } | |
4011 } | |
4012 return value; | |
4013 } | |
4014 | |
4015 | |
4016 bool RegExpParser::ParseHexEscape(int length, uc32 *value) { | |
4017 int start = position(); | |
4018 uc32 val = 0; | |
4019 bool done = false; | |
4020 for (int i = 0; !done; i++) { | |
4021 uc32 c = current(); | |
4022 int d = HexValue(c); | |
4023 if (d < 0) { | |
4024 Reset(start); | |
4025 return false; | |
4026 } | |
4027 val = val * 16 + d; | |
4028 Advance(); | |
4029 if (i == length - 1) { | |
4030 done = true; | |
4031 } | |
4032 } | |
4033 *value = val; | |
4034 return true; | |
4035 } | |
4036 | |
4037 | |
4038 uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) { | |
4039 ASSERT(current() == '\\'); | |
4040 ASSERT(has_next() && !IsSpecialClassEscape(Next())); | |
4041 Advance(); | |
4042 switch (current()) { | |
4043 case 'b': | |
4044 Advance(); | |
4045 return '\b'; | |
4046 // ControlEscape :: one of | |
4047 // f n r t v | |
4048 case 'f': | |
4049 Advance(); | |
4050 return '\f'; | |
4051 case 'n': | |
4052 Advance(); | |
4053 return '\n'; | |
4054 case 'r': | |
4055 Advance(); | |
4056 return '\r'; | |
4057 case 't': | |
4058 Advance(); | |
4059 return '\t'; | |
4060 case 'v': | |
4061 Advance(); | |
4062 return '\v'; | |
4063 case 'c': | |
4064 return ParseControlLetterEscape(ok); | |
4065 case '0': case '1': case '2': case '3': case '4': case '5': | |
4066 case '6': case '7': | |
4067 // For compatibility, we interpret a decimal escape that isn't | |
4068 // a back reference (and therefore either \0 or not valid according | |
4069 // to the specification) as a 1..3 digit octal character code. | |
4070 return ParseOctalLiteral(); | |
4071 case 'x': { | |
4072 Advance(); | |
4073 uc32 value; | |
4074 if (ParseHexEscape(2, &value)) { | |
4075 return value; | |
4076 } | |
4077 // If \x is not followed by a two-digit hexadecimal, treat it | |
4078 // as an identity escape. | |
4079 return 'x'; | |
4080 } | |
4081 case 'u': { | |
4082 Advance(); | |
4083 uc32 value; | |
4084 if (ParseHexEscape(4, &value)) { | |
4085 return value; | |
4086 } | |
4087 // If \u is not followed by a four-digit hexadecimal, treat it | |
4088 // as an identity escape. | |
4089 return 'u'; | |
4090 } | |
4091 default: { | |
4092 // Extended identity escape. We accept any character that hasn't | |
4093 // been matched by a more specific case, not just the subset required | |
4094 // by the ECMAScript specification. | |
4095 uc32 result = current(); | |
4096 Advance(); | |
4097 return result; | |
4098 } | |
4099 } | |
4100 return 0; | |
4101 } | |
4102 | |
4103 | |
4104 RegExpTree* RegExpParser::ParseGroup(bool* ok) { | |
4105 ASSERT_EQ(current(), '('); | |
4106 char type = '('; | |
4107 Advance(); | |
4108 if (current() == '?') { | |
4109 switch (Next()) { | |
4110 case ':': case '=': case '!': | |
4111 type = Next(); | |
4112 Advance(2); | |
4113 break; | |
4114 default: | |
4115 ReportError(CStrVector("Invalid group"), CHECK_OK); | |
4116 break; | |
4117 } | |
4118 } else { | |
4119 if (captures_ == NULL) { | |
4120 captures_ = new ZoneList<RegExpCapture*>(2); | |
4121 } | |
4122 captures_->Add(NULL); | |
4123 if (!is_scanned_for_captures_) capture_count_++; | |
4124 } | |
4125 int capture_index = captures_started(); | |
4126 RegExpTree* body = ParseDisjunction(CHECK_OK); | |
4127 if (current() != ')') { | |
4128 ReportError(CStrVector("Unterminated group"), CHECK_OK); | |
4129 } | |
4130 Advance(); | |
4131 | |
4132 int end_capture_index = captures_started(); | |
4133 if (type == '!') { | |
4134 // Captures inside a negative lookahead are never available outside it. | |
4135 for (int i = capture_index; i < end_capture_index; i++) { | |
4136 RegExpCapture* capture = captures_->at(i); | |
4137 ASSERT(capture != NULL); | |
4138 capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE); | |
4139 } | |
4140 } else { | |
4141 // Captures temporarily unavailable because they are in different | |
4142 // alternatives are all available after the disjunction. | |
4143 for (int i = capture_index; i < end_capture_index; i++) { | |
4144 RegExpCapture* capture = captures_->at(i); | |
4145 ASSERT(capture != NULL); | |
4146 if (capture->available() == CAPTURE_UNREACHABLE) { | |
4147 capture->set_available(CAPTURE_AVAILABLE); | |
4148 } | |
4149 } | |
4150 } | |
4151 | |
4152 if (type == '(') { | |
4153 RegExpCapture* capture = new RegExpCapture(body, capture_index); | |
4154 captures_->at(capture_index - 1) = capture; | |
4155 return capture; | |
4156 } else if (type == ':') { | |
4157 return body; | |
4158 } else { | |
4159 ASSERT(type == '=' || type == '!'); | |
4160 bool is_positive = (type == '='); | |
4161 return new RegExpLookahead(body, is_positive); | |
4162 } | |
4163 } | |
4164 | |
4165 | |
4166 CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class, | |
4167 ZoneList<CharacterRange>* ranges, | |
4168 bool* ok) { | |
4169 ASSERT_EQ(false, *is_char_class); | |
4170 uc32 first = current(); | |
4171 if (first == '\\') { | |
4172 switch (Next()) { | |
4173 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { | |
4174 *is_char_class = true; | |
4175 uc32 c = Next(); | |
4176 CharacterRange::AddClassEscape(c, ranges); | |
4177 Advance(2); | |
4178 return NULL; | |
4179 } | |
4180 default: | |
4181 uc32 c = ParseClassCharacterEscape(CHECK_OK); | |
4182 return CharacterRange::Singleton(c); | |
4183 } | |
4184 } else { | |
4185 Advance(); | |
4186 return CharacterRange::Singleton(first); | |
4187 } | |
4188 } | |
4189 | |
4190 | |
4191 RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) { | |
4192 static const char* kUnterminated = "Unterminated character class"; | |
4193 static const char* kIllegal = "Illegal character class"; | |
4194 static const char* kRangeOutOfOrder = "Range out of order in character class"; | |
4195 | |
4196 ASSERT_EQ(current(), '['); | |
4197 Advance(); | |
4198 bool is_negated = false; | |
4199 if (current() == '^') { | |
4200 is_negated = true; | |
4201 Advance(); | |
4202 } | |
4203 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); | |
4204 while (has_more() && current() != ']') { | |
4205 bool is_char_class = false; | |
4206 CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK); | |
4207 if (!is_char_class) { | |
4208 if (current() == '-') { | |
4209 Advance(); | |
4210 if (current() == kEndMarker) { | |
4211 // If we reach the end we break out of the loop and let the | |
4212 // following code report an error. | |
4213 break; | |
4214 } else if (current() == ']') { | |
4215 ranges->Add(first); | |
4216 ranges->Add(CharacterRange::Singleton('-')); | |
4217 break; | |
4218 } | |
4219 CharacterRange next = | |
4220 ParseClassAtom(&is_char_class, ranges, CHECK_OK); | |
4221 if (is_char_class) { | |
4222 return ReportError(CStrVector(kIllegal), CHECK_OK); | |
4223 } | |
4224 if (first.from() > next.to()) { | |
4225 return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK); | |
4226 } | |
4227 ranges->Add(CharacterRange::Range(first.from(), next.to())); | |
4228 } else { | |
4229 ranges->Add(first); | |
4230 } | |
4231 } | |
4232 } | |
4233 if (!has_more()) { | |
4234 return ReportError(CStrVector(kUnterminated), CHECK_OK); | |
4235 } | |
4236 Advance(); | |
4237 if (ranges->length() == 0) { | |
4238 ranges->Add(CharacterRange::Range(0, 0xffff)); | |
4239 is_negated = !is_negated; | |
4240 } | |
4241 return new RegExpCharacterClass(ranges, is_negated); | |
4242 } | |
4243 | |
4244 | |
4245 // ---------------------------------------------------------------------------- | |
3167 // The Parser interface. | 4246 // The Parser interface. |
3168 | 4247 |
3169 // MakeAST() is just a wrapper for the corresponding Parser calls | 4248 // MakeAST() is just a wrapper for the corresponding Parser calls |
3170 // so we don't have to expose the entire Parser class in the .h file. | 4249 // so we don't have to expose the entire Parser class in the .h file. |
3171 | 4250 |
3172 static bool always_allow_natives_syntax = false; | 4251 static bool always_allow_natives_syntax = false; |
3173 | 4252 |
3174 | 4253 |
3175 ParserMessage::~ParserMessage() { | 4254 ParserMessage::~ParserMessage() { |
3176 for (int i = 0; i < args().length(); i++) | 4255 for (int i = 0; i < args().length(); i++) |
(...skipping 27 matching lines...) Expand all Loading... | |
3204 PreParser parser(no_script, allow_natives_syntax, extension); | 4283 PreParser parser(no_script, allow_natives_syntax, extension); |
3205 if (!parser.PreParseProgram(stream)) return NULL; | 4284 if (!parser.PreParseProgram(stream)) return NULL; |
3206 // The list owns the backing store so we need to clone the vector. | 4285 // The list owns the backing store so we need to clone the vector. |
3207 // That way, the result will be exactly the right size rather than | 4286 // That way, the result will be exactly the right size rather than |
3208 // the expected 50% too large. | 4287 // the expected 50% too large. |
3209 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone(); | 4288 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone(); |
3210 return new ScriptDataImpl(store); | 4289 return new ScriptDataImpl(store); |
3211 } | 4290 } |
3212 | 4291 |
3213 | 4292 |
4293 bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) { | |
4294 ASSERT(result != NULL); | |
4295 // Get multiline flag somehow | |
Mads Ager (chromium)
2008/11/25 21:09:41
Is this a TODO or a comment on what the code is do
Erik Corry
2008/11/26 12:18:36
It's a TODO. Fixed.
| |
4296 RegExpParser parser(input, &result->error, false); | |
4297 bool ok = true; | |
4298 result->tree = parser.ParsePattern(&ok); | |
4299 if (!ok) { | |
4300 ASSERT(result->tree == NULL); | |
4301 ASSERT(!result->error.is_null()); | |
4302 } else { | |
4303 ASSERT(result->tree != NULL); | |
4304 ASSERT(result->error.is_null()); | |
4305 } | |
4306 if (ok) { | |
4307 result->has_character_escapes = parser.HasCharacterEscapes(); | |
4308 result->capture_count = parser.captures_started(); | |
4309 } | |
4310 return ok; | |
4311 } | |
4312 | |
4313 | |
3214 FunctionLiteral* MakeAST(bool compile_in_global_context, | 4314 FunctionLiteral* MakeAST(bool compile_in_global_context, |
3215 Handle<Script> script, | 4315 Handle<Script> script, |
3216 v8::Extension* extension, | 4316 v8::Extension* extension, |
3217 ScriptDataImpl* pre_data) { | 4317 ScriptDataImpl* pre_data) { |
3218 bool allow_natives_syntax = | 4318 bool allow_natives_syntax = |
3219 always_allow_natives_syntax || | 4319 always_allow_natives_syntax || |
3220 FLAG_allow_natives_syntax || | 4320 FLAG_allow_natives_syntax || |
3221 Bootstrapper::IsActive(); | 4321 Bootstrapper::IsActive(); |
3222 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data); | 4322 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data); |
3223 if (pre_data != NULL && pre_data->has_error()) { | 4323 if (pre_data != NULL && pre_data->has_error()) { |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3256 start_position, | 4356 start_position, |
3257 is_expression); | 4357 is_expression); |
3258 return result; | 4358 return result; |
3259 } | 4359 } |
3260 | 4360 |
3261 | 4361 |
3262 #undef NEW | 4362 #undef NEW |
3263 | 4363 |
3264 | 4364 |
3265 } } // namespace v8::internal | 4365 } } // namespace v8::internal |
OLD | NEW |