Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(330)

Side by Side Diff: src/parser.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 16 matching lines...) Expand all
27 27
28 #include "v8.h" 28 #include "v8.h"
29 29
30 #include "api.h" 30 #include "api.h"
31 #include "ast.h" 31 #include "ast.h"
32 #include "bootstrapper.h" 32 #include "bootstrapper.h"
33 #include "platform.h" 33 #include "platform.h"
34 #include "runtime.h" 34 #include "runtime.h"
35 #include "parser.h" 35 #include "parser.h"
36 #include "scopes.h" 36 #include "scopes.h"
37 #include "string-stream.h"
37 38
38 namespace v8 { namespace internal { 39 namespace v8 { namespace internal {
39 40
40 class ParserFactory; 41 class ParserFactory;
41 class ParserLog; 42 class ParserLog;
42 class TemporaryScope; 43 class TemporaryScope;
43 template <typename T> class ZoneListWrapper; 44 template <typename T> class ZoneListWrapper;
44 45
45 46
46 class Parser { 47 class Parser {
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after
220 Handle<String> type, 221 Handle<String> type,
221 Vector< Handle<Object> > arguments); 222 Vector< Handle<Object> > arguments);
222 223
223 friend class Target; 224 friend class Target;
224 friend class TargetScope; 225 friend class TargetScope;
225 friend class LexicalScope; 226 friend class LexicalScope;
226 friend class TemporaryScope; 227 friend class TemporaryScope;
227 }; 228 };
228 229
229 230
231 template <typename T, int initial_size>
232 class BufferedZoneList {
233 public:
234
235 BufferedZoneList() :
Mads Ager (chromium) 2008/11/25 21:09:41 All on one line.
236 list_(NULL), last_(NULL) {}
237
238 // Adds element at end of list. This element is buffered and can
239 // be read using last() or removed using RemoveLast until a new Add or until
240 // RemoveLast or GetList has been called.
241 void Add(T* value) {
242 if (last_ != NULL) {
243 if (list_ == NULL) {
244 list_ = new ZoneList<T*>(initial_size);
245 }
246 list_->Add(last_);
247 }
248 last_ = value;
249 }
250
251 T* last() {
252 ASSERT(last_ != NULL);
253 return last_;
254 }
255
256 T* RemoveLast() {
257 ASSERT(last_ != NULL);
258 T* result = last_;
259 if (list_ != NULL && list_->length() > 0)
260 last_ = list_->RemoveLast();
261 else
262 last_ = NULL;
263 return result;
264 }
265
266 T* Get(int i) {
267 ASSERT(0 <= i && i < length());
268 if (list_ == NULL) {
269 ASSERT_EQ(0, i);
270 return last_;
271 } else {
272 if (i == list_->length()) {
273 ASSERT(last_ != NULL);
274 return last_;
275 } else {
276 return list_->at(i);
277 }
278 }
279 }
280
281 void Clear() {
282 list_ = NULL;
283 last_ = NULL;
284 }
285
286 int length() {
287 int length = (list_ == NULL) ? 0 : list_->length();
288 return length + ((last_ == NULL) ? 0 : 1);
289 }
290
291 ZoneList<T*>* GetList() {
292 if (list_ == NULL) {
293 list_ = new ZoneList<T*>(initial_size);
294 }
295 if (last_ != NULL) {
296 list_->Add(last_);
297 last_ = NULL;
298 }
299 return list_;
300 }
301
302 private:
303 ZoneList<T*>* list_;
304 T* last_;
305 };
306
307 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
308 class RegExpBuilder {
309 public:
310 RegExpBuilder();
311 void AddCharacter(uc16 character);
312 // "Adds" an empty expression. Does nothing except consume a
313 // following quantifier
314 void AddEmpty();
315 void AddAtom(RegExpTree* tree);
316 void AddAssertion(RegExpTree* tree);
317 void NewAlternative(); // '|'
318 void AddQuantifierToAtom(int min, int max, bool is_greedy);
319 RegExpTree* ToRegExp();
320 private:
321 void FlushCharacters();
322 void FlushText();
323 void FlushTerms();
324 bool pending_empty_;
325 ZoneList<uc16>* characters_;
326 BufferedZoneList<RegExpTree, 2> terms_;
327 BufferedZoneList<RegExpTree, 2> text_;
328 BufferedZoneList<RegExpTree, 2> alternatives_;
329 #ifdef DEBUG
330 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
331 #define LAST(x) last_added_ = x;
332 #else
333 #define LAST(x)
334 #endif
335 };
336
337
338 RegExpBuilder::RegExpBuilder()
339 : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. More occurrences below.
340 #ifdef DEBUG
341 , last_added_(ADD_NONE)
342 #endif
343 {}
344
345
346 void RegExpBuilder::FlushCharacters() {
347 pending_empty_ = false;
348 if (characters_ != NULL) {
349 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
350 characters_ = NULL;
351 text_.Add(atom);
352 LAST(ADD_ATOM);
353 }
354 }
355
356
357 void RegExpBuilder::FlushText() {
358 FlushCharacters();
359 int num_text = text_.length();
360 if (num_text == 0) {
361 return;
362 } else if (num_text == 1) {
363 terms_.Add(text_.last());
364 } else {
365 RegExpText* text = new RegExpText();
366 for (int i = 0; i < num_text; i++)
367 text_.Get(i)->AppendToText(text);
368 terms_.Add(text);
369 }
370 text_.Clear();
371 }
372
373
374 void RegExpBuilder::AddCharacter(uc16 c) {
375 pending_empty_ = false;
376 if (characters_ == NULL) {
377 characters_ = new ZoneList<uc16>(4);
378 }
379 characters_->Add(c);
380 LAST(ADD_CHAR);
381 }
382
383
384 void RegExpBuilder::AddEmpty() {
385 pending_empty_ = true;
386 }
387
388
389 void RegExpBuilder::AddAtom(RegExpTree* term) {
390 if (term->IsEmpty()) {
391 AddEmpty();
392 return;
393 }
394 if (term->IsTextElement()) {
395 FlushCharacters();
396 text_.Add(term);
397 } else {
398 FlushText();
399 terms_.Add(term);
400 }
401 LAST(ADD_ATOM);
402 }
403
404
405 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
406 FlushText();
407 terms_.Add(assert);
408 LAST(ADD_ASSERT);
409 }
410
411
412 void RegExpBuilder::NewAlternative() {
413 FlushTerms();
414 }
415
416
417 void RegExpBuilder::FlushTerms() {
418 FlushText();
419 int num_terms = terms_.length();
420 RegExpTree* alternative;
421 if (num_terms == 0) {
422 alternative = RegExpEmpty::GetInstance();
423 } else if (num_terms == 1) {
424 alternative = terms_.last();
425 } else {
426 alternative = new RegExpAlternative(terms_.GetList());
427 }
428 alternatives_.Add(alternative);
429 terms_.Clear();
430 LAST(ADD_NONE);
431 }
432
433
434 RegExpTree* RegExpBuilder::ToRegExp() {
435 FlushTerms();
436 int num_alternatives = alternatives_.length();
437 if (num_alternatives == 0) {
438 return RegExpEmpty::GetInstance();
439 }
440 if (num_alternatives == 1) {
441 return alternatives_.last();
442 }
443 return new RegExpDisjunction(alternatives_.GetList());
444 }
445
446
447 void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
448 if (pending_empty_) {
449 pending_empty_ = false;
450 return;
451 }
452 RegExpTree* atom;
453 if (characters_ != NULL) {
454 ASSERT(last_added_ == ADD_CHAR);
455 // Last atom was character.
456 Vector<const uc16> char_vector = characters_->ToConstVector();
457 int num_chars = char_vector.length();
458 if (num_chars > 1) {
459 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
460 text_.Add(new RegExpAtom(prefix));
461 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
462 }
463 characters_ = NULL;
464 atom = new RegExpAtom(char_vector);
465 FlushText();
466 } else if (text_.length() > 0) {
467 ASSERT(last_added_ == ADD_ATOM);
468 atom = text_.RemoveLast();
469 FlushText();
470 } else if (terms_.length() > 0) {
471 ASSERT(last_added_ == ADD_ATOM);
472 atom = terms_.RemoveLast();
473 if (atom->IsLookahead() || atom->IsAssertion()) {
474 // Guaranteed not to match a non-empty string.
475 // Assertion as an atom can happen as, e.g., (?:\b)
476 LAST(ADD_TERM);
477 if (min == 0) {
478 return;
479 }
480 terms_.Add(atom);
481 return;
482 }
483 } else {
484 // Only call immediately after adding an atom or character!
485 UNREACHABLE();
486 return;
487 }
488 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));
489 LAST(ADD_TERM);
490 }
491
492
493 class RegExpParser {
494 public:
495 RegExpParser(FlatStringReader* in,
496 Handle<String>* error,
497 bool multiline_mode);
498 RegExpTree* ParsePattern(bool* ok);
499 RegExpTree* ParseDisjunction(bool* ok);
500 RegExpTree* ParseGroup(bool* ok);
501 RegExpTree* ParseCharacterClass(bool* ok);
502
503 // Parses a {...,...} quantifier and stores the range in the given
504 // out parameters.
505 bool ParseIntervalQuantifier(int* min_out, int* max_out);
506
507 // Parses and returns a single escaped character. The character
508 // must not be 'b' or 'B' since they are usually handle specially.
509 uc32 ParseClassCharacterEscape(bool* ok);
510
511 // Checks whether the following is a length-digit hexadecimal number,
512 // and sets the value if it is.
513 bool ParseHexEscape(int length, uc32* value);
514
515 uc32 ParseControlLetterEscape(bool* ok);
516 uc32 ParseOctalLiteral();
517
518 // Tries to parse the input as a back reference. If successful it
519 // stores the result in the output parameter and returns true. If
520 // it fails it will push back the characters read so the same characters
521 // can be reparsed.
522 bool ParseBackReferenceIndex(int* index_out);
523
524 CharacterRange ParseClassAtom(bool* is_char_class,
525 ZoneList<CharacterRange>* ranges,
526 bool* ok);
527 RegExpTree* ReportError(Vector<const char> message, bool* ok);
528 void Advance();
529 void Advance(int dist);
530 void Reset(int pos);
531
532 bool HasCharacterEscapes();
533
534 int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
535 int position() { return next_pos_ - 1; }
536
537 static const uc32 kEndMarker = (1 << 21);
538 private:
539 uc32 current() { return current_; }
540 bool has_more() { return has_more_; }
541 bool has_next() { return next_pos_ < in()->length(); }
542 uc32 Next();
543 FlatStringReader* in() { return in_; }
544 void ScanForCaptures();
545 bool CaptureAvailable(int index);
546 uc32 current_;
547 bool has_more_;
548 bool multiline_mode_;
549 int next_pos_;
550 FlatStringReader* in_;
551 Handle<String>* error_;
552 bool has_character_escapes_;
553 bool is_scanned_for_captures_;
554 ZoneList<RegExpCapture*>* captures_;
555 int capture_count_;
556 };
557
558
230 // A temporary scope stores information during parsing, just like 559 // A temporary scope stores information during parsing, just like
231 // a plain scope. However, temporary scopes are not kept around 560 // a plain scope. However, temporary scopes are not kept around
232 // after parsing or referenced by syntax trees so they can be stack- 561 // after parsing or referenced by syntax trees so they can be stack-
233 // allocated and hence used by the pre-parser. 562 // allocated and hence used by the pre-parser.
234 class TemporaryScope BASE_EMBEDDED { 563 class TemporaryScope BASE_EMBEDDED {
235 public: 564 public:
236 explicit TemporaryScope(Parser* parser); 565 explicit TemporaryScope(Parser* parser);
237 ~TemporaryScope(); 566 ~TemporaryScope();
238 567
239 int NextMaterializedLiteralIndex() { 568 int NextMaterializedLiteralIndex() {
(...skipping 2917 matching lines...) Expand 10 before | Expand all | Expand 10 after
3157 } 3486 }
3158 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); 3487 ZoneList<Expression*>* args = new ZoneList<Expression*>(2);
3159 args->Add(new Literal(type)); 3488 args->Add(new Literal(type));
3160 args->Add(new Literal(array)); 3489 args->Add(new Literal(array));
3161 return new Throw(new CallRuntime(constructor, NULL, args), 3490 return new Throw(new CallRuntime(constructor, NULL, args),
3162 scanner().location().beg_pos); 3491 scanner().location().beg_pos);
3163 } 3492 }
3164 3493
3165 3494
3166 // ---------------------------------------------------------------------------- 3495 // ----------------------------------------------------------------------------
3496 // Regular expressions
3497
3498
3499 RegExpParser::RegExpParser(FlatStringReader* in,
3500 Handle<String>* error,
3501 bool multiline_mode)
3502 : current_(kEndMarker),
3503 has_more_(true),
3504 multiline_mode_(multiline_mode),
3505 next_pos_(0),
3506 in_(in),
3507 error_(error),
3508 has_character_escapes_(false),
3509 is_scanned_for_captures_(false),
3510 captures_(NULL),
3511 capture_count_(0) {
3512 Advance(1);
3513 }
3514
3515
3516 uc32 RegExpParser::Next() {
3517 if (has_next()) {
3518 return in()->Get(next_pos_);
3519 } else {
3520 return kEndMarker;
3521 }
3522 }
3523
3524
3525 void RegExpParser::Advance() {
3526 if (next_pos_ < in()->length()) {
3527 current_ = in()->Get(next_pos_);
3528 next_pos_++;
3529 } else {
3530 current_ = kEndMarker;
3531 has_more_ = false;
3532 }
3533 }
3534
3535
3536 void RegExpParser::Reset(int pos) {
3537 next_pos_ = pos;
3538 Advance();
3539 }
3540
3541
3542 void RegExpParser::Advance(int dist) {
3543 for (int i = 0; i < dist; i++)
3544 Advance();
3545 }
3546
3547
3548 // Reports whether the parsed string atoms contain any characters that were
3549 // escaped in the original pattern. If not, all atoms are proper substrings
3550 // of the original pattern.
3551 bool RegExpParser::HasCharacterEscapes() {
3552 return has_character_escapes_;
3553 }
3554
3555 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
3556 *ok = false;
3557 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
3558 return NULL;
3559 }
3560
3561
3562 // Pattern ::
3563 // Disjunction
3564 RegExpTree* RegExpParser::ParsePattern(bool* ok) {
3565 RegExpTree* result = ParseDisjunction(CHECK_OK);
3566 if (has_more()) {
3567 ReportError(CStrVector("Unmatched ')'"), CHECK_OK);
3568 }
3569 return result;
3570 }
3571
3572
3573 bool RegExpParser::CaptureAvailable(int index) {
3574 if (captures_ == NULL) return false;
3575 if (index >= captures_->length()) return false;
3576 RegExpCapture* capture = captures_->at(index);
3577 return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
3578 }
3579
3580
3581 // Disjunction ::
3582 // Alternative
3583 // Alternative | Disjunction
3584 // Alternative ::
3585 // [empty]
3586 // Term Alternative
3587 // Term ::
3588 // Assertion
3589 // Atom
3590 // Atom Quantifier
3591 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
3592 RegExpBuilder builder;
3593 int capture_start_index = captures_started();
3594 while (true) {
3595 switch (current()) {
3596 case kEndMarker:
3597 case ')':
3598 return builder.ToRegExp();
3599 case '|': {
3600 Advance();
3601 builder.NewAlternative();
3602 int capture_new_alt_start_index = captures_started();
3603 for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {
3604 RegExpCapture* capture = captures_->at(i);
3605 if (capture->available() == CAPTURE_AVAILABLE) {
3606 capture->set_available(CAPTURE_UNREACHABLE);
3607 }
3608 }
3609 capture_start_index = capture_new_alt_start_index;
3610 continue;
3611 }
3612 case '*':
3613 case '+':
3614 case '?':
3615 ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
3616 case '^': {
3617 Advance();
3618 RegExpAssertion::Type type =
3619 multiline_mode_ ? RegExpAssertion::START_OF_LINE :
3620 RegExpAssertion::START_OF_INPUT;
3621 builder.AddAssertion(new RegExpAssertion(type));
3622 continue;
3623 }
3624 case '$': {
3625 Advance();
3626 RegExpAssertion::Type type =
3627 multiline_mode_ ? RegExpAssertion::END_OF_LINE :
3628 RegExpAssertion::END_OF_INPUT;
3629 builder.AddAssertion(new RegExpAssertion(type));
3630 continue;
3631 }
3632 case '.': {
3633 Advance();
3634 // everything except \x0a, \x0d, \u2028 and \u2029
3635 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3636 CharacterRange::AddClassEscape('.', ranges);
3637 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
3638 builder.AddAtom(atom);
3639 break;
3640 }
3641 case '(': {
3642 RegExpTree* atom = ParseGroup(CHECK_OK);
3643 builder.AddAtom(atom);
3644 break;
3645 }
3646 case '[': {
3647 RegExpTree* atom = ParseCharacterClass(CHECK_OK);
3648 builder.AddAtom(atom);
3649 break;
3650 }
3651 // Atom ::
3652 // \ AtomEscape
3653 case '\\':
3654 switch (Next()) {
3655 case kEndMarker:
3656 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);
3657 case 'b':
3658 Advance(2);
3659 builder.AddAssertion(
3660 new RegExpAssertion(RegExpAssertion::BOUNDARY));
3661 continue;
3662 case 'B':
3663 Advance(2);
3664 builder.AddAssertion(
3665 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
3666 continue;
3667 // AtomEscape ::
3668 // CharacterClassEscape
3669 //
3670 // CharacterClassEscape :: one of
3671 // d D s S w W
3672 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
3673 uc32 c = Next();
3674 Advance(2);
3675 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3676 CharacterRange::AddClassEscape(c, ranges);
3677 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
3678 builder.AddAtom(atom);
3679 goto has_read_atom; // Avoid setting has_character_escapes_.
3680 }
3681 case '1': case '2': case '3': case '4': case '5': case '6':
3682 case '7': case '8': case '9': {
3683 int index = 0;
3684 if (ParseBackReferenceIndex(&index)) {
3685 if (!CaptureAvailable(index - 1)) {
3686 // Prepare to ignore a following quantifier
3687 builder.AddEmpty();
3688 goto has_read_atom;
3689 }
3690 RegExpCapture* capture = captures_->at(index - 1);
3691 RegExpTree* atom = new RegExpBackReference(capture);
3692 builder.AddAtom(atom);
3693 goto has_read_atom; // Avoid setting has_character_escapes_.
3694 }
3695 uc32 first_digit = Next();
3696 if (first_digit == '8' || first_digit == '9') {
3697 // Treat as identity escape
3698 builder.AddCharacter(first_digit);
3699 Advance(2);
3700 break;
3701 }
3702 }
3703 // FALLTHROUGH
3704 case '0': {
3705 Advance();
3706 uc32 octal = ParseOctalLiteral();
3707 builder.AddCharacter(octal);
3708 break;
3709 }
3710 // ControlEscape :: one of
3711 // f n r t v
3712 case 'f':
3713 Advance(2);
3714 builder.AddCharacter('\f');
3715 break;
3716 case 'n':
3717 Advance(2);
3718 builder.AddCharacter('\n');
3719 break;
3720 case 'r':
3721 Advance(2);
3722 builder.AddCharacter('\r');
3723 break;
3724 case 't':
3725 Advance(2);
3726 builder.AddCharacter('\t');
3727 break;
3728 case 'v':
3729 Advance(2);
3730 builder.AddCharacter('\v');
3731 break;
3732 case 'c': {
3733 Advance(2);
3734 uc32 control = ParseControlLetterEscape(ok);
3735 builder.AddCharacter(control);
3736 break;
3737 }
3738 case 'x': {
3739 Advance(2);
3740 uc32 value;
3741 if (ParseHexEscape(2, &value)) {
3742 builder.AddCharacter(value);
3743 } else {
3744 builder.AddCharacter('x');
3745 }
3746 break;
3747 }
3748 case 'u': {
3749 Advance(2);
3750 uc32 value;
3751 if (ParseHexEscape(4, &value)) {
3752 builder.AddCharacter(value);
3753 } else {
3754 builder.AddCharacter('u');
3755 }
3756 break;
3757 }
3758 default:
3759 // Identity escape.
3760 builder.AddCharacter(Next());
3761 Advance(2);
3762 break;
3763 }
3764 has_character_escapes_ = true;
3765 break;
3766 case '{': {
3767 int dummy;
3768 if (ParseIntervalQuantifier(&dummy, &dummy)) {
3769 ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
3770 }
3771 // fallthrough
3772 }
3773 default:
3774 builder.AddCharacter(current());
3775 Advance();
3776 break;
3777 } // end switch(current())
3778
3779 has_read_atom:
3780 int min;
3781 int max;
3782 switch (current()) {
3783 // QuantifierPrefix ::
3784 // *
3785 // +
3786 // ?
3787 // {
3788 case '*':
3789 min = 0;
3790 max = RegExpQuantifier::kInfinity;
3791 Advance();
3792 break;
3793 case '+':
3794 min = 1;
3795 max = RegExpQuantifier::kInfinity;
3796 Advance();
3797 break;
3798 case '?':
3799 min = 0;
3800 max = 1;
3801 Advance();
3802 break;
3803 case '{':
3804 if (ParseIntervalQuantifier(&min, &max)) {
3805 break;
3806 } else {
3807 continue;
3808 }
3809 default:
3810 continue;
3811 }
3812 bool is_greedy = true;
3813 if (current() == '?') {
3814 is_greedy = false;
3815 Advance();
3816 }
3817 builder.AddQuantifierToAtom(min, max, is_greedy);
3818 }
3819 }
3820
3821 class SourceCharacter {
3822 public:
3823 static bool Is(uc32 c) {
3824 switch (c) {
3825 // case ']': case '}':
3826 // In spidermonkey and jsc these are treated as source characters
3827 // so we do too.
3828 case '^': case '$': case '\\': case '.': case '*': case '+':
3829 case '?': case '(': case ')': case '[': case '{': case '|':
3830 case RegExpParser::kEndMarker:
3831 return false;
3832 default:
3833 return true;
3834 }
3835 }
3836 };
3837
3838
3839 static unibrow::Predicate<SourceCharacter> source_character;
3840
3841
3842 static inline bool IsSourceCharacter(uc32 c) {
3843 return source_character.get(c);
3844 }
3845
3846 #ifdef DEBUG
3847 // Currently only used in an ASSERT.
3848 static bool IsSpecialClassEscape(uc32 c) {
3849 switch (c) {
3850 case 'd': case 'D':
3851 case 's': case 'S':
3852 case 'w': case 'W':
3853 return true;
3854 default:
3855 return false;
3856 }
3857 }
3858 #endif
3859
3860
3861 // In order to know whether an escape is a backreference or not we have to scan
3862 // the entire regexp and find the number of capturing parentheses. However we
3863 // don't want to scan the regexp twice unless it is necessary. This mini-parser
3864 // is called when needed. It can see the difference between capturing and
3865 // noncapturing parentheses and can skip character classes and backslash-escaped
3866 // characters.
3867 void RegExpParser::ScanForCaptures() {
3868 int n;
3869 while ((n = current()) != kEndMarker) {
3870 Advance();
3871 switch (n) {
3872 case '\\':
3873 Advance();
3874 break;
3875 case '[': {
3876 int c;
3877 while ((c = current()) != kEndMarker) {
3878 Advance();
3879 if (c == '\\') {
3880 Advance();
3881 } else {
3882 if (c == ']') break;
3883 }
3884 }
3885 break;
3886 }
3887 case '(':
3888 if (current() != '?') capture_count_++;
3889 break;
3890 }
3891 }
3892 is_scanned_for_captures_ = true;
3893 }
3894
3895
3896 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
3897 ASSERT_EQ('\\', current());
3898 ASSERT('1' <= Next() && Next() <= '9');
3899 // Try to parse a decimal literal that is no greater than the number
3900 // of previously encountered left capturing parentheses.
3901 // This is a not according the the ECMAScript specification. According to
3902 // that, one must accept values up to the total number of left capturing
3903 // parentheses in the entire input, even if they are meaningless.
3904 if (!is_scanned_for_captures_) {
3905 int saved_position = position();
3906 ScanForCaptures();
3907 Reset(saved_position);
3908 }
3909 if (capture_count_ == 0) return false;
3910 int start = position();
3911 int value = Next() - '0';
3912 if (value > capture_count_) return false;
3913 Advance(2);
3914 while (true) {
3915 uc32 c = current();
3916 if (IsDecimalDigit(c)) {
3917 value = 10 * value + (c - '0');
3918 if (value > capture_count_) {
3919 Reset(start);
3920 return false;
3921 }
3922 Advance();
3923 } else {
3924 break;
3925 }
3926 }
3927 *index_out = value;
3928 return true;
3929 }
3930
3931
3932 // QuantifierPrefix ::
3933 // { DecimalDigits }
3934 // { DecimalDigits , }
3935 // { DecimalDigits , DecimalDigits }
3936 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
3937 ASSERT_EQ(current(), '{');
3938 int start = position();
3939 Advance();
3940 int min = 0;
3941 if (!IsDecimalDigit(current())) {
3942 Reset(start);
3943 return false;
3944 }
3945 while (IsDecimalDigit(current())) {
3946 min = 10 * min + (current() - '0');
3947 Advance();
3948 }
3949 int max = 0;
3950 if (current() == '}') {
3951 max = min;
3952 Advance();
3953 } else if (current() == ',') {
3954 Advance();
3955 if (current() == '}') {
3956 max = RegExpQuantifier::kInfinity;
3957 Advance();
3958 } else {
3959 while (IsDecimalDigit(current())) {
3960 max = 10 * max + (current() - '0');
3961 Advance();
3962 }
3963 if (current() != '}') {
3964 Reset(start);
3965 return false;
3966 }
3967 Advance();
3968 }
3969 } else {
3970 Reset(start);
3971 return false;
3972 }
3973 *min_out = min;
3974 *max_out = max;
3975 return true;
3976 }
3977
3978
3979 // Upper and lower case letters differ by one bit.
3980 STATIC_CHECK(('a' ^ 'A') == 0x20);
3981
3982 uc32 RegExpParser::ParseControlLetterEscape(bool* ok) {
3983 if (!has_more()) {
3984 ReportError(CStrVector("\\c at end of pattern"), ok);
3985 return '\0';
3986 }
3987 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
3988 if (letter < 'A' || 'Z' < letter) {
3989 // Non-spec error-correction: "\c" followed by non-control letter is
3990 // interpreted as an IdentityEscape of 'c'.
3991 return 'c';
3992 }
3993 Advance();
3994 return letter & 0x1f; // Remainder modulo 32, per specification.
3995 }
3996
3997
3998 uc32 RegExpParser::ParseOctalLiteral() {
3999 ASSERT('0' <= current() && current() <= '7');
4000 // For compatibility with some other browsers (not all), we parse
4001 // up to three octal digits with a value below 256.
4002 uc32 value = current() - '0';
4003 Advance();
4004 if ('0' <= current() && current() <= '7') {
4005 value = value * 8 + current() - '0';
4006 Advance();
4007 if (value < 32 && '0' <= current() && current() <= '7') {
4008 value = value * 8 + current() - '0';
4009 Advance();
4010 }
4011 }
4012 return value;
4013 }
4014
4015
4016 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
4017 int start = position();
4018 uc32 val = 0;
4019 bool done = false;
4020 for (int i = 0; !done; i++) {
4021 uc32 c = current();
4022 int d = HexValue(c);
4023 if (d < 0) {
4024 Reset(start);
4025 return false;
4026 }
4027 val = val * 16 + d;
4028 Advance();
4029 if (i == length - 1) {
4030 done = true;
4031 }
4032 }
4033 *value = val;
4034 return true;
4035 }
4036
4037
4038 uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) {
4039 ASSERT(current() == '\\');
4040 ASSERT(has_next() && !IsSpecialClassEscape(Next()));
4041 Advance();
4042 switch (current()) {
4043 case 'b':
4044 Advance();
4045 return '\b';
4046 // ControlEscape :: one of
4047 // f n r t v
4048 case 'f':
4049 Advance();
4050 return '\f';
4051 case 'n':
4052 Advance();
4053 return '\n';
4054 case 'r':
4055 Advance();
4056 return '\r';
4057 case 't':
4058 Advance();
4059 return '\t';
4060 case 'v':
4061 Advance();
4062 return '\v';
4063 case 'c':
4064 return ParseControlLetterEscape(ok);
4065 case '0': case '1': case '2': case '3': case '4': case '5':
4066 case '6': case '7':
4067 // For compatibility, we interpret a decimal escape that isn't
4068 // a back reference (and therefore either \0 or not valid according
4069 // to the specification) as a 1..3 digit octal character code.
4070 return ParseOctalLiteral();
4071 case 'x': {
4072 Advance();
4073 uc32 value;
4074 if (ParseHexEscape(2, &value)) {
4075 return value;
4076 }
4077 // If \x is not followed by a two-digit hexadecimal, treat it
4078 // as an identity escape.
4079 return 'x';
4080 }
4081 case 'u': {
4082 Advance();
4083 uc32 value;
4084 if (ParseHexEscape(4, &value)) {
4085 return value;
4086 }
4087 // If \u is not followed by a four-digit hexadecimal, treat it
4088 // as an identity escape.
4089 return 'u';
4090 }
4091 default: {
4092 // Extended identity escape. We accept any character that hasn't
4093 // been matched by a more specific case, not just the subset required
4094 // by the ECMAScript specification.
4095 uc32 result = current();
4096 Advance();
4097 return result;
4098 }
4099 }
4100 return 0;
4101 }
4102
4103
4104 RegExpTree* RegExpParser::ParseGroup(bool* ok) {
4105 ASSERT_EQ(current(), '(');
4106 char type = '(';
4107 Advance();
4108 if (current() == '?') {
4109 switch (Next()) {
4110 case ':': case '=': case '!':
4111 type = Next();
4112 Advance(2);
4113 break;
4114 default:
4115 ReportError(CStrVector("Invalid group"), CHECK_OK);
4116 break;
4117 }
4118 } else {
4119 if (captures_ == NULL) {
4120 captures_ = new ZoneList<RegExpCapture*>(2);
4121 }
4122 captures_->Add(NULL);
4123 if (!is_scanned_for_captures_) capture_count_++;
4124 }
4125 int capture_index = captures_started();
4126 RegExpTree* body = ParseDisjunction(CHECK_OK);
4127 if (current() != ')') {
4128 ReportError(CStrVector("Unterminated group"), CHECK_OK);
4129 }
4130 Advance();
4131
4132 int end_capture_index = captures_started();
4133 if (type == '!') {
4134 // Captures inside a negative lookahead are never available outside it.
4135 for (int i = capture_index; i < end_capture_index; i++) {
4136 RegExpCapture* capture = captures_->at(i);
4137 ASSERT(capture != NULL);
4138 capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
4139 }
4140 } else {
4141 // Captures temporarily unavailable because they are in different
4142 // alternatives are all available after the disjunction.
4143 for (int i = capture_index; i < end_capture_index; i++) {
4144 RegExpCapture* capture = captures_->at(i);
4145 ASSERT(capture != NULL);
4146 if (capture->available() == CAPTURE_UNREACHABLE) {
4147 capture->set_available(CAPTURE_AVAILABLE);
4148 }
4149 }
4150 }
4151
4152 if (type == '(') {
4153 RegExpCapture* capture = new RegExpCapture(body, capture_index);
4154 captures_->at(capture_index - 1) = capture;
4155 return capture;
4156 } else if (type == ':') {
4157 return body;
4158 } else {
4159 ASSERT(type == '=' || type == '!');
4160 bool is_positive = (type == '=');
4161 return new RegExpLookahead(body, is_positive);
4162 }
4163 }
4164
4165
4166 CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class,
4167 ZoneList<CharacterRange>* ranges,
4168 bool* ok) {
4169 ASSERT_EQ(false, *is_char_class);
4170 uc32 first = current();
4171 if (first == '\\') {
4172 switch (Next()) {
4173 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
4174 *is_char_class = true;
4175 uc32 c = Next();
4176 CharacterRange::AddClassEscape(c, ranges);
4177 Advance(2);
4178 return NULL;
4179 }
4180 default:
4181 uc32 c = ParseClassCharacterEscape(CHECK_OK);
4182 return CharacterRange::Singleton(c);
4183 }
4184 } else {
4185 Advance();
4186 return CharacterRange::Singleton(first);
4187 }
4188 }
4189
4190
4191 RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {
4192 static const char* kUnterminated = "Unterminated character class";
4193 static const char* kIllegal = "Illegal character class";
4194 static const char* kRangeOutOfOrder = "Range out of order in character class";
4195
4196 ASSERT_EQ(current(), '[');
4197 Advance();
4198 bool is_negated = false;
4199 if (current() == '^') {
4200 is_negated = true;
4201 Advance();
4202 }
4203 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
4204 while (has_more() && current() != ']') {
4205 bool is_char_class = false;
4206 CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK);
4207 if (!is_char_class) {
4208 if (current() == '-') {
4209 Advance();
4210 if (current() == kEndMarker) {
4211 // If we reach the end we break out of the loop and let the
4212 // following code report an error.
4213 break;
4214 } else if (current() == ']') {
4215 ranges->Add(first);
4216 ranges->Add(CharacterRange::Singleton('-'));
4217 break;
4218 }
4219 CharacterRange next =
4220 ParseClassAtom(&is_char_class, ranges, CHECK_OK);
4221 if (is_char_class) {
4222 return ReportError(CStrVector(kIllegal), CHECK_OK);
4223 }
4224 if (first.from() > next.to()) {
4225 return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK);
4226 }
4227 ranges->Add(CharacterRange::Range(first.from(), next.to()));
4228 } else {
4229 ranges->Add(first);
4230 }
4231 }
4232 }
4233 if (!has_more()) {
4234 return ReportError(CStrVector(kUnterminated), CHECK_OK);
4235 }
4236 Advance();
4237 if (ranges->length() == 0) {
4238 ranges->Add(CharacterRange::Range(0, 0xffff));
4239 is_negated = !is_negated;
4240 }
4241 return new RegExpCharacterClass(ranges, is_negated);
4242 }
4243
4244
4245 // ----------------------------------------------------------------------------
3167 // The Parser interface. 4246 // The Parser interface.
3168 4247
3169 // MakeAST() is just a wrapper for the corresponding Parser calls 4248 // MakeAST() is just a wrapper for the corresponding Parser calls
3170 // so we don't have to expose the entire Parser class in the .h file. 4249 // so we don't have to expose the entire Parser class in the .h file.
3171 4250
3172 static bool always_allow_natives_syntax = false; 4251 static bool always_allow_natives_syntax = false;
3173 4252
3174 4253
3175 ParserMessage::~ParserMessage() { 4254 ParserMessage::~ParserMessage() {
3176 for (int i = 0; i < args().length(); i++) 4255 for (int i = 0; i < args().length(); i++)
(...skipping 27 matching lines...) Expand all
3204 PreParser parser(no_script, allow_natives_syntax, extension); 4283 PreParser parser(no_script, allow_natives_syntax, extension);
3205 if (!parser.PreParseProgram(stream)) return NULL; 4284 if (!parser.PreParseProgram(stream)) return NULL;
3206 // The list owns the backing store so we need to clone the vector. 4285 // The list owns the backing store so we need to clone the vector.
3207 // That way, the result will be exactly the right size rather than 4286 // That way, the result will be exactly the right size rather than
3208 // the expected 50% too large. 4287 // the expected 50% too large.
3209 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone(); 4288 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone();
3210 return new ScriptDataImpl(store); 4289 return new ScriptDataImpl(store);
3211 } 4290 }
3212 4291
3213 4292
4293 bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) {
4294 ASSERT(result != NULL);
4295 // Get multiline flag somehow
Mads Ager (chromium) 2008/11/25 21:09:41 Is this a TODO or a comment on what the code is do
Erik Corry 2008/11/26 12:18:36 It's a TODO. Fixed.
4296 RegExpParser parser(input, &result->error, false);
4297 bool ok = true;
4298 result->tree = parser.ParsePattern(&ok);
4299 if (!ok) {
4300 ASSERT(result->tree == NULL);
4301 ASSERT(!result->error.is_null());
4302 } else {
4303 ASSERT(result->tree != NULL);
4304 ASSERT(result->error.is_null());
4305 }
4306 if (ok) {
4307 result->has_character_escapes = parser.HasCharacterEscapes();
4308 result->capture_count = parser.captures_started();
4309 }
4310 return ok;
4311 }
4312
4313
3214 FunctionLiteral* MakeAST(bool compile_in_global_context, 4314 FunctionLiteral* MakeAST(bool compile_in_global_context,
3215 Handle<Script> script, 4315 Handle<Script> script,
3216 v8::Extension* extension, 4316 v8::Extension* extension,
3217 ScriptDataImpl* pre_data) { 4317 ScriptDataImpl* pre_data) {
3218 bool allow_natives_syntax = 4318 bool allow_natives_syntax =
3219 always_allow_natives_syntax || 4319 always_allow_natives_syntax ||
3220 FLAG_allow_natives_syntax || 4320 FLAG_allow_natives_syntax ||
3221 Bootstrapper::IsActive(); 4321 Bootstrapper::IsActive();
3222 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data); 4322 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data);
3223 if (pre_data != NULL && pre_data->has_error()) { 4323 if (pre_data != NULL && pre_data->has_error()) {
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
3256 start_position, 4356 start_position,
3257 is_expression); 4357 is_expression);
3258 return result; 4358 return result;
3259 } 4359 }
3260 4360
3261 4361
3262 #undef NEW 4362 #undef NEW
3263 4363
3264 4364
3265 } } // namespace v8::internal 4365 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698