src/parser.cc - Issue 12427: Merge regexp2000 back into bleeding_edge

Side by Side Diff: src/parser.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 16 matching lines...) Expand all Loading...
27	27

28 #include "v8.h"	28 #include "v8.h"

29	29

30 #include "api.h"	30 #include "api.h"

31 #include "ast.h"	31 #include "ast.h"

32 #include "bootstrapper.h"	32 #include "bootstrapper.h"

33 #include "platform.h"	33 #include "platform.h"

34 #include "runtime.h"	34 #include "runtime.h"

35 #include "parser.h"	35 #include "parser.h"

36 #include "scopes.h"	36 #include "scopes.h"

	37 #include "string-stream.h"

37	38

38 namespace v8 { namespace internal {	39 namespace v8 { namespace internal {

39	40

40 class ParserFactory;	41 class ParserFactory;

41 class ParserLog;	42 class ParserLog;

42 class TemporaryScope;	43 class TemporaryScope;

43 template <typename T> class ZoneListWrapper;	44 template <typename T> class ZoneListWrapper;

44	45

45	46

46 class Parser {	47 class Parser {

(...skipping 173 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
220 Handle<String> type,	221 Handle<String> type,

221 Vector< Handle<Object> > arguments);	222 Vector< Handle<Object> > arguments);

222	223

223 friend class Target;	224 friend class Target;

224 friend class TargetScope;	225 friend class TargetScope;

225 friend class LexicalScope;	226 friend class LexicalScope;

226 friend class TemporaryScope;	227 friend class TemporaryScope;

227 };	228 };

228	229

229	230

	231 template <typename T, int initial_size>

	232 class BufferedZoneList {

	233 public:

	234

	235 BufferedZoneList() :
	Mads Ager (chromium) 2008/11/25 21:09:41 All on one line. All on one line.
	236 list_(NULL), last_(NULL) {}

	237

	238 // Adds element at end of list. This element is buffered and can

	239 // be read using last() or removed using RemoveLast until a new Add or until

	240 // RemoveLast or GetList has been called.

	241 void Add(T* value) {

	242 if (last_ != NULL) {

	243 if (list_ == NULL) {

	244 list_ = new ZoneList<T*>(initial_size);

	245 }

	246 list_->Add(last_);

	247 }

	248 last_ = value;

	249 }

	250

	251 T* last() {

	252 ASSERT(last_ != NULL);

	253 return last_;

	254 }

	255

	256 T* RemoveLast() {

	257 ASSERT(last_ != NULL);

	258 T* result = last_;

	259 if (list_ != NULL && list_->length() > 0)

	260 last_ = list_->RemoveLast();

	261 else

	262 last_ = NULL;

	263 return result;

	264 }

	265

	266 T* Get(int i) {

	267 ASSERT(0 <= i && i < length());

	268 if (list_ == NULL) {

	269 ASSERT_EQ(0, i);

	270 return last_;

	271 } else {

	272 if (i == list_->length()) {

	273 ASSERT(last_ != NULL);

	274 return last_;

	275 } else {

	276 return list_->at(i);

	277 }

	278 }

	279 }

	280

	281 void Clear() {

	282 list_ = NULL;

	283 last_ = NULL;

	284 }

	285

	286 int length() {

	287 int length = (list_ == NULL) ? 0 : list_->length();

	288 return length + ((last_ == NULL) ? 0 : 1);

	289 }

	290

	291 ZoneList<T> GetList() {

	292 if (list_ == NULL) {

	293 list_ = new ZoneList<T*>(initial_size);

	294 }

	295 if (last_ != NULL) {

	296 list_->Add(last_);

	297 last_ = NULL;

	298 }

	299 return list_;

	300 }

	301

	302 private:

	303 ZoneList<T> list_;

	304 T* last_;

	305 };

	306

	307 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.

	308 class RegExpBuilder {

	309 public:

	310 RegExpBuilder();

	311 void AddCharacter(uc16 character);

	312 // "Adds" an empty expression. Does nothing except consume a

	313 // following quantifier

	314 void AddEmpty();

	315 void AddAtom(RegExpTree* tree);

	316 void AddAssertion(RegExpTree* tree);

	317 void NewAlternative(); // '\|'

	318 void AddQuantifierToAtom(int min, int max, bool is_greedy);

	319 RegExpTree* ToRegExp();

	320 private:

	321 void FlushCharacters();

	322 void FlushText();

	323 void FlushTerms();

	324 bool pending_empty_;

	325 ZoneList<uc16>* characters_;

	326 BufferedZoneList<RegExpTree, 2> terms_;

	327 BufferedZoneList<RegExpTree, 2> text_;

	328 BufferedZoneList<RegExpTree, 2> alternatives_;

	329 #ifdef DEBUG

	330 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;

	331 #define LAST(x) last_added_ = x;

	332 #else

	333 #define LAST(x)

	334 #endif

	335 };

	336

	337

	338 RegExpBuilder::RegExpBuilder()

	339 : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
	Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. More occurrences below. Four space indent. More occurrences below.
	340 #ifdef DEBUG

	341 , last_added_(ADD_NONE)

	342 #endif

	343 {}

	344

	345

	346 void RegExpBuilder::FlushCharacters() {

	347 pending_empty_ = false;

	348 if (characters_ != NULL) {

	349 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());

	350 characters_ = NULL;

	351 text_.Add(atom);

	352 LAST(ADD_ATOM);

	353 }

	354 }

	355

	356

	357 void RegExpBuilder::FlushText() {

	358 FlushCharacters();

	359 int num_text = text_.length();

	360 if (num_text == 0) {

	361 return;

	362 } else if (num_text == 1) {

	363 terms_.Add(text_.last());

	364 } else {

	365 RegExpText* text = new RegExpText();

	366 for (int i = 0; i < num_text; i++)

	367 text_.Get(i)->AppendToText(text);

	368 terms_.Add(text);

	369 }

	370 text_.Clear();

	371 }

	372

	373

	374 void RegExpBuilder::AddCharacter(uc16 c) {

	375 pending_empty_ = false;

	376 if (characters_ == NULL) {

	377 characters_ = new ZoneList<uc16>(4);

	378 }

	379 characters_->Add(c);

	380 LAST(ADD_CHAR);

	381 }

	382

	383

	384 void RegExpBuilder::AddEmpty() {

	385 pending_empty_ = true;

	386 }

	387

	388

	389 void RegExpBuilder::AddAtom(RegExpTree* term) {

	390 if (term->IsEmpty()) {

	391 AddEmpty();

	392 return;

	393 }

	394 if (term->IsTextElement()) {

	395 FlushCharacters();

	396 text_.Add(term);

	397 } else {

	398 FlushText();

	399 terms_.Add(term);

	400 }

	401 LAST(ADD_ATOM);

	402 }

	403

	404

	405 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

	406 FlushText();

	407 terms_.Add(assert);

	408 LAST(ADD_ASSERT);

	409 }

	410

	411

	412 void RegExpBuilder::NewAlternative() {

	413 FlushTerms();

	414 }

	415

	416

	417 void RegExpBuilder::FlushTerms() {

	418 FlushText();

	419 int num_terms = terms_.length();

	420 RegExpTree* alternative;

	421 if (num_terms == 0) {

	422 alternative = RegExpEmpty::GetInstance();

	423 } else if (num_terms == 1) {

	424 alternative = terms_.last();

	425 } else {

	426 alternative = new RegExpAlternative(terms_.GetList());

	427 }

	428 alternatives_.Add(alternative);

	429 terms_.Clear();

	430 LAST(ADD_NONE);

	431 }

	432

	433

	434 RegExpTree* RegExpBuilder::ToRegExp() {

	435 FlushTerms();

	436 int num_alternatives = alternatives_.length();

	437 if (num_alternatives == 0) {

	438 return RegExpEmpty::GetInstance();

	439 }

	440 if (num_alternatives == 1) {

	441 return alternatives_.last();

	442 }

	443 return new RegExpDisjunction(alternatives_.GetList());

	444 }

	445

	446

	447 void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {

	448 if (pending_empty_) {

	449 pending_empty_ = false;

	450 return;

	451 }

	452 RegExpTree* atom;

	453 if (characters_ != NULL) {

	454 ASSERT(last_added_ == ADD_CHAR);

	455 // Last atom was character.

	456 Vector<const uc16> char_vector = characters_->ToConstVector();

	457 int num_chars = char_vector.length();

	458 if (num_chars > 1) {

	459 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

	460 text_.Add(new RegExpAtom(prefix));

	461 char_vector = char_vector.SubVector(num_chars - 1, num_chars);

	462 }

	463 characters_ = NULL;

	464 atom = new RegExpAtom(char_vector);

	465 FlushText();

	466 } else if (text_.length() > 0) {

	467 ASSERT(last_added_ == ADD_ATOM);

	468 atom = text_.RemoveLast();

	469 FlushText();

	470 } else if (terms_.length() > 0) {

	471 ASSERT(last_added_ == ADD_ATOM);

	472 atom = terms_.RemoveLast();

	473 if (atom->IsLookahead() \|\| atom->IsAssertion()) {

	474 // Guaranteed not to match a non-empty string.

	475 // Assertion as an atom can happen as, e.g., (?:\b)

	476 LAST(ADD_TERM);

	477 if (min == 0) {

	478 return;

	479 }

	480 terms_.Add(atom);

	481 return;

	482 }

	483 } else {

	484 // Only call immediately after adding an atom or character!

	485 UNREACHABLE();

	486 return;

	487 }

	488 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));

	489 LAST(ADD_TERM);

	490 }

	491

	492

	493 class RegExpParser {

	494 public:

	495 RegExpParser(FlatStringReader* in,

	496 Handle<String>* error,

	497 bool multiline_mode);

	498 RegExpTree* ParsePattern(bool* ok);

	499 RegExpTree* ParseDisjunction(bool* ok);

	500 RegExpTree* ParseGroup(bool* ok);

	501 RegExpTree* ParseCharacterClass(bool* ok);

	502

	503 // Parses a {...,...} quantifier and stores the range in the given

	504 // out parameters.

	505 bool ParseIntervalQuantifier(int* min_out, int* max_out);

	506

	507 // Parses and returns a single escaped character. The character

	508 // must not be 'b' or 'B' since they are usually handle specially.

	509 uc32 ParseClassCharacterEscape(bool* ok);

	510

	511 // Checks whether the following is a length-digit hexadecimal number,

	512 // and sets the value if it is.

	513 bool ParseHexEscape(int length, uc32* value);

	514

	515 uc32 ParseControlLetterEscape(bool* ok);

	516 uc32 ParseOctalLiteral();

	517

	518 // Tries to parse the input as a back reference. If successful it

	519 // stores the result in the output parameter and returns true. If

	520 // it fails it will push back the characters read so the same characters

	521 // can be reparsed.

	522 bool ParseBackReferenceIndex(int* index_out);

	523

	524 CharacterRange ParseClassAtom(bool* is_char_class,

	525 ZoneList<CharacterRange>* ranges,

	526 bool* ok);

	527 RegExpTree* ReportError(Vector<const char> message, bool* ok);

	528 void Advance();

	529 void Advance(int dist);

	530 void Reset(int pos);

	531

	532 bool HasCharacterEscapes();

	533

	534 int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }

	535 int position() { return next_pos_ - 1; }

	536

	537 static const uc32 kEndMarker = (1 << 21);

	538 private:

	539 uc32 current() { return current_; }

	540 bool has_more() { return has_more_; }

	541 bool has_next() { return next_pos_ < in()->length(); }

	542 uc32 Next();

	543 FlatStringReader* in() { return in_; }

	544 void ScanForCaptures();

	545 bool CaptureAvailable(int index);

	546 uc32 current_;

	547 bool has_more_;

	548 bool multiline_mode_;

	549 int next_pos_;

	550 FlatStringReader* in_;

	551 Handle<String>* error_;

	552 bool has_character_escapes_;

	553 bool is_scanned_for_captures_;

	554 ZoneList<RegExpCapture> captures_;

	555 int capture_count_;

	556 };

	557

	558

230 // A temporary scope stores information during parsing, just like	559 // A temporary scope stores information during parsing, just like

231 // a plain scope. However, temporary scopes are not kept around	560 // a plain scope. However, temporary scopes are not kept around

232 // after parsing or referenced by syntax trees so they can be stack-	561 // after parsing or referenced by syntax trees so they can be stack-

233 // allocated and hence used by the pre-parser.	562 // allocated and hence used by the pre-parser.

234 class TemporaryScope BASE_EMBEDDED {	563 class TemporaryScope BASE_EMBEDDED {

235 public:	564 public:

236 explicit TemporaryScope(Parser* parser);	565 explicit TemporaryScope(Parser* parser);

237 ~TemporaryScope();	566 ~TemporaryScope();

238	567

239 int NextMaterializedLiteralIndex() {	568 int NextMaterializedLiteralIndex() {

(...skipping 2917 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3157 }	3486 }

3158 ZoneList<Expression> args = new ZoneList<Expression*>(2);	3487 ZoneList<Expression> args = new ZoneList<Expression*>(2);

3159 args->Add(new Literal(type));	3488 args->Add(new Literal(type));

3160 args->Add(new Literal(array));	3489 args->Add(new Literal(array));

3161 return new Throw(new CallRuntime(constructor, NULL, args),	3490 return new Throw(new CallRuntime(constructor, NULL, args),

3162 scanner().location().beg_pos);	3491 scanner().location().beg_pos);

3163 }	3492 }

3164	3493

3165	3494

3166 // ----------------------------------------------------------------------------	3495 // ----------------------------------------------------------------------------

	3496 // Regular expressions

	3497

	3498

	3499 RegExpParser::RegExpParser(FlatStringReader* in,

	3500 Handle<String>* error,

	3501 bool multiline_mode)

	3502 : current_(kEndMarker),

	3503 has_more_(true),

	3504 multiline_mode_(multiline_mode),

	3505 next_pos_(0),

	3506 in_(in),

	3507 error_(error),

	3508 has_character_escapes_(false),

	3509 is_scanned_for_captures_(false),

	3510 captures_(NULL),

	3511 capture_count_(0) {

	3512 Advance(1);

	3513 }

	3514

	3515

	3516 uc32 RegExpParser::Next() {

	3517 if (has_next()) {

	3518 return in()->Get(next_pos_);

	3519 } else {

	3520 return kEndMarker;

	3521 }

	3522 }

	3523

	3524

	3525 void RegExpParser::Advance() {

	3526 if (next_pos_ < in()->length()) {

	3527 current_ = in()->Get(next_pos_);

	3528 next_pos_++;

	3529 } else {

	3530 current_ = kEndMarker;

	3531 has_more_ = false;

	3532 }

	3533 }

	3534

	3535

	3536 void RegExpParser::Reset(int pos) {

	3537 next_pos_ = pos;

	3538 Advance();

	3539 }

	3540

	3541

	3542 void RegExpParser::Advance(int dist) {

	3543 for (int i = 0; i < dist; i++)

	3544 Advance();

	3545 }

	3546

	3547

	3548 // Reports whether the parsed string atoms contain any characters that were

	3549 // escaped in the original pattern. If not, all atoms are proper substrings

	3550 // of the original pattern.

	3551 bool RegExpParser::HasCharacterEscapes() {

	3552 return has_character_escapes_;

	3553 }

	3554

	3555 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {

	3556 *ok = false;

	3557 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);

	3558 return NULL;

	3559 }

	3560

	3561

	3562 // Pattern ::

	3563 // Disjunction

	3564 RegExpTree* RegExpParser::ParsePattern(bool* ok) {

	3565 RegExpTree* result = ParseDisjunction(CHECK_OK);

	3566 if (has_more()) {

	3567 ReportError(CStrVector("Unmatched ')'"), CHECK_OK);

	3568 }

	3569 return result;

	3570 }

	3571

	3572

	3573 bool RegExpParser::CaptureAvailable(int index) {

	3574 if (captures_ == NULL) return false;

	3575 if (index >= captures_->length()) return false;

	3576 RegExpCapture* capture = captures_->at(index);

	3577 return capture != NULL && capture->available() == CAPTURE_AVAILABLE;

	3578 }

	3579

	3580

	3581 // Disjunction ::

	3582 // Alternative

	3583 // Alternative \| Disjunction

	3584 // Alternative ::

	3585 // [empty]

	3586 // Term Alternative

	3587 // Term ::

	3588 // Assertion

	3589 // Atom

	3590 // Atom Quantifier

	3591 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

	3592 RegExpBuilder builder;

	3593 int capture_start_index = captures_started();

	3594 while (true) {

	3595 switch (current()) {

	3596 case kEndMarker:

	3597 case ')':

	3598 return builder.ToRegExp();

	3599 case '\|': {

	3600 Advance();

	3601 builder.NewAlternative();

	3602 int capture_new_alt_start_index = captures_started();

	3603 for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {

	3604 RegExpCapture* capture = captures_->at(i);

	3605 if (capture->available() == CAPTURE_AVAILABLE) {

	3606 capture->set_available(CAPTURE_UNREACHABLE);

	3607 }

	3608 }

	3609 capture_start_index = capture_new_alt_start_index;

	3610 continue;

	3611 }

	3612 case '*':

	3613 case '+':

	3614 case '?':

	3615 ReportError(CStrVector("Nothing to repeat"), CHECK_OK);

	3616 case '^': {

	3617 Advance();

	3618 RegExpAssertion::Type type =

	3619 multiline_mode_ ? RegExpAssertion::START_OF_LINE :

	3620 RegExpAssertion::START_OF_INPUT;

	3621 builder.AddAssertion(new RegExpAssertion(type));

	3622 continue;

	3623 }

	3624 case '$': {

	3625 Advance();

	3626 RegExpAssertion::Type type =

	3627 multiline_mode_ ? RegExpAssertion::END_OF_LINE :

	3628 RegExpAssertion::END_OF_INPUT;

	3629 builder.AddAssertion(new RegExpAssertion(type));

	3630 continue;

	3631 }

	3632 case '.': {

	3633 Advance();

	3634 // everything except \x0a, \x0d, \u2028 and \u2029

	3635 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

	3636 CharacterRange::AddClassEscape('.', ranges);

	3637 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	3638 builder.AddAtom(atom);

	3639 break;

	3640 }

	3641 case '(': {

	3642 RegExpTree* atom = ParseGroup(CHECK_OK);

	3643 builder.AddAtom(atom);

	3644 break;

	3645 }

	3646 case '[': {

	3647 RegExpTree* atom = ParseCharacterClass(CHECK_OK);

	3648 builder.AddAtom(atom);

	3649 break;

	3650 }

	3651 // Atom ::

	3652 // \ AtomEscape

	3653 case '\\':

	3654 switch (Next()) {

	3655 case kEndMarker:

	3656 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);

	3657 case 'b':

	3658 Advance(2);

	3659 builder.AddAssertion(

	3660 new RegExpAssertion(RegExpAssertion::BOUNDARY));

	3661 continue;

	3662 case 'B':

	3663 Advance(2);

	3664 builder.AddAssertion(

	3665 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

	3666 continue;

	3667 // AtomEscape ::

	3668 // CharacterClassEscape

	3669 //

	3670 // CharacterClassEscape :: one of

	3671 // d D s S w W

	3672 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

	3673 uc32 c = Next();

	3674 Advance(2);

	3675 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

	3676 CharacterRange::AddClassEscape(c, ranges);

	3677 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	3678 builder.AddAtom(atom);

	3679 goto has_read_atom; // Avoid setting has_character_escapes_.

	3680 }

	3681 case '1': case '2': case '3': case '4': case '5': case '6':

	3682 case '7': case '8': case '9': {

	3683 int index = 0;

	3684 if (ParseBackReferenceIndex(&index)) {

	3685 if (!CaptureAvailable(index - 1)) {

	3686 // Prepare to ignore a following quantifier

	3687 builder.AddEmpty();

	3688 goto has_read_atom;

	3689 }

	3690 RegExpCapture* capture = captures_->at(index - 1);

	3691 RegExpTree* atom = new RegExpBackReference(capture);

	3692 builder.AddAtom(atom);

	3693 goto has_read_atom; // Avoid setting has_character_escapes_.

	3694 }

	3695 uc32 first_digit = Next();

	3696 if (first_digit == '8' \|\| first_digit == '9') {

	3697 // Treat as identity escape

	3698 builder.AddCharacter(first_digit);

	3699 Advance(2);

	3700 break;

	3701 }

	3702 }

	3703 // FALLTHROUGH

	3704 case '0': {

	3705 Advance();

	3706 uc32 octal = ParseOctalLiteral();

	3707 builder.AddCharacter(octal);

	3708 break;

	3709 }

	3710 // ControlEscape :: one of

	3711 // f n r t v

	3712 case 'f':

	3713 Advance(2);

	3714 builder.AddCharacter('\f');

	3715 break;

	3716 case 'n':

	3717 Advance(2);

	3718 builder.AddCharacter('\n');

	3719 break;

	3720 case 'r':

	3721 Advance(2);

	3722 builder.AddCharacter('\r');

	3723 break;

	3724 case 't':

	3725 Advance(2);

	3726 builder.AddCharacter('\t');

	3727 break;

	3728 case 'v':

	3729 Advance(2);

	3730 builder.AddCharacter('\v');

	3731 break;

	3732 case 'c': {

	3733 Advance(2);

	3734 uc32 control = ParseControlLetterEscape(ok);

	3735 builder.AddCharacter(control);

	3736 break;

	3737 }

	3738 case 'x': {

	3739 Advance(2);

	3740 uc32 value;

	3741 if (ParseHexEscape(2, &value)) {

	3742 builder.AddCharacter(value);

	3743 } else {

	3744 builder.AddCharacter('x');

	3745 }

	3746 break;

	3747 }

	3748 case 'u': {

	3749 Advance(2);

	3750 uc32 value;

	3751 if (ParseHexEscape(4, &value)) {

	3752 builder.AddCharacter(value);

	3753 } else {

	3754 builder.AddCharacter('u');

	3755 }

	3756 break;

	3757 }

	3758 default:

	3759 // Identity escape.

	3760 builder.AddCharacter(Next());

	3761 Advance(2);

	3762 break;

	3763 }

	3764 has_character_escapes_ = true;

	3765 break;

	3766 case '{': {

	3767 int dummy;

	3768 if (ParseIntervalQuantifier(&dummy, &dummy)) {

	3769 ReportError(CStrVector("Nothing to repeat"), CHECK_OK);

	3770 }

	3771 // fallthrough

	3772 }

	3773 default:

	3774 builder.AddCharacter(current());

	3775 Advance();

	3776 break;

	3777 } // end switch(current())

	3778

	3779 has_read_atom:

	3780 int min;

	3781 int max;

	3782 switch (current()) {

	3783 // QuantifierPrefix ::

	3784 // *

	3785 // +

	3786 // ?

	3787 // {

	3788 case '*':

	3789 min = 0;

	3790 max = RegExpQuantifier::kInfinity;

	3791 Advance();

	3792 break;

	3793 case '+':

	3794 min = 1;

	3795 max = RegExpQuantifier::kInfinity;

	3796 Advance();

	3797 break;

	3798 case '?':

	3799 min = 0;

	3800 max = 1;

	3801 Advance();

	3802 break;

	3803 case '{':

	3804 if (ParseIntervalQuantifier(&min, &max)) {

	3805 break;

	3806 } else {

	3807 continue;

	3808 }

	3809 default:

	3810 continue;

	3811 }

	3812 bool is_greedy = true;

	3813 if (current() == '?') {

	3814 is_greedy = false;

	3815 Advance();

	3816 }

	3817 builder.AddQuantifierToAtom(min, max, is_greedy);

	3818 }

	3819 }

	3820

	3821 class SourceCharacter {

	3822 public:

	3823 static bool Is(uc32 c) {

	3824 switch (c) {

	3825 // case ']': case '}':

	3826 // In spidermonkey and jsc these are treated as source characters

	3827 // so we do too.

	3828 case '^': case '$': case '\\': case '.': case '*': case '+':

	3829 case '?': case '(': case ')': case '[': case '{': case '\|':

	3830 case RegExpParser::kEndMarker:

	3831 return false;

	3832 default:

	3833 return true;

	3834 }

	3835 }

	3836 };

	3837

	3838

	3839 static unibrow::Predicate<SourceCharacter> source_character;

	3840

	3841

	3842 static inline bool IsSourceCharacter(uc32 c) {

	3843 return source_character.get(c);

	3844 }

	3845

	3846 #ifdef DEBUG

	3847 // Currently only used in an ASSERT.

	3848 static bool IsSpecialClassEscape(uc32 c) {

	3849 switch (c) {

	3850 case 'd': case 'D':

	3851 case 's': case 'S':

	3852 case 'w': case 'W':

	3853 return true;

	3854 default:

	3855 return false;

	3856 }

	3857 }

	3858 #endif

	3859

	3860

	3861 // In order to know whether an escape is a backreference or not we have to scan

	3862 // the entire regexp and find the number of capturing parentheses. However we

	3863 // don't want to scan the regexp twice unless it is necessary. This mini-parser

	3864 // is called when needed. It can see the difference between capturing and

	3865 // noncapturing parentheses and can skip character classes and backslash-escaped

	3866 // characters.

	3867 void RegExpParser::ScanForCaptures() {

	3868 int n;

	3869 while ((n = current()) != kEndMarker) {

	3870 Advance();

	3871 switch (n) {

	3872 case '\\':

	3873 Advance();

	3874 break;

	3875 case '[': {

	3876 int c;

	3877 while ((c = current()) != kEndMarker) {

	3878 Advance();

	3879 if (c == '\\') {

	3880 Advance();

	3881 } else {

	3882 if (c == ']') break;

	3883 }

	3884 }

	3885 break;

	3886 }

	3887 case '(':

	3888 if (current() != '?') capture_count_++;

	3889 break;

	3890 }

	3891 }

	3892 is_scanned_for_captures_ = true;

	3893 }

	3894

	3895

	3896 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

	3897 ASSERT_EQ('\\', current());

	3898 ASSERT('1' <= Next() && Next() <= '9');

	3899 // Try to parse a decimal literal that is no greater than the number

	3900 // of previously encountered left capturing parentheses.

	3901 // This is a not according the the ECMAScript specification. According to

	3902 // that, one must accept values up to the total number of left capturing

	3903 // parentheses in the entire input, even if they are meaningless.

	3904 if (!is_scanned_for_captures_) {

	3905 int saved_position = position();

	3906 ScanForCaptures();

	3907 Reset(saved_position);

	3908 }

	3909 if (capture_count_ == 0) return false;

	3910 int start = position();

	3911 int value = Next() - '0';

	3912 if (value > capture_count_) return false;

	3913 Advance(2);

	3914 while (true) {

	3915 uc32 c = current();

	3916 if (IsDecimalDigit(c)) {

	3917 value = 10 * value + (c - '0');

	3918 if (value > capture_count_) {

	3919 Reset(start);

	3920 return false;

	3921 }

	3922 Advance();

	3923 } else {

	3924 break;

	3925 }

	3926 }

	3927 *index_out = value;

	3928 return true;

	3929 }

	3930

	3931

	3932 // QuantifierPrefix ::

	3933 // { DecimalDigits }

	3934 // { DecimalDigits , }

	3935 // { DecimalDigits , DecimalDigits }

	3936 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {

	3937 ASSERT_EQ(current(), '{');

	3938 int start = position();

	3939 Advance();

	3940 int min = 0;

	3941 if (!IsDecimalDigit(current())) {

	3942 Reset(start);

	3943 return false;

	3944 }

	3945 while (IsDecimalDigit(current())) {

	3946 min = 10 * min + (current() - '0');

	3947 Advance();

	3948 }

	3949 int max = 0;

	3950 if (current() == '}') {

	3951 max = min;

	3952 Advance();

	3953 } else if (current() == ',') {

	3954 Advance();

	3955 if (current() == '}') {

	3956 max = RegExpQuantifier::kInfinity;

	3957 Advance();

	3958 } else {

	3959 while (IsDecimalDigit(current())) {

	3960 max = 10 * max + (current() - '0');

	3961 Advance();

	3962 }

	3963 if (current() != '}') {

	3964 Reset(start);

	3965 return false;

	3966 }

	3967 Advance();

	3968 }

	3969 } else {

	3970 Reset(start);

	3971 return false;

	3972 }

	3973 *min_out = min;

	3974 *max_out = max;

	3975 return true;

	3976 }

	3977

	3978

	3979 // Upper and lower case letters differ by one bit.

	3980 STATIC_CHECK(('a' ^ 'A') == 0x20);

	3981

	3982 uc32 RegExpParser::ParseControlLetterEscape(bool* ok) {

	3983 if (!has_more()) {

	3984 ReportError(CStrVector("\\c at end of pattern"), ok);

	3985 return '\0';

	3986 }

	3987 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.

	3988 if (letter < 'A' \|\| 'Z' < letter) {

	3989 // Non-spec error-correction: "\c" followed by non-control letter is

	3990 // interpreted as an IdentityEscape of 'c'.

	3991 return 'c';

	3992 }

	3993 Advance();

	3994 return letter & 0x1f; // Remainder modulo 32, per specification.

	3995 }

	3996

	3997

	3998 uc32 RegExpParser::ParseOctalLiteral() {

	3999 ASSERT('0' <= current() && current() <= '7');

	4000 // For compatibility with some other browsers (not all), we parse

	4001 // up to three octal digits with a value below 256.

	4002 uc32 value = current() - '0';

	4003 Advance();

	4004 if ('0' <= current() && current() <= '7') {

	4005 value = value * 8 + current() - '0';

	4006 Advance();

	4007 if (value < 32 && '0' <= current() && current() <= '7') {

	4008 value = value * 8 + current() - '0';

	4009 Advance();

	4010 }

	4011 }

	4012 return value;

	4013 }

	4014

	4015

	4016 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {

	4017 int start = position();

	4018 uc32 val = 0;

	4019 bool done = false;

	4020 for (int i = 0; !done; i++) {

	4021 uc32 c = current();

	4022 int d = HexValue(c);

	4023 if (d < 0) {

	4024 Reset(start);

	4025 return false;

	4026 }

	4027 val = val * 16 + d;

	4028 Advance();

	4029 if (i == length - 1) {

	4030 done = true;

	4031 }

	4032 }

	4033 *value = val;

	4034 return true;

	4035 }

	4036

	4037

	4038 uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) {

	4039 ASSERT(current() == '\\');

	4040 ASSERT(has_next() && !IsSpecialClassEscape(Next()));

	4041 Advance();

	4042 switch (current()) {

	4043 case 'b':

	4044 Advance();

	4045 return '\b';

	4046 // ControlEscape :: one of

	4047 // f n r t v

	4048 case 'f':

	4049 Advance();

	4050 return '\f';

	4051 case 'n':

	4052 Advance();

	4053 return '\n';

	4054 case 'r':

	4055 Advance();

	4056 return '\r';

	4057 case 't':

	4058 Advance();

	4059 return '\t';

	4060 case 'v':

	4061 Advance();

	4062 return '\v';

	4063 case 'c':

	4064 return ParseControlLetterEscape(ok);

	4065 case '0': case '1': case '2': case '3': case '4': case '5':

	4066 case '6': case '7':

	4067 // For compatibility, we interpret a decimal escape that isn't

	4068 // a back reference (and therefore either \0 or not valid according

	4069 // to the specification) as a 1..3 digit octal character code.

	4070 return ParseOctalLiteral();

	4071 case 'x': {

	4072 Advance();

	4073 uc32 value;

	4074 if (ParseHexEscape(2, &value)) {

	4075 return value;

	4076 }

	4077 // If \x is not followed by a two-digit hexadecimal, treat it

	4078 // as an identity escape.

	4079 return 'x';

	4080 }

	4081 case 'u': {

	4082 Advance();

	4083 uc32 value;

	4084 if (ParseHexEscape(4, &value)) {

	4085 return value;

	4086 }

	4087 // If \u is not followed by a four-digit hexadecimal, treat it

	4088 // as an identity escape.

	4089 return 'u';

	4090 }

	4091 default: {

	4092 // Extended identity escape. We accept any character that hasn't

	4093 // been matched by a more specific case, not just the subset required

	4094 // by the ECMAScript specification.

	4095 uc32 result = current();

	4096 Advance();

	4097 return result;

	4098 }

	4099 }

	4100 return 0;

	4101 }

	4102

	4103

	4104 RegExpTree* RegExpParser::ParseGroup(bool* ok) {

	4105 ASSERT_EQ(current(), '(');

	4106 char type = '(';

	4107 Advance();

	4108 if (current() == '?') {

	4109 switch (Next()) {

	4110 case ':': case '=': case '!':

	4111 type = Next();

	4112 Advance(2);

	4113 break;

	4114 default:

	4115 ReportError(CStrVector("Invalid group"), CHECK_OK);

	4116 break;

	4117 }

	4118 } else {

	4119 if (captures_ == NULL) {

	4120 captures_ = new ZoneList<RegExpCapture*>(2);

	4121 }

	4122 captures_->Add(NULL);

	4123 if (!is_scanned_for_captures_) capture_count_++;

	4124 }

	4125 int capture_index = captures_started();

	4126 RegExpTree* body = ParseDisjunction(CHECK_OK);

	4127 if (current() != ')') {

	4128 ReportError(CStrVector("Unterminated group"), CHECK_OK);

	4129 }

	4130 Advance();

	4131

	4132 int end_capture_index = captures_started();

	4133 if (type == '!') {

	4134 // Captures inside a negative lookahead are never available outside it.

	4135 for (int i = capture_index; i < end_capture_index; i++) {

	4136 RegExpCapture* capture = captures_->at(i);

	4137 ASSERT(capture != NULL);

	4138 capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);

	4139 }

	4140 } else {

	4141 // Captures temporarily unavailable because they are in different

	4142 // alternatives are all available after the disjunction.

	4143 for (int i = capture_index; i < end_capture_index; i++) {

	4144 RegExpCapture* capture = captures_->at(i);

	4145 ASSERT(capture != NULL);

	4146 if (capture->available() == CAPTURE_UNREACHABLE) {

	4147 capture->set_available(CAPTURE_AVAILABLE);

	4148 }

	4149 }

	4150 }

	4151

	4152 if (type == '(') {

	4153 RegExpCapture* capture = new RegExpCapture(body, capture_index);

	4154 captures_->at(capture_index - 1) = capture;

	4155 return capture;

	4156 } else if (type == ':') {

	4157 return body;

	4158 } else {

	4159 ASSERT(type == '=' \|\| type == '!');

	4160 bool is_positive = (type == '=');

	4161 return new RegExpLookahead(body, is_positive);

	4162 }

	4163 }

	4164

	4165

	4166 CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class,

	4167 ZoneList<CharacterRange>* ranges,

	4168 bool* ok) {

	4169 ASSERT_EQ(false, *is_char_class);

	4170 uc32 first = current();

	4171 if (first == '\\') {

	4172 switch (Next()) {

	4173 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

	4174 *is_char_class = true;

	4175 uc32 c = Next();

	4176 CharacterRange::AddClassEscape(c, ranges);

	4177 Advance(2);

	4178 return NULL;

	4179 }

	4180 default:

	4181 uc32 c = ParseClassCharacterEscape(CHECK_OK);

	4182 return CharacterRange::Singleton(c);

	4183 }

	4184 } else {

	4185 Advance();

	4186 return CharacterRange::Singleton(first);

	4187 }

	4188 }

	4189

	4190

	4191 RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {

	4192 static const char* kUnterminated = "Unterminated character class";

	4193 static const char* kIllegal = "Illegal character class";

	4194 static const char* kRangeOutOfOrder = "Range out of order in character class";

	4195

	4196 ASSERT_EQ(current(), '[');

	4197 Advance();

	4198 bool is_negated = false;

	4199 if (current() == '^') {

	4200 is_negated = true;

	4201 Advance();

	4202 }

	4203 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

	4204 while (has_more() && current() != ']') {

	4205 bool is_char_class = false;

	4206 CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK);

	4207 if (!is_char_class) {

	4208 if (current() == '-') {

	4209 Advance();

	4210 if (current() == kEndMarker) {

	4211 // If we reach the end we break out of the loop and let the

	4212 // following code report an error.

	4213 break;

	4214 } else if (current() == ']') {

	4215 ranges->Add(first);

	4216 ranges->Add(CharacterRange::Singleton('-'));

	4217 break;

	4218 }

	4219 CharacterRange next =

	4220 ParseClassAtom(&is_char_class, ranges, CHECK_OK);

	4221 if (is_char_class) {

	4222 return ReportError(CStrVector(kIllegal), CHECK_OK);

	4223 }

	4224 if (first.from() > next.to()) {

	4225 return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK);

	4226 }

	4227 ranges->Add(CharacterRange::Range(first.from(), next.to()));

	4228 } else {

	4229 ranges->Add(first);

	4230 }

	4231 }

	4232 }

	4233 if (!has_more()) {

	4234 return ReportError(CStrVector(kUnterminated), CHECK_OK);

	4235 }

	4236 Advance();

	4237 if (ranges->length() == 0) {

	4238 ranges->Add(CharacterRange::Range(0, 0xffff));

	4239 is_negated = !is_negated;

	4240 }

	4241 return new RegExpCharacterClass(ranges, is_negated);

	4242 }

	4243

	4244

	4245 // ----------------------------------------------------------------------------

3167 // The Parser interface.	4246 // The Parser interface.

3168	4247

3169 // MakeAST() is just a wrapper for the corresponding Parser calls	4248 // MakeAST() is just a wrapper for the corresponding Parser calls

3170 // so we don't have to expose the entire Parser class in the .h file.	4249 // so we don't have to expose the entire Parser class in the .h file.

3171	4250

3172 static bool always_allow_natives_syntax = false;	4251 static bool always_allow_natives_syntax = false;

3173	4252

3174	4253

3175 ParserMessage::~ParserMessage() {	4254 ParserMessage::~ParserMessage() {

3176 for (int i = 0; i < args().length(); i++)	4255 for (int i = 0; i < args().length(); i++)

(...skipping 27 matching lines...) Expand all Loading...
3204 PreParser parser(no_script, allow_natives_syntax, extension);	4283 PreParser parser(no_script, allow_natives_syntax, extension);

3205 if (!parser.PreParseProgram(stream)) return NULL;	4284 if (!parser.PreParseProgram(stream)) return NULL;

3206 // The list owns the backing store so we need to clone the vector.	4285 // The list owns the backing store so we need to clone the vector.

3207 // That way, the result will be exactly the right size rather than	4286 // That way, the result will be exactly the right size rather than

3208 // the expected 50% too large.	4287 // the expected 50% too large.

3209 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone();	4288 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone();

3210 return new ScriptDataImpl(store);	4289 return new ScriptDataImpl(store);

3211 }	4290 }

3212	4291

3213	4292

	4293 bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) {

	4294 ASSERT(result != NULL);

	4295 // Get multiline flag somehow
	Mads Ager (chromium) 2008/11/25 21:09:41 Is this a TODO or a comment on what the code is do Is this a TODO or a comment on what the code is doing? Erik Corry 2008/11/26 12:18:36 It's a TODO. Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Is this a TODO or a comment on what the code is doing? It's a TODO. Fixed.
	4296 RegExpParser parser(input, &result->error, false);

	4297 bool ok = true;

	4298 result->tree = parser.ParsePattern(&ok);

	4299 if (!ok) {

	4300 ASSERT(result->tree == NULL);

	4301 ASSERT(!result->error.is_null());

	4302 } else {

	4303 ASSERT(result->tree != NULL);

	4304 ASSERT(result->error.is_null());

	4305 }

	4306 if (ok) {

	4307 result->has_character_escapes = parser.HasCharacterEscapes();

	4308 result->capture_count = parser.captures_started();

	4309 }

	4310 return ok;

	4311 }

	4312

	4313

3214 FunctionLiteral* MakeAST(bool compile_in_global_context,	4314 FunctionLiteral* MakeAST(bool compile_in_global_context,

3215 Handle<Script> script,	4315 Handle<Script> script,

3216 v8::Extension* extension,	4316 v8::Extension* extension,

3217 ScriptDataImpl* pre_data) {	4317 ScriptDataImpl* pre_data) {

3218 bool allow_natives_syntax =	4318 bool allow_natives_syntax =

3219 always_allow_natives_syntax \|\|	4319 always_allow_natives_syntax \|\|

3220 FLAG_allow_natives_syntax \|\|	4320 FLAG_allow_natives_syntax \|\|

3221 Bootstrapper::IsActive();	4321 Bootstrapper::IsActive();

3222 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data);	4322 AstBuildingParser parser(script, allow_natives_syntax, extension, pre_data);

3223 if (pre_data != NULL && pre_data->has_error()) {	4323 if (pre_data != NULL && pre_data->has_error()) {

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3256 start_position,	4356 start_position,

3257 is_expression);	4357 is_expression);

3258 return result;	4358 return result;

3259 }	4359 }

3260	4360

3261	4361

3262 #undef NEW	4362 #undef NEW

3263	4363

3264	4364

3265 } } // namespace v8::internal	4365 } } // namespace v8::internal

OLD	NEW

« src/objects.cc ('K') | « src/parser.h ('k') | src/regexp-macro-assembler.h » ('j') | src/regexp-macro-assembler-ia32.h » ('J')