Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: src/scanner-base.cc

Issue 6824071: Cleanup of ScannerConstants, now named UnicodeCache. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Created 9 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution. 11 // with the distribution.
(...skipping 16 matching lines...) Expand all
28 // Features shared by parsing and pre-parsing scanners. 28 // Features shared by parsing and pre-parsing scanners.
29 29
30 #include "../include/v8stdint.h" 30 #include "../include/v8stdint.h"
31 #include "scanner-base.h" 31 #include "scanner-base.h"
32 #include "char-predicates-inl.h" 32 #include "char-predicates-inl.h"
33 33
34 namespace v8 { 34 namespace v8 {
35 namespace internal { 35 namespace internal {
36 36
37 // ---------------------------------------------------------------------------- 37 // ----------------------------------------------------------------------------
38 // Compound predicates.
39
40 bool ScannerConstants::IsIdentifier(unibrow::CharacterStream* buffer) {
41 // Checks whether the buffer contains an identifier (no escape).
42 if (!buffer->has_more()) return false;
43 if (!kIsIdentifierStart.get(buffer->GetNext())) {
44 return false;
45 }
46 while (buffer->has_more()) {
47 if (!kIsIdentifierPart.get(buffer->GetNext())) {
48 return false;
49 }
50 }
51 return true;
52 }
53
54 // ----------------------------------------------------------------------------
55 // Scanner 38 // Scanner
56 39
57 Scanner::Scanner(ScannerConstants* scanner_constants) 40 Scanner::Scanner(UnicodeCache* unicode_cache)
58 : scanner_constants_(scanner_constants), 41 : unicode_cache_(unicode_cache),
59 octal_pos_(kNoOctalLocation) { 42 octal_pos_(kNoOctalLocation) {
Karl Klose 2011/04/12 07:55:40 I think this newline should be removed.
Lasse Reichstein 2011/04/12 08:18:47 Done.
60 } 43 }
61 44
62 45
63 uc32 Scanner::ScanHexEscape(uc32 c, int length) { 46 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
64 ASSERT(length <= 4); // prevent overflow 47 ASSERT(length <= 4); // prevent overflow
65 48
66 uc32 digits[4]; 49 uc32 digits[4];
67 uc32 x = 0; 50 uc32 x = 0;
68 for (int i = 0; i < length; i++) { 51 for (int i = 0; i < length; i++) {
69 digits[i] = c0_; 52 digits[i] = c0_;
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 if (c != '0' || i > 0) { 90 if (c != '0' || i > 0) {
108 octal_pos_ = source_pos() - i - 1; // Already advanced 91 octal_pos_ = source_pos() - i - 1; // Already advanced
109 } 92 }
110 return x; 93 return x;
111 } 94 }
112 95
113 96
114 // ---------------------------------------------------------------------------- 97 // ----------------------------------------------------------------------------
115 // JavaScriptScanner 98 // JavaScriptScanner
116 99
117 JavaScriptScanner::JavaScriptScanner(ScannerConstants* scanner_contants) 100 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
118 : Scanner(scanner_contants) { } 101 : Scanner(scanner_contants) { }
119 102
120 103
121 Token::Value JavaScriptScanner::Next() { 104 Token::Value JavaScriptScanner::Next() {
122 current_ = next_; 105 current_ = next_;
123 has_line_terminator_before_next_ = false; 106 has_line_terminator_before_next_ = false;
124 Scan(); 107 Scan();
125 return current_.token; 108 return current_.token;
126 } 109 }
127 110
128 111
129 static inline bool IsByteOrderMark(uc32 c) { 112 static inline bool IsByteOrderMark(uc32 c) {
130 // The Unicode value U+FFFE is guaranteed never to be assigned as a 113 // The Unicode value U+FFFE is guaranteed never to be assigned as a
131 // Unicode character; this implies that in a Unicode context the 114 // Unicode character; this implies that in a Unicode context the
132 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 115 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
133 // character expressed in little-endian byte order (since it could 116 // character expressed in little-endian byte order (since it could
134 // not be a U+FFFE character expressed in big-endian byte 117 // not be a U+FFFE character expressed in big-endian byte
135 // order). Nevertheless, we check for it to be compatible with 118 // order). Nevertheless, we check for it to be compatible with
136 // Spidermonkey. 119 // Spidermonkey.
137 return c == 0xFEFF || c == 0xFFFE; 120 return c == 0xFEFF || c == 0xFFFE;
138 } 121 }
139 122
140 123
141 bool JavaScriptScanner::SkipWhiteSpace() { 124 bool JavaScriptScanner::SkipWhiteSpace() {
142 int start_position = source_pos(); 125 int start_position = source_pos();
143 126
144 while (true) { 127 while (true) {
145 // We treat byte-order marks (BOMs) as whitespace for better 128 // We treat byte-order marks (BOMs) as whitespace for better
146 // compatibility with Spidermonkey and other JavaScript engines. 129 // compatibility with Spidermonkey and other JavaScript engines.
147 while (scanner_constants_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 130 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
148 // IsWhiteSpace() includes line terminators! 131 // IsWhiteSpace() includes line terminators!
149 if (scanner_constants_->IsLineTerminator(c0_)) { 132 if (unicode_cache_->IsLineTerminator(c0_)) {
150 // Ignore line terminators, but remember them. This is necessary 133 // Ignore line terminators, but remember them. This is necessary
151 // for automatic semicolon insertion. 134 // for automatic semicolon insertion.
152 has_line_terminator_before_next_ = true; 135 has_line_terminator_before_next_ = true;
153 } 136 }
154 Advance(); 137 Advance();
155 } 138 }
156 139
157 // If there is an HTML comment end '-->' at the beginning of a 140 // If there is an HTML comment end '-->' at the beginning of a
158 // line (with only whitespace in front of it), we treat the rest 141 // line (with only whitespace in front of it), we treat the rest
159 // of the line as a comment. This is in line with the way 142 // of the line as a comment. This is in line with the way
(...skipping 19 matching lines...) Expand all
179 162
180 163
181 Token::Value JavaScriptScanner::SkipSingleLineComment() { 164 Token::Value JavaScriptScanner::SkipSingleLineComment() {
182 Advance(); 165 Advance();
183 166
184 // The line terminator at the end of the line is not considered 167 // The line terminator at the end of the line is not considered
185 // to be part of the single-line comment; it is recognized 168 // to be part of the single-line comment; it is recognized
186 // separately by the lexical grammar and becomes part of the 169 // separately by the lexical grammar and becomes part of the
187 // stream of input elements for the syntactic grammar (see 170 // stream of input elements for the syntactic grammar (see
188 // ECMA-262, section 7.4, page 12). 171 // ECMA-262, section 7.4, page 12).
189 while (c0_ >= 0 && !scanner_constants_->IsLineTerminator(c0_)) { 172 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
190 Advance(); 173 Advance();
191 } 174 }
192 175
193 return Token::WHITESPACE; 176 return Token::WHITESPACE;
194 } 177 }
195 178
196 179
197 Token::Value JavaScriptScanner::SkipMultiLineComment() { 180 Token::Value JavaScriptScanner::SkipMultiLineComment() {
198 ASSERT(c0_ == '*'); 181 ASSERT(c0_ == '*');
199 Advance(); 182 Advance();
(...skipping 244 matching lines...) Expand 10 before | Expand all | Expand 10 after
444 427
445 case '?': 428 case '?':
446 token = Select(Token::CONDITIONAL); 429 token = Select(Token::CONDITIONAL);
447 break; 430 break;
448 431
449 case '~': 432 case '~':
450 token = Select(Token::BIT_NOT); 433 token = Select(Token::BIT_NOT);
451 break; 434 break;
452 435
453 default: 436 default:
454 if (scanner_constants_->IsIdentifierStart(c0_)) { 437 if (unicode_cache_->IsIdentifierStart(c0_)) {
455 token = ScanIdentifierOrKeyword(); 438 token = ScanIdentifierOrKeyword();
456 } else if (IsDecimalDigit(c0_)) { 439 } else if (IsDecimalDigit(c0_)) {
457 token = ScanNumber(false); 440 token = ScanNumber(false);
458 } else if (SkipWhiteSpace()) { 441 } else if (SkipWhiteSpace()) {
459 token = Token::WHITESPACE; 442 token = Token::WHITESPACE;
460 } else if (c0_ < 0) { 443 } else if (c0_ < 0) {
461 token = Token::EOS; 444 token = Token::EOS;
462 } else { 445 } else {
463 token = Select(Token::ILLEGAL); 446 token = Select(Token::ILLEGAL);
464 } 447 }
(...skipping 27 matching lines...) Expand all
492 } 475 }
493 Scan(); 476 Scan();
494 } 477 }
495 478
496 479
497 void JavaScriptScanner::ScanEscape() { 480 void JavaScriptScanner::ScanEscape() {
498 uc32 c = c0_; 481 uc32 c = c0_;
499 Advance(); 482 Advance();
500 483
501 // Skip escaped newlines. 484 // Skip escaped newlines.
502 if (scanner_constants_->IsLineTerminator(c)) { 485 if (unicode_cache_->IsLineTerminator(c)) {
503 // Allow CR+LF newlines in multiline string literals. 486 // Allow CR+LF newlines in multiline string literals.
504 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 487 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
505 // Allow LF+CR newlines in multiline string literals. 488 // Allow LF+CR newlines in multiline string literals.
506 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 489 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
507 return; 490 return;
508 } 491 }
509 492
510 switch (c) { 493 switch (c) {
511 case '\'': // fall through 494 case '\'': // fall through
512 case '"' : // fall through 495 case '"' : // fall through
(...skipping 22 matching lines...) Expand all
535 AddLiteralChar(c); 518 AddLiteralChar(c);
536 } 519 }
537 520
538 521
539 Token::Value JavaScriptScanner::ScanString() { 522 Token::Value JavaScriptScanner::ScanString() {
540 uc32 quote = c0_; 523 uc32 quote = c0_;
541 Advance(); // consume quote 524 Advance(); // consume quote
542 525
543 LiteralScope literal(this); 526 LiteralScope literal(this);
544 while (c0_ != quote && c0_ >= 0 527 while (c0_ != quote && c0_ >= 0
545 && !scanner_constants_->IsLineTerminator(c0_)) { 528 && !unicode_cache_->IsLineTerminator(c0_)) {
546 uc32 c = c0_; 529 uc32 c = c0_;
547 Advance(); 530 Advance();
548 if (c == '\\') { 531 if (c == '\\') {
549 if (c0_ < 0) return Token::ILLEGAL; 532 if (c0_ < 0) return Token::ILLEGAL;
550 ScanEscape(); 533 ScanEscape();
551 } else { 534 } else {
552 AddLiteralChar(c); 535 AddLiteralChar(c);
553 } 536 }
554 } 537 }
555 if (c0_ != quote) return Token::ILLEGAL; 538 if (c0_ != quote) return Token::ILLEGAL;
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
634 // we must have at least one decimal digit after 'e'/'E' 617 // we must have at least one decimal digit after 'e'/'E'
635 return Token::ILLEGAL; 618 return Token::ILLEGAL;
636 } 619 }
637 ScanDecimalDigits(); 620 ScanDecimalDigits();
638 } 621 }
639 622
640 // The source character immediately following a numeric literal must 623 // The source character immediately following a numeric literal must
641 // not be an identifier start or a decimal digit; see ECMA-262 624 // not be an identifier start or a decimal digit; see ECMA-262
642 // section 7.8.3, page 17 (note that we read only one decimal digit 625 // section 7.8.3, page 17 (note that we read only one decimal digit
643 // if the value is 0). 626 // if the value is 0).
644 if (IsDecimalDigit(c0_) || scanner_constants_->IsIdentifierStart(c0_)) 627 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
645 return Token::ILLEGAL; 628 return Token::ILLEGAL;
646 629
647 literal.Complete(); 630 literal.Complete();
648 631
649 return Token::NUMBER; 632 return Token::NUMBER;
650 } 633 }
651 634
652 635
653 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { 636 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
654 Advance(); 637 Advance();
655 if (c0_ != 'u') return unibrow::Utf8::kBadChar; 638 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
656 Advance(); 639 Advance();
657 uc32 c = ScanHexEscape('u', 4); 640 uc32 c = ScanHexEscape('u', 4);
658 // We do not allow a unicode escape sequence to start another 641 // We do not allow a unicode escape sequence to start another
659 // unicode escape sequence. 642 // unicode escape sequence.
660 if (c == '\\') return unibrow::Utf8::kBadChar; 643 if (c == '\\') return unibrow::Utf8::kBadChar;
661 return c; 644 return c;
662 } 645 }
663 646
664 647
665 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { 648 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
666 ASSERT(scanner_constants_->IsIdentifierStart(c0_)); 649 ASSERT(unicode_cache_->IsIdentifierStart(c0_));
667 LiteralScope literal(this); 650 LiteralScope literal(this);
668 KeywordMatcher keyword_match; 651 KeywordMatcher keyword_match;
669 // Scan identifier start character. 652 // Scan identifier start character.
670 if (c0_ == '\\') { 653 if (c0_ == '\\') {
671 uc32 c = ScanIdentifierUnicodeEscape(); 654 uc32 c = ScanIdentifierUnicodeEscape();
672 // Only allow legal identifier start characters. 655 // Only allow legal identifier start characters.
673 if (!scanner_constants_->IsIdentifierStart(c)) return Token::ILLEGAL; 656 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
674 AddLiteralChar(c); 657 AddLiteralChar(c);
675 return ScanIdentifierSuffix(&literal); 658 return ScanIdentifierSuffix(&literal);
676 } 659 }
677 660
678 uc32 first_char = c0_; 661 uc32 first_char = c0_;
679 Advance(); 662 Advance();
680 AddLiteralChar(first_char); 663 AddLiteralChar(first_char);
681 if (!keyword_match.AddChar(first_char)) { 664 if (!keyword_match.AddChar(first_char)) {
682 return ScanIdentifierSuffix(&literal); 665 return ScanIdentifierSuffix(&literal);
683 } 666 }
684 667
685 // Scan the rest of the identifier characters. 668 // Scan the rest of the identifier characters.
686 while (scanner_constants_->IsIdentifierPart(c0_)) { 669 while (unicode_cache_->IsIdentifierPart(c0_)) {
687 if (c0_ != '\\') { 670 if (c0_ != '\\') {
688 uc32 next_char = c0_; 671 uc32 next_char = c0_;
689 Advance(); 672 Advance();
690 AddLiteralChar(next_char); 673 AddLiteralChar(next_char);
691 if (keyword_match.AddChar(next_char)) continue; 674 if (keyword_match.AddChar(next_char)) continue;
692 } 675 }
693 // Fallthrough if no longer able to complete keyword. 676 // Fallthrough if no longer able to complete keyword.
694 return ScanIdentifierSuffix(&literal); 677 return ScanIdentifierSuffix(&literal);
695 } 678 }
696 literal.Complete(); 679 literal.Complete();
697 680
698 return keyword_match.token(); 681 return keyword_match.token();
699 } 682 }
700 683
701 684
702 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { 685 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
703 // Scan the rest of the identifier characters. 686 // Scan the rest of the identifier characters.
704 while (scanner_constants_->IsIdentifierPart(c0_)) { 687 while (unicode_cache_->IsIdentifierPart(c0_)) {
705 if (c0_ == '\\') { 688 if (c0_ == '\\') {
706 uc32 c = ScanIdentifierUnicodeEscape(); 689 uc32 c = ScanIdentifierUnicodeEscape();
707 // Only allow legal identifier part characters. 690 // Only allow legal identifier part characters.
708 if (!scanner_constants_->IsIdentifierPart(c)) return Token::ILLEGAL; 691 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
709 AddLiteralChar(c); 692 AddLiteralChar(c);
710 } else { 693 } else {
711 AddLiteralChar(c0_); 694 AddLiteralChar(c0_);
712 Advance(); 695 Advance();
713 } 696 }
714 } 697 }
715 literal->Complete(); 698 literal->Complete();
716 699
717 return Token::IDENTIFIER; 700 return Token::IDENTIFIER;
718 } 701 }
719 702
720 703
721 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { 704 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
722 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 705 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
723 bool in_character_class = false; 706 bool in_character_class = false;
724 707
725 // Previous token is either '/' or '/=', in the second case, the 708 // Previous token is either '/' or '/=', in the second case, the
726 // pattern starts at =. 709 // pattern starts at =.
727 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 710 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
728 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 711 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
729 712
730 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 713 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
731 // the scanner should pass uninterpreted bodies to the RegExp 714 // the scanner should pass uninterpreted bodies to the RegExp
732 // constructor. 715 // constructor.
733 LiteralScope literal(this); 716 LiteralScope literal(this);
734 if (seen_equal) 717 if (seen_equal)
735 AddLiteralChar('='); 718 AddLiteralChar('=');
736 719
737 while (c0_ != '/' || in_character_class) { 720 while (c0_ != '/' || in_character_class) {
738 if (scanner_constants_->IsLineTerminator(c0_) || c0_ < 0) return false; 721 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
739 if (c0_ == '\\') { // Escape sequence. 722 if (c0_ == '\\') { // Escape sequence.
740 AddLiteralCharAdvance(); 723 AddLiteralCharAdvance();
741 if (scanner_constants_->IsLineTerminator(c0_) || c0_ < 0) return false; 724 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
742 AddLiteralCharAdvance(); 725 AddLiteralCharAdvance();
743 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 726 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
744 // only "safe" characters are allowed (letters, digits, underscore), 727 // only "safe" characters are allowed (letters, digits, underscore),
745 // otherwise the escape isn't valid and the invalid character has 728 // otherwise the escape isn't valid and the invalid character has
746 // its normal meaning. I.e., we can just continue scanning without 729 // its normal meaning. I.e., we can just continue scanning without
747 // worrying whether the following characters are part of the escape 730 // worrying whether the following characters are part of the escape
748 // or not, since any '/', '\\' or '[' is guaranteed to not be part 731 // or not, since any '/', '\\' or '[' is guaranteed to not be part
749 // of the escape sequence. 732 // of the escape sequence.
750 } else { // Unescaped character. 733 } else { // Unescaped character.
751 if (c0_ == '[') in_character_class = true; 734 if (c0_ == '[') in_character_class = true;
752 if (c0_ == ']') in_character_class = false; 735 if (c0_ == ']') in_character_class = false;
753 AddLiteralCharAdvance(); 736 AddLiteralCharAdvance();
754 } 737 }
755 } 738 }
756 Advance(); // consume '/' 739 Advance(); // consume '/'
757 740
758 literal.Complete(); 741 literal.Complete();
759 742
760 return true; 743 return true;
761 } 744 }
762 745
763 746
764 bool JavaScriptScanner::ScanRegExpFlags() { 747 bool JavaScriptScanner::ScanRegExpFlags() {
765 // Scan regular expression flags. 748 // Scan regular expression flags.
766 LiteralScope literal(this); 749 LiteralScope literal(this);
767 while (scanner_constants_->IsIdentifierPart(c0_)) { 750 while (unicode_cache_->IsIdentifierPart(c0_)) {
768 if (c0_ == '\\') { 751 if (c0_ == '\\') {
769 uc32 c = ScanIdentifierUnicodeEscape(); 752 uc32 c = ScanIdentifierUnicodeEscape();
770 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { 753 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
771 // We allow any escaped character, unlike the restriction on 754 // We allow any escaped character, unlike the restriction on
772 // IdentifierPart when it is used to build an IdentifierName. 755 // IdentifierPart when it is used to build an IdentifierName.
773 AddLiteralChar(c); 756 AddLiteralChar(c);
774 continue; 757 continue;
775 } 758 }
776 } 759 }
777 AddLiteralCharAdvance(); 760 AddLiteralCharAdvance();
(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after
955 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; 938 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
956 break; 939 break;
957 case UNMATCHABLE: 940 case UNMATCHABLE:
958 break; 941 break;
959 } 942 }
960 // On fallthrough, it's a failure. 943 // On fallthrough, it's a failure.
961 state_ = UNMATCHABLE; 944 state_ = UNMATCHABLE;
962 } 945 }
963 946
964 } } // namespace v8::internal 947 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698