src/scanner-base.cc - Issue 6824071: Cleanup of ScannerConstants, now named UnicodeCache.

Side by Side Diff: src/scanner-base.cc

Issue 6824071: Cleanup of ScannerConstants, now named UnicodeCache. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Addressed review comments. Created 9 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.	11 // with the distribution.

(...skipping 16 matching lines...) Expand all Loading...
28 // Features shared by parsing and pre-parsing scanners.	28 // Features shared by parsing and pre-parsing scanners.

29	29

30 #include "../include/v8stdint.h"	30 #include "../include/v8stdint.h"

31 #include "scanner-base.h"	31 #include "scanner-base.h"

32 #include "char-predicates-inl.h"	32 #include "char-predicates-inl.h"

33	33

34 namespace v8 {	34 namespace v8 {

35 namespace internal {	35 namespace internal {

36	36

37 // ----------------------------------------------------------------------------	37 // ----------------------------------------------------------------------------

38 // Compound predicates.

39

40 bool ScannerConstants::IsIdentifier(unibrow::CharacterStream* buffer) {

41 // Checks whether the buffer contains an identifier (no escape).

42 if (!buffer->has_more()) return false;

43 if (!kIsIdentifierStart.get(buffer->GetNext())) {

44 return false;

45 }

46 while (buffer->has_more()) {

47 if (!kIsIdentifierPart.get(buffer->GetNext())) {

48 return false;

49 }

50 }

51 return true;

52 }

53

54 // ----------------------------------------------------------------------------

55 // Scanner	38 // Scanner

56	39

57 Scanner::Scanner(ScannerConstants* scanner_constants)	40 Scanner::Scanner(UnicodeCache* unicode_cache)

58 : scanner_constants_(scanner_constants),	41 : unicode_cache_(unicode_cache),

59 octal_pos_(kNoOctalLocation) {	42 octal_pos_(kNoOctalLocation) { }

60 }

61	43

62	44

63 uc32 Scanner::ScanHexEscape(uc32 c, int length) {	45 uc32 Scanner::ScanHexEscape(uc32 c, int length) {

64 ASSERT(length <= 4); // prevent overflow	46 ASSERT(length <= 4); // prevent overflow

65	47

66 uc32 digits[4];	48 uc32 digits[4];

67 uc32 x = 0;	49 uc32 x = 0;

68 for (int i = 0; i < length; i++) {	50 for (int i = 0; i < length; i++) {

69 digits[i] = c0_;	51 digits[i] = c0_;

70 int d = HexValue(c0_);	52 int d = HexValue(c0_);

(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
107 if (c != '0' || i > 0) {	89 if (c != '0' || i > 0) {

108 octal_pos_ = source_pos() - i - 1; // Already advanced	90 octal_pos_ = source_pos() - i - 1; // Already advanced

109 }	91 }

110 return x;	92 return x;

111 }	93 }

112	94

113	95

114 // ----------------------------------------------------------------------------	96 // ----------------------------------------------------------------------------

115 // JavaScriptScanner	97 // JavaScriptScanner

116	98

117 JavaScriptScanner::JavaScriptScanner(ScannerConstants* scanner_contants)	99 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)

118 : Scanner(scanner_contants) { }	100 : Scanner(scanner_contants) { }

119	101

120	102

121 Token::Value JavaScriptScanner::Next() {	103 Token::Value JavaScriptScanner::Next() {

122 current_ = next_;	104 current_ = next_;

123 has_line_terminator_before_next_ = false;	105 has_line_terminator_before_next_ = false;

124 Scan();	106 Scan();

125 return current_.token;	107 return current_.token;

126 }	108 }

127	109

128	110

129 static inline bool IsByteOrderMark(uc32 c) {	111 static inline bool IsByteOrderMark(uc32 c) {

130 // The Unicode value U+FFFE is guaranteed never to be assigned as a	112 // The Unicode value U+FFFE is guaranteed never to be assigned as a

131 // Unicode character; this implies that in a Unicode context the	113 // Unicode character; this implies that in a Unicode context the

132 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF	114 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

133 // character expressed in little-endian byte order (since it could	115 // character expressed in little-endian byte order (since it could

134 // not be a U+FFFE character expressed in big-endian byte	116 // not be a U+FFFE character expressed in big-endian byte

135 // order). Nevertheless, we check for it to be compatible with	117 // order). Nevertheless, we check for it to be compatible with

136 // Spidermonkey.	118 // Spidermonkey.

137 return c == 0xFEFF || c == 0xFFFE;	119 return c == 0xFEFF || c == 0xFFFE;

138 }	120 }

139	121

140	122

141 bool JavaScriptScanner::SkipWhiteSpace() {	123 bool JavaScriptScanner::SkipWhiteSpace() {

142 int start_position = source_pos();	124 int start_position = source_pos();

143	125

144 while (true) {	126 while (true) {

145 // We treat byte-order marks (BOMs) as whitespace for better	127 // We treat byte-order marks (BOMs) as whitespace for better

146 // compatibility with Spidermonkey and other JavaScript engines.	128 // compatibility with Spidermonkey and other JavaScript engines.

147 while (scanner_constants_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {	129 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {

148 // IsWhiteSpace() includes line terminators!	130 // IsWhiteSpace() includes line terminators!

149 if (scanner_constants_->IsLineTerminator(c0_)) {	131 if (unicode_cache_->IsLineTerminator(c0_)) {

150 // Ignore line terminators, but remember them. This is necessary	132 // Ignore line terminators, but remember them. This is necessary

151 // for automatic semicolon insertion.	133 // for automatic semicolon insertion.

152 has_line_terminator_before_next_ = true;	134 has_line_terminator_before_next_ = true;

153 }	135 }

154 Advance();	136 Advance();

155 }	137 }

156	138

157 // If there is an HTML comment end '-->' at the beginning of a	139 // If there is an HTML comment end '-->' at the beginning of a

158 // line (with only whitespace in front of it), we treat the rest	140 // line (with only whitespace in front of it), we treat the rest

159 // of the line as a comment. This is in line with the way	141 // of the line as a comment. This is in line with the way

(...skipping 19 matching lines...) Expand all Loading...
179	161

180	162

181 Token::Value JavaScriptScanner::SkipSingleLineComment() {	163 Token::Value JavaScriptScanner::SkipSingleLineComment() {

182 Advance();	164 Advance();

183	165

184 // The line terminator at the end of the line is not considered	166 // The line terminator at the end of the line is not considered

185 // to be part of the single-line comment; it is recognized	167 // to be part of the single-line comment; it is recognized

186 // separately by the lexical grammar and becomes part of the	168 // separately by the lexical grammar and becomes part of the

187 // stream of input elements for the syntactic grammar (see	169 // stream of input elements for the syntactic grammar (see

188 // ECMA-262, section 7.4, page 12).	170 // ECMA-262, section 7.4, page 12).

189 while (c0_ >= 0 && !scanner_constants_->IsLineTerminator(c0_)) {	171 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {

190 Advance();	172 Advance();

191 }	173 }

192	174

193 return Token::WHITESPACE;	175 return Token::WHITESPACE;

194 }	176 }

195	177

196	178

197 Token::Value JavaScriptScanner::SkipMultiLineComment() {	179 Token::Value JavaScriptScanner::SkipMultiLineComment() {

198 ASSERT(c0_ == '*');	180 ASSERT(c0_ == '*');

199 Advance();	181 Advance();

(...skipping 244 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
444	426

445 case '?':	427 case '?':

446 token = Select(Token::CONDITIONAL);	428 token = Select(Token::CONDITIONAL);

447 break;	429 break;

448	430

449 case '~':	431 case '~':

450 token = Select(Token::BIT_NOT);	432 token = Select(Token::BIT_NOT);

451 break;	433 break;

452	434

453 default:	435 default:

454 if (scanner_constants_->IsIdentifierStart(c0_)) {	436 if (unicode_cache_->IsIdentifierStart(c0_)) {

455 token = ScanIdentifierOrKeyword();	437 token = ScanIdentifierOrKeyword();

456 } else if (IsDecimalDigit(c0_)) {	438 } else if (IsDecimalDigit(c0_)) {

457 token = ScanNumber(false);	439 token = ScanNumber(false);

458 } else if (SkipWhiteSpace()) {	440 } else if (SkipWhiteSpace()) {

459 token = Token::WHITESPACE;	441 token = Token::WHITESPACE;

460 } else if (c0_ < 0) {	442 } else if (c0_ < 0) {

461 token = Token::EOS;	443 token = Token::EOS;

462 } else {	444 } else {

463 token = Select(Token::ILLEGAL);	445 token = Select(Token::ILLEGAL);

464 }	446 }

(...skipping 27 matching lines...) Expand all Loading...
492 }	474 }

493 Scan();	475 Scan();

494 }	476 }

495	477

496	478

497 void JavaScriptScanner::ScanEscape() {	479 void JavaScriptScanner::ScanEscape() {

498 uc32 c = c0_;	480 uc32 c = c0_;

499 Advance();	481 Advance();

500	482

501 // Skip escaped newlines.	483 // Skip escaped newlines.

502 if (scanner_constants_->IsLineTerminator(c)) {	484 if (unicode_cache_->IsLineTerminator(c)) {

503 // Allow CR+LF newlines in multiline string literals.	485 // Allow CR+LF newlines in multiline string literals.

504 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();	486 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();

505 // Allow LF+CR newlines in multiline string literals.	487 // Allow LF+CR newlines in multiline string literals.

506 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();	488 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();

507 return;	489 return;

508 }	490 }

509	491

510 switch (c) {	492 switch (c) {

511 case '\'': // fall through	493 case '\'': // fall through

512 case '"' : // fall through	494 case '"' : // fall through

(...skipping 22 matching lines...) Expand all Loading...
535 AddLiteralChar(c);	517 AddLiteralChar(c);

536 }	518 }

537	519

538	520

539 Token::Value JavaScriptScanner::ScanString() {	521 Token::Value JavaScriptScanner::ScanString() {

540 uc32 quote = c0_;	522 uc32 quote = c0_;

541 Advance(); // consume quote	523 Advance(); // consume quote

542	524

543 LiteralScope literal(this);	525 LiteralScope literal(this);

544 while (c0_ != quote && c0_ >= 0	526 while (c0_ != quote && c0_ >= 0

545 && !scanner_constants_->IsLineTerminator(c0_)) {	527 && !unicode_cache_->IsLineTerminator(c0_)) {

546 uc32 c = c0_;	528 uc32 c = c0_;

547 Advance();	529 Advance();

548 if (c == '\\') {	530 if (c == '\\') {

549 if (c0_ < 0) return Token::ILLEGAL;	531 if (c0_ < 0) return Token::ILLEGAL;

550 ScanEscape();	532 ScanEscape();

551 } else {	533 } else {

552 AddLiteralChar(c);	534 AddLiteralChar(c);

553 }	535 }

554 }	536 }

555 if (c0_ != quote) return Token::ILLEGAL;	537 if (c0_ != quote) return Token::ILLEGAL;

(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
634 // we must have at least one decimal digit after 'e'/'E'	616 // we must have at least one decimal digit after 'e'/'E'

635 return Token::ILLEGAL;	617 return Token::ILLEGAL;

636 }	618 }

637 ScanDecimalDigits();	619 ScanDecimalDigits();

638 }	620 }

639	621

640 // The source character immediately following a numeric literal must	622 // The source character immediately following a numeric literal must

641 // not be an identifier start or a decimal digit; see ECMA-262	623 // not be an identifier start or a decimal digit; see ECMA-262

642 // section 7.8.3, page 17 (note that we read only one decimal digit	624 // section 7.8.3, page 17 (note that we read only one decimal digit

643 // if the value is 0).	625 // if the value is 0).

644 if (IsDecimalDigit(c0_) || scanner_constants_->IsIdentifierStart(c0_))	626 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))

645 return Token::ILLEGAL;	627 return Token::ILLEGAL;

646	628

647 literal.Complete();	629 literal.Complete();

648	630

649 return Token::NUMBER;	631 return Token::NUMBER;

650 }	632 }

651	633

652	634

653 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {	635 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {

654 Advance();	636 Advance();

655 if (c0_ != 'u') return unibrow::Utf8::kBadChar;	637 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

656 Advance();	638 Advance();

657 uc32 c = ScanHexEscape('u', 4);	639 uc32 c = ScanHexEscape('u', 4);

658 // We do not allow a unicode escape sequence to start another	640 // We do not allow a unicode escape sequence to start another

659 // unicode escape sequence.	641 // unicode escape sequence.

660 if (c == '\\') return unibrow::Utf8::kBadChar;	642 if (c == '\\') return unibrow::Utf8::kBadChar;

661 return c;	643 return c;

662 }	644 }

663	645

664	646

665 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {	647 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

666 ASSERT(scanner_constants_->IsIdentifierStart(c0_));	648 ASSERT(unicode_cache_->IsIdentifierStart(c0_));

667 LiteralScope literal(this);	649 LiteralScope literal(this);

668 KeywordMatcher keyword_match;	650 KeywordMatcher keyword_match;

669 // Scan identifier start character.	651 // Scan identifier start character.

670 if (c0_ == '\\') {	652 if (c0_ == '\\') {

671 uc32 c = ScanIdentifierUnicodeEscape();	653 uc32 c = ScanIdentifierUnicodeEscape();

672 // Only allow legal identifier start characters.	654 // Only allow legal identifier start characters.

673 if (!scanner_constants_->IsIdentifierStart(c)) return Token::ILLEGAL;	655 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;

674 AddLiteralChar(c);	656 AddLiteralChar(c);

675 return ScanIdentifierSuffix(&literal);	657 return ScanIdentifierSuffix(&literal);

676 }	658 }

677	659

678 uc32 first_char = c0_;	660 uc32 first_char = c0_;

679 Advance();	661 Advance();

680 AddLiteralChar(first_char);	662 AddLiteralChar(first_char);

681 if (!keyword_match.AddChar(first_char)) {	663 if (!keyword_match.AddChar(first_char)) {

682 return ScanIdentifierSuffix(&literal);	664 return ScanIdentifierSuffix(&literal);

683 }	665 }

684	666

685 // Scan the rest of the identifier characters.	667 // Scan the rest of the identifier characters.

686 while (scanner_constants_->IsIdentifierPart(c0_)) {	668 while (unicode_cache_->IsIdentifierPart(c0_)) {

687 if (c0_ != '\\') {	669 if (c0_ != '\\') {

688 uc32 next_char = c0_;	670 uc32 next_char = c0_;

689 Advance();	671 Advance();

690 AddLiteralChar(next_char);	672 AddLiteralChar(next_char);

691 if (keyword_match.AddChar(next_char)) continue;	673 if (keyword_match.AddChar(next_char)) continue;

692 }	674 }

693 // Fallthrough if no longer able to complete keyword.	675 // Fallthrough if no longer able to complete keyword.

694 return ScanIdentifierSuffix(&literal);	676 return ScanIdentifierSuffix(&literal);

695 }	677 }

696 literal.Complete();	678 literal.Complete();

697	679

698 return keyword_match.token();	680 return keyword_match.token();

699 }	681 }

700	682

701	683

702 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {	684 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

703 // Scan the rest of the identifier characters.	685 // Scan the rest of the identifier characters.

704 while (scanner_constants_->IsIdentifierPart(c0_)) {	686 while (unicode_cache_->IsIdentifierPart(c0_)) {

705 if (c0_ == '\\') {	687 if (c0_ == '\\') {

706 uc32 c = ScanIdentifierUnicodeEscape();	688 uc32 c = ScanIdentifierUnicodeEscape();

707 // Only allow legal identifier part characters.	689 // Only allow legal identifier part characters.

708 if (!scanner_constants_->IsIdentifierPart(c)) return Token::ILLEGAL;	690 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;

709 AddLiteralChar(c);	691 AddLiteralChar(c);

710 } else {	692 } else {

711 AddLiteralChar(c0_);	693 AddLiteralChar(c0_);

712 Advance();	694 Advance();

713 }	695 }

714 }	696 }

715 literal->Complete();	697 literal->Complete();

716	698

717 return Token::IDENTIFIER;	699 return Token::IDENTIFIER;

718 }	700 }

719	701

720	702

721 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {	703 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

722 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags	704 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags

723 bool in_character_class = false;	705 bool in_character_class = false;

724	706

725 // Previous token is either '/' or '/=', in the second case, the	707 // Previous token is either '/' or '/=', in the second case, the

726 // pattern starts at =.	708 // pattern starts at =.

727 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);	709 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

728 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);	710 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

729	711

730 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,	712 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

731 // the scanner should pass uninterpreted bodies to the RegExp	713 // the scanner should pass uninterpreted bodies to the RegExp

732 // constructor.	714 // constructor.

733 LiteralScope literal(this);	715 LiteralScope literal(this);

734 if (seen_equal)	716 if (seen_equal)

735 AddLiteralChar('=');	717 AddLiteralChar('=');

736	718

737 while (c0_ != '/' || in_character_class) {	719 while (c0_ != '/' || in_character_class) {

738 if (scanner_constants_->IsLineTerminator(c0_) || c0_ < 0) return false;	720 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;

739 if (c0_ == '\\') { // Escape sequence.	721 if (c0_ == '\\') { // Escape sequence.

740 AddLiteralCharAdvance();	722 AddLiteralCharAdvance();

741 if (scanner_constants_->IsLineTerminator(c0_) || c0_ < 0) return false;	723 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;

742 AddLiteralCharAdvance();	724 AddLiteralCharAdvance();

743 // If the escape allows more characters, i.e., \x??, \u????, or \c?,	725 // If the escape allows more characters, i.e., \x??, \u????, or \c?,

744 // only "safe" characters are allowed (letters, digits, underscore),	726 // only "safe" characters are allowed (letters, digits, underscore),

745 // otherwise the escape isn't valid and the invalid character has	727 // otherwise the escape isn't valid and the invalid character has

746 // its normal meaning. I.e., we can just continue scanning without	728 // its normal meaning. I.e., we can just continue scanning without

747 // worrying whether the following characters are part of the escape	729 // worrying whether the following characters are part of the escape

748 // or not, since any '/', '\\' or '[' is guaranteed to not be part	730 // or not, since any '/', '\\' or '[' is guaranteed to not be part

749 // of the escape sequence.	731 // of the escape sequence.

750 } else { // Unescaped character.	732 } else { // Unescaped character.

751 if (c0_ == '[') in_character_class = true;	733 if (c0_ == '[') in_character_class = true;

752 if (c0_ == ']') in_character_class = false;	734 if (c0_ == ']') in_character_class = false;

753 AddLiteralCharAdvance();	735 AddLiteralCharAdvance();

754 }	736 }

755 }	737 }

756 Advance(); // consume '/'	738 Advance(); // consume '/'

757	739

758 literal.Complete();	740 literal.Complete();

759	741

760 return true;	742 return true;

761 }	743 }

762	744

763	745

764 bool JavaScriptScanner::ScanRegExpFlags() {	746 bool JavaScriptScanner::ScanRegExpFlags() {

765 // Scan regular expression flags.	747 // Scan regular expression flags.

766 LiteralScope literal(this);	748 LiteralScope literal(this);

767 while (scanner_constants_->IsIdentifierPart(c0_)) {	749 while (unicode_cache_->IsIdentifierPart(c0_)) {

768 if (c0_ == '\\') {	750 if (c0_ == '\\') {

769 uc32 c = ScanIdentifierUnicodeEscape();	751 uc32 c = ScanIdentifierUnicodeEscape();

770 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {	752 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

771 // We allow any escaped character, unlike the restriction on	753 // We allow any escaped character, unlike the restriction on

772 // IdentifierPart when it is used to build an IdentifierName.	754 // IdentifierPart when it is used to build an IdentifierName.

773 AddLiteralChar(c);	755 AddLiteralChar(c);

774 continue;	756 continue;

775 }	757 }

776 }	758 }

777 AddLiteralCharAdvance();	759 AddLiteralCharAdvance();

(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
955 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;	937 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;

956 break;	938 break;

957 case UNMATCHABLE:	939 case UNMATCHABLE:

958 break;	940 break;

959 }	941 }

960 // On fallthrough, it's a failure.	942 // On fallthrough, it's a failure.

961 state_ = UNMATCHABLE;	943 state_ = UNMATCHABLE;

962 }	944 }

963	945

964 } } // namespace v8::internal	946 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-base.h ('k') | test/cctest/test-conversions.cc » ('j') | no next file with comments »