src/scanner-base.cc - Issue 7558017: Simpler (and a bit faster) keyword matcher

Side by Side Diff: src/scanner-base.cc

Issue 7558017: Simpler (and a bit faster) keyword matcher (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Review fixes Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 644 matching lines...) Expand 10 before | Expand all | Expand 10 after
655 if (c0_ != 'u') return unibrow::Utf8::kBadChar;	655 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

656 Advance();	656 Advance();

657 uc32 c = ScanHexEscape('u', 4);	657 uc32 c = ScanHexEscape('u', 4);

658 // We do not allow a unicode escape sequence to start another	658 // We do not allow a unicode escape sequence to start another

659 // unicode escape sequence.	659 // unicode escape sequence.

660 if (c == '\\') return unibrow::Utf8::kBadChar;	660 if (c == '\\') return unibrow::Utf8::kBadChar;

661 return c;	661 return c;

662 }	662 }

663	663

664	664

	665 // ----------------------------------------------------------------------------

	666 // Keyword Matcher

	667

	668 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \

	669 KEYWORD_GROUP('b') \

	670 KEYWORD("break", BREAK) \

	671 KEYWORD_GROUP('c') \

	672 KEYWORD("case", CASE) \

	673 KEYWORD("catch", CATCH) \

	674 KEYWORD("class", FUTURE_RESERVED_WORD) \

	675 KEYWORD("const", CONST) \

	676 KEYWORD("continue", CONTINUE) \

	677 KEYWORD_GROUP('d') \

	678 KEYWORD("debugger", DEBUGGER) \

	679 KEYWORD("default", DEFAULT) \

	680 KEYWORD("delete", DELETE) \

	681 KEYWORD("do", DO) \

	682 KEYWORD_GROUP('e') \

	683 KEYWORD("else", ELSE) \

	684 KEYWORD("enum", FUTURE_RESERVED_WORD) \

	685 KEYWORD("export", FUTURE_RESERVED_WORD) \

	686 KEYWORD("extends", FUTURE_RESERVED_WORD) \

	687 KEYWORD_GROUP('f') \

	688 KEYWORD("false", FALSE_LITERAL) \

	689 KEYWORD("finally", FINALLY) \

	690 KEYWORD("for", FOR) \

	691 KEYWORD("function", FUNCTION) \

	692 KEYWORD_GROUP('i') \

	693 KEYWORD("if", IF) \

	694 KEYWORD("implements", FUTURE_STRICT_RESERVED_WORD) \

	695 KEYWORD("import", FUTURE_RESERVED_WORD) \

	696 KEYWORD("in", IN) \

	697 KEYWORD("instanceof", INSTANCEOF) \

	698 KEYWORD("interface", FUTURE_STRICT_RESERVED_WORD) \

	699 KEYWORD_GROUP('l') \

	700 KEYWORD("let", FUTURE_STRICT_RESERVED_WORD) \

	701 KEYWORD_GROUP('n') \

	702 KEYWORD("new", NEW) \

	703 KEYWORD("null", NULL_LITERAL) \

	704 KEYWORD_GROUP('p') \

	705 KEYWORD("package", FUTURE_STRICT_RESERVED_WORD) \

	706 KEYWORD("private", FUTURE_STRICT_RESERVED_WORD) \

	707 KEYWORD("protected", FUTURE_STRICT_RESERVED_WORD) \

	708 KEYWORD("public", FUTURE_STRICT_RESERVED_WORD) \

	709 KEYWORD_GROUP('r') \

	710 KEYWORD("return", RETURN) \

	711 KEYWORD_GROUP('s') \

	712 KEYWORD("static", FUTURE_STRICT_RESERVED_WORD) \

	713 KEYWORD("super", FUTURE_RESERVED_WORD) \

	714 KEYWORD("switch", SWITCH) \

	715 KEYWORD_GROUP('t') \

	716 KEYWORD("this", THIS) \

	717 KEYWORD("throw", THROW) \

	718 KEYWORD("true", TRUE_LITERAL) \

	719 KEYWORD("try", TRY) \

	720 KEYWORD("typeof", TYPEOF) \

	721 KEYWORD_GROUP('v') \

	722 KEYWORD("var", VAR) \

	723 KEYWORD("void", VOID) \

	724 KEYWORD_GROUP('w') \

	725 KEYWORD("while", WHILE) \

	726 KEYWORD("with", WITH) \

	727 KEYWORD_GROUP('y') \

	728 KEYWORD("yield", FUTURE_STRICT_RESERVED_WORD)

	729

	730

	731 static Token::Value KeywordOrIdentifierToken(const char* input,

	732 int input_length) {

	733 ASSERT(input_length >= 1);

	734 const int kMinLength = 2;

	735 const int kMaxLength = 10;

	736 if (input_length < kMinLength || input_length > kMaxLength) {

	737 return Token::IDENTIFIER;

	738 }

	739 switch (input[0]) {

	740 default:

	741 #define KEYWORD_GROUP_CASE(ch) \

	742 break; \

	743 case ch:

	744 #define KEYWORD(keyword, token) \

	745 { \

	746 /* 'keyword' is a char array, so sizeof(keyword) is */ \

	747 /* strlen(keyword) plus 1 for the NUL char. */ \

	748 const int keyword_length = sizeof(keyword) - 1; \

	749 STATIC_ASSERT(keyword_length >= kMinLength); \

	750 STATIC_ASSERT(keyword_length <= kMaxLength); \

	751 if (input_length == keyword_length && \

	752 input[1] == keyword[1] && \

	753 (keyword_length <= 2 || input[2] == keyword[2]) && \

	754 (keyword_length <= 3 || input[3] == keyword[3]) && \

	755 (keyword_length <= 4 || input[4] == keyword[4]) && \

	756 (keyword_length <= 5 || input[5] == keyword[5]) && \

	757 (keyword_length <= 6 || input[6] == keyword[6]) && \

	758 (keyword_length <= 7 || input[7] == keyword[7]) && \

	759 (keyword_length <= 8 || input[8] == keyword[8]) && \

	760 (keyword_length <= 9 || input[9] == keyword[9])) { \

	761 return Token::token; \

	762 } \

	763 }

	764 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)

	765 }

	766 return Token::IDENTIFIER;

	767 }

	768

	769

665 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {	770 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

666 ASSERT(unicode_cache_->IsIdentifierStart(c0_));	771 ASSERT(unicode_cache_->IsIdentifierStart(c0_));

667 LiteralScope literal(this);	772 LiteralScope literal(this);

668 KeywordMatcher keyword_match;

669 // Scan identifier start character.	773 // Scan identifier start character.

670 if (c0_ == '\\') {	774 if (c0_ == '\\') {

671 uc32 c = ScanIdentifierUnicodeEscape();	775 uc32 c = ScanIdentifierUnicodeEscape();

672 // Only allow legal identifier start characters.	776 // Only allow legal identifier start characters.

673 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;	777 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;

674 AddLiteralChar(c);	778 AddLiteralChar(c);

675 return ScanIdentifierSuffix(&literal);	779 return ScanIdentifierSuffix(&literal);

676 }	780 }

677	781

678 uc32 first_char = c0_;	782 uc32 first_char = c0_;

679 Advance();	783 Advance();

680 AddLiteralChar(first_char);	784 AddLiteralChar(first_char);

681 if (!keyword_match.AddChar(first_char)) {

682 return ScanIdentifierSuffix(&literal);

683 }

684	785

685 // Scan the rest of the identifier characters.	786 // Scan the rest of the identifier characters.

686 while (unicode_cache_->IsIdentifierPart(c0_)) {	787 while (unicode_cache_->IsIdentifierPart(c0_)) {

687 if (c0_ != '\\') {	788 if (c0_ != '\\') {

688 uc32 next_char = c0_;	789 uc32 next_char = c0_;

689 Advance();	790 Advance();

690 AddLiteralChar(next_char);	791 AddLiteralChar(next_char);

691 if (keyword_match.AddChar(next_char)) continue;	792 continue;

692 }	793 }

693 // Fallthrough if no loner able to complete keyword.	794 // Fallthrough if no longer able to complete keyword.

694 return ScanIdentifierSuffix(&literal);	795 return ScanIdentifierSuffix(&literal);

695 }	796 }

	797

696 literal.Complete();	798 literal.Complete();

697	799

698 return keyword_match.token();	800 if (next_.literal_chars->is_ascii()) {

	801 Vector<const char> chars = next_.literal_chars->ascii_literal();

	802 return KeywordOrIdentifierToken(chars.start(), chars.length());

	803 }

	804

	805 return Token::IDENTIFIER;

699 }	806 }

700	807

701	808

702 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {	809 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

703 // Scan the rest of the identifier characters.	810 // Scan the rest of the identifier characters.

704 while (unicode_cache_->IsIdentifierPart(c0_)) {	811 while (unicode_cache_->IsIdentifierPart(c0_)) {

705 if (c0_ == '\\') {	812 if (c0_ == '\\') {

706 uc32 c = ScanIdentifierUnicodeEscape();	813 uc32 c = ScanIdentifierUnicodeEscape();

707 // Only allow legal identifier part characters.	814 // Only allow legal identifier part characters.

708 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;	815 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;

(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
778 }	885 }

779 }	886 }

780 AddLiteralCharAdvance();	887 AddLiteralCharAdvance();

781 }	888 }

782 literal.Complete();	889 literal.Complete();

783	890

784 next_.location.end_pos = source_pos() - 1;	891 next_.location.end_pos = source_pos() - 1;

785 return true;	892 return true;

786 }	893 }

787	894

788 // ----------------------------------------------------------------------------

789 // Keyword Matcher

790

791 const KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {

792 { "break", KEYWORD_PREFIX, Token::BREAK },

793 { NULL, C, Token::ILLEGAL },

794 { NULL, D, Token::ILLEGAL },

795 { NULL, E, Token::ILLEGAL },

796 { NULL, F, Token::ILLEGAL },

797 { NULL, UNMATCHABLE, Token::ILLEGAL },

798 { NULL, UNMATCHABLE, Token::ILLEGAL },

799 { NULL, I, Token::ILLEGAL },

800 { NULL, UNMATCHABLE, Token::ILLEGAL },

801 { NULL, UNMATCHABLE, Token::ILLEGAL },

802 { "let", KEYWORD_PREFIX, Token::FUTURE_STRICT_RESERVED_WORD },

803 { NULL, UNMATCHABLE, Token::ILLEGAL },

804 { NULL, N, Token::ILLEGAL },

805 { NULL, UNMATCHABLE, Token::ILLEGAL },

806 { NULL, P, Token::ILLEGAL },

807 { NULL, UNMATCHABLE, Token::ILLEGAL },

808 { "return", KEYWORD_PREFIX, Token::RETURN },

809 { NULL, S, Token::ILLEGAL },

810 { NULL, T, Token::ILLEGAL },

811 { NULL, UNMATCHABLE, Token::ILLEGAL },

812 { NULL, V, Token::ILLEGAL },

813 { NULL, W, Token::ILLEGAL },

814 { NULL, UNMATCHABLE, Token::ILLEGAL },

815 { "yield", KEYWORD_PREFIX, Token::FUTURE_STRICT_RESERVED_WORD }

816 };

817

818

819 void KeywordMatcher::Step(unibrow::uchar input) {

820 switch (state_) {

821 case INITIAL: {

822 // matching the first character is the only state with significant fanout.

823 // Match only lower-case letters in range 'b'..'y'.

824 unsigned int offset = input - kFirstCharRangeMin;

825 if (offset < kFirstCharRangeLength) {

826 state_ = first_states_[offset].state;

827 if (state_ == KEYWORD_PREFIX) {

828 keyword_ = first_states_[offset].keyword;

829 counter_ = 1;

830 keyword_token_ = first_states_[offset].token;

831 }

832 return;

833 }

834 break;

835 }

836 case KEYWORD_PREFIX:

837 if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {

838 counter_++;

839 if (keyword_[counter_] == '\0') {

840 state_ = KEYWORD_MATCHED;

841 token_ = keyword_token_;

842 }

843 return;

844 }

845 break;

846 case KEYWORD_MATCHED:

847 token_ = Token::IDENTIFIER;

848 break;

849 case C:

850 if (MatchState(input, 'a', CA)) return;

851 if (MatchKeywordStart(input, "class", 1,

852 Token::FUTURE_RESERVED_WORD)) return;

853 if (MatchState(input, 'o', CO)) return;

854 break;

855 case CA:

856 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;

857 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;

858 break;

859 case CO:

860 if (MatchState(input, 'n', CON)) return;

861 break;

862 case CON:

863 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;

864 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;

865 break;

866 case D:

867 if (MatchState(input, 'e', DE)) return;

868 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;

869 break;

870 case DE:

871 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;

872 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;

873 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;

874 break;

875 case E:

876 if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;

877 if (MatchKeywordStart(input, "enum", 1,

878 Token::FUTURE_RESERVED_WORD)) return;

879 if (MatchState(input, 'x', EX)) return;

880 break;

881 case EX:

882 if (MatchKeywordStart(input, "export", 2,

883 Token::FUTURE_RESERVED_WORD)) return;

884 if (MatchKeywordStart(input, "extends", 2,

885 Token::FUTURE_RESERVED_WORD)) return;

886 break;

887 case F:

888 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;

889 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;

890 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;

891 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;

892 break;

893 case I:

894 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;

895 if (MatchState(input, 'm', IM)) return;

896 if (MatchKeyword(input, 'n', IN, Token::IN)) return;

897 break;

898 case IM:

899 if (MatchState(input, 'p', IMP)) return;

900 break;

901 case IMP:

902 if (MatchKeywordStart(input, "implements", 3,

903 Token::FUTURE_STRICT_RESERVED_WORD )) return;

904 if (MatchKeywordStart(input, "import", 3,

905 Token::FUTURE_RESERVED_WORD)) return;

906 break;

907 case IN:

908 token_ = Token::IDENTIFIER;

909 if (MatchKeywordStart(input, "interface", 2,

910 Token::FUTURE_STRICT_RESERVED_WORD)) return;

911 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;

912 break;

913 case N:

914 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;

915 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;

916 break;

917 case P:

918 if (MatchKeywordStart(input, "package", 1,

919 Token::FUTURE_STRICT_RESERVED_WORD)) return;

920 if (MatchState(input, 'r', PR)) return;

921 if (MatchKeywordStart(input, "public", 1,

922 Token::FUTURE_STRICT_RESERVED_WORD)) return;

923 break;

924 case PR:

925 if (MatchKeywordStart(input, "private", 2,

926 Token::FUTURE_STRICT_RESERVED_WORD)) return;

927 if (MatchKeywordStart(input, "protected", 2,

928 Token::FUTURE_STRICT_RESERVED_WORD)) return;

929 break;

930 case S:

931 if (MatchKeywordStart(input, "static", 1,

932 Token::FUTURE_STRICT_RESERVED_WORD)) return;

933 if (MatchKeywordStart(input, "super", 1,

934 Token::FUTURE_RESERVED_WORD)) return;

935 if (MatchKeywordStart(input, "switch", 1,

936 Token::SWITCH)) return;

937 break;

938 case T:

939 if (MatchState(input, 'h', TH)) return;

940 if (MatchState(input, 'r', TR)) return;

941 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;

942 break;

943 case TH:

944 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;

945 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;

946 break;

947 case TR:

948 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;

949 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;

950 break;

951 case V:

952 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;

953 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;

954 break;

955 case W:

956 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;

957 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;

958 break;

959 case UNMATCHABLE:

960 break;

961 }

962 // On fallthrough, it's a failure.

963 state_ = UNMATCHABLE;

964 }

965

966 } } // namespace v8::internal	895 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-base.h ('k') | src/token.h » ('j') | no next file with comments »