src/scanner-base.cc - Issue 7677012: Make scanner handle invalid unicode escapes in identifiers correctly.

Side by Side Diff: src/scanner-base.cc

Issue 7677012: Make scanner handle invalid unicode escapes in identifiers correctly. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 23 matching lines...) Expand all Loading...
34 namespace v8 {	34 namespace v8 {

35 namespace internal {	35 namespace internal {

36	36

37 // ----------------------------------------------------------------------------	37 // ----------------------------------------------------------------------------

38 // Scanner	38 // Scanner

39	39

40 Scanner::Scanner(UnicodeCache* unicode_cache)	40 Scanner::Scanner(UnicodeCache* unicode_cache)

41 : unicode_cache_(unicode_cache) { }	41 : unicode_cache_(unicode_cache) { }

42	42

43	43

44 uc32 Scanner::ScanHexEscape(uc32 c, int length) {	44 uc32 Scanner::ScanHexNumber(int expected_length) {

45 ASSERT(length <= 4); // prevent overflow	45 ASSERT(expected_length <= 4); // prevent overflow

46	46

47 uc32 digits[4];	47 uc32 digits[4] = { 0, 0, 0, 0 };

48 uc32 x = 0;	48 uc32 x = 0;

49 for (int i = 0; i < length; i++) {	49 for (int i = 0; i < expected_length; i++) {

50 digits[i] = c0_;	50 digits[i] = c0_;

51 int d = HexValue(c0_);	51 int d = HexValue(c0_);

52 if (d < 0) {	52 if (d < 0) {

53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes	53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

54 // should be illegal, but other JS VMs just return the	54 // should be illegal, but other JS VMs just return the

55 // non-escaped version of the original character.	55 // non-escaped version of the original character.

56	56

57 // Push back digits read, except the last one (in c0_).	57 // Push back digits that we have advanced past.

58 for (int j = i-1; j >= 0; j--) {	58 for (int j = i-1; j >= 0; j--) {

59 PushBack(digits[j]);	59 PushBack(digits[j]);

60 }	60 }

61 // Notice: No handling of error - treat it as "\u"->"u".	61 return -1;

62 return c;

63 }	62 }

64 x = x * 16 + d;	63 x = x * 16 + d;

65 Advance();	64 Advance();

66 }	65 }

67	66

68 return x;	67 return x;

69 }	68 }

70	69

71	70

72	71

(...skipping 560 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
633	632

634 switch (c) {	633 switch (c) {

635 case '\'': // fall through	634 case '\'': // fall through

636 case '"' : // fall through	635 case '"' : // fall through

637 case '\\': break;	636 case '\\': break;

638 case 'b' : c = '\b'; break;	637 case 'b' : c = '\b'; break;

639 case 'f' : c = '\f'; break;	638 case 'f' : c = '\f'; break;

640 case 'n' : c = '\n'; break;	639 case 'n' : c = '\n'; break;

641 case 'r' : c = '\r'; break;	640 case 'r' : c = '\r'; break;

642 case 't' : c = '\t'; break;	641 case 't' : c = '\t'; break;

643 case 'u' : c = ScanHexEscape(c, 4); break;	642 case 'u' : {

	643 c = ScanHexNumber(4);

	644 if (c < 0) c = 'u';

	645 break;

	646 }

644 case 'v' : c = '\v'; break;	647 case 'v' : c = '\v'; break;

645 case 'x' : c = ScanHexEscape(c, 2); break;	648 case 'x' : {

	649 c = ScanHexNumber(2);

	650 if (c < 0) c = 'x';

	651 break;

	652 }

646 case '0' : // fall through	653 case '0' : // fall through

647 case '1' : // fall through	654 case '1' : // fall through

648 case '2' : // fall through	655 case '2' : // fall through

649 case '3' : // fall through	656 case '3' : // fall through

650 case '4' : // fall through	657 case '4' : // fall through

651 case '5' : // fall through	658 case '5' : // fall through

652 case '6' : // fall through	659 case '6' : // fall through

653 case '7' : c = ScanOctalEscape(c, 2); break;	660 case '7' : c = ScanOctalEscape(c, 2); break;

654 }	661 }

655	662

(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
795 return Token::ILLEGAL;	802 return Token::ILLEGAL;

796	803

797 literal.Complete();	804 literal.Complete();

798	805

799 return Token::NUMBER;	806 return Token::NUMBER;

800 }	807 }

801	808

802	809

803 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {	810 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {

804 Advance();	811 Advance();

805 if (c0_ != 'u') return unibrow::Utf8::kBadChar;	812 if (c0_ != 'u') return -1;

806 Advance();	813 Advance();

807 uc32 c = ScanHexEscape('u', 4);	814 uc32 result = ScanHexNumber(4);

808 // We do not allow a unicode escape sequence to start another	815 if (result < 0) PushBack('u');

809 // unicode escape sequence.	816 return result;

810 if (c == '\\') return unibrow::Utf8::kBadChar;

811 return c;

812 }	817 }

813	818

814	819

815 // ----------------------------------------------------------------------------	820 // ----------------------------------------------------------------------------

816 // Keyword Matcher	821 // Keyword Matcher

817	822

818 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \	823 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \

819 KEYWORD_GROUP('b') \	824 KEYWORD_GROUP('b') \

820 KEYWORD("break", Token::BREAK) \	825 KEYWORD("break", Token::BREAK) \

821 KEYWORD_GROUP('c') \	826 KEYWORD_GROUP('c') \

(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
919 }	924 }

920	925

921	926

922 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {	927 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

923 ASSERT(unicode_cache_->IsIdentifierStart(c0_));	928 ASSERT(unicode_cache_->IsIdentifierStart(c0_));

924 LiteralScope literal(this);	929 LiteralScope literal(this);

925 // Scan identifier start character.	930 // Scan identifier start character.

926 if (c0_ == '\\') {	931 if (c0_ == '\\') {

927 uc32 c = ScanIdentifierUnicodeEscape();	932 uc32 c = ScanIdentifierUnicodeEscape();

928 // Only allow legal identifier start characters.	933 // Only allow legal identifier start characters.

929 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;	934 if (c < 0 ||

	935 c == '\\' || // No recursive escapes.

	936 !unicode_cache_->IsIdentifierStart(c)) {

	937 return Token::ILLEGAL;

	938 }

930 AddLiteralChar(c);	939 AddLiteralChar(c);

931 return ScanIdentifierSuffix(&literal);	940 return ScanIdentifierSuffix(&literal);

932 }	941 }

933	942

934 uc32 first_char = c0_;	943 uc32 first_char = c0_;

935 Advance();	944 Advance();

936 AddLiteralChar(first_char);	945 AddLiteralChar(first_char);

937	946

938 // Scan the rest of the identifier characters.	947 // Scan the rest of the identifier characters.

939 while (unicode_cache_->IsIdentifierPart(c0_)) {	948 while (unicode_cache_->IsIdentifierPart(c0_)) {

(...skipping 19 matching lines...) Expand all Loading...
959 return Token::IDENTIFIER;	968 return Token::IDENTIFIER;

960 }	969 }

961	970

962	971

963 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {	972 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

964 // Scan the rest of the identifier characters.	973 // Scan the rest of the identifier characters.

965 while (unicode_cache_->IsIdentifierPart(c0_)) {	974 while (unicode_cache_->IsIdentifierPart(c0_)) {

966 if (c0_ == '\\') {	975 if (c0_ == '\\') {

967 uc32 c = ScanIdentifierUnicodeEscape();	976 uc32 c = ScanIdentifierUnicodeEscape();

968 // Only allow legal identifier part characters.	977 // Only allow legal identifier part characters.

969 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;	978 if (c < 0 ||

	979 c == '\\' ||

	980 !unicode_cache_->IsIdentifierPart(c)) {

	981 return Token::ILLEGAL;

	982 }

970 AddLiteralChar(c);	983 AddLiteralChar(c);

971 } else {	984 } else {

972 AddLiteralChar(c0_);	985 AddLiteralChar(c0_);

973 Advance();	986 Advance();

974 }	987 }

975 }	988 }

976 literal->Complete();	989 literal->Complete();

977	990

978 return Token::IDENTIFIER;	991 return Token::IDENTIFIER;

979 }	992 }

980	993

981	994

982 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {	995 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

983 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags	996 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags

984 bool in_character_class = false;	997 bool in_character_class = false;

985	998

986 // Previous token is either '/' or '/=', in the second case, the	999 // Previous token is either '/' or '/=', in the second case, the

987 // pattern starts at =.	1000 // pattern starts at =.

988 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);	1001 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

989 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);	1002 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

990	1003

991 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,	1004 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

992 // the scanner should pass uninterpreted bodies to the RegExp	1005 // the scanner should pass uninterpreted bodies to the RegExp

993 // constructor.	1006 // constructor.

994 LiteralScope literal(this);	1007 LiteralScope literal(this);

995 if (seen_equal)	1008 if (seen_equal) {

996 AddLiteralChar('=');	1009 AddLiteralChar('=');

	1010 }

997	1011

998 while (c0_ != '/' || in_character_class) {	1012 while (c0_ != '/' || in_character_class) {

999 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;	1013 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;

1000 if (c0_ == '\\') { // Escape sequence.	1014 if (c0_ == '\\') { // Escape sequence.

1001 AddLiteralCharAdvance();	1015 AddLiteralCharAdvance();

1002 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;	1016 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;

1003 AddLiteralCharAdvance();	1017 AddLiteralCharAdvance();

1004 // If the escape allows more characters, i.e., \x??, \u????, or \c?,	1018 // If the escape allows more characters, i.e., \x??, \u????, or \c?,

1005 // only "safe" characters are allowed (letters, digits, underscore),	1019 // only "safe" characters are allowed (letters, digits, underscore),

1006 // otherwise the escape isn't valid and the invalid character has	1020 // otherwise the escape isn't valid and the invalid character has

(...skipping 11 matching lines...) Expand all Loading...
1018 }	1032 }

1019 }	1033 }

1020 Advance(); // consume '/'	1034 Advance(); // consume '/'

1021	1035

1022 literal.Complete();	1036 literal.Complete();

1023	1037

1024 return true;	1038 return true;

1025 }	1039 }

1026	1040

1027	1041

	1042 bool JavaScriptScanner::ScanLiteralUnicodeEscape() {

	1043 ASSERT(c0_ == '\\');

	1044 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};

	1045 Advance();

	1046 int i = 1;

	1047 if (c0_ == 'u') {

	1048 Advance();

	1049 i++;

	1050 while (i < 6) {

	1051 Advance();

	1052 if (!IsHexDigit(c0_)) break;

	1053 chars_read[i] = c0_;

	1054 i++;

	1055 }

	1056 }

	1057 if (i < 6) {

	1058 // Incomplete escape. Undo all advances and return false.

	1059 while (i > 0) {

	1060 i--;

	1061 PushBack(chars_read[i]);

	1062 }

	1063 return false;

	1064 }

	1065 // Complete escape. Add all chars to current literal buffer.

	1066 for (int i = 0; i < 6; i++) {

	1067 AddLiteralChar(chars_read[i]);

	1068 return true;
	Rico 2011/08/18 11:43:13: Indention seems wrong. Lasse Reichstein 2011/08/24 13:36:28: Argh, more than wrong. The return has moved itself above the end '}' of the loop. Will add more tests since we didn't catch such a blatant bug.
	1069 }

	1070 }

	1071

	1072

1028 bool JavaScriptScanner::ScanRegExpFlags() {	1073 bool JavaScriptScanner::ScanRegExpFlags() {

1029 // Scan regular expression flags.	1074 // Scan regular expression flags.

1030 LiteralScope literal(this);	1075 LiteralScope literal(this);

1031 while (unicode_cache_->IsIdentifierPart(c0_)) {	1076 while (unicode_cache_->IsIdentifierPart(c0_)) {

1032 if (c0_ == '\\') {	1077 if (c0_ != '\\') {

1033 uc32 c = ScanIdentifierUnicodeEscape();	1078 AddLiteralCharAdvance();

1034 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {	1079 } else {

1035 // We allow any escaped character, unlike the restriction on	1080 if (!ScanLiteralUnicodeEscape()) {

1036 // IdentifierPart when it is used to build an IdentifierName.	1081 break;

1037 AddLiteralChar(c);

1038 continue;

1039 }	1082 }

1040 }	1083 }

1041 AddLiteralCharAdvance();

1042 }	1084 }

1043 literal.Complete();	1085 literal.Complete();

1044	1086

1045 next_.location.end_pos = source_pos() - 1;	1087 next_.location.end_pos = source_pos() - 1;

1046 return true;	1088 return true;

1047 }	1089 }

1048	1090

1049 } } // namespace v8::internal	1091 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-base.h ('k') | test/mjsunit/regress/regress-1620.js » ('j') | test/mjsunit/regress/regress-1620.js » ('J')