src/scanner-base.h - Issue 5274002: Version 2.5.8...

Side by Side Diff: src/scanner-base.h

Issue 5274002: Version 2.5.8... (Closed) Base URL: http://v8.googlecode.com/svn/trunk/

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 19 matching lines...) Expand all Loading...
30 #ifndef V8_SCANNER_BASE_H_	30 #ifndef V8_SCANNER_BASE_H_

31 #define V8_SCANNER_BASE_H_	31 #define V8_SCANNER_BASE_H_

32	32

33 #include "globals.h"	33 #include "globals.h"

34 #include "checks.h"	34 #include "checks.h"

35 #include "allocation.h"	35 #include "allocation.h"

36 #include "token.h"	36 #include "token.h"

37 #include "unicode-inl.h"	37 #include "unicode-inl.h"

38 #include "char-predicates.h"	38 #include "char-predicates.h"

39 #include "utils.h"	39 #include "utils.h"

	40 #include "list-inl.h"

40	41

41 namespace v8 {	42 namespace v8 {

42 namespace internal {	43 namespace internal {

43	44

44 // Interface through which the scanner reads characters from the input source.	45 // Returns the value (0 .. 15) of a hexadecimal character c.

	46 // If c is not a legal hexadecimal character, returns a value < 0.

	47 inline int HexValue(uc32 c) {

	48 c -= '0';

	49 if (static_cast<unsigned>(c) <= 9) return c;

	50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

	51 if (static_cast<unsigned>(c) <= 5) return c + 10;

	52 return -1;

	53 }

	54

	55 // ----------------------------------------------------------------------------

	56 // UTF16Buffer - scanner input source with pushback.

	57

45 class UTF16Buffer {	58 class UTF16Buffer {

46 public:	59 public:

47 UTF16Buffer();	60 UTF16Buffer();

48 virtual ~UTF16Buffer() {}	61 virtual ~UTF16Buffer() {}

49	62

50 virtual void PushBack(uc32 ch) = 0;	63 virtual void PushBack(uc32 ch) = 0;

51 // Returns a value < 0 when the buffer end is reached.	64 // Returns a value < 0 when the buffer end is reached.

52 virtual uc32 Advance() = 0;	65 virtual uc32 Advance() = 0;

53 virtual void SeekForward(int pos) = 0;	66 virtual void SeekForward(int pos) = 0;

54	67

55 int pos() const { return pos_; }	68 int pos() const { return pos_; }

56	69

	70 static const int kNoEndPosition = 1;

	71

57 protected:	72 protected:

	73 // Initial value of end_ before the input stream is initialized.

	74

58 int pos_; // Current position in the buffer.	75 int pos_; // Current position in the buffer.

59 int end_; // Position where scanning should stop (EOF).	76 int end_; // Position where scanning should stop (EOF).

60 };	77 };

61	78

62	79

63 class ScannerConstants : AllStatic {	80 class ScannerConstants : AllStatic {

64 public:	81 public:

65 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;	82 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

66	83

67 static StaticResource<Utf8Decoder>* utf8_decoder() {	84 static StaticResource<Utf8Decoder>* utf8_decoder() {

68 return &utf8_decoder_;	85 return &utf8_decoder_;

69 }	86 }

70	87

71 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;	88 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;

72 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;	89 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;

73 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;	90 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;

74 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;	91 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;

75	92

76 static bool IsIdentifier(unibrow::CharacterStream* buffer);	93 static bool IsIdentifier(unibrow::CharacterStream* buffer);

77	94

78 private:	95 private:

79 static StaticResource<Utf8Decoder> utf8_decoder_;	96 static StaticResource<Utf8Decoder> utf8_decoder_;

80 };	97 };

81	98

	99 // ----------------------------------------------------------------------------

	100 // LiteralCollector - Collector of chars of literals.

	101

	102 class LiteralCollector {

	103 public:

	104 LiteralCollector();

	105 ~LiteralCollector();

	106

	107 inline void AddChar(uc32 c) {

	108 if (recording_) {

	109 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {

	110 buffer_.Add(static_cast<char>(c));

	111 } else {

	112 AddCharSlow(c);

	113 }

	114 }

	115 }

	116

	117 void StartLiteral() {

	118 buffer_.StartSequence();

	119 recording_ = true;

	120 }

	121

	122 Vector<const char> EndLiteral() {

	123 if (recording_) {

	124 recording_ = false;

	125 buffer_.Add(kEndMarker);

	126 Vector<char> sequence = buffer_.EndSequence();

	127 return Vector<const char>(sequence.start(), sequence.length());

	128 }

	129 return Vector<const char>();

	130 }

	131

	132 void DropLiteral() {

	133 if (recording_) {

	134 recording_ = false;

	135 buffer_.DropSequence();

	136 }

	137 }

	138

	139 void Reset() {

	140 buffer_.Reset();

	141 }

	142

	143 // The end marker added after a parsed literal.

	144 // Using zero allows the usage of strlen and similar functions on

	145 // identifiers and numbers (but not strings, since they may contain zero

	146 // bytes).

	147 static const char kEndMarker = '\x00';

	148 private:

	149 static const int kInitialCapacity = 256;

	150 SequenceCollector<char, 4> buffer_;

	151 bool recording_;

	152 void AddCharSlow(uc32 c);

	153 };

	154

	155 // ----------------------------------------------------------------------------

	156 // Scanner base-class.

	157

	158 // Generic functionality used by both JSON and JavaScript scanners.

	159 class Scanner {

	160 public:

	161 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

	162

	163 class LiteralScope {

	164 public:

	165 explicit LiteralScope(Scanner* self);

	166 ~LiteralScope();

	167 void Complete();

	168

	169 private:

	170 Scanner* scanner_;

	171 bool complete_;

	172 };

	173

	174 Scanner();

	175

	176 // Returns the current token again.

	177 Token::Value current_token() { return current_.token; }

	178

	179 // One token look-ahead (past the token returned by Next()).

	180 Token::Value peek() const { return next_.token; }

	181

	182 struct Location {

	183 Location(int b, int e) : beg_pos(b), end_pos(e) { }

	184 Location() : beg_pos(0), end_pos(0) { }

	185 int beg_pos;

	186 int end_pos;

	187 };

	188

	189 // Returns the location information for the current token

	190 // (the token returned by Next()).

	191 Location location() const { return current_.location; }

	192 Location peek_location() const { return next_.location; }

	193

	194 // Returns the literal string, if any, for the current token (the

	195 // token returned by Next()). The string is 0-terminated and in

	196 // UTF-8 format; it may contain 0-characters. Literal strings are

	197 // collected for identifiers, strings, and numbers.

	198 // These functions only give the correct result if the literal

	199 // was scanned between calls to StartLiteral() and TerminateLiteral().

	200 const char* literal_string() const {

	201 return current_.literal_chars.start();

	202 }

	203

	204 int literal_length() const {

	205 // Excluding terminal '\x00' added by TerminateLiteral().

	206 return current_.literal_chars.length() - 1;

	207 }

	208

	209 Vector<const char> literal() const {

	210 return Vector<const char>(literal_string(), literal_length());

	211 }

	212

	213 // Returns the literal string for the next token (the token that

	214 // would be returned if Next() were called).

	215 const char* next_literal_string() const {

	216 return next_.literal_chars.start();

	217 }

	218

	219

	220 // Returns the length of the literal string for the next token (the

	221 // token that would be returned if Next() were called).

	222 int next_literal_length() const {

	223 // Excluding terminal '\x00' added by TerminateLiteral().

	224 return next_.literal_chars.length() - 1;

	225 }

	226

	227 Vector<const char> next_literal() const {

	228 return Vector<const char>(next_literal_string(), next_literal_length());

	229 }

	230

	231 bool stack_overflow() { return stack_overflow_; }

	232

	233 static const int kCharacterLookaheadBufferSize = 1;

	234

	235 protected:

	236 // The current and look-ahead token.

	237 struct TokenDesc {

	238 Token::Value token;

	239 Location location;

	240 Vector<const char> literal_chars;

	241 };

	242

	243 // Call this after setting source_ to the input.

	244 void Init() {

	245 // Set c0_ (one character ahead)

	246 ASSERT(kCharacterLookaheadBufferSize == 1);

	247 Advance();

	248 // Initialize current_ to not refer to a literal.

	249 current_.literal_chars = Vector<const char>();

	250 // Reset literal buffer.

	251 literal_buffer_.Reset();

	252 }

	253

	254 // Literal buffer support

	255 inline void StartLiteral() {

	256 literal_buffer_.StartLiteral();

	257 }

	258

	259 inline void AddLiteralChar(uc32 c) {

	260 literal_buffer_.AddChar(c);

	261 }

	262

	263 // Complete scanning of a literal.

	264 inline void TerminateLiteral() {

	265 next_.literal_chars = literal_buffer_.EndLiteral();

	266 }

	267

	268 // Stops scanning of a literal and drops the collected characters,

	269 // e.g., due to an encountered error.

	270 inline void DropLiteral() {

	271 literal_buffer_.DropLiteral();

	272 }

	273

	274 inline void AddLiteralCharAdvance() {

	275 AddLiteralChar(c0_);

	276 Advance();

	277 }

	278

	279 // Low-level scanning support.

	280 void Advance() { c0_ = source_->Advance(); }

	281 void PushBack(uc32 ch) {

	282 source_->PushBack(ch);

	283 c0_ = ch;

	284 }

	285

	286 inline Token::Value Select(Token::Value tok) {

	287 Advance();

	288 return tok;

	289 }

	290

	291 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {

	292 Advance();

	293 if (c0_ == next) {

	294 Advance();

	295 return then;

	296 } else {

	297 return else_;

	298 }

	299 }

	300

	301 uc32 ScanHexEscape(uc32 c, int length);

	302 uc32 ScanOctalEscape(uc32 c, int length);

	303

	304 // Return the current source position.

	305 int source_pos() {

	306 return source_->pos() - kCharacterLookaheadBufferSize;

	307 }

	308

	309 TokenDesc current_; // desc for current token (as returned by Next())

	310 TokenDesc next_; // desc for next token (one token look-ahead)

	311

	312 // Input stream. Must be initialized to a UTF16Buffer.

	313 UTF16Buffer* source_;

	314

	315 // Buffer to hold literal values (identifiers, strings, numbers)

	316 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.

	317 LiteralCollector literal_buffer_;

	318

	319 bool stack_overflow_;

	320

	321 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

	322 uc32 c0_;

	323 };

	324

	325 // ----------------------------------------------------------------------------

	326 // JavaScriptScanner - base logic for JavaScript scanning.

	327

	328 class JavaScriptScanner : public Scanner {

	329 public:

	330

	331 // Bit vector representing set of types of literals.

	332 enum LiteralType {

	333 kNoLiterals = 0,

	334 kLiteralNumber = 1,

	335 kLiteralIdentifier = 2,

	336 kLiteralString = 4,

	337 kLiteralRegExp = 8,

	338 kLiteralRegExpFlags = 16,

	339 kAllLiterals = 31

	340 };

	341

	342 // A LiteralScope that disables recording of some types of JavaScript

	343 // literals. If the scanner is configured to not record the specific

	344 // type of literal, the scope will not call StartLiteral.

	345 class LiteralScope {

	346 public:

	347 LiteralScope(JavaScriptScanner* self, LiteralType type)

	348 : scanner_(self), complete_(false) {

	349 if (scanner_->RecordsLiteral(type)) {

	350 scanner_->StartLiteral();

	351 }

	352 }

	353 ~LiteralScope() {

	354 if (!complete_) scanner_->DropLiteral();

	355 }

	356 void Complete() {

	357 scanner_->TerminateLiteral();

	358 complete_ = true;

	359 }

	360

	361 private:

	362 JavaScriptScanner* scanner_;

	363 bool complete_;

	364 };

	365

	366 JavaScriptScanner();

	367

	368 // Returns the next token.

	369 Token::Value Next();

	370

	371 // Returns true if there was a line terminator before the peek'ed token.

	372 bool has_line_terminator_before_next() const {

	373 return has_line_terminator_before_next_;

	374 }

	375

	376 // Scans the input as a regular expression pattern, previous

	377 // character(s) must be /(=). Returns true if a pattern is scanned.

	378 bool ScanRegExpPattern(bool seen_equal);

	379 // Returns true if regexp flags are scanned (always since flags can

	380 // be empty).

	381 bool ScanRegExpFlags();

	382

	383 // Tells whether the buffer contains an identifier (no escapes).

	384 // Used for checking if a property name is an identifier.

	385 static bool IsIdentifier(unibrow::CharacterStream* buffer);

	386

	387 // Seek forward to the given position. This operation does not

	388 // work in general, for instance when there are pushed back

	389 // characters, but works for seeking forward until simple delimiter

	390 // tokens, which is what it is used for.

	391 void SeekForward(int pos);

	392

	393 // Whether this scanner records the given literal type or not.

	394 bool RecordsLiteral(LiteralType type) {

	395 return (literal_flags_ & type) != 0;

	396 }

	397

	398 protected:

	399 bool SkipWhiteSpace();

	400 Token::Value SkipSingleLineComment();

	401 Token::Value SkipMultiLineComment();

	402

	403 // Scans a single JavaScript token.

	404 void Scan();

	405

	406 void ScanDecimalDigits();

	407 Token::Value ScanNumber(bool seen_period);

	408 Token::Value ScanIdentifierOrKeyword();

	409 Token::Value ScanIdentifierSuffix(LiteralScope* literal);

	410

	411 void ScanEscape();

	412 Token::Value ScanString();

	413

	414 // Scans a possible HTML comment -- begins with '<!'.

	415 Token::Value ScanHtmlComment();

	416

	417 // Decodes a unicode escape-sequence which is part of an identifier.

	418 // If the escape sequence cannot be decoded the result is kBadChar.

	419 uc32 ScanIdentifierUnicodeEscape();

	420

	421 int literal_flags_;

	422 bool has_line_terminator_before_next_;

	423 };

	424

	425

	426 // ----------------------------------------------------------------------------

	427 // Keyword matching state machine.

82	428

83 class KeywordMatcher {	429 class KeywordMatcher {

84 // Incrementally recognize keywords.	430 // Incrementally recognize keywords.

85 //	431 //

86 // Recognized keywords:	432 // Recognized keywords:

87 // break case catch const* continue debugger* default delete do else	433 // break case catch const* continue debugger* default delete do else

88 // finally false for function if in instanceof native* new null	434 // finally false for function if in instanceof native* new null

89 // return switch this throw true try typeof var void while with	435 // return switch this throw true try typeof var void while with

90 //	436 //

91 // *: Actually "future reserved keywords". These are the only ones we	437 // *: Actually "future reserved keywords". These are the only ones we

92 // recognize, the remaining are allowed as identifiers.	438 // recognize, the remaining are allowed as identifiers.

93 // In ES5 strict mode, we should disallow all reserved keywords.	439 // In ES5 strict mode, we should disallow all reserved keywords.

94 public:	440 public:

95 KeywordMatcher()	441 KeywordMatcher()

96 : state_(INITIAL),	442 : state_(INITIAL),

97 token_(Token::IDENTIFIER),	443 token_(Token::IDENTIFIER),

98 keyword_(NULL),	444 keyword_(NULL),

99 counter_(0),	445 counter_(0),

100 keyword_token_(Token::ILLEGAL) {}	446 keyword_token_(Token::ILLEGAL) {}

101	447

102 Token::Value token() { return token_; }	448 Token::Value token() { return token_; }

103	449

104 inline void AddChar(unibrow::uchar input) {	450 inline bool AddChar(unibrow::uchar input) {

105 if (state_ != UNMATCHABLE) {	451 if (state_ != UNMATCHABLE) {

106 Step(input);	452 Step(input);

107 }	453 }

	454 return state_ != UNMATCHABLE;

108 }	455 }

109	456

110 void Fail() {	457 void Fail() {

111 token_ = Token::IDENTIFIER;	458 token_ = Token::IDENTIFIER;

112 state_ = UNMATCHABLE;	459 state_ = UNMATCHABLE;

113 }	460 }

114	461

115 private:	462 private:

116 enum State {	463 enum State {

117 UNMATCHABLE,	464 UNMATCHABLE,

(...skipping 30 matching lines...) Expand all Loading...
148 kFirstCharRangeMax - kFirstCharRangeMin + 1;	495 kFirstCharRangeMax - kFirstCharRangeMin + 1;

149 // State map for first keyword character range.	496 // State map for first keyword character range.

150 static FirstState first_states_[kFirstCharRangeLength];	497 static FirstState first_states_[kFirstCharRangeLength];

151	498

152 // If input equals keyword's character at position, continue matching keyword	499 // If input equals keyword's character at position, continue matching keyword

153 // from that position.	500 // from that position.

154 inline bool MatchKeywordStart(unibrow::uchar input,	501 inline bool MatchKeywordStart(unibrow::uchar input,

155 const char* keyword,	502 const char* keyword,

156 int position,	503 int position,

157 Token::Value token_if_match) {	504 Token::Value token_if_match) {

158 if (input == static_cast<unibrow::uchar>(keyword[position])) {	505 if (input != static_cast<unibrow::uchar>(keyword[position])) {

159 state_ = KEYWORD_PREFIX;	506 return false;

160 this->keyword_ = keyword;

161 this->counter_ = position + 1;

162 this->keyword_token_ = token_if_match;

163 return true;

164 }	507 }

165 return false;	508 state_ = KEYWORD_PREFIX;

	509 this->keyword_ = keyword;

	510 this->counter_ = position + 1;

	511 this->keyword_token_ = token_if_match;

	512 return true;

166 }	513 }

167	514

168 // If input equals match character, transition to new state and return true.	515 // If input equals match character, transition to new state and return true.

169 inline bool MatchState(unibrow::uchar input, char match, State new_state) {	516 inline bool MatchState(unibrow::uchar input, char match, State new_state) {

170 if (input == static_cast<unibrow::uchar>(match)) {	517 if (input != static_cast<unibrow::uchar>(match)) {

171 state_ = new_state;	518 return false;

172 return true;

173 }	519 }

174 return false;	520 state_ = new_state;

	521 return true;

175 }	522 }

176	523

177 inline bool MatchKeyword(unibrow::uchar input,	524 inline bool MatchKeyword(unibrow::uchar input,

178 char match,	525 char match,

179 State new_state,	526 State new_state,

180 Token::Value keyword_token) {	527 Token::Value keyword_token) {

181 if (input != static_cast<unibrow::uchar>(match)) {	528 if (input != static_cast<unibrow::uchar>(match)) {

182 return false;	529 return false;

183 }	530 }

184 state_ = new_state;	531 state_ = new_state;

(...skipping 12 matching lines...) Expand all Loading...
197 // keyword with the current prefix).	544 // keyword with the current prefix).

198 const char* keyword_;	545 const char* keyword_;

199 int counter_;	546 int counter_;

200 Token::Value keyword_token_;	547 Token::Value keyword_token_;

201 };	548 };

202	549

203	550

204 } } // namespace v8::internal	551 } } // namespace v8::internal

205	552

206 #endif // V8_SCANNER_BASE_H_	553 #endif // V8_SCANNER_BASE_H_

OLD	NEW

« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »