Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(199)

Side by Side Diff: src/scanner-base.h

Issue 5274002: Version 2.5.8... (Closed) Base URL: http://v8.googlecode.com/svn/trunk/
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 19 matching lines...) Expand all
30 #ifndef V8_SCANNER_BASE_H_ 30 #ifndef V8_SCANNER_BASE_H_
31 #define V8_SCANNER_BASE_H_ 31 #define V8_SCANNER_BASE_H_
32 32
33 #include "globals.h" 33 #include "globals.h"
34 #include "checks.h" 34 #include "checks.h"
35 #include "allocation.h" 35 #include "allocation.h"
36 #include "token.h" 36 #include "token.h"
37 #include "unicode-inl.h" 37 #include "unicode-inl.h"
38 #include "char-predicates.h" 38 #include "char-predicates.h"
39 #include "utils.h" 39 #include "utils.h"
40 #include "list-inl.h"
40 41
41 namespace v8 { 42 namespace v8 {
42 namespace internal { 43 namespace internal {
43 44
44 // Interface through which the scanner reads characters from the input source. 45 // Returns the value (0 .. 15) of a hexadecimal character c.
46 // If c is not a legal hexadecimal character, returns a value < 0.
47 inline int HexValue(uc32 c) {
48 c -= '0';
49 if (static_cast<unsigned>(c) <= 9) return c;
50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
51 if (static_cast<unsigned>(c) <= 5) return c + 10;
52 return -1;
53 }
54
55 // ----------------------------------------------------------------------------
56 // UTF16Buffer - scanner input source with pushback.
57
45 class UTF16Buffer { 58 class UTF16Buffer {
46 public: 59 public:
47 UTF16Buffer(); 60 UTF16Buffer();
48 virtual ~UTF16Buffer() {} 61 virtual ~UTF16Buffer() {}
49 62
50 virtual void PushBack(uc32 ch) = 0; 63 virtual void PushBack(uc32 ch) = 0;
51 // Returns a value < 0 when the buffer end is reached. 64 // Returns a value < 0 when the buffer end is reached.
52 virtual uc32 Advance() = 0; 65 virtual uc32 Advance() = 0;
53 virtual void SeekForward(int pos) = 0; 66 virtual void SeekForward(int pos) = 0;
54 67
55 int pos() const { return pos_; } 68 int pos() const { return pos_; }
56 69
70 static const int kNoEndPosition = 1;
71
57 protected: 72 protected:
73 // Initial value of end_ before the input stream is initialized.
74
58 int pos_; // Current position in the buffer. 75 int pos_; // Current position in the buffer.
59 int end_; // Position where scanning should stop (EOF). 76 int end_; // Position where scanning should stop (EOF).
60 }; 77 };
61 78
62 79
63 class ScannerConstants : AllStatic { 80 class ScannerConstants : AllStatic {
64 public: 81 public:
65 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 82 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
66 83
67 static StaticResource<Utf8Decoder>* utf8_decoder() { 84 static StaticResource<Utf8Decoder>* utf8_decoder() {
68 return &utf8_decoder_; 85 return &utf8_decoder_;
69 } 86 }
70 87
71 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 88 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
72 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 89 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
73 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 90 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
74 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 91 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
75 92
76 static bool IsIdentifier(unibrow::CharacterStream* buffer); 93 static bool IsIdentifier(unibrow::CharacterStream* buffer);
77 94
78 private: 95 private:
79 static StaticResource<Utf8Decoder> utf8_decoder_; 96 static StaticResource<Utf8Decoder> utf8_decoder_;
80 }; 97 };
81 98
99 // ----------------------------------------------------------------------------
100 // LiteralCollector - Collector of chars of literals.
101
102 class LiteralCollector {
103 public:
104 LiteralCollector();
105 ~LiteralCollector();
106
107 inline void AddChar(uc32 c) {
108 if (recording_) {
109 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
110 buffer_.Add(static_cast<char>(c));
111 } else {
112 AddCharSlow(c);
113 }
114 }
115 }
116
117 void StartLiteral() {
118 buffer_.StartSequence();
119 recording_ = true;
120 }
121
122 Vector<const char> EndLiteral() {
123 if (recording_) {
124 recording_ = false;
125 buffer_.Add(kEndMarker);
126 Vector<char> sequence = buffer_.EndSequence();
127 return Vector<const char>(sequence.start(), sequence.length());
128 }
129 return Vector<const char>();
130 }
131
132 void DropLiteral() {
133 if (recording_) {
134 recording_ = false;
135 buffer_.DropSequence();
136 }
137 }
138
139 void Reset() {
140 buffer_.Reset();
141 }
142
143 // The end marker added after a parsed literal.
144 // Using zero allows the usage of strlen and similar functions on
145 // identifiers and numbers (but not strings, since they may contain zero
146 // bytes).
147 static const char kEndMarker = '\x00';
148 private:
149 static const int kInitialCapacity = 256;
150 SequenceCollector<char, 4> buffer_;
151 bool recording_;
152 void AddCharSlow(uc32 c);
153 };
154
155 // ----------------------------------------------------------------------------
156 // Scanner base-class.
157
158 // Generic functionality used by both JSON and JavaScript scanners.
159 class Scanner {
160 public:
161 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
162
163 class LiteralScope {
164 public:
165 explicit LiteralScope(Scanner* self);
166 ~LiteralScope();
167 void Complete();
168
169 private:
170 Scanner* scanner_;
171 bool complete_;
172 };
173
174 Scanner();
175
176 // Returns the current token again.
177 Token::Value current_token() { return current_.token; }
178
179 // One token look-ahead (past the token returned by Next()).
180 Token::Value peek() const { return next_.token; }
181
182 struct Location {
183 Location(int b, int e) : beg_pos(b), end_pos(e) { }
184 Location() : beg_pos(0), end_pos(0) { }
185 int beg_pos;
186 int end_pos;
187 };
188
189 // Returns the location information for the current token
190 // (the token returned by Next()).
191 Location location() const { return current_.location; }
192 Location peek_location() const { return next_.location; }
193
194 // Returns the literal string, if any, for the current token (the
195 // token returned by Next()). The string is 0-terminated and in
196 // UTF-8 format; it may contain 0-characters. Literal strings are
197 // collected for identifiers, strings, and numbers.
198 // These functions only give the correct result if the literal
199 // was scanned between calls to StartLiteral() and TerminateLiteral().
200 const char* literal_string() const {
201 return current_.literal_chars.start();
202 }
203
204 int literal_length() const {
205 // Excluding terminal '\x00' added by TerminateLiteral().
206 return current_.literal_chars.length() - 1;
207 }
208
209 Vector<const char> literal() const {
210 return Vector<const char>(literal_string(), literal_length());
211 }
212
213 // Returns the literal string for the next token (the token that
214 // would be returned if Next() were called).
215 const char* next_literal_string() const {
216 return next_.literal_chars.start();
217 }
218
219
220 // Returns the length of the literal string for the next token (the
221 // token that would be returned if Next() were called).
222 int next_literal_length() const {
223 // Excluding terminal '\x00' added by TerminateLiteral().
224 return next_.literal_chars.length() - 1;
225 }
226
227 Vector<const char> next_literal() const {
228 return Vector<const char>(next_literal_string(), next_literal_length());
229 }
230
231 bool stack_overflow() { return stack_overflow_; }
232
233 static const int kCharacterLookaheadBufferSize = 1;
234
235 protected:
236 // The current and look-ahead token.
237 struct TokenDesc {
238 Token::Value token;
239 Location location;
240 Vector<const char> literal_chars;
241 };
242
243 // Call this after setting source_ to the input.
244 void Init() {
245 // Set c0_ (one character ahead)
246 ASSERT(kCharacterLookaheadBufferSize == 1);
247 Advance();
248 // Initialize current_ to not refer to a literal.
249 current_.literal_chars = Vector<const char>();
250 // Reset literal buffer.
251 literal_buffer_.Reset();
252 }
253
254 // Literal buffer support
255 inline void StartLiteral() {
256 literal_buffer_.StartLiteral();
257 }
258
259 inline void AddLiteralChar(uc32 c) {
260 literal_buffer_.AddChar(c);
261 }
262
263 // Complete scanning of a literal.
264 inline void TerminateLiteral() {
265 next_.literal_chars = literal_buffer_.EndLiteral();
266 }
267
268 // Stops scanning of a literal and drops the collected characters,
269 // e.g., due to an encountered error.
270 inline void DropLiteral() {
271 literal_buffer_.DropLiteral();
272 }
273
274 inline void AddLiteralCharAdvance() {
275 AddLiteralChar(c0_);
276 Advance();
277 }
278
279 // Low-level scanning support.
280 void Advance() { c0_ = source_->Advance(); }
281 void PushBack(uc32 ch) {
282 source_->PushBack(ch);
283 c0_ = ch;
284 }
285
286 inline Token::Value Select(Token::Value tok) {
287 Advance();
288 return tok;
289 }
290
291 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
292 Advance();
293 if (c0_ == next) {
294 Advance();
295 return then;
296 } else {
297 return else_;
298 }
299 }
300
301 uc32 ScanHexEscape(uc32 c, int length);
302 uc32 ScanOctalEscape(uc32 c, int length);
303
304 // Return the current source position.
305 int source_pos() {
306 return source_->pos() - kCharacterLookaheadBufferSize;
307 }
308
309 TokenDesc current_; // desc for current token (as returned by Next())
310 TokenDesc next_; // desc for next token (one token look-ahead)
311
312 // Input stream. Must be initialized to a UTF16Buffer.
313 UTF16Buffer* source_;
314
315 // Buffer to hold literal values (identifiers, strings, numbers)
316 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
317 LiteralCollector literal_buffer_;
318
319 bool stack_overflow_;
320
321 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
322 uc32 c0_;
323 };
324
325 // ----------------------------------------------------------------------------
326 // JavaScriptScanner - base logic for JavaScript scanning.
327
328 class JavaScriptScanner : public Scanner {
329 public:
330
331 // Bit vector representing set of types of literals.
332 enum LiteralType {
333 kNoLiterals = 0,
334 kLiteralNumber = 1,
335 kLiteralIdentifier = 2,
336 kLiteralString = 4,
337 kLiteralRegExp = 8,
338 kLiteralRegExpFlags = 16,
339 kAllLiterals = 31
340 };
341
342 // A LiteralScope that disables recording of some types of JavaScript
343 // literals. If the scanner is configured to not record the specific
344 // type of literal, the scope will not call StartLiteral.
345 class LiteralScope {
346 public:
347 LiteralScope(JavaScriptScanner* self, LiteralType type)
348 : scanner_(self), complete_(false) {
349 if (scanner_->RecordsLiteral(type)) {
350 scanner_->StartLiteral();
351 }
352 }
353 ~LiteralScope() {
354 if (!complete_) scanner_->DropLiteral();
355 }
356 void Complete() {
357 scanner_->TerminateLiteral();
358 complete_ = true;
359 }
360
361 private:
362 JavaScriptScanner* scanner_;
363 bool complete_;
364 };
365
366 JavaScriptScanner();
367
368 // Returns the next token.
369 Token::Value Next();
370
371 // Returns true if there was a line terminator before the peek'ed token.
372 bool has_line_terminator_before_next() const {
373 return has_line_terminator_before_next_;
374 }
375
376 // Scans the input as a regular expression pattern, previous
377 // character(s) must be /(=). Returns true if a pattern is scanned.
378 bool ScanRegExpPattern(bool seen_equal);
379 // Returns true if regexp flags are scanned (always since flags can
380 // be empty).
381 bool ScanRegExpFlags();
382
383 // Tells whether the buffer contains an identifier (no escapes).
384 // Used for checking if a property name is an identifier.
385 static bool IsIdentifier(unibrow::CharacterStream* buffer);
386
387 // Seek forward to the given position. This operation does not
388 // work in general, for instance when there are pushed back
389 // characters, but works for seeking forward until simple delimiter
390 // tokens, which is what it is used for.
391 void SeekForward(int pos);
392
393 // Whether this scanner records the given literal type or not.
394 bool RecordsLiteral(LiteralType type) {
395 return (literal_flags_ & type) != 0;
396 }
397
398 protected:
399 bool SkipWhiteSpace();
400 Token::Value SkipSingleLineComment();
401 Token::Value SkipMultiLineComment();
402
403 // Scans a single JavaScript token.
404 void Scan();
405
406 void ScanDecimalDigits();
407 Token::Value ScanNumber(bool seen_period);
408 Token::Value ScanIdentifierOrKeyword();
409 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
410
411 void ScanEscape();
412 Token::Value ScanString();
413
414 // Scans a possible HTML comment -- begins with '<!'.
415 Token::Value ScanHtmlComment();
416
417 // Decodes a unicode escape-sequence which is part of an identifier.
418 // If the escape sequence cannot be decoded the result is kBadChar.
419 uc32 ScanIdentifierUnicodeEscape();
420
421 int literal_flags_;
422 bool has_line_terminator_before_next_;
423 };
424
425
426 // ----------------------------------------------------------------------------
427 // Keyword matching state machine.
82 428
83 class KeywordMatcher { 429 class KeywordMatcher {
84 // Incrementally recognize keywords. 430 // Incrementally recognize keywords.
85 // 431 //
86 // Recognized keywords: 432 // Recognized keywords:
87 // break case catch const* continue debugger* default delete do else 433 // break case catch const* continue debugger* default delete do else
88 // finally false for function if in instanceof native* new null 434 // finally false for function if in instanceof native* new null
89 // return switch this throw true try typeof var void while with 435 // return switch this throw true try typeof var void while with
90 // 436 //
91 // *: Actually "future reserved keywords". These are the only ones we 437 // *: Actually "future reserved keywords". These are the only ones we
92 // recognize, the remaining are allowed as identifiers. 438 // recognize, the remaining are allowed as identifiers.
93 // In ES5 strict mode, we should disallow all reserved keywords. 439 // In ES5 strict mode, we should disallow all reserved keywords.
94 public: 440 public:
95 KeywordMatcher() 441 KeywordMatcher()
96 : state_(INITIAL), 442 : state_(INITIAL),
97 token_(Token::IDENTIFIER), 443 token_(Token::IDENTIFIER),
98 keyword_(NULL), 444 keyword_(NULL),
99 counter_(0), 445 counter_(0),
100 keyword_token_(Token::ILLEGAL) {} 446 keyword_token_(Token::ILLEGAL) {}
101 447
102 Token::Value token() { return token_; } 448 Token::Value token() { return token_; }
103 449
104 inline void AddChar(unibrow::uchar input) { 450 inline bool AddChar(unibrow::uchar input) {
105 if (state_ != UNMATCHABLE) { 451 if (state_ != UNMATCHABLE) {
106 Step(input); 452 Step(input);
107 } 453 }
454 return state_ != UNMATCHABLE;
108 } 455 }
109 456
110 void Fail() { 457 void Fail() {
111 token_ = Token::IDENTIFIER; 458 token_ = Token::IDENTIFIER;
112 state_ = UNMATCHABLE; 459 state_ = UNMATCHABLE;
113 } 460 }
114 461
115 private: 462 private:
116 enum State { 463 enum State {
117 UNMATCHABLE, 464 UNMATCHABLE,
(...skipping 30 matching lines...) Expand all
148 kFirstCharRangeMax - kFirstCharRangeMin + 1; 495 kFirstCharRangeMax - kFirstCharRangeMin + 1;
149 // State map for first keyword character range. 496 // State map for first keyword character range.
150 static FirstState first_states_[kFirstCharRangeLength]; 497 static FirstState first_states_[kFirstCharRangeLength];
151 498
152 // If input equals keyword's character at position, continue matching keyword 499 // If input equals keyword's character at position, continue matching keyword
153 // from that position. 500 // from that position.
154 inline bool MatchKeywordStart(unibrow::uchar input, 501 inline bool MatchKeywordStart(unibrow::uchar input,
155 const char* keyword, 502 const char* keyword,
156 int position, 503 int position,
157 Token::Value token_if_match) { 504 Token::Value token_if_match) {
158 if (input == static_cast<unibrow::uchar>(keyword[position])) { 505 if (input != static_cast<unibrow::uchar>(keyword[position])) {
159 state_ = KEYWORD_PREFIX; 506 return false;
160 this->keyword_ = keyword;
161 this->counter_ = position + 1;
162 this->keyword_token_ = token_if_match;
163 return true;
164 } 507 }
165 return false; 508 state_ = KEYWORD_PREFIX;
509 this->keyword_ = keyword;
510 this->counter_ = position + 1;
511 this->keyword_token_ = token_if_match;
512 return true;
166 } 513 }
167 514
168 // If input equals match character, transition to new state and return true. 515 // If input equals match character, transition to new state and return true.
169 inline bool MatchState(unibrow::uchar input, char match, State new_state) { 516 inline bool MatchState(unibrow::uchar input, char match, State new_state) {
170 if (input == static_cast<unibrow::uchar>(match)) { 517 if (input != static_cast<unibrow::uchar>(match)) {
171 state_ = new_state; 518 return false;
172 return true;
173 } 519 }
174 return false; 520 state_ = new_state;
521 return true;
175 } 522 }
176 523
177 inline bool MatchKeyword(unibrow::uchar input, 524 inline bool MatchKeyword(unibrow::uchar input,
178 char match, 525 char match,
179 State new_state, 526 State new_state,
180 Token::Value keyword_token) { 527 Token::Value keyword_token) {
181 if (input != static_cast<unibrow::uchar>(match)) { 528 if (input != static_cast<unibrow::uchar>(match)) {
182 return false; 529 return false;
183 } 530 }
184 state_ = new_state; 531 state_ = new_state;
(...skipping 12 matching lines...) Expand all
197 // keyword with the current prefix). 544 // keyword with the current prefix).
198 const char* keyword_; 545 const char* keyword_;
199 int counter_; 546 int counter_;
200 Token::Value keyword_token_; 547 Token::Value keyword_token_;
201 }; 548 };
202 549
203 550
204 } } // namespace v8::internal 551 } } // namespace v8::internal
205 552
206 #endif // V8_SCANNER_BASE_H_ 553 #endif // V8_SCANNER_BASE_H_
OLDNEW
« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698