Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(99)

Side by Side Diff: src/scanner.cc

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/scanner.h ('k') | src/scanner-base.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 18 matching lines...) Expand all
29 29
30 #include "ast.h" 30 #include "ast.h"
31 #include "handles.h" 31 #include "handles.h"
32 #include "scanner.h" 32 #include "scanner.h"
33 #include "unicode-inl.h" 33 #include "unicode-inl.h"
34 34
35 namespace v8 { 35 namespace v8 {
36 namespace internal { 36 namespace internal {
37 37
38 // ---------------------------------------------------------------------------- 38 // ----------------------------------------------------------------------------
39 // UTF8Buffer
40
41 UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity), recording_(false) { }
42
43
44 UTF8Buffer::~UTF8Buffer() {}
45
46
47 void UTF8Buffer::AddCharSlow(uc32 c) {
48 ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
49 int length = unibrow::Utf8::Length(c);
50 Vector<char> block = buffer_.AddBlock(length, '\0');
51 #ifdef DEBUG
52 int written_length = unibrow::Utf8::Encode(block.start(), c);
53 CHECK_EQ(length, written_length);
54 #else
55 unibrow::Utf8::Encode(block.start(), c);
56 #endif
57 }
58
59
60 // ----------------------------------------------------------------------------
61 // UTF16Buffer 39 // UTF16Buffer
62 40
63
64 UTF16Buffer::UTF16Buffer()
65 : pos_(0), end_(Scanner::kNoEndPosition) { }
66
67
68 // CharacterStreamUTF16Buffer 41 // CharacterStreamUTF16Buffer
69 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() 42 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
70 : pushback_buffer_(0), last_(0), stream_(NULL) { } 43 : pushback_buffer_(0), last_(0), stream_(NULL) { }
71 44
72 45
73 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, 46 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
74 unibrow::CharacterStream* input, 47 unibrow::CharacterStream* input,
75 int start_position, 48 int start_position,
76 int end_position) { 49 int end_position) {
77 stream_ = input; 50 stream_ = input;
78 if (start_position > 0) { 51 if (start_position > 0) {
79 SeekForward(start_position); 52 SeekForward(start_position);
80 } 53 }
81 end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt; 54 end_ = end_position != kNoEndPosition ? end_position : kMaxInt;
82 } 55 }
83 56
84 57
85 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { 58 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
86 pushback_buffer()->Add(last_); 59 pushback_buffer()->Add(last_);
87 last_ = ch; 60 last_ = ch;
88 pos_--; 61 pos_--;
89 } 62 }
90 63
91 64
92 uc32 CharacterStreamUTF16Buffer::Advance() { 65 uc32 CharacterStreamUTF16Buffer::Advance() {
93 ASSERT(end_ != Scanner::kNoEndPosition); 66 ASSERT(end_ != kNoEndPosition);
94 ASSERT(end_ >= 0); 67 ASSERT(end_ >= 0);
95 // NOTE: It is of importance to Persian / Farsi resources that we do 68 // NOTE: It is of importance to Persian / Farsi resources that we do
96 // *not* strip format control characters in the scanner; see 69 // *not* strip format control characters in the scanner; see
97 // 70 //
98 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 71 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152
99 // 72 //
100 // So, even though ECMA-262, section 7.1, page 11, dictates that we 73 // So, even though ECMA-262, section 7.1, page 11, dictates that we
101 // must remove Unicode format-control characters, we do not. This is 74 // must remove Unicode format-control characters, we do not. This is
102 // in line with how IE and SpiderMonkey handle it. 75 // in line with how IE and SpiderMonkey handle it.
103 if (!pushback_buffer()->is_empty()) { 76 if (!pushback_buffer()->is_empty()) {
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
136 if (!complete_) scanner_->DropLiteral(); 109 if (!complete_) scanner_->DropLiteral();
137 } 110 }
138 111
139 112
140 void Scanner::LiteralScope::Complete() { 113 void Scanner::LiteralScope::Complete() {
141 scanner_->TerminateLiteral(); 114 scanner_->TerminateLiteral();
142 complete_ = true; 115 complete_ = true;
143 } 116 }
144 117
145 // ---------------------------------------------------------------------------- 118 // ----------------------------------------------------------------------------
146 // Scanner 119 // V8JavaScriptScanner
147 120
148 Scanner::Scanner() 121 void V8JavaScriptScanner::Initialize(Handle<String> source) {
149 : has_line_terminator_before_next_(false), 122 source_ = stream_initializer_.Init(source, NULL, 0, source->length());
150 is_parsing_json_(false), 123 Init();
151 source_(NULL), 124 // Skip initial whitespace allowing HTML comment ends just like
152 stack_overflow_(false) {} 125 // after a newline and scan first token.
153 126 has_line_terminator_before_next_ = true;
154 127 SkipWhiteSpace();
155 void Scanner::Initialize(Handle<String> source, 128 Scan();
156 ParserLanguage language) {
157 Init(source, NULL, 0, source->length(), language);
158 } 129 }
159 130
160 131
161 void Scanner::Initialize(Handle<String> source, 132 void V8JavaScriptScanner::Initialize(Handle<String> source,
162 unibrow::CharacterStream* stream, 133 unibrow::CharacterStream* stream) {
163 ParserLanguage language) { 134 source_ = stream_initializer_.Init(source, stream,
164 Init(source, stream, 0, kNoEndPosition, language); 135 0, UTF16Buffer::kNoEndPosition);
136 Init();
137 // Skip initial whitespace allowing HTML comment ends just like
138 // after a newline and scan first token.
139 has_line_terminator_before_next_ = true;
140 SkipWhiteSpace();
141 Scan();
165 } 142 }
166 143
167 144
168 void Scanner::Initialize(Handle<String> source, 145 void V8JavaScriptScanner::Initialize(Handle<String> source,
169 int start_position, 146 int start_position,
170 int end_position, 147 int end_position) {
171 ParserLanguage language) { 148 source_ = stream_initializer_.Init(source, NULL,
172 Init(source, NULL, start_position, end_position, language); 149 start_position, end_position);
150 Init();
151 // Skip initial whitespace allowing HTML comment ends just like
152 // after a newline and scan first token.
153 has_line_terminator_before_next_ = true;
154 SkipWhiteSpace();
155 Scan();
173 } 156 }
174 157
175 158
176 void Scanner::Init(Handle<String> source, 159 Token::Value V8JavaScriptScanner::NextCheckStack() {
177 unibrow::CharacterStream* stream, 160 // BUG 1215673: Find a thread safe way to set a stack limit in
178 int start_position, 161 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
179 int end_position, 162 // threads.
180 ParserLanguage language) { 163 StackLimitCheck check;
164 if (check.HasOverflowed()) {
165 stack_overflow_ = true;
166 current_ = next_;
167 next_.token = Token::ILLEGAL;
168 return current_.token;
169 } else {
170 return Next();
171 }
172 }
173
174
175 UTF16Buffer* StreamInitializer::Init(Handle<String> source,
176 unibrow::CharacterStream* stream,
177 int start_position,
178 int end_position) {
181 // Either initialize the scanner from a character stream or from a 179 // Either initialize the scanner from a character stream or from a
182 // string. 180 // string.
183 ASSERT(source.is_null() || stream == NULL); 181 ASSERT(source.is_null() || stream == NULL);
184 182
185 // Initialize the source buffer. 183 // Initialize the source buffer.
186 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { 184 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
187 two_byte_string_buffer_.Initialize( 185 two_byte_string_buffer_.Initialize(
188 Handle<ExternalTwoByteString>::cast(source), 186 Handle<ExternalTwoByteString>::cast(source),
189 start_position, 187 start_position,
190 end_position); 188 end_position);
191 source_ = &two_byte_string_buffer_; 189 return &two_byte_string_buffer_;
192 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) { 190 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
193 ascii_string_buffer_.Initialize( 191 ascii_string_buffer_.Initialize(
194 Handle<ExternalAsciiString>::cast(source), 192 Handle<ExternalAsciiString>::cast(source),
195 start_position, 193 start_position,
196 end_position); 194 end_position);
197 source_ = &ascii_string_buffer_; 195 return &ascii_string_buffer_;
198 } else { 196 } else {
199 if (!source.is_null()) { 197 if (!source.is_null()) {
200 safe_string_input_buffer_.Reset(source.location()); 198 safe_string_input_buffer_.Reset(source.location());
201 stream = &safe_string_input_buffer_; 199 stream = &safe_string_input_buffer_;
202 } 200 }
203 char_stream_buffer_.Initialize(source, 201 char_stream_buffer_.Initialize(source,
204 stream, 202 stream,
205 start_position, 203 start_position,
206 end_position); 204 end_position);
207 source_ = &char_stream_buffer_; 205 return &char_stream_buffer_;
208 } 206 }
207 }
209 208
210 is_parsing_json_ = (language == JSON); 209 // ----------------------------------------------------------------------------
210 // JsonScanner
211 211
212 // Set c0_ (one character ahead) 212 JsonScanner::JsonScanner() {}
213 ASSERT(kCharacterLookaheadBufferSize == 1);
214 Advance();
215 // Initialize current_ to not refer to a literal.
216 current_.literal_chars = Vector<const char>();
217 // Reset literal buffer.
218 literal_buffer_.Reset();
219 213
220 // Skip initial whitespace allowing HTML comment ends just like 214
221 // after a newline and scan first token. 215 void JsonScanner::Initialize(Handle<String> source) {
222 has_line_terminator_before_next_ = true; 216 source_ = stream_initializer_.Init(source, NULL, 0, source->length());
223 SkipWhiteSpace(); 217 Init();
224 Scan(); 218 // Skip initial whitespace.
219 SkipJsonWhiteSpace();
220 // Preload first token as look-ahead.
221 ScanJson();
225 } 222 }
226 223
227 224
228 Token::Value Scanner::Next() { 225 Token::Value JsonScanner::Next() {
229 // BUG 1215673: Find a thread safe way to set a stack limit in 226 // BUG 1215673: Find a thread safe way to set a stack limit in
230 // pre-parse mode. Otherwise, we cannot safely pre-parse from other 227 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
231 // threads. 228 // threads.
232 current_ = next_; 229 current_ = next_;
233 // Check for stack-overflow before returning any tokens. 230 // Check for stack-overflow before returning any tokens.
234 StackLimitCheck check; 231 StackLimitCheck check;
235 if (check.HasOverflowed()) { 232 if (check.HasOverflowed()) {
236 stack_overflow_ = true; 233 stack_overflow_ = true;
237 next_.token = Token::ILLEGAL; 234 next_.token = Token::ILLEGAL;
238 } else { 235 } else {
239 has_line_terminator_before_next_ = false; 236 ScanJson();
240 Scan();
241 } 237 }
242 return current_.token; 238 return current_.token;
243 } 239 }
244 240
245 241
246 void Scanner::StartLiteral() { 242 bool JsonScanner::SkipJsonWhiteSpace() {
247 literal_buffer_.StartLiteral();
248 }
249
250
251 void Scanner::AddLiteralChar(uc32 c) {
252 literal_buffer_.AddChar(c);
253 }
254
255
256 void Scanner::TerminateLiteral() {
257 next_.literal_chars = literal_buffer_.EndLiteral();
258 }
259
260
261 void Scanner::DropLiteral() {
262 literal_buffer_.DropLiteral();
263 }
264
265
266 void Scanner::AddLiteralCharAdvance() {
267 AddLiteralChar(c0_);
268 Advance();
269 }
270
271
272 static inline bool IsByteOrderMark(uc32 c) {
273 // The Unicode value U+FFFE is guaranteed never to be assigned as a
274 // Unicode character; this implies that in a Unicode context the
275 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
276 // character expressed in little-endian byte order (since it could
277 // not be a U+FFFE character expressed in big-endian byte
278 // order). Nevertheless, we check for it to be compatible with
279 // Spidermonkey.
280 return c == 0xFEFF || c == 0xFFFE;
281 }
282
283
284 bool Scanner::SkipJsonWhiteSpace() {
285 int start_position = source_pos(); 243 int start_position = source_pos();
286 // JSON WhiteSpace is tab, carriage-return, newline and space. 244 // JSON WhiteSpace is tab, carriage-return, newline and space.
287 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { 245 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
288 Advance(); 246 Advance();
289 } 247 }
290 return source_pos() != start_position; 248 return source_pos() != start_position;
291 } 249 }
292 250
293 251
294 bool Scanner::SkipJavaScriptWhiteSpace() { 252 void JsonScanner::ScanJson() {
295 int start_position = source_pos();
296
297 while (true) {
298 // We treat byte-order marks (BOMs) as whitespace for better
299 // compatibility with Spidermonkey and other JavaScript engines.
300 while (ScannerConstants::kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
301 // IsWhiteSpace() includes line terminators!
302 if (ScannerConstants::kIsLineTerminator.get(c0_)) {
303 // Ignore line terminators, but remember them. This is necessary
304 // for automatic semicolon insertion.
305 has_line_terminator_before_next_ = true;
306 }
307 Advance();
308 }
309
310 // If there is an HTML comment end '-->' at the beginning of a
311 // line (with only whitespace in front of it), we treat the rest
312 // of the line as a comment. This is in line with the way
313 // SpiderMonkey handles it.
314 if (c0_ == '-' && has_line_terminator_before_next_) {
315 Advance();
316 if (c0_ == '-') {
317 Advance();
318 if (c0_ == '>') {
319 // Treat the rest of the line as a comment.
320 SkipSingleLineComment();
321 // Continue skipping white space after the comment.
322 continue;
323 }
324 PushBack('-'); // undo Advance()
325 }
326 PushBack('-'); // undo Advance()
327 }
328 // Return whether or not we skipped any characters.
329 return source_pos() != start_position;
330 }
331 }
332
333
334 Token::Value Scanner::SkipSingleLineComment() {
335 Advance();
336
337 // The line terminator at the end of the line is not considered
338 // to be part of the single-line comment; it is recognized
339 // separately by the lexical grammar and becomes part of the
340 // stream of input elements for the syntactic grammar (see
341 // ECMA-262, section 7.4, page 12).
342 while (c0_ >= 0 && !ScannerConstants::kIsLineTerminator.get(c0_)) {
343 Advance();
344 }
345
346 return Token::WHITESPACE;
347 }
348
349
350 Token::Value Scanner::SkipMultiLineComment() {
351 ASSERT(c0_ == '*');
352 Advance();
353
354 while (c0_ >= 0) {
355 char ch = c0_;
356 Advance();
357 // If we have reached the end of the multi-line comment, we
358 // consume the '/' and insert a whitespace. This way all
359 // multi-line comments are treated as whitespace - even the ones
360 // containing line terminators. This contradicts ECMA-262, section
361 // 7.4, page 12, that says that multi-line comments containing
362 // line terminators should be treated as a line terminator, but it
363 // matches the behaviour of SpiderMonkey and KJS.
364 if (ch == '*' && c0_ == '/') {
365 c0_ = ' ';
366 return Token::WHITESPACE;
367 }
368 }
369
370 // Unterminated multi-line comment.
371 return Token::ILLEGAL;
372 }
373
374
375 Token::Value Scanner::ScanHtmlComment() {
376 // Check for <!-- comments.
377 ASSERT(c0_ == '!');
378 Advance();
379 if (c0_ == '-') {
380 Advance();
381 if (c0_ == '-') return SkipSingleLineComment();
382 PushBack('-'); // undo Advance()
383 }
384 PushBack('!'); // undo Advance()
385 ASSERT(c0_ == '!');
386 return Token::LT;
387 }
388
389
390
391 void Scanner::ScanJson() {
392 next_.literal_chars = Vector<const char>(); 253 next_.literal_chars = Vector<const char>();
393 Token::Value token; 254 Token::Value token;
394 has_line_terminator_before_next_ = false;
395 do { 255 do {
396 // Remember the position of the next token 256 // Remember the position of the next token
397 next_.location.beg_pos = source_pos(); 257 next_.location.beg_pos = source_pos();
398 switch (c0_) { 258 switch (c0_) {
399 case '\t': 259 case '\t':
400 case '\r': 260 case '\r':
401 case '\n': 261 case '\n':
402 case ' ': 262 case ' ':
403 Advance(); 263 Advance();
404 token = Token::WHITESPACE; 264 token = Token::WHITESPACE;
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
461 token = Select(Token::ILLEGAL); 321 token = Select(Token::ILLEGAL);
462 } 322 }
463 } 323 }
464 } while (token == Token::WHITESPACE); 324 } while (token == Token::WHITESPACE);
465 325
466 next_.location.end_pos = source_pos(); 326 next_.location.end_pos = source_pos();
467 next_.token = token; 327 next_.token = token;
468 } 328 }
469 329
470 330
471 Token::Value Scanner::ScanJsonString() { 331 Token::Value JsonScanner::ScanJsonString() {
472 ASSERT_EQ('"', c0_); 332 ASSERT_EQ('"', c0_);
473 Advance(); 333 Advance();
474 LiteralScope literal(this); 334 LiteralScope literal(this);
475 while (c0_ != '"' && c0_ > 0) { 335 while (c0_ != '"' && c0_ > 0) {
476 // Check for control character (0x00-0x1f) or unterminated string (<0). 336 // Check for control character (0x00-0x1f) or unterminated string (<0).
477 if (c0_ < 0x20) return Token::ILLEGAL; 337 if (c0_ < 0x20) return Token::ILLEGAL;
478 if (c0_ != '\\') { 338 if (c0_ != '\\') {
479 AddLiteralCharAdvance(); 339 AddLiteralCharAdvance();
480 } else { 340 } else {
481 Advance(); 341 Advance();
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
521 } 381 }
522 if (c0_ != '"') { 382 if (c0_ != '"') {
523 return Token::ILLEGAL; 383 return Token::ILLEGAL;
524 } 384 }
525 literal.Complete(); 385 literal.Complete();
526 Advance(); 386 Advance();
527 return Token::STRING; 387 return Token::STRING;
528 } 388 }
529 389
530 390
531 Token::Value Scanner::ScanJsonNumber() { 391 Token::Value JsonScanner::ScanJsonNumber() {
532 LiteralScope literal(this); 392 LiteralScope literal(this);
533 if (c0_ == '-') AddLiteralCharAdvance(); 393 if (c0_ == '-') AddLiteralCharAdvance();
534 if (c0_ == '0') { 394 if (c0_ == '0') {
535 AddLiteralCharAdvance(); 395 AddLiteralCharAdvance();
536 // Prefix zero is only allowed if it's the only digit before 396 // Prefix zero is only allowed if it's the only digit before
537 // a decimal point or exponent. 397 // a decimal point or exponent.
538 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; 398 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
539 } else { 399 } else {
540 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; 400 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
541 do { 401 do {
(...skipping 13 matching lines...) Expand all
555 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 415 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
556 do { 416 do {
557 AddLiteralCharAdvance(); 417 AddLiteralCharAdvance();
558 } while (c0_ >= '0' && c0_ <= '9'); 418 } while (c0_ >= '0' && c0_ <= '9');
559 } 419 }
560 literal.Complete(); 420 literal.Complete();
561 return Token::NUMBER; 421 return Token::NUMBER;
562 } 422 }
563 423
564 424
565 Token::Value Scanner::ScanJsonIdentifier(const char* text, 425 Token::Value JsonScanner::ScanJsonIdentifier(const char* text,
566 Token::Value token) { 426 Token::Value token) {
567 LiteralScope literal(this); 427 LiteralScope literal(this);
568 while (*text != '\0') { 428 while (*text != '\0') {
569 if (c0_ != *text) return Token::ILLEGAL; 429 if (c0_ != *text) return Token::ILLEGAL;
570 Advance(); 430 Advance();
571 text++; 431 text++;
572 } 432 }
573 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; 433 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
574 literal.Complete(); 434 literal.Complete();
575 return token; 435 return token;
576 } 436 }
577 437
578 438
579 void Scanner::ScanJavaScript() {
580 next_.literal_chars = Vector<const char>();
581 Token::Value token;
582 do {
583 // Remember the position of the next token
584 next_.location.beg_pos = source_pos();
585
586 switch (c0_) {
587 case ' ':
588 case '\t':
589 Advance();
590 token = Token::WHITESPACE;
591 break;
592
593 case '\n':
594 Advance();
595 has_line_terminator_before_next_ = true;
596 token = Token::WHITESPACE;
597 break;
598
599 case '"': case '\'':
600 token = ScanString();
601 break;
602
603 case '<':
604 // < <= << <<= <!--
605 Advance();
606 if (c0_ == '=') {
607 token = Select(Token::LTE);
608 } else if (c0_ == '<') {
609 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
610 } else if (c0_ == '!') {
611 token = ScanHtmlComment();
612 } else {
613 token = Token::LT;
614 }
615 break;
616
617 case '>':
618 // > >= >> >>= >>> >>>=
619 Advance();
620 if (c0_ == '=') {
621 token = Select(Token::GTE);
622 } else if (c0_ == '>') {
623 // >> >>= >>> >>>=
624 Advance();
625 if (c0_ == '=') {
626 token = Select(Token::ASSIGN_SAR);
627 } else if (c0_ == '>') {
628 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
629 } else {
630 token = Token::SAR;
631 }
632 } else {
633 token = Token::GT;
634 }
635 break;
636
637 case '=':
638 // = == ===
639 Advance();
640 if (c0_ == '=') {
641 token = Select('=', Token::EQ_STRICT, Token::EQ);
642 } else {
643 token = Token::ASSIGN;
644 }
645 break;
646
647 case '!':
648 // ! != !==
649 Advance();
650 if (c0_ == '=') {
651 token = Select('=', Token::NE_STRICT, Token::NE);
652 } else {
653 token = Token::NOT;
654 }
655 break;
656
657 case '+':
658 // + ++ +=
659 Advance();
660 if (c0_ == '+') {
661 token = Select(Token::INC);
662 } else if (c0_ == '=') {
663 token = Select(Token::ASSIGN_ADD);
664 } else {
665 token = Token::ADD;
666 }
667 break;
668
669 case '-':
670 // - -- --> -=
671 Advance();
672 if (c0_ == '-') {
673 Advance();
674 if (c0_ == '>' && has_line_terminator_before_next_) {
675 // For compatibility with SpiderMonkey, we skip lines that
676 // start with an HTML comment end '-->'.
677 token = SkipSingleLineComment();
678 } else {
679 token = Token::DEC;
680 }
681 } else if (c0_ == '=') {
682 token = Select(Token::ASSIGN_SUB);
683 } else {
684 token = Token::SUB;
685 }
686 break;
687
688 case '*':
689 // * *=
690 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
691 break;
692
693 case '%':
694 // % %=
695 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
696 break;
697
698 case '/':
699 // / // /* /=
700 Advance();
701 if (c0_ == '/') {
702 token = SkipSingleLineComment();
703 } else if (c0_ == '*') {
704 token = SkipMultiLineComment();
705 } else if (c0_ == '=') {
706 token = Select(Token::ASSIGN_DIV);
707 } else {
708 token = Token::DIV;
709 }
710 break;
711
712 case '&':
713 // & && &=
714 Advance();
715 if (c0_ == '&') {
716 token = Select(Token::AND);
717 } else if (c0_ == '=') {
718 token = Select(Token::ASSIGN_BIT_AND);
719 } else {
720 token = Token::BIT_AND;
721 }
722 break;
723
724 case '|':
725 // | || |=
726 Advance();
727 if (c0_ == '|') {
728 token = Select(Token::OR);
729 } else if (c0_ == '=') {
730 token = Select(Token::ASSIGN_BIT_OR);
731 } else {
732 token = Token::BIT_OR;
733 }
734 break;
735
736 case '^':
737 // ^ ^=
738 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
739 break;
740
741 case '.':
742 // . Number
743 Advance();
744 if (IsDecimalDigit(c0_)) {
745 token = ScanNumber(true);
746 } else {
747 token = Token::PERIOD;
748 }
749 break;
750
751 case ':':
752 token = Select(Token::COLON);
753 break;
754
755 case ';':
756 token = Select(Token::SEMICOLON);
757 break;
758
759 case ',':
760 token = Select(Token::COMMA);
761 break;
762
763 case '(':
764 token = Select(Token::LPAREN);
765 break;
766
767 case ')':
768 token = Select(Token::RPAREN);
769 break;
770
771 case '[':
772 token = Select(Token::LBRACK);
773 break;
774
775 case ']':
776 token = Select(Token::RBRACK);
777 break;
778
779 case '{':
780 token = Select(Token::LBRACE);
781 break;
782
783 case '}':
784 token = Select(Token::RBRACE);
785 break;
786
787 case '?':
788 token = Select(Token::CONDITIONAL);
789 break;
790
791 case '~':
792 token = Select(Token::BIT_NOT);
793 break;
794
795 default:
796 if (ScannerConstants::kIsIdentifierStart.get(c0_)) {
797 token = ScanIdentifier();
798 } else if (IsDecimalDigit(c0_)) {
799 token = ScanNumber(false);
800 } else if (SkipWhiteSpace()) {
801 token = Token::WHITESPACE;
802 } else if (c0_ < 0) {
803 token = Token::EOS;
804 } else {
805 token = Select(Token::ILLEGAL);
806 }
807 break;
808 }
809
810 // Continue scanning for tokens as long as we're just skipping
811 // whitespace.
812 } while (token == Token::WHITESPACE);
813
814 next_.location.end_pos = source_pos();
815 next_.token = token;
816 }
817
818
819 void Scanner::SeekForward(int pos) {
820 source_->SeekForward(pos - 1);
821 Advance();
822 // This function is only called to seek to the location
823 // of the end of a function (at the "}" token). It doesn't matter
824 // whether there was a line terminator in the part we skip.
825 has_line_terminator_before_next_ = false;
826 Scan();
827 }
828
829
830 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
831 ASSERT(length <= 4); // prevent overflow
832
833 uc32 digits[4];
834 uc32 x = 0;
835 for (int i = 0; i < length; i++) {
836 digits[i] = c0_;
837 int d = HexValue(c0_);
838 if (d < 0) {
839 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
840 // should be illegal, but other JS VMs just return the
841 // non-escaped version of the original character.
842
843 // Push back digits read, except the last one (in c0_).
844 for (int j = i-1; j >= 0; j--) {
845 PushBack(digits[j]);
846 }
847 // Notice: No handling of error - treat it as "\u"->"u".
848 return c;
849 }
850 x = x * 16 + d;
851 Advance();
852 }
853
854 return x;
855 }
856
857
858 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
859 // ECMA-262. Other JS VMs support them.
860 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
861 uc32 x = c - '0';
862 for (int i = 0; i < length; i++) {
863 int d = c0_ - '0';
864 if (d < 0 || d > 7) break;
865 int nx = x * 8 + d;
866 if (nx >= 256) break;
867 x = nx;
868 Advance();
869 }
870 return x;
871 }
872
873
874 void Scanner::ScanEscape() {
875 uc32 c = c0_;
876 Advance();
877
878 // Skip escaped newlines.
879 if (ScannerConstants::kIsLineTerminator.get(c)) {
880 // Allow CR+LF newlines in multiline string literals.
881 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
882 // Allow LF+CR newlines in multiline string literals.
883 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
884 return;
885 }
886
887 switch (c) {
888 case '\'': // fall through
889 case '"' : // fall through
890 case '\\': break;
891 case 'b' : c = '\b'; break;
892 case 'f' : c = '\f'; break;
893 case 'n' : c = '\n'; break;
894 case 'r' : c = '\r'; break;
895 case 't' : c = '\t'; break;
896 case 'u' : c = ScanHexEscape(c, 4); break;
897 case 'v' : c = '\v'; break;
898 case 'x' : c = ScanHexEscape(c, 2); break;
899 case '0' : // fall through
900 case '1' : // fall through
901 case '2' : // fall through
902 case '3' : // fall through
903 case '4' : // fall through
904 case '5' : // fall through
905 case '6' : // fall through
906 case '7' : c = ScanOctalEscape(c, 2); break;
907 }
908
909 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
910 // should be illegal, but they are commonly handled
911 // as non-escaped characters by JS VMs.
912 AddLiteralChar(c);
913 }
914
915
916 Token::Value Scanner::ScanString() {
917 uc32 quote = c0_;
918 Advance(); // consume quote
919
920 LiteralScope literal(this);
921 while (c0_ != quote && c0_ >= 0
922 && !ScannerConstants::kIsLineTerminator.get(c0_)) {
923 uc32 c = c0_;
924 Advance();
925 if (c == '\\') {
926 if (c0_ < 0) return Token::ILLEGAL;
927 ScanEscape();
928 } else {
929 AddLiteralChar(c);
930 }
931 }
932 if (c0_ != quote) return Token::ILLEGAL;
933 literal.Complete();
934
935 Advance(); // consume quote
936 return Token::STRING;
937 }
938
939
940 Token::Value Scanner::Select(Token::Value tok) {
941 Advance();
942 return tok;
943 }
944
945
946 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
947 Advance();
948 if (c0_ == next) {
949 Advance();
950 return then;
951 } else {
952 return else_;
953 }
954 }
955
956
957 // Consumes consecutive decimal digits, adding each one to the current literal.
958 void Scanner::ScanDecimalDigits() {
959 while (IsDecimalDigit(c0_))
960 AddLiteralCharAdvance();
961 }
962
963
964 Token::Value Scanner::ScanNumber(bool seen_period) {
965 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
966
967 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
968
969 LiteralScope literal(this);
970 if (seen_period) {
971 // we have already seen a decimal point of the float
972 AddLiteralChar('.');
973 ScanDecimalDigits(); // we know we have at least one digit
974
975 } else {
976 // if the first character is '0' we must check for octals and hex
977 if (c0_ == '0') {
978 AddLiteralCharAdvance();
979
980 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
981 if (c0_ == 'x' || c0_ == 'X') {
982 // hex number
983 kind = HEX;
984 AddLiteralCharAdvance();
985 if (!IsHexDigit(c0_)) {
986 // we must have at least one hex digit after 'x'/'X'
987 return Token::ILLEGAL;
988 }
989 while (IsHexDigit(c0_)) {
990 AddLiteralCharAdvance();
991 }
992 } else if ('0' <= c0_ && c0_ <= '7') {
993 // (possible) octal number
994 kind = OCTAL;
995 while (true) {
996 if (c0_ == '8' || c0_ == '9') {
997 kind = DECIMAL;
998 break;
999 }
1000 if (c0_ < '0' || '7' < c0_) break;
1001 AddLiteralCharAdvance();
1002 }
1003 }
1004 }
1005
1006 // Parse decimal digits and allow trailing fractional part.
1007 if (kind == DECIMAL) {
1008 ScanDecimalDigits(); // optional
1009 if (c0_ == '.') {
1010 AddLiteralCharAdvance();
1011 ScanDecimalDigits(); // optional
1012 }
1013 }
1014 }
1015
1016 // scan exponent, if any
1017 if (c0_ == 'e' || c0_ == 'E') {
1018 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
1019 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
1020 // scan exponent
1021 AddLiteralCharAdvance();
1022 if (c0_ == '+' || c0_ == '-')
1023 AddLiteralCharAdvance();
1024 if (!IsDecimalDigit(c0_)) {
1025 // we must have at least one decimal digit after 'e'/'E'
1026 return Token::ILLEGAL;
1027 }
1028 ScanDecimalDigits();
1029 }
1030
1031 // The source character immediately following a numeric literal must
1032 // not be an identifier start or a decimal digit; see ECMA-262
1033 // section 7.8.3, page 17 (note that we read only one decimal digit
1034 // if the value is 0).
1035 if (IsDecimalDigit(c0_) || ScannerConstants::kIsIdentifierStart.get(c0_))
1036 return Token::ILLEGAL;
1037
1038 literal.Complete();
1039
1040 return Token::NUMBER;
1041 }
1042
1043
1044 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1045 Advance();
1046 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
1047 Advance();
1048 uc32 c = ScanHexEscape('u', 4);
1049 // We do not allow a unicode escape sequence to start another
1050 // unicode escape sequence.
1051 if (c == '\\') return unibrow::Utf8::kBadChar;
1052 return c;
1053 }
1054
1055
1056 Token::Value Scanner::ScanIdentifier() {
1057 ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_));
1058
1059 LiteralScope literal(this);
1060 KeywordMatcher keyword_match;
1061
1062 // Scan identifier start character.
1063 if (c0_ == '\\') {
1064 uc32 c = ScanIdentifierUnicodeEscape();
1065 // Only allow legal identifier start characters.
1066 if (!ScannerConstants::kIsIdentifierStart.get(c)) return Token::ILLEGAL;
1067 AddLiteralChar(c);
1068 keyword_match.Fail();
1069 } else {
1070 AddLiteralChar(c0_);
1071 keyword_match.AddChar(c0_);
1072 Advance();
1073 }
1074
1075 // Scan the rest of the identifier characters.
1076 while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
1077 if (c0_ == '\\') {
1078 uc32 c = ScanIdentifierUnicodeEscape();
1079 // Only allow legal identifier part characters.
1080 if (!ScannerConstants::kIsIdentifierPart.get(c)) return Token::ILLEGAL;
1081 AddLiteralChar(c);
1082 keyword_match.Fail();
1083 } else {
1084 AddLiteralChar(c0_);
1085 keyword_match.AddChar(c0_);
1086 Advance();
1087 }
1088 }
1089 literal.Complete();
1090
1091 return keyword_match.token();
1092 }
1093
1094
1095 439
1096 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1097 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1098 bool in_character_class = false;
1099
1100 // Previous token is either '/' or '/=', in the second case, the
1101 // pattern starts at =.
1102 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1103 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1104
1105 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1106 // the scanner should pass uninterpreted bodies to the RegExp
1107 // constructor.
1108 LiteralScope literal(this);
1109 if (seen_equal)
1110 AddLiteralChar('=');
1111
1112 while (c0_ != '/' || in_character_class) {
1113 if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
1114 if (c0_ == '\\') { // escaped character
1115 AddLiteralCharAdvance();
1116 if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
1117 AddLiteralCharAdvance();
1118 } else { // unescaped character
1119 if (c0_ == '[') in_character_class = true;
1120 if (c0_ == ']') in_character_class = false;
1121 AddLiteralCharAdvance();
1122 }
1123 }
1124 Advance(); // consume '/'
1125
1126 literal.Complete();
1127
1128 return true;
1129 }
1130
1131 bool Scanner::ScanRegExpFlags() {
1132 // Scan regular expression flags.
1133 LiteralScope literal(this);
1134 while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
1135 if (c0_ == '\\') {
1136 uc32 c = ScanIdentifierUnicodeEscape();
1137 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1138 // We allow any escaped character, unlike the restriction on
1139 // IdentifierPart when it is used to build an IdentifierName.
1140 AddLiteralChar(c);
1141 continue;
1142 }
1143 }
1144 AddLiteralCharAdvance();
1145 }
1146 literal.Complete();
1147
1148 next_.location.end_pos = source_pos() - 1;
1149 return true;
1150 }
1151
1152 } } // namespace v8::internal 440 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « src/scanner.h ('k') | src/scanner-base.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698