OLD | NEW |
| (Empty) |
1 // Copyright 2011 the V8 project authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // Features shared by parsing and pre-parsing scanners. | |
6 | |
7 #include "src/scanner.h" | |
8 | |
9 #include <stdint.h> | |
10 | |
11 #include <cmath> | |
12 | |
13 #include "src/ast-value-factory.h" | |
14 #include "src/char-predicates-inl.h" | |
15 #include "src/conversions-inl.h" | |
16 #include "src/list-inl.h" | |
17 #include "src/parser.h" | |
18 | |
19 namespace v8 { | |
20 namespace internal { | |
21 | |
22 | |
23 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const { | |
24 if (is_one_byte()) { | |
25 return isolate->factory()->InternalizeOneByteString(one_byte_literal()); | |
26 } | |
27 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); | |
28 } | |
29 | |
30 | |
31 // Default implementation for streams that do not support bookmarks. | |
32 bool Utf16CharacterStream::SetBookmark() { return false; } | |
33 void Utf16CharacterStream::ResetToBookmark() { UNREACHABLE(); } | |
34 | |
35 | |
36 // ---------------------------------------------------------------------------- | |
37 // Scanner | |
38 | |
39 Scanner::Scanner(UnicodeCache* unicode_cache) | |
40 : unicode_cache_(unicode_cache), | |
41 bookmark_c0_(kNoBookmark), | |
42 octal_pos_(Location::invalid()) { | |
43 bookmark_current_.literal_chars = &bookmark_current_literal_; | |
44 bookmark_current_.raw_literal_chars = &bookmark_current_raw_literal_; | |
45 bookmark_next_.literal_chars = &bookmark_next_literal_; | |
46 bookmark_next_.raw_literal_chars = &bookmark_next_raw_literal_; | |
47 } | |
48 | |
49 | |
// Attaches the scanner to |source| and primes it: after this call the first
// token of the input has been scanned into the lookahead slot (next_).
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // The flag has to be set before SkipWhiteSpace() runs, because that
  // function only treats "-->" as a comment when the flag is set.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
61 | |
62 | |
63 template <bool capture_raw> | |
64 uc32 Scanner::ScanHexNumber(int expected_length) { | |
65 DCHECK(expected_length <= 4); // prevent overflow | |
66 | |
67 uc32 x = 0; | |
68 for (int i = 0; i < expected_length; i++) { | |
69 int d = HexValue(c0_); | |
70 if (d < 0) { | |
71 return -1; | |
72 } | |
73 x = x * 16 + d; | |
74 Advance<capture_raw>(); | |
75 } | |
76 | |
77 return x; | |
78 } | |
79 | |
80 | |
81 template <bool capture_raw> | |
82 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value) { | |
83 uc32 x = 0; | |
84 int d = HexValue(c0_); | |
85 if (d < 0) { | |
86 return -1; | |
87 } | |
88 while (d >= 0) { | |
89 x = x * 16 + d; | |
90 if (x > max_value) return -1; | |
91 Advance<capture_raw>(); | |
92 d = HexValue(c0_); | |
93 } | |
94 return x; | |
95 } | |
96 | |
97 | |
98 // Ensure that tokens can be stored in a byte. | |
99 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); | |
100 | |
101 // Table of one-character tokens, by character (0x00..0x7f only). | |
102 static const byte one_char_tokens[] = { | |
103 Token::ILLEGAL, | |
104 Token::ILLEGAL, | |
105 Token::ILLEGAL, | |
106 Token::ILLEGAL, | |
107 Token::ILLEGAL, | |
108 Token::ILLEGAL, | |
109 Token::ILLEGAL, | |
110 Token::ILLEGAL, | |
111 Token::ILLEGAL, | |
112 Token::ILLEGAL, | |
113 Token::ILLEGAL, | |
114 Token::ILLEGAL, | |
115 Token::ILLEGAL, | |
116 Token::ILLEGAL, | |
117 Token::ILLEGAL, | |
118 Token::ILLEGAL, | |
119 Token::ILLEGAL, | |
120 Token::ILLEGAL, | |
121 Token::ILLEGAL, | |
122 Token::ILLEGAL, | |
123 Token::ILLEGAL, | |
124 Token::ILLEGAL, | |
125 Token::ILLEGAL, | |
126 Token::ILLEGAL, | |
127 Token::ILLEGAL, | |
128 Token::ILLEGAL, | |
129 Token::ILLEGAL, | |
130 Token::ILLEGAL, | |
131 Token::ILLEGAL, | |
132 Token::ILLEGAL, | |
133 Token::ILLEGAL, | |
134 Token::ILLEGAL, | |
135 Token::ILLEGAL, | |
136 Token::ILLEGAL, | |
137 Token::ILLEGAL, | |
138 Token::ILLEGAL, | |
139 Token::ILLEGAL, | |
140 Token::ILLEGAL, | |
141 Token::ILLEGAL, | |
142 Token::ILLEGAL, | |
143 Token::LPAREN, // 0x28 | |
144 Token::RPAREN, // 0x29 | |
145 Token::ILLEGAL, | |
146 Token::ILLEGAL, | |
147 Token::COMMA, // 0x2c | |
148 Token::ILLEGAL, | |
149 Token::ILLEGAL, | |
150 Token::ILLEGAL, | |
151 Token::ILLEGAL, | |
152 Token::ILLEGAL, | |
153 Token::ILLEGAL, | |
154 Token::ILLEGAL, | |
155 Token::ILLEGAL, | |
156 Token::ILLEGAL, | |
157 Token::ILLEGAL, | |
158 Token::ILLEGAL, | |
159 Token::ILLEGAL, | |
160 Token::ILLEGAL, | |
161 Token::COLON, // 0x3a | |
162 Token::SEMICOLON, // 0x3b | |
163 Token::ILLEGAL, | |
164 Token::ILLEGAL, | |
165 Token::ILLEGAL, | |
166 Token::CONDITIONAL, // 0x3f | |
167 Token::ILLEGAL, | |
168 Token::ILLEGAL, | |
169 Token::ILLEGAL, | |
170 Token::ILLEGAL, | |
171 Token::ILLEGAL, | |
172 Token::ILLEGAL, | |
173 Token::ILLEGAL, | |
174 Token::ILLEGAL, | |
175 Token::ILLEGAL, | |
176 Token::ILLEGAL, | |
177 Token::ILLEGAL, | |
178 Token::ILLEGAL, | |
179 Token::ILLEGAL, | |
180 Token::ILLEGAL, | |
181 Token::ILLEGAL, | |
182 Token::ILLEGAL, | |
183 Token::ILLEGAL, | |
184 Token::ILLEGAL, | |
185 Token::ILLEGAL, | |
186 Token::ILLEGAL, | |
187 Token::ILLEGAL, | |
188 Token::ILLEGAL, | |
189 Token::ILLEGAL, | |
190 Token::ILLEGAL, | |
191 Token::ILLEGAL, | |
192 Token::ILLEGAL, | |
193 Token::ILLEGAL, | |
194 Token::LBRACK, // 0x5b | |
195 Token::ILLEGAL, | |
196 Token::RBRACK, // 0x5d | |
197 Token::ILLEGAL, | |
198 Token::ILLEGAL, | |
199 Token::ILLEGAL, | |
200 Token::ILLEGAL, | |
201 Token::ILLEGAL, | |
202 Token::ILLEGAL, | |
203 Token::ILLEGAL, | |
204 Token::ILLEGAL, | |
205 Token::ILLEGAL, | |
206 Token::ILLEGAL, | |
207 Token::ILLEGAL, | |
208 Token::ILLEGAL, | |
209 Token::ILLEGAL, | |
210 Token::ILLEGAL, | |
211 Token::ILLEGAL, | |
212 Token::ILLEGAL, | |
213 Token::ILLEGAL, | |
214 Token::ILLEGAL, | |
215 Token::ILLEGAL, | |
216 Token::ILLEGAL, | |
217 Token::ILLEGAL, | |
218 Token::ILLEGAL, | |
219 Token::ILLEGAL, | |
220 Token::ILLEGAL, | |
221 Token::ILLEGAL, | |
222 Token::ILLEGAL, | |
223 Token::ILLEGAL, | |
224 Token::ILLEGAL, | |
225 Token::ILLEGAL, | |
226 Token::LBRACE, // 0x7b | |
227 Token::ILLEGAL, | |
228 Token::RBRACE, // 0x7d | |
229 Token::BIT_NOT, // 0x7e | |
230 Token::ILLEGAL | |
231 }; | |
232 | |
233 | |
234 Token::Value Scanner::Next() { | |
235 if (next_.token == Token::EOS) { | |
236 next_.location.beg_pos = current_.location.beg_pos; | |
237 next_.location.end_pos = current_.location.end_pos; | |
238 } | |
239 current_ = next_; | |
240 if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) { | |
241 next_ = next_next_; | |
242 next_next_.token = Token::UNINITIALIZED; | |
243 return current_.token; | |
244 } | |
245 has_line_terminator_before_next_ = false; | |
246 has_multiline_comment_before_next_ = false; | |
247 if (static_cast<unsigned>(c0_) <= 0x7f) { | |
248 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); | |
249 if (token != Token::ILLEGAL) { | |
250 int pos = source_pos(); | |
251 next_.token = token; | |
252 next_.location.beg_pos = pos; | |
253 next_.location.end_pos = pos + 1; | |
254 Advance(); | |
255 return current_.token; | |
256 } | |
257 } | |
258 Scan(); | |
259 return current_.token; | |
260 } | |
261 | |
262 | |
263 Token::Value Scanner::PeekAhead() { | |
264 if (next_next_.token != Token::UNINITIALIZED) { | |
265 return next_next_.token; | |
266 } | |
267 TokenDesc prev = current_; | |
268 Next(); | |
269 Token::Value ret = next_.token; | |
270 next_next_ = next_; | |
271 next_ = current_; | |
272 current_ = prev; | |
273 return ret; | |
274 } | |
275 | |
276 | |
277 // TODO(yangguo): check whether this is actually necessary. | |
278 static inline bool IsLittleEndianByteOrderMark(uc32 c) { | |
279 // The Unicode value U+FFFE is guaranteed never to be assigned as a | |
280 // Unicode character; this implies that in a Unicode context the | |
281 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF | |
282 // character expressed in little-endian byte order (since it could | |
283 // not be a U+FFFE character expressed in big-endian byte | |
284 // order). Nevertheless, we check for it to be compatible with | |
285 // Spidermonkey. | |
286 return c == 0xFFFE; | |
287 } | |
288 | |
289 | |
290 bool Scanner::SkipWhiteSpace() { | |
291 int start_position = source_pos(); | |
292 | |
293 while (true) { | |
294 while (true) { | |
295 // The unicode cache accepts unsigned inputs. | |
296 if (c0_ < 0) break; | |
297 // Advance as long as character is a WhiteSpace or LineTerminator. | |
298 // Remember if the latter is the case. | |
299 if (unicode_cache_->IsLineTerminator(c0_)) { | |
300 has_line_terminator_before_next_ = true; | |
301 } else if (!unicode_cache_->IsWhiteSpace(c0_) && | |
302 !IsLittleEndianByteOrderMark(c0_)) { | |
303 break; | |
304 } | |
305 Advance(); | |
306 } | |
307 | |
308 // If there is an HTML comment end '-->' at the beginning of a | |
309 // line (with only whitespace in front of it), we treat the rest | |
310 // of the line as a comment. This is in line with the way | |
311 // SpiderMonkey handles it. | |
312 if (c0_ == '-' && has_line_terminator_before_next_) { | |
313 Advance(); | |
314 if (c0_ == '-') { | |
315 Advance(); | |
316 if (c0_ == '>') { | |
317 // Treat the rest of the line as a comment. | |
318 SkipSingleLineComment(); | |
319 // Continue skipping white space after the comment. | |
320 continue; | |
321 } | |
322 PushBack('-'); // undo Advance() | |
323 } | |
324 PushBack('-'); // undo Advance() | |
325 } | |
326 // Return whether or not we skipped any characters. | |
327 return source_pos() != start_position; | |
328 } | |
329 } | |
330 | |
331 | |
332 Token::Value Scanner::SkipSingleLineComment() { | |
333 Advance(); | |
334 | |
335 // The line terminator at the end of the line is not considered | |
336 // to be part of the single-line comment; it is recognized | |
337 // separately by the lexical grammar and becomes part of the | |
338 // stream of input elements for the syntactic grammar (see | |
339 // ECMA-262, section 7.4). | |
340 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
341 Advance(); | |
342 } | |
343 | |
344 return Token::WHITESPACE; | |
345 } | |
346 | |
347 | |
348 Token::Value Scanner::SkipSourceURLComment() { | |
349 TryToParseSourceURLComment(); | |
350 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
351 Advance(); | |
352 } | |
353 | |
354 return Token::WHITESPACE; | |
355 } | |
356 | |
357 | |
358 void Scanner::TryToParseSourceURLComment() { | |
359 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this | |
360 // function will just return if it cannot parse a magic comment. | |
361 if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return; | |
362 Advance(); | |
363 LiteralBuffer name; | |
364 while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && | |
365 c0_ != '=') { | |
366 name.AddChar(c0_); | |
367 Advance(); | |
368 } | |
369 if (!name.is_one_byte()) return; | |
370 Vector<const uint8_t> name_literal = name.one_byte_literal(); | |
371 LiteralBuffer* value; | |
372 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) { | |
373 value = &source_url_; | |
374 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) { | |
375 value = &source_mapping_url_; | |
376 } else { | |
377 return; | |
378 } | |
379 if (c0_ != '=') | |
380 return; | |
381 Advance(); | |
382 value->Reset(); | |
383 while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) { | |
384 Advance(); | |
385 } | |
386 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
387 // Disallowed characters. | |
388 if (c0_ == '"' || c0_ == '\'') { | |
389 value->Reset(); | |
390 return; | |
391 } | |
392 if (unicode_cache_->IsWhiteSpace(c0_)) { | |
393 break; | |
394 } | |
395 value->AddChar(c0_); | |
396 Advance(); | |
397 } | |
398 // Allow whitespace at the end. | |
399 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
400 if (!unicode_cache_->IsWhiteSpace(c0_)) { | |
401 value->Reset(); | |
402 break; | |
403 } | |
404 Advance(); | |
405 } | |
406 } | |
407 | |
408 | |
409 Token::Value Scanner::SkipMultiLineComment() { | |
410 DCHECK(c0_ == '*'); | |
411 Advance(); | |
412 | |
413 while (c0_ >= 0) { | |
414 uc32 ch = c0_; | |
415 Advance(); | |
416 if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) { | |
417 // Following ECMA-262, section 7.4, a comment containing | |
418 // a newline will make the comment count as a line-terminator. | |
419 has_multiline_comment_before_next_ = true; | |
420 } | |
421 // If we have reached the end of the multi-line comment, we | |
422 // consume the '/' and insert a whitespace. This way all | |
423 // multi-line comments are treated as whitespace. | |
424 if (ch == '*' && c0_ == '/') { | |
425 c0_ = ' '; | |
426 return Token::WHITESPACE; | |
427 } | |
428 } | |
429 | |
430 // Unterminated multi-line comment. | |
431 return Token::ILLEGAL; | |
432 } | |
433 | |
434 | |
435 Token::Value Scanner::ScanHtmlComment() { | |
436 // Check for <!-- comments. | |
437 DCHECK(c0_ == '!'); | |
438 Advance(); | |
439 if (c0_ == '-') { | |
440 Advance(); | |
441 if (c0_ == '-') return SkipSingleLineComment(); | |
442 PushBack('-'); // undo Advance() | |
443 } | |
444 PushBack('!'); // undo Advance() | |
445 DCHECK(c0_ == '!'); | |
446 return Token::LT; | |
447 } | |
448 | |
449 | |
// Scans the next token from the input into next_. The literal buffers of
// next_ are detached first; the dispatch loop then repeats for as long as
// the scanned piece is only white space or a comment (Token::WHITESPACE).
// On exit next_.location spans the token and next_.token holds its kind.
//
// NOTE(review): Select() is defined outside this chunk; from its use here it
// presumably consumes the current character (and, in the three-argument
// form, an optional matching follow-up character) - confirm in scanner.h.
void Scanner::Scan() {
  next_.literal_chars = NULL;
  next_.raw_literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == === =>
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          Advance();
          if (c0_ == '@' || c0_ == '#') {
            // "//@" or "//#" may introduce a sourceURL / sourceMappingURL
            // magic comment.
            Advance();
            token = SkipSourceURLComment();
          } else {
            // SkipSingleLineComment() starts with an Advance() of its own,
            // so hand the character just read back first.
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
          // "..." is a single ELLIPSIS token; a lone ".." is not a token, so
          // the second '.' is pushed back and only PERIOD is produced.
          if (c0_ == '.') {
            Advance();
            if (c0_ == '.') {
              Advance();
              token = Token::ELLIPSIS;
            } else {
              PushBack('.');
            }
          }
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      case '`':
        token = ScanTemplateStart();
        break;

      default:
        // Negative c0_ means end of input. Everything else is classified
        // with the unicode cache; white space that is not handled by the
        // cases above is skipped by SkipWhiteSpace().
        if (c0_ < 0) {
          token = Token::EOS;
        } else if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
711 | |
712 | |
713 void Scanner::SeekForward(int pos) { | |
714 // After this call, we will have the token at the given position as | |
715 // the "next" token. The "current" token will be invalid. | |
716 if (pos == next_.location.beg_pos) return; | |
717 int current_pos = source_pos(); | |
718 DCHECK_EQ(next_.location.end_pos, current_pos); | |
719 // Positions inside the lookahead token aren't supported. | |
720 DCHECK(pos >= current_pos); | |
721 if (pos != current_pos) { | |
722 source_->SeekForward(pos - source_->pos()); | |
723 Advance(); | |
724 // This function is only called to seek to the location | |
725 // of the end of a function (at the "}" token). It doesn't matter | |
726 // whether there was a line terminator in the part we skip. | |
727 has_line_terminator_before_next_ = false; | |
728 has_multiline_comment_before_next_ = false; | |
729 } | |
730 Scan(); | |
731 } | |
732 | |
733 | |
734 template <bool capture_raw, bool in_template_literal> | |
735 bool Scanner::ScanEscape() { | |
736 uc32 c = c0_; | |
737 Advance<capture_raw>(); | |
738 | |
739 // Skip escaped newlines. | |
740 if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) { | |
741 // Allow CR+LF newlines in multiline string literals. | |
742 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>(); | |
743 // Allow LF+CR newlines in multiline string literals. | |
744 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>(); | |
745 return true; | |
746 } | |
747 | |
748 switch (c) { | |
749 case '\'': // fall through | |
750 case '"' : // fall through | |
751 case '\\': break; | |
752 case 'b' : c = '\b'; break; | |
753 case 'f' : c = '\f'; break; | |
754 case 'n' : c = '\n'; break; | |
755 case 'r' : c = '\r'; break; | |
756 case 't' : c = '\t'; break; | |
757 case 'u' : { | |
758 c = ScanUnicodeEscape<capture_raw>(); | |
759 if (c < 0) return false; | |
760 break; | |
761 } | |
762 case 'v': | |
763 c = '\v'; | |
764 break; | |
765 case 'x': { | |
766 c = ScanHexNumber<capture_raw>(2); | |
767 if (c < 0) return false; | |
768 break; | |
769 } | |
770 case '0': // Fall through. | |
771 case '1': // fall through | |
772 case '2': // fall through | |
773 case '3': // fall through | |
774 case '4': // fall through | |
775 case '5': // fall through | |
776 case '6': // fall through | |
777 case '7': | |
778 c = ScanOctalEscape<capture_raw>(c, 2); | |
779 break; | |
780 } | |
781 | |
782 // According to ECMA-262, section 7.8.4, characters not covered by the | |
783 // above cases should be illegal, but they are commonly handled as | |
784 // non-escaped characters by JS VMs. | |
785 AddLiteralChar(c); | |
786 return true; | |
787 } | |
788 | |
789 | |
790 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of | |
791 // ECMA-262. Other JS VMs support them. | |
792 template <bool capture_raw> | |
793 uc32 Scanner::ScanOctalEscape(uc32 c, int length) { | |
794 uc32 x = c - '0'; | |
795 int i = 0; | |
796 for (; i < length; i++) { | |
797 int d = c0_ - '0'; | |
798 if (d < 0 || d > 7) break; | |
799 int nx = x * 8 + d; | |
800 if (nx >= 256) break; | |
801 x = nx; | |
802 Advance<capture_raw>(); | |
803 } | |
804 // Anything except '\0' is an octal escape sequence, illegal in strict mode. | |
805 // Remember the position of octal escape sequences so that an error | |
806 // can be reported later (in strict mode). | |
807 // We don't report the error immediately, because the octal escape can | |
808 // occur before the "use strict" directive. | |
809 if (c != '0' || i > 0) { | |
810 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); | |
811 } | |
812 return x; | |
813 } | |
814 | |
815 | |
// Largest character code handled by the fast path of ScanString().
const int kMaxAscii = 127;


// Scans a string literal; the current character is the opening quote.
// Returns Token::STRING when the matching closing quote is found, and
// Token::ILLEGAL when the input ends, a line terminator is met inside the
// literal, or an escape sequence is malformed.
Token::Value Scanner::ScanString() {
  uc32 quote = c0_;
  Advance<false, false>();  // consume quote

  LiteralScope literal(this);
  // Fast path: characters up to kMaxAscii without escapes. The loop is left
  // at the first character above kMaxAscii or at a backslash, and the
  // general loop below takes over.
  // NOTE(review): the meaning of the Advance<> template arguments and of
  // HandleLeadSurrogate() is not visible in this chunk - presumably the fast
  // path advances without surrogate checks and catches up once a non-ASCII
  // character is seen; confirm in scanner.h.
  while (true) {
    if (c0_ > kMaxAscii) {
      HandleLeadSurrogate();
      break;
    }
    if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
    if (c0_ == quote) {
      literal.Complete();
      Advance<false, false>();
      return Token::STRING;
    }
    uc32 c = c0_;
    // The backslash is not consumed here; the loop below reads it again.
    if (c == '\\') break;
    Advance<false, false>();
    AddLiteralChar(c);
  }

  // General path: handles escapes and non-ASCII characters.
  while (c0_ != quote && c0_ >= 0
         && !unicode_cache_->IsLineTerminator(c0_)) {
    uc32 c = c0_;
    Advance();
    if (c == '\\') {
      if (c0_ < 0 || !ScanEscape<false, false>()) return Token::ILLEGAL;
    } else {
      AddLiteralChar(c);
    }
  }
  // The loop stopped at end of input or at a line terminator.
  if (c0_ != quote) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // consume quote
  return Token::STRING;
}
857 | |
858 | |
// Scans the literal portion of a template up to the next "${" or the
// closing '`', filling both the cooked literal and the raw literal of
// next_. Returns Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL, or
// Token::ILLEGAL for a malformed escape. On the non-ILLEGAL paths
// next_.token and next_.location.end_pos are updated as well.
Token::Value Scanner::ScanTemplateSpan() {
  // When scanning a TemplateSpan, we are looking for the following construct:
  // TEMPLATE_SPAN ::
  //     ` LiteralChars* ${
  //   | } LiteralChars* ${
  //
  // TEMPLATE_TAIL ::
  //     ` LiteralChars* `
  //   | } LiteralChar* `
  //
  // A TEMPLATE_SPAN should always be followed by an Expression, while a
  // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
  // followed by an Expression.

  Token::Value result = Token::TEMPLATE_SPAN;
  LiteralScope literal(this);
  StartRawLiteral();
  const bool capture_raw = true;
  const bool in_template_literal = true;

  while (true) {
    uc32 c = c0_;
    Advance<capture_raw>();
    if (c == '`') {
      result = Token::TEMPLATE_TAIL;
      // The closing '`' is not part of the raw literal.
      ReduceRawLiteralLength(1);
      break;
    } else if (c == '$' && c0_ == '{') {
      Advance<capture_raw>();  // Consume '{'
      // Neither '$' nor '{' is part of the raw literal.
      ReduceRawLiteralLength(2);
      break;
    } else if (c == '\\') {
      if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
        // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
        // code unit sequence.
        uc32 lastChar = c0_;
        Advance<capture_raw>();
        // In the raw literal, <CR> and <CR><LF> are normalized to <LF>.
        if (lastChar == '\r') {
          ReduceRawLiteralLength(1);  // Remove \r
          if (c0_ == '\n') {
            Advance<capture_raw>();  // Adds \n
          } else {
            AddRawLiteralChar('\n');
          }
        }
      } else if (!ScanEscape<capture_raw, in_template_literal>()) {
        return Token::ILLEGAL;
      }
    } else if (c < 0) {
      // Unterminated template literal
      // NOTE(review): the loop is left with result still TEMPLATE_SPAN;
      // presumably the caller detects the end of input afterwards - confirm.
      PushBack(c);
      break;
    } else {
      // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
      // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
      // consisting of the CV 0x000A.
      if (c == '\r') {
        ReduceRawLiteralLength(1);  // Remove \r
        if (c0_ == '\n') {
          Advance<capture_raw>();  // Adds \n
        } else {
          AddRawLiteralChar('\n');
        }
        c = '\n';
      }
      AddLiteralChar(c);
    }
  }
  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;
  return result;
}
932 | |
933 | |
// Starts scanning a template literal; the current character must be the
// opening '`'. Records the token start, consumes the backtick and scans the
// first span.
Token::Value Scanner::ScanTemplateStart() {
  DCHECK(c0_ == '`');
  next_.location.beg_pos = source_pos();
  Advance();  // Consume `
  return ScanTemplateSpan();
}
940 | |
941 | |
// Resumes scanning a template literal after a substitution. The lookahead
// token must be the '}' that closes the substitution; the new token starts
// at that brace.
Token::Value Scanner::ScanTemplateContinuation() {
  DCHECK_EQ(next_.token, Token::RBRACE);
  next_.location.beg_pos = source_pos() - 1;  // We already consumed }
  return ScanTemplateSpan();
}
947 | |
948 | |
949 void Scanner::ScanDecimalDigits() { | |
950 while (IsDecimalDigit(c0_)) | |
951 AddLiteralCharAdvance(); | |
952 } | |
953 | |
954 | |
// Scans a numeric literal into the current literal buffer. |seen_period| is
// true when the caller has already consumed a leading '.', so that the
// current character is the first digit of the fraction.
// Returns Token::SMI (with next_.smi_value_ set) for a plain decimal integer
// of at most 10 digits that is within Smi::kMaxValue, Token::NUMBER for any
// other well-formed literal, and Token::ILLEGAL for a malformed one.
Token::Value Scanner::ScanNumber(bool seen_period) {
  DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;

  LiteralScope literal(this);
  // at_start stays true only while the literal can still turn out to be a
  // plain decimal integer, which is eligible for the SMI fast path below.
  bool at_start = !seen_period;
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      int start_pos = source_pos();  // For reporting octal positions.
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'o' || c0_ == 'O') {
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'b' || c0_ == 'B') {
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          // An '8' or '9' turns the literal back into a decimal number; the
          // digits consumed so far are already in the literal buffer, so the
          // SMI fast path (which accumulates the value itself) is disabled.
          if (c0_ == '8' || c0_ == '9') {
            at_start = false;
            kind = DECIMAL;
            break;
          }
          if (c0_ < '0' || '7' < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      if (at_start) {
        // Accumulate the value while copying the digits, so that small
        // integers can be returned as Token::SMI directly.
        uint64_t value = 0;
        while (IsDecimalDigit(c0_)) {
          value = 10 * value + (c0_ - '0');

          uc32 first_char = c0_;
          Advance<false, false>();
          AddLiteralChar(first_char);
        }

        // The digit-count limit is checked before the value, so |value| is
        // only trusted for literals of at most 10 digits.
        if (next_.literal_chars->one_byte_literal().length() <= 10 &&
            value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
          next_.smi_value_ = static_cast<int>(value);
          literal.Complete();
          HandleLeadSurrogate();

          return Token::SMI;
        }
        HandleLeadSurrogate();
      }

      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (kind != DECIMAL) return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
    return Token::ILLEGAL;

  literal.Complete();

  return Token::NUMBER;
}
1083 | |
1084 | |
1085 uc32 Scanner::ScanIdentifierUnicodeEscape() { | |
1086 Advance(); | |
1087 if (c0_ != 'u') return -1; | |
1088 Advance(); | |
1089 return ScanUnicodeEscape<false>(); | |
1090 } | |
1091 | |
1092 | |
1093 template <bool capture_raw> | |
1094 uc32 Scanner::ScanUnicodeEscape() { | |
1095 // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of | |
1096 // hex digits between { } is arbitrary. \ and u have already been read. | |
1097 if (c0_ == '{') { | |
1098 Advance<capture_raw>(); | |
1099 uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff); | |
1100 if (cp < 0) { | |
1101 return -1; | |
1102 } | |
1103 if (c0_ != '}') { | |
1104 return -1; | |
1105 } | |
1106 Advance<capture_raw>(); | |
1107 return cp; | |
1108 } | |
1109 return ScanHexNumber<capture_raw>(4); | |
1110 } | |
1111 | |
1112 | |
1113 // ---------------------------------------------------------------------------- | |
1114 // Keyword Matcher | |
1115 | |
// X-macro table of every word the scanner maps to a dedicated token, grouped
// by first character. The user supplies two macros:
//   KEYWORD_GROUP(ch)       - expanded once per distinct initial character,
//                             before the keywords that start with it.
//   KEYWORD(text, token)    - expanded once per keyword with its Token value.
// KeywordOrIdentifierToken() below instantiates the table to build a switch
// over the first input character; it statically asserts that every keyword
// is between 2 and 10 characters long, so new entries must respect that.
1116 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ | |
1117 KEYWORD_GROUP('b') \ | |
1118 KEYWORD("break", Token::BREAK) \ | |
1119 KEYWORD_GROUP('c') \ | |
1120 KEYWORD("case", Token::CASE) \ | |
1121 KEYWORD("catch", Token::CATCH) \ | |
1122 KEYWORD("class", Token::CLASS) \ | |
1123 KEYWORD("const", Token::CONST) \ | |
1124 KEYWORD("continue", Token::CONTINUE) \ | |
1125 KEYWORD_GROUP('d') \ | |
1126 KEYWORD("debugger", Token::DEBUGGER) \ | |
1127 KEYWORD("default", Token::DEFAULT) \ | |
1128 KEYWORD("delete", Token::DELETE) \ | |
1129 KEYWORD("do", Token::DO) \ | |
1130 KEYWORD_GROUP('e') \ | |
1131 KEYWORD("else", Token::ELSE) \ | |
1132 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ | |
1133 KEYWORD("export", Token::EXPORT) \ | |
1134 KEYWORD("extends", Token::EXTENDS) \ | |
1135 KEYWORD_GROUP('f') \ | |
1136 KEYWORD("false", Token::FALSE_LITERAL) \ | |
1137 KEYWORD("finally", Token::FINALLY) \ | |
1138 KEYWORD("for", Token::FOR) \ | |
1139 KEYWORD("function", Token::FUNCTION) \ | |
1140 KEYWORD_GROUP('i') \ | |
1141 KEYWORD("if", Token::IF) \ | |
1142 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1143 KEYWORD("import", Token::IMPORT) \ | |
1144 KEYWORD("in", Token::IN) \ | |
1145 KEYWORD("instanceof", Token::INSTANCEOF) \ | |
1146 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1147 KEYWORD_GROUP('l') \ | |
1148 KEYWORD("let", Token::LET) \ | |
1149 KEYWORD_GROUP('n') \ | |
1150 KEYWORD("new", Token::NEW) \ | |
1151 KEYWORD("null", Token::NULL_LITERAL) \ | |
1152 KEYWORD_GROUP('p') \ | |
1153 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1154 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1155 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1156 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ | |
1157 KEYWORD_GROUP('r') \ | |
1158 KEYWORD("return", Token::RETURN) \ | |
1159 KEYWORD_GROUP('s') \ | |
1160 KEYWORD("static", Token::STATIC) \ | |
1161 KEYWORD("super", Token::SUPER) \ | |
1162 KEYWORD("switch", Token::SWITCH) \ | |
1163 KEYWORD_GROUP('t') \ | |
1164 KEYWORD("this", Token::THIS) \ | |
1165 KEYWORD("throw", Token::THROW) \ | |
1166 KEYWORD("true", Token::TRUE_LITERAL) \ | |
1167 KEYWORD("try", Token::TRY) \ | |
1168 KEYWORD("typeof", Token::TYPEOF) \ | |
1169 KEYWORD_GROUP('v') \ | |
1170 KEYWORD("var", Token::VAR) \ | |
1171 KEYWORD("void", Token::VOID) \ | |
1172 KEYWORD_GROUP('w') \ | |
1173 KEYWORD("while", Token::WHILE) \ | |
1174 KEYWORD("with", Token::WITH) \ | |
1175 KEYWORD_GROUP('y') \ | |
1176 KEYWORD("yield", Token::YIELD) | |
1177 | |
1178 | |
1179 static Token::Value KeywordOrIdentifierToken(const uint8_t* input, | |
1180 int input_length, bool escaped) { | |
1181 DCHECK(input_length >= 1); | |
1182 const int kMinLength = 2; | |
1183 const int kMaxLength = 10; | |
1184 if (input_length < kMinLength || input_length > kMaxLength) { | |
1185 return Token::IDENTIFIER; | |
1186 } | |
1187 switch (input[0]) { | |
1188 default: | |
1189 #define KEYWORD_GROUP_CASE(ch) \ | |
1190 break; \ | |
1191 case ch: | |
1192 #define KEYWORD(keyword, token) \ | |
1193 { \ | |
1194 /* 'keyword' is a char array, so sizeof(keyword) is */ \ | |
1195 /* strlen(keyword) plus 1 for the NUL char. */ \ | |
1196 const int keyword_length = sizeof(keyword) - 1; \ | |
1197 STATIC_ASSERT(keyword_length >= kMinLength); \ | |
1198 STATIC_ASSERT(keyword_length <= kMaxLength); \ | |
1199 if (input_length == keyword_length && input[1] == keyword[1] && \ | |
1200 (keyword_length <= 2 || input[2] == keyword[2]) && \ | |
1201 (keyword_length <= 3 || input[3] == keyword[3]) && \ | |
1202 (keyword_length <= 4 || input[4] == keyword[4]) && \ | |
1203 (keyword_length <= 5 || input[5] == keyword[5]) && \ | |
1204 (keyword_length <= 6 || input[6] == keyword[6]) && \ | |
1205 (keyword_length <= 7 || input[7] == keyword[7]) && \ | |
1206 (keyword_length <= 8 || input[8] == keyword[8]) && \ | |
1207 (keyword_length <= 9 || input[9] == keyword[9])) { \ | |
1208 if (escaped) { \ | |
1209 return token == Token::FUTURE_STRICT_RESERVED_WORD \ | |
1210 ? Token::ESCAPED_STRICT_RESERVED_WORD \ | |
1211 : Token::ESCAPED_KEYWORD; \ | |
1212 } \ | |
1213 return token; \ | |
1214 } \ | |
1215 } | |
1216 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) | |
1217 } | |
1218 return Token::IDENTIFIER; | |
1219 } | |
1220 | |
1221 | |
1222 bool Scanner::IdentifierIsFutureStrictReserved( | |
1223 const AstRawString* string) const { | |
1224 // Keywords are always 1-byte strings. | |
1225 if (!string->is_one_byte()) return false; | |
1226 if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") || | |
1227 string->IsOneByteEqualTo("yield")) { | |
1228 return true; | |
1229 } | |
1230 return Token::FUTURE_STRICT_RESERVED_WORD == | |
1231 KeywordOrIdentifierToken(string->raw_data(), string->length(), false); | |
1232 } | |
1233 | |
1234 | |
1235 Token::Value Scanner::ScanIdentifierOrKeyword() { | |
1236 DCHECK(unicode_cache_->IsIdentifierStart(c0_)); | |
1237 LiteralScope literal(this); | |
1238 if (IsInRange(c0_, 'a', 'z')) { | |
1239 do { | |
1240 uc32 first_char = c0_; | |
1241 Advance<false, false>(); | |
1242 AddLiteralChar(first_char); | |
1243 } while (IsInRange(c0_, 'a', 'z')); | |
1244 | |
1245 if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' || | |
1246 c0_ == '$') { | |
1247 // Identifier starting with lowercase. | |
1248 uc32 first_char = c0_; | |
1249 Advance<false, false>(); | |
1250 AddLiteralChar(first_char); | |
1251 while (IsAsciiIdentifier(c0_)) { | |
1252 uc32 first_char = c0_; | |
1253 Advance<false, false>(); | |
1254 AddLiteralChar(first_char); | |
1255 } | |
1256 if (c0_ <= kMaxAscii && c0_ != '\\') { | |
1257 literal.Complete(); | |
1258 return Token::IDENTIFIER; | |
1259 } | |
1260 } else if (c0_ <= kMaxAscii && c0_ != '\\') { | |
1261 // Only a-z+: could be a keyword or identifier. | |
1262 literal.Complete(); | |
1263 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); | |
1264 return KeywordOrIdentifierToken(chars.start(), chars.length(), false); | |
1265 } | |
1266 | |
1267 HandleLeadSurrogate(); | |
1268 } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') { | |
1269 do { | |
1270 uc32 first_char = c0_; | |
1271 Advance<false, false>(); | |
1272 AddLiteralChar(first_char); | |
1273 } while (IsAsciiIdentifier(c0_)); | |
1274 | |
1275 if (c0_ <= kMaxAscii && c0_ != '\\') { | |
1276 literal.Complete(); | |
1277 return Token::IDENTIFIER; | |
1278 } | |
1279 | |
1280 HandleLeadSurrogate(); | |
1281 } else if (c0_ == '\\') { | |
1282 // Scan identifier start character. | |
1283 uc32 c = ScanIdentifierUnicodeEscape(); | |
1284 // Only allow legal identifier start characters. | |
1285 if (c < 0 || | |
1286 c == '\\' || // No recursive escapes. | |
1287 !unicode_cache_->IsIdentifierStart(c)) { | |
1288 return Token::ILLEGAL; | |
1289 } | |
1290 AddLiteralChar(c); | |
1291 return ScanIdentifierSuffix(&literal, true); | |
1292 } else { | |
1293 uc32 first_char = c0_; | |
1294 Advance(); | |
1295 AddLiteralChar(first_char); | |
1296 } | |
1297 | |
1298 // Scan the rest of the identifier characters. | |
1299 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { | |
1300 if (c0_ != '\\') { | |
1301 uc32 next_char = c0_; | |
1302 Advance(); | |
1303 AddLiteralChar(next_char); | |
1304 continue; | |
1305 } | |
1306 // Fallthrough if no longer able to complete keyword. | |
1307 return ScanIdentifierSuffix(&literal, false); | |
1308 } | |
1309 | |
1310 literal.Complete(); | |
1311 | |
1312 if (next_.literal_chars->is_one_byte()) { | |
1313 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); | |
1314 return KeywordOrIdentifierToken(chars.start(), chars.length(), false); | |
1315 } | |
1316 return Token::IDENTIFIER; | |
1317 } | |
1318 | |
1319 | |
1320 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal, | |
1321 bool escaped) { | |
1322 // Scan the rest of the identifier characters. | |
1323 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { | |
1324 if (c0_ == '\\') { | |
1325 uc32 c = ScanIdentifierUnicodeEscape(); | |
1326 escaped = true; | |
1327 // Only allow legal identifier part characters. | |
1328 if (c < 0 || | |
1329 c == '\\' || | |
1330 !unicode_cache_->IsIdentifierPart(c)) { | |
1331 return Token::ILLEGAL; | |
1332 } | |
1333 AddLiteralChar(c); | |
1334 } else { | |
1335 AddLiteralChar(c0_); | |
1336 Advance(); | |
1337 } | |
1338 } | |
1339 literal->Complete(); | |
1340 | |
1341 if (escaped && next_.literal_chars->is_one_byte()) { | |
1342 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); | |
1343 return KeywordOrIdentifierToken(chars.start(), chars.length(), true); | |
1344 } | |
1345 return Token::IDENTIFIER; | |
1346 } | |
1347 | |
1348 | |
1349 bool Scanner::ScanRegExpPattern(bool seen_equal) { | |
1350 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags | |
1351 bool in_character_class = false; | |
1352 | |
1353 // Previous token is either '/' or '/=', in the second case, the | |
1354 // pattern starts at =. | |
1355 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); | |
1356 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); | |
1357 | |
1358 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, | |
1359 // the scanner should pass uninterpreted bodies to the RegExp | |
1360 // constructor. | |
1361 LiteralScope literal(this); | |
1362 if (seen_equal) { | |
1363 AddLiteralChar('='); | |
1364 } | |
1365 | |
1366 while (c0_ != '/' || in_character_class) { | |
1367 if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false; | |
1368 if (c0_ == '\\') { // Escape sequence. | |
1369 AddLiteralCharAdvance(); | |
1370 if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false; | |
1371 AddLiteralCharAdvance(); | |
1372 // If the escape allows more characters, i.e., \x??, \u????, or \c?, | |
1373 // only "safe" characters are allowed (letters, digits, underscore), | |
1374 // otherwise the escape isn't valid and the invalid character has | |
1375 // its normal meaning. I.e., we can just continue scanning without | |
1376 // worrying whether the following characters are part of the escape | |
1377 // or not, since any '/', '\\' or '[' is guaranteed to not be part | |
1378 // of the escape sequence. | |
1379 | |
1380 // TODO(896): At some point, parse RegExps more throughly to capture | |
1381 // octal esacpes in strict mode. | |
1382 } else { // Unescaped character. | |
1383 if (c0_ == '[') in_character_class = true; | |
1384 if (c0_ == ']') in_character_class = false; | |
1385 AddLiteralCharAdvance(); | |
1386 } | |
1387 } | |
1388 Advance(); // consume '/' | |
1389 | |
1390 literal.Complete(); | |
1391 | |
1392 return true; | |
1393 } | |
1394 | |
1395 | |
1396 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() { | |
1397 // Scan regular expression flags. | |
1398 LiteralScope literal(this); | |
1399 int flags = 0; | |
1400 while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) { | |
1401 RegExp::Flags flag = RegExp::kNone; | |
1402 switch (c0_) { | |
1403 case 'g': | |
1404 flag = RegExp::kGlobal; | |
1405 break; | |
1406 case 'i': | |
1407 flag = RegExp::kIgnoreCase; | |
1408 break; | |
1409 case 'm': | |
1410 flag = RegExp::kMultiline; | |
1411 break; | |
1412 case 'u': | |
1413 if (!FLAG_harmony_unicode_regexps) return Nothing<RegExp::Flags>(); | |
1414 flag = RegExp::kUnicode; | |
1415 break; | |
1416 case 'y': | |
1417 if (!FLAG_harmony_regexps) return Nothing<RegExp::Flags>(); | |
1418 flag = RegExp::kSticky; | |
1419 break; | |
1420 default: | |
1421 return Nothing<RegExp::Flags>(); | |
1422 } | |
1423 if (flags & flag) return Nothing<RegExp::Flags>(); | |
1424 AddLiteralCharAdvance(); | |
1425 flags |= flag; | |
1426 } | |
1427 literal.Complete(); | |
1428 | |
1429 next_.location.end_pos = source_pos(); | |
1430 return Just(RegExp::Flags(flags)); | |
1431 } | |
1432 | |
1433 | |
1434 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) { | |
1435 if (is_literal_one_byte()) { | |
1436 return ast_value_factory->GetOneByteString(literal_one_byte_string()); | |
1437 } | |
1438 return ast_value_factory->GetTwoByteString(literal_two_byte_string()); | |
1439 } | |
1440 | |
1441 | |
1442 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) { | |
1443 if (is_next_literal_one_byte()) { | |
1444 return ast_value_factory->GetOneByteString(next_literal_one_byte_string()); | |
1445 } | |
1446 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string()); | |
1447 } | |
1448 | |
1449 | |
1450 const AstRawString* Scanner::CurrentRawSymbol( | |
1451 AstValueFactory* ast_value_factory) { | |
1452 if (is_raw_literal_one_byte()) { | |
1453 return ast_value_factory->GetOneByteString(raw_literal_one_byte_string()); | |
1454 } | |
1455 return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string()); | |
1456 } | |
1457 | |
1458 | |
1459 double Scanner::DoubleValue() { | |
1460 DCHECK(is_literal_one_byte()); | |
1461 return StringToDouble( | |
1462 unicode_cache_, | |
1463 literal_one_byte_string(), | |
1464 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY); | |
1465 } | |
1466 | |
1467 | |
1468 bool Scanner::ContainsDot() { | |
1469 DCHECK(is_literal_one_byte()); | |
1470 Vector<const uint8_t> str = literal_one_byte_string(); | |
1471 return std::find(str.begin(), str.end(), '.') != str.end(); | |
1472 } | |
1473 | |
1474 | |
1475 int Scanner::FindSymbol(DuplicateFinder* finder, int value) { | |
1476 if (is_literal_one_byte()) { | |
1477 return finder->AddOneByteSymbol(literal_one_byte_string(), value); | |
1478 } | |
1479 return finder->AddTwoByteSymbol(literal_two_byte_string(), value); | |
1480 } | |
1481 | |
1482 | |
1483 bool Scanner::SetBookmark() { | |
1484 if (c0_ != kNoBookmark && bookmark_c0_ == kNoBookmark && | |
1485 next_next_.token == Token::UNINITIALIZED && source_->SetBookmark()) { | |
1486 bookmark_c0_ = c0_; | |
1487 CopyTokenDesc(&bookmark_current_, ¤t_); | |
1488 CopyTokenDesc(&bookmark_next_, &next_); | |
1489 return true; | |
1490 } | |
1491 return false; | |
1492 } | |
1493 | |
1494 | |
// Restores the scanner to the state recorded by SetBookmark(): the stream is
// rewound, c0_ is restored, and the saved current/next tokens are copied
// back. Afterwards bookmark_c0_ holds kBookmarkWasApplied, which is what
// BookmarkHasBeenReset() tests for. The statement order below matters.
1495 void Scanner::ResetToBookmark() { | |
1496 DCHECK(BookmarkHasBeenSet()); // Caller hasn't called SetBookmark. | |
1497 | |
1498 source_->ResetToBookmark(); | |
1499 c0_ = bookmark_c0_; | |
// The saved "current" token is first loaded into next_ and then moved into
// current_ by plain assignment.
// NOTE(review): StartLiteral()/StartRawLiteral() presumably give next_ fresh
// literal buffers before each copy - confirm against their definitions.
1500 StartLiteral(); | |
1501 StartRawLiteral(); | |
1502 CopyTokenDesc(&next_, &bookmark_current_); | |
1503 current_ = next_; | |
// Now load the saved look-ahead token into next_.
1504 StartLiteral(); | |
1505 StartRawLiteral(); | |
1506 CopyTokenDesc(&next_, &bookmark_next_); | |
1507 | |
// Mark the bookmark as consumed.
1508 bookmark_c0_ = kBookmarkWasApplied; | |
1509 } | |
1510 | |
1511 | |
1512 bool Scanner::BookmarkHasBeenSet() { return bookmark_c0_ >= 0; } | |
1513 | |
1514 | |
1515 bool Scanner::BookmarkHasBeenReset() { | |
1516 return bookmark_c0_ == kBookmarkWasApplied; | |
1517 } | |
1518 | |
1519 | |
1520 void Scanner::DropBookmark() { bookmark_c0_ = kNoBookmark; } | |
1521 | |
1522 | |
1523 void Scanner::CopyTokenDesc(TokenDesc* to, TokenDesc* from) { | |
1524 DCHECK_NOT_NULL(to); | |
1525 DCHECK_NOT_NULL(from); | |
1526 to->token = from->token; | |
1527 to->location = from->location; | |
1528 to->literal_chars->CopyFrom(from->literal_chars); | |
1529 to->raw_literal_chars->CopyFrom(from->raw_literal_chars); | |
1530 } | |
1531 | |
1532 | |
1533 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) { | |
1534 return AddSymbol(key, true, value); | |
1535 } | |
1536 | |
1537 | |
1538 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) { | |
1539 return AddSymbol(Vector<const uint8_t>::cast(key), false, value); | |
1540 } | |
1541 | |
1542 | |
1543 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key, | |
1544 bool is_one_byte, | |
1545 int value) { | |
1546 uint32_t hash = Hash(key, is_one_byte); | |
1547 byte* encoding = BackupKey(key, is_one_byte); | |
1548 HashMap::Entry* entry = map_.LookupOrInsert(encoding, hash); | |
1549 int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value)); | |
1550 entry->value = | |
1551 reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value)); | |
1552 return old_value; | |
1553 } | |
1554 | |
1555 | |
1556 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) { | |
1557 DCHECK(key.length() > 0); | |
1558 // Quick check for already being in canonical form. | |
1559 if (IsNumberCanonical(key)) { | |
1560 return AddOneByteSymbol(key, value); | |
1561 } | |
1562 | |
1563 int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY; | |
1564 double double_value = StringToDouble( | |
1565 unicode_constants_, key, flags, 0.0); | |
1566 int length; | |
1567 const char* string; | |
1568 if (!std::isfinite(double_value)) { | |
1569 string = "Infinity"; | |
1570 length = 8; // strlen("Infinity"); | |
1571 } else { | |
1572 string = DoubleToCString(double_value, | |
1573 Vector<char>(number_buffer_, kBufferSize)); | |
1574 length = StrLength(string); | |
1575 } | |
1576 return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string), | |
1577 length), true, value); | |
1578 } | |
1579 | |
1580 | |
1581 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) { | |
1582 // Test for a safe approximation of number literals that are already | |
1583 // in canonical form: max 15 digits, no leading zeroes, except an | |
1584 // integer part that is a single zero, and no trailing zeros below | |
1585 // the decimal point. | |
1586 int pos = 0; | |
1587 int length = number.length(); | |
1588 if (number.length() > 15) return false; | |
1589 if (number[pos] == '0') { | |
1590 pos++; | |
1591 } else { | |
1592 while (pos < length && | |
1593 static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++; | |
1594 } | |
1595 if (length == pos) return true; | |
1596 if (number[pos] != '.') return false; | |
1597 pos++; | |
1598 bool invalid_last_digit = true; | |
1599 while (pos < length) { | |
1600 uint8_t digit = number[pos] - '0'; | |
1601 if (digit > '9' - '0') return false; | |
1602 invalid_last_digit = (digit == 0); | |
1603 pos++; | |
1604 } | |
1605 return !invalid_last_digit; | |
1606 } | |
1607 | |
1608 | |
1609 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) { | |
1610 // Primitive hash function, almost identical to the one used | |
1611 // for strings (except that it's seeded by the length and representation). | |
1612 int length = key.length(); | |
1613 uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0); | |
1614 for (int i = 0; i < length; i++) { | |
1615 uint32_t c = key[i]; | |
1616 hash = (hash + c) * 1025; | |
1617 hash ^= (hash >> 6); | |
1618 } | |
1619 return hash; | |
1620 } | |
1621 | |
1622 | |
1623 bool DuplicateFinder::Match(void* first, void* second) { | |
1624 // Decode lengths. | |
1625 // Length + representation is encoded as base 128, most significant heptet | |
1626 // first, with a 8th bit being non-zero while there are more heptets. | |
1627 // The value encodes the number of bytes following, and whether the original | |
1628 // was Latin1. | |
1629 byte* s1 = reinterpret_cast<byte*>(first); | |
1630 byte* s2 = reinterpret_cast<byte*>(second); | |
1631 uint32_t length_one_byte_field = 0; | |
1632 byte c1; | |
1633 do { | |
1634 c1 = *s1; | |
1635 if (c1 != *s2) return false; | |
1636 length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f); | |
1637 s1++; | |
1638 s2++; | |
1639 } while ((c1 & 0x80) != 0); | |
1640 int length = static_cast<int>(length_one_byte_field >> 1); | |
1641 return memcmp(s1, s2, length) == 0; | |
1642 } | |
1643 | |
1644 | |
1645 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes, | |
1646 bool is_one_byte) { | |
1647 uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0); | |
1648 backing_store_.StartSequence(); | |
1649 // Emit one_byte_length as base-128 encoded number, with the 7th bit set | |
1650 // on the byte of every heptet except the last, least significant, one. | |
1651 if (one_byte_length >= (1 << 7)) { | |
1652 if (one_byte_length >= (1 << 14)) { | |
1653 if (one_byte_length >= (1 << 21)) { | |
1654 if (one_byte_length >= (1 << 28)) { | |
1655 backing_store_.Add( | |
1656 static_cast<uint8_t>((one_byte_length >> 28) | 0x80)); | |
1657 } | |
1658 backing_store_.Add( | |
1659 static_cast<uint8_t>((one_byte_length >> 21) | 0x80u)); | |
1660 } | |
1661 backing_store_.Add( | |
1662 static_cast<uint8_t>((one_byte_length >> 14) | 0x80u)); | |
1663 } | |
1664 backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u)); | |
1665 } | |
1666 backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f)); | |
1667 | |
1668 backing_store_.AddBlock(bytes); | |
1669 return backing_store_.EndSequence().start(); | |
1670 } | |
1671 | |
1672 } // namespace internal | |
1673 } // namespace v8 | |
OLD | NEW |