OLD | NEW |
| (Empty) |
1 // Copyright 2011 the V8 project authors. All rights reserved. | |
2 // Redistribution and use in source and binary forms, with or without | |
3 // modification, are permitted provided that the following conditions are | |
4 // met: | |
5 // | |
6 // * Redistributions of source code must retain the above copyright | |
7 // notice, this list of conditions and the following disclaimer. | |
8 // * Redistributions in binary form must reproduce the above | |
9 // copyright notice, this list of conditions and the following | |
10 // disclaimer in the documentation and/or other materials provided | |
11 // with the distribution. | |
12 // * Neither the name of Google Inc. nor the names of its | |
13 // contributors may be used to endorse or promote products derived | |
14 // from this software without specific prior written permission. | |
15 // | |
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
27 | |
28 // Features shared by parsing and pre-parsing scanners. | |
29 | |
30 #include "../include/v8stdint.h" | |
31 #include "scanner-base.h" | |
32 #include "char-predicates-inl.h" | |
33 | |
34 namespace v8 { | |
35 namespace internal { | |
36 | |
37 // ---------------------------------------------------------------------------- | |
38 // Scanner | |
39 | |
40 Scanner::Scanner(UnicodeCache* unicode_cache) | |
41 : unicode_cache_(unicode_cache) { } | |
42 | |
43 | |
44 uc32 Scanner::ScanHexNumber(int expected_length) { | |
45 ASSERT(expected_length <= 4); // prevent overflow | |
46 | |
47 uc32 digits[4] = { 0, 0, 0, 0 }; | |
48 uc32 x = 0; | |
49 for (int i = 0; i < expected_length; i++) { | |
50 digits[i] = c0_; | |
51 int d = HexValue(c0_); | |
52 if (d < 0) { | |
53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes | |
54 // should be illegal, but other JS VMs just return the | |
55 // non-escaped version of the original character. | |
56 | |
57 // Push back digits that we have advanced past. | |
58 for (int j = i-1; j >= 0; j--) { | |
59 PushBack(digits[j]); | |
60 } | |
61 return -1; | |
62 } | |
63 x = x * 16 + d; | |
64 Advance(); | |
65 } | |
66 | |
67 return x; | |
68 } | |
69 | |
70 | |
71 | |
72 // ---------------------------------------------------------------------------- | |
73 // JavaScriptScanner | |
74 | |
75 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants) | |
76 : Scanner(scanner_contants), | |
77 octal_pos_(Location::invalid()), | |
78 harmony_block_scoping_(false) { } | |
79 | |
80 | |
81 void JavaScriptScanner::Initialize(UC16CharacterStream* source) { | |
82 source_ = source; | |
83 // Need to capture identifiers in order to recognize "get" and "set" | |
84 // in object literals. | |
85 Init(); | |
86 // Skip initial whitespace allowing HTML comment ends just like | |
87 // after a newline and scan first token. | |
88 has_line_terminator_before_next_ = true; | |
89 SkipWhiteSpace(); | |
90 Scan(); | |
91 } | |
92 | |
93 | |
94 // Ensure that tokens can be stored in a byte. | |
95 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); | |
96 | |
97 // Table of one-character tokens, by character (0x00..0x7f only). | |
98 static const byte one_char_tokens[] = { | |
99 Token::ILLEGAL, | |
100 Token::ILLEGAL, | |
101 Token::ILLEGAL, | |
102 Token::ILLEGAL, | |
103 Token::ILLEGAL, | |
104 Token::ILLEGAL, | |
105 Token::ILLEGAL, | |
106 Token::ILLEGAL, | |
107 Token::ILLEGAL, | |
108 Token::ILLEGAL, | |
109 Token::ILLEGAL, | |
110 Token::ILLEGAL, | |
111 Token::ILLEGAL, | |
112 Token::ILLEGAL, | |
113 Token::ILLEGAL, | |
114 Token::ILLEGAL, | |
115 Token::ILLEGAL, | |
116 Token::ILLEGAL, | |
117 Token::ILLEGAL, | |
118 Token::ILLEGAL, | |
119 Token::ILLEGAL, | |
120 Token::ILLEGAL, | |
121 Token::ILLEGAL, | |
122 Token::ILLEGAL, | |
123 Token::ILLEGAL, | |
124 Token::ILLEGAL, | |
125 Token::ILLEGAL, | |
126 Token::ILLEGAL, | |
127 Token::ILLEGAL, | |
128 Token::ILLEGAL, | |
129 Token::ILLEGAL, | |
130 Token::ILLEGAL, | |
131 Token::ILLEGAL, | |
132 Token::ILLEGAL, | |
133 Token::ILLEGAL, | |
134 Token::ILLEGAL, | |
135 Token::ILLEGAL, | |
136 Token::ILLEGAL, | |
137 Token::ILLEGAL, | |
138 Token::ILLEGAL, | |
139 Token::LPAREN, // 0x28 | |
140 Token::RPAREN, // 0x29 | |
141 Token::ILLEGAL, | |
142 Token::ILLEGAL, | |
143 Token::COMMA, // 0x2c | |
144 Token::ILLEGAL, | |
145 Token::ILLEGAL, | |
146 Token::ILLEGAL, | |
147 Token::ILLEGAL, | |
148 Token::ILLEGAL, | |
149 Token::ILLEGAL, | |
150 Token::ILLEGAL, | |
151 Token::ILLEGAL, | |
152 Token::ILLEGAL, | |
153 Token::ILLEGAL, | |
154 Token::ILLEGAL, | |
155 Token::ILLEGAL, | |
156 Token::ILLEGAL, | |
157 Token::COLON, // 0x3a | |
158 Token::SEMICOLON, // 0x3b | |
159 Token::ILLEGAL, | |
160 Token::ILLEGAL, | |
161 Token::ILLEGAL, | |
162 Token::CONDITIONAL, // 0x3f | |
163 Token::ILLEGAL, | |
164 Token::ILLEGAL, | |
165 Token::ILLEGAL, | |
166 Token::ILLEGAL, | |
167 Token::ILLEGAL, | |
168 Token::ILLEGAL, | |
169 Token::ILLEGAL, | |
170 Token::ILLEGAL, | |
171 Token::ILLEGAL, | |
172 Token::ILLEGAL, | |
173 Token::ILLEGAL, | |
174 Token::ILLEGAL, | |
175 Token::ILLEGAL, | |
176 Token::ILLEGAL, | |
177 Token::ILLEGAL, | |
178 Token::ILLEGAL, | |
179 Token::ILLEGAL, | |
180 Token::ILLEGAL, | |
181 Token::ILLEGAL, | |
182 Token::ILLEGAL, | |
183 Token::ILLEGAL, | |
184 Token::ILLEGAL, | |
185 Token::ILLEGAL, | |
186 Token::ILLEGAL, | |
187 Token::ILLEGAL, | |
188 Token::ILLEGAL, | |
189 Token::ILLEGAL, | |
190 Token::LBRACK, // 0x5b | |
191 Token::ILLEGAL, | |
192 Token::RBRACK, // 0x5d | |
193 Token::ILLEGAL, | |
194 Token::ILLEGAL, | |
195 Token::ILLEGAL, | |
196 Token::ILLEGAL, | |
197 Token::ILLEGAL, | |
198 Token::ILLEGAL, | |
199 Token::ILLEGAL, | |
200 Token::ILLEGAL, | |
201 Token::ILLEGAL, | |
202 Token::ILLEGAL, | |
203 Token::ILLEGAL, | |
204 Token::ILLEGAL, | |
205 Token::ILLEGAL, | |
206 Token::ILLEGAL, | |
207 Token::ILLEGAL, | |
208 Token::ILLEGAL, | |
209 Token::ILLEGAL, | |
210 Token::ILLEGAL, | |
211 Token::ILLEGAL, | |
212 Token::ILLEGAL, | |
213 Token::ILLEGAL, | |
214 Token::ILLEGAL, | |
215 Token::ILLEGAL, | |
216 Token::ILLEGAL, | |
217 Token::ILLEGAL, | |
218 Token::ILLEGAL, | |
219 Token::ILLEGAL, | |
220 Token::ILLEGAL, | |
221 Token::ILLEGAL, | |
222 Token::LBRACE, // 0x7b | |
223 Token::ILLEGAL, | |
224 Token::RBRACE, // 0x7d | |
225 Token::BIT_NOT, // 0x7e | |
226 Token::ILLEGAL | |
227 }; | |
228 | |
229 | |
230 Token::Value JavaScriptScanner::Next() { | |
231 current_ = next_; | |
232 has_line_terminator_before_next_ = false; | |
233 has_multiline_comment_before_next_ = false; | |
234 if (static_cast<unsigned>(c0_) <= 0x7f) { | |
235 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); | |
236 if (token != Token::ILLEGAL) { | |
237 int pos = source_pos(); | |
238 next_.token = token; | |
239 next_.location.beg_pos = pos; | |
240 next_.location.end_pos = pos + 1; | |
241 Advance(); | |
242 return current_.token; | |
243 } | |
244 } | |
245 Scan(); | |
246 return current_.token; | |
247 } | |
248 | |
249 | |
250 static inline bool IsByteOrderMark(uc32 c) { | |
251 // The Unicode value U+FFFE is guaranteed never to be assigned as a | |
252 // Unicode character; this implies that in a Unicode context the | |
253 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF | |
254 // character expressed in little-endian byte order (since it could | |
255 // not be a U+FFFE character expressed in big-endian byte | |
256 // order). Nevertheless, we check for it to be compatible with | |
257 // Spidermonkey. | |
258 return c == 0xFEFF || c == 0xFFFE; | |
259 } | |
260 | |
261 | |
262 bool JavaScriptScanner::SkipWhiteSpace() { | |
263 int start_position = source_pos(); | |
264 | |
265 while (true) { | |
266 // We treat byte-order marks (BOMs) as whitespace for better | |
267 // compatibility with Spidermonkey and other JavaScript engines. | |
268 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { | |
269 // IsWhiteSpace() includes line terminators! | |
270 if (unicode_cache_->IsLineTerminator(c0_)) { | |
271 // Ignore line terminators, but remember them. This is necessary | |
272 // for automatic semicolon insertion. | |
273 has_line_terminator_before_next_ = true; | |
274 } | |
275 Advance(); | |
276 } | |
277 | |
278 // If there is an HTML comment end '-->' at the beginning of a | |
279 // line (with only whitespace in front of it), we treat the rest | |
280 // of the line as a comment. This is in line with the way | |
281 // SpiderMonkey handles it. | |
282 if (c0_ == '-' && has_line_terminator_before_next_) { | |
283 Advance(); | |
284 if (c0_ == '-') { | |
285 Advance(); | |
286 if (c0_ == '>') { | |
287 // Treat the rest of the line as a comment. | |
288 SkipSingleLineComment(); | |
289 // Continue skipping white space after the comment. | |
290 continue; | |
291 } | |
292 PushBack('-'); // undo Advance() | |
293 } | |
294 PushBack('-'); // undo Advance() | |
295 } | |
296 // Return whether or not we skipped any characters. | |
297 return source_pos() != start_position; | |
298 } | |
299 } | |
300 | |
301 | |
302 Token::Value JavaScriptScanner::SkipSingleLineComment() { | |
303 Advance(); | |
304 | |
305 // The line terminator at the end of the line is not considered | |
306 // to be part of the single-line comment; it is recognized | |
307 // separately by the lexical grammar and becomes part of the | |
308 // stream of input elements for the syntactic grammar (see | |
309 // ECMA-262, section 7.4). | |
310 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
311 Advance(); | |
312 } | |
313 | |
314 return Token::WHITESPACE; | |
315 } | |
316 | |
317 | |
318 Token::Value JavaScriptScanner::SkipMultiLineComment() { | |
319 ASSERT(c0_ == '*'); | |
320 Advance(); | |
321 | |
322 while (c0_ >= 0) { | |
323 uc32 ch = c0_; | |
324 Advance(); | |
325 if (unicode_cache_->IsLineTerminator(ch)) { | |
326 // Following ECMA-262, section 7.4, a comment containing | |
327 // a newline will make the comment count as a line-terminator. | |
328 has_multiline_comment_before_next_ = true; | |
329 } | |
330 // If we have reached the end of the multi-line comment, we | |
331 // consume the '/' and insert a whitespace. This way all | |
332 // multi-line comments are treated as whitespace. | |
333 if (ch == '*' && c0_ == '/') { | |
334 c0_ = ' '; | |
335 return Token::WHITESPACE; | |
336 } | |
337 } | |
338 | |
339 // Unterminated multi-line comment. | |
340 return Token::ILLEGAL; | |
341 } | |
342 | |
343 | |
344 Token::Value JavaScriptScanner::ScanHtmlComment() { | |
345 // Check for <!-- comments. | |
346 ASSERT(c0_ == '!'); | |
347 Advance(); | |
348 if (c0_ == '-') { | |
349 Advance(); | |
350 if (c0_ == '-') return SkipSingleLineComment(); | |
351 PushBack('-'); // undo Advance() | |
352 } | |
353 PushBack('!'); // undo Advance() | |
354 ASSERT(c0_ == '!'); | |
355 return Token::LT; | |
356 } | |
357 | |
358 | |
359 void JavaScriptScanner::Scan() { | |
360 next_.literal_chars = NULL; | |
361 Token::Value token; | |
362 do { | |
363 // Remember the position of the next token | |
364 next_.location.beg_pos = source_pos(); | |
365 | |
366 switch (c0_) { | |
367 case ' ': | |
368 case '\t': | |
369 Advance(); | |
370 token = Token::WHITESPACE; | |
371 break; | |
372 | |
373 case '\n': | |
374 Advance(); | |
375 has_line_terminator_before_next_ = true; | |
376 token = Token::WHITESPACE; | |
377 break; | |
378 | |
379 case '"': case '\'': | |
380 token = ScanString(); | |
381 break; | |
382 | |
383 case '<': | |
384 // < <= << <<= <!-- | |
385 Advance(); | |
386 if (c0_ == '=') { | |
387 token = Select(Token::LTE); | |
388 } else if (c0_ == '<') { | |
389 token = Select('=', Token::ASSIGN_SHL, Token::SHL); | |
390 } else if (c0_ == '!') { | |
391 token = ScanHtmlComment(); | |
392 } else { | |
393 token = Token::LT; | |
394 } | |
395 break; | |
396 | |
397 case '>': | |
398 // > >= >> >>= >>> >>>= | |
399 Advance(); | |
400 if (c0_ == '=') { | |
401 token = Select(Token::GTE); | |
402 } else if (c0_ == '>') { | |
403 // >> >>= >>> >>>= | |
404 Advance(); | |
405 if (c0_ == '=') { | |
406 token = Select(Token::ASSIGN_SAR); | |
407 } else if (c0_ == '>') { | |
408 token = Select('=', Token::ASSIGN_SHR, Token::SHR); | |
409 } else { | |
410 token = Token::SAR; | |
411 } | |
412 } else { | |
413 token = Token::GT; | |
414 } | |
415 break; | |
416 | |
417 case '=': | |
418 // = == === | |
419 Advance(); | |
420 if (c0_ == '=') { | |
421 token = Select('=', Token::EQ_STRICT, Token::EQ); | |
422 } else { | |
423 token = Token::ASSIGN; | |
424 } | |
425 break; | |
426 | |
427 case '!': | |
428 // ! != !== | |
429 Advance(); | |
430 if (c0_ == '=') { | |
431 token = Select('=', Token::NE_STRICT, Token::NE); | |
432 } else { | |
433 token = Token::NOT; | |
434 } | |
435 break; | |
436 | |
437 case '+': | |
438 // + ++ += | |
439 Advance(); | |
440 if (c0_ == '+') { | |
441 token = Select(Token::INC); | |
442 } else if (c0_ == '=') { | |
443 token = Select(Token::ASSIGN_ADD); | |
444 } else { | |
445 token = Token::ADD; | |
446 } | |
447 break; | |
448 | |
449 case '-': | |
450 // - -- --> -= | |
451 Advance(); | |
452 if (c0_ == '-') { | |
453 Advance(); | |
454 if (c0_ == '>' && has_line_terminator_before_next_) { | |
455 // For compatibility with SpiderMonkey, we skip lines that | |
456 // start with an HTML comment end '-->'. | |
457 token = SkipSingleLineComment(); | |
458 } else { | |
459 token = Token::DEC; | |
460 } | |
461 } else if (c0_ == '=') { | |
462 token = Select(Token::ASSIGN_SUB); | |
463 } else { | |
464 token = Token::SUB; | |
465 } | |
466 break; | |
467 | |
468 case '*': | |
469 // * *= | |
470 token = Select('=', Token::ASSIGN_MUL, Token::MUL); | |
471 break; | |
472 | |
473 case '%': | |
474 // % %= | |
475 token = Select('=', Token::ASSIGN_MOD, Token::MOD); | |
476 break; | |
477 | |
478 case '/': | |
479 // / // /* /= | |
480 Advance(); | |
481 if (c0_ == '/') { | |
482 token = SkipSingleLineComment(); | |
483 } else if (c0_ == '*') { | |
484 token = SkipMultiLineComment(); | |
485 } else if (c0_ == '=') { | |
486 token = Select(Token::ASSIGN_DIV); | |
487 } else { | |
488 token = Token::DIV; | |
489 } | |
490 break; | |
491 | |
492 case '&': | |
493 // & && &= | |
494 Advance(); | |
495 if (c0_ == '&') { | |
496 token = Select(Token::AND); | |
497 } else if (c0_ == '=') { | |
498 token = Select(Token::ASSIGN_BIT_AND); | |
499 } else { | |
500 token = Token::BIT_AND; | |
501 } | |
502 break; | |
503 | |
504 case '|': | |
505 // | || |= | |
506 Advance(); | |
507 if (c0_ == '|') { | |
508 token = Select(Token::OR); | |
509 } else if (c0_ == '=') { | |
510 token = Select(Token::ASSIGN_BIT_OR); | |
511 } else { | |
512 token = Token::BIT_OR; | |
513 } | |
514 break; | |
515 | |
516 case '^': | |
517 // ^ ^= | |
518 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); | |
519 break; | |
520 | |
521 case '.': | |
522 // . Number | |
523 Advance(); | |
524 if (IsDecimalDigit(c0_)) { | |
525 token = ScanNumber(true); | |
526 } else { | |
527 token = Token::PERIOD; | |
528 } | |
529 break; | |
530 | |
531 case ':': | |
532 token = Select(Token::COLON); | |
533 break; | |
534 | |
535 case ';': | |
536 token = Select(Token::SEMICOLON); | |
537 break; | |
538 | |
539 case ',': | |
540 token = Select(Token::COMMA); | |
541 break; | |
542 | |
543 case '(': | |
544 token = Select(Token::LPAREN); | |
545 break; | |
546 | |
547 case ')': | |
548 token = Select(Token::RPAREN); | |
549 break; | |
550 | |
551 case '[': | |
552 token = Select(Token::LBRACK); | |
553 break; | |
554 | |
555 case ']': | |
556 token = Select(Token::RBRACK); | |
557 break; | |
558 | |
559 case '{': | |
560 token = Select(Token::LBRACE); | |
561 break; | |
562 | |
563 case '}': | |
564 token = Select(Token::RBRACE); | |
565 break; | |
566 | |
567 case '?': | |
568 token = Select(Token::CONDITIONAL); | |
569 break; | |
570 | |
571 case '~': | |
572 token = Select(Token::BIT_NOT); | |
573 break; | |
574 | |
575 default: | |
576 if (unicode_cache_->IsIdentifierStart(c0_)) { | |
577 token = ScanIdentifierOrKeyword(); | |
578 } else if (IsDecimalDigit(c0_)) { | |
579 token = ScanNumber(false); | |
580 } else if (SkipWhiteSpace()) { | |
581 token = Token::WHITESPACE; | |
582 } else if (c0_ < 0) { | |
583 token = Token::EOS; | |
584 } else { | |
585 token = Select(Token::ILLEGAL); | |
586 } | |
587 break; | |
588 } | |
589 | |
590 // Continue scanning for tokens as long as we're just skipping | |
591 // whitespace. | |
592 } while (token == Token::WHITESPACE); | |
593 | |
594 next_.location.end_pos = source_pos(); | |
595 next_.token = token; | |
596 } | |
597 | |
598 | |
599 void JavaScriptScanner::SeekForward(int pos) { | |
600 // After this call, we will have the token at the given position as | |
601 // the "next" token. The "current" token will be invalid. | |
602 if (pos == next_.location.beg_pos) return; | |
603 int current_pos = source_pos(); | |
604 ASSERT_EQ(next_.location.end_pos, current_pos); | |
605 // Positions inside the lookahead token aren't supported. | |
606 ASSERT(pos >= current_pos); | |
607 if (pos != current_pos) { | |
608 source_->SeekForward(pos - source_->pos()); | |
609 Advance(); | |
610 // This function is only called to seek to the location | |
611 // of the end of a function (at the "}" token). It doesn't matter | |
612 // whether there was a line terminator in the part we skip. | |
613 has_line_terminator_before_next_ = false; | |
614 has_multiline_comment_before_next_ = false; | |
615 } | |
616 Scan(); | |
617 } | |
618 | |
619 | |
620 void JavaScriptScanner::ScanEscape() { | |
621 uc32 c = c0_; | |
622 Advance(); | |
623 | |
624 // Skip escaped newlines. | |
625 if (unicode_cache_->IsLineTerminator(c)) { | |
626 // Allow CR+LF newlines in multiline string literals. | |
627 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); | |
628 // Allow LF+CR newlines in multiline string literals. | |
629 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); | |
630 return; | |
631 } | |
632 | |
633 switch (c) { | |
634 case '\'': // fall through | |
635 case '"' : // fall through | |
636 case '\\': break; | |
637 case 'b' : c = '\b'; break; | |
638 case 'f' : c = '\f'; break; | |
639 case 'n' : c = '\n'; break; | |
640 case 'r' : c = '\r'; break; | |
641 case 't' : c = '\t'; break; | |
642 case 'u' : { | |
643 c = ScanHexNumber(4); | |
644 if (c < 0) c = 'u'; | |
645 break; | |
646 } | |
647 case 'v' : c = '\v'; break; | |
648 case 'x' : { | |
649 c = ScanHexNumber(2); | |
650 if (c < 0) c = 'x'; | |
651 break; | |
652 } | |
653 case '0' : // fall through | |
654 case '1' : // fall through | |
655 case '2' : // fall through | |
656 case '3' : // fall through | |
657 case '4' : // fall through | |
658 case '5' : // fall through | |
659 case '6' : // fall through | |
660 case '7' : c = ScanOctalEscape(c, 2); break; | |
661 } | |
662 | |
663 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these | |
664 // should be illegal, but they are commonly handled | |
665 // as non-escaped characters by JS VMs. | |
666 AddLiteralChar(c); | |
667 } | |
668 | |
669 | |
670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of | |
671 // ECMA-262. Other JS VMs support them. | |
672 uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) { | |
673 uc32 x = c - '0'; | |
674 int i = 0; | |
675 for (; i < length; i++) { | |
676 int d = c0_ - '0'; | |
677 if (d < 0 || d > 7) break; | |
678 int nx = x * 8 + d; | |
679 if (nx >= 256) break; | |
680 x = nx; | |
681 Advance(); | |
682 } | |
683 // Anything except '\0' is an octal escape sequence, illegal in strict mode. | |
684 // Remember the position of octal escape sequences so that an error | |
685 // can be reported later (in strict mode). | |
686 // We don't report the error immediately, because the octal escape can | |
687 // occur before the "use strict" directive. | |
688 if (c != '0' || i > 0) { | |
689 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); | |
690 } | |
691 return x; | |
692 } | |
693 | |
694 | |
695 Token::Value JavaScriptScanner::ScanString() { | |
696 uc32 quote = c0_; | |
697 Advance(); // consume quote | |
698 | |
699 LiteralScope literal(this); | |
700 while (c0_ != quote && c0_ >= 0 | |
701 && !unicode_cache_->IsLineTerminator(c0_)) { | |
702 uc32 c = c0_; | |
703 Advance(); | |
704 if (c == '\\') { | |
705 if (c0_ < 0) return Token::ILLEGAL; | |
706 ScanEscape(); | |
707 } else { | |
708 AddLiteralChar(c); | |
709 } | |
710 } | |
711 if (c0_ != quote) return Token::ILLEGAL; | |
712 literal.Complete(); | |
713 | |
714 Advance(); // consume quote | |
715 return Token::STRING; | |
716 } | |
717 | |
718 | |
719 void JavaScriptScanner::ScanDecimalDigits() { | |
720 while (IsDecimalDigit(c0_)) | |
721 AddLiteralCharAdvance(); | |
722 } | |
723 | |
724 | |
725 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) { | |
726 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction | |
727 | |
728 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; | |
729 | |
730 LiteralScope literal(this); | |
731 if (seen_period) { | |
732 // we have already seen a decimal point of the float | |
733 AddLiteralChar('.'); | |
734 ScanDecimalDigits(); // we know we have at least one digit | |
735 | |
736 } else { | |
737 // if the first character is '0' we must check for octals and hex | |
738 if (c0_ == '0') { | |
739 int start_pos = source_pos(); // For reporting octal positions. | |
740 AddLiteralCharAdvance(); | |
741 | |
742 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number | |
743 if (c0_ == 'x' || c0_ == 'X') { | |
744 // hex number | |
745 kind = HEX; | |
746 AddLiteralCharAdvance(); | |
747 if (!IsHexDigit(c0_)) { | |
748 // we must have at least one hex digit after 'x'/'X' | |
749 return Token::ILLEGAL; | |
750 } | |
751 while (IsHexDigit(c0_)) { | |
752 AddLiteralCharAdvance(); | |
753 } | |
754 } else if ('0' <= c0_ && c0_ <= '7') { | |
755 // (possible) octal number | |
756 kind = OCTAL; | |
757 while (true) { | |
758 if (c0_ == '8' || c0_ == '9') { | |
759 kind = DECIMAL; | |
760 break; | |
761 } | |
762 if (c0_ < '0' || '7' < c0_) { | |
763 // Octal literal finished. | |
764 octal_pos_ = Location(start_pos, source_pos()); | |
765 break; | |
766 } | |
767 AddLiteralCharAdvance(); | |
768 } | |
769 } | |
770 } | |
771 | |
772 // Parse decimal digits and allow trailing fractional part. | |
773 if (kind == DECIMAL) { | |
774 ScanDecimalDigits(); // optional | |
775 if (c0_ == '.') { | |
776 AddLiteralCharAdvance(); | |
777 ScanDecimalDigits(); // optional | |
778 } | |
779 } | |
780 } | |
781 | |
782 // scan exponent, if any | |
783 if (c0_ == 'e' || c0_ == 'E') { | |
784 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number | |
785 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed | |
786 // scan exponent | |
787 AddLiteralCharAdvance(); | |
788 if (c0_ == '+' || c0_ == '-') | |
789 AddLiteralCharAdvance(); | |
790 if (!IsDecimalDigit(c0_)) { | |
791 // we must have at least one decimal digit after 'e'/'E' | |
792 return Token::ILLEGAL; | |
793 } | |
794 ScanDecimalDigits(); | |
795 } | |
796 | |
797 // The source character immediately following a numeric literal must | |
798 // not be an identifier start or a decimal digit; see ECMA-262 | |
799 // section 7.8.3, page 17 (note that we read only one decimal digit | |
800 // if the value is 0). | |
801 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) | |
802 return Token::ILLEGAL; | |
803 | |
804 literal.Complete(); | |
805 | |
806 return Token::NUMBER; | |
807 } | |
808 | |
809 | |
810 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { | |
811 Advance(); | |
812 if (c0_ != 'u') return -1; | |
813 Advance(); | |
814 uc32 result = ScanHexNumber(4); | |
815 if (result < 0) PushBack('u'); | |
816 return result; | |
817 } | |
818 | |
819 | |
820 // ---------------------------------------------------------------------------- | |
821 // Keyword Matcher | |
822 | |
// X-macro listing every keyword the scanner recognizes together with the
// token it maps to. Keywords are grouped by their first character:
// KEYWORD_GROUP(ch) is expanded once at the start of each group and
// KEYWORD(text, token) once per keyword in that group.
//
// The token for "let" refers to a variable named harmony_block_scoping,
// which must be in scope wherever KEYWORD expands its second argument.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("export", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", harmony_block_scoping \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
885 | |
886 | |
887 static Token::Value KeywordOrIdentifierToken(const char* input, | |
888 int input_length, | |
889 bool harmony_block_scoping) { | |
890 ASSERT(input_length >= 1); | |
891 const int kMinLength = 2; | |
892 const int kMaxLength = 10; | |
893 if (input_length < kMinLength || input_length > kMaxLength) { | |
894 return Token::IDENTIFIER; | |
895 } | |
896 switch (input[0]) { | |
897 default: | |
898 #define KEYWORD_GROUP_CASE(ch) \ | |
899 break; \ | |
900 case ch: | |
901 #define KEYWORD(keyword, token) \ | |
902 { \ | |
903 /* 'keyword' is a char array, so sizeof(keyword) is */ \ | |
904 /* strlen(keyword) plus 1 for the NUL char. */ \ | |
905 const int keyword_length = sizeof(keyword) - 1; \ | |
906 STATIC_ASSERT(keyword_length >= kMinLength); \ | |
907 STATIC_ASSERT(keyword_length <= kMaxLength); \ | |
908 if (input_length == keyword_length && \ | |
909 input[1] == keyword[1] && \ | |
910 (keyword_length <= 2 || input[2] == keyword[2]) && \ | |
911 (keyword_length <= 3 || input[3] == keyword[3]) && \ | |
912 (keyword_length <= 4 || input[4] == keyword[4]) && \ | |
913 (keyword_length <= 5 || input[5] == keyword[5]) && \ | |
914 (keyword_length <= 6 || input[6] == keyword[6]) && \ | |
915 (keyword_length <= 7 || input[7] == keyword[7]) && \ | |
916 (keyword_length <= 8 || input[8] == keyword[8]) && \ | |
917 (keyword_length <= 9 || input[9] == keyword[9])) { \ | |
918 return token; \ | |
919 } \ | |
920 } | |
921 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) | |
922 } | |
923 return Token::IDENTIFIER; | |
924 } | |
925 | |
926 | |
927 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { | |
928 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); | |
929 LiteralScope literal(this); | |
930 // Scan identifier start character. | |
931 if (c0_ == '\\') { | |
932 uc32 c = ScanIdentifierUnicodeEscape(); | |
933 // Only allow legal identifier start characters. | |
934 if (c < 0 || | |
935 c == '\\' || // No recursive escapes. | |
936 !unicode_cache_->IsIdentifierStart(c)) { | |
937 return Token::ILLEGAL; | |
938 } | |
939 AddLiteralChar(c); | |
940 return ScanIdentifierSuffix(&literal); | |
941 } | |
942 | |
943 uc32 first_char = c0_; | |
944 Advance(); | |
945 AddLiteralChar(first_char); | |
946 | |
947 // Scan the rest of the identifier characters. | |
948 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
949 if (c0_ != '\\') { | |
950 uc32 next_char = c0_; | |
951 Advance(); | |
952 AddLiteralChar(next_char); | |
953 continue; | |
954 } | |
955 // Fallthrough if no longer able to complete keyword. | |
956 return ScanIdentifierSuffix(&literal); | |
957 } | |
958 | |
959 literal.Complete(); | |
960 | |
961 if (next_.literal_chars->is_ascii()) { | |
962 Vector<const char> chars = next_.literal_chars->ascii_literal(); | |
963 return KeywordOrIdentifierToken(chars.start(), | |
964 chars.length(), | |
965 harmony_block_scoping_); | |
966 } | |
967 | |
968 return Token::IDENTIFIER; | |
969 } | |
970 | |
971 | |
972 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { | |
973 // Scan the rest of the identifier characters. | |
974 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
975 if (c0_ == '\\') { | |
976 uc32 c = ScanIdentifierUnicodeEscape(); | |
977 // Only allow legal identifier part characters. | |
978 if (c < 0 || | |
979 c == '\\' || | |
980 !unicode_cache_->IsIdentifierPart(c)) { | |
981 return Token::ILLEGAL; | |
982 } | |
983 AddLiteralChar(c); | |
984 } else { | |
985 AddLiteralChar(c0_); | |
986 Advance(); | |
987 } | |
988 } | |
989 literal->Complete(); | |
990 | |
991 return Token::IDENTIFIER; | |
992 } | |
993 | |
994 | |
995 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { | |
996 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags | |
997 bool in_character_class = false; | |
998 | |
999 // Previous token is either '/' or '/=', in the second case, the | |
1000 // pattern starts at =. | |
1001 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); | |
1002 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); | |
1003 | |
1004 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, | |
1005 // the scanner should pass uninterpreted bodies to the RegExp | |
1006 // constructor. | |
1007 LiteralScope literal(this); | |
1008 if (seen_equal) { | |
1009 AddLiteralChar('='); | |
1010 } | |
1011 | |
1012 while (c0_ != '/' || in_character_class) { | |
1013 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; | |
1014 if (c0_ == '\\') { // Escape sequence. | |
1015 AddLiteralCharAdvance(); | |
1016 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; | |
1017 AddLiteralCharAdvance(); | |
1018 // If the escape allows more characters, i.e., \x??, \u????, or \c?, | |
1019 // only "safe" characters are allowed (letters, digits, underscore), | |
1020 // otherwise the escape isn't valid and the invalid character has | |
1021 // its normal meaning. I.e., we can just continue scanning without | |
1022 // worrying whether the following characters are part of the escape | |
1023 // or not, since any '/', '\\' or '[' is guaranteed to not be part | |
1024 // of the escape sequence. | |
1025 | |
1026 // TODO(896): At some point, parse RegExps more throughly to capture | |
1027 // octal esacpes in strict mode. | |
1028 } else { // Unescaped character. | |
1029 if (c0_ == '[') in_character_class = true; | |
1030 if (c0_ == ']') in_character_class = false; | |
1031 AddLiteralCharAdvance(); | |
1032 } | |
1033 } | |
1034 Advance(); // consume '/' | |
1035 | |
1036 literal.Complete(); | |
1037 | |
1038 return true; | |
1039 } | |
1040 | |
1041 | |
1042 bool JavaScriptScanner::ScanLiteralUnicodeEscape() { | |
1043 ASSERT(c0_ == '\\'); | |
1044 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; | |
1045 Advance(); | |
1046 int i = 1; | |
1047 if (c0_ == 'u') { | |
1048 i++; | |
1049 while (i < 6) { | |
1050 Advance(); | |
1051 if (!IsHexDigit(c0_)) break; | |
1052 chars_read[i] = c0_; | |
1053 i++; | |
1054 } | |
1055 } | |
1056 if (i < 6) { | |
1057 // Incomplete escape. Undo all advances and return false. | |
1058 while (i > 0) { | |
1059 i--; | |
1060 PushBack(chars_read[i]); | |
1061 } | |
1062 return false; | |
1063 } | |
1064 // Complete escape. Add all chars to current literal buffer. | |
1065 for (int i = 0; i < 6; i++) { | |
1066 AddLiteralChar(chars_read[i]); | |
1067 } | |
1068 return true; | |
1069 } | |
1070 | |
1071 | |
1072 bool JavaScriptScanner::ScanRegExpFlags() { | |
1073 // Scan regular expression flags. | |
1074 LiteralScope literal(this); | |
1075 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
1076 if (c0_ != '\\') { | |
1077 AddLiteralCharAdvance(); | |
1078 } else { | |
1079 if (!ScanLiteralUnicodeEscape()) { | |
1080 break; | |
1081 } | |
1082 } | |
1083 } | |
1084 literal.Complete(); | |
1085 | |
1086 next_.location.end_pos = source_pos() - 1; | |
1087 return true; | |
1088 } | |
1089 | |
1090 } } // namespace v8::internal | |
OLD | NEW |