Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(38)

Side by Side Diff: src/prescanner.h

Issue 5188009: Merge preparser Scanner with main JavaScript scanner. (Closed)
Patch Set: Address review. Fix thinko in keyword matcher. Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #ifndef V8_PRESCANNER_H_
29 #define V8_PRESCANNER_H_
30
31 #include "token.h"
32 #include "char-predicates-inl.h"
33 #include "utils.h"
34 #include "scanner-base.h"
35
36 namespace v8 {
37 namespace preparser {
38
// Short alias so declarations in this header can write i::Token, i::Vector,
// etc. instead of spelling out v8::internal.
namespace i = v8::internal;

// Type used for a single scanned character. It is signed: the scanner uses
// negative values of its look-ahead character to signal the end of the input.
typedef int uc32;
42
// Simple stack-depth guard for the preparser's scanner.
// On construction it records a limit address max_size bytes below the address
// of a temporary object; has_overflowed() reports whether a temporary created
// at the time of the call lies below that limit. The arithmetic implies a
// stack that grows towards lower addresses.
class PreScannerStackGuard {
 private:
  // Empty helper whose own address serves as a sample of the current
  // stack position.
  class StackPoint {
   public:
    char* at() { return reinterpret_cast<char*>(this); }
  };

  // Lowest address the stack may reach before it counts as overflowed.
  char* limit_;

 public:
  // max_size is the permitted stack usage in bytes, measured from the
  // point of construction.
  explicit PreScannerStackGuard(int max_size)
      : limit_(StackPoint().at() - max_size) { }

  // True when the current stack position has dropped below the limit.
  bool has_overflowed() {
    return limit_ > StackPoint().at();
  }
};
57
58
59 // Scanner for preparsing.
60 // InputStream is a source of UC16 characters with limited push-back.
61 // LiteralsBuffer is a collector of (UTF-8) characters used to capture literals.
62 class Scanner {
63 public:
64 enum LiteralType {
65 kLiteralNumber,
66 kLiteralIdentifier,
67 kLiteralString,
68 kLiteralRegExp,
69 kLiteralRegExpFlags
70 };
71
72 class LiteralScope {
73 public:
74 explicit LiteralScope(Scanner* self, LiteralType type);
75 ~LiteralScope();
76 void Complete();
77
78 private:
79 Scanner* scanner_;
80 bool complete_;
81 };
82
83 Scanner();
84
85 void Initialize(i::UTF16Buffer* stream);
86
87 // Returns the next token.
88 i::Token::Value Next();
89
90 // Returns the current token again.
91 i::Token::Value current_token() { return current_.token; }
92
93 // One token look-ahead (past the token returned by Next()).
94 i::Token::Value peek() const { return next_.token; }
95
96 // Returns true if there was a line terminator before the peek'ed token.
97 bool has_line_terminator_before_next() const {
98 return has_line_terminator_before_next_;
99 }
100
101 struct Location {
102 Location(int b, int e) : beg_pos(b), end_pos(e) { }
103 Location() : beg_pos(0), end_pos(0) { }
104 int beg_pos;
105 int end_pos;
106 };
107
108 // Returns the location information for the current token
109 // (the token returned by Next()).
110 Location location() const { return current_.location; }
111 // Returns the location information for the look-ahead token
112 // (the token returned by peek()).
113 Location peek_location() const { return next_.location; }
114
115 // Returns the literal string, if any, for the current token (the
116 // token returned by Next()). The string is 0-terminated and in
117 // UTF-8 format; they may contain 0-characters. Literal strings are
118 // collected for identifiers, strings, and numbers.
119 // These functions only give the correct result if the literal
120 // was scanned between calls to StartLiteral() and TerminateLiteral().
121 const char* literal_string() const {
122 return current_.literal_chars;
123 }
124
125 int literal_length() const {
126 // Excluding terminal '\x00' added by TerminateLiteral().
127 return current_.literal_length - 1;
128 }
129
130 i::Vector<const char> literal() const {
131 return i::Vector<const char>(literal_string(), literal_length());
132 }
133
134 // Returns the literal string for the next token (the token that
135 // would be returned if Next() were called).
136 const char* next_literal_string() const {
137 return next_.literal_chars;
138 }
139
140 // Returns the length of the next token (that would be returned if
141 // Next() were called).
142 int next_literal_length() const {
143 // Excluding terminal '\x00' added by TerminateLiteral().
144 return next_.literal_length - 1;
145 }
146
147 i::Vector<const char> next_literal() const {
148 return i::Vector<const char>(next_literal_string(), next_literal_length());
149 }
150
151 // Scans the input as a regular expression pattern, previous
152 // character(s) must be /(=). Returns true if a pattern is scanned.
153 bool ScanRegExpPattern(bool seen_equal);
154 // Returns true if regexp flags are scanned (always since flags can
155 // be empty).
156 bool ScanRegExpFlags();
157
158 // Seek forward to the given position. This operation does not
159 // work in general, for instance when there are pushed back
160 // characters, but works for seeking forward until simple delimiter
161 // tokens, which is what it is used for.
162 void SeekForward(int pos);
163
164 bool stack_overflow() { return stack_overflow_; }
165
166 static const int kCharacterLookaheadBufferSize = 1;
167 static const int kNoEndPosition = 1;
168
169 private:
170 // The current and look-ahead token.
171 struct TokenDesc {
172 i::Token::Value token;
173 Location location;
174 const char* literal_chars;
175 int literal_length;
176 };
177
178 // Default stack limit is 128K pointers.
179 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.
180
181 void Init(unibrow::CharacterStream* stream);
182
183 // Literal buffer support
184 inline void StartLiteral(LiteralType type);
185 inline void AddLiteralChar(uc32 ch);
186 inline void AddLiteralCharAdvance();
187 inline void TerminateLiteral();
188 // Stops scanning of a literal, e.g., due to an encountered error.
189 inline void DropLiteral();
190
191 // Low-level scanning support.
192 void Advance() { c0_ = source_->Advance(); }
193 void PushBack(uc32 ch) {
194 source_->PushBack(ch);
195 c0_ = ch;
196 }
197
198 bool SkipWhiteSpace();
199
200 i::Token::Value SkipSingleLineComment();
201 i::Token::Value SkipMultiLineComment();
202
203 inline i::Token::Value Select(i::Token::Value tok);
204 inline i::Token::Value Select(uc32 next,
205 i::Token::Value then,
206 i::Token::Value else_);
207
208 // Scans a single JavaScript token.
209 void Scan();
210
211 void ScanDecimalDigits();
212 i::Token::Value ScanNumber(bool seen_period);
213 i::Token::Value ScanIdentifier();
214 uc32 ScanHexEscape(uc32 c, int length);
215 uc32 ScanOctalEscape(uc32 c, int length);
216 void ScanEscape();
217 i::Token::Value ScanString();
218
219 // Scans a possible HTML comment -- begins with '<!'.
220 i::Token::Value ScanHtmlComment();
221
222 // Return the current source position.
223 int source_pos() {
224 return source_->pos() - kCharacterLookaheadBufferSize;
225 }
226
227 // Decodes a unicode escape-sequence which is part of an identifier.
228 // If the escape sequence cannot be decoded the result is kBadRune.
229 uc32 ScanIdentifierUnicodeEscape();
230
231 PreScannerStackGuard stack_guard_;
232
233 TokenDesc current_; // desc for current token (as returned by Next())
234 TokenDesc next_; // desc for next token (one token look-ahead)
235 bool has_line_terminator_before_next_;
236
237 // Source.
238 i::UTF16Buffer* source_;
239
240 // Buffer to hold literal values (identifiers, strings, numerals, regexps and
241 // regexp flags) using '\x00'-terminated UTF-8 encoding.
242 // Handles allocation internally.
243 // Notice that the '\x00' termination is meaningless for strings and regexps
244 // which may contain the zero-character, but can be used as terminator for
245 // identifiers, numerals and regexp flags.Collector
246 i::LiteralCollector literal_buffer_;
247
248 bool stack_overflow_;
249
250 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
251 uc32 c0_;
252 };
253
254
255 // ----------------------------------------------------------------------------
256 // Scanner::LiteralScope
257
258 Scanner::LiteralScope::LiteralScope(
259 Scanner* self, LiteralType type)
260 : scanner_(self), complete_(false) {
261 self->StartLiteral(type);
262 }
263
264
265 Scanner::LiteralScope::~LiteralScope() {
266 if (!complete_) scanner_->DropLiteral();
267 }
268
269 void Scanner::LiteralScope::Complete() {
270 scanner_->TerminateLiteral();
271 complete_ = true;
272 }
273
274
275 // ----------------------------------------------------------------------------
276 // Scanner.
277 Scanner::Scanner()
278 : stack_guard_(kMaxStackSize),
279 has_line_terminator_before_next_(false),
280 source_(NULL),
281 stack_overflow_(false) {}
282
283
284 void Scanner::Initialize(i::UTF16Buffer* stream) {
285 source_ = stream;
286
287 // Initialize current_ to not refer to a literal.
288 current_.literal_length = 0;
289 // Reset literal buffer.
290 literal_buffer_.Reset();
291
292 // Set c0_ (one character ahead)
293 ASSERT(kCharacterLookaheadBufferSize == 1);
294 Advance();
295
296 // Skip initial whitespace allowing HTML comment ends just like
297 // after a newline and scan first token.
298 has_line_terminator_before_next_ = true;
299 SkipWhiteSpace();
300 Scan();
301 }
302
303
304 i::Token::Value Scanner::Next() {
305 // BUG 1215673: Find a thread safe way to set a stack limit in
306 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
307 // threads.
308 current_ = next_;
309 // Check for stack-overflow before returning any tokens.
310 if (stack_guard_.has_overflowed()) {
311 stack_overflow_ = true;
312 next_.token = i::Token::ILLEGAL;
313 } else {
314 has_line_terminator_before_next_ = false;
315 Scan();
316 }
317 return current_.token;
318 }
319
320
321 void Scanner::StartLiteral(LiteralType type) {
322 // Only record string and literal identifiers when preparsing.
323 // Those are the ones that are recorded as symbols. Numbers and
324 // regexps are not recorded.
325 if (type == kLiteralString || type == kLiteralIdentifier) {
326 literal_buffer_.StartLiteral();
327 }
328 }
329
330
331 void Scanner::AddLiteralChar(uc32 c) {
332 literal_buffer_.AddChar(c);
333 }
334
335
336 void Scanner::TerminateLiteral() {
337 i::Vector<const char> chars = literal_buffer_.EndLiteral();
338 next_.literal_chars = chars.start();
339 next_.literal_length = chars.length();
340 }
341
342
343 void Scanner::DropLiteral() {
344 literal_buffer_.DropLiteral();
345 }
346
347
348 void Scanner::AddLiteralCharAdvance() {
349 AddLiteralChar(c0_);
350 Advance();
351 }
352
353
354 static inline bool IsByteOrderMark(uc32 c) {
355 // The Unicode value U+FFFE is guaranteed never to be assigned as a
356 // Unicode character; this implies that in a Unicode context the
357 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
358 // character expressed in little-endian byte order (since it could
359 // not be a U+FFFE character expressed in big-endian byte
360 // order). Nevertheless, we check for it to be compatible with
361 // Spidermonkey.
362 return c == 0xFEFF || c == 0xFFFE;
363 }
364
365
366 bool Scanner::SkipWhiteSpace() {
367 int start_position = source_pos();
368
369 while (true) {
370 // We treat byte-order marks (BOMs) as whitespace for better
371 // compatibility with Spidermonkey and other JavaScript engines.
372 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)
373 || IsByteOrderMark(c0_)) {
374 // IsWhiteSpace() includes line terminators!
375 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {
376 // Ignore line terminators, but remember them. This is necessary
377 // for automatic semicolon insertion.
378 has_line_terminator_before_next_ = true;
379 }
380 Advance();
381 }
382
383 // If there is an HTML comment end '-->' at the beginning of a
384 // line (with only whitespace in front of it), we treat the rest
385 // of the line as a comment. This is in line with the way
386 // SpiderMonkey handles it.
387 if (c0_ == '-' && has_line_terminator_before_next_) {
388 Advance();
389 if (c0_ == '-') {
390 Advance();
391 if (c0_ == '>') {
392 // Treat the rest of the line as a comment.
393 SkipSingleLineComment();
394 // Continue skipping white space after the comment.
395 continue;
396 }
397 PushBack('-'); // undo Advance()
398 }
399 PushBack('-'); // undo Advance()
400 }
401 // Return whether or not we skipped any characters.
402 return source_pos() != start_position;
403 }
404 }
405
406
407 i::Token::Value Scanner::SkipSingleLineComment() {
408 Advance();
409
410 // The line terminator at the end of the line is not considered
411 // to be part of the single-line comment; it is recognized
412 // separately by the lexical grammar and becomes part of the
413 // stream of input elements for the syntactic grammar (see
414 // ECMA-262, section 7.4, page 12).
415 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
416 Advance();
417 }
418
419 return i::Token::WHITESPACE;
420 }
421
422
423 i::Token::Value Scanner::SkipMultiLineComment() {
424 ASSERT(c0_ == '*');
425 Advance();
426
427 while (c0_ >= 0) {
428 char ch = c0_;
429 Advance();
430 // If we have reached the end of the multi-line comment, we
431 // consume the '/' and insert a whitespace. This way all
432 // multi-line comments are treated as whitespace - even the ones
433 // containing line terminators. This contradicts ECMA-262, section
434 // 7.4, page 12, that says that multi-line comments containing
435 // line terminators should be treated as a line terminator, but it
436 // matches the behaviour of SpiderMonkey and KJS.
437 if (ch == '*' && c0_ == '/') {
438 c0_ = ' ';
439 return i::Token::WHITESPACE;
440 }
441 }
442
443 // Unterminated multi-line comment.
444 return i::Token::ILLEGAL;
445 }
446
447
448 i::Token::Value Scanner::ScanHtmlComment() {
449 // Check for <!-- comments.
450 ASSERT(c0_ == '!');
451 Advance();
452 if (c0_ == '-') {
453 Advance();
454 if (c0_ == '-') return SkipSingleLineComment();
455 PushBack('-'); // undo Advance()
456 }
457 PushBack('!'); // undo Advance()
458 ASSERT(c0_ == '!');
459 return i::Token::LT;
460 }
461
462
463 void Scanner::Scan() {
464 next_.literal_length = 0;
465 i::Token::Value token;
466 do {
467 // Remember the position of the next token
468 next_.location.beg_pos = source_pos();
469
470 switch (c0_) {
471 case ' ':
472 case '\t':
473 Advance();
474 token = i::Token::WHITESPACE;
475 break;
476
477 case '\n':
478 Advance();
479 has_line_terminator_before_next_ = true;
480 token = i::Token::WHITESPACE;
481 break;
482
483 case '"': case '\'':
484 token = ScanString();
485 break;
486
487 case '<':
488 // < <= << <<= <!--
489 Advance();
490 if (c0_ == '=') {
491 token = Select(i::Token::LTE);
492 } else if (c0_ == '<') {
493 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);
494 } else if (c0_ == '!') {
495 token = ScanHtmlComment();
496 } else {
497 token = i::Token::LT;
498 }
499 break;
500
501 case '>':
502 // > >= >> >>= >>> >>>=
503 Advance();
504 if (c0_ == '=') {
505 token = Select(i::Token::GTE);
506 } else if (c0_ == '>') {
507 // >> >>= >>> >>>=
508 Advance();
509 if (c0_ == '=') {
510 token = Select(i::Token::ASSIGN_SAR);
511 } else if (c0_ == '>') {
512 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);
513 } else {
514 token = i::Token::SAR;
515 }
516 } else {
517 token = i::Token::GT;
518 }
519 break;
520
521 case '=':
522 // = == ===
523 Advance();
524 if (c0_ == '=') {
525 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);
526 } else {
527 token = i::Token::ASSIGN;
528 }
529 break;
530
531 case '!':
532 // ! != !==
533 Advance();
534 if (c0_ == '=') {
535 token = Select('=', i::Token::NE_STRICT, i::Token::NE);
536 } else {
537 token = i::Token::NOT;
538 }
539 break;
540
541 case '+':
542 // + ++ +=
543 Advance();
544 if (c0_ == '+') {
545 token = Select(i::Token::INC);
546 } else if (c0_ == '=') {
547 token = Select(i::Token::ASSIGN_ADD);
548 } else {
549 token = i::Token::ADD;
550 }
551 break;
552
553 case '-':
554 // - -- --> -=
555 Advance();
556 if (c0_ == '-') {
557 Advance();
558 if (c0_ == '>' && has_line_terminator_before_next_) {
559 // For compatibility with SpiderMonkey, we skip lines that
560 // start with an HTML comment end '-->'.
561 token = SkipSingleLineComment();
562 } else {
563 token = i::Token::DEC;
564 }
565 } else if (c0_ == '=') {
566 token = Select(i::Token::ASSIGN_SUB);
567 } else {
568 token = i::Token::SUB;
569 }
570 break;
571
572 case '*':
573 // * *=
574 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);
575 break;
576
577 case '%':
578 // % %=
579 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);
580 break;
581
582 case '/':
583 // / // /* /=
584 Advance();
585 if (c0_ == '/') {
586 token = SkipSingleLineComment();
587 } else if (c0_ == '*') {
588 token = SkipMultiLineComment();
589 } else if (c0_ == '=') {
590 token = Select(i::Token::ASSIGN_DIV);
591 } else {
592 token = i::Token::DIV;
593 }
594 break;
595
596 case '&':
597 // & && &=
598 Advance();
599 if (c0_ == '&') {
600 token = Select(i::Token::AND);
601 } else if (c0_ == '=') {
602 token = Select(i::Token::ASSIGN_BIT_AND);
603 } else {
604 token = i::Token::BIT_AND;
605 }
606 break;
607
608 case '|':
609 // | || |=
610 Advance();
611 if (c0_ == '|') {
612 token = Select(i::Token::OR);
613 } else if (c0_ == '=') {
614 token = Select(i::Token::ASSIGN_BIT_OR);
615 } else {
616 token = i::Token::BIT_OR;
617 }
618 break;
619
620 case '^':
621 // ^ ^=
622 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);
623 break;
624
625 case '.':
626 // . Number
627 Advance();
628 if (i::IsDecimalDigit(c0_)) {
629 token = ScanNumber(true);
630 } else {
631 token = i::Token::PERIOD;
632 }
633 break;
634
635 case ':':
636 token = Select(i::Token::COLON);
637 break;
638
639 case ';':
640 token = Select(i::Token::SEMICOLON);
641 break;
642
643 case ',':
644 token = Select(i::Token::COMMA);
645 break;
646
647 case '(':
648 token = Select(i::Token::LPAREN);
649 break;
650
651 case ')':
652 token = Select(i::Token::RPAREN);
653 break;
654
655 case '[':
656 token = Select(i::Token::LBRACK);
657 break;
658
659 case ']':
660 token = Select(i::Token::RBRACK);
661 break;
662
663 case '{':
664 token = Select(i::Token::LBRACE);
665 break;
666
667 case '}':
668 token = Select(i::Token::RBRACE);
669 break;
670
671 case '?':
672 token = Select(i::Token::CONDITIONAL);
673 break;
674
675 case '~':
676 token = Select(i::Token::BIT_NOT);
677 break;
678
679 default:
680 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {
681 token = ScanIdentifier();
682 } else if (i::IsDecimalDigit(c0_)) {
683 token = ScanNumber(false);
684 } else if (SkipWhiteSpace()) {
685 token = i::Token::WHITESPACE;
686 } else if (c0_ < 0) {
687 token = i::Token::EOS;
688 } else {
689 token = Select(i::Token::ILLEGAL);
690 }
691 break;
692 }
693
694 // Continue scanning for tokens as long as we're just skipping
695 // whitespace.
696 } while (token == i::Token::WHITESPACE);
697
698 next_.location.end_pos = source_pos();
699 next_.token = token;
700 }
701
702
703 void Scanner::SeekForward(int pos) {
704 source_->SeekForward(pos - 1);
705 Advance();
706 // This function is only called to seek to the location
707 // of the end of a function (at the "}" token). It doesn't matter
708 // whether there was a line terminator in the part we skip.
709 has_line_terminator_before_next_ = false;
710 Scan();
711 }
712
713
714 uc32 Scanner::ScanHexEscape(uc32 c, int length) {
715 ASSERT(length <= 4); // prevent overflow
716
717 uc32 digits[4];
718 uc32 x = 0;
719 for (int i = 0; i < length; i++) {
720 digits[i] = c0_;
721 int d = i::HexValue(c0_);
722 if (d < 0) {
723 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
724 // should be illegal, but other JS VMs just return the
725 // non-escaped version of the original character.
726
727 // Push back digits read, except the last one (in c0_).
728 for (int j = i-1; j >= 0; j--) {
729 PushBack(digits[j]);
730 }
731 // Notice: No handling of error - treat it as "\u"->"u".
732 return c;
733 }
734 x = x * 16 + d;
735 Advance();
736 }
737
738 return x;
739 }
740
741
742 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
743 // ECMA-262. Other JS VMs support them.
744 uc32 Scanner::ScanOctalEscape(
745 uc32 c, int length) {
746 uc32 x = c - '0';
747 for (int i = 0; i < length; i++) {
748 int d = c0_ - '0';
749 if (d < 0 || d > 7) break;
750 int nx = x * 8 + d;
751 if (nx >= 256) break;
752 x = nx;
753 Advance();
754 }
755 return x;
756 }
757
758
759 void Scanner::ScanEscape() {
760 uc32 c = c0_;
761 Advance();
762
763 // Skip escaped newlines.
764 if (i::ScannerConstants::kIsLineTerminator.get(c)) {
765 // Allow CR+LF newlines in multiline string literals.
766 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();
767 // Allow LF+CR newlines in multiline string literals.
768 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();
769 return;
770 }
771
772 switch (c) {
773 case '\'': // fall through
774 case '"' : // fall through
775 case '\\': break;
776 case 'b' : c = '\b'; break;
777 case 'f' : c = '\f'; break;
778 case 'n' : c = '\n'; break;
779 case 'r' : c = '\r'; break;
780 case 't' : c = '\t'; break;
781 case 'u' : c = ScanHexEscape(c, 4); break;
782 case 'v' : c = '\v'; break;
783 case 'x' : c = ScanHexEscape(c, 2); break;
784 case '0' : // fall through
785 case '1' : // fall through
786 case '2' : // fall through
787 case '3' : // fall through
788 case '4' : // fall through
789 case '5' : // fall through
790 case '6' : // fall through
791 case '7' : c = ScanOctalEscape(c, 2); break;
792 }
793
794 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
795 // should be illegal, but they are commonly handled
796 // as non-escaped characters by JS VMs.
797 AddLiteralChar(c);
798 }
799
800
801 i::Token::Value Scanner::ScanString() {
802 uc32 quote = c0_;
803 Advance(); // consume quote
804
805 LiteralScope literal(this, kLiteralString);
806 while (c0_ != quote && c0_ >= 0
807 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
808 uc32 c = c0_;
809 Advance();
810 if (c == '\\') {
811 if (c0_ < 0) return i::Token::ILLEGAL;
812 ScanEscape();
813 } else {
814 AddLiteralChar(c);
815 }
816 }
817 if (c0_ != quote) return i::Token::ILLEGAL;
818 literal.Complete();
819
820 Advance(); // consume quote
821 return i::Token::STRING;
822 }
823
824
825 i::Token::Value Scanner::Select(
826 i::Token::Value tok) {
827 Advance();
828 return tok;
829 }
830
831
832 i::Token::Value Scanner::Select(
833 uc32 next,
834 i::Token::Value then,
835 i::Token::Value else_) {
836 Advance();
837 if (c0_ == next) {
838 Advance();
839 return then;
840 } else {
841 return else_;
842 }
843 }
844
845
846 // Returns true if any decimal digits were scanned, returns false otherwise.
847 void Scanner::ScanDecimalDigits() {
848 while (i::IsDecimalDigit(c0_))
849 AddLiteralCharAdvance();
850 }
851
852
853 i::Token::Value Scanner::ScanNumber(
854 bool seen_period) {
855 // c0_ is the first digit of the number or the fraction.
856 ASSERT(i::IsDecimalDigit(c0_));
857
858 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
859
860 LiteralScope literal(this, kLiteralNumber);
861 if (seen_period) {
862 // we have already seen a decimal point of the float
863 AddLiteralChar('.');
864 ScanDecimalDigits(); // we know we have at least one digit
865
866 } else {
867 // if the first character is '0' we must check for octals and hex
868 if (c0_ == '0') {
869 AddLiteralCharAdvance();
870
871 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
872 if (c0_ == 'x' || c0_ == 'X') {
873 // hex number
874 kind = HEX;
875 AddLiteralCharAdvance();
876 if (!i::IsHexDigit(c0_)) {
877 // we must have at least one hex digit after 'x'/'X'
878 return i::Token::ILLEGAL;
879 }
880 while (i::IsHexDigit(c0_)) {
881 AddLiteralCharAdvance();
882 }
883 } else if ('0' <= c0_ && c0_ <= '7') {
884 // (possible) octal number
885 kind = OCTAL;
886 while (true) {
887 if (c0_ == '8' || c0_ == '9') {
888 kind = DECIMAL;
889 break;
890 }
891 if (c0_ < '0' || '7' < c0_) break;
892 AddLiteralCharAdvance();
893 }
894 }
895 }
896
897 // Parse decimal digits and allow trailing fractional part.
898 if (kind == DECIMAL) {
899 ScanDecimalDigits(); // optional
900 if (c0_ == '.') {
901 AddLiteralCharAdvance();
902 ScanDecimalDigits(); // optional
903 }
904 }
905 }
906
907 // scan exponent, if any
908 if (c0_ == 'e' || c0_ == 'E') {
909 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
910 if (kind == OCTAL) return i::Token::ILLEGAL;
911 // scan exponent
912 AddLiteralCharAdvance();
913 if (c0_ == '+' || c0_ == '-')
914 AddLiteralCharAdvance();
915 if (!i::IsDecimalDigit(c0_)) {
916 // we must have at least one decimal digit after 'e'/'E'
917 return i::Token::ILLEGAL;
918 }
919 ScanDecimalDigits();
920 }
921
922 // The source character immediately following a numeric literal must
923 // not be an identifier start or a decimal digit; see ECMA-262
924 // section 7.8.3, page 17 (note that we read only one decimal digit
925 // if the value is 0).
926 if (i::IsDecimalDigit(c0_)
927 || i::ScannerConstants::kIsIdentifierStart.get(c0_))
928 return i::Token::ILLEGAL;
929
930 literal.Complete();
931
932 return i::Token::NUMBER;
933 }
934
935
936 uc32 Scanner::ScanIdentifierUnicodeEscape() {
937 Advance();
938 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
939 Advance();
940 uc32 c = ScanHexEscape('u', 4);
941 // We do not allow a unicode escape sequence to start another
942 // unicode escape sequence.
943 if (c == '\\') return unibrow::Utf8::kBadChar;
944 return c;
945 }
946
947
948 i::Token::Value Scanner::ScanIdentifier() {
949 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));
950
951 LiteralScope literal(this, kLiteralIdentifier);
952 i::KeywordMatcher keyword_match;
953
954 // Scan identifier start character.
955 if (c0_ == '\\') {
956 uc32 c = ScanIdentifierUnicodeEscape();
957 // Only allow legal identifier start characters.
958 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {
959 return i::Token::ILLEGAL;
960 }
961 AddLiteralChar(c);
962 keyword_match.Fail();
963 } else {
964 AddLiteralChar(c0_);
965 keyword_match.AddChar(c0_);
966 Advance();
967 }
968
969 // Scan the rest of the identifier characters.
970 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
971 if (c0_ == '\\') {
972 uc32 c = ScanIdentifierUnicodeEscape();
973 // Only allow legal identifier part characters.
974 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {
975 return i::Token::ILLEGAL;
976 }
977 AddLiteralChar(c);
978 keyword_match.Fail();
979 } else {
980 AddLiteralChar(c0_);
981 keyword_match.AddChar(c0_);
982 Advance();
983 }
984 }
985 literal.Complete();
986
987 return keyword_match.token();
988 }
989
990
991 bool Scanner::ScanRegExpPattern(bool seen_equal) {
992 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
993 bool in_character_class = false;
994
995 // Previous token is either '/' or '/=', in the second case, the
996 // pattern starts at =.
997 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
998 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
999
1000 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1001 // the scanner should pass uninterpreted bodies to the RegExp
1002 // constructor.
1003 LiteralScope literal(this, kLiteralRegExp);
1004 if (seen_equal)
1005 AddLiteralChar('=');
1006
1007 while (c0_ != '/' || in_character_class) {
1008 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1009 return false;
1010 }
1011 if (c0_ == '\\') { // escaped character
1012 AddLiteralCharAdvance();
1013 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1014 return false;
1015 }
1016 AddLiteralCharAdvance();
1017 } else { // unescaped character
1018 if (c0_ == '[') in_character_class = true;
1019 if (c0_ == ']') in_character_class = false;
1020 AddLiteralCharAdvance();
1021 }
1022 }
1023 Advance(); // consume '/'
1024
1025 literal.Complete();
1026
1027 return true;
1028 }
1029
1030 bool Scanner::ScanRegExpFlags() {
1031 // Scan regular expression flags.
1032 LiteralScope literal(this, kLiteralRegExpFlags);
1033 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
1034 if (c0_ == '\\') {
1035 uc32 c = ScanIdentifierUnicodeEscape();
1036 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1037 // We allow any escaped character, unlike the restriction on
1038 // IdentifierPart when it is used to build an IdentifierName.
1039 AddLiteralChar(c);
1040 continue;
1041 }
1042 }
1043 AddLiteralCharAdvance();
1044 }
1045 literal.Complete();
1046
1047 next_.location.end_pos = source_pos() - 1;
1048 return true;
1049 }
1050
1051
1052 } } // namespace v8::preparser
1053
1054 #endif // V8_PRESCANNER_H_
OLDNEW
« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698