Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Side by Side Diff: src/prescanner.h

Issue 5063003: Add separate scanner only intended for preparsing. (Closed)
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« src/parser.cc ('K') | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #ifndef V8_PRESCANNER_H_
29 #define V8_PRESCANNER_H_
30
31 #include "token.h"
32 #include "char-predicates-inl.h"
33 #include "utils.h"
34 #include "scanner-base.h"
35
36 namespace v8 {
37 namespace preparser {
38
39 namespace i = v8::internal;
40
typedef int uc32;

// Returns the numeric value (0..15) of the hexadecimal digit |c|, or -1 if
// |c| is not one of '0'-'9', 'a'-'f' or 'A'-'F' (this includes negative
// end-of-input values).
//
// Callers such as ScanHexEscape() rely on a negative result to detect
// malformed escapes, so every non-hex character must map to a negative value.
// Declared inline because this definition lives in a header.
inline int HexValue(uc32 c) {
  if ('0' <= c && c <= '9') return c - '0';
  // Setting bit 0x20 folds 'A'-'F' onto 'a'-'f' (i.e. lower-cases letters).
  const uc32 lower = c | 0x20;
  if ('a' <= lower && lower <= 'f') return lower - 'a' + 10;
  return -1;
}
54
55
// Simple stack-depth guard for the prescanner.
//
// On construction it takes the address of a temporary object as an
// approximation of the current stack position and records a limit max_size
// bytes below that address. has_overflowed() takes a fresh sample and reports
// whether it lies below the recorded limit.
//
// NOTE: the comparison only makes sense if the stack grows towards lower
// addresses; that assumption is built into the arithmetic below.
class PreScannerStackGuard {
 public:
  // max_size: number of bytes, counted downwards from the position sampled
  // in this constructor, that may be used before has_overflowed() reports
  // an overflow.
  explicit PreScannerStackGuard(int max_size)
      : limit_(StackPoint().at() - max_size) { }

  // Returns true if the current stack position is below the limit computed
  // at construction time. Does not modify the guard.
  bool has_overflowed() const {
    return StackPoint().at() < limit_;
  }

 private:
  // Empty helper; the address of a temporary StackPoint serves as the
  // sample of the current stack position.
  class StackPoint {
   public:
    char* at() { return reinterpret_cast<char*>(this); }
  };

  // Lowest address the stack may reach before it counts as overflowed.
  char* limit_;
};
70
71
72 template <typename UTF16Buffer, typename UTF8Buffer>
Søren Thygesen Gjesse 2010/11/17 10:37:46 Please explain these template parameters a bit. It
Lasse Reichstein 2010/11/17 13:08:39 Agree. The long term plan is to have a stand-alone
73 class Scanner {
74 public:
75 enum LiteralType {
76 kLiteralNumber,
77 kLiteralIdentifier,
78 kLiteralString,
79 kLiteralRegExp,
80 kLiteralRegExpFlags
81 };
82
83 class LiteralScope {
84 public:
85 explicit LiteralScope(Scanner* self, LiteralType type);
86 ~LiteralScope();
87 void Complete();
88
89 private:
90 Scanner* scanner_;
91 bool complete_;
92 };
93
94 Scanner();
95
96 void Initialize(UTF16Buffer* stream);
97
98 // Returns the next token.
99 i::Token::Value Next();
100
101 // Returns the current token again.
102 i::Token::Value current_token() { return current_.token; }
103
104 // One token look-ahead (past the token returned by Next()).
105 i::Token::Value peek() const { return next_.token; }
106
107 // Returns true if there was a line terminator before the peek'ed token.
108 bool has_line_terminator_before_next() const {
109 return has_line_terminator_before_next_;
110 }
111
112 struct Location {
113 Location(int b, int e) : beg_pos(b), end_pos(e) { }
114 Location() : beg_pos(0), end_pos(0) { }
115 int beg_pos;
116 int end_pos;
117 };
118
119 // Returns the location information for the current token
120 // (the token returned by Next()).
121 Location location() const { return current_.location; }
Søren Thygesen Gjesse 2010/11/17 10:37:46 Maybe add a separate comment for peek_location.
Lasse Reichstein 2010/11/17 13:08:39 Done.
122 Location peek_location() const { return next_.location; }
123
124 // Returns the literal string, if any, for the current token (the
125 // token returned by Next()). The string is 0-terminated and in
126 // UTF-8 format; they may contain 0-characters. Literal strings are
127 // collected for identifiers, strings, and numbers.
128 // These functions only give the correct result if the literal
129 // was scanned between calls to StartLiteral() and TerminateLiteral().
130 const char* literal_string() const {
131 return current_.literal_chars;
132 }
133
134 int literal_length() const {
135 // Excluding terminal '\x00' added by TerminateLiteral().
136 return current_.literal_length - 1;
137 }
138
139 i::Vector<const char> literal() const {
140 return i::Vector<const char>(literal_string(), literal_length());
141 }
142
143 // Returns the literal string for the next token (the token that
144 // would be returned if Next() were called).
145 const char* next_literal_string() const {
146 return next_.literal_chars;
147 }
148
149
150 // Returns the length of the next token (that would be returned if
151 // Next() were called).
152 int next_literal_length() const {
153 // Excluding terminal '\x00' added by TerminateLiteral().
154 return next_.literal_length - 1;
155 }
156
157 i::Vector<const char> next_literal() const {
158 return i::Vector<const char>(next_literal_string(), next_literal_length());
159 }
160
161 // Scans the input as a regular expression pattern, previous
162 // character(s) must be /(=). Returns true if a pattern is scanned.
163 bool ScanRegExpPattern(bool seen_equal);
164 // Returns true if regexp flags are scanned (always since flags can
165 // be empty).
166 bool ScanRegExpFlags();
167
168 // Seek forward to the given position. This operation does not
169 // work in general, for instance when there are pushed back
170 // characters, but works for seeking forward until simple delimiter
171 // tokens, which is what it is used for.
172 void SeekForward(int pos);
173
174 bool stack_overflow() { return stack_overflow_; }
175
176 static const int kCharacterLookaheadBufferSize = 1;
177 static const int kNoEndPosition = 1;
178
179 private:
180 // The current and look-ahead token.
181 struct TokenDesc {
182 i::Token::Value token;
183 Location location;
184 const char* literal_chars;
185 int literal_length;
186 };
187
188 // Default stack limit is 128K pointers.
189 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.
190
191 void Init(unibrow::CharacterStream* stream);
192
193 // Literal buffer support
194 inline void StartLiteral(LiteralType type);
195 inline void AddChar(uc32 ch);
196 inline void AddCharAdvance();
197 inline void TerminateLiteral();
198 // Stops scanning of a literal, e.g., due to an encountered error.
199 inline void DropLiteral();
200
201 // Low-level scanning support.
202 void Advance() { c0_ = source_->Advance(); }
203 void PushBack(uc32 ch) {
204 source_->PushBack(ch);
205 c0_ = ch;
206 }
207
208 bool SkipWhiteSpace();
209
210 i::Token::Value SkipSingleLineComment();
211 i::Token::Value SkipMultiLineComment();
212
213 inline i::Token::Value Select(i::Token::Value tok);
214 inline i::Token::Value Select(uc32 next,
215 i::Token::Value then,
216 i::Token::Value else_);
217
218 // Scans a single JavaScript token.
219 void Scan();
220
221 void ScanDecimalDigits();
222 i::Token::Value ScanNumber(bool seen_period);
223 i::Token::Value ScanIdentifier();
224 uc32 ScanHexEscape(uc32 c, int length);
225 uc32 ScanOctalEscape(uc32 c, int length);
226 void ScanEscape();
227 i::Token::Value ScanString();
228
229 // Scans a possible HTML comment -- begins with '<!'.
230 i::Token::Value ScanHtmlComment();
231
232 // Return the current source position.
233 int source_pos() {
234 return source_->pos() - kCharacterLookaheadBufferSize;
235 }
236
237 // Decodes a unicode escape-sequence which is part of an identifier.
238 // If the escape sequence cannot be decoded the result is kBadRune.
239 uc32 ScanIdentifierUnicodeEscape();
240
241 PreScannerStackGuard stack_guard_;
242
243 TokenDesc current_; // desc for current token (as returned by Next())
244 TokenDesc next_; // desc for next token (one token look-ahead)
245 bool has_line_terminator_before_next_;
246
247 // Source.
248 UTF16Buffer* source_;
249
250 // Buffer to hold literal values (identifiers, strings, numerals, regexps and
251 // regexp flags) using '\x00'-terminated UTF-8 encoding.
252 // Handles allocation internally.
253 // Notice that the '\x00' termination is meaningless for strings and regexps
254 // which may contain the zero-character, but can be used as terminator for
255 // identifiers, numerals and regexp flags.
256 UTF8Buffer literal_buffer_;
257
258 bool stack_overflow_;
259
260 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
261 uc32 c0_;
262 };
263
264
265 // ----------------------------------------------------------------------------
266 // Scanner::LiteralScope
267
268 template <typename UTF16Buffer, typename UTF8Buffer>
269 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::LiteralScope(
270 Scanner* self, LiteralType type)
271 : scanner_(self), complete_(false) {
272 self->StartLiteral(type);
273 }
274
275
276 template <typename UTF16Buffer, typename UTF8Buffer>
277 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::~LiteralScope() {
278 if (!complete_) scanner_->DropLiteral();
279 }
280
281 template <typename UTF16Buffer, typename UTF8Buffer>
282 void Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::Complete() {
283 scanner_->TerminateLiteral();
284 complete_ = true;
285 }
286
287
288 // ----------------------------------------------------------------------------
289 // Scanner
290 template <typename UTF16Buffer, typename UTF8Buffer>
291 Scanner<UTF16Buffer, UTF8Buffer>::Scanner()
292 : stack_guard_(kMaxStackSize),
293 has_line_terminator_before_next_(false),
294 source_(NULL),
295 stack_overflow_(false) {}
296
297
298 template <typename UTF16Buffer, typename UTF8Buffer>
299 void Scanner<UTF16Buffer, UTF8Buffer>::Initialize(UTF16Buffer* stream) {
300 source_ = stream;
301
302 // Initialize current_ to not refer to a literal.
303 current_.literal_length = 0;
304 // Reset literal buffer.
305 literal_buffer_.Reset();
306
307 // Set c0_ (one character ahead)
308 ASSERT(kCharacterLookaheadBufferSize == 1);
309 Advance();
310
311 // Skip initial whitespace allowing HTML comment ends just like
312 // after a newline and scan first token.
313 has_line_terminator_before_next_ = true;
314 SkipWhiteSpace();
315 Scan();
316 }
317
318
319 template <typename UTF16Buffer, typename UTF8Buffer>
320 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Next() {
321 // BUG 1215673: Find a thread safe way to set a stack limit in
322 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
323 // threads.
324 current_ = next_;
325 // Check for stack-overflow before returning any tokens.
326 if (stack_guard_.has_overflowed()) {
327 stack_overflow_ = true;
328 next_.token = i::Token::ILLEGAL;
329 } else {
330 has_line_terminator_before_next_ = false;
331 Scan();
332 }
333 return current_.token;
334 }
335
336
337 template <typename UTF16Buffer, typename UTF8Buffer>
338 void Scanner<UTF16Buffer, UTF8Buffer>::StartLiteral(LiteralType type) {
339 // Only record string and literal identifiers when preparsing.
340 // Those are the ones that are recorded as symbols. Numbers and
341 // regexps are not recorded.
342 if (type == kLiteralString || type == kLiteralIdentifier) {
343 literal_buffer_.StartLiteral();
344 }
345 }
346
347
348 template <typename UTF16Buffer, typename UTF8Buffer>
349 void Scanner<UTF16Buffer, UTF8Buffer>::AddChar(uc32 c) {
Søren Thygesen Gjesse 2010/11/17 10:37:46 AddChar -> AddLiteralChar?
Lasse Reichstein 2010/11/17 13:08:39 Done. Also in scanner.h/.cc.
350 literal_buffer_.AddChar(c);
351 }
352
353
354 template <typename UTF16Buffer, typename UTF8Buffer>
355 void Scanner<UTF16Buffer, UTF8Buffer>::TerminateLiteral() {
356 i::Vector<const char> chars = literal_buffer_.EndLiteral();
357 next_.literal_chars = chars.start();
358 next_.literal_length = chars.length();
359 }
360
361
362 template <typename UTF16Buffer, typename UTF8Buffer>
363 void Scanner<UTF16Buffer, UTF8Buffer>::DropLiteral() {
364 literal_buffer_.DropLiteral();
365 }
366
367
368 template <typename UTF16Buffer, typename UTF8Buffer>
369 void Scanner<UTF16Buffer, UTF8Buffer>::AddCharAdvance() {
370 AddChar(c0_);
371 Advance();
372 }
373
374
375 static inline bool IsByteOrderMark(uc32 c) {
376 // The Unicode value U+FFFE is guaranteed never to be assigned as a
377 // Unicode character; this implies that in a Unicode context the
378 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
379 // character expressed in little-endian byte order (since it could
380 // not be a U+FFFE character expressed in big-endian byte
381 // order). Nevertheless, we check for it to be compatible with
382 // Spidermonkey.
383 return c == 0xFEFF || c == 0xFFFE;
384 }
385
386
387 template <typename UTF16Buffer, typename UTF8Buffer>
388 bool Scanner<UTF16Buffer, UTF8Buffer>::SkipWhiteSpace() {
389 int start_position = source_pos();
390
391 while (true) {
392 // We treat byte-order marks (BOMs) as whitespace for better
393 // compatibility with Spidermonkey and other JavaScript engines.
394 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)
395 || IsByteOrderMark(c0_)) {
396 // IsWhiteSpace() includes line terminators!
397 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {
398 // Ignore line terminators, but remember them. This is necessary
399 // for automatic semicolon insertion.
400 has_line_terminator_before_next_ = true;
401 }
402 Advance();
403 }
404
405 // If there is an HTML comment end '-->' at the beginning of a
406 // line (with only whitespace in front of it), we treat the rest
407 // of the line as a comment. This is in line with the way
408 // SpiderMonkey handles it.
409 if (c0_ == '-' && has_line_terminator_before_next_) {
410 Advance();
411 if (c0_ == '-') {
412 Advance();
413 if (c0_ == '>') {
414 // Treat the rest of the line as a comment.
415 SkipSingleLineComment();
416 // Continue skipping white space after the comment.
417 continue;
418 }
419 PushBack('-'); // undo Advance()
420 }
421 PushBack('-'); // undo Advance()
422 }
423 // Return whether or not we skipped any characters.
424 return source_pos() != start_position;
425 }
426 }
427
428
429 template <typename UTF16Buffer, typename UTF8Buffer>
430 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipSingleLineComment() {
431 Advance();
432
433 // The line terminator at the end of the line is not considered
434 // to be part of the single-line comment; it is recognized
435 // separately by the lexical grammar and becomes part of the
436 // stream of input elements for the syntactic grammar (see
437 // ECMA-262, section 7.4, page 12).
438 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
439 Advance();
440 }
441
442 return i::Token::WHITESPACE;
443 }
444
445
446 template <typename UTF16Buffer, typename UTF8Buffer>
447 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipMultiLineComment() {
448 ASSERT(c0_ == '*');
449 Advance();
450
451 while (c0_ >= 0) {
452 char ch = c0_;
453 Advance();
454 // If we have reached the end of the multi-line comment, we
455 // consume the '/' and insert a whitespace. This way all
456 // multi-line comments are treated as whitespace - even the ones
457 // containing line terminators. This contradicts ECMA-262, section
458 // 7.4, page 12, that says that multi-line comments containing
459 // line terminators should be treated as a line terminator, but it
460 // matches the behaviour of SpiderMonkey and KJS.
461 if (ch == '*' && c0_ == '/') {
462 c0_ = ' ';
463 return i::Token::WHITESPACE;
464 }
465 }
466
467 // Unterminated multi-line comment.
468 return i::Token::ILLEGAL;
469 }
470
471
472 template <typename UTF16Buffer, typename UTF8Buffer>
473 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanHtmlComment() {
474 // Check for <!-- comments.
475 ASSERT(c0_ == '!');
476 Advance();
477 if (c0_ == '-') {
478 Advance();
479 if (c0_ == '-') return SkipSingleLineComment();
480 PushBack('-'); // undo Advance()
481 }
482 PushBack('!'); // undo Advance()
483 ASSERT(c0_ == '!');
484 return i::Token::LT;
485 }
486
487
488 template <typename UTF16Buffer, typename UTF8Buffer>
489 void Scanner<UTF16Buffer, UTF8Buffer>::Scan() {
490 next_.literal_length = 0;
491 i::Token::Value token;
492 do {
493 // Remember the position of the next token
494 next_.location.beg_pos = source_pos();
495
496 switch (c0_) {
497 case ' ':
498 case '\t':
499 Advance();
500 token = i::Token::WHITESPACE;
501 break;
502
503 case '\n':
504 Advance();
505 has_line_terminator_before_next_ = true;
506 token = i::Token::WHITESPACE;
507 break;
508
509 case '"': case '\'':
510 token = ScanString();
511 break;
512
513 case '<':
514 // < <= << <<= <!--
515 Advance();
516 if (c0_ == '=') {
517 token = Select(i::Token::LTE);
518 } else if (c0_ == '<') {
519 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);
520 } else if (c0_ == '!') {
521 token = ScanHtmlComment();
522 } else {
523 token = i::Token::LT;
524 }
525 break;
526
527 case '>':
528 // > >= >> >>= >>> >>>=
529 Advance();
530 if (c0_ == '=') {
531 token = Select(i::Token::GTE);
532 } else if (c0_ == '>') {
533 // >> >>= >>> >>>=
534 Advance();
535 if (c0_ == '=') {
536 token = Select(i::Token::ASSIGN_SAR);
537 } else if (c0_ == '>') {
538 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);
539 } else {
540 token = i::Token::SAR;
541 }
542 } else {
543 token = i::Token::GT;
544 }
545 break;
546
547 case '=':
548 // = == ===
549 Advance();
550 if (c0_ == '=') {
551 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);
552 } else {
553 token = i::Token::ASSIGN;
554 }
555 break;
556
557 case '!':
558 // ! != !==
559 Advance();
560 if (c0_ == '=') {
561 token = Select('=', i::Token::NE_STRICT, i::Token::NE);
562 } else {
563 token = i::Token::NOT;
564 }
565 break;
566
567 case '+':
568 // + ++ +=
569 Advance();
570 if (c0_ == '+') {
571 token = Select(i::Token::INC);
572 } else if (c0_ == '=') {
573 token = Select(i::Token::ASSIGN_ADD);
574 } else {
575 token = i::Token::ADD;
576 }
577 break;
578
579 case '-':
580 // - -- --> -=
581 Advance();
582 if (c0_ == '-') {
583 Advance();
584 if (c0_ == '>' && has_line_terminator_before_next_) {
585 // For compatibility with SpiderMonkey, we skip lines that
586 // start with an HTML comment end '-->'.
587 token = SkipSingleLineComment();
588 } else {
589 token = i::Token::DEC;
590 }
591 } else if (c0_ == '=') {
592 token = Select(i::Token::ASSIGN_SUB);
593 } else {
594 token = i::Token::SUB;
595 }
596 break;
597
598 case '*':
599 // * *=
600 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);
601 break;
602
603 case '%':
604 // % %=
605 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);
606 break;
607
608 case '/':
609 // / // /* /=
610 Advance();
611 if (c0_ == '/') {
612 token = SkipSingleLineComment();
613 } else if (c0_ == '*') {
614 token = SkipMultiLineComment();
615 } else if (c0_ == '=') {
616 token = Select(i::Token::ASSIGN_DIV);
617 } else {
618 token = i::Token::DIV;
619 }
620 break;
621
622 case '&':
623 // & && &=
624 Advance();
625 if (c0_ == '&') {
626 token = Select(i::Token::AND);
627 } else if (c0_ == '=') {
628 token = Select(i::Token::ASSIGN_BIT_AND);
629 } else {
630 token = i::Token::BIT_AND;
631 }
632 break;
633
634 case '|':
635 // | || |=
636 Advance();
637 if (c0_ == '|') {
638 token = Select(i::Token::OR);
639 } else if (c0_ == '=') {
640 token = Select(i::Token::ASSIGN_BIT_OR);
641 } else {
642 token = i::Token::BIT_OR;
643 }
644 break;
645
646 case '^':
647 // ^ ^=
648 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);
649 break;
650
651 case '.':
652 // . Number
653 Advance();
654 if (i::IsDecimalDigit(c0_)) {
655 token = ScanNumber(true);
656 } else {
657 token = i::Token::PERIOD;
658 }
659 break;
660
661 case ':':
662 token = Select(i::Token::COLON);
663 break;
664
665 case ';':
666 token = Select(i::Token::SEMICOLON);
667 break;
668
669 case ',':
670 token = Select(i::Token::COMMA);
671 break;
672
673 case '(':
674 token = Select(i::Token::LPAREN);
675 break;
676
677 case ')':
678 token = Select(i::Token::RPAREN);
679 break;
680
681 case '[':
682 token = Select(i::Token::LBRACK);
683 break;
684
685 case ']':
686 token = Select(i::Token::RBRACK);
687 break;
688
689 case '{':
690 token = Select(i::Token::LBRACE);
691 break;
692
693 case '}':
694 token = Select(i::Token::RBRACE);
695 break;
696
697 case '?':
698 token = Select(i::Token::CONDITIONAL);
699 break;
700
701 case '~':
702 token = Select(i::Token::BIT_NOT);
703 break;
704
705 default:
706 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {
707 token = ScanIdentifier();
708 } else if (i::IsDecimalDigit(c0_)) {
709 token = ScanNumber(false);
710 } else if (SkipWhiteSpace()) {
711 token = i::Token::WHITESPACE;
712 } else if (c0_ < 0) {
713 token = i::Token::EOS;
714 } else {
715 token = Select(i::Token::ILLEGAL);
716 }
717 break;
718 }
719
720 // Continue scanning for tokens as long as we're just skipping
721 // whitespace.
722 } while (token == i::Token::WHITESPACE);
723
724 next_.location.end_pos = source_pos();
725 next_.token = token;
726 }
727
728
729 template <typename UTF16Buffer, typename UTF8Buffer>
730 void Scanner<UTF16Buffer, UTF8Buffer>::SeekForward(int pos) {
731 source_->SeekForward(pos - 1);
732 Advance();
733 // This function is only called to seek to the location
734 // of the end of a function (at the "}" token). It doesn't matter
735 // whether there was a line terminator in the part we skip.
736 has_line_terminator_before_next_ = false;
737 Scan();
738 }
739
740
741 template <typename UTF16Buffer, typename UTF8Buffer>
742 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanHexEscape(uc32 c, int length) {
743 ASSERT(length <= 4); // prevent overflow
744
745 uc32 digits[4];
746 uc32 x = 0;
747 for (int i = 0; i < length; i++) {
748 digits[i] = c0_;
749 int d = HexValue(c0_);
750 if (d < 0) {
751 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
752 // should be illegal, but other JS VMs just return the
753 // non-escaped version of the original character.
754
755 // Push back digits read, except the last one (in c0_).
756 for (int j = i-1; j >= 0; j--) {
757 PushBack(digits[j]);
758 }
759 // Notice: No handling of error - treat it as "\u"->"u".
760 return c;
761 }
762 x = x * 16 + d;
763 Advance();
764 }
765
766 return x;
767 }
768
769
770 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
771 // ECMA-262. Other JS VMs support them.
772 template <typename UTF16Buffer, typename UTF8Buffer>
773 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanOctalEscape(
774 uc32 c, int length) {
775 uc32 x = c - '0';
776 for (int i = 0; i < length; i++) {
777 int d = c0_ - '0';
778 if (d < 0 || d > 7) break;
779 int nx = x * 8 + d;
780 if (nx >= 256) break;
781 x = nx;
782 Advance();
783 }
784 return x;
785 }
786
787
788 template <typename UTF16Buffer, typename UTF8Buffer>
789 void Scanner<UTF16Buffer, UTF8Buffer>::ScanEscape() {
790 uc32 c = c0_;
791 Advance();
792
793 // Skip escaped newlines.
794 if (i::ScannerConstants::kIsLineTerminator.get(c)) {
795 // Allow CR+LF newlines in multiline string literals.
796 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();
797 // Allow LF+CR newlines in multiline string literals.
798 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();
799 return;
800 }
801
802 switch (c) {
803 case '\'': // fall through
804 case '"' : // fall through
805 case '\\': break;
806 case 'b' : c = '\b'; break;
807 case 'f' : c = '\f'; break;
808 case 'n' : c = '\n'; break;
809 case 'r' : c = '\r'; break;
810 case 't' : c = '\t'; break;
811 case 'u' : c = ScanHexEscape(c, 4); break;
812 case 'v' : c = '\v'; break;
813 case 'x' : c = ScanHexEscape(c, 2); break;
814 case '0' : // fall through
815 case '1' : // fall through
816 case '2' : // fall through
817 case '3' : // fall through
818 case '4' : // fall through
819 case '5' : // fall through
820 case '6' : // fall through
821 case '7' : c = ScanOctalEscape(c, 2); break;
822 }
823
824 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
825 // should be illegal, but they are commonly handled
826 // as non-escaped characters by JS VMs.
827 AddChar(c);
828 }
829
830
831 template <typename UTF16Buffer, typename UTF8Buffer>
832 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanString() {
833 uc32 quote = c0_;
834 Advance(); // consume quote
835
836 LiteralScope literal(this, kLiteralString);
837 while (c0_ != quote && c0_ >= 0
838 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
839 uc32 c = c0_;
840 Advance();
841 if (c == '\\') {
842 if (c0_ < 0) return i::Token::ILLEGAL;
843 ScanEscape();
844 } else {
845 AddChar(c);
846 }
847 }
848 if (c0_ != quote) return i::Token::ILLEGAL;
849 literal.Complete();
850
851 Advance(); // consume quote
852 return i::Token::STRING;
853 }
854
855
856 template <typename UTF16Buffer, typename UTF8Buffer>
857 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select(i::Token::Value tok) {
858 Advance();
859 return tok;
860 }
861
862
863 template <typename UTF16Buffer, typename UTF8Buffer>
864 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select(
865 uc32 next,
866 i::Token::Value then,
867 i::Token::Value else_) {
868 Advance();
869 if (c0_ == next) {
870 Advance();
871 return then;
872 } else {
873 return else_;
874 }
875 }
876
877
878 // Returns true if any decimal digits were scanned, returns false otherwise.
879 template <typename UTF16Buffer, typename UTF8Buffer>
880 void Scanner<UTF16Buffer, UTF8Buffer>::ScanDecimalDigits() {
881 while (i::IsDecimalDigit(c0_))
882 AddCharAdvance();
883 }
884
885
886 template <typename UTF16Buffer, typename UTF8Buffer>
887 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanNumber(bool seen_period) {
888 // c0_ is the first digit of the number or the fraction.
889 ASSERT(i::IsDecimalDigit(c0_));
890
891 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
892
893 LiteralScope literal(this, kLiteralNumber);
894 if (seen_period) {
895 // we have already seen a decimal point of the float
896 AddChar('.');
897 ScanDecimalDigits(); // we know we have at least one digit
898
899 } else {
900 // if the first character is '0' we must check for octals and hex
901 if (c0_ == '0') {
902 AddCharAdvance();
903
904 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
905 if (c0_ == 'x' || c0_ == 'X') {
906 // hex number
907 kind = HEX;
908 AddCharAdvance();
909 if (!i::IsHexDigit(c0_)) {
910 // we must have at least one hex digit after 'x'/'X'
911 return i::Token::ILLEGAL;
912 }
913 while (i::IsHexDigit(c0_)) {
914 AddCharAdvance();
915 }
916 } else if ('0' <= c0_ && c0_ <= '7') {
917 // (possible) octal number
918 kind = OCTAL;
919 while (true) {
920 if (c0_ == '8' || c0_ == '9') {
921 kind = DECIMAL;
922 break;
923 }
924 if (c0_ < '0' || '7' < c0_) break;
925 AddCharAdvance();
926 }
927 }
928 }
929
930 // Parse decimal digits and allow trailing fractional part.
931 if (kind == DECIMAL) {
932 ScanDecimalDigits(); // optional
933 if (c0_ == '.') {
934 AddCharAdvance();
935 ScanDecimalDigits(); // optional
936 }
937 }
938 }
939
940 // scan exponent, if any
941 if (c0_ == 'e' || c0_ == 'E') {
942 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
943 if (kind == OCTAL) return i::Token::ILLEGAL;
944 // scan exponent
945 AddCharAdvance();
946 if (c0_ == '+' || c0_ == '-')
947 AddCharAdvance();
948 if (!i::IsDecimalDigit(c0_)) {
949 // we must have at least one decimal digit after 'e'/'E'
950 return i::Token::ILLEGAL;
951 }
952 ScanDecimalDigits();
953 }
954
955 // The source character immediately following a numeric literal must
956 // not be an identifier start or a decimal digit; see ECMA-262
957 // section 7.8.3, page 17 (note that we read only one decimal digit
958 // if the value is 0).
959 if (i::IsDecimalDigit(c0_)
960 || i::ScannerConstants::kIsIdentifierStart.get(c0_))
961 return i::Token::ILLEGAL;
962
963 literal.Complete();
964
965 return i::Token::NUMBER;
966 }
967
968
969 template <typename UTF16Buffer, typename UTF8Buffer>
970 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifierUnicodeEscape() {
971 Advance();
972 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
973 Advance();
974 uc32 c = ScanHexEscape('u', 4);
975 // We do not allow a unicode escape sequence to start another
976 // unicode escape sequence.
977 if (c == '\\') return unibrow::Utf8::kBadChar;
978 return c;
979 }
980
981
982 template <typename UTF16Buffer, typename UTF8Buffer>
983 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifier() {
984 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));
985
986 LiteralScope literal(this, kLiteralIdentifier);
987 i::KeywordMatcher keyword_match;
988
989 // Scan identifier start character.
990 if (c0_ == '\\') {
991 uc32 c = ScanIdentifierUnicodeEscape();
992 // Only allow legal identifier start characters.
993 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {
994 return i::Token::ILLEGAL;
995 }
996 AddChar(c);
997 keyword_match.Fail();
998 } else {
999 AddChar(c0_);
1000 keyword_match.AddChar(c0_);
1001 Advance();
1002 }
1003
1004 // Scan the rest of the identifier characters.
1005 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
1006 if (c0_ == '\\') {
1007 uc32 c = ScanIdentifierUnicodeEscape();
1008 // Only allow legal identifier part characters.
1009 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {
1010 return i::Token::ILLEGAL;
1011 }
1012 AddChar(c);
1013 keyword_match.Fail();
1014 } else {
1015 AddChar(c0_);
1016 keyword_match.AddChar(c0_);
1017 Advance();
1018 }
1019 }
1020 literal.Complete();
1021
1022 return keyword_match.token();
1023 }
1024
1025
1026 template <typename UTF16Buffer, typename UTF8Buffer>
1027 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpPattern(bool seen_equal) {
1028 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1029 bool in_character_class = false;
1030
1031 // Previous token is either '/' or '/=', in the second case, the
1032 // pattern starts at =.
1033 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1034 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1035
1036 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1037 // the scanner should pass uninterpreted bodies to the RegExp
1038 // constructor.
1039 LiteralScope literal(this, kLiteralRegExp);
1040 if (seen_equal)
1041 AddChar('=');
1042
1043 while (c0_ != '/' || in_character_class) {
1044 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1045 return false;
1046 }
1047 if (c0_ == '\\') { // escaped character
1048 AddCharAdvance();
1049 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1050 return false;
1051 }
1052 AddCharAdvance();
1053 } else { // unescaped character
1054 if (c0_ == '[') in_character_class = true;
1055 if (c0_ == ']') in_character_class = false;
1056 AddCharAdvance();
1057 }
1058 }
1059 Advance(); // consume '/'
1060
1061 literal.Complete();
1062
1063 return true;
1064 }
1065
1066 template <typename UTF16Buffer, typename UTF8Buffer>
1067 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpFlags() {
1068 // Scan regular expression flags.
1069 LiteralScope literal(this, kLiteralRegExpFlags);
1070 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
1071 if (c0_ == '\\') {
1072 uc32 c = ScanIdentifierUnicodeEscape();
1073 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1074 // We allow any escaped character, unlike the restriction on
1075 // IdentifierPart when it is used to build an IdentifierName.
1076 AddChar(c);
1077 continue;
1078 }
1079 }
1080 AddCharAdvance();
1081 }
1082 literal.Complete();
1083
1084 next_.location.end_pos = source_pos() - 1;
1085 return true;
1086 }
1087
1088
1089 } } // namespace v8::preparser
1090
1091 #endif // V8_PRESCANNER_H_
OLDNEW
« src/parser.cc ('K') | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698