Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(856)

Side by Side Diff: src/prescanner.h

Issue 5063003: Add separate scanner only intended for preparsing. (Closed)
Patch Set: Address review comments. Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #ifndef V8_PRESCANNER_H_
29 #define V8_PRESCANNER_H_
30
31 #include "token.h"
32 #include "char-predicates-inl.h"
33 #include "utils.h"
34 #include "scanner-base.h"
35
36 namespace v8 {
37 namespace preparser {
38
39 namespace i = v8::internal;
40
typedef int uc32;

// Returns the numeric value (0..15) of the hexadecimal digit |c|, accepting
// both upper- and lower-case letters. Every other input, including negative
// end-of-input values, yields -1 so that callers can detect a malformed
// escape by testing for a negative result.
// Declared inline because it is defined in a header that may be included
// from more than one translation unit.
inline int HexValue(uc32 c) {
  if ('0' <= c && c <= '9') return c - '0';
  // Setting the ASCII case bit folds 'A'..'F' onto 'a'..'f'.
  const uc32 lower = c | 0x20;
  if ('a' <= lower && lower <= 'f') return lower - 'a' + 10;
  return -1;
}
54
55
// Detects deep recursion by comparing the address of a freshly created
// temporary against a limit computed when the guard was constructed.
// The arithmetic assumes addresses decrease as more stack is used.
class PreScannerStackGuard {
 public:
  // Places the limit |max_size| bytes below the address of a temporary
  // created during construction.
  explicit PreScannerStackGuard(int max_size)
      : limit_(StackPoint().at() - max_size) {}

  // Returns true when a temporary created now lies below the stored limit.
  bool has_overflowed() {
    char* const here = StackPoint().at();
    return here < limit_;
  }

 private:
  // Empty helper whose only job is to report its own address.
  class StackPoint {
   public:
    char* at() { return reinterpret_cast<char*>(this); }
  };

  // Lowest address considered acceptable.
  char* limit_;
};
70
71
72 // Scanner for preparsing.
73 // InputStream is a source of UC16 characters with limited push-back.
74 // LiteralsBuffer is a collector of (UTF-8) characters used to capture literals.
75 template <typename InputStream, typename LiteralsBuffer>
76 class Scanner {
77 public:
78 enum LiteralType {
79 kLiteralNumber,
80 kLiteralIdentifier,
81 kLiteralString,
82 kLiteralRegExp,
83 kLiteralRegExpFlags
84 };
85
86 class LiteralScope {
87 public:
88 explicit LiteralScope(Scanner* self, LiteralType type);
89 ~LiteralScope();
90 void Complete();
91
92 private:
93 Scanner* scanner_;
94 bool complete_;
95 };
96
97 Scanner();
98
99 void Initialize(InputStream* stream);
100
101 // Returns the next token.
102 i::Token::Value Next();
103
104 // Returns the current token again.
105 i::Token::Value current_token() { return current_.token; }
106
107 // One token look-ahead (past the token returned by Next()).
108 i::Token::Value peek() const { return next_.token; }
109
110 // Returns true if there was a line terminator before the peek'ed token.
111 bool has_line_terminator_before_next() const {
112 return has_line_terminator_before_next_;
113 }
114
115 struct Location {
116 Location(int b, int e) : beg_pos(b), end_pos(e) { }
117 Location() : beg_pos(0), end_pos(0) { }
118 int beg_pos;
119 int end_pos;
120 };
121
122 // Returns the location information for the current token
123 // (the token returned by Next()).
124 Location location() const { return current_.location; }
125 // Returns the location information for the look-ahead token
126 // (the token returned by peek()).
127 Location peek_location() const { return next_.location; }
128
129 // Returns the literal string, if any, for the current token (the
130 // token returned by Next()). The string is 0-terminated and in
131 // UTF-8 format; they may contain 0-characters. Literal strings are
132 // collected for identifiers, strings, and numbers.
133 // These functions only give the correct result if the literal
134 // was scanned between calls to StartLiteral() and TerminateLiteral().
135 const char* literal_string() const {
136 return current_.literal_chars;
137 }
138
139 int literal_length() const {
140 // Excluding terminal '\x00' added by TerminateLiteral().
141 return current_.literal_length - 1;
142 }
143
144 i::Vector<const char> literal() const {
145 return i::Vector<const char>(literal_string(), literal_length());
146 }
147
148 // Returns the literal string for the next token (the token that
149 // would be returned if Next() were called).
150 const char* next_literal_string() const {
151 return next_.literal_chars;
152 }
153
154
155 // Returns the length of the next token (that would be returned if
156 // Next() were called).
157 int next_literal_length() const {
158 // Excluding terminal '\x00' added by TerminateLiteral().
159 return next_.literal_length - 1;
160 }
161
162 i::Vector<const char> next_literal() const {
163 return i::Vector<const char>(next_literal_string(), next_literal_length());
164 }
165
166 // Scans the input as a regular expression pattern, previous
167 // character(s) must be /(=). Returns true if a pattern is scanned.
168 bool ScanRegExpPattern(bool seen_equal);
169 // Returns true if regexp flags are scanned (always since flags can
170 // be empty).
171 bool ScanRegExpFlags();
172
173 // Seek forward to the given position. This operation does not
174 // work in general, for instance when there are pushed back
175 // characters, but works for seeking forward until simple delimiter
176 // tokens, which is what it is used for.
177 void SeekForward(int pos);
178
179 bool stack_overflow() { return stack_overflow_; }
180
181 static const int kCharacterLookaheadBufferSize = 1;
182 static const int kNoEndPosition = 1;
183
184 private:
185 // The current and look-ahead token.
186 struct TokenDesc {
187 i::Token::Value token;
188 Location location;
189 const char* literal_chars;
190 int literal_length;
191 };
192
193 // Default stack limit is 128K pointers.
194 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.
195
196 void Init(unibrow::CharacterStream* stream);
197
198 // Literal buffer support
199 inline void StartLiteral(LiteralType type);
200 inline void AddLiteralChar(uc32 ch);
201 inline void AddLiteralCharAdvance();
202 inline void TerminateLiteral();
203 // Stops scanning of a literal, e.g., due to an encountered error.
204 inline void DropLiteral();
205
206 // Low-level scanning support.
207 void Advance() { c0_ = source_->Advance(); }
208 void PushBack(uc32 ch) {
209 source_->PushBack(ch);
210 c0_ = ch;
211 }
212
213 bool SkipWhiteSpace();
214
215 i::Token::Value SkipSingleLineComment();
216 i::Token::Value SkipMultiLineComment();
217
218 inline i::Token::Value Select(i::Token::Value tok);
219 inline i::Token::Value Select(uc32 next,
220 i::Token::Value then,
221 i::Token::Value else_);
222
223 // Scans a single JavaScript token.
224 void Scan();
225
226 void ScanDecimalDigits();
227 i::Token::Value ScanNumber(bool seen_period);
228 i::Token::Value ScanIdentifier();
229 uc32 ScanHexEscape(uc32 c, int length);
230 uc32 ScanOctalEscape(uc32 c, int length);
231 void ScanEscape();
232 i::Token::Value ScanString();
233
234 // Scans a possible HTML comment -- begins with '<!'.
235 i::Token::Value ScanHtmlComment();
236
237 // Return the current source position.
238 int source_pos() {
239 return source_->pos() - kCharacterLookaheadBufferSize;
240 }
241
242 // Decodes a unicode escape-sequence which is part of an identifier.
243 // If the escape sequence cannot be decoded the result is kBadRune.
244 uc32 ScanIdentifierUnicodeEscape();
245
246 PreScannerStackGuard stack_guard_;
247
248 TokenDesc current_; // desc for current token (as returned by Next())
249 TokenDesc next_; // desc for next token (one token look-ahead)
250 bool has_line_terminator_before_next_;
251
252 // Source.
253 InputStream* source_;
254
255 // Buffer to hold literal values (identifiers, strings, numerals, regexps and
256 // regexp flags) using '\x00'-terminated UTF-8 encoding.
257 // Handles allocation internally.
258 // Notice that the '\x00' termination is meaningless for strings and regexps
259 // which may contain the zero-character, but can be used as terminator for
260 // identifiers, numerals and regexp flags.
261 LiteralsBuffer literal_buffer_;
262
263 bool stack_overflow_;
264
265 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
266 uc32 c0_;
267 };
268
269
270 // ----------------------------------------------------------------------------
271 // Scanner::LiteralScope
272
273 template <typename InputStream, typename LiteralsBuffer>
274 Scanner<InputStream, LiteralsBuffer>::LiteralScope::LiteralScope(
275 Scanner* self, LiteralType type)
276 : scanner_(self), complete_(false) {
277 self->StartLiteral(type);
278 }
279
280
281 template <typename InputStream, typename LiteralsBuffer>
282 Scanner<InputStream, LiteralsBuffer>::LiteralScope::~LiteralScope() {
283 if (!complete_) scanner_->DropLiteral();
284 }
285
286 template <typename InputStream, typename LiteralsBuffer>
287 void Scanner<InputStream, LiteralsBuffer>::LiteralScope::Complete() {
288 scanner_->TerminateLiteral();
289 complete_ = true;
290 }
291
292
293 // ----------------------------------------------------------------------------
294 // Scanner.
295 template <typename InputStream, typename LiteralsBuffer>
296 Scanner<InputStream, LiteralsBuffer>::Scanner()
297 : stack_guard_(kMaxStackSize),
298 has_line_terminator_before_next_(false),
299 source_(NULL),
300 stack_overflow_(false) {}
301
302
303 template <typename InputStream, typename LiteralsBuffer>
304 void Scanner<InputStream, LiteralsBuffer>::Initialize(InputStream* stream) {
305 source_ = stream;
306
307 // Initialize current_ to not refer to a literal.
308 current_.literal_length = 0;
309 // Reset literal buffer.
310 literal_buffer_.Reset();
311
312 // Set c0_ (one character ahead)
313 ASSERT(kCharacterLookaheadBufferSize == 1);
314 Advance();
315
316 // Skip initial whitespace allowing HTML comment ends just like
317 // after a newline and scan first token.
318 has_line_terminator_before_next_ = true;
319 SkipWhiteSpace();
320 Scan();
321 }
322
323
324 template <typename InputStream, typename LiteralsBuffer>
325 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Next() {
326 // BUG 1215673: Find a thread safe way to set a stack limit in
327 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
328 // threads.
329 current_ = next_;
330 // Check for stack-overflow before returning any tokens.
331 if (stack_guard_.has_overflowed()) {
332 stack_overflow_ = true;
333 next_.token = i::Token::ILLEGAL;
334 } else {
335 has_line_terminator_before_next_ = false;
336 Scan();
337 }
338 return current_.token;
339 }
340
341
342 template <typename InputStream, typename LiteralsBuffer>
343 void Scanner<InputStream, LiteralsBuffer>::StartLiteral(LiteralType type) {
344 // Only record string and literal identifiers when preparsing.
345 // Those are the ones that are recorded as symbols. Numbers and
346 // regexps are not recorded.
347 if (type == kLiteralString || type == kLiteralIdentifier) {
348 literal_buffer_.StartLiteral();
349 }
350 }
351
352
353 template <typename InputStream, typename LiteralsBuffer>
354 void Scanner<InputStream, LiteralsBuffer>::AddLiteralChar(uc32 c) {
355 literal_buffer_.AddChar(c);
356 }
357
358
359 template <typename InputStream, typename LiteralsBuffer>
360 void Scanner<InputStream, LiteralsBuffer>::TerminateLiteral() {
361 i::Vector<const char> chars = literal_buffer_.EndLiteral();
362 next_.literal_chars = chars.start();
363 next_.literal_length = chars.length();
364 }
365
366
367 template <typename InputStream, typename LiteralsBuffer>
368 void Scanner<InputStream, LiteralsBuffer>::DropLiteral() {
369 literal_buffer_.DropLiteral();
370 }
371
372
373 template <typename InputStream, typename LiteralsBuffer>
374 void Scanner<InputStream, LiteralsBuffer>::AddLiteralCharAdvance() {
375 AddLiteralChar(c0_);
376 Advance();
377 }
378
379
380 static inline bool IsByteOrderMark(uc32 c) {
381 // The Unicode value U+FFFE is guaranteed never to be assigned as a
382 // Unicode character; this implies that in a Unicode context the
383 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
384 // character expressed in little-endian byte order (since it could
385 // not be a U+FFFE character expressed in big-endian byte
386 // order). Nevertheless, we check for it to be compatible with
387 // Spidermonkey.
388 return c == 0xFEFF || c == 0xFFFE;
389 }
390
391
392 template <typename InputStream, typename LiteralsBuffer>
393 bool Scanner<InputStream, LiteralsBuffer>::SkipWhiteSpace() {
394 int start_position = source_pos();
395
396 while (true) {
397 // We treat byte-order marks (BOMs) as whitespace for better
398 // compatibility with Spidermonkey and other JavaScript engines.
399 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)
400 || IsByteOrderMark(c0_)) {
401 // IsWhiteSpace() includes line terminators!
402 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {
403 // Ignore line terminators, but remember them. This is necessary
404 // for automatic semicolon insertion.
405 has_line_terminator_before_next_ = true;
406 }
407 Advance();
408 }
409
410 // If there is an HTML comment end '-->' at the beginning of a
411 // line (with only whitespace in front of it), we treat the rest
412 // of the line as a comment. This is in line with the way
413 // SpiderMonkey handles it.
414 if (c0_ == '-' && has_line_terminator_before_next_) {
415 Advance();
416 if (c0_ == '-') {
417 Advance();
418 if (c0_ == '>') {
419 // Treat the rest of the line as a comment.
420 SkipSingleLineComment();
421 // Continue skipping white space after the comment.
422 continue;
423 }
424 PushBack('-'); // undo Advance()
425 }
426 PushBack('-'); // undo Advance()
427 }
428 // Return whether or not we skipped any characters.
429 return source_pos() != start_position;
430 }
431 }
432
433
434 template <typename InputStream, typename LiteralsBuffer>
435 i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipSingleLineComment() {
436 Advance();
437
438 // The line terminator at the end of the line is not considered
439 // to be part of the single-line comment; it is recognized
440 // separately by the lexical grammar and becomes part of the
441 // stream of input elements for the syntactic grammar (see
442 // ECMA-262, section 7.4, page 12).
443 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
444 Advance();
445 }
446
447 return i::Token::WHITESPACE;
448 }
449
450
451 template <typename InputStream, typename LiteralsBuffer>
452 i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipMultiLineComment() {
453 ASSERT(c0_ == '*');
454 Advance();
455
456 while (c0_ >= 0) {
457 char ch = c0_;
458 Advance();
459 // If we have reached the end of the multi-line comment, we
460 // consume the '/' and insert a whitespace. This way all
461 // multi-line comments are treated as whitespace - even the ones
462 // containing line terminators. This contradicts ECMA-262, section
463 // 7.4, page 12, that says that multi-line comments containing
464 // line terminators should be treated as a line terminator, but it
465 // matches the behaviour of SpiderMonkey and KJS.
466 if (ch == '*' && c0_ == '/') {
467 c0_ = ' ';
468 return i::Token::WHITESPACE;
469 }
470 }
471
472 // Unterminated multi-line comment.
473 return i::Token::ILLEGAL;
474 }
475
476
477 template <typename InputStream, typename LiteralsBuffer>
478 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanHtmlComment() {
479 // Check for <!-- comments.
480 ASSERT(c0_ == '!');
481 Advance();
482 if (c0_ == '-') {
483 Advance();
484 if (c0_ == '-') return SkipSingleLineComment();
485 PushBack('-'); // undo Advance()
486 }
487 PushBack('!'); // undo Advance()
488 ASSERT(c0_ == '!');
489 return i::Token::LT;
490 }
491
492
493 template <typename InputStream, typename LiteralsBuffer>
494 void Scanner<InputStream, LiteralsBuffer>::Scan() {
495 next_.literal_length = 0;
496 i::Token::Value token;
497 do {
498 // Remember the position of the next token
499 next_.location.beg_pos = source_pos();
500
501 switch (c0_) {
502 case ' ':
503 case '\t':
504 Advance();
505 token = i::Token::WHITESPACE;
506 break;
507
508 case '\n':
509 Advance();
510 has_line_terminator_before_next_ = true;
511 token = i::Token::WHITESPACE;
512 break;
513
514 case '"': case '\'':
515 token = ScanString();
516 break;
517
518 case '<':
519 // < <= << <<= <!--
520 Advance();
521 if (c0_ == '=') {
522 token = Select(i::Token::LTE);
523 } else if (c0_ == '<') {
524 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);
525 } else if (c0_ == '!') {
526 token = ScanHtmlComment();
527 } else {
528 token = i::Token::LT;
529 }
530 break;
531
532 case '>':
533 // > >= >> >>= >>> >>>=
534 Advance();
535 if (c0_ == '=') {
536 token = Select(i::Token::GTE);
537 } else if (c0_ == '>') {
538 // >> >>= >>> >>>=
539 Advance();
540 if (c0_ == '=') {
541 token = Select(i::Token::ASSIGN_SAR);
542 } else if (c0_ == '>') {
543 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);
544 } else {
545 token = i::Token::SAR;
546 }
547 } else {
548 token = i::Token::GT;
549 }
550 break;
551
552 case '=':
553 // = == ===
554 Advance();
555 if (c0_ == '=') {
556 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);
557 } else {
558 token = i::Token::ASSIGN;
559 }
560 break;
561
562 case '!':
563 // ! != !==
564 Advance();
565 if (c0_ == '=') {
566 token = Select('=', i::Token::NE_STRICT, i::Token::NE);
567 } else {
568 token = i::Token::NOT;
569 }
570 break;
571
572 case '+':
573 // + ++ +=
574 Advance();
575 if (c0_ == '+') {
576 token = Select(i::Token::INC);
577 } else if (c0_ == '=') {
578 token = Select(i::Token::ASSIGN_ADD);
579 } else {
580 token = i::Token::ADD;
581 }
582 break;
583
584 case '-':
585 // - -- --> -=
586 Advance();
587 if (c0_ == '-') {
588 Advance();
589 if (c0_ == '>' && has_line_terminator_before_next_) {
590 // For compatibility with SpiderMonkey, we skip lines that
591 // start with an HTML comment end '-->'.
592 token = SkipSingleLineComment();
593 } else {
594 token = i::Token::DEC;
595 }
596 } else if (c0_ == '=') {
597 token = Select(i::Token::ASSIGN_SUB);
598 } else {
599 token = i::Token::SUB;
600 }
601 break;
602
603 case '*':
604 // * *=
605 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);
606 break;
607
608 case '%':
609 // % %=
610 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);
611 break;
612
613 case '/':
614 // / // /* /=
615 Advance();
616 if (c0_ == '/') {
617 token = SkipSingleLineComment();
618 } else if (c0_ == '*') {
619 token = SkipMultiLineComment();
620 } else if (c0_ == '=') {
621 token = Select(i::Token::ASSIGN_DIV);
622 } else {
623 token = i::Token::DIV;
624 }
625 break;
626
627 case '&':
628 // & && &=
629 Advance();
630 if (c0_ == '&') {
631 token = Select(i::Token::AND);
632 } else if (c0_ == '=') {
633 token = Select(i::Token::ASSIGN_BIT_AND);
634 } else {
635 token = i::Token::BIT_AND;
636 }
637 break;
638
639 case '|':
640 // | || |=
641 Advance();
642 if (c0_ == '|') {
643 token = Select(i::Token::OR);
644 } else if (c0_ == '=') {
645 token = Select(i::Token::ASSIGN_BIT_OR);
646 } else {
647 token = i::Token::BIT_OR;
648 }
649 break;
650
651 case '^':
652 // ^ ^=
653 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);
654 break;
655
656 case '.':
657 // . Number
658 Advance();
659 if (i::IsDecimalDigit(c0_)) {
660 token = ScanNumber(true);
661 } else {
662 token = i::Token::PERIOD;
663 }
664 break;
665
666 case ':':
667 token = Select(i::Token::COLON);
668 break;
669
670 case ';':
671 token = Select(i::Token::SEMICOLON);
672 break;
673
674 case ',':
675 token = Select(i::Token::COMMA);
676 break;
677
678 case '(':
679 token = Select(i::Token::LPAREN);
680 break;
681
682 case ')':
683 token = Select(i::Token::RPAREN);
684 break;
685
686 case '[':
687 token = Select(i::Token::LBRACK);
688 break;
689
690 case ']':
691 token = Select(i::Token::RBRACK);
692 break;
693
694 case '{':
695 token = Select(i::Token::LBRACE);
696 break;
697
698 case '}':
699 token = Select(i::Token::RBRACE);
700 break;
701
702 case '?':
703 token = Select(i::Token::CONDITIONAL);
704 break;
705
706 case '~':
707 token = Select(i::Token::BIT_NOT);
708 break;
709
710 default:
711 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {
712 token = ScanIdentifier();
713 } else if (i::IsDecimalDigit(c0_)) {
714 token = ScanNumber(false);
715 } else if (SkipWhiteSpace()) {
716 token = i::Token::WHITESPACE;
717 } else if (c0_ < 0) {
718 token = i::Token::EOS;
719 } else {
720 token = Select(i::Token::ILLEGAL);
721 }
722 break;
723 }
724
725 // Continue scanning for tokens as long as we're just skipping
726 // whitespace.
727 } while (token == i::Token::WHITESPACE);
728
729 next_.location.end_pos = source_pos();
730 next_.token = token;
731 }
732
733
734 template <typename InputStream, typename LiteralsBuffer>
735 void Scanner<InputStream, LiteralsBuffer>::SeekForward(int pos) {
736 source_->SeekForward(pos - 1);
737 Advance();
738 // This function is only called to seek to the location
739 // of the end of a function (at the "}" token). It doesn't matter
740 // whether there was a line terminator in the part we skip.
741 has_line_terminator_before_next_ = false;
742 Scan();
743 }
744
745
746 template <typename InputStream, typename LiteralsBuffer>
747 uc32 Scanner<InputStream, LiteralsBuffer>::ScanHexEscape(uc32 c, int length) {
748 ASSERT(length <= 4); // prevent overflow
749
750 uc32 digits[4];
751 uc32 x = 0;
752 for (int i = 0; i < length; i++) {
753 digits[i] = c0_;
754 int d = HexValue(c0_);
755 if (d < 0) {
756 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
757 // should be illegal, but other JS VMs just return the
758 // non-escaped version of the original character.
759
760 // Push back digits read, except the last one (in c0_).
761 for (int j = i-1; j >= 0; j--) {
762 PushBack(digits[j]);
763 }
764 // Notice: No handling of error - treat it as "\u"->"u".
765 return c;
766 }
767 x = x * 16 + d;
768 Advance();
769 }
770
771 return x;
772 }
773
774
775 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
776 // ECMA-262. Other JS VMs support them.
777 template <typename InputStream, typename LiteralsBuffer>
778 uc32 Scanner<InputStream, LiteralsBuffer>::ScanOctalEscape(
779 uc32 c, int length) {
780 uc32 x = c - '0';
781 for (int i = 0; i < length; i++) {
782 int d = c0_ - '0';
783 if (d < 0 || d > 7) break;
784 int nx = x * 8 + d;
785 if (nx >= 256) break;
786 x = nx;
787 Advance();
788 }
789 return x;
790 }
791
792
793 template <typename InputStream, typename LiteralsBuffer>
794 void Scanner<InputStream, LiteralsBuffer>::ScanEscape() {
795 uc32 c = c0_;
796 Advance();
797
798 // Skip escaped newlines.
799 if (i::ScannerConstants::kIsLineTerminator.get(c)) {
800 // Allow CR+LF newlines in multiline string literals.
801 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();
802 // Allow LF+CR newlines in multiline string literals.
803 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();
804 return;
805 }
806
807 switch (c) {
808 case '\'': // fall through
809 case '"' : // fall through
810 case '\\': break;
811 case 'b' : c = '\b'; break;
812 case 'f' : c = '\f'; break;
813 case 'n' : c = '\n'; break;
814 case 'r' : c = '\r'; break;
815 case 't' : c = '\t'; break;
816 case 'u' : c = ScanHexEscape(c, 4); break;
817 case 'v' : c = '\v'; break;
818 case 'x' : c = ScanHexEscape(c, 2); break;
819 case '0' : // fall through
820 case '1' : // fall through
821 case '2' : // fall through
822 case '3' : // fall through
823 case '4' : // fall through
824 case '5' : // fall through
825 case '6' : // fall through
826 case '7' : c = ScanOctalEscape(c, 2); break;
827 }
828
829 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
830 // should be illegal, but they are commonly handled
831 // as non-escaped characters by JS VMs.
832 AddLiteralChar(c);
833 }
834
835
836 template <typename InputStream, typename LiteralsBuffer>
837 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanString() {
838 uc32 quote = c0_;
839 Advance(); // consume quote
840
841 LiteralScope literal(this, kLiteralString);
842 while (c0_ != quote && c0_ >= 0
843 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {
844 uc32 c = c0_;
845 Advance();
846 if (c == '\\') {
847 if (c0_ < 0) return i::Token::ILLEGAL;
848 ScanEscape();
849 } else {
850 AddLiteralChar(c);
851 }
852 }
853 if (c0_ != quote) return i::Token::ILLEGAL;
854 literal.Complete();
855
856 Advance(); // consume quote
857 return i::Token::STRING;
858 }
859
860
861 template <typename InputStream, typename LiteralsBuffer>
862 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(
863 i::Token::Value tok) {
864 Advance();
865 return tok;
866 }
867
868
869 template <typename InputStream, typename LiteralsBuffer>
870 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(
871 uc32 next,
872 i::Token::Value then,
873 i::Token::Value else_) {
874 Advance();
875 if (c0_ == next) {
876 Advance();
877 return then;
878 } else {
879 return else_;
880 }
881 }
882
883
884 // Returns true if any decimal digits were scanned, returns false otherwise.
885 template <typename InputStream, typename LiteralsBuffer>
886 void Scanner<InputStream, LiteralsBuffer>::ScanDecimalDigits() {
887 while (i::IsDecimalDigit(c0_))
888 AddLiteralCharAdvance();
889 }
890
891
892 template <typename InputStream, typename LiteralsBuffer>
893 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanNumber(
894 bool seen_period) {
895 // c0_ is the first digit of the number or the fraction.
896 ASSERT(i::IsDecimalDigit(c0_));
897
898 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
899
900 LiteralScope literal(this, kLiteralNumber);
901 if (seen_period) {
902 // we have already seen a decimal point of the float
903 AddLiteralChar('.');
904 ScanDecimalDigits(); // we know we have at least one digit
905
906 } else {
907 // if the first character is '0' we must check for octals and hex
908 if (c0_ == '0') {
909 AddLiteralCharAdvance();
910
911 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
912 if (c0_ == 'x' || c0_ == 'X') {
913 // hex number
914 kind = HEX;
915 AddLiteralCharAdvance();
916 if (!i::IsHexDigit(c0_)) {
917 // we must have at least one hex digit after 'x'/'X'
918 return i::Token::ILLEGAL;
919 }
920 while (i::IsHexDigit(c0_)) {
921 AddLiteralCharAdvance();
922 }
923 } else if ('0' <= c0_ && c0_ <= '7') {
924 // (possible) octal number
925 kind = OCTAL;
926 while (true) {
927 if (c0_ == '8' || c0_ == '9') {
928 kind = DECIMAL;
929 break;
930 }
931 if (c0_ < '0' || '7' < c0_) break;
932 AddLiteralCharAdvance();
933 }
934 }
935 }
936
937 // Parse decimal digits and allow trailing fractional part.
938 if (kind == DECIMAL) {
939 ScanDecimalDigits(); // optional
940 if (c0_ == '.') {
941 AddLiteralCharAdvance();
942 ScanDecimalDigits(); // optional
943 }
944 }
945 }
946
947 // scan exponent, if any
948 if (c0_ == 'e' || c0_ == 'E') {
949 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
950 if (kind == OCTAL) return i::Token::ILLEGAL;
951 // scan exponent
952 AddLiteralCharAdvance();
953 if (c0_ == '+' || c0_ == '-')
954 AddLiteralCharAdvance();
955 if (!i::IsDecimalDigit(c0_)) {
956 // we must have at least one decimal digit after 'e'/'E'
957 return i::Token::ILLEGAL;
958 }
959 ScanDecimalDigits();
960 }
961
962 // The source character immediately following a numeric literal must
963 // not be an identifier start or a decimal digit; see ECMA-262
964 // section 7.8.3, page 17 (note that we read only one decimal digit
965 // if the value is 0).
966 if (i::IsDecimalDigit(c0_)
967 || i::ScannerConstants::kIsIdentifierStart.get(c0_))
968 return i::Token::ILLEGAL;
969
970 literal.Complete();
971
972 return i::Token::NUMBER;
973 }
974
975
976 template <typename InputStream, typename LiteralsBuffer>
977 uc32 Scanner<InputStream, LiteralsBuffer>::ScanIdentifierUnicodeEscape() {
978 Advance();
979 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
980 Advance();
981 uc32 c = ScanHexEscape('u', 4);
982 // We do not allow a unicode escape sequence to start another
983 // unicode escape sequence.
984 if (c == '\\') return unibrow::Utf8::kBadChar;
985 return c;
986 }
987
988
989 template <typename InputStream, typename LiteralsBuffer>
990 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanIdentifier() {
991 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));
992
993 LiteralScope literal(this, kLiteralIdentifier);
994 i::KeywordMatcher keyword_match;
995
996 // Scan identifier start character.
997 if (c0_ == '\\') {
998 uc32 c = ScanIdentifierUnicodeEscape();
999 // Only allow legal identifier start characters.
1000 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {
1001 return i::Token::ILLEGAL;
1002 }
1003 AddLiteralChar(c);
1004 keyword_match.Fail();
1005 } else {
1006 AddLiteralChar(c0_);
1007 keyword_match.AddChar(c0_);
1008 Advance();
1009 }
1010
1011 // Scan the rest of the identifier characters.
1012 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
1013 if (c0_ == '\\') {
1014 uc32 c = ScanIdentifierUnicodeEscape();
1015 // Only allow legal identifier part characters.
1016 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {
1017 return i::Token::ILLEGAL;
1018 }
1019 AddLiteralChar(c);
1020 keyword_match.Fail();
1021 } else {
1022 AddLiteralChar(c0_);
1023 keyword_match.AddChar(c0_);
1024 Advance();
1025 }
1026 }
1027 literal.Complete();
1028
1029 return keyword_match.token();
1030 }
1031
1032
1033 template <typename InputStream, typename LiteralsBuffer>
1034 bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpPattern(bool seen_equal) {
1035 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1036 bool in_character_class = false;
1037
1038 // Previous token is either '/' or '/=', in the second case, the
1039 // pattern starts at =.
1040 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1041 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1042
1043 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1044 // the scanner should pass uninterpreted bodies to the RegExp
1045 // constructor.
1046 LiteralScope literal(this, kLiteralRegExp);
1047 if (seen_equal)
1048 AddLiteralChar('=');
1049
1050 while (c0_ != '/' || in_character_class) {
1051 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1052 return false;
1053 }
1054 if (c0_ == '\\') { // escaped character
1055 AddLiteralCharAdvance();
1056 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
1057 return false;
1058 }
1059 AddLiteralCharAdvance();
1060 } else { // unescaped character
1061 if (c0_ == '[') in_character_class = true;
1062 if (c0_ == ']') in_character_class = false;
1063 AddLiteralCharAdvance();
1064 }
1065 }
1066 Advance(); // consume '/'
1067
1068 literal.Complete();
1069
1070 return true;
1071 }
1072
1073 template <typename InputStream, typename LiteralsBuffer>
1074 bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpFlags() {
1075 // Scan regular expression flags.
1076 LiteralScope literal(this, kLiteralRegExpFlags);
1077 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
1078 if (c0_ == '\\') {
1079 uc32 c = ScanIdentifierUnicodeEscape();
1080 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1081 // We allow any escaped character, unlike the restriction on
1082 // IdentifierPart when it is used to build an IdentifierName.
1083 AddLiteralChar(c);
1084 continue;
1085 }
1086 }
1087 AddLiteralCharAdvance();
1088 }
1089 literal.Complete();
1090
1091 next_.location.end_pos = source_pos() - 1;
1092 return true;
1093 }
1094
1095
1096 } } // namespace v8::preparser
1097
1098 #endif // V8_PRESCANNER_H_
OLDNEW
« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698