OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2010 the V8 project authors. All rights reserved. | |
2 // Redistribution and use in source and binary forms, with or without | |
3 // modification, are permitted provided that the following conditions are | |
4 // met: | |
5 // | |
6 // * Redistributions of source code must retain the above copyright | |
7 // notice, this list of conditions and the following disclaimer. | |
8 // * Redistributions in binary form must reproduce the above | |
9 // copyright notice, this list of conditions and the following | |
10 // disclaimer in the documentation and/or other materials provided | |
11 // with the distribution. | |
12 // * Neither the name of Google Inc. nor the names of its | |
13 // contributors may be used to endorse or promote products derived | |
14 // from this software without specific prior written permission. | |
15 // | |
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
27 | |
28 #ifndef V8_PRESCANNER_H_ | |
29 #define V8_PRESCANNER_H_ | |
30 | |
31 #include "token.h" | |
32 #include "char-predicates-inl.h" | |
33 #include "utils.h" | |
34 #include "scanner-base.h" | |
35 | |
36 namespace v8 { | |
37 namespace preparser { | |
38 | |
39 namespace i = v8::internal; | |
40 | |
typedef int uc32;

// Returns the numeric value (0..15) of the hexadecimal digit |c|.
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Every other input, including the
// negative values used to signal end of input, yields -1 so that callers
// can detect a non-digit with a simple "< 0" test.
//
// Declared inline because the definition lives in a header; without it,
// including this header from two translation units causes a duplicate
// symbol at link time.
inline int HexValue(uc32 c) {
  if ('0' <= c && c <= '9') return c - '0';
  // Setting bit 0x20 folds ASCII upper-case letters onto lower-case ones.
  // Only 'A'-'F' and 'a'-'f' can land in the 'a'..'f' range this way.
  const int lower = c | 0x20;
  if ('a' <= lower && lower <= 'f') return lower - 'a' + 10;
  return -1;
}
54 | |
55 | |
// Coarse stack-depth guard. At construction it records an address
// |max_size| bytes below the address of a temporary object; later,
// has_overflowed() reports whether a fresh temporary sits below that
// recorded address.
class PreScannerStackGuard {
 private:
  // Empty helper: the address of a temporary StackPoint is used as a probe
  // for the current stack position.
  class StackPoint {
   public:
    char* at() { return reinterpret_cast<char*>(this); }
  };

  // Lowest address the probe may reach before overflow is reported.
  char* limit_;

 public:
  explicit PreScannerStackGuard(int max_size)
      : limit_(StackPoint().at() - max_size) { }

  // True once the probe address has dropped below the recorded limit.
  bool has_overflowed() {
    char* probe = StackPoint().at();
    return probe < limit_;
  }
};
70 | |
71 | |
// Tokenizer for JavaScript source text with one token of look-ahead.
//
// Template parameters (requirements are those exercised by the code in this
// header):
//   UTF16Buffer - the input character stream. The scanner calls Advance()
//       (whose result becomes the look-ahead character c0_), PushBack(uc32),
//       pos() and SeekForward(int) on it.
//   UTF8Buffer  - the buffer that collects literal characters. The scanner
//       calls Reset(), StartLiteral(), AddChar(uc32), EndLiteral() (whose
//       result is stored in an i::Vector<const char>) and DropLiteral() on
//       it.
template <typename UTF16Buffer, typename UTF8Buffer>
class Scanner {
 public:
  // Kind of literal being collected; passed to StartLiteral().
  enum LiteralType {
    kLiteralNumber,
    kLiteralIdentifier,
    kLiteralString,
    kLiteralRegExp,
    kLiteralRegExpFlags
  };

  // Scoped helper for collecting a literal: the constructor calls
  // StartLiteral(), Complete() calls TerminateLiteral(), and the destructor
  // calls DropLiteral() if Complete() was never reached.
  class LiteralScope {
   public:
    explicit LiteralScope(Scanner* self, LiteralType type);
    ~LiteralScope();
    void Complete();

   private:
    Scanner* scanner_;
    bool complete_;
  };

  Scanner();

  // Attaches the scanner to |stream| and scans the first token, so that
  // peek() is valid afterwards.
  void Initialize(UTF16Buffer* stream);

  // Returns the next token.
  i::Token::Value Next();

  // Returns the current token again.
  i::Token::Value current_token() { return current_.token; }

  // One token look-ahead (past the token returned by Next()).
  i::Token::Value peek() const { return next_.token; }

  // Returns true if there was a line terminator before the peek'ed token.
  bool has_line_terminator_before_next() const {
    return has_line_terminator_before_next_;
  }

  // Source range of a token, as a pair of source positions.
  struct Location {
    Location(int b, int e) : beg_pos(b), end_pos(e) { }
    Location() : beg_pos(0), end_pos(0) { }
    int beg_pos;
    int end_pos;
  };

  // Returns the location information for the current token
  // (the token returned by Next()).
  Location location() const { return current_.location; }
  // Returns the location information for the look-ahead token
  // (the token reported by peek()).
  Location peek_location() const { return next_.location; }

  // Returns the literal string, if any, for the current token (the
  // token returned by Next()). The string is 0-terminated and in
  // UTF-8 format; they may contain 0-characters. Literal strings are
  // collected for identifiers, strings, and numbers.
  // These functions only give the correct result if the literal
  // was scanned between calls to StartLiteral() and TerminateLiteral().
  const char* literal_string() const {
    return current_.literal_chars;
  }

  // Length of the current token's literal. Note that a token without a
  // literal has a stored length of 0, for which this returns -1.
  int literal_length() const {
    // Excluding terminal '\x00' added by TerminateLiteral().
    return current_.literal_length - 1;
  }

  // The current token's literal as a (pointer, length) vector.
  i::Vector<const char> literal() const {
    return i::Vector<const char>(literal_string(), literal_length());
  }

  // Returns the literal string for the next token (the token that
  // would be returned if Next() were called).
  const char* next_literal_string() const {
    return next_.literal_chars;
  }


  // Returns the length of the next token (that would be returned if
  // Next() were called).
  int next_literal_length() const {
    // Excluding terminal '\x00' added by TerminateLiteral().
    return next_.literal_length - 1;
  }

  // The look-ahead token's literal as a (pointer, length) vector.
  i::Vector<const char> next_literal() const {
    return i::Vector<const char>(next_literal_string(), next_literal_length());
  }

  // Scans the input as a regular expression pattern, previous
  // character(s) must be /(=). Returns true if a pattern is scanned.
  bool ScanRegExpPattern(bool seen_equal);
  // Returns true if regexp flags are scanned (always since flags can
  // be empty).
  bool ScanRegExpFlags();

  // Seek forward to the given position.  This operation does not
  // work in general, for instance when there are pushed back
  // characters, but works for seeking forward until simple delimiter
  // tokens, which is what it is used for.
  void SeekForward(int pos);

  // True once Next() has detected a stack overflow via the stack guard.
  bool stack_overflow() { return stack_overflow_; }

  // Number of characters the scanner reads ahead of the reported source
  // position (see source_pos()).
  static const int kCharacterLookaheadBufferSize = 1;
  static const int kNoEndPosition = 1;

 private:
  // The current and look-ahead token.
  struct TokenDesc {
    i::Token::Value token;
    Location location;
    const char* literal_chars;
    int literal_length;
  };

  // Default stack limit is 128K pointers.
  static const int kMaxStackSize = 128 * 1024 * sizeof(void*);  // NOLINT.

  void Init(unibrow::CharacterStream* stream);

  // Literal buffer support
  inline void StartLiteral(LiteralType type);
  inline void AddChar(uc32 ch);
  inline void AddCharAdvance();
  inline void TerminateLiteral();
  // Stops scanning of a literal, e.g., due to an encountered error.
  inline void DropLiteral();

  // Low-level scanning support.
  // Advance() loads the next input character into c0_.
  void Advance() { c0_ = source_->Advance(); }
  // PushBack() hands |ch| back to the source and makes it the look-ahead
  // character again.
  void PushBack(uc32 ch) {
    source_->PushBack(ch);
    c0_ = ch;
  }

  // Skips white space; returns true if any characters were skipped.
  bool SkipWhiteSpace();

  i::Token::Value SkipSingleLineComment();
  i::Token::Value SkipMultiLineComment();

  // Select(tok) consumes one character and returns |tok|. The three-argument
  // form consumes one character, then consumes |next| as well and returns
  // |then| if it follows; otherwise it returns |else_|.
  inline i::Token::Value Select(i::Token::Value tok);
  inline i::Token::Value Select(uc32 next,
                                i::Token::Value then,
                                i::Token::Value else_);

  // Scans a single JavaScript token.
  void Scan();

  void ScanDecimalDigits();
  i::Token::Value ScanNumber(bool seen_period);
  i::Token::Value ScanIdentifier();
  uc32 ScanHexEscape(uc32 c, int length);
  uc32 ScanOctalEscape(uc32 c, int length);
  void ScanEscape();
  i::Token::Value ScanString();

  // Scans a possible HTML comment -- begins with '<!'.
  i::Token::Value ScanHtmlComment();

  // Return the current source position.
  int source_pos() {
    return source_->pos() - kCharacterLookaheadBufferSize;
  }

  // Decodes a unicode escape-sequence which is part of an identifier.
  // If the escape sequence cannot be decoded the result is kBadRune.
  uc32 ScanIdentifierUnicodeEscape();

  PreScannerStackGuard stack_guard_;

  TokenDesc current_;  // desc for current token (as returned by Next())
  TokenDesc next_;     // desc for next token (one token look-ahead)
  bool has_line_terminator_before_next_;

  // Source.
  UTF16Buffer* source_;

  // Buffer to hold literal values (identifiers, strings, numerals, regexps and
  // regexp flags) using '\x00'-terminated UTF-8 encoding.
  // Handles allocation internally.
  // Notice that the '\x00' termination is meaningless for strings and regexps
  // which may contain the zero-character, but can be used as terminator for
  // identifiers, numerals and regexp flags.
  UTF8Buffer literal_buffer_;

  bool stack_overflow_;

  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
  uc32 c0_;
};
263 | |
264 | |
265 // ---------------------------------------------------------------------------- | |
266 // Scanner::LiteralScope | |
267 | |
268 template <typename UTF16Buffer, typename UTF8Buffer> | |
269 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::LiteralScope( | |
270 Scanner* self, LiteralType type) | |
271 : scanner_(self), complete_(false) { | |
272 self->StartLiteral(type); | |
273 } | |
274 | |
275 | |
276 template <typename UTF16Buffer, typename UTF8Buffer> | |
277 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::~LiteralScope() { | |
278 if (!complete_) scanner_->DropLiteral(); | |
279 } | |
280 | |
281 template <typename UTF16Buffer, typename UTF8Buffer> | |
282 void Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::Complete() { | |
283 scanner_->TerminateLiteral(); | |
284 complete_ = true; | |
285 } | |
286 | |
287 | |
288 // ---------------------------------------------------------------------------- | |
289 // Scanner | |
290 template <typename UTF16Buffer, typename UTF8Buffer> | |
291 Scanner<UTF16Buffer, UTF8Buffer>::Scanner() | |
292 : stack_guard_(kMaxStackSize), | |
293 has_line_terminator_before_next_(false), | |
294 source_(NULL), | |
295 stack_overflow_(false) {} | |
296 | |
297 | |
298 template <typename UTF16Buffer, typename UTF8Buffer> | |
299 void Scanner<UTF16Buffer, UTF8Buffer>::Initialize(UTF16Buffer* stream) { | |
300 source_ = stream; | |
301 | |
302 // Initialize current_ to not refer to a literal. | |
303 current_.literal_length = 0; | |
304 // Reset literal buffer. | |
305 literal_buffer_.Reset(); | |
306 | |
307 // Set c0_ (one character ahead) | |
308 ASSERT(kCharacterLookaheadBufferSize == 1); | |
309 Advance(); | |
310 | |
311 // Skip initial whitespace allowing HTML comment ends just like | |
312 // after a newline and scan first token. | |
313 has_line_terminator_before_next_ = true; | |
314 SkipWhiteSpace(); | |
315 Scan(); | |
316 } | |
317 | |
318 | |
319 template <typename UTF16Buffer, typename UTF8Buffer> | |
320 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Next() { | |
321 // BUG 1215673: Find a thread safe way to set a stack limit in | |
322 // pre-parse mode. Otherwise, we cannot safely pre-parse from other | |
323 // threads. | |
324 current_ = next_; | |
325 // Check for stack-overflow before returning any tokens. | |
326 if (stack_guard_.has_overflowed()) { | |
327 stack_overflow_ = true; | |
328 next_.token = i::Token::ILLEGAL; | |
329 } else { | |
330 has_line_terminator_before_next_ = false; | |
331 Scan(); | |
332 } | |
333 return current_.token; | |
334 } | |
335 | |
336 | |
337 template <typename UTF16Buffer, typename UTF8Buffer> | |
338 void Scanner<UTF16Buffer, UTF8Buffer>::StartLiteral(LiteralType type) { | |
339 // Only record string and literal identifiers when preparsing. | |
340 // Those are the ones that are recorded as symbols. Numbers and | |
341 // regexps are not recorded. | |
342 if (type == kLiteralString || type == kLiteralIdentifier) { | |
343 literal_buffer_.StartLiteral(); | |
344 } | |
345 } | |
346 | |
347 | |
348 template <typename UTF16Buffer, typename UTF8Buffer> | |
349 void Scanner<UTF16Buffer, UTF8Buffer>::AddChar(uc32 c) { | |
Søren Thygesen Gjesse
2010/11/17 10:37:46
AddChar -> AddLiteralChar?
Lasse Reichstein
2010/11/17 13:08:39
Done.
Also in scanner.h/.cc.
| |
350 literal_buffer_.AddChar(c); | |
351 } | |
352 | |
353 | |
354 template <typename UTF16Buffer, typename UTF8Buffer> | |
355 void Scanner<UTF16Buffer, UTF8Buffer>::TerminateLiteral() { | |
356 i::Vector<const char> chars = literal_buffer_.EndLiteral(); | |
357 next_.literal_chars = chars.start(); | |
358 next_.literal_length = chars.length(); | |
359 } | |
360 | |
361 | |
362 template <typename UTF16Buffer, typename UTF8Buffer> | |
363 void Scanner<UTF16Buffer, UTF8Buffer>::DropLiteral() { | |
364 literal_buffer_.DropLiteral(); | |
365 } | |
366 | |
367 | |
368 template <typename UTF16Buffer, typename UTF8Buffer> | |
369 void Scanner<UTF16Buffer, UTF8Buffer>::AddCharAdvance() { | |
370 AddChar(c0_); | |
371 Advance(); | |
372 } | |
373 | |
374 | |
375 static inline bool IsByteOrderMark(uc32 c) { | |
376 // The Unicode value U+FFFE is guaranteed never to be assigned as a | |
377 // Unicode character; this implies that in a Unicode context the | |
378 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF | |
379 // character expressed in little-endian byte order (since it could | |
380 // not be a U+FFFE character expressed in big-endian byte | |
381 // order). Nevertheless, we check for it to be compatible with | |
382 // Spidermonkey. | |
383 return c == 0xFEFF || c == 0xFFFE; | |
384 } | |
385 | |
386 | |
387 template <typename UTF16Buffer, typename UTF8Buffer> | |
388 bool Scanner<UTF16Buffer, UTF8Buffer>::SkipWhiteSpace() { | |
389 int start_position = source_pos(); | |
390 | |
391 while (true) { | |
392 // We treat byte-order marks (BOMs) as whitespace for better | |
393 // compatibility with Spidermonkey and other JavaScript engines. | |
394 while (i::ScannerConstants::kIsWhiteSpace.get(c0_) | |
395 || IsByteOrderMark(c0_)) { | |
396 // IsWhiteSpace() includes line terminators! | |
397 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) { | |
398 // Ignore line terminators, but remember them. This is necessary | |
399 // for automatic semicolon insertion. | |
400 has_line_terminator_before_next_ = true; | |
401 } | |
402 Advance(); | |
403 } | |
404 | |
405 // If there is an HTML comment end '-->' at the beginning of a | |
406 // line (with only whitespace in front of it), we treat the rest | |
407 // of the line as a comment. This is in line with the way | |
408 // SpiderMonkey handles it. | |
409 if (c0_ == '-' && has_line_terminator_before_next_) { | |
410 Advance(); | |
411 if (c0_ == '-') { | |
412 Advance(); | |
413 if (c0_ == '>') { | |
414 // Treat the rest of the line as a comment. | |
415 SkipSingleLineComment(); | |
416 // Continue skipping white space after the comment. | |
417 continue; | |
418 } | |
419 PushBack('-'); // undo Advance() | |
420 } | |
421 PushBack('-'); // undo Advance() | |
422 } | |
423 // Return whether or not we skipped any characters. | |
424 return source_pos() != start_position; | |
425 } | |
426 } | |
427 | |
428 | |
429 template <typename UTF16Buffer, typename UTF8Buffer> | |
430 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipSingleLineComment() { | |
431 Advance(); | |
432 | |
433 // The line terminator at the end of the line is not considered | |
434 // to be part of the single-line comment; it is recognized | |
435 // separately by the lexical grammar and becomes part of the | |
436 // stream of input elements for the syntactic grammar (see | |
437 // ECMA-262, section 7.4, page 12). | |
438 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) { | |
439 Advance(); | |
440 } | |
441 | |
442 return i::Token::WHITESPACE; | |
443 } | |
444 | |
445 | |
446 template <typename UTF16Buffer, typename UTF8Buffer> | |
447 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipMultiLineComment() { | |
448 ASSERT(c0_ == '*'); | |
449 Advance(); | |
450 | |
451 while (c0_ >= 0) { | |
452 char ch = c0_; | |
453 Advance(); | |
454 // If we have reached the end of the multi-line comment, we | |
455 // consume the '/' and insert a whitespace. This way all | |
456 // multi-line comments are treated as whitespace - even the ones | |
457 // containing line terminators. This contradicts ECMA-262, section | |
458 // 7.4, page 12, that says that multi-line comments containing | |
459 // line terminators should be treated as a line terminator, but it | |
460 // matches the behaviour of SpiderMonkey and KJS. | |
461 if (ch == '*' && c0_ == '/') { | |
462 c0_ = ' '; | |
463 return i::Token::WHITESPACE; | |
464 } | |
465 } | |
466 | |
467 // Unterminated multi-line comment. | |
468 return i::Token::ILLEGAL; | |
469 } | |
470 | |
471 | |
472 template <typename UTF16Buffer, typename UTF8Buffer> | |
473 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanHtmlComment() { | |
474 // Check for <!-- comments. | |
475 ASSERT(c0_ == '!'); | |
476 Advance(); | |
477 if (c0_ == '-') { | |
478 Advance(); | |
479 if (c0_ == '-') return SkipSingleLineComment(); | |
480 PushBack('-'); // undo Advance() | |
481 } | |
482 PushBack('!'); // undo Advance() | |
483 ASSERT(c0_ == '!'); | |
484 return i::Token::LT; | |
485 } | |
486 | |
487 | |
// Scans one token starting at the look-ahead character c0_ and stores its
// kind and source range in next_. White space and comments are reported
// internally as WHITESPACE and skipped by looping until a real token (or
// EOS / ILLEGAL) is found. The literal length of next_ is reset to 0 up
// front; the literal scanners set it again via TerminateLiteral().
template <typename UTF16Buffer, typename UTF8Buffer>
void Scanner<UTF16Buffer, UTF8Buffer>::Scan() {
  next_.literal_length = 0;
  i::Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = i::Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        // Remembered for automatic semicolon insertion and for recognising
        // "-->" at the start of a line.
        has_line_terminator_before_next_ = true;
        token = i::Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(i::Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = i::Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(i::Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(i::Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);
          } else {
            token = i::Token::SAR;
          }
        } else {
          token = i::Token::GT;
        }
        break;

      case '=':
        // = == ===
        Advance();
        if (c0_ == '=') {
          token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);
        } else {
          token = i::Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', i::Token::NE_STRICT, i::Token::NE);
        } else {
          token = i::Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(i::Token::INC);
        } else if (c0_ == '=') {
          token = Select(i::Token::ASSIGN_ADD);
        } else {
          token = i::Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = i::Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(i::Token::ASSIGN_SUB);
        } else {
          token = i::Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          token = SkipSingleLineComment();
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(i::Token::ASSIGN_DIV);
        } else {
          token = i::Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(i::Token::AND);
        } else if (c0_ == '=') {
          token = Select(i::Token::ASSIGN_BIT_AND);
        } else {
          token = i::Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(i::Token::OR);
        } else if (c0_ == '=') {
          token = Select(i::Token::ASSIGN_BIT_OR);
        } else {
          token = i::Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (i::IsDecimalDigit(c0_)) {
          // The '.' has already been consumed; ScanNumber re-adds it to the
          // literal.
          token = ScanNumber(true);
        } else {
          token = i::Token::PERIOD;
        }
        break;

      case ':':
        token = Select(i::Token::COLON);
        break;

      case ';':
        token = Select(i::Token::SEMICOLON);
        break;

      case ',':
        token = Select(i::Token::COMMA);
        break;

      case '(':
        token = Select(i::Token::LPAREN);
        break;

      case ')':
        token = Select(i::Token::RPAREN);
        break;

      case '[':
        token = Select(i::Token::LBRACK);
        break;

      case ']':
        token = Select(i::Token::RBRACK);
        break;

      case '{':
        token = Select(i::Token::LBRACE);
        break;

      case '}':
        token = Select(i::Token::RBRACE);
        break;

      case '?':
        token = Select(i::Token::CONDITIONAL);
        break;

      case '~':
        token = Select(i::Token::BIT_NOT);
        break;

      default:
        // Anything not handled above: identifiers, numbers, the remaining
        // white space characters, end of input (negative c0_), or an
        // illegal character (which is consumed).
        if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {
          token = ScanIdentifier();
        } else if (i::IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = i::Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = i::Token::EOS;
        } else {
          token = Select(i::Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == i::Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
727 | |
728 | |
729 template <typename UTF16Buffer, typename UTF8Buffer> | |
730 void Scanner<UTF16Buffer, UTF8Buffer>::SeekForward(int pos) { | |
731 source_->SeekForward(pos - 1); | |
732 Advance(); | |
733 // This function is only called to seek to the location | |
734 // of the end of a function (at the "}" token). It doesn't matter | |
735 // whether there was a line terminator in the part we skip. | |
736 has_line_terminator_before_next_ = false; | |
737 Scan(); | |
738 } | |
739 | |
740 | |
741 template <typename UTF16Buffer, typename UTF8Buffer> | |
742 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanHexEscape(uc32 c, int length) { | |
743 ASSERT(length <= 4); // prevent overflow | |
744 | |
745 uc32 digits[4]; | |
746 uc32 x = 0; | |
747 for (int i = 0; i < length; i++) { | |
748 digits[i] = c0_; | |
749 int d = HexValue(c0_); | |
750 if (d < 0) { | |
751 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes | |
752 // should be illegal, but other JS VMs just return the | |
753 // non-escaped version of the original character. | |
754 | |
755 // Push back digits read, except the last one (in c0_). | |
756 for (int j = i-1; j >= 0; j--) { | |
757 PushBack(digits[j]); | |
758 } | |
759 // Notice: No handling of error - treat it as "\u"->"u". | |
760 return c; | |
761 } | |
762 x = x * 16 + d; | |
763 Advance(); | |
764 } | |
765 | |
766 return x; | |
767 } | |
768 | |
769 | |
770 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of | |
771 // ECMA-262. Other JS VMs support them. | |
772 template <typename UTF16Buffer, typename UTF8Buffer> | |
773 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanOctalEscape( | |
774 uc32 c, int length) { | |
775 uc32 x = c - '0'; | |
776 for (int i = 0; i < length; i++) { | |
777 int d = c0_ - '0'; | |
778 if (d < 0 || d > 7) break; | |
779 int nx = x * 8 + d; | |
780 if (nx >= 256) break; | |
781 x = nx; | |
782 Advance(); | |
783 } | |
784 return x; | |
785 } | |
786 | |
787 | |
788 template <typename UTF16Buffer, typename UTF8Buffer> | |
789 void Scanner<UTF16Buffer, UTF8Buffer>::ScanEscape() { | |
790 uc32 c = c0_; | |
791 Advance(); | |
792 | |
793 // Skip escaped newlines. | |
794 if (i::ScannerConstants::kIsLineTerminator.get(c)) { | |
795 // Allow CR+LF newlines in multiline string literals. | |
796 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance(); | |
797 // Allow LF+CR newlines in multiline string literals. | |
798 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance(); | |
799 return; | |
800 } | |
801 | |
802 switch (c) { | |
803 case '\'': // fall through | |
804 case '"' : // fall through | |
805 case '\\': break; | |
806 case 'b' : c = '\b'; break; | |
807 case 'f' : c = '\f'; break; | |
808 case 'n' : c = '\n'; break; | |
809 case 'r' : c = '\r'; break; | |
810 case 't' : c = '\t'; break; | |
811 case 'u' : c = ScanHexEscape(c, 4); break; | |
812 case 'v' : c = '\v'; break; | |
813 case 'x' : c = ScanHexEscape(c, 2); break; | |
814 case '0' : // fall through | |
815 case '1' : // fall through | |
816 case '2' : // fall through | |
817 case '3' : // fall through | |
818 case '4' : // fall through | |
819 case '5' : // fall through | |
820 case '6' : // fall through | |
821 case '7' : c = ScanOctalEscape(c, 2); break; | |
822 } | |
823 | |
824 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these | |
825 // should be illegal, but they are commonly handled | |
826 // as non-escaped characters by JS VMs. | |
827 AddChar(c); | |
828 } | |
829 | |
830 | |
831 template <typename UTF16Buffer, typename UTF8Buffer> | |
832 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanString() { | |
833 uc32 quote = c0_; | |
834 Advance(); // consume quote | |
835 | |
836 LiteralScope literal(this, kLiteralString); | |
837 while (c0_ != quote && c0_ >= 0 | |
838 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) { | |
839 uc32 c = c0_; | |
840 Advance(); | |
841 if (c == '\\') { | |
842 if (c0_ < 0) return i::Token::ILLEGAL; | |
843 ScanEscape(); | |
844 } else { | |
845 AddChar(c); | |
846 } | |
847 } | |
848 if (c0_ != quote) return i::Token::ILLEGAL; | |
849 literal.Complete(); | |
850 | |
851 Advance(); // consume quote | |
852 return i::Token::STRING; | |
853 } | |
854 | |
855 | |
856 template <typename UTF16Buffer, typename UTF8Buffer> | |
857 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select(i::Token::Value tok) { | |
858 Advance(); | |
859 return tok; | |
860 } | |
861 | |
862 | |
863 template <typename UTF16Buffer, typename UTF8Buffer> | |
864 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select( | |
865 uc32 next, | |
866 i::Token::Value then, | |
867 i::Token::Value else_) { | |
868 Advance(); | |
869 if (c0_ == next) { | |
870 Advance(); | |
871 return then; | |
872 } else { | |
873 return else_; | |
874 } | |
875 } | |
876 | |
877 | |
878 // Returns true if any decimal digits were scanned, returns false otherwise. | |
879 template <typename UTF16Buffer, typename UTF8Buffer> | |
880 void Scanner<UTF16Buffer, UTF8Buffer>::ScanDecimalDigits() { | |
881 while (i::IsDecimalDigit(c0_)) | |
882 AddCharAdvance(); | |
883 } | |
884 | |
885 | |
886 template <typename UTF16Buffer, typename UTF8Buffer> | |
887 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanNumber(bool seen_period) { | |
888 // c0_ is the first digit of the number or the fraction. | |
889 ASSERT(i::IsDecimalDigit(c0_)); | |
890 | |
891 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; | |
892 | |
893 LiteralScope literal(this, kLiteralNumber); | |
894 if (seen_period) { | |
895 // we have already seen a decimal point of the float | |
896 AddChar('.'); | |
897 ScanDecimalDigits(); // we know we have at least one digit | |
898 | |
899 } else { | |
900 // if the first character is '0' we must check for octals and hex | |
901 if (c0_ == '0') { | |
902 AddCharAdvance(); | |
903 | |
904 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number | |
905 if (c0_ == 'x' || c0_ == 'X') { | |
906 // hex number | |
907 kind = HEX; | |
908 AddCharAdvance(); | |
909 if (!i::IsHexDigit(c0_)) { | |
910 // we must have at least one hex digit after 'x'/'X' | |
911 return i::Token::ILLEGAL; | |
912 } | |
913 while (i::IsHexDigit(c0_)) { | |
914 AddCharAdvance(); | |
915 } | |
916 } else if ('0' <= c0_ && c0_ <= '7') { | |
917 // (possible) octal number | |
918 kind = OCTAL; | |
919 while (true) { | |
920 if (c0_ == '8' || c0_ == '9') { | |
921 kind = DECIMAL; | |
922 break; | |
923 } | |
924 if (c0_ < '0' || '7' < c0_) break; | |
925 AddCharAdvance(); | |
926 } | |
927 } | |
928 } | |
929 | |
930 // Parse decimal digits and allow trailing fractional part. | |
931 if (kind == DECIMAL) { | |
932 ScanDecimalDigits(); // optional | |
933 if (c0_ == '.') { | |
934 AddCharAdvance(); | |
935 ScanDecimalDigits(); // optional | |
936 } | |
937 } | |
938 } | |
939 | |
940 // scan exponent, if any | |
941 if (c0_ == 'e' || c0_ == 'E') { | |
942 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number | |
943 if (kind == OCTAL) return i::Token::ILLEGAL; | |
944 // scan exponent | |
945 AddCharAdvance(); | |
946 if (c0_ == '+' || c0_ == '-') | |
947 AddCharAdvance(); | |
948 if (!i::IsDecimalDigit(c0_)) { | |
949 // we must have at least one decimal digit after 'e'/'E' | |
950 return i::Token::ILLEGAL; | |
951 } | |
952 ScanDecimalDigits(); | |
953 } | |
954 | |
955 // The source character immediately following a numeric literal must | |
956 // not be an identifier start or a decimal digit; see ECMA-262 | |
957 // section 7.8.3, page 17 (note that we read only one decimal digit | |
958 // if the value is 0). | |
959 if (i::IsDecimalDigit(c0_) | |
960 || i::ScannerConstants::kIsIdentifierStart.get(c0_)) | |
961 return i::Token::ILLEGAL; | |
962 | |
963 literal.Complete(); | |
964 | |
965 return i::Token::NUMBER; | |
966 } | |
967 | |
968 | |
969 template <typename UTF16Buffer, typename UTF8Buffer> | |
970 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifierUnicodeEscape() { | |
971 Advance(); | |
972 if (c0_ != 'u') return unibrow::Utf8::kBadChar; | |
973 Advance(); | |
974 uc32 c = ScanHexEscape('u', 4); | |
975 // We do not allow a unicode escape sequence to start another | |
976 // unicode escape sequence. | |
977 if (c == '\\') return unibrow::Utf8::kBadChar; | |
978 return c; | |
979 } | |
980 | |
981 | |
982 template <typename UTF16Buffer, typename UTF8Buffer> | |
983 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifier() { | |
984 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_)); | |
985 | |
986 LiteralScope literal(this, kLiteralIdentifier); | |
987 i::KeywordMatcher keyword_match; | |
988 | |
989 // Scan identifier start character. | |
990 if (c0_ == '\\') { | |
991 uc32 c = ScanIdentifierUnicodeEscape(); | |
992 // Only allow legal identifier start characters. | |
993 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) { | |
994 return i::Token::ILLEGAL; | |
995 } | |
996 AddChar(c); | |
997 keyword_match.Fail(); | |
998 } else { | |
999 AddChar(c0_); | |
1000 keyword_match.AddChar(c0_); | |
1001 Advance(); | |
1002 } | |
1003 | |
1004 // Scan the rest of the identifier characters. | |
1005 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) { | |
1006 if (c0_ == '\\') { | |
1007 uc32 c = ScanIdentifierUnicodeEscape(); | |
1008 // Only allow legal identifier part characters. | |
1009 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) { | |
1010 return i::Token::ILLEGAL; | |
1011 } | |
1012 AddChar(c); | |
1013 keyword_match.Fail(); | |
1014 } else { | |
1015 AddChar(c0_); | |
1016 keyword_match.AddChar(c0_); | |
1017 Advance(); | |
1018 } | |
1019 } | |
1020 literal.Complete(); | |
1021 | |
1022 return keyword_match.token(); | |
1023 } | |
1024 | |
1025 | |
1026 template <typename UTF16Buffer, typename UTF8Buffer> | |
1027 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpPattern(bool seen_equal) { | |
1028 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags | |
1029 bool in_character_class = false; | |
1030 | |
1031 // Previous token is either '/' or '/=', in the second case, the | |
1032 // pattern starts at =. | |
1033 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); | |
1034 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); | |
1035 | |
1036 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, | |
1037 // the scanner should pass uninterpreted bodies to the RegExp | |
1038 // constructor. | |
1039 LiteralScope literal(this, kLiteralRegExp); | |
1040 if (seen_equal) | |
1041 AddChar('='); | |
1042 | |
1043 while (c0_ != '/' || in_character_class) { | |
1044 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) { | |
1045 return false; | |
1046 } | |
1047 if (c0_ == '\\') { // escaped character | |
1048 AddCharAdvance(); | |
1049 if (i::ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) { | |
1050 return false; | |
1051 } | |
1052 AddCharAdvance(); | |
1053 } else { // unescaped character | |
1054 if (c0_ == '[') in_character_class = true; | |
1055 if (c0_ == ']') in_character_class = false; | |
1056 AddCharAdvance(); | |
1057 } | |
1058 } | |
1059 Advance(); // consume '/' | |
1060 | |
1061 literal.Complete(); | |
1062 | |
1063 return true; | |
1064 } | |
1065 | |
1066 template <typename UTF16Buffer, typename UTF8Buffer> | |
1067 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpFlags() { | |
1068 // Scan regular expression flags. | |
1069 LiteralScope literal(this, kLiteralRegExpFlags); | |
1070 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) { | |
1071 if (c0_ == '\\') { | |
1072 uc32 c = ScanIdentifierUnicodeEscape(); | |
1073 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { | |
1074 // We allow any escaped character, unlike the restriction on | |
1075 // IdentifierPart when it is used to build an IdentifierName. | |
1076 AddChar(c); | |
1077 continue; | |
1078 } | |
1079 } | |
1080 AddCharAdvance(); | |
1081 } | |
1082 literal.Complete(); | |
1083 | |
1084 next_.location.end_pos = source_pos() - 1; | |
1085 return true; | |
1086 } | |
1087 | |
1088 | |
1089 } } // namespace v8::preparser | |
1090 | |
1091 #endif // V8_PRESCANNER_H_ | |
OLD | NEW |