OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 part of scanner; | |
6 | |
/**
 * Public scanner interface.
 *
 * The factory constructor chooses the implementation from the runtime type of
 * the given file: a [Utf8BytesScanner] for a [Utf8BytesSourceFile], and a
 * [StringScanner] for anything else.
 */
abstract class Scanner {
  /** Scans the input and returns a [Token]. */
  Token tokenize();

  factory Scanner(SourceFile file, {bool includeComments: false}) {
    if (file is Utf8BytesSourceFile) {
      return new Utf8BytesScanner(file, includeComments: includeComments);
    }
    return new StringScanner(file, includeComments: includeComments);
  }
}
18 | |
19 abstract class AbstractScanner implements Scanner { | |
20 // TODO(ahe): Move this class to implementation. | |
21 | |
22 final bool includeComments; | |
23 | |
24 /** | |
25 * The string offset for the next token that will be created. | |
26 * | |
27 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values | |
28 * are different. One string character can be encoded using multiple UTF-8 | |
29 * bytes. | |
30 */ | |
31 int tokenStart = -1; | |
32 | |
33 /** | |
34 * A pointer to the token stream created by this scanner. The first token | |
35 * is a special token and not part of the source file. This is an | |
36 * implementation detail to avoid special cases in the scanner. This token | |
37 * is not exposed to clients of the scanner, which are expected to invoke | |
38 * [firstToken] to access the token stream. | |
39 */ | |
40 final Token tokens = new SymbolToken(EOF_INFO, -1); | |
41 | |
42 /** | |
43 * A pointer to the last scanned token. | |
44 */ | |
45 Token tail; | |
46 | |
47 /** | |
48 * The source file that is being scanned. This field can be [:null:]. | |
49 * If the source file is available, the scanner assigns its [:lineStarts:] and | |
50 * [:length:] fields at the end of [tokenize]. | |
51 */ | |
52 final SourceFile file; | |
53 | |
54 final List<int> lineStarts = <int>[0]; | |
55 | |
/**
 * Stores [file] and [includeComments], and makes [tail] point at the initial
 * [tokens] entry.
 */
AbstractScanner(this.file, this.includeComments) {
  tail = tokens;
}
59 | |
60 /** | |
61 * Advances and returns the next character. | |
62 * | |
63 * If the next character is non-ASCII, then the returned value depends on the | |
64 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while | |
65 * the [StringScanner] returns a UTF-16 code unit. | |
66 * | |
67 * The scanner ensures that [advance] is not invoked after it returned [$EOF]. | |
68 * This allows implementations to omit bound checks if the data structure ends | |
69 * with '0'. | |
70 */ | |
71 int advance(); | |
72 | |
73 /** | |
74 * Returns the current unicode character. | |
75 * | |
76 * If the current character is ASCII, then it is returned unchanged. | |
77 * | |
78 * The [Utf8BytesScanner] decodes the next unicode code point starting at the | |
79 * current position. Note that every unicode character is returned as a single | |
80 * code point, that is, for '\u{1d11e}' it returns 119070, and the following | |
81 * [advance] returns the next character. | |
82 * | |
83 * The [StringScanner] returns the current character unchanged, which might | |
84 * be a surrogate character. In the case of '\u{1d11e}', it returns the first | |
85 * code unit 55348, and the following [advance] returns the second code unit | |
86 * 56606. | |
87 * | |
88 * Invoking [currentAsUnicode] multiple times is safe, i.e., | |
89 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):]. | |
90 */ | |
91 int currentAsUnicode(int next); | |
92 | |
93 /** | |
94 * Returns the character at the next position. Like in [advance], the | |
95 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns | |
96 * a UTF-16 code unit. | |
97 */ | |
98 int peek(); | |
99 | |
100 /** | |
101 * Notifies the scanner that unicode characters were detected in either a | |
102 * comment or a string literal between [startScanOffset] and the current | |
103 * scan offset. | |
104 */ | |
105 void handleUnicode(int startScanOffset); | |
106 | |
107 /** | |
108 * Returns the current scan offset. | |
109 * | |
110 * In the [Utf8BytesScanner] this is the offset into the byte list, in the | |
111 * [StringScanner] the offset in the source string. | |
112 */ | |
113 int get scanOffset; | |
114 | |
115 /** | |
116 * Returns the current string offset. | |
117 * | |
118 * In the [StringScanner] this is identical to the [scanOffset]. In the | |
119 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters. | |
120 */ | |
121 int get stringOffset; | |
122 | |
123 /** | |
124 * Returns the first token scanned by this [Scanner]. | |
125 */ | |
126 Token firstToken(); | |
127 | |
128 /** | |
129 * Returns the last token scanned by this [Scanner]. | |
130 */ | |
131 Token previousToken(); | |
132 | |
/**
 * Records the current [stringOffset] in [tokenStart], marking where the
 * token that is about to be scanned begins.
 */
void beginToken() {
  this.tokenStart = this.stringOffset;
}
139 | |
140 /** | |
141 * Appends a substring from the scan offset [:start:] to the current | |
142 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current | |
143 * scanOffset is 10, then [:appendSubstringToken(info, 5, true, -1):] will | |
144 * append the substring [5,9). | |
145 * | |
146 * Note that [extraOffset] can only be used if the covered character(s) are | |
147 * known to be ASCII. | |
148 */ | |
149 void appendSubstringToken(PrecedenceInfo info, int start, | |
150 bool asciiOnly, [int extraOffset]); | |
151 | |
152 /** Documentation in subclass [ArrayBasedScanner]. */ | |
153 void appendPrecedenceToken(PrecedenceInfo info); | |
154 | |
155 /** Documentation in subclass [ArrayBasedScanner]. */ | |
156 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); | |
157 | |
158 /** Documentation in subclass [ArrayBasedScanner]. */ | |
159 void appendKeywordToken(Keyword keyword); | |
160 | |
161 /** Documentation in subclass [ArrayBasedScanner]. */ | |
162 void appendEofToken(); | |
163 | |
164 /** Documentation in subclass [ArrayBasedScanner]. */ | |
165 void appendWhiteSpace(int next); | |
166 | |
167 /** Documentation in subclass [ArrayBasedScanner]. */ | |
168 void lineFeedInMultiline(); | |
169 | |
170 /** Documentation in subclass [ArrayBasedScanner]. */ | |
171 void appendBeginGroup(PrecedenceInfo info); | |
172 | |
173 /** Documentation in subclass [ArrayBasedScanner]. */ | |
174 int appendEndGroup(PrecedenceInfo info, int openKind); | |
175 | |
176 /** Documentation in subclass [ArrayBasedScanner]. */ | |
177 void appendGt(PrecedenceInfo info); | |
178 | |
179 /** Documentation in subclass [ArrayBasedScanner]. */ | |
180 void appendGtGt(PrecedenceInfo info); | |
181 | |
182 /** Documentation in subclass [ArrayBasedScanner]. */ | |
183 void appendComment(start, bool asciiOnly); | |
184 | |
185 /// Append [token] to the token stream. | |
186 void appendErrorToken(ErrorToken token); | |
187 | |
188 /** Documentation in subclass [ArrayBasedScanner]. */ | |
189 void discardOpenLt(); | |
190 | |
191 /// Return true when at EOF. | |
192 bool atEndOfFile(); | |
193 | |
/**
 * Scans the whole input and returns [firstToken].
 *
 * Each pass of the outer loop feeds characters to [bigSwitch] until it yields
 * [$EOF]. If the scanner is then [atEndOfFile], the EOF token is appended;
 * otherwise the [$EOF] value is reported through [unexpected] and scanning
 * resumes.
 *
 * When [file] is not null, its [:length:] and [:lineStarts:] are assigned
 * before returning.
 */
Token tokenize() {
  while (!atEndOfFile()) {
    int current = advance();
    while (!identical(current, $EOF)) {
      current = bigSwitch(current);
    }
    if (!atEndOfFile()) {
      unexpected($EOF);
    } else {
      appendEofToken();
    }
  }

  if (file != null) {
    file.length = stringOffset;
    // One additional line start at the end, see [SourceFile.lineStarts].
    lineStarts.add(stringOffset + 1);
    file.lineStarts = lineStarts;
  }

  return firstToken();
}
216 | |
/**
 * Scans the token (or whitespace) that begins with [next] and returns the
 * first character that follows it.
 *
 * Calls [beginToken], then tests [next] against a chain of cases, tried in
 * order, and hands it to the matching [:tokenize*:] or [:append*:] method.
 * [$EOF] is returned as is; a character that matches no case is reported
 * through [unexpected].
 */
int bigSwitch(int next) {
  beginToken();

  // --- Whitespace ---
  if (identical(next, $SPACE) ||
      identical(next, $TAB) ||
      identical(next, $LF) ||
      identical(next, $CR)) {
    appendWhiteSpace(next);
    // Runs of spaces are common, so skip them quickly. For efficiency
    // [:appendWhiteSpace:] is not invoked again, on the assumption that it
    // does nothing for a space character.
    do {
      next = advance();
    } while (identical(next, $SPACE));
    return next;
  }

  // --- Identifiers, keywords and raw strings ---
  if ($a <= next && next <= $z) {
    return identical($r, next)
        ? tokenizeRawStringKeywordOrIdentifier(next)
        : tokenizeKeywordOrIdentifier(next, true);
  }
  if (($A <= next && next <= $Z) ||
      identical(next, $_) ||
      identical(next, $$)) {
    return tokenizeIdentifier(next, scanOffset, true);
  }

  // --- Operators that may span several characters ---
  if (identical(next, $LT)) return tokenizeLessThan(next);
  if (identical(next, $GT)) return tokenizeGreaterThan(next);
  if (identical(next, $EQ)) return tokenizeEquals(next);
  if (identical(next, $BANG)) return tokenizeExclamation(next);
  if (identical(next, $PLUS)) return tokenizePlus(next);
  if (identical(next, $MINUS)) return tokenizeMinus(next);
  if (identical(next, $STAR)) return tokenizeMultiply(next);
  if (identical(next, $PERCENT)) return tokenizePercent(next);
  if (identical(next, $AMPERSAND)) return tokenizeAmpersand(next);
  if (identical(next, $BAR)) return tokenizeBar(next);
  if (identical(next, $CARET)) return tokenizeCaret(next);
  if (identical(next, $OPEN_SQUARE_BRACKET)) {
    return tokenizeOpenSquareBracket(next);
  }
  if (identical(next, $TILDE)) return tokenizeTilde(next);

  // --- Punctuation and grouping characters ---
  if (identical(next, $BACKSLASH)) {
    appendPrecedenceToken(BACKSLASH_INFO);
    return advance();
  }
  if (identical(next, $HASH)) return tokenizeTag(next);
  if (identical(next, $OPEN_PAREN)) {
    appendBeginGroup(OPEN_PAREN_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_PAREN)) {
    return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
  }
  if (identical(next, $COMMA)) {
    appendPrecedenceToken(COMMA_INFO);
    return advance();
  }
  if (identical(next, $COLON)) {
    appendPrecedenceToken(COLON_INFO);
    return advance();
  }
  if (identical(next, $SEMICOLON)) {
    appendPrecedenceToken(SEMICOLON_INFO);
    // Type parameters and arguments cannot contain semicolon.
    discardOpenLt();
    return advance();
  }
  if (identical(next, $QUESTION)) {
    appendPrecedenceToken(QUESTION_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
                          OPEN_SQUARE_BRACKET_TOKEN);
  }
  if (identical(next, $BACKPING)) {
    appendPrecedenceToken(BACKPING_INFO);
    return advance();
  }
  if (identical(next, $OPEN_CURLY_BRACKET)) {
    appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_CURLY_BRACKET)) {
    return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
                          OPEN_CURLY_BRACKET_TOKEN);
  }

  // --- Comments, metadata, strings and numbers ---
  if (identical(next, $SLASH)) return tokenizeSlashOrComment(next);
  if (identical(next, $AT)) return tokenizeAt(next);
  if (identical(next, $DQ) || identical(next, $SQ)) {
    return tokenizeString(next, scanOffset, false);
  }
  if (identical(next, $PERIOD)) return tokenizeDotsOrNumber(next);
  if (identical(next, $0)) return tokenizeHexOrNumber(next);
  // A range check replaces the nine individual comparisons against
  // $1 .. $9; other digit tests in this scanner use the same form.
  if ($1 <= next && next <= $9) return tokenizeNumber(next);

  // --- End of input and unexpected characters ---
  if (identical(next, $EOF)) return $EOF;
  if (next < 0x1f) return unexpected(next);

  next = currentAsUnicode(next);

  // The following are non-ASCII characters.
  if (identical(next, $NBSP)) {
    appendWhiteSpace(next);
    return advance();
  }

  return unexpected(next);
}
402 | |
/**
 * Scans a [:#:] character.
 *
 * When the scan offset is 0 and the following character is [:!:], everything
 * up to (not including) the next line feed, carriage return or [$EOF] is
 * consumed without appending a token. Otherwise a hash token is appended.
 */
int tokenizeTag(int next) {
  // # or #!.*[\n\r]
  if (scanOffset == 0 && identical(peek(), $BANG)) {
    final int lineStart = scanOffset + 1;
    bool sawNonAscii = false;
    do {
      next = advance();
      if (next > 127) sawNonAscii = true;
    } while (!(identical(next, $LF) ||
               identical(next, $CR) ||
               identical(next, $EOF)));
    if (sawNonAscii) handleUnicode(lineStart);
    return next;
  }
  appendPrecedenceToken(HASH_INFO);
  return advance();
}
422 | |
/**
 * Scans one of [:~:], [:~/:] or [:~/=:] and returns the character after it.
 */
int tokenizeTilde(int next) {
  next = advance();
  if (!identical(next, $SLASH)) {
    // Plain '~'.
    appendPrecedenceToken(TILDE_INFO);
    return next;
  }
  // '~/' possibly followed by '='.
  return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
}
433 | |
/**
 * Scans [:[:], [:[]:] or [:[]=:].
 *
 * The index forms are only recognized when the previous token is the
 * [:operator:] keyword or a hash symbol; in every other case an opening
 * square bracket group is started and the character after [:[:] is returned.
 */
int tokenizeOpenSquareBracket(int next) {
  next = advance();
  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    final Token previous = previousToken();
    if ((previous is KeywordToken && previous.keyword.syntax == 'operator') ||
        (previous is SymbolToken && previous.info == HASH_INFO)) {
      return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
    }
  }
  appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
  return next;
}
447 | |
/** Scans [:^:] or [:^=:] by delegating to [select]. */
int tokenizeCaret(int next) => select($EQ, CARET_EQ_INFO, CARET_INFO);
452 | |
/**
 * Scans one of [:|:], [:||:] or [:|=:] and returns the character after it.
 */
int tokenizeBar(int next) {
  next = advance();
  if (identical(next, $BAR)) {
    appendPrecedenceToken(BAR_BAR_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(BAR_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(BAR_INFO);
  return next;
}
467 | |
/**
 * Scans one of [:&:], [:&&:] or [:&=:] and returns the character after it.
 */
int tokenizeAmpersand(int next) {
  next = advance();
  if (identical(next, $AMPERSAND)) {
    appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(AMPERSAND_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(AMPERSAND_INFO);
  return next;
}
482 | |
/** Scans [:%:] or [:%=:] by delegating to [select]. */
int tokenizePercent(int next) => select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);
487 | |
/** Scans [:*:] or [:*=:] by delegating to [select]. */
int tokenizeMultiply(int next) => select($EQ, STAR_EQ_INFO, STAR_INFO);
492 | |
/**
 * Scans one of [:-:], [:--:] or [:-=:] and returns the character after it.
 */
int tokenizeMinus(int next) {
  next = advance();
  if (identical(next, $MINUS)) {
    appendPrecedenceToken(MINUS_MINUS_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(MINUS_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(MINUS_INFO);
  return next;
}
507 | |
/**
 * Scans one of [:+:], [:++:] or [:+=:] and returns the character after it.
 */
int tokenizePlus(int next) {
  next = advance();
  if (identical($PLUS, next)) {
    appendPrecedenceToken(PLUS_PLUS_INFO);
    return advance();
  }
  if (identical($EQ, next)) {
    appendPrecedenceToken(PLUS_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(PLUS_INFO);
  return next;
}
522 | |
/**
 * Scans [:!:] or [:!=:]. The [:!==:] form is also tokenized, which is kept
 * for user-friendly error reporting.
 */
int tokenizeExclamation(int next) {
  next = advance();
  if (!identical(next, $EQ)) {
    appendPrecedenceToken(BANG_INFO);
    return next;
  }
  return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
}
534 | |
/**
 * Scans one of [:=:], [:==:] or [:=>:]. The [:===:] form is also tokenized,
 * which is kept for user-friendly error reporting.
 */
int tokenizeEquals(int next) {
  // No token starting with '=' can occur inside type parameters or type
  // arguments.
  discardOpenLt();

  next = advance();
  if (identical(next, $EQ)) {
    return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
  }
  if (identical(next, $GT)) {
    appendPrecedenceToken(FUNCTION_INFO);
    return advance();
  }
  appendPrecedenceToken(EQ_INFO);
  return next;
}
553 | |
/**
 * Scans one of [:>:], [:>=:], [:>>:] or [:>>=:] and returns the character
 * after it. The [:>:] and [:>>:] forms go through [appendGt] and
 * [appendGtGt] rather than [appendPrecedenceToken].
 */
int tokenizeGreaterThan(int next) {
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_EQ_INFO);
    return advance();
  }
  if (!identical($GT, next)) {
    appendGt(GT_INFO);
    return next;
  }
  // Two '>' seen so far.
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_GT_EQ_INFO);
    return advance();
  }
  appendGtGt(GT_GT_INFO);
  return next;
}
574 | |
/**
 * Scans one of [:<:], [:<=:], [:<<:] or [:<<=:] and returns the character
 * after it. A lone [:<:] is appended through [appendBeginGroup].
 */
int tokenizeLessThan(int next) {
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(LT_EQ_INFO);
    return advance();
  }
  if (identical($LT, next)) {
    return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
  }
  appendBeginGroup(LT_INFO);
  return next;
}
588 | |
/**
 * Scans a run of decimal digits starting at the current scan offset.
 *
 * If the digits are followed by [:e:]/[:E:], or by a period that is itself
 * followed by a digit, scanning continues in [tokenizeFractionPart].
 * Otherwise an integer token is appended and the first character after the
 * digits is returned.
 */
int tokenizeNumber(int next) {
  final int start = scanOffset;
  do {
    next = advance();
  } while ($0 <= next && next <= $9);

  if (identical(next, $e) || identical(next, $E)) {
    return tokenizeFractionPart(next, start);
  }
  if (identical(next, $PERIOD)) {
    // Only treat the period as a decimal point when a digit follows it.
    final int afterPeriod = peek();
    if ($0 <= afterPeriod && afterPeriod <= $9) {
      return tokenizeFractionPart(advance(), start);
    }
  }
  appendSubstringToken(INT_INFO, start, true);
  return next;
}
610 | |
/**
 * Scans a number that starts with [:0:]: a hexadecimal literal when the
 * following character is [:x:] or [:X:], a decimal number otherwise.
 */
int tokenizeHexOrNumber(int next) {
  final int second = peek();
  final bool hasHexPrefix = identical(second, $x) || identical(second, $X);
  return hasHexPrefix ? tokenizeHex(next) : tokenizeNumber(next);
}
618 | |
/**
 * Scans a hexadecimal literal.
 *
 * Consumes the [:x:]/[:X:] and all following hexadecimal digits. When at
 * least one digit was seen a hexadecimal token is appended; otherwise the
 * literal is reported through [unterminated]. In both cases the first
 * non-digit character is returned.
 */
int tokenizeHex(int next) {
  final int start = scanOffset;
  next = advance(); // Advance past the $x or $X.
  bool sawDigit = false;
  next = advance();
  while (($0 <= next && next <= $9) ||
         ($A <= next && next <= $F) ||
         ($a <= next && next <= $f)) {
    sawDigit = true;
    next = advance();
  }
  if (sawDigit) {
    appendSubstringToken(HEXADECIMAL_INFO, start, true);
  } else {
    unterminated('0x', shouldAdvance: false);
  }
  return next;
}
640 | |
/**
 * Scans a token that starts with a period: a number with a leading decimal
 * point when a digit follows, otherwise one of [:.:], [:..:] or [:...:].
 */
int tokenizeDotsOrNumber(int next) {
  final int start = scanOffset;
  next = advance();
  if ($0 <= next && next <= $9) {
    return tokenizeFractionPart(next, start);
  }
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}
653 | |
/**
 * Scans the fraction digits and optional exponent of a number whose token
 * starts at scan offset [start]; [next] is the first character to examine.
 *
 * An exponent ([:e:]/[:E:], optional sign) must contain at least one digit;
 * if it does not, the error is reported through [unterminated] and the
 * offending character is returned without appending a number token.
 */
int tokenizeFractionPart(int next, int start) {
  bool hasDigit = false;

  // Fraction digits.
  while ($0 <= next && next <= $9) {
    hasDigit = true;
    next = advance();
  }

  // Optional exponent.
  if (identical($e, next) || identical($E, next)) {
    hasDigit = true;
    next = advance();
    if (identical(next, $PLUS) || identical(next, $MINUS)) {
      next = advance();
    }
    if (!($0 <= next && next <= $9)) {
      unterminated('1e', shouldAdvance: false);
      return next;
    }
    do {
      next = advance();
    } while ($0 <= next && next <= $9);
  }

  if (!hasDigit) {
    // Reduce offset, we already advanced to the token past the period.
    appendSubstringToken(INT_INFO, start, true, -1);

    // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
    // the scanner already advanced past the period.
    if (identical($PERIOD, next)) {
      return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
    }
    appendPrecedenceToken(PERIOD_INFO);
    return next;
  }

  appendSubstringToken(DOUBLE_INFO, start, true);
  return next;
}
703 | |
/**
 * Scans a token that starts with a slash: a multi-line comment, a single-line
 * comment, [:/=:] or a plain [:/:].
 */
int tokenizeSlashOrComment(int next) {
  final int start = scanOffset;
  next = advance();
  if (identical($STAR, next)) {
    return tokenizeMultiLineComment(next, start);
  }
  if (identical($SLASH, next)) {
    return tokenizeSingleLineComment(next, start);
  }
  if (identical($EQ, next)) {
    appendPrecedenceToken(SLASH_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(SLASH_INFO);
  return next;
}
719 | |
/**
 * Consumes a single-line comment up to (not including) the next line feed,
 * carriage return or [$EOF], passes it to [appendComment] and returns the
 * terminating character.
 *
 * [start] is the scan offset at which the comment begins. If any character
 * above 127 was seen, [handleUnicode] is called first.
 */
int tokenizeSingleLineComment(int next, int start) {
  bool asciiOnly = true;
  do {
    next = advance();
    if (next > 127) asciiOnly = false;
  } while (!(identical($LF, next) ||
             identical($CR, next) ||
             identical($EOF, next)));
  if (!asciiOnly) handleUnicode(start);
  appendComment(start, asciiOnly);
  return next;
}
735 | |
736 | |
/**
 * Consumes a (possibly nested) multi-line comment that begins at scan offset
 * [start] and returns the character after it.
 *
 * A nesting depth is kept so that inner [:/*:] ... [:*/:] pairs do not end
 * the comment. Reaching [$EOF] first is reported through [unterminated].
 * [handleUnicode] is called per line for lines containing characters above
 * 127.
 */
int tokenizeMultiLineComment(int next, int start) {
  bool commentIsAscii = true; // No character above 127 in the whole comment.
  bool pendingIsAscii = true; // None since the last handleUnicode call.
  int pendingStart = start;
  int depth = 1;
  next = advance();
  while (!identical($EOF, next)) {
    if (identical($STAR, next)) {
      next = advance();
      // A '*' not followed by '/' is ordinary comment text.
      if (!identical($SLASH, next)) continue;
      --depth;
      if (0 == depth) {
        if (!pendingIsAscii) handleUnicode(pendingStart);
        next = advance();
        appendComment(start, commentIsAscii);
        return next;
      }
      next = advance();
    } else if (identical($SLASH, next)) {
      next = advance();
      if (identical($STAR, next)) {
        // Nested comment opener.
        next = advance();
        ++depth;
      }
    } else if (identical(next, $LF)) {
      if (!pendingIsAscii) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(pendingStart);
        pendingIsAscii = true;
        pendingStart = scanOffset;
      }
      lineFeedInMultiline();
      next = advance();
    } else {
      if (next > 127) {
        pendingIsAscii = false;
        commentIsAscii = false;
      }
      next = advance();
    }
  }
  // End of input reached inside the comment.
  if (!pendingIsAscii) handleUnicode(pendingStart);
  unterminated('/*');
  return next;
}
786 | |
/**
 * Handles a token that starts with [:r:]: a raw string when a quote follows
 * immediately, otherwise a keyword or identifier.
 */
int tokenizeRawStringKeywordOrIdentifier(int next) {
  // [next] is $r.
  final int following = peek();
  final bool quoteFollows =
      identical(following, $DQ) || identical(following, $SQ);
  if (!quoteFollows) {
    return tokenizeKeywordOrIdentifier(next, true);
  }
  final int start = scanOffset;
  next = advance();
  return tokenizeString(next, start, true);
}
797 | |
/**
 * Scans a word that starts with a lower-case letter and appends either a
 * keyword token or an identifier token.
 *
 * Lower-case letters are consumed while they still match a keyword prefix in
 * the [KeywordState] table. If no keyword was matched, or the matched keyword
 * is followed by a character that continues an identifier, the word is handed
 * to [tokenizeIdentifier].
 *
 * [allowDollar] says whether [:$:] may be part of the identifier. When it is
 * false (string interpolation), a [:$:] right after a complete keyword does
 * not continue the word, so the keyword token is appended. Previously this
 * case was routed to [tokenizeIdentifier], which emitted the keyword text as
 * an identifier token.
 */
int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
  KeywordState state = KeywordState.KEYWORD_STATE;
  int start = scanOffset;
  while (state != null && $a <= next && next <= $z) {
    state = state.next(next);
    next = advance();
  }
  if (state == null || state.keyword == null) {
    return tokenizeIdentifier(next, start, allowDollar);
  }
  // A keyword was matched; it is only a keyword if the word ends here.
  if (($A <= next && next <= $Z) ||
      ($0 <= next && next <= $9) ||
      identical(next, $_) ||
      (allowDollar && identical(next, $$))) {
    return tokenizeIdentifier(next, start, allowDollar);
  }
  appendKeywordToken(state.keyword);
  return next;
}
818 | |
/**
 * Consumes identifier characters starting with [next] and appends an
 * identifier token covering the text from scan offset [start].
 *
 * [allowDollar] can exclude '$', which is not allowed as part of a string
 * interpolation identifier. If no character was consumed at all (the scan
 * offset still equals [start]), [next] is reported through [unexpected].
 */
int tokenizeIdentifier(int next, int start, bool allowDollar) {
  while (($a <= next && next <= $z) ||
         ($A <= next && next <= $Z) ||
         ($0 <= next && next <= $9) ||
         identical(next, $_) ||
         (identical(next, $$) && allowDollar)) {
    next = advance();
  }
  // Identifier ends here.
  if (start == scanOffset) {
    return unexpected(next);
  }
  appendSubstringToken(IDENTIFIER_INFO, start, true);
  return next;
}
843 | |
/** Appends an [:@:] token and returns the character after it. */
int tokenizeAt(int next) {
  appendPrecedenceToken(AT_INFO);
  final int following = advance();
  return following;
}
848 | |
/**
 * Scans a string literal whose opening quote is [next].
 *
 * [start] is the scan offset where the string token begins and [raw] says
 * whether it is a raw string. Three consecutive quotes select the multi-line
 * scanners, two quotes produce an empty string token, and anything else is
 * handed to the single-line scanners.
 */
int tokenizeString(int next, int start, bool raw) {
  final int quoteChar = next;
  next = advance();
  if (!identical(quoteChar, next)) {
    return raw
        ? tokenizeSingleLineRawString(next, quoteChar, start)
        : tokenizeSingleLineString(next, quoteChar, start);
  }
  next = advance();
  if (identical(quoteChar, next)) {
    // Multiline string.
    return tokenizeMultiLineString(quoteChar, start, raw);
  }
  // Empty string.
  appendSubstringToken(STRING_INFO, start, true);
  return next;
}
869 | |
/**
 * Scans the rest of a non-raw single-line string.
 *
 * [next] is the first character after the quote, [quoteChar] is the quote
 * character and [start] is the scan offset of the quote.
 *
 * The appended token is a substring of the source file, including the string
 * quotes and the backslashes used for escaping. At each [:$:] the text so far
 * is appended as its own token by [tokenizeStringInterpolation], and a new
 * string token is started after the interpolation. A line feed, carriage
 * return or [$EOF] before the closing quote is reported through
 * [unterminatedString].
 */
int tokenizeSingleLineString(int next, int quoteChar, int start) {
  bool segmentIsAscii = true;
  while (!identical(next, quoteChar)) {
    if (identical(next, $BACKSLASH)) {
      // Skip the backslash; the escaped character is examined below.
      next = advance();
    } else if (identical(next, $$)) {
      if (!segmentIsAscii) handleUnicode(start);
      next = tokenizeStringInterpolation(start, segmentIsAscii);
      // A fresh string token begins after the interpolation.
      start = scanOffset;
      segmentIsAscii = true;
      continue;
    }
    if (next <= $CR &&
        (identical(next, $LF) ||
         identical(next, $CR) ||
         identical(next, $EOF))) {
      if (!segmentIsAscii) handleUnicode(start);
      return unterminatedString(quoteChar);
    }
    if (next > 127) segmentIsAscii = false;
    next = advance();
  }
  if (!segmentIsAscii) handleUnicode(start);
  // Advance past the quote character.
  next = advance();
  appendSubstringToken(STRING_INFO, start, segmentIsAscii);
  return next;
}
910 | |
/**
 * Handles a [:$:] inside a string.
 *
 * First appends the string text scanned so far (from scan offset [start]) as
 * a string token, then scans either a [:${...}:] expression or a [:$name:]
 * identifier, depending on the character after the [:$:].
 */
int tokenizeStringInterpolation(int start, bool asciiOnly) {
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  beginToken(); // $ starts here.
  final int afterDollar = advance();
  return identical(afterDollar, $OPEN_CURLY_BRACKET)
      ? tokenizeInterpolatedExpression(afterDollar)
      : tokenizeInterpolatedIdentifier(afterDollar);
}
921 | |
/**
 * Scans a [:${...}:] interpolation.
 *
 * Tokens are scanned with [bigSwitch] until it returns [$STX] or [$EOF]. On
 * [$EOF] that value is returned directly; on [$STX] the scanner moves past it
 * and begins the token for the rest of the string.
 */
int tokenizeInterpolatedExpression(int next) {
  appendBeginGroup(STRING_INTERPOLATION_INFO);
  beginToken(); // The expression starts here.
  next = advance(); // Move past the curly bracket.
  while (!(identical(next, $EOF) || identical(next, $STX))) {
    next = bigSwitch(next);
  }
  if (identical(next, $EOF)) {
    return next;
  }
  next = advance(); // Move past the $STX.
  beginToken(); // The string interpolation suffix starts here.
  return next;
}
934 | |
/**
 * Scans a [:$name:] interpolation.
 *
 * Appends the interpolation marker token, then scans the identifier (or
 * keyword) with '$' excluded from identifier characters. If [next] cannot
 * start an identifier, the interpolation is reported through [unterminated]
 * without advancing.
 */
int tokenizeInterpolatedIdentifier(int next) {
  appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

  final bool startsLowerCase = $a <= next && next <= $z;
  if (startsLowerCase) {
    beginToken(); // The identifier starts here.
    next = tokenizeKeywordOrIdentifier(next, false);
  } else if (($A <= next && next <= $Z) || identical(next, $_)) {
    beginToken(); // The identifier starts here.
    next = tokenizeIdentifier(next, scanOffset, false);
  } else {
    unterminated(r'$', shouldAdvance: false);
  }
  beginToken(); // The string interpolation suffix starts here.
  return next;
}
950 | |
/**
 * Scans the rest of a raw single-line string.
 *
 * [next] is the first character after the opening quote [quoteChar], and
 * [start] is the scan offset where the string token begins. A line feed,
 * carriage return or [$EOF] before the closing quote is reported through
 * [unterminatedRawString].
 */
int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  while (next != $EOF && !identical(next, $LF) && !identical(next, $CR)) {
    if (identical(next, quoteChar)) {
      // Closing quote: the string is complete.
      if (!asciiOnly) handleUnicode(start);
      next = advance();
      appendSubstringToken(STRING_INFO, start, asciiOnly);
      return next;
    }
    if (next > 127) asciiOnly = false;
    next = advance();
  }
  // The line or the input ended before the closing quote.
  if (!asciiOnly) handleUnicode(start);
  return unterminatedRawString(quoteChar);
}
970 | |
/**
 * Scans the rest of a raw multi-line string and returns the character after
 * its closing triple quote.
 *
 * [quoteChar] is the quote character and [start] the scan offset where the
 * string token begins. Reaching [$EOF] before three consecutive quotes is
 * reported through [unterminatedRawMultiLineString]. [handleUnicode] is
 * called per line for lines containing characters above 127.
 */
int tokenizeMultiLineRawString(int quoteChar, int start) {
  bool tokenIsAscii = true; // No character above 127 in the whole string.
  bool pendingIsAscii = true; // None since the last handleUnicode call.
  int pendingStart = start;
  int next = advance(); // Advance past the (last) quote (of three).
  scan: while (!identical(next, $EOF)) {
    // Skip ahead to the next quote character.
    while (!identical(next, quoteChar)) {
      if (identical(next, $LF)) {
        if (!pendingIsAscii) {
          // Synchronize the string offset in the utf8 scanner.
          handleUnicode(pendingStart);
          pendingIsAscii = true;
          pendingStart = scanOffset;
        }
        lineFeedInMultiline();
      } else if (next > 127) {
        pendingIsAscii = false;
        tokenIsAscii = false;
      }
      next = advance();
      if (identical(next, $EOF)) break scan;
    }
    // One quote seen; two more in a row close the string.
    next = advance();
    if (!identical(next, quoteChar)) continue;
    next = advance();
    if (!identical(next, quoteChar)) continue;
    if (!pendingIsAscii) handleUnicode(pendingStart);
    next = advance();
    appendSubstringToken(STRING_INFO, start, tokenIsAscii);
    return next;
  }
  if (!pendingIsAscii) handleUnicode(pendingStart);
  return unterminatedRawMultiLineString(quoteChar);
}
1007 | |
/**
 * Scans the rest of a multi-line string and returns the character after its
 * closing triple quote.
 *
 * Raw strings are delegated to [tokenizeMultiLineRawString]. For non-raw
 * strings, each [:$:] ends the current string token and starts a new one
 * after the interpolation, and a backslash causes the following character to
 * be skipped over. Reaching [$EOF] before the closing triple quote is
 * reported through [unterminatedMultiLineString].
 */
int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
  if (raw) return tokenizeMultiLineRawString(quoteChar, start);
  bool tokenIsAscii = true; // No character above 127 in the current token.
  bool pendingIsAscii = true; // None since the last handleUnicode call.
  int pendingStart = start;
  int next = advance(); // Advance past the (last) quote (of three).
  while (!identical(next, $EOF)) {
    if (identical(next, $$)) {
      if (!pendingIsAscii) handleUnicode(pendingStart);
      next = tokenizeStringInterpolation(start, tokenIsAscii);
      start = scanOffset;
      pendingStart = start;
      tokenIsAscii = true; // A new string token is created for the rest.
      pendingIsAscii = true;
      continue;
    }
    if (identical(next, quoteChar)) {
      // One quote seen; two more in a row close the string.
      next = advance();
      if (!identical(next, quoteChar)) continue;
      next = advance();
      if (!identical(next, quoteChar)) continue;
      if (!pendingIsAscii) handleUnicode(pendingStart);
      next = advance();
      appendSubstringToken(STRING_INFO, start, tokenIsAscii);
      return next;
    }
    if (identical(next, $BACKSLASH)) {
      next = advance();
      if (identical(next, $EOF)) break;
    }
    if (identical(next, $LF)) {
      if (!pendingIsAscii) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(pendingStart);
        pendingIsAscii = true;
        pendingStart = scanOffset;
      }
      lineFeedInMultiline();
    } else if (next > 127) {
      tokenIsAscii = false;
      pendingIsAscii = false;
    }
    next = advance();
  }
  if (!pendingIsAscii) handleUnicode(pendingStart);
  return unterminatedMultiLineString(quoteChar);
}
1058 | |
/**
 * Appends a [BadInputToken] for [character] at [tokenStart] and returns the
 * result of [advanceAfterError], which is asked to advance.
 */
int unexpected(int character) {
  final badInput = new BadInputToken(character, tokenStart);
  appendErrorToken(badInput);
  return advanceAfterError(true);
}
1063 | |
/**
 * Appends an [UnterminatedToken] built from [prefix], [tokenStart] and
 * [stringOffset], then returns the result of [advanceAfterError].
 *
 * [shouldAdvance] is forwarded unchanged to [advanceAfterError].
 */
int unterminated(String prefix, {bool shouldAdvance: true}) {
  final errorToken = new UnterminatedToken(prefix, tokenStart, stringOffset);
  appendErrorToken(errorToken);
  return advanceAfterError(shouldAdvance);
}
1068 | |
/**
 * Reports an unterminated string whose opening delimiter is the single
 * character [quoteChar].
 */
int unterminatedString(int quoteChar) {
  final String opener = new String.fromCharCodes([quoteChar]);
  return unterminated(opener);
}
1072 | |
/**
 * Reports an unterminated raw string: the reported prefix is `r` followed by
 * the single character [quoteChar].
 */
int unterminatedRawString(int quoteChar) {
  final String quote = new String.fromCharCodes([quoteChar]);
  return unterminated('r$quote');
}
1076 | |
/**
 * Reports an unterminated multi-line string: the reported prefix is
 * [quoteChar] repeated three times.
 */
int unterminatedMultiLineString(int quoteChar) {
  final List<int> delimiter = <int>[quoteChar, quoteChar, quoteChar];
  return unterminated(new String.fromCharCodes(delimiter));
}
1081 | |
/**
 * Reports an unterminated raw multi-line string: the reported prefix is `r`
 * followed by [quoteChar] repeated three times.
 */
int unterminatedRawMultiLineString(int quoteChar) {
  final List<int> delimiter = <int>[quoteChar, quoteChar, quoteChar];
  final String tripleQuote = new String.fromCharCodes(delimiter);
  return unterminated('r$tripleQuote');
}
1086 | |
/**
 * Decides what to hand back to the scanning loop after an error token has
 * been appended.
 *
 * Returns [$EOF] when [atEndOfFile] is true. Otherwise returns the result of
 * [advance] when [shouldAdvance] is true (this ensures progress), or `-1`
 * when it is false.
 */
int advanceAfterError(bool shouldAdvance) {
  if (atEndOfFile()) return $EOF;
  // Advancing ensures progress; -1 is returned when the caller opted out.
  return shouldAdvance ? advance() : -1;
}
1095 | |
/**
 * Records that the group opened by [begin] was never closed.
 *
 * A synthetic closing [SymbolToken] becomes [begin]'s `endGroup`, its `next`
 * is set to a fresh [UnmatchedToken], and that [UnmatchedToken] is appended
 * as an error token.
 */
void unmatchedBeginGroup(BeginGroupToken begin) {
  // Unmatched BeginGroupTokens must be reported as errors. The diet parser,
  // however, assumes that groups are well-balanced and never looks at the
  // endGroup token, a nice property that lets it skip quickly over correct
  // code. Inserting an additional synthetic token in the stream lets us keep
  // ignoring endGroup tokens.
  //
  //   [begin] --next--> [tail]
  //   [begin] --endG--> [synthetic] --next--> [unmatched] --next--> [tail]
  //
  // The diet parser can then skip from [begin] via endGroup to [synthetic]
  // and ignore the [synthetic] token (assuming it's correct); the error is
  // reported when parsing the [unmatched] token.
  //
  // For example, tokenize("{[1};") produces:
  //
  //   SymbolToken({) --endGroup-----+
  //        |                        |
  //       next                      |
  //        v                        |
  //   SymbolToken([) --endGroup--+  |
  //        |                     |  |
  //       next                   |  |
  //        v                     |  |
  //   StringToken(1)             |  |
  //        |                     v  |
  //       next       SymbolToken(]) | <- Synthetic token.
  //        |                     |  |
  //        |                   next |
  //        v                     |  |
  //   UnmatchedToken([)<---------+  |
  //        |                        |
  //       next                      |
  //        v                        |
  //   SymbolToken(})<---------------+
  //        |
  //       next
  //        v
  //   SymbolToken(;)
  //        |
  //       next
  //        v
  //       EOF
  Token syntheticClose =
      new SymbolToken(closeBraceInfoFor(begin), begin.charOffset);
  UnmatchedToken unmatched = new UnmatchedToken(begin);
  begin.endGroup = syntheticClose;
  syntheticClose.next = unmatched;
  appendErrorToken(unmatched);
}
1146 } | |
1147 | |
/**
 * Looks up the closing-token [PrecedenceInfo] that pairs with the opening
 * token [begin], keyed by `begin.value`.
 *
 * The result is whatever the lookup table yields for that key; a key that is
 * not in the table yields `null`.
 */
PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
  // Opening delimiter text -> info of the matching closing delimiter.
  final closerByOpener = const {
    '(': CLOSE_PAREN_INFO,
    '[': CLOSE_SQUARE_BRACKET_INFO,
    '{': CLOSE_CURLY_BRACKET_INFO,
    '<': GT_INFO,
    r'${': CLOSE_CURLY_BRACKET_INFO,
  };
  return closerByOpener[begin.value];
}
OLD | NEW |