Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(49)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 part of scanner;
6
/// Public entry point of the scanner library: turns a source file into a
/// stream of tokens.
abstract class Scanner {
  /// Scans the input and returns the first token of the token stream.
  Token tokenize();

  /// Picks the scanner implementation that matches the representation of
  /// [file]: a [Utf8BytesScanner] for a [Utf8BytesSourceFile], a
  /// [StringScanner] for everything else.
  factory Scanner(SourceFile file, {bool includeComments: false}) {
    if (file is Utf8BytesSourceFile) {
      return new Utf8BytesScanner(file, includeComments: includeComments);
    }
    return new StringScanner(file, includeComments: includeComments);
  }
}
18
19 abstract class AbstractScanner implements Scanner {
20 // TODO(ahe): Move this class to implementation.
21
22 final bool includeComments;
23
24 /**
25 * The string offset for the next token that will be created.
26 *
27 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values
28 * are different. One string character can be encoded using multiple UTF-8
29 * bytes.
30 */
31 int tokenStart = -1;
32
33 /**
34 * A pointer to the token stream created by this scanner. The first token
35 * is a special token and not part of the source file. This is an
36 * implementation detail to avoid special cases in the scanner. This token
37 * is not exposed to clients of the scanner, which are expected to invoke
38 * [firstToken] to access the token stream.
39 */
40 final Token tokens = new SymbolToken(EOF_INFO, -1);
41
42 /**
43 * A pointer to the last scanned token.
44 */
45 Token tail;
46
47 /**
48 * The source file that is being scanned. This field can be [:null:].
49 * If the source file is available, the scanner assigns its [:lineStarts:] and
50 * [:length:] fields at the end of [tokenize].
51 */
52 final SourceFile file;
53
54 final List<int> lineStarts = <int>[0];
55
/// Stores [file] and [includeComments]. Nothing has been scanned yet, so
/// [tail] starts out pointing at the sentinel token [tokens].
AbstractScanner(this.file, this.includeComments) {
  tail = tokens;
}
59
60 /**
61 * Advances and returns the next character.
62 *
63 * If the next character is non-ASCII, then the returned value depends on the
64 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
65 * the [StringScanner] returns a UTF-16 code unit.
66 *
67 * The scanner ensures that [advance] is not invoked after it returned [$EOF].
68 * This allows implementations to omit bound checks if the data structure ends
69 * with '0'.
70 */
71 int advance();
72
73 /**
74 * Returns the current unicode character.
75 *
76 * If the current character is ASCII, then it is returned unchanged.
77 *
78 * The [Utf8BytesScanner] decodes the next unicode code point starting at the
79 * current position. Note that every unicode character is returned as a single
80 * code point, that is, for '\u{1d11e}' it returns 119070, and the following
81 * [advance] returns the next character.
82 *
83 * The [StringScanner] returns the current character unchanged, which might
84 * be a surrogate character. In the case of '\u{1d11e}', it returns the first
85 * code unit 55348, and the following [advance] returns the second code unit
86 * 56606.
87 *
88 * Invoking [currentAsUnicode] multiple times is safe, i.e.,
89 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
90 */
91 int currentAsUnicode(int next);
92
93 /**
94 * Returns the character at the next position. Like in [advance], the
95 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
96 * a UTF-16 code unit.
97 */
98 int peek();
99
100 /**
101 * Notifies the scanner that unicode characters were detected in either a
102 * comment or a string literal between [startScanOffset] and the current
103 * scan offset.
104 */
105 void handleUnicode(int startScanOffset);
106
107 /**
108 * Returns the current scan offset.
109 *
110 * In the [Utf8BytesScanner] this is the offset into the byte list, in the
111 * [StringScanner] the offset in the source string.
112 */
113 int get scanOffset;
114
115 /**
116 * Returns the current string offset.
117 *
118 * In the [StringScanner] this is identical to the [scanOffset]. In the
119 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
120 */
121 int get stringOffset;
122
123 /**
124 * Returns the first token scanned by this [Scanner].
125 */
126 Token firstToken();
127
128 /**
129 * Returns the last token scanned by this [Scanner].
130 */
131 Token previousToken();
132
133 /**
134 * Notifies that a new token starts at current offset.
135 */
void beginToken() {
  // Record the current string offset as the start of the next token.
  this.tokenStart = this.stringOffset;
}
139
140 /**
141 * Appends a substring from the scan offset [:start:] to the current
142 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
143 * scanOffset is 10, then [:appendSubstringToken(info, 5, true, -1):] will
144 * append the substring [5,9).
145 *
146 * Note that [extraOffset] can only be used if the covered character(s) are
147 * known to be ASCII.
148 */
149 void appendSubstringToken(PrecedenceInfo info, int start,
150 bool asciiOnly, [int extraOffset]);
151
152 /** Documentation in subclass [ArrayBasedScanner]. */
153 void appendPrecedenceToken(PrecedenceInfo info);
154
155 /** Documentation in subclass [ArrayBasedScanner]. */
156 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
157
158 /** Documentation in subclass [ArrayBasedScanner]. */
159 void appendKeywordToken(Keyword keyword);
160
161 /** Documentation in subclass [ArrayBasedScanner]. */
162 void appendEofToken();
163
164 /** Documentation in subclass [ArrayBasedScanner]. */
165 void appendWhiteSpace(int next);
166
167 /** Documentation in subclass [ArrayBasedScanner]. */
168 void lineFeedInMultiline();
169
170 /** Documentation in subclass [ArrayBasedScanner]. */
171 void appendBeginGroup(PrecedenceInfo info);
172
173 /** Documentation in subclass [ArrayBasedScanner]. */
174 int appendEndGroup(PrecedenceInfo info, int openKind);
175
176 /** Documentation in subclass [ArrayBasedScanner]. */
177 void appendGt(PrecedenceInfo info);
178
179 /** Documentation in subclass [ArrayBasedScanner]. */
180 void appendGtGt(PrecedenceInfo info);
181
182 /** Documentation in subclass [ArrayBasedScanner]. */
183 void appendComment(start, bool asciiOnly);
184
185 /// Append [token] to the token stream.
186 void appendErrorToken(ErrorToken token);
187
188 /** Documentation in subclass [ArrayBasedScanner]. */
189 void discardOpenLt();
190
191 /// Return true when at EOF.
192 bool atEndOfFile();
193
/// Scans the complete input and returns the result of [firstToken].
///
/// If [file] is non-null, its [:length:] and [:lineStarts:] are assigned once
/// scanning is done.
Token tokenize() {
  while (!atEndOfFile()) {
    // Scan tokens until a token scanner hands back $EOF.
    for (int next = advance(); !identical(next, $EOF);) {
      next = bigSwitch(next);
    }
    // An $EOF character that shows up before the real end of the input is
    // reported as unexpected; scanning then continues in the outer loop.
    if (!atEndOfFile()) {
      unexpected($EOF);
    } else {
      appendEofToken();
    }
  }

  if (file == null) return firstToken();

  file.length = stringOffset;
  // One additional line start at the end, see [SourceFile.lineStarts].
  lineStarts.add(stringOffset + 1);
  file.lineStarts = lineStarts;
  return firstToken();
}
216
/// Dispatches on the character [next] to the scanner routine responsible for
/// the token that starts with it, and returns the character at which scanning
/// continues afterwards.
int bigSwitch(int next) {
  beginToken();

  if (identical(next, $SPACE) || identical(next, $TAB) ||
      identical(next, $LF) || identical(next, $CR)) {
    appendWhiteSpace(next);
    // Sequences of spaces are common, so run through them in a tight loop.
    // [:appendWhiteSpace:] is deliberately not invoked for them, on the
    // assumption that it does not do anything for space characters.
    do {
      next = advance();
    } while (identical(next, $SPACE));
    return next;
  }

  // Lower-case letters: 'r' may introduce a raw string, anything else is a
  // keyword or an identifier.
  if ($a <= next && next <= $z) {
    return identical($r, next)
        ? tokenizeRawStringKeywordOrIdentifier(next)
        : tokenizeKeywordOrIdentifier(next, true);
  }

  // Upper-case letters, '_' and '$' go straight to the identifier scanner.
  if (($A <= next && next <= $Z) ||
      identical(next, $_) ||
      identical(next, $$)) {
    return tokenizeIdentifier(next, scanOffset, true);
  }

  if (identical(next, $LT)) return tokenizeLessThan(next);
  if (identical(next, $GT)) return tokenizeGreaterThan(next);
  if (identical(next, $EQ)) return tokenizeEquals(next);
  if (identical(next, $BANG)) return tokenizeExclamation(next);
  if (identical(next, $PLUS)) return tokenizePlus(next);
  if (identical(next, $MINUS)) return tokenizeMinus(next);
  if (identical(next, $STAR)) return tokenizeMultiply(next);
  if (identical(next, $PERCENT)) return tokenizePercent(next);
  if (identical(next, $AMPERSAND)) return tokenizeAmpersand(next);
  if (identical(next, $BAR)) return tokenizeBar(next);
  if (identical(next, $CARET)) return tokenizeCaret(next);
  if (identical(next, $OPEN_SQUARE_BRACKET)) {
    return tokenizeOpenSquareBracket(next);
  }
  if (identical(next, $TILDE)) return tokenizeTilde(next);

  if (identical(next, $BACKSLASH)) {
    appendPrecedenceToken(BACKSLASH_INFO);
    return advance();
  }

  if (identical(next, $HASH)) return tokenizeTag(next);

  if (identical(next, $OPEN_PAREN)) {
    appendBeginGroup(OPEN_PAREN_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_PAREN)) {
    return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
  }

  if (identical(next, $COMMA)) {
    appendPrecedenceToken(COMMA_INFO);
    return advance();
  }

  if (identical(next, $COLON)) {
    appendPrecedenceToken(COLON_INFO);
    return advance();
  }

  if (identical(next, $SEMICOLON)) {
    appendPrecedenceToken(SEMICOLON_INFO);
    // Type parameters and arguments cannot contain semicolon.
    discardOpenLt();
    return advance();
  }

  if (identical(next, $QUESTION)) {
    appendPrecedenceToken(QUESTION_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    return appendEndGroup(
        CLOSE_SQUARE_BRACKET_INFO, OPEN_SQUARE_BRACKET_TOKEN);
  }

  if (identical(next, $BACKPING)) {
    appendPrecedenceToken(BACKPING_INFO);
    return advance();
  }

  if (identical(next, $OPEN_CURLY_BRACKET)) {
    appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_CURLY_BRACKET)) {
    return appendEndGroup(
        CLOSE_CURLY_BRACKET_INFO, OPEN_CURLY_BRACKET_TOKEN);
  }

  if (identical(next, $SLASH)) return tokenizeSlashOrComment(next);
  if (identical(next, $AT)) return tokenizeAt(next);

  if (identical(next, $DQ) || identical(next, $SQ)) {
    return tokenizeString(next, scanOffset, false);
  }

  if (identical(next, $PERIOD)) return tokenizeDotsOrNumber(next);
  if (identical(next, $0)) return tokenizeHexOrNumber(next);

  // TODO(ahe): Would a range check be faster?
  if (identical(next, $1) || identical(next, $2) || identical(next, $3) ||
      identical(next, $4) || identical(next, $5) || identical(next, $6) ||
      identical(next, $7) || identical(next, $8) || identical(next, $9)) {
    return tokenizeNumber(next);
  }

  if (identical(next, $EOF)) return $EOF;
  if (next < 0x1f) return unexpected(next);

  next = currentAsUnicode(next);

  // The following are non-ASCII characters.
  if (identical(next, $NBSP)) {
    appendWhiteSpace(next);
    return advance();
  }

  return unexpected(next);
}
402
/// Scans '#', or skips a '#!...' line when it starts at scan offset 0.
int tokenizeTag(int next) {
  // # or #!.*[\n\r]
  bool isScriptTag = scanOffset == 0 && identical(peek(), $BANG);
  if (!isScriptTag) {
    appendPrecedenceToken(HASH_INFO);
    return advance();
  }

  // Skip to the end of the line (or of the input) without appending a token.
  int start = scanOffset + 1;
  bool sawNonAscii = false;
  do {
    next = advance();
    if (next > 127) sawNonAscii = true;
  } while (!(identical(next, $LF) ||
             identical(next, $CR) ||
             identical(next, $EOF)));
  if (sawNonAscii) handleUnicode(start);
  return next;
}
422
/// Scans a token that starts with '~'.
int tokenizeTilde(int next) {
  // ~ ~/ ~/=
  next = advance();
  if (!identical(next, $SLASH)) {
    appendPrecedenceToken(TILDE_INFO);
    return next;
  }
  return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
}
433
/// Scans a token that starts with '['.
///
/// '[]' and '[]=' are only scanned as index tokens when the previous token is
/// the keyword 'operator' or a '#' symbol; otherwise '[' opens a group.
int tokenizeOpenSquareBracket(int next) {
  // [ [] []=
  next = advance();
  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    Token previous = previousToken();
    bool isIndexOperator =
        (previous is KeywordToken && previous.keyword.syntax == 'operator') ||
        (previous is SymbolToken && previous.info == HASH_INFO);
    if (isIndexOperator) {
      return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
    }
  }
  appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
  return next;
}
447
/// Scans a token that starts with '^': ^ ^=
int tokenizeCaret(int next) => select($EQ, CARET_EQ_INFO, CARET_INFO);
452
/// Scans a token that starts with '|'.
int tokenizeBar(int next) {
  // | || |=
  int following = advance();
  if (identical(following, $BAR)) {
    appendPrecedenceToken(BAR_BAR_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(BAR_EQ_INFO);
    return advance();
  }
  // Single '|': [following] has not been consumed by this token.
  appendPrecedenceToken(BAR_INFO);
  return following;
}
467
/// Scans a token that starts with '&'.
int tokenizeAmpersand(int next) {
  // && &= &
  int following = advance();
  if (identical(following, $AMPERSAND)) {
    appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(AMPERSAND_EQ_INFO);
    return advance();
  }
  // Single '&': [following] has not been consumed by this token.
  appendPrecedenceToken(AMPERSAND_INFO);
  return following;
}
482
/// Scans a token that starts with '%': % %=
int tokenizePercent(int next) => select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);
487
/// Scans a token that starts with '*': * *=
int tokenizeMultiply(int next) => select($EQ, STAR_EQ_INFO, STAR_INFO);
492
/// Scans a token that starts with '-'.
int tokenizeMinus(int next) {
  // - -- -=
  int following = advance();
  if (identical(following, $MINUS)) {
    appendPrecedenceToken(MINUS_MINUS_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(MINUS_EQ_INFO);
    return advance();
  }
  // Single '-': [following] has not been consumed by this token.
  appendPrecedenceToken(MINUS_INFO);
  return following;
}
507
/// Scans a token that starts with '+'.
int tokenizePlus(int next) {
  // + ++ +=
  int following = advance();
  if (identical($PLUS, following)) {
    appendPrecedenceToken(PLUS_PLUS_INFO);
    return advance();
  }
  if (identical($EQ, following)) {
    appendPrecedenceToken(PLUS_EQ_INFO);
    return advance();
  }
  // Single '+': [following] has not been consumed by this token.
  appendPrecedenceToken(PLUS_INFO);
  return following;
}
522
/// Scans a token that starts with '!'.
int tokenizeExclamation(int next) {
  // ! !=
  // !== is kept for user-friendly error reporting.
  next = advance();
  if (!identical(next, $EQ)) {
    appendPrecedenceToken(BANG_INFO);
    return next;
  }
  return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
}
534
/// Scans a token that starts with '='.
int tokenizeEquals(int next) {
  // = == =>
  // === is kept for user-friendly error reporting.

  // Type parameters and arguments cannot contain any token that
  // starts with '='.
  discardOpenLt();

  int following = advance();
  if (identical(following, $GT)) {
    appendPrecedenceToken(FUNCTION_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
  }
  // Plain '=': [following] has not been consumed by this token.
  appendPrecedenceToken(EQ_INFO);
  return following;
}
553
/// Scans a token that starts with '>'.
int tokenizeGreaterThan(int next) {
  // > >= >> >>=
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_EQ_INFO);
    return advance();
  }
  if (!identical($GT, next)) {
    appendGt(GT_INFO);
    return next;
  }

  // Seen '>>' so far.
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_GT_EQ_INFO);
    return advance();
  }
  appendGtGt(GT_GT_INFO);
  return next;
}
574
/// Scans a token that starts with '<'. A single '<' is appended as the start
/// of a group.
int tokenizeLessThan(int next) {
  // < <= << <<=
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(LT_EQ_INFO);
    return advance();
  }
  if (identical($LT, next)) {
    return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
  }
  appendBeginGroup(LT_INFO);
  return next;
}
588
/// Scans a number literal whose first digit is the current character.
///
/// Appends an integer token, or hands over to [tokenizeFractionPart] when an
/// exponent or a '.' followed by a digit is found.
int tokenizeNumber(int next) {
  int start = scanOffset;

  // Consume the remaining integer digits.
  do {
    next = advance();
  } while ($0 <= next && next <= $9);

  if (identical(next, $e) || identical(next, $E)) {
    return tokenizeFractionPart(next, start);
  }

  if (identical(next, $PERIOD)) {
    // Only treat the period as a decimal point if a digit follows it.
    int afterPeriod = peek();
    if ($0 <= afterPeriod && afterPeriod <= $9) {
      return tokenizeFractionPart(advance(), start);
    }
  }

  appendSubstringToken(INT_INFO, start, true);
  return next;
}
610
/// Scans a number that starts with '0': a hexadecimal literal when the next
/// character is 'x' or 'X', a regular number otherwise.
int tokenizeHexOrNumber(int next) {
  int following = peek();
  bool hasHexPrefix = identical(following, $x) || identical(following, $X);
  return hasHexPrefix ? tokenizeHex(next) : tokenizeNumber(next);
}
618
/// Scans a hexadecimal literal. The current character is the leading '0'.
///
/// Reports an unterminated '0x' when no hexadecimal digit follows the prefix.
int tokenizeHex(int next) {
  int start = scanOffset;
  advance(); // Advance past the $x or $X.

  bool hasDigits = false;
  next = advance();
  while (($0 <= next && next <= $9) ||
         ($A <= next && next <= $F) ||
         ($a <= next && next <= $f)) {
    hasDigits = true;
    next = advance();
  }

  if (hasDigits) {
    appendSubstringToken(HEXADECIMAL_INFO, start, true);
  } else {
    unterminated('0x', shouldAdvance: false);
  }
  return next;
}
640
/// Scans a token that starts with '.': a number with a leading decimal point,
/// '..', '...' or a single '.'.
int tokenizeDotsOrNumber(int next) {
  int start = scanOffset;
  next = advance();
  if ($0 <= next && next <= $9) {
    return tokenizeFractionPart(next, start);
  }
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}
653
/// Scans the fraction digits and/or the exponent of a number literal that
/// began at scan offset [start]; [next] is the first character to look at.
///
/// Reports an unterminated '1e' when the exponent has no digits.
int tokenizeFractionPart(int next, int start) {
  bool hasDigit = false;

  // Fraction digits.
  while ($0 <= next && next <= $9) {
    hasDigit = true;
    next = advance();
  }

  // Optional exponent: 'e' or 'E', an optional sign, and at least one digit.
  if (identical($e, next) || identical($E, next)) {
    hasDigit = true;
    next = advance();
    if (identical(next, $PLUS) || identical(next, $MINUS)) {
      next = advance();
    }
    if (!($0 <= next && next <= $9)) {
      unterminated('1e', shouldAdvance: false);
      return next;
    }
    do {
      next = advance();
    } while ($0 <= next && next <= $9);
  }

  if (hasDigit) {
    appendSubstringToken(DOUBLE_INFO, start, true);
    return next;
  }

  // Reduce offset, we already advanced to the token past the period.
  appendSubstringToken(INT_INFO, start, true, -1);

  // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
  // the scanner already advanced past the period.
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}
703
/// Scans a token that starts with '/': a multi-line comment, a single-line
/// comment, '/=' or '/'.
int tokenizeSlashOrComment(int next) {
  int start = scanOffset;
  next = advance();
  if (identical($STAR, next)) {
    return tokenizeMultiLineComment(next, start);
  }
  if (identical($SLASH, next)) {
    return tokenizeSingleLineComment(next, start);
  }
  if (identical($EQ, next)) {
    appendPrecedenceToken(SLASH_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(SLASH_INFO);
  return next;
}
719
/// Scans a '//' comment that began at scan offset [start], up to (but not
/// including) the terminating line break or end of input, and returns that
/// terminating character.
int tokenizeSingleLineComment(int next, int start) {
  bool asciiOnly = true;
  do {
    next = advance();
    if (next > 127) asciiOnly = false;
  } while (!(identical($LF, next) ||
             identical($CR, next) ||
             identical($EOF, next)));

  if (!asciiOnly) handleUnicode(start);
  appendComment(start, asciiOnly);
  return next;
}
735
736
/// Scans a (possibly nested) '/* ... */' comment that began at scan offset
/// [start]. Reports an unterminated '/*' if the input ends first.
int tokenizeMultiLineComment(int next, int start) {
  bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
  bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
  int unicodeStart = start;
  int depth = 1; // Number of comments that are currently open.

  next = advance();
  while (!identical($EOF, next)) {
    if (identical($STAR, next)) {
      next = advance();
      // A '*' that is not followed by '/' is ordinary comment text; the
      // character after it is examined by the next iteration.
      if (!identical($SLASH, next)) continue;
      --depth;
      if (0 == depth) {
        // The outermost comment is closed.
        if (!asciiOnlyLines) handleUnicode(unicodeStart);
        next = advance();
        appendComment(start, asciiOnlyComment);
        return next;
      }
      next = advance();
    } else if (identical($SLASH, next)) {
      next = advance();
      if (identical($STAR, next)) {
        // A nested comment starts here.
        ++depth;
        next = advance();
      }
    } else if (identical(next, $LF)) {
      if (!asciiOnlyLines) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(unicodeStart);
        asciiOnlyLines = true;
        unicodeStart = scanOffset;
      }
      lineFeedInMultiline();
      next = advance();
    } else {
      if (next > 127) {
        asciiOnlyLines = false;
        asciiOnlyComment = false;
      }
      next = advance();
    }
  }

  // The input ended inside the comment.
  if (!asciiOnlyLines) handleUnicode(unicodeStart);
  unterminated('/*');
  return next;
}
786
/// Scans a token that starts with 'r': a raw string when a quote follows
/// directly, a keyword or an identifier otherwise.
int tokenizeRawStringKeywordOrIdentifier(int next) {
  // [next] is $r.
  int following = peek();
  bool quoteFollows = identical(following, $DQ) || identical(following, $SQ);
  if (!quoteFollows) {
    return tokenizeKeywordOrIdentifier(next, true);
  }
  // The string token starts at the 'r', so take the offset before advancing.
  int start = scanOffset;
  return tokenizeString(advance(), start, true);
}
797
/// Scans a word that starts with a lower-case letter and appends either a
/// keyword token or, via [tokenizeIdentifier], an identifier token.
int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
  int start = scanOffset;

  // Walk the keyword state machine for as long as lower-case letters follow
  // and a keyword is still possible.
  KeywordState state = KeywordState.KEYWORD_STATE;
  for (; state != null && $a <= next && next <= $z; next = advance()) {
    state = state.next(next);
  }

  // The word is an identifier if it matches no keyword, or if it matches one
  // but continues with another identifier character.
  if (state == null ||
      state.keyword == null ||
      ($A <= next && next <= $Z) ||
      ($0 <= next && next <= $9) ||
      identical(next, $_) ||
      identical(next, $$)) {
    return tokenizeIdentifier(next, start, allowDollar);
  }

  appendKeywordToken(state.keyword);
  return next;
}
818
819 /**
820 * [allowDollar] can exclude '$', which is not allowed as part of a string
821 * interpolation identifier.
822 */
int tokenizeIdentifier(int next, int start, bool allowDollar) {
  // Consume identifier characters; '$' only counts when [allowDollar] is set.
  while (($a <= next && next <= $z) ||
         ($A <= next && next <= $Z) ||
         ($0 <= next && next <= $9) ||
         identical(next, $_) ||
         (identical(next, $$) && allowDollar)) {
    next = advance();
  }

  // Identifier ends here. If the scan offset is still [start], report [next]
  // as unexpected instead of appending an identifier token.
  if (start == scanOffset) {
    return unexpected(next);
  }
  appendSubstringToken(IDENTIFIER_INFO, start, true);
  return next;
}
843
/// Appends the '@' token and returns the character that follows it.
int tokenizeAt(int next) {
  appendPrecedenceToken(AT_INFO);
  int following = advance();
  return following;
}
848
/// Scans a string literal. [next] is the opening quote character, [start] is
/// the scan offset at which the token starts, and [raw] selects the raw
/// string scanners.
int tokenizeString(int next, int start, bool raw) {
  int quoteChar = next;
  next = advance();

  if (!identical(quoteChar, next)) {
    // Only one quote so far: a single-line string.
    return raw
        ? tokenizeSingleLineRawString(next, quoteChar, start)
        : tokenizeSingleLineString(next, quoteChar, start);
  }

  next = advance();
  if (identical(quoteChar, next)) {
    // Multiline string.
    return tokenizeMultiLineString(quoteChar, start, raw);
  }

  // Empty string.
  appendSubstringToken(STRING_INFO, start, true);
  return next;
}
869
870 /**
871 * [next] is the first character after the quote.
872 * [start] is the scanOffset of the quote.
873 *
874 * The token contains a substring of the source file, including the
875 * string quotes, backslashes for escaping. For interpolated strings,
876 * the parts before and after are separate tokens.
877 *
878 * "a $b c"
879 *
880 * gives StringToken("a $), StringToken(b) and StringToken( c").
881 */
int tokenizeSingleLineString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  while (!identical(next, quoteChar)) {
    if (identical(next, $$)) {
      // Interpolation: the scanned part is appended as its own token, and a
      // new string part starts after the interpolation.
      if (!asciiOnly) handleUnicode(start);
      next = tokenizeStringInterpolation(start, asciiOnly);
      start = scanOffset;
      asciiOnly = true;
      continue;
    }
    if (identical(next, $BACKSLASH)) {
      // Look at the escaped character instead of the backslash.
      next = advance();
    }
    bool endsLineOrInput = next <= $CR &&
        (identical(next, $LF) ||
         identical(next, $CR) ||
         identical(next, $EOF));
    if (endsLineOrInput) {
      if (!asciiOnly) handleUnicode(start);
      return unterminatedString(quoteChar);
    }
    if (next > 127) asciiOnly = false;
    next = advance();
  }

  if (!asciiOnly) handleUnicode(start);
  // Advance past the quote character.
  next = advance();
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  return next;
}
910
/// Appends the string part scanned so far (from scan offset [start]) and then
/// scans the interpolation introduced by the current '$'.
int tokenizeStringInterpolation(int start, bool asciiOnly) {
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  beginToken(); // $ starts here.
  int next = advance();
  return identical(next, $OPEN_CURLY_BRACKET)
      ? tokenizeInterpolatedExpression(next)
      : tokenizeInterpolatedIdentifier(next);
}
921
/// Scans the tokens of a '${...}' interpolation until $STX or $EOF is
/// returned by [bigSwitch].
int tokenizeInterpolatedExpression(int next) {
  appendBeginGroup(STRING_INTERPOLATION_INFO);
  beginToken(); // The expression starts here.
  next = advance(); // Move past the curly bracket.
  while (!(identical(next, $EOF) || identical(next, $STX))) {
    next = bigSwitch(next);
  }
  if (!identical(next, $EOF)) {
    next = advance(); // Move past the $STX.
    beginToken(); // The string interpolation suffix starts here.
  }
  return next;
}
934
/// Scans a '$identifier' interpolation. Reports an unterminated '$' when
/// [next] cannot start an identifier.
int tokenizeInterpolatedIdentifier(int next) {
  appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

  bool startsLowerCase = $a <= next && next <= $z;
  bool startsUpperCaseOrUnderscore =
      ($A <= next && next <= $Z) || identical(next, $_);

  if (startsLowerCase || startsUpperCaseOrUnderscore) {
    beginToken(); // The identifier starts here.
    next = startsLowerCase
        ? tokenizeKeywordOrIdentifier(next, false)
        : tokenizeIdentifier(next, scanOffset, false);
  } else {
    unterminated(r'$', shouldAdvance: false);
  }
  beginToken(); // The string interpolation suffix starts here.
  return next;
}
950
/// Scans the rest of a single-line raw string. [next] is the first character
/// after the opening quote and [start] the scan offset of the token start.
int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  for (; next != $EOF; next = advance()) {
    if (identical(next, quoteChar)) {
      // Closing quote: the string is complete.
      if (!asciiOnly) handleUnicode(start);
      int afterQuote = advance();
      appendSubstringToken(STRING_INFO, start, asciiOnly);
      return afterQuote;
    }
    if (identical(next, $LF) || identical(next, $CR)) break;
    if (next > 127) asciiOnly = false;
  }

  // A line break or the end of the input came before the closing quote.
  if (!asciiOnly) handleUnicode(start);
  return unterminatedRawString(quoteChar);
}
970
/// Scans the rest of a raw multi-line string, up to and including the three
/// closing quote characters. The current character is the last of the three
/// opening quotes.
int tokenizeMultiLineRawString(int quoteChar, int start) {
  bool asciiOnlyString = true; // Track if the entire string is ASCII.
  bool asciiOnlyLine = true; // Track ASCII since the last handleUnicode.
  int unicodeStart = start;
  int next = advance(); // Advance past the (last) quote (of three).

  while (!identical(next, $EOF)) {
    if (!identical(next, quoteChar)) {
      // Ordinary string content.
      if (identical(next, $LF)) {
        if (!asciiOnlyLine) {
          // Synchronize the string offset in the utf8 scanner.
          handleUnicode(unicodeStart);
          asciiOnlyLine = true;
          unicodeStart = scanOffset;
        }
        lineFeedInMultiline();
      } else if (next > 127) {
        asciiOnlyLine = false;
        asciiOnlyString = false;
      }
      next = advance();
      continue;
    }

    // One quote seen; the string only ends after three quotes in a row.
    next = advance();
    if (!identical(next, quoteChar)) continue;
    next = advance();
    if (!identical(next, quoteChar)) continue;

    if (!asciiOnlyLine) handleUnicode(unicodeStart);
    next = advance();
    appendSubstringToken(STRING_INFO, start, asciiOnlyString);
    return next;
  }

  // The input ended before the closing quotes.
  if (!asciiOnlyLine) handleUnicode(unicodeStart);
  return unterminatedRawMultiLineString(quoteChar);
}
1007
/// Scans the rest of a multi-line string, up to and including the three
/// closing quote characters. The current character is the last of the three
/// opening quotes. Raw strings are delegated to
/// [tokenizeMultiLineRawString].
int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
  if (raw) return tokenizeMultiLineRawString(quoteChar, start);

  bool asciiOnlyString = true; // Track if the current string part is ASCII.
  bool asciiOnlyLine = true; // Track ASCII since the last handleUnicode.
  int unicodeStart = start;
  int next = advance(); // Advance past the (last) quote (of three).

  while (!identical(next, $EOF)) {
    if (identical(next, $$)) {
      if (!asciiOnlyLine) handleUnicode(unicodeStart);
      next = tokenizeStringInterpolation(start, asciiOnlyString);
      // A new string token is created for the rest.
      start = scanOffset;
      unicodeStart = start;
      asciiOnlyString = true;
      asciiOnlyLine = true;
      continue;
    }

    if (identical(next, quoteChar)) {
      // One quote seen; the string only ends after three quotes in a row.
      next = advance();
      if (!identical(next, quoteChar)) continue;
      next = advance();
      if (!identical(next, quoteChar)) continue;

      if (!asciiOnlyLine) handleUnicode(unicodeStart);
      next = advance();
      appendSubstringToken(STRING_INFO, start, asciiOnlyString);
      return next;
    }

    if (identical(next, $BACKSLASH)) {
      // Look at the escaped character instead of the backslash.
      next = advance();
      if (identical(next, $EOF)) break;
    }

    if (identical(next, $LF)) {
      if (!asciiOnlyLine) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(unicodeStart);
        asciiOnlyLine = true;
        unicodeStart = scanOffset;
      }
      lineFeedInMultiline();
    } else if (next > 127) {
      asciiOnlyString = false;
      asciiOnlyLine = false;
    }
    next = advance();
  }

  // The input ended before the closing quotes.
  if (!asciiOnlyLine) handleUnicode(unicodeStart);
  return unterminatedMultiLineString(quoteChar);
}
1058
/// Appends a [BadInputToken] for [character] at [tokenStart] and returns the
/// result of [advanceAfterError] with advancing enabled.
int unexpected(int character) {
  BadInputToken badInput = new BadInputToken(character, tokenStart);
  appendErrorToken(badInput);
  return advanceAfterError(true);
}
1063
/// Appends an [UnterminatedToken] for [prefix] that covers [tokenStart] to
/// the current [stringOffset], and returns the result of [advanceAfterError]
/// for [shouldAdvance].
int unterminated(String prefix, {bool shouldAdvance: true}) {
  UnterminatedToken error =
      new UnterminatedToken(prefix, tokenStart, stringOffset);
  appendErrorToken(error);
  return advanceAfterError(shouldAdvance);
}
1068
/// Reports an unterminated single-line string opened with [quoteChar].
int unterminatedString(int quoteChar) {
  String quote = new String.fromCharCodes([quoteChar]);
  return unterminated(quote);
}
1072
/// Reports an unterminated raw single-line string opened with 'r' followed
/// by [quoteChar].
int unterminatedRawString(int quoteChar) {
  String quote = new String.fromCharCodes([quoteChar]);
  return unterminated('r$quote');
}
1076
/// Reports an unterminated multi-line string opened with three [quoteChar]s.
int unterminatedMultiLineString(int quoteChar) {
  String tripleQuote =
      new String.fromCharCodes([quoteChar, quoteChar, quoteChar]);
  return unterminated(tripleQuote);
}
1081
/// Reports an unterminated raw multi-line string opened with 'r' followed by
/// three [quoteChar]s.
int unterminatedRawMultiLineString(int quoteChar) {
  String tripleQuote =
      new String.fromCharCodes([quoteChar, quoteChar, quoteChar]);
  return unterminated('r$tripleQuote');
}
1086
/// Returns the character to continue with after an error token was appended:
/// $EOF at the end of the file, otherwise the next character when
/// [shouldAdvance] is set, and -1 when it is not.
int advanceAfterError(bool shouldAdvance) {
  if (atEndOfFile()) return $EOF;
  // Advancing ensures progress.
  return shouldAdvance ? advance() : -1;
}
1095
/// Reports [begin] as an unmatched group opener while keeping the token
/// stream navigable through [:endGroup:] links.
void unmatchedBeginGroup(BeginGroupToken begin) {
  // Unmatched BeginGroupTokens must be reported as errors. The diet parser,
  // however, assumes that groups are well-balanced and never looks at the
  // endGroup token, which is what lets it skip quickly over correct code.
  // Inserting an additional synthetic token into the stream lets it keep
  // ignoring endGroup tokens:
  //
  // [begin] --next--> [tail]
  // [begin] --endG--> [synthetic] --next--> [next] --next--> [tail]
  //
  // The diet parser skips from [begin] via endGroup to [synthetic], ignores
  // the [synthetic] token (assuming it's correct), and the error is then
  // reported when parsing the [next] token.
  //
  // For example, tokenize("{[1};") produces:
  //
  // SymbolToken({) --endGroup-----+
  //      |                        |
  //     next                      |
  //      v                        |
  // SymbolToken([) --endGroup--+  |
  //      |                     |  |
  //     next                   |  |
  //      v                     |  |
  // StringToken(1)             |  |
  //      |                     v  |
  //     next       SymbolToken(]) | <- Synthetic token.
  //      |                     |  |
  //      |                   next |
  //      v                     |  |
  // UnmatchedToken([)<---------+  |
  //      |                        |
  //     next                      |
  //      v                        |
  // SymbolToken(})<---------------+
  //      |
  //     next
  //      v
  // SymbolToken(;)
  //      |
  //     next
  //      v
  //     EOF
  PrecedenceInfo closeInfo = closeBraceInfoFor(begin);
  Token syntheticClose = new SymbolToken(closeInfo, begin.charOffset);
  UnmatchedToken unmatched = new UnmatchedToken(begin);
  begin.endGroup = syntheticClose;
  syntheticClose.next = unmatched;
  appendErrorToken(unmatched);
}
1146 }
1147
/// Returns the [PrecedenceInfo] of the token that closes the group opened by
/// [begin], looked up by [:begin.value:]. The lookup yields [:null:] for a
/// value that is not one of the keys below.
PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
  const closerByOpener = const {
    '(': CLOSE_PAREN_INFO,
    '[': CLOSE_SQUARE_BRACKET_INFO,
    '{': CLOSE_CURLY_BRACKET_INFO,
    '<': GT_INFO,
    r'${': CLOSE_CURLY_BRACKET_INFO,
  };
  return closerByOpener[begin.value];
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698