Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(350)

Unified Diff: pkg/dart_scanner/lib/src/abstract_scanner.dart

Issue 2621153006: Copy scanner and parser to own packages. (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « pkg/dart_scanner/lib/io.dart ('k') | pkg/dart_scanner/lib/src/array_based_scanner.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: pkg/dart_scanner/lib/src/abstract_scanner.dart
diff --git a/pkg/dart_scanner/lib/src/abstract_scanner.dart b/pkg/dart_scanner/lib/src/abstract_scanner.dart
new file mode 100644
index 0000000000000000000000000000000000000000..f0698611482b302293d982b6bccd4f5b91af214a
--- /dev/null
+++ b/pkg/dart_scanner/lib/src/abstract_scanner.dart
@@ -0,0 +1,1187 @@
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+library dart2js.scanner;
+
+import '../io/source_file.dart' show SourceFile, Utf8BytesSourceFile;
+import '../tokens/keyword.dart' show Keyword, KeywordState;
+import '../tokens/precedence.dart';
+import '../tokens/precedence_constants.dart';
+import '../tokens/token.dart';
+import '../tokens/token_constants.dart';
+import '../util/characters.dart';
+import 'string_scanner.dart' show StringScanner;
+import 'utf8_bytes_scanner.dart' show Utf8BytesScanner;
+
+abstract class Scanner {
+ Token tokenize();
+
+ factory Scanner(SourceFile file, {bool includeComments: false}) {
+ if (file is Utf8BytesSourceFile) {
+ return new Utf8BytesScanner(file, includeComments: includeComments);
+ } else {
+ return new StringScanner(file, includeComments: includeComments);
+ }
+ }
+}
+
+abstract class AbstractScanner implements Scanner {
+ // TODO(ahe): Move this class to implementation.
+
+ final bool includeComments;
+
+ /**
+ * The string offset for the next token that will be created.
+ *
+ * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values
+ * are different. One string character can be encoded using multiple UTF-8
+ * bytes.
+ */
+ int tokenStart = -1;
+
+ /**
+ * A pointer to the token stream created by this scanner. The first token
+ * is a special token and not part of the source file. This is an
+ * implementation detail to avoids special cases in the scanner. This token
+ * is not exposed to clients of the scanner, which are expected to invoke
+ * [firstToken] to access the token stream.
+ */
+ final Token tokens = new SymbolToken(EOF_INFO, -1);
+
+ /**
+ * A pointer to the last scanned token.
+ */
+ Token tail;
+
+ /**
+ * The source file that is being scanned. This field can be [:null:].
+ * If the source file is available, the scanner assigns its [:lineStarts:] and
+ * [:length:] fields at the end of [tokenize].
+ */
+ final SourceFile file;
+
+ final List<int> lineStarts = <int>[0];
+
+ AbstractScanner(this.file, this.includeComments) {
+ this.tail = this.tokens;
+ }
+
+ /**
+ * Advances and returns the next character.
+ *
+ * If the next character is non-ASCII, then the returned value depends on the
+ * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
+ * the [StringScanner] returns a UTF-16 code unit.
+ *
+ * The scanner ensures that [advance] is not invoked after it returned [$EOF].
+ * This allows implementations to omit bound checks if the data structure ends
+ * with '0'.
+ */
+ int advance();
+
+ /**
+ * Returns the current unicode character.
+ *
+ * If the current character is ASCII, then it is returned unchanged.
+ *
+ * The [Utf8BytesScanner] decodes the next unicode code point starting at the
+ * current position. Note that every unicode character is returned as a single
+ * code point, that is, for '\u{1d11e}' it returns 119070, and the following
+ * [advance] returns the next character.
+ *
+ * The [StringScanner] returns the current character unchanged, which might
+ * be a surrogate character. In the case of '\u{1d11e}', it returns the first
+ * code unit 55348, and the following [advance] returns the second code unit
+ * 56606.
+ *
+ * Invoking [currentAsUnicode] multiple times is safe, i.e.,
+ * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
+ */
+ int currentAsUnicode(int next);
+
+ /**
+ * Returns the character at the next poisition. Like in [advance], the
+ * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
+ * a UTF-16 code unit.
+ */
+ int peek();
+
+ /**
+ * Notifies the scanner that unicode characters were detected in either a
+ * comment or a string literal between [startScanOffset] and the current
+ * scan offset.
+ */
+ void handleUnicode(int startScanOffset);
+
+ /**
+ * Returns the current scan offset.
+ *
+ * In the [Utf8BytesScanner] this is the offset into the byte list, in the
+ * [StringScanner] the offset in the source string.
+ */
+ int get scanOffset;
+
+ /**
+ * Returns the current string offset.
+ *
+ * In the [StringScanner] this is identical to the [scanOffset]. In the
+ * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
+ */
+ int get stringOffset;
+
+ /**
+ * Returns the first token scanned by this [Scanner].
+ */
+ Token firstToken();
+
+ /**
+ * Returns the last token scanned by this [Scanner].
+ */
+ Token previousToken();
+
+ /**
+ * Notifies that a new token starts at current offset.
+ */
+ void beginToken() {
+ tokenStart = stringOffset;
+ }
+
+ /**
+ * Appends a substring from the scan offset [:start:] to the current
+ * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
+ * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
+ * substring string [5,9).
+ *
+ * Note that [extraOffset] can only be used if the covered character(s) are
+ * known to be ASCII.
+ */
+ void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
+ [int extraOffset]);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendPrecedenceToken(PrecedenceInfo info);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendKeywordToken(Keyword keyword);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendEofToken();
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendWhiteSpace(int next);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void lineFeedInMultiline();
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendBeginGroup(PrecedenceInfo info);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ int appendEndGroup(PrecedenceInfo info, int openKind);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendGt(PrecedenceInfo info);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendGtGt(PrecedenceInfo info);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void appendComment(start, bool asciiOnly);
+
+ /// Append [token] to the token stream.
+ void appendErrorToken(ErrorToken token);
+
+ /** Documentation in subclass [ArrayBasedScanner]. */
+ void discardOpenLt();
+
+ /// Return true when at EOF.
+ bool atEndOfFile();
+
+ Token tokenize() {
+ while (!atEndOfFile()) {
+ int next = advance();
+ while (!identical(next, $EOF)) {
+ next = bigSwitch(next);
+ }
+ if (atEndOfFile()) {
+ appendEofToken();
+ } else {
+ unexpected($EOF);
+ }
+ }
+
+ if (file != null) {
+ file.length = stringOffset;
+ // One additional line start at the end, see [SourceFile.lineStarts].
+ lineStarts.add(stringOffset + 1);
+ file.lineStarts = lineStarts;
+ }
+
+ return firstToken();
+ }
+
+ int bigSwitch(int next) {
+ beginToken();
+ if (identical(next, $SPACE) ||
+ identical(next, $TAB) ||
+ identical(next, $LF) ||
+ identical(next, $CR)) {
+ appendWhiteSpace(next);
+ next = advance();
+ // Sequences of spaces are common, so advance through them fast.
+ while (identical(next, $SPACE)) {
+ // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
+ // assuming that it does not do anything for space characters.
+ next = advance();
+ }
+ return next;
+ }
+
+ if ($a <= next && next <= $z) {
+ if (identical($r, next)) {
+ return tokenizeRawStringKeywordOrIdentifier(next);
+ }
+ return tokenizeKeywordOrIdentifier(next, true);
+ }
+
+ if (($A <= next && next <= $Z) ||
+ identical(next, $_) ||
+ identical(next, $$)) {
+ return tokenizeIdentifier(next, scanOffset, true);
+ }
+
+ if (identical(next, $LT)) {
+ return tokenizeLessThan(next);
+ }
+
+ if (identical(next, $GT)) {
+ return tokenizeGreaterThan(next);
+ }
+
+ if (identical(next, $EQ)) {
+ return tokenizeEquals(next);
+ }
+
+ if (identical(next, $BANG)) {
+ return tokenizeExclamation(next);
+ }
+
+ if (identical(next, $PLUS)) {
+ return tokenizePlus(next);
+ }
+
+ if (identical(next, $MINUS)) {
+ return tokenizeMinus(next);
+ }
+
+ if (identical(next, $STAR)) {
+ return tokenizeMultiply(next);
+ }
+
+ if (identical(next, $PERCENT)) {
+ return tokenizePercent(next);
+ }
+
+ if (identical(next, $AMPERSAND)) {
+ return tokenizeAmpersand(next);
+ }
+
+ if (identical(next, $BAR)) {
+ return tokenizeBar(next);
+ }
+
+ if (identical(next, $CARET)) {
+ return tokenizeCaret(next);
+ }
+
+ if (identical(next, $OPEN_SQUARE_BRACKET)) {
+ return tokenizeOpenSquareBracket(next);
+ }
+
+ if (identical(next, $TILDE)) {
+ return tokenizeTilde(next);
+ }
+
+ if (identical(next, $BACKSLASH)) {
+ appendPrecedenceToken(BACKSLASH_INFO);
+ return advance();
+ }
+
+ if (identical(next, $HASH)) {
+ return tokenizeTag(next);
+ }
+
+ if (identical(next, $OPEN_PAREN)) {
+ appendBeginGroup(OPEN_PAREN_INFO);
+ return advance();
+ }
+
+ if (identical(next, $CLOSE_PAREN)) {
+ return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
+ }
+
+ if (identical(next, $COMMA)) {
+ appendPrecedenceToken(COMMA_INFO);
+ return advance();
+ }
+
+ if (identical(next, $COLON)) {
+ appendPrecedenceToken(COLON_INFO);
+ return advance();
+ }
+
+ if (identical(next, $SEMICOLON)) {
+ appendPrecedenceToken(SEMICOLON_INFO);
+ // Type parameters and arguments cannot contain semicolon.
+ discardOpenLt();
+ return advance();
+ }
+
+ if (identical(next, $QUESTION)) {
+ return tokenizeQuestion(next);
+ }
+
+ if (identical(next, $CLOSE_SQUARE_BRACKET)) {
+ return appendEndGroup(
+ CLOSE_SQUARE_BRACKET_INFO, OPEN_SQUARE_BRACKET_TOKEN);
+ }
+
+ if (identical(next, $BACKPING)) {
+ appendPrecedenceToken(BACKPING_INFO);
+ return advance();
+ }
+
+ if (identical(next, $OPEN_CURLY_BRACKET)) {
+ appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
+ return advance();
+ }
+
+ if (identical(next, $CLOSE_CURLY_BRACKET)) {
+ return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, OPEN_CURLY_BRACKET_TOKEN);
+ }
+
+ if (identical(next, $SLASH)) {
+ return tokenizeSlashOrComment(next);
+ }
+
+ if (identical(next, $AT)) {
+ return tokenizeAt(next);
+ }
+
+ if (identical(next, $DQ) || identical(next, $SQ)) {
+ return tokenizeString(next, scanOffset, false);
+ }
+
+ if (identical(next, $PERIOD)) {
+ return tokenizeDotsOrNumber(next);
+ }
+
+ if (identical(next, $0)) {
+ return tokenizeHexOrNumber(next);
+ }
+
+ // TODO(ahe): Would a range check be faster?
+ if (identical(next, $1) ||
+ identical(next, $2) ||
+ identical(next, $3) ||
+ identical(next, $4) ||
+ identical(next, $5) ||
+ identical(next, $6) ||
+ identical(next, $7) ||
+ identical(next, $8) ||
+ identical(next, $9)) {
+ return tokenizeNumber(next);
+ }
+
+ if (identical(next, $EOF)) {
+ return $EOF;
+ }
+ if (next < 0x1f) {
+ return unexpected(next);
+ }
+
+ next = currentAsUnicode(next);
+
+ // The following are non-ASCII characters.
+
+ if (identical(next, $NBSP)) {
+ appendWhiteSpace(next);
+ return advance();
+ }
+
+ return unexpected(next);
+ }
+
+ int tokenizeTag(int next) {
+ // # or #!.*[\n\r]
+ if (scanOffset == 0) {
+ if (identical(peek(), $BANG)) {
+ int start = scanOffset + 1;
+ bool asciiOnly = true;
+ do {
+ next = advance();
+ if (next > 127) asciiOnly = false;
+ } while (!identical(next, $LF) &&
+ !identical(next, $CR) &&
+ !identical(next, $EOF));
+ if (!asciiOnly) handleUnicode(start);
+ return next;
+ }
+ }
+ appendPrecedenceToken(HASH_INFO);
+ return advance();
+ }
+
+ int tokenizeTilde(int next) {
+ // ~ ~/ ~/=
+ next = advance();
+ if (identical(next, $SLASH)) {
+ return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
+ } else {
+ appendPrecedenceToken(TILDE_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeOpenSquareBracket(int next) {
+ // [ [] []=
+ next = advance();
+ if (identical(next, $CLOSE_SQUARE_BRACKET)) {
+ Token token = previousToken();
+ if (token is KeywordToken && token.keyword.syntax == 'operator' ||
+ token is SymbolToken && token.info == HASH_INFO) {
+ return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
+ }
+ }
+ appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
+ return next;
+ }
+
+ int tokenizeCaret(int next) {
+ // ^ ^=
+ return select($EQ, CARET_EQ_INFO, CARET_INFO);
+ }
+
+ int tokenizeQuestion(int next) {
+ // ? ?. ?? ??=
+ next = advance();
+ if (identical(next, $QUESTION)) {
+ return select($EQ, QUESTION_QUESTION_EQ_INFO, QUESTION_QUESTION_INFO);
+ } else if (identical(next, $PERIOD)) {
+ appendPrecedenceToken(QUESTION_PERIOD_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(QUESTION_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeBar(int next) {
+ // | || |=
+ next = advance();
+ if (identical(next, $BAR)) {
+ appendPrecedenceToken(BAR_BAR_INFO);
+ return advance();
+ } else if (identical(next, $EQ)) {
+ appendPrecedenceToken(BAR_EQ_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(BAR_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeAmpersand(int next) {
+ // && &= &
+ next = advance();
+ if (identical(next, $AMPERSAND)) {
+ appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
+ return advance();
+ } else if (identical(next, $EQ)) {
+ appendPrecedenceToken(AMPERSAND_EQ_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(AMPERSAND_INFO);
+ return next;
+ }
+ }
+
+ int tokenizePercent(int next) {
+ // % %=
+ return select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);
+ }
+
+ int tokenizeMultiply(int next) {
+ // * *=
+ return select($EQ, STAR_EQ_INFO, STAR_INFO);
+ }
+
+ int tokenizeMinus(int next) {
+ // - -- -=
+ next = advance();
+ if (identical(next, $MINUS)) {
+ appendPrecedenceToken(MINUS_MINUS_INFO);
+ return advance();
+ } else if (identical(next, $EQ)) {
+ appendPrecedenceToken(MINUS_EQ_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(MINUS_INFO);
+ return next;
+ }
+ }
+
+ int tokenizePlus(int next) {
+ // + ++ +=
+ next = advance();
+ if (identical($PLUS, next)) {
+ appendPrecedenceToken(PLUS_PLUS_INFO);
+ return advance();
+ } else if (identical($EQ, next)) {
+ appendPrecedenceToken(PLUS_EQ_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(PLUS_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeExclamation(int next) {
+ // ! !=
+ // !== is kept for user-friendly error reporting.
+
+ next = advance();
+ if (identical(next, $EQ)) {
+ return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
+ }
+ appendPrecedenceToken(BANG_INFO);
+ return next;
+ }
+
+ int tokenizeEquals(int next) {
+ // = == =>
+ // === is kept for user-friendly error reporting.
+
+ // Type parameters and arguments cannot contain any token that
+ // starts with '='.
+ discardOpenLt();
+
+ next = advance();
+ if (identical(next, $EQ)) {
+ return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
+ } else if (identical(next, $GT)) {
+ appendPrecedenceToken(FUNCTION_INFO);
+ return advance();
+ }
+ appendPrecedenceToken(EQ_INFO);
+ return next;
+ }
+
+ int tokenizeGreaterThan(int next) {
+ // > >= >> >>=
+ next = advance();
+ if (identical($EQ, next)) {
+ appendPrecedenceToken(GT_EQ_INFO);
+ return advance();
+ } else if (identical($GT, next)) {
+ next = advance();
+ if (identical($EQ, next)) {
+ appendPrecedenceToken(GT_GT_EQ_INFO);
+ return advance();
+ } else {
+ appendGtGt(GT_GT_INFO);
+ return next;
+ }
+ } else {
+ appendGt(GT_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeLessThan(int next) {
+ // < <= << <<=
+ next = advance();
+ if (identical($EQ, next)) {
+ appendPrecedenceToken(LT_EQ_INFO);
+ return advance();
+ } else if (identical($LT, next)) {
+ return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
+ } else {
+ appendBeginGroup(LT_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeNumber(int next) {
+ int start = scanOffset;
+ while (true) {
+ next = advance();
+ if ($0 <= next && next <= $9) {
+ continue;
+ } else if (identical(next, $e) || identical(next, $E)) {
+ return tokenizeFractionPart(next, start);
+ } else {
+ if (identical(next, $PERIOD)) {
+ int nextnext = peek();
+ if ($0 <= nextnext && nextnext <= $9) {
+ return tokenizeFractionPart(advance(), start);
+ }
+ }
+ appendSubstringToken(INT_INFO, start, true);
+ return next;
+ }
+ }
+ return null;
+ }
+
+ int tokenizeHexOrNumber(int next) {
+ int x = peek();
+ if (identical(x, $x) || identical(x, $X)) {
+ return tokenizeHex(next);
+ }
+ return tokenizeNumber(next);
+ }
+
+ int tokenizeHex(int next) {
+ int start = scanOffset;
+ next = advance(); // Advance past the $x or $X.
+ bool hasDigits = false;
+ while (true) {
+ next = advance();
+ if (($0 <= next && next <= $9) ||
+ ($A <= next && next <= $F) ||
+ ($a <= next && next <= $f)) {
+ hasDigits = true;
+ } else {
+ if (!hasDigits) {
+ unterminated('0x', shouldAdvance: false);
+ return next;
+ }
+ appendSubstringToken(HEXADECIMAL_INFO, start, true);
+ return next;
+ }
+ }
+ return null;
+ }
+
+ int tokenizeDotsOrNumber(int next) {
+ int start = scanOffset;
+ next = advance();
+ if (($0 <= next && next <= $9)) {
+ return tokenizeFractionPart(next, start);
+ } else if (identical($PERIOD, next)) {
+ return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
+ } else {
+ appendPrecedenceToken(PERIOD_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeFractionPart(int next, int start) {
+ bool done = false;
+ bool hasDigit = false;
+ LOOP:
+ while (!done) {
+ if ($0 <= next && next <= $9) {
+ hasDigit = true;
+ } else if (identical($e, next) || identical($E, next)) {
+ hasDigit = true;
+ next = advance();
+ if (identical(next, $PLUS) || identical(next, $MINUS)) {
+ next = advance();
+ }
+ bool hasExponentDigits = false;
+ while (true) {
+ if ($0 <= next && next <= $9) {
+ hasExponentDigits = true;
+ } else {
+ if (!hasExponentDigits) {
+ unterminated('1e', shouldAdvance: false);
+ return next;
+ }
+ break;
+ }
+ next = advance();
+ }
+
+ done = true;
+ continue LOOP;
+ } else {
+ done = true;
+ continue LOOP;
+ }
+ next = advance();
+ }
+ if (!hasDigit) {
+ // Reduce offset, we already advanced to the token past the period.
+ appendSubstringToken(INT_INFO, start, true, -1);
+
+ // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
+ // the scanner already advanced past the period.
+ if (identical($PERIOD, next)) {
+ return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
+ }
+ appendPrecedenceToken(PERIOD_INFO);
+ return next;
+ }
+ appendSubstringToken(DOUBLE_INFO, start, true);
+ return next;
+ }
+
+ int tokenizeSlashOrComment(int next) {
+ int start = scanOffset;
+ next = advance();
+ if (identical($STAR, next)) {
+ return tokenizeMultiLineComment(next, start);
+ } else if (identical($SLASH, next)) {
+ return tokenizeSingleLineComment(next, start);
+ } else if (identical($EQ, next)) {
+ appendPrecedenceToken(SLASH_EQ_INFO);
+ return advance();
+ } else {
+ appendPrecedenceToken(SLASH_INFO);
+ return next;
+ }
+ }
+
+ int tokenizeSingleLineComment(int next, int start) {
+ bool asciiOnly = true;
+ while (true) {
+ next = advance();
+ if (next > 127) asciiOnly = false;
+ if (identical($LF, next) ||
+ identical($CR, next) ||
+ identical($EOF, next)) {
+ if (!asciiOnly) handleUnicode(start);
+ appendComment(start, asciiOnly);
+ return next;
+ }
+ }
+ return null;
+ }
+
+ int tokenizeMultiLineComment(int next, int start) {
+ bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
+ bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
+ int unicodeStart = start;
+ int nesting = 1;
+ next = advance();
+ while (true) {
+ if (identical($EOF, next)) {
+ if (!asciiOnlyLines) handleUnicode(unicodeStart);
+ unterminated('/*');
+ break;
+ } else if (identical($STAR, next)) {
+ next = advance();
+ if (identical($SLASH, next)) {
+ --nesting;
+ if (0 == nesting) {
+ if (!asciiOnlyLines) handleUnicode(unicodeStart);
+ next = advance();
+ appendComment(start, asciiOnlyComment);
+ break;
+ } else {
+ next = advance();
+ }
+ }
+ } else if (identical($SLASH, next)) {
+ next = advance();
+ if (identical($STAR, next)) {
+ next = advance();
+ ++nesting;
+ }
+ } else if (identical(next, $LF)) {
+ if (!asciiOnlyLines) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLines = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ next = advance();
+ } else {
+ if (next > 127) {
+ asciiOnlyLines = false;
+ asciiOnlyComment = false;
+ }
+ next = advance();
+ }
+ }
+ return next;
+ }
+
+ int tokenizeRawStringKeywordOrIdentifier(int next) {
+ // [next] is $r.
+ int nextnext = peek();
+ if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
+ int start = scanOffset;
+ next = advance();
+ return tokenizeString(next, start, true);
+ }
+ return tokenizeKeywordOrIdentifier(next, true);
+ }
+
+ int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
+ KeywordState state = KeywordState.KEYWORD_STATE;
+ int start = scanOffset;
+ while (state != null && $a <= next && next <= $z) {
+ state = state.next(next);
+ next = advance();
+ }
+ if (state == null || state.keyword == null) {
+ return tokenizeIdentifier(next, start, allowDollar);
+ }
+ if (($A <= next && next <= $Z) ||
+ ($0 <= next && next <= $9) ||
+ identical(next, $_) ||
+ identical(next, $$)) {
+ return tokenizeIdentifier(next, start, allowDollar);
+ } else {
+ appendKeywordToken(state.keyword);
+ return next;
+ }
+ }
+
+ /**
+ * [allowDollar] can exclude '$', which is not allowed as part of a string
+ * interpolation identifier.
+ */
+ int tokenizeIdentifier(int next, int start, bool allowDollar) {
+ while (true) {
+ if (($a <= next && next <= $z) ||
+ ($A <= next && next <= $Z) ||
+ ($0 <= next && next <= $9) ||
+ identical(next, $_) ||
+ (identical(next, $$) && allowDollar)) {
+ next = advance();
+ } else {
+ // Identifier ends here.
+ if (start == scanOffset) {
+ return unexpected(next);
+ } else {
+ appendSubstringToken(IDENTIFIER_INFO, start, true);
+ }
+ break;
+ }
+ }
+ return next;
+ }
+
+ int tokenizeAt(int next) {
+ appendPrecedenceToken(AT_INFO);
+ return advance();
+ }
+
+ int tokenizeString(int next, int start, bool raw) {
+ int quoteChar = next;
+ next = advance();
+ if (identical(quoteChar, next)) {
+ next = advance();
+ if (identical(quoteChar, next)) {
+ // Multiline string.
+ return tokenizeMultiLineString(quoteChar, start, raw);
+ } else {
+ // Empty string.
+ appendSubstringToken(STRING_INFO, start, true);
+ return next;
+ }
+ }
+ if (raw) {
+ return tokenizeSingleLineRawString(next, quoteChar, start);
+ } else {
+ return tokenizeSingleLineString(next, quoteChar, start);
+ }
+ }
+
+ /**
+ * [next] is the first character after the quote.
+ * [start] is the scanOffset of the quote.
+ *
+ * The token contains a substring of the source file, including the
+ * string quotes, backslashes for escaping. For interpolated strings,
+ * the parts before and after are separate tokens.
+ *
+ * "a $b c"
+ *
+ * gives StringToken("a $), StringToken(b) and StringToken( c").
+ */
+ int tokenizeSingleLineString(int next, int quoteChar, int start) {
+ bool asciiOnly = true;
+ while (!identical(next, quoteChar)) {
+ if (identical(next, $BACKSLASH)) {
+ next = advance();
+ } else if (identical(next, $$)) {
+ if (!asciiOnly) handleUnicode(start);
+ next = tokenizeStringInterpolation(start, asciiOnly);
+ start = scanOffset;
+ asciiOnly = true;
+ continue;
+ }
+ if (next <= $CR &&
+ (identical(next, $LF) ||
+ identical(next, $CR) ||
+ identical(next, $EOF))) {
+ if (!asciiOnly) handleUnicode(start);
+ return unterminatedString(quoteChar);
+ }
+ if (next > 127) asciiOnly = false;
+ next = advance();
+ }
+ if (!asciiOnly) handleUnicode(start);
+ // Advance past the quote character.
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
+ return next;
+ }
+
+ int tokenizeStringInterpolation(int start, bool asciiOnly) {
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
+ beginToken(); // $ starts here.
+ int next = advance();
+ if (identical(next, $OPEN_CURLY_BRACKET)) {
+ return tokenizeInterpolatedExpression(next);
+ } else {
+ return tokenizeInterpolatedIdentifier(next);
+ }
+ }
+
+ int tokenizeInterpolatedExpression(int next) {
+ appendBeginGroup(STRING_INTERPOLATION_INFO);
+ beginToken(); // The expression starts here.
+ next = advance(); // Move past the curly bracket.
+ while (!identical(next, $EOF) && !identical(next, $STX)) {
+ next = bigSwitch(next);
+ }
+ if (identical(next, $EOF)) return next;
+ next = advance(); // Move past the $STX.
+ beginToken(); // The string interpolation suffix starts here.
+ return next;
+ }
+
+ int tokenizeInterpolatedIdentifier(int next) {
+ appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
+
+ if ($a <= next && next <= $z) {
+ beginToken(); // The identifier starts here.
+ next = tokenizeKeywordOrIdentifier(next, false);
+ } else if (($A <= next && next <= $Z) || identical(next, $_)) {
+ beginToken(); // The identifier starts here.
+ next = tokenizeIdentifier(next, scanOffset, false);
+ } else {
+ unterminated(r'$', shouldAdvance: false);
+ }
+ beginToken(); // The string interpolation suffix starts here.
+ return next;
+ }
+
+ int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
+ bool asciiOnly = true;
+ while (next != $EOF) {
+ if (identical(next, quoteChar)) {
+ if (!asciiOnly) handleUnicode(start);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
+ return next;
+ } else if (identical(next, $LF) || identical(next, $CR)) {
+ if (!asciiOnly) handleUnicode(start);
+ return unterminatedRawString(quoteChar);
+ } else if (next > 127) {
+ asciiOnly = false;
+ }
+ next = advance();
+ }
+ if (!asciiOnly) handleUnicode(start);
+ return unterminatedRawString(quoteChar);
+ }
+
+ int tokenizeMultiLineRawString(int quoteChar, int start) {
+ bool asciiOnlyString = true;
+ bool asciiOnlyLine = true;
+ int unicodeStart = start;
+ int next = advance(); // Advance past the (last) quote (of three).
+ outer:
+ while (!identical(next, $EOF)) {
+ while (!identical(next, quoteChar)) {
+ if (identical(next, $LF)) {
+ if (!asciiOnlyLine) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLine = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ } else if (next > 127) {
+ asciiOnlyLine = false;
+ asciiOnlyString = false;
+ }
+ next = advance();
+ if (identical(next, $EOF)) break outer;
+ }
+ next = advance();
+ if (identical(next, quoteChar)) {
+ next = advance();
+ if (identical(next, quoteChar)) {
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);
+ return next;
+ }
+ }
+ }
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ return unterminatedRawMultiLineString(quoteChar);
+ }
+
+ int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
+ if (raw) return tokenizeMultiLineRawString(quoteChar, start);
+ bool asciiOnlyString = true;
+ bool asciiOnlyLine = true;
+ int unicodeStart = start;
+ int next = advance(); // Advance past the (last) quote (of three).
+ while (!identical(next, $EOF)) {
+ if (identical(next, $$)) {
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = tokenizeStringInterpolation(start, asciiOnlyString);
+ start = scanOffset;
+ unicodeStart = start;
+ asciiOnlyString = true; // A new string token is created for the rest.
+ asciiOnlyLine = true;
+ continue;
+ }
+ if (identical(next, quoteChar)) {
+ next = advance();
+ if (identical(next, quoteChar)) {
+ next = advance();
+ if (identical(next, quoteChar)) {
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);
+ return next;
+ }
+ }
+ continue;
+ }
+ if (identical(next, $BACKSLASH)) {
+ next = advance();
+ if (identical(next, $EOF)) break;
+ }
+ if (identical(next, $LF)) {
+ if (!asciiOnlyLine) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLine = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ } else if (next > 127) {
+ asciiOnlyString = false;
+ asciiOnlyLine = false;
+ }
+ next = advance();
+ }
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ return unterminatedMultiLineString(quoteChar);
+ }
+
+ int unexpected(int character) {
+ appendErrorToken(new BadInputToken(character, tokenStart));
+ return advanceAfterError(true);
+ }
+
+ int unterminated(String prefix, {bool shouldAdvance: true}) {
+ appendErrorToken(new UnterminatedToken(prefix, tokenStart, stringOffset));
+ return advanceAfterError(shouldAdvance);
+ }
+
+ int unterminatedString(int quoteChar) {
+ return unterminated(new String.fromCharCodes([quoteChar]));
+ }
+
+ int unterminatedRawString(int quoteChar) {
+ return unterminated('r${new String.fromCharCodes([quoteChar])}');
+ }
+
+ int unterminatedMultiLineString(int quoteChar) {
+ return unterminated(
+ new String.fromCharCodes([quoteChar, quoteChar, quoteChar]));
+ }
+
+ int unterminatedRawMultiLineString(int quoteChar) {
+ return unterminated(
+ 'r${new String.fromCharCodes([quoteChar, quoteChar, quoteChar])}');
+ }
+
+ int advanceAfterError(bool shouldAdvance) {
+ if (atEndOfFile()) return $EOF;
+ if (shouldAdvance) {
+ return advance(); // Ensure progress.
+ } else {
+ return -1;
+ }
+ }
+
+ void unmatchedBeginGroup(BeginGroupToken begin) {
+ // We want to ensure that unmatched BeginGroupTokens are reported as
+ // errors. However, the diet parser assumes that groups are well-balanced
+ // and will never look at the endGroup token. This is a nice property that
+ // allows us to skip quickly over correct code. By inserting an additional
+ // synthetic token in the stream, we can keep ignoring endGroup tokens.
+ //
+ // [begin] --next--> [tail]
+ // [begin] --endG--> [synthetic] --next--> [next] --next--> [tail]
+ //
+ // This allows the diet parser to skip from [begin] via endGroup to
+ // [synthetic] and ignore the [synthetic] token (assuming it's correct),
+ // then the error will be reported when parsing the [next] token.
+ //
+ // For example, tokenize("{[1};") produces:
+ //
+ // SymbolToken({) --endGroup-----+
+ // | |
+ // next |
+ // v |
+ // SymbolToken([) --endGroup--+ |
+ // | | |
+ // next | |
+ // v | |
+ // StringToken(1) | |
+ // | v |
+ // next SymbolToken(]) | <- Synthetic token.
+ // | | |
+ // | next |
+ // v | |
+ // UnmatchedToken([)<---------+ |
+ // | |
+ // next |
+ // v |
+ // SymbolToken(})<---------------+
+ // |
+ // next
+ // v
+ // SymbolToken(;)
+ // |
+ // next
+ // v
+ // EOF
+ Token synthetic =
+ new SymbolToken(closeBraceInfoFor(begin), begin.charOffset);
+ UnmatchedToken next = new UnmatchedToken(begin);
+ begin.endGroup = synthetic;
+ synthetic.next = next;
+ appendErrorToken(next);
+ }
+}
+
+PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
+ return const {
+ '(': CLOSE_PAREN_INFO,
+ '[': CLOSE_SQUARE_BRACKET_INFO,
+ '{': CLOSE_CURLY_BRACKET_INFO,
+ '<': GT_INFO,
+ r'${': CLOSE_CURLY_BRACKET_INFO,
+ }[begin.value];
+}
« no previous file with comments | « pkg/dart_scanner/lib/io.dart ('k') | pkg/dart_scanner/lib/src/array_based_scanner.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698