| Index: pkg/dart_scanner/lib/src/abstract_scanner.dart
|
| diff --git a/pkg/dart_scanner/lib/src/abstract_scanner.dart b/pkg/dart_scanner/lib/src/abstract_scanner.dart
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..f0698611482b302293d982b6bccd4f5b91af214a
|
| --- /dev/null
|
| +++ b/pkg/dart_scanner/lib/src/abstract_scanner.dart
|
| @@ -0,0 +1,1187 @@
|
| +// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
|
| +// for details. All rights reserved. Use of this source code is governed by a
|
| +// BSD-style license that can be found in the LICENSE file.
|
| +
|
| +library dart2js.scanner;
|
| +
|
| +import '../io/source_file.dart' show SourceFile, Utf8BytesSourceFile;
|
| +import '../tokens/keyword.dart' show Keyword, KeywordState;
|
| +import '../tokens/precedence.dart';
|
| +import '../tokens/precedence_constants.dart';
|
| +import '../tokens/token.dart';
|
| +import '../tokens/token_constants.dart';
|
| +import '../util/characters.dart';
|
| +import 'string_scanner.dart' show StringScanner;
|
| +import 'utf8_bytes_scanner.dart' show Utf8BytesScanner;
|
| +
|
| +abstract class Scanner {
|
| + Token tokenize();
|
| +
|
| + factory Scanner(SourceFile file, {bool includeComments: false}) {
|
| + if (file is Utf8BytesSourceFile) {
|
| + return new Utf8BytesScanner(file, includeComments: includeComments);
|
| + } else {
|
| + return new StringScanner(file, includeComments: includeComments);
|
| + }
|
| + }
|
| +}
|
| +
|
| +abstract class AbstractScanner implements Scanner {
|
| + // TODO(ahe): Move this class to implementation.
|
| +
|
| + final bool includeComments;
|
| +
|
| + /**
|
| + * The string offset for the next token that will be created.
|
| + *
|
| + * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values
|
| + * are different. One string character can be encoded using multiple UTF-8
|
| + * bytes.
|
| + */
|
| + int tokenStart = -1;
|
| +
|
| + /**
|
| + * A pointer to the token stream created by this scanner. The first token
|
| + * is a special token and not part of the source file. This is an
|
| + * implementation detail to avoid special cases in the scanner. This token
|
| + * is not exposed to clients of the scanner, which are expected to invoke
|
| + * [firstToken] to access the token stream.
|
| + */
|
| + final Token tokens = new SymbolToken(EOF_INFO, -1);
|
| +
|
| + /**
|
| + * A pointer to the last scanned token.
|
| + */
|
| + Token tail;
|
| +
|
| + /**
|
| + * The source file that is being scanned. This field can be [:null:].
|
| + * If the source file is available, the scanner assigns its [:lineStarts:] and
|
| + * [:length:] fields at the end of [tokenize].
|
| + */
|
| + final SourceFile file;
|
| +
|
| + final List<int> lineStarts = <int>[0];
|
| +
|
| + AbstractScanner(this.file, this.includeComments) {
|
| + this.tail = this.tokens;
|
| + }
|
| +
|
| + /**
|
| + * Advances and returns the next character.
|
| + *
|
| + * If the next character is non-ASCII, then the returned value depends on the
|
| + * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
|
| + * the [StringScanner] returns a UTF-16 code unit.
|
| + *
|
| + * The scanner ensures that [advance] is not invoked after it has returned [$EOF].
|
| + * This allows implementations to omit bound checks if the data structure ends
|
| + * with '0'.
|
| + */
|
| + int advance();
|
| +
|
| + /**
|
| + * Returns the current unicode character.
|
| + *
|
| + * If the current character is ASCII, then it is returned unchanged.
|
| + *
|
| + * The [Utf8BytesScanner] decodes the next unicode code point starting at the
|
| + * current position. Note that every unicode character is returned as a single
|
| + * code point, that is, for '\u{1d11e}' it returns 119070, and the following
|
| + * [advance] returns the next character.
|
| + *
|
| + * The [StringScanner] returns the current character unchanged, which might
|
| + * be a surrogate character. In the case of '\u{1d11e}', it returns the first
|
| + * code unit 55348, and the following [advance] returns the second code unit
|
| + * 56606.
|
| + *
|
| + * Invoking [currentAsUnicode] multiple times is safe, i.e.,
|
| + * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
|
| + */
|
| + int currentAsUnicode(int next);
|
| +
|
| + /**
|
| + * Returns the character at the next position. Like in [advance], the
|
| + * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
|
| + * a UTF-16 code unit.
|
| + */
|
| + int peek();
|
| +
|
| + /**
|
| + * Notifies the scanner that unicode characters were detected in either a
|
| + * comment or a string literal between [startScanOffset] and the current
|
| + * scan offset.
|
| + */
|
| + void handleUnicode(int startScanOffset);
|
| +
|
| + /**
|
| + * Returns the current scan offset.
|
| + *
|
| + * In the [Utf8BytesScanner] this is the offset into the byte list, in the
|
| + * [StringScanner] the offset in the source string.
|
| + */
|
| + int get scanOffset;
|
| +
|
| + /**
|
| + * Returns the current string offset.
|
| + *
|
| + * In the [StringScanner] this is identical to the [scanOffset]. In the
|
| + * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
|
| + */
|
| + int get stringOffset;
|
| +
|
| + /**
|
| + * Returns the first token scanned by this [Scanner].
|
| + */
|
| + Token firstToken();
|
| +
|
| + /**
|
| + * Returns the last token scanned by this [Scanner].
|
| + */
|
| + Token previousToken();
|
| +
|
| + /**
|
| + * Notifies that a new token starts at the current offset.
|
| + */
|
| + void beginToken() {
|
| + tokenStart = stringOffset;
|
| + }
|
| +
|
| + /**
|
| + * Appends a substring from the scan offset [:start:] to the current
|
| + * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
|
| + * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
|
| + * substring [5,9).
|
| + *
|
| + * Note that [extraOffset] can only be used if the covered character(s) are
|
| + * known to be ASCII.
|
| + */
|
| + void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
|
| + [int extraOffset]);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendPrecedenceToken(PrecedenceInfo info);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendKeywordToken(Keyword keyword);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendEofToken();
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendWhiteSpace(int next);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void lineFeedInMultiline();
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendBeginGroup(PrecedenceInfo info);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + int appendEndGroup(PrecedenceInfo info, int openKind);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendGt(PrecedenceInfo info);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendGtGt(PrecedenceInfo info);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void appendComment(start, bool asciiOnly);
|
| +
|
| + /// Append [token] to the token stream.
|
| + void appendErrorToken(ErrorToken token);
|
| +
|
| + /** Documentation in subclass [ArrayBasedScanner]. */
|
| + void discardOpenLt();
|
| +
|
| + /// Return true when at EOF.
|
| + bool atEndOfFile();
|
| +
|
| + Token tokenize() {
|
| + while (!atEndOfFile()) {
|
| + int next = advance();
|
| + while (!identical(next, $EOF)) {
|
| + next = bigSwitch(next);
|
| + }
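|
| + // [advance] also returns $EOF for a stray NUL character in the source, so
|
| + // check whether the real end of the file has been reached before appending
|
| + // the EOF token; otherwise report the character and keep scanning.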
|
| + if (atEndOfFile()) {
|
| + appendEofToken();
|
| + } else {
|
| + unexpected($EOF);
|
| + }
|
| + }
|
| +
|
| + if (file != null) {
|
| + file.length = stringOffset;
|
| + // One additional line start at the end, see [SourceFile.lineStarts].
|
| + lineStarts.add(stringOffset + 1);
|
| + file.lineStarts = lineStarts;
|
| + }
|
| +
|
| + return firstToken();
|
| + }
|
| +
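|
| + // Scans past the token (or whitespace) starting with [next], appending any
|
| + // resulting tokens to the stream, and returns the first character that has
|
| + // not been consumed yet.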
|
| + int bigSwitch(int next) {
|
| + beginToken();
|
| + if (identical(next, $SPACE) ||
|
| + identical(next, $TAB) ||
|
| + identical(next, $LF) ||
|
| + identical(next, $CR)) {
|
| + appendWhiteSpace(next);
|
| + next = advance();
|
| + // Sequences of spaces are common, so advance through them fast.
|
| + while (identical(next, $SPACE)) {
|
| + // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
|
| + // assuming that it does not do anything for space characters.
|
| + next = advance();
|
| + }
|
| + return next;
|
| + }
|
| +
|
| + if ($a <= next && next <= $z) {
|
| + if (identical($r, next)) {
|
| + return tokenizeRawStringKeywordOrIdentifier(next);
|
| + }
|
| + return tokenizeKeywordOrIdentifier(next, true);
|
| + }
|
| +
|
| + if (($A <= next && next <= $Z) ||
|
| + identical(next, $_) ||
|
| + identical(next, $$)) {
|
| + return tokenizeIdentifier(next, scanOffset, true);
|
| + }
|
| +
|
| + if (identical(next, $LT)) {
|
| + return tokenizeLessThan(next);
|
| + }
|
| +
|
| + if (identical(next, $GT)) {
|
| + return tokenizeGreaterThan(next);
|
| + }
|
| +
|
| + if (identical(next, $EQ)) {
|
| + return tokenizeEquals(next);
|
| + }
|
| +
|
| + if (identical(next, $BANG)) {
|
| + return tokenizeExclamation(next);
|
| + }
|
| +
|
| + if (identical(next, $PLUS)) {
|
| + return tokenizePlus(next);
|
| + }
|
| +
|
| + if (identical(next, $MINUS)) {
|
| + return tokenizeMinus(next);
|
| + }
|
| +
|
| + if (identical(next, $STAR)) {
|
| + return tokenizeMultiply(next);
|
| + }
|
| +
|
| + if (identical(next, $PERCENT)) {
|
| + return tokenizePercent(next);
|
| + }
|
| +
|
| + if (identical(next, $AMPERSAND)) {
|
| + return tokenizeAmpersand(next);
|
| + }
|
| +
|
| + if (identical(next, $BAR)) {
|
| + return tokenizeBar(next);
|
| + }
|
| +
|
| + if (identical(next, $CARET)) {
|
| + return tokenizeCaret(next);
|
| + }
|
| +
|
| + if (identical(next, $OPEN_SQUARE_BRACKET)) {
|
| + return tokenizeOpenSquareBracket(next);
|
| + }
|
| +
|
| + if (identical(next, $TILDE)) {
|
| + return tokenizeTilde(next);
|
| + }
|
| +
|
| + if (identical(next, $BACKSLASH)) {
|
| + appendPrecedenceToken(BACKSLASH_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $HASH)) {
|
| + return tokenizeTag(next);
|
| + }
|
| +
|
| + if (identical(next, $OPEN_PAREN)) {
|
| + appendBeginGroup(OPEN_PAREN_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $CLOSE_PAREN)) {
|
| + return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
|
| + }
|
| +
|
| + if (identical(next, $COMMA)) {
|
| + appendPrecedenceToken(COMMA_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $COLON)) {
|
| + appendPrecedenceToken(COLON_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $SEMICOLON)) {
|
| + appendPrecedenceToken(SEMICOLON_INFO);
|
| + // Type parameters and arguments cannot contain a semicolon.
|
| + discardOpenLt();
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $QUESTION)) {
|
| + return tokenizeQuestion(next);
|
| + }
|
| +
|
| + if (identical(next, $CLOSE_SQUARE_BRACKET)) {
|
| + return appendEndGroup(
|
| + CLOSE_SQUARE_BRACKET_INFO, OPEN_SQUARE_BRACKET_TOKEN);
|
| + }
|
| +
|
| + if (identical(next, $BACKPING)) {
|
| + appendPrecedenceToken(BACKPING_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $OPEN_CURLY_BRACKET)) {
|
| + appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + if (identical(next, $CLOSE_CURLY_BRACKET)) {
|
| + return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, OPEN_CURLY_BRACKET_TOKEN);
|
| + }
|
| +
|
| + if (identical(next, $SLASH)) {
|
| + return tokenizeSlashOrComment(next);
|
| + }
|
| +
|
| + if (identical(next, $AT)) {
|
| + return tokenizeAt(next);
|
| + }
|
| +
|
| + if (identical(next, $DQ) || identical(next, $SQ)) {
|
| + return tokenizeString(next, scanOffset, false);
|
| + }
|
| +
|
| + if (identical(next, $PERIOD)) {
|
| + return tokenizeDotsOrNumber(next);
|
| + }
|
| +
|
| + if (identical(next, $0)) {
|
| + return tokenizeHexOrNumber(next);
|
| + }
|
| +
|
| + // TODO(ahe): Would a range check be faster?
|
| + if (identical(next, $1) ||
|
| + identical(next, $2) ||
|
| + identical(next, $3) ||
|
| + identical(next, $4) ||
|
| + identical(next, $5) ||
|
| + identical(next, $6) ||
|
| + identical(next, $7) ||
|
| + identical(next, $8) ||
|
| + identical(next, $9)) {
|
| + return tokenizeNumber(next);
|
| + }
|
| +
|
| + if (identical(next, $EOF)) {
|
| + return $EOF;
|
| + }
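|
| + // Any other character below 0x1f is an ASCII control character and cannot
|
| + // start a token.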
|
| + if (next < 0x1f) {
|
| + return unexpected(next);
|
| + }
|
| +
|
| + next = currentAsUnicode(next);
|
| +
|
| + // The following are non-ASCII characters.
|
| +
|
| + if (identical(next, $NBSP)) {
|
| + appendWhiteSpace(next);
|
| + return advance();
|
| + }
|
| +
|
| + return unexpected(next);
|
| + }
|
| +
|
| + int tokenizeTag(int next) {
|
| + // # or #!.*[\n\r]
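|
| + // A '#!' at the very start of the file (a shebang line) is skipped without
|
| + // producing a token; any other '#' becomes a HASH token.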
|
| + if (scanOffset == 0) {
|
| + if (identical(peek(), $BANG)) {
|
| + int start = scanOffset + 1;
|
| + bool asciiOnly = true;
|
| + do {
|
| + next = advance();
|
| + if (next > 127) asciiOnly = false;
|
| + } while (!identical(next, $LF) &&
|
| + !identical(next, $CR) &&
|
| + !identical(next, $EOF));
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return next;
|
| + }
|
| + }
|
| + appendPrecedenceToken(HASH_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + int tokenizeTilde(int next) {
|
| + // ~ ~/ ~/=
|
| + next = advance();
|
| + if (identical(next, $SLASH)) {
|
| + return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
|
| + } else {
|
| + appendPrecedenceToken(TILDE_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeOpenSquareBracket(int next) {
|
| + // [ [] []=
|
| + next = advance();
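|
| + // '[]' and '[]=' form a single token only directly after the 'operator'
|
| + // keyword or '#' (operator declarations and symbol literals); any other
|
| + // '[' opens a new group.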
|
| + if (identical(next, $CLOSE_SQUARE_BRACKET)) {
|
| + Token token = previousToken();
|
| + if (token is KeywordToken && token.keyword.syntax == 'operator' ||
|
| + token is SymbolToken && token.info == HASH_INFO) {
|
| + return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
|
| + }
|
| + }
|
| + appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeCaret(int next) {
|
| + // ^ ^=
|
| + return select($EQ, CARET_EQ_INFO, CARET_INFO);
|
| + }
|
| +
|
| + int tokenizeQuestion(int next) {
|
| + // ? ?. ?? ??=
|
| + next = advance();
|
| + if (identical(next, $QUESTION)) {
|
| + return select($EQ, QUESTION_QUESTION_EQ_INFO, QUESTION_QUESTION_INFO);
|
| + } else if (identical(next, $PERIOD)) {
|
| + appendPrecedenceToken(QUESTION_PERIOD_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(QUESTION_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeBar(int next) {
|
| + // | || |=
|
| + next = advance();
|
| + if (identical(next, $BAR)) {
|
| + appendPrecedenceToken(BAR_BAR_INFO);
|
| + return advance();
|
| + } else if (identical(next, $EQ)) {
|
| + appendPrecedenceToken(BAR_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(BAR_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeAmpersand(int next) {
|
| + // && &= &
|
| + next = advance();
|
| + if (identical(next, $AMPERSAND)) {
|
| + appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
|
| + return advance();
|
| + } else if (identical(next, $EQ)) {
|
| + appendPrecedenceToken(AMPERSAND_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(AMPERSAND_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizePercent(int next) {
|
| + // % %=
|
| + return select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);
|
| + }
|
| +
|
| + int tokenizeMultiply(int next) {
|
| + // * *=
|
| + return select($EQ, STAR_EQ_INFO, STAR_INFO);
|
| + }
|
| +
|
| + int tokenizeMinus(int next) {
|
| + // - -- -=
|
| + next = advance();
|
| + if (identical(next, $MINUS)) {
|
| + appendPrecedenceToken(MINUS_MINUS_INFO);
|
| + return advance();
|
| + } else if (identical(next, $EQ)) {
|
| + appendPrecedenceToken(MINUS_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(MINUS_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizePlus(int next) {
|
| + // + ++ +=
|
| + next = advance();
|
| + if (identical($PLUS, next)) {
|
| + appendPrecedenceToken(PLUS_PLUS_INFO);
|
| + return advance();
|
| + } else if (identical($EQ, next)) {
|
| + appendPrecedenceToken(PLUS_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(PLUS_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeExclamation(int next) {
|
| + // ! !=
|
| + // !== is kept for user-friendly error reporting.
|
| +
|
| + next = advance();
|
| + if (identical(next, $EQ)) {
|
| + return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
|
| + }
|
| + appendPrecedenceToken(BANG_INFO);
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeEquals(int next) {
|
| + // = == =>
|
| + // === is kept for user-friendly error reporting.
|
| +
|
| + // Type parameters and arguments cannot contain any token that
|
| + // starts with '='.
|
| + discardOpenLt();
|
| +
|
| + next = advance();
|
| + if (identical(next, $EQ)) {
|
| + return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
|
| + } else if (identical(next, $GT)) {
|
| + appendPrecedenceToken(FUNCTION_INFO);
|
| + return advance();
|
| + }
|
| + appendPrecedenceToken(EQ_INFO);
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeGreaterThan(int next) {
|
| + // > >= >> >>=
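|
| + // A plain '>' or '>>' goes through [appendGt]/[appendGtGt] rather than
|
| + // [appendPrecedenceToken] so that the subclass can close any pending '<'
|
| + // group opened for type arguments.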
|
| + next = advance();
|
| + if (identical($EQ, next)) {
|
| + appendPrecedenceToken(GT_EQ_INFO);
|
| + return advance();
|
| + } else if (identical($GT, next)) {
|
| + next = advance();
|
| + if (identical($EQ, next)) {
|
| + appendPrecedenceToken(GT_GT_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendGtGt(GT_GT_INFO);
|
| + return next;
|
| + }
|
| + } else {
|
| + appendGt(GT_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeLessThan(int next) {
|
| + // < <= << <<=
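|
| + // A single '<' is appended with [appendBeginGroup] because it may open a
|
| + // type-argument list; [discardOpenLt] cancels the group when a token that
|
| + // cannot occur inside type arguments is seen.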
|
| + next = advance();
|
| + if (identical($EQ, next)) {
|
| + appendPrecedenceToken(LT_EQ_INFO);
|
| + return advance();
|
| + } else if (identical($LT, next)) {
|
| + return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
|
| + } else {
|
| + appendBeginGroup(LT_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeNumber(int next) {
|
| + int start = scanOffset;
|
| + while (true) {
|
| + next = advance();
|
| + if ($0 <= next && next <= $9) {
|
| + continue;
|
| + } else if (identical(next, $e) || identical(next, $E)) {
|
| + return tokenizeFractionPart(next, start);
|
| + } else {
|
| + if (identical(next, $PERIOD)) {
|
| + int nextnext = peek();
|
| + if ($0 <= nextnext && nextnext <= $9) {
|
| + return tokenizeFractionPart(advance(), start);
|
| + }
|
| + }
|
| + appendSubstringToken(INT_INFO, start, true);
|
| + return next;
|
| + }
|
| + }
|
| + return null;
|
| + }
|
| +
|
| + int tokenizeHexOrNumber(int next) {
|
| + int x = peek();
|
| + if (identical(x, $x) || identical(x, $X)) {
|
| + return tokenizeHex(next);
|
| + }
|
| + return tokenizeNumber(next);
|
| + }
|
| +
|
| + int tokenizeHex(int next) {
|
| + int start = scanOffset;
|
| + next = advance(); // Advance past the $x or $X.
|
| + bool hasDigits = false;
|
| + while (true) {
|
| + next = advance();
|
| + if (($0 <= next && next <= $9) ||
|
| + ($A <= next && next <= $F) ||
|
| + ($a <= next && next <= $f)) {
|
| + hasDigits = true;
|
| + } else {
|
| + if (!hasDigits) {
|
| + unterminated('0x', shouldAdvance: false);
|
| + return next;
|
| + }
|
| + appendSubstringToken(HEXADECIMAL_INFO, start, true);
|
| + return next;
|
| + }
|
| + }
|
| + return null;
|
| + }
|
| +
|
| + int tokenizeDotsOrNumber(int next) {
|
| + int start = scanOffset;
|
| + next = advance();
|
| + if (($0 <= next && next <= $9)) {
|
| + return tokenizeFractionPart(next, start);
|
| + } else if (identical($PERIOD, next)) {
|
| + return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
|
| + } else {
|
| + appendPrecedenceToken(PERIOD_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeFractionPart(int next, int start) {
|
| + bool done = false;
|
| + bool hasDigit = false;
|
| + LOOP:
|
| + while (!done) {
|
| + if ($0 <= next && next <= $9) {
|
| + hasDigit = true;
|
| + } else if (identical($e, next) || identical($E, next)) {
|
| + hasDigit = true;
|
| + next = advance();
|
| + if (identical(next, $PLUS) || identical(next, $MINUS)) {
|
| + next = advance();
|
| + }
|
| + bool hasExponentDigits = false;
|
| + while (true) {
|
| + if ($0 <= next && next <= $9) {
|
| + hasExponentDigits = true;
|
| + } else {
|
| + if (!hasExponentDigits) {
|
| + unterminated('1e', shouldAdvance: false);
|
| + return next;
|
| + }
|
| + break;
|
| + }
|
| + next = advance();
|
| + }
|
| +
|
| + done = true;
|
| + continue LOOP;
|
| + } else {
|
| + done = true;
|
| + continue LOOP;
|
| + }
|
| + next = advance();
|
| + }
|
| + if (!hasDigit) {
|
| + // Reduce the offset: we have already advanced past the period.
|
| + appendSubstringToken(INT_INFO, start, true, -1);
|
| +
|
| + // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
|
| + // the scanner already advanced past the period.
|
| + if (identical($PERIOD, next)) {
|
| + return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
|
| + }
|
| + appendPrecedenceToken(PERIOD_INFO);
|
| + return next;
|
| + }
|
| + appendSubstringToken(DOUBLE_INFO, start, true);
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeSlashOrComment(int next) {
|
| + int start = scanOffset;
|
| + next = advance();
|
| + if (identical($STAR, next)) {
|
| + return tokenizeMultiLineComment(next, start);
|
| + } else if (identical($SLASH, next)) {
|
| + return tokenizeSingleLineComment(next, start);
|
| + } else if (identical($EQ, next)) {
|
| + appendPrecedenceToken(SLASH_EQ_INFO);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(SLASH_INFO);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + int tokenizeSingleLineComment(int next, int start) {
|
| + bool asciiOnly = true;
|
| + while (true) {
|
| + next = advance();
|
| + if (next > 127) asciiOnly = false;
|
| + if (identical($LF, next) ||
|
| + identical($CR, next) ||
|
| + identical($EOF, next)) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + appendComment(start, asciiOnly);
|
| + return next;
|
| + }
|
| + }
|
| + return null;
|
| + }
|
| +
|
| + int tokenizeMultiLineComment(int next, int start) {
|
| + bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
|
| + bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
|
| + int unicodeStart = start;
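|
| + // Dart multi-line comments nest, so track the current nesting depth.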
|
| + int nesting = 1;
|
| + next = advance();
|
| + while (true) {
|
| + if (identical($EOF, next)) {
|
| + if (!asciiOnlyLines) handleUnicode(unicodeStart);
|
| + unterminated('/*');
|
| + break;
|
| + } else if (identical($STAR, next)) {
|
| + next = advance();
|
| + if (identical($SLASH, next)) {
|
| + --nesting;
|
| + if (0 == nesting) {
|
| + if (!asciiOnlyLines) handleUnicode(unicodeStart);
|
| + next = advance();
|
| + appendComment(start, asciiOnlyComment);
|
| + break;
|
| + } else {
|
| + next = advance();
|
| + }
|
| + }
|
| + } else if (identical($SLASH, next)) {
|
| + next = advance();
|
| + if (identical($STAR, next)) {
|
| + next = advance();
|
| + ++nesting;
|
| + }
|
| + } else if (identical(next, $LF)) {
|
| + if (!asciiOnlyLines) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLines = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + next = advance();
|
| + } else {
|
| + if (next > 127) {
|
| + asciiOnlyLines = false;
|
| + asciiOnlyComment = false;
|
| + }
|
| + next = advance();
|
| + }
|
| + }
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeRawStringKeywordOrIdentifier(int next) {
|
| + // [next] is $r.
|
| + int nextnext = peek();
|
| + if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
|
| + int start = scanOffset;
|
| + next = advance();
|
| + return tokenizeString(next, start, true);
|
| + }
|
| + return tokenizeKeywordOrIdentifier(next, true);
|
| + }
|
| +
|
| + int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
|
| + KeywordState state = KeywordState.KEYWORD_STATE;
|
| + int start = scanOffset;
|
| + while (state != null && $a <= next && next <= $z) {
|
| + state = state.next(next);
|
| + next = advance();
|
| + }
|
| + if (state == null || state.keyword == null) {
|
| + return tokenizeIdentifier(next, start, allowDollar);
|
| + }
|
| + if (($A <= next && next <= $Z) ||
|
| + ($0 <= next && next <= $9) ||
|
| + identical(next, $_) ||
|
| + identical(next, $$)) {
|
| + return tokenizeIdentifier(next, start, allowDollar);
|
| + } else {
|
| + appendKeywordToken(state.keyword);
|
| + return next;
|
| + }
|
| + }
|
| +
|
| + /**
|
| + * [allowDollar] can exclude '$', which is not allowed as part of a string
|
| + * interpolation identifier.
|
| + */
|
| + int tokenizeIdentifier(int next, int start, bool allowDollar) {
|
| + while (true) {
|
| + if (($a <= next && next <= $z) ||
|
| + ($A <= next && next <= $Z) ||
|
| + ($0 <= next && next <= $9) ||
|
| + identical(next, $_) ||
|
| + (identical(next, $$) && allowDollar)) {
|
| + next = advance();
|
| + } else {
|
| + // Identifier ends here.
|
| + if (start == scanOffset) {
|
| + return unexpected(next);
|
| + } else {
|
| + appendSubstringToken(IDENTIFIER_INFO, start, true);
|
| + }
|
| + break;
|
| + }
|
| + }
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeAt(int next) {
|
| + appendPrecedenceToken(AT_INFO);
|
| + return advance();
|
| + }
|
| +
|
| + int tokenizeString(int next, int start, bool raw) {
|
| + int quoteChar = next;
|
| + next = advance();
|
| + if (identical(quoteChar, next)) {
|
| + next = advance();
|
| + if (identical(quoteChar, next)) {
|
| + // Multiline string.
|
| + return tokenizeMultiLineString(quoteChar, start, raw);
|
| + } else {
|
| + // Empty string.
|
| + appendSubstringToken(STRING_INFO, start, true);
|
| + return next;
|
| + }
|
| + }
|
| + if (raw) {
|
| + return tokenizeSingleLineRawString(next, quoteChar, start);
|
| + } else {
|
| + return tokenizeSingleLineString(next, quoteChar, start);
|
| + }
|
| + }
|
| +
|
| + /**
|
| + * [next] is the first character after the quote.
|
| + * [start] is the scanOffset of the quote.
|
| + *
|
| + * The token contains a substring of the source file, including the
|
| + * string quotes and the backslashes used for escaping. For interpolated
|
| + * strings, the parts before and after the interpolation are separate tokens.
|
| + *
|
| + * "a $b c"
|
| + *
|
| + * gives StringToken("a $), StringToken(b) and StringToken( c").
|
| + */
|
| + int tokenizeSingleLineString(int next, int quoteChar, int start) {
|
| + bool asciiOnly = true;
|
| + while (!identical(next, quoteChar)) {
|
| + if (identical(next, $BACKSLASH)) {
|
| + next = advance();
|
| + } else if (identical(next, $$)) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + next = tokenizeStringInterpolation(start, asciiOnly);
|
| + start = scanOffset;
|
| + asciiOnly = true;
|
| + continue;
|
| + }
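|
| + // $EOF, $LF and $CR are all <= $CR, so this cheap comparison filters out
|
| + // the common case before the individual checks.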
|
| + if (next <= $CR &&
|
| + (identical(next, $LF) ||
|
| + identical(next, $CR) ||
|
| + identical(next, $EOF))) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return unterminatedString(quoteChar);
|
| + }
|
| + if (next > 127) asciiOnly = false;
|
| + next = advance();
|
| + }
|
| + if (!asciiOnly) handleUnicode(start);
|
| + // Advance past the quote character.
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeStringInterpolation(int start, bool asciiOnly) {
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| + beginToken(); // $ starts here.
|
| + int next = advance();
|
| + if (identical(next, $OPEN_CURLY_BRACKET)) {
|
| + return tokenizeInterpolatedExpression(next);
|
| + } else {
|
| + return tokenizeInterpolatedIdentifier(next);
|
| + }
|
| + }
|
| +
|
| + int tokenizeInterpolatedExpression(int next) {
|
| + appendBeginGroup(STRING_INTERPOLATION_INFO);
|
| + beginToken(); // The expression starts here.
|
| + next = advance(); // Move past the curly bracket.
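|
| + // The subclass's [appendEndGroup] is expected to return $STX ('start of
|
| + // text') when it reaches the '}' that closes this interpolation, which
|
| + // terminates the loop below.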
|
| + while (!identical(next, $EOF) && !identical(next, $STX)) {
|
| + next = bigSwitch(next);
|
| + }
|
| + if (identical(next, $EOF)) return next;
|
| + next = advance(); // Move past the $STX.
|
| + beginToken(); // The string interpolation suffix starts here.
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeInterpolatedIdentifier(int next) {
|
| + appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
|
| +
|
| + if ($a <= next && next <= $z) {
|
| + beginToken(); // The identifier starts here.
|
| + next = tokenizeKeywordOrIdentifier(next, false);
|
| + } else if (($A <= next && next <= $Z) || identical(next, $_)) {
|
| + beginToken(); // The identifier starts here.
|
| + next = tokenizeIdentifier(next, scanOffset, false);
|
| + } else {
|
| + unterminated(r'$', shouldAdvance: false);
|
| + }
|
| + beginToken(); // The string interpolation suffix starts here.
|
| + return next;
|
| + }
|
| +
|
| + int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
|
| + bool asciiOnly = true;
|
| + while (next != $EOF) {
|
| + if (identical(next, quoteChar)) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| + return next;
|
| + } else if (identical(next, $LF) || identical(next, $CR)) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return unterminatedRawString(quoteChar);
|
| + } else if (next > 127) {
|
| + asciiOnly = false;
|
| + }
|
| + next = advance();
|
| + }
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return unterminatedRawString(quoteChar);
|
| + }
|
| +
|
| + int tokenizeMultiLineRawString(int quoteChar, int start) {
|
| + bool asciiOnlyString = true;
|
| + bool asciiOnlyLine = true;
|
| + int unicodeStart = start;
|
| + int next = advance(); // Advance past the (last) quote (of three).
|
| + outer:
|
| + while (!identical(next, $EOF)) {
|
| + while (!identical(next, quoteChar)) {
|
| + if (identical(next, $LF)) {
|
| + if (!asciiOnlyLine) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLine = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + } else if (next > 127) {
|
| + asciiOnlyLine = false;
|
| + asciiOnlyString = false;
|
| + }
|
| + next = advance();
|
| + if (identical(next, $EOF)) break outer;
|
| + }
|
| + next = advance();
|
| + if (identical(next, quoteChar)) {
|
| + next = advance();
|
| + if (identical(next, quoteChar)) {
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnlyString);
|
| + return next;
|
| + }
|
| + }
|
| + }
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + return unterminatedRawMultiLineString(quoteChar);
|
| + }
|
| +
|
| + int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
|
| + if (raw) return tokenizeMultiLineRawString(quoteChar, start);
|
| + bool asciiOnlyString = true;
|
| + bool asciiOnlyLine = true;
|
| + int unicodeStart = start;
|
| + int next = advance(); // Advance past the (last) quote (of three).
|
| + while (!identical(next, $EOF)) {
|
| + if (identical(next, $$)) {
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = tokenizeStringInterpolation(start, asciiOnlyString);
|
| + start = scanOffset;
|
| + unicodeStart = start;
|
| + asciiOnlyString = true; // A new string token is created for the rest.
|
| + asciiOnlyLine = true;
|
| + continue;
|
| + }
|
| + if (identical(next, quoteChar)) {
|
| + next = advance();
|
| + if (identical(next, quoteChar)) {
|
| + next = advance();
|
| + if (identical(next, quoteChar)) {
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnlyString);
|
| + return next;
|
| + }
|
| + }
|
| + continue;
|
| + }
|
| + if (identical(next, $BACKSLASH)) {
|
| + next = advance();
|
| + if (identical(next, $EOF)) break;
|
| + }
|
| + if (identical(next, $LF)) {
|
| + if (!asciiOnlyLine) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLine = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + } else if (next > 127) {
|
| + asciiOnlyString = false;
|
| + asciiOnlyLine = false;
|
| + }
|
| + next = advance();
|
| + }
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + return unterminatedMultiLineString(quoteChar);
|
| + }
|
| +
|
| + int unexpected(int character) {
|
| + appendErrorToken(new BadInputToken(character, tokenStart));
|
| + return advanceAfterError(true);
|
| + }
|
| +
|
| + int unterminated(String prefix, {bool shouldAdvance: true}) {
|
| + appendErrorToken(new UnterminatedToken(prefix, tokenStart, stringOffset));
|
| + return advanceAfterError(shouldAdvance);
|
| + }
|
| +
|
| + int unterminatedString(int quoteChar) {
|
| + return unterminated(new String.fromCharCodes([quoteChar]));
|
| + }
|
| +
|
| + int unterminatedRawString(int quoteChar) {
|
| + return unterminated('r${new String.fromCharCodes([quoteChar])}');
|
| + }
|
| +
|
| + int unterminatedMultiLineString(int quoteChar) {
|
| + return unterminated(
|
| + new String.fromCharCodes([quoteChar, quoteChar, quoteChar]));
|
| + }
|
| +
|
| + int unterminatedRawMultiLineString(int quoteChar) {
|
| + return unterminated(
|
| + 'r${new String.fromCharCodes([quoteChar, quoteChar, quoteChar])}');
|
| + }
|
| +
|
| + int advanceAfterError(bool shouldAdvance) {
|
| + if (atEndOfFile()) return $EOF;
|
| + if (shouldAdvance) {
|
| + return advance(); // Ensure progress.
|
| + } else {
|
| + return -1;
|
| + }
|
| + }
|
| +
|
| + void unmatchedBeginGroup(BeginGroupToken begin) {
|
| + // We want to ensure that unmatched BeginGroupTokens are reported as
|
| + // errors. However, the diet parser assumes that groups are well-balanced
|
| + // and will never look at the endGroup token. This is a nice property that
|
| + // allows us to skip quickly over correct code. By inserting an additional
|
| + // synthetic token in the stream, we can keep ignoring endGroup tokens.
|
| + //
|
| + // [begin] --next--> [tail]
|
| + // [begin] --endG--> [synthetic] --next--> [next] --next--> [tail]
|
| + //
|
| + // This allows the diet parser to skip from [begin] via endGroup to
|
| + // [synthetic] and ignore the [synthetic] token (assuming it's correct),
|
| + // then the error will be reported when parsing the [next] token.
|
| + //
|
| + // For example, tokenize("{[1};") produces:
|
| + //
|
| + // SymbolToken({) --endGroup-----+
|
| + // | |
|
| + // next |
|
| + // v |
|
| + // SymbolToken([) --endGroup--+ |
|
| + // | | |
|
| + // next | |
|
| + // v | |
|
| + // StringToken(1) | |
|
| + // | v |
|
| + // next SymbolToken(]) | <- Synthetic token.
|
| + // | | |
|
| + // | next |
|
| + // v | |
|
| + // UnmatchedToken([)<---------+ |
|
| + // | |
|
| + // next |
|
| + // v |
|
| + // SymbolToken(})<---------------+
|
| + // |
|
| + // next
|
| + // v
|
| + // SymbolToken(;)
|
| + // |
|
| + // next
|
| + // v
|
| + // EOF
|
| + Token synthetic =
|
| + new SymbolToken(closeBraceInfoFor(begin), begin.charOffset);
|
| + UnmatchedToken next = new UnmatchedToken(begin);
|
| + begin.endGroup = synthetic;
|
| + synthetic.next = next;
|
| + appendErrorToken(next);
|
| + }
|
| +}
|
| +
|
| +PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
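|
| + // An unmatched '<' (tentative type-argument group) is closed with '>', and
|
| + // an unmatched '${' with '}'.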
|
| + return const {
|
| + '(': CLOSE_PAREN_INFO,
|
| + '[': CLOSE_SQUARE_BRACKET_INFO,
|
| + '{': CLOSE_CURLY_BRACKET_INFO,
|
| + '<': GT_INFO,
|
| + r'${': CLOSE_CURLY_BRACKET_INFO,
|
| + }[begin.value];
|
| +}
|
|
|