| Index: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
|
| diff --git a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
|
| index 16c238433f068319c29ef81d7ec4e29d53a80eae..f80a3e4a12f17e7accc3a40ea7fd3474a15b7853 100644
|
| --- a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
|
| +++ b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
|
| @@ -6,75 +6,332 @@ part of scanner;
|
|
|
| abstract class Scanner {
|
| Token tokenize();
|
| +
|
| + factory Scanner(SourceFile file, {bool includeComments: false}) {
|
| + if (file is Utf8BytesSourceFile) {
|
| + return new Utf8BytesScanner(file, includeComments: includeComments);
|
| + } else {
|
| + return new StringScanner(file, includeComments: includeComments);
|
| + }
|
| + }
|
| }
|
|
|
| -/**
|
| - * Common base class for a Dart scanner.
|
| - */
|
| -abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| +abstract class AbstractScanner implements Scanner {
|
| + final bool includeComments;
|
| +
|
| + /**
|
| + * The string offset for the next token that will be created.
|
| + *
|
| + * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values
|
| + * are different. One string character can be encoded using multiple UTF-8
|
| + * bytes.
|
| + */
|
| + int tokenStart = -1;
|
| +
|
| + /**
|
| + * A pointer to the token stream created by this scanner. The first token
|
| + * is a special token and not part of the source file. This is an
|
| + * implementation detail to avoids special cases in the scanner. This token
|
| + * is not exposed to clients of the scanner, which are expected to invoke
|
| + * [firstToken] to access the token stream.
|
| + */
|
| + final Token tokens = new SymbolToken(EOF_INFO, -1);
|
| +
|
| + /**
|
| + * A pointer to the last scanned token.
|
| + */
|
| + Token tail;
|
| +
|
| + /**
|
| + * The stack of open groups, e.g [: { ... ( .. :]
|
| + * Each BeginGroupToken has a pointer to the token where the group
|
| + * ends. This field is set when scanning the end group token.
|
| + */
|
| + Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>();
|
| +
|
| + /**
|
| + * The source file that is being scanned. This field can be [:null:].
|
| + * If the source file is available, the scanner assigns its [:lineStarts:] and
|
| + * [:length:] fields at the end of [tokenize].
|
| + */
|
| + final SourceFile file;
|
| +
|
| + final List<int> lineStarts = [0];
|
| +
|
| + AbstractScanner(this.file, this.includeComments) {
|
| + this.tail = this.tokens;
|
| + }
|
| +
|
| +
|
| + /**
|
| + * Advances and returns the next character.
|
| + *
|
| + * If the next character is non-ASCII, then the returned value depends on the
|
| + * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
|
| + * the [StringScanner] returns a UTF-16 code unit.
|
| + *
|
| + * The scanner ensures that [advance] is not invoked after it returned [$EOF].
|
| + * This allows implementations to omit bound checks if the data structure ends
|
| + * with '0'.
|
| + */
|
| int advance();
|
| - int nextByte();
|
|
|
| /**
|
| - * Returns the current character or byte depending on the underlying input
|
| - * kind. For example, [StringScanner] operates on [String] and thus returns
|
| - * characters (Unicode codepoints represented as int) whereas
|
| - * [ByteArrayScanner] operates on byte arrays and thus returns bytes.
|
| + * Returns the current unicode character.
|
| + *
|
| + * If the current character is ASCII, then it is returned unchanged.
|
| + *
|
| + * The [Utf8BytesScanner] decodes the next unicode code point starting at the
|
| + * current position. Note that every unicode character is returned as a single
|
| + * code point, i.e., for '\u{1d11e}' it returns 119070, and the following
|
| + * [advance] returns the next character.
|
| + *
|
| + * The [StringScanner] returns the current character unchanged, which might
|
| + * be a surrogate character. In the case of '\u{1d11e}', it returns the first
|
| + * code unit 55348, and the following [advance] returns the second code unit
|
| + * 56606.
|
| + *
|
| + * Invoking [currentAsUnicode] multiple times is safe, i.e.,
|
| + * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
|
| + */
|
| + int currentAsUnicode(int next);
|
| +
|
| + /**
|
| + * Returns the character at the next poisition. Like in [advance], the
|
| + * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
|
| + * a UTF-16 code unit.
|
| */
|
| int peek();
|
|
|
| /**
|
| + * Notifies the scanner that unicode characters were detected in either a
|
| + * comment or a string literal between [startScanOffset] and the current
|
| + * scan offset.
|
| + */
|
| + void handleUnicode(int startScanOffset);
|
| +
|
| + /**
|
| + * Returns the current scan offset.
|
| + *
|
| + * In the [Utf8BytesScanner] this is the offset into the byte list, in the
|
| + * [StringScanner] the offset in the source string.
|
| + */
|
| + int get scanOffset;
|
| +
|
| + /**
|
| + * Returns the current string offset.
|
| + *
|
| + * In the [StringScanner] this is identical to the [scanOffset]. In the
|
| + * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
|
| + */
|
| + int get stringOffset;
|
| +
|
| + /**
|
| + * Returns the first token scanned by this [Scanner].
|
| + */
|
| + Token firstToken();
|
| +
|
| + /**
|
| + * Returns the last token scanned by this [Scanner].
|
| + */
|
| + Token previousToken();
|
| +
|
| + /**
|
| + * Notifies that a new token starts at current offset.
|
| + */
|
| + void beginToken() {
|
| + tokenStart = stringOffset;
|
| + }
|
| +
|
| + /**
|
| + * Appends a substring from the scan offset [:start:] to the current
|
| + * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
|
| + * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
|
| + * substring string [5,9).
|
| + *
|
| + * Note that [extraOffset] can only be used if the covered character(s) are
|
| + * known to be ASCII.
|
| + */
|
| + void appendSubstringToken(PrecedenceInfo info, int start,
|
| + bool asciiOnly, [int extraOffset]);
|
| +
|
| + /**
|
| + * Appends a token whose kind is determined by [info] and content is defined
|
| + * by the String [value].
|
| + *
|
| + * This method is invoked for class names, field names, method names, types,
|
| + * etc.
|
| + */
|
| + void appendStringToken(PrecedenceInfo info, String value) {
|
| + tail.next = new StringToken.fromString(info, value, tokenStart, true);
|
| + tail = tail.next;
|
| + }
|
| +
|
| + /**
|
| + * Appends a fixed token whose kind and content is determined by [info].
|
| + * Appends an *operator* token from [info].
|
| + *
|
| + * An operator token represent operators like ':', '.', ';', '&&', '==', '--',
|
| + * '=>', etc.
|
| + */
|
| + void appendPrecedenceToken(PrecedenceInfo info) {
|
| + tail.next = new SymbolToken(info, tokenStart);
|
| + tail = tail.next;
|
| + }
|
| +
|
| + /**
|
| * Appends a fixed token based on whether the current char is [choice] or not.
|
| * If the current char is [choice] a fixed token whose kind and content
|
| * is determined by [yes] is appended, otherwise a fixed token whose kind
|
| * and content is determined by [no] is appended.
|
| */
|
| - int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
|
| + int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) {
|
| + int next = advance();
|
| + if (identical(next, choice)) {
|
| + appendPrecedenceToken(yes);
|
| + return advance();
|
| + } else {
|
| + appendPrecedenceToken(no);
|
| + return next;
|
| + }
|
| + }
|
|
|
| /**
|
| - * Appends a fixed token whose kind and content is determined by [info].
|
| + * Appends a keyword token whose kind is determined by [keyword].
|
| + */
|
| + void appendKeywordToken(Keyword keyword) {
|
| + String syntax = keyword.syntax;
|
| + // Type parameters and arguments cannot contain 'this' or 'super'.
|
| + if (identical(syntax, 'this') || identical(syntax, 'super')) {
|
| + discardOpenLt();
|
| + }
|
| + tail.next = new KeywordToken(keyword, tokenStart);
|
| + tail = tail.next;
|
| + }
|
| +
|
| + void appendEofToken() {
|
| + beginToken();
|
| + tail.next = new SymbolToken(EOF_INFO, tokenStart);
|
| + tail = tail.next;
|
| + // EOF points to itself so there's always infinite look-ahead.
|
| + tail.next = tail;
|
| + discardOpenLt();
|
| + while (!groupingStack.isEmpty) {
|
| + unmatchedBeginGroup(groupingStack.head);
|
| + groupingStack = groupingStack.tail;
|
| + }
|
| + }
|
| +
|
| + /**
|
| + * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is
|
| + * not always invoked for [$SPACE] characters.
|
| + *
|
| + * This method is used by the scanners to track line breaks and create the
|
| + * [lineStarts] map.
|
| */
|
| - void appendPrecedenceToken(PrecedenceInfo info);
|
| + void appendWhiteSpace(int next) {
|
| + if (next == $LF && file != null) {
|
| + lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF.
|
| + }
|
| + }
|
|
|
| /**
|
| - * Appends a token whose kind is determined by [info] and content is [value].
|
| + * Notifies on [$LF] characters in multi-line commends or strings.
|
| + *
|
| + * This method is used by the scanners to track line breaks and create the
|
| + * [lineStarts] map.
|
| */
|
| - void appendStringToken(PrecedenceInfo info, String value);
|
| + void lineFeedInMultiline() {
|
| + if (file != null) {
|
| + lineStarts.add(stringOffset + 1);
|
| + }
|
| + }
|
|
|
| /**
|
| - * Appends a token whose kind is determined by [info] and content is defined
|
| - * by the SourceString [value].
|
| + * Appends a token that begins a new group, represented by [value].
|
| + * Group begin tokens are '{', '(', '[' and '${'.
|
| */
|
| - void appendByteStringToken(PrecedenceInfo info, T value);
|
| + void appendBeginGroup(PrecedenceInfo info) {
|
| + Token token = new BeginGroupToken(info, tokenStart);
|
| + tail.next = token;
|
| + tail = tail.next;
|
| +
|
| + // { ( [ ${ cannot appear inside a type parameters / arguments.
|
| + if (!identical(info.kind, LT_TOKEN)) discardOpenLt();
|
| + groupingStack = groupingStack.prepend(token);
|
| + }
|
|
|
| /**
|
| - * Appends a keyword token whose kind is determined by [keyword].
|
| + * Appends a token that begins a ends group, represented by [value].
|
| + * It handles the group end tokens '}', ')' and ']'. The tokens '>' and
|
| + * '>>' are handled separately bo [appendGt] and [appendGtGt].
|
| */
|
| - void appendKeywordToken(Keyword keyword);
|
| - void appendWhiteSpace(int next);
|
| - void appendEofToken();
|
| + int appendEndGroup(PrecedenceInfo info, int openKind) {
|
| + assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >>
|
| + appendPrecedenceToken(info);
|
| + // Don't report unmatched errors for <; it is also the less-than operator.
|
| + discardOpenLt();
|
| + if (groupingStack.isEmpty) {
|
| + return advance();
|
| + }
|
| + BeginGroupToken begin = groupingStack.head;
|
| + if (!identical(begin.kind, openKind)) {
|
| + if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) ||
|
| + !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) {
|
| + // Not ending string interpolation.
|
| + unmatchedBeginGroup(begin);
|
| + return advance();
|
| + }
|
| + // We're ending an interpolated expression.
|
| + begin.endGroup = tail;
|
| + groupingStack = groupingStack.tail;
|
| + // Using "start-of-text" to signal that we're back in string
|
| + // scanning mode.
|
| + return $STX;
|
| + }
|
| + begin.endGroup = tail;
|
| + groupingStack = groupingStack.tail;
|
| + return advance();
|
| + }
|
|
|
| /**
|
| - * Creates an ASCII SourceString whose content begins at the source byte
|
| - * offset [start] and ends at [offset] bytes from the current byte offset of
|
| - * the scanner. For example, if the current byte offset is 10,
|
| - * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found
|
| - * at the [0,9[ byte interval of the source text.
|
| + * Appends a token for '>'.
|
| + * This method does not issue unmatched errors, because > is also the
|
| + * greater-than operator. It does not necessarily have to close a group.
|
| */
|
| - T asciiString(int start, int offset);
|
| - T utf8String(int start, int offset);
|
| - Token firstToken();
|
| - Token previousToken();
|
| - void beginToken();
|
| - void addToCharOffset(int offset);
|
| - int get charOffset;
|
| - int get byteOffset;
|
| - void appendBeginGroup(PrecedenceInfo info, String value);
|
| - int appendEndGroup(PrecedenceInfo info, String value, int openKind);
|
| - void appendGt(PrecedenceInfo info, String value);
|
| - void appendGtGt(PrecedenceInfo info, String value);
|
| - void appendGtGtGt(PrecedenceInfo info, String value);
|
| - void appendComment();
|
| + void appendGt(PrecedenceInfo info) {
|
| + appendPrecedenceToken(info);
|
| + if (groupingStack.isEmpty) return;
|
| + if (identical(groupingStack.head.kind, LT_TOKEN)) {
|
| + groupingStack.head.endGroup = tail;
|
| + groupingStack = groupingStack.tail;
|
| + }
|
| + }
|
| +
|
| + /**
|
| + * Appends a token for '>>'.
|
| + * This method does not issue unmatched errors, because >> is also the
|
| + * shift operator. It does not necessarily have to close a group.
|
| + */
|
| + void appendGtGt(PrecedenceInfo info) {
|
| + appendPrecedenceToken(info);
|
| + if (groupingStack.isEmpty) return;
|
| + if (identical(groupingStack.head.kind, LT_TOKEN)) {
|
| + // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer
|
| + // '<', the inner '<' is left without endGroup.
|
| + groupingStack = groupingStack.tail;
|
| + }
|
| + if (groupingStack.isEmpty) return;
|
| + if (identical(groupingStack.head.kind, LT_TOKEN)) {
|
| + groupingStack.head.endGroup = tail;
|
| + groupingStack = groupingStack.tail;
|
| + }
|
| + }
|
| +
|
| + void appendComment(start, bool asciiOnly) {
|
| + if (!includeComments) return;
|
| + appendSubstringToken(COMMENT_INFO, start, asciiOnly);
|
| + }
|
|
|
| /**
|
| * We call this method to discard '<' from the "grouping" stack
|
| @@ -88,7 +345,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| * something which cannot possibly be part of a type
|
| * parameter/argument list.
|
| */
|
| - void discardOpenLt();
|
| + void discardOpenLt() {
|
| + while (!groupingStack.isEmpty
|
| + && identical(groupingStack.head.kind, LT_TOKEN)) {
|
| + groupingStack = groupingStack.tail;
|
| + }
|
| + }
|
|
|
| // TODO(ahe): Move this class to implementation.
|
|
|
| @@ -98,6 +360,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| next = bigSwitch(next);
|
| }
|
| appendEofToken();
|
| +
|
| + if (file != null) {
|
| + file.length = stringOffset;
|
| + // One additional line start at the end, see [SourceFile.lineStarts].
|
| + lineStarts.add(stringOffset + 1);
|
| + file.lineStarts = lineStarts;
|
| + }
|
| +
|
| return firstToken();
|
| }
|
|
|
| @@ -107,8 +377,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| || identical(next, $LF) || identical(next, $CR)) {
|
| appendWhiteSpace(next);
|
| next = advance();
|
| + // Sequences of spaces are common, so advance through them fast.
|
| while (identical(next, $SPACE)) {
|
| - appendWhiteSpace(next);
|
| + // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
|
| + // assuming that it does not do anything for space characters.
|
| next = advance();
|
| }
|
| return next;
|
| @@ -121,8 +393,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| return tokenizeKeywordOrIdentifier(next, true);
|
| }
|
|
|
| - if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$)) {
|
| - return tokenizeIdentifier(next, byteOffset, true);
|
| + if (($A <= next && next <= $Z) ||
|
| + identical(next, $_) ||
|
| + identical(next, $$)) {
|
| + return tokenizeIdentifier(next, scanOffset, true);
|
| }
|
|
|
| if (identical(next, $LT)) {
|
| @@ -187,12 +461,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| if (identical(next, $OPEN_PAREN)) {
|
| - appendBeginGroup(OPEN_PAREN_INFO, "(");
|
| + appendBeginGroup(OPEN_PAREN_INFO);
|
| return advance();
|
| }
|
|
|
| if (identical(next, $CLOSE_PAREN)) {
|
| - return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN);
|
| + return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
|
| }
|
|
|
| if (identical(next, $COMMA)) {
|
| @@ -218,7 +492,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| if (identical(next, $CLOSE_SQUARE_BRACKET)) {
|
| - return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]",
|
| + return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
|
| OPEN_SQUARE_BRACKET_TOKEN);
|
| }
|
|
|
| @@ -228,12 +502,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| if (identical(next, $OPEN_CURLY_BRACKET)) {
|
| - appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{");
|
| + appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
|
| return advance();
|
| }
|
|
|
| if (identical(next, $CLOSE_CURLY_BRACKET)) {
|
| - return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}",
|
| + return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
|
| OPEN_CURLY_BRACKET_TOKEN);
|
| }
|
|
|
| @@ -246,7 +520,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| if (identical(next, $DQ) || identical(next, $SQ)) {
|
| - return tokenizeString(next, byteOffset, false);
|
| + return tokenizeString(next, scanOffset, false);
|
| }
|
|
|
| if (identical(next, $PERIOD)) {
|
| @@ -268,7 +542,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| return $EOF;
|
| }
|
| if (next < 0x1f) {
|
| - return error(new SourceString("unexpected character $next"));
|
| + return error("unexpected character $next");
|
| + }
|
| +
|
| + if (next >= 128) {
|
| + next = currentAsUnicode(next);
|
| }
|
|
|
| // The following are non-ASCII characters.
|
| @@ -278,16 +556,22 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| return advance();
|
| }
|
|
|
| - return tokenizeIdentifier(next, byteOffset, true);
|
| + return error("unexpected unicode character $next");
|
| }
|
|
|
| int tokenizeTag(int next) {
|
| // # or #!.*[\n\r]
|
| - if (byteOffset == 0) {
|
| + if (scanOffset == 0) {
|
| if (identical(peek(), $BANG)) {
|
| + int start = scanOffset + 1;
|
| + bool asciiOnly = true;
|
| do {
|
| next = advance();
|
| - } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF));
|
| + if (next > 127) asciiOnly = false;
|
| + } while (!identical(next, $LF) &&
|
| + !identical(next, $CR) &&
|
| + !identical(next, $EOF));
|
| + if (!asciiOnly) handleUnicode(start);
|
| return next;
|
| }
|
| }
|
| @@ -311,11 +595,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| next = advance();
|
| if (identical(next, $CLOSE_SQUARE_BRACKET)) {
|
| Token token = previousToken();
|
| - if (token is KeywordToken && identical(token.value.stringValue, 'operator')) {
|
| + if (token is KeywordToken &&
|
| + identical((token as KeywordToken).keyword.syntax, 'operator')) {
|
| return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
|
| }
|
| }
|
| - appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "[");
|
| + appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
|
| return next;
|
| }
|
|
|
| @@ -379,7 +664,6 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
| }
|
|
|
| -
|
| int tokenizePlus(int next) {
|
| // + ++ +=
|
| next = advance();
|
| @@ -396,7 +680,9 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| int tokenizeExclamation(int next) {
|
| - // ! != !==
|
| + // ! !=
|
| + // !== is kept for user-friendly error reporting
|
| +
|
| next = advance();
|
| if (identical(next, $EQ)) {
|
| return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
|
| @@ -406,7 +692,8 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| int tokenizeEquals(int next) {
|
| - // = == ===
|
| + // = == =>
|
| + // === is kept for user-friendly error reporting
|
|
|
| // Type parameters and arguments cannot contain any token that
|
| // starts with '='.
|
| @@ -424,7 +711,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| int tokenizeGreaterThan(int next) {
|
| - // > >= >> >>= >>> >>>=
|
| + // > >= >> >>=
|
| next = advance();
|
| if (identical($EQ, next)) {
|
| appendPrecedenceToken(GT_EQ_INFO);
|
| @@ -435,11 +722,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| appendPrecedenceToken(GT_GT_EQ_INFO);
|
| return advance();
|
| } else {
|
| - appendGtGt(GT_GT_INFO, ">>");
|
| + appendGtGt(GT_GT_INFO);
|
| return next;
|
| }
|
| } else {
|
| - appendGt(GT_INFO, ">");
|
| + appendGt(GT_INFO);
|
| return next;
|
| }
|
| }
|
| @@ -453,13 +740,13 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| } else if (identical($LT, next)) {
|
| return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
|
| } else {
|
| - appendBeginGroup(LT_INFO, "<");
|
| + appendBeginGroup(LT_INFO);
|
| return next;
|
| }
|
| }
|
|
|
| int tokenizeNumber(int next) {
|
| - int start = byteOffset;
|
| + int start = scanOffset;
|
| while (true) {
|
| next = advance();
|
| if ($0 <= next && next <= $9) {
|
| @@ -473,7 +760,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| return tokenizeFractionPart(advance(), start);
|
| }
|
| }
|
| - appendByteStringToken(INT_INFO, asciiString(start, 0));
|
| + appendSubstringToken(INT_INFO, start, true);
|
| return next;
|
| }
|
| }
|
| @@ -482,14 +769,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| int tokenizeHexOrNumber(int next) {
|
| int x = peek();
|
| if (identical(x, $x) || identical(x, $X)) {
|
| - advance();
|
| - return tokenizeHex(x);
|
| + return tokenizeHex(next);
|
| }
|
| return tokenizeNumber(next);
|
| }
|
|
|
| int tokenizeHex(int next) {
|
| - int start = byteOffset - 1;
|
| + int start = scanOffset;
|
| + next = advance(); // Advance past the $x or $X.
|
| bool hasDigits = false;
|
| while (true) {
|
| next = advance();
|
| @@ -499,16 +786,16 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| hasDigits = true;
|
| } else {
|
| if (!hasDigits) {
|
| - return error(const SourceString("hex digit expected"));
|
| + return error("hex digit expected");
|
| }
|
| - appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0));
|
| + appendSubstringToken(HEXADECIMAL_INFO, start, true);
|
| return next;
|
| }
|
| }
|
| }
|
|
|
| int tokenizeDotsOrNumber(int next) {
|
| - int start = byteOffset;
|
| + int start = scanOffset;
|
| next = advance();
|
| if (($0 <= next && next <= $9)) {
|
| return tokenizeFractionPart(next, start);
|
| @@ -538,15 +825,18 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| next = advance();
|
| }
|
| if (!hasDigit) {
|
| - appendByteStringToken(INT_INFO, asciiString(start, -1));
|
| + // Reduce offset, we already advanced to the token past the period.
|
| + appendSubstringToken(INT_INFO, start, true, -1);
|
| +
|
| + // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
|
| + // the scanner already advanced past the period.
|
| if (identical($PERIOD, next)) {
|
| return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
|
| }
|
| - // TODO(ahe): Wrong offset for the period.
|
| appendPrecedenceToken(PERIOD_INFO);
|
| - return bigSwitch(next);
|
| + return next;
|
| }
|
| - appendByteStringToken(DOUBLE_INFO, asciiString(start, 0));
|
| + appendSubstringToken(DOUBLE_INFO, start, true);
|
| return next;
|
| }
|
|
|
| @@ -560,7 +850,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| hasDigits = true;
|
| } else {
|
| if (!hasDigits) {
|
| - return error(const SourceString("digit expected"));
|
| + return error("digit expected");
|
| }
|
| return next;
|
| }
|
| @@ -569,11 +859,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| int tokenizeSlashOrComment(int next) {
|
| + int start = scanOffset;
|
| next = advance();
|
| if (identical($STAR, next)) {
|
| - return tokenizeMultiLineComment(next);
|
| + return tokenizeMultiLineComment(next, start);
|
| } else if (identical($SLASH, next)) {
|
| - return tokenizeSingleLineComment(next);
|
| + return tokenizeSingleLineComment(next, start);
|
| } else if (identical($EQ, next)) {
|
| appendPrecedenceToken(SLASH_EQ_INFO);
|
| return advance();
|
| @@ -583,30 +874,41 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
| }
|
|
|
| - int tokenizeSingleLineComment(int next) {
|
| + int tokenizeSingleLineComment(int next, int start) {
|
| + bool asciiOnly = true;
|
| while (true) {
|
| next = advance();
|
| - if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) {
|
| - appendComment();
|
| + if (next > 127) asciiOnly = false;
|
| + if (identical($LF, next) ||
|
| + identical($CR, next) ||
|
| + identical($EOF, next)) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + appendComment(start, asciiOnly);
|
| return next;
|
| }
|
| }
|
| }
|
|
|
| - int tokenizeMultiLineComment(int next) {
|
| +
|
| + int tokenizeMultiLineComment(int next, int start) {
|
| + bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
|
| + bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
|
| + int unicodeStart = start;
|
| int nesting = 1;
|
| next = advance();
|
| while (true) {
|
| if (identical($EOF, next)) {
|
| - // TODO(ahe): Report error.
|
| + if (!asciiOnlyLines) handleUnicode(unicodeStart);
|
| + appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");
|
| return next;
|
| } else if (identical($STAR, next)) {
|
| next = advance();
|
| if (identical($SLASH, next)) {
|
| --nesting;
|
| if (0 == nesting) {
|
| + if (!asciiOnlyLines) handleUnicode(unicodeStart);
|
| next = advance();
|
| - appendComment();
|
| + appendComment(start, asciiOnlyComment);
|
| return next;
|
| } else {
|
| next = advance();
|
| @@ -618,16 +920,30 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| next = advance();
|
| ++nesting;
|
| }
|
| + } else if (identical(next, $LF)) {
|
| + if (!asciiOnlyLines) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLines = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + next = advance();
|
| } else {
|
| + if (next > 127) {
|
| + asciiOnlyLines = false;
|
| + asciiOnlyComment = false;
|
| + }
|
| next = advance();
|
| }
|
| }
|
| }
|
|
|
| int tokenizeRawStringKeywordOrIdentifier(int next) {
|
| + // [next] is $r.
|
| int nextnext = peek();
|
| if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
|
| - int start = byteOffset;
|
| + int start = scanOffset;
|
| next = advance();
|
| return tokenizeString(next, start, true);
|
| }
|
| @@ -636,7 +952,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
|
|
| int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
|
| KeywordState state = KeywordState.KEYWORD_STATE;
|
| - int start = byteOffset;
|
| + int start = scanOffset;
|
| while (state != null && $a <= next && next <= $z) {
|
| state = state.next(next);
|
| next = advance();
|
| @@ -649,17 +965,17 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| identical(next, $_) ||
|
| identical(next, $$)) {
|
| return tokenizeIdentifier(next, start, allowDollar);
|
| - } else if (next < 128) {
|
| + } else {
|
| appendKeywordToken(state.keyword);
|
| return next;
|
| - } else {
|
| - return tokenizeIdentifier(next, start, allowDollar);
|
| }
|
| }
|
|
|
| + /**
|
| + * [allowDollar] can exclude '$', which is not allowed as part of a string
|
| + * interpolation identifier.
|
| + */
|
| int tokenizeIdentifier(int next, int start, bool allowDollar) {
|
| - bool isAscii = true;
|
| -
|
| while (true) {
|
| if (($a <= next && next <= $z) ||
|
| ($A <= next && next <= $Z) ||
|
| @@ -667,35 +983,21 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| identical(next, $_) ||
|
| (identical(next, $$) && allowDollar)) {
|
| next = advance();
|
| - } else if ((next < 128) || (identical(next, $NBSP))) {
|
| + } else {
|
| // Identifier ends here.
|
| - if (start == byteOffset) {
|
| - return error(const SourceString("expected identifier"));
|
| - } else if (isAscii) {
|
| - appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));
|
| + if (start == scanOffset) {
|
| + return error("expected identifier");
|
| } else {
|
| - appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1));
|
| + appendSubstringToken(IDENTIFIER_INFO, start, true);
|
| }
|
| return next;
|
| - } else {
|
| - int nonAsciiStart = byteOffset;
|
| - do {
|
| - next = nextByte();
|
| - if (identical(next, $NBSP)) break;
|
| - } while (next > 127);
|
| - String string = utf8String(nonAsciiStart, -1).slowToString();
|
| - isAscii = false;
|
| - int byteLength = nonAsciiStart - byteOffset;
|
| - addToCharOffset(string.length - byteLength);
|
| }
|
| }
|
| }
|
|
|
| int tokenizeAt(int next) {
|
| - int start = byteOffset;
|
| - next = advance();
|
| appendPrecedenceToken(AT_INFO);
|
| - return next;
|
| + return advance();
|
| }
|
|
|
| int tokenizeString(int next, int start, bool raw) {
|
| @@ -708,7 +1010,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| return tokenizeMultiLineString(quoteChar, start, raw);
|
| } else {
|
| // Empty string.
|
| - appendByteStringToken(STRING_INFO, utf8String(start, -1));
|
| + appendSubstringToken(STRING_INFO, start, true);
|
| return next;
|
| }
|
| }
|
| @@ -719,56 +1021,72 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
| }
|
|
|
| - static bool isHexDigit(int character) {
|
| - if ($0 <= character && character <= $9) return true;
|
| - character |= 0x20;
|
| - return ($a <= character && character <= $f);
|
| - }
|
| -
|
| + /**
|
| + * [next] is the first character after the qoute.
|
| + * [start] is the scanOffset of the quote.
|
| + *
|
| + * The token contains a substring of the source file, including the
|
| + * string quotes, backslashes for escaping. For interpolated strings,
|
| + * the parts before and after are separate tokens.
|
| + *
|
| + * "a $b c"
|
| + *
|
| + * gives StringToken("a $), StringToken(b) and StringToken( c").
|
| + */
|
| int tokenizeSingleLineString(int next, int quoteChar, int start) {
|
| + bool asciiOnly = true;
|
| while (!identical(next, quoteChar)) {
|
| if (identical(next, $BACKSLASH)) {
|
| next = advance();
|
| } else if (identical(next, $$)) {
|
| - next = tokenizeStringInterpolation(start);
|
| - start = byteOffset;
|
| + if (!asciiOnly) handleUnicode(start);
|
| + next = tokenizeStringInterpolation(start, asciiOnly);
|
| + start = scanOffset;
|
| + asciiOnly = true;
|
| continue;
|
| }
|
| if (next <= $CR
|
| - && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) {
|
| - return error(const SourceString("unterminated string literal"));
|
| + && (identical(next, $LF) ||
|
| + identical(next, $CR) ||
|
| + identical(next, $EOF))) {
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return error("unterminated string literal");
|
| }
|
| + if (next > 127) asciiOnly = false;
|
| next = advance();
|
| }
|
| - appendByteStringToken(STRING_INFO, utf8String(start, 0));
|
| - return advance();
|
| + if (!asciiOnly) handleUnicode(start);
|
| + // Advance past the quote character.
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| + return next;
|
| }
|
|
|
| - int tokenizeStringInterpolation(int start) {
|
| - appendByteStringToken(STRING_INFO, utf8String(start, -1));
|
| + int tokenizeStringInterpolation(int start, bool asciiOnly) {
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| beginToken(); // $ starts here.
|
| int next = advance();
|
| if (identical(next, $OPEN_CURLY_BRACKET)) {
|
| - return tokenizeInterpolatedExpression(next, start);
|
| + return tokenizeInterpolatedExpression(next);
|
| } else {
|
| - return tokenizeInterpolatedIdentifier(next, start);
|
| + return tokenizeInterpolatedIdentifier(next);
|
| }
|
| }
|
|
|
| - int tokenizeInterpolatedExpression(int next, int start) {
|
| - appendBeginGroup(STRING_INTERPOLATION_INFO, "\${");
|
| + int tokenizeInterpolatedExpression(int next) {
|
| + appendBeginGroup(STRING_INTERPOLATION_INFO);
|
| beginToken(); // The expression starts here.
|
| - next = advance();
|
| + next = advance(); // Move past the curly bracket.
|
| while (!identical(next, $EOF) && !identical(next, $STX)) {
|
| next = bigSwitch(next);
|
| }
|
| if (identical(next, $EOF)) return next;
|
| - next = advance();
|
| + next = advance(); // Move past the $STX.
|
| beginToken(); // The string interpolation suffix starts here.
|
| return next;
|
| }
|
|
|
| - int tokenizeInterpolatedIdentifier(int next, int start) {
|
| + int tokenizeInterpolatedIdentifier(int next) {
|
| appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
|
| beginToken(); // The identifier starts here.
|
| next = tokenizeKeywordOrIdentifier(next, false);
|
| @@ -777,23 +1095,45 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| }
|
|
|
| int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
|
| - next = advance();
|
| + bool asciiOnly = true;
|
| + next = advance(); // Advance past the quote
|
| while (next != $EOF) {
|
| if (identical(next, quoteChar)) {
|
| - appendByteStringToken(STRING_INFO, utf8String(start, 0));
|
| - return advance();
|
| + if (!asciiOnly) handleUnicode(start);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnly);
|
| + return next;
|
| } else if (identical(next, $LF) || identical(next, $CR)) {
|
| - return error(const SourceString("unterminated string literal"));
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return error("unterminated string literal");
|
| + } else if (next > 127) {
|
| + asciiOnly = false;
|
| }
|
| next = advance();
|
| }
|
| - return error(const SourceString("unterminated string literal"));
|
| + if (!asciiOnly) handleUnicode(start);
|
| + return error("unterminated string literal");
|
| }
|
|
|
| int tokenizeMultiLineRawString(int quoteChar, int start) {
|
| - int next = advance();
|
| + bool asciiOnlyString = true;
|
| + bool asciiOnlyLine = true;
|
| + int unicodeStart = start;
|
| + int next = advance(); // Advance past the (last) quote (of three)
|
| outer: while (!identical(next, $EOF)) {
|
| while (!identical(next, quoteChar)) {
|
| + if (identical(next, $LF)) {
|
| + if (!asciiOnlyLine) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLine = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + } else if (next > 127) {
|
| + asciiOnlyLine = false;
|
| + asciiOnlyString = false;
|
| + }
|
| next = advance();
|
| if (identical(next, $EOF)) break outer;
|
| }
|
| @@ -801,21 +1141,31 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| if (identical(next, quoteChar)) {
|
| next = advance();
|
| if (identical(next, quoteChar)) {
|
| - appendByteStringToken(STRING_INFO, utf8String(start, 0));
|
| - return advance();
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnlyString);
|
| + return next;
|
| }
|
| }
|
| }
|
| - return error(const SourceString("unterminated string literal"));
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + return error("unterminated string literal");
|
| }
|
|
|
| int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
|
| if (raw) return tokenizeMultiLineRawString(quoteChar, start);
|
| - int next = advance();
|
| + bool asciiOnlyString = true;
|
| + bool asciiOnlyLine = true;
|
| + int unicodeStart = start;
|
| + int next = advance(); // Advance past the (last) quote (of three).
|
| while (!identical(next, $EOF)) {
|
| if (identical(next, $$)) {
|
| - next = tokenizeStringInterpolation(start);
|
| - start = byteOffset;
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = tokenizeStringInterpolation(start, asciiOnlyString);
|
| + start = scanOffset;
|
| + unicodeStart = start;
|
| + asciiOnlyString = true; // A new string token is created for the rest.
|
| + asciiOnlyLine = true;
|
| continue;
|
| }
|
| if (identical(next, quoteChar)) {
|
| @@ -823,8 +1173,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| if (identical(next, quoteChar)) {
|
| next = advance();
|
| if (identical(next, quoteChar)) {
|
| - appendByteStringToken(STRING_INFO, utf8String(start, 0));
|
| - return advance();
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + next = advance();
|
| + appendSubstringToken(STRING_INFO, start, asciiOnlyString);
|
| + return next;
|
| }
|
| }
|
| continue;
|
| @@ -833,13 +1185,53 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|
| next = advance();
|
| if (identical(next, $EOF)) break;
|
| }
|
| + if (identical(next, $LF)) {
|
| + if (!asciiOnlyLine) {
|
| + // Synchronize the string offset in the utf8 scanner.
|
| + handleUnicode(unicodeStart);
|
| + asciiOnlyLine = true;
|
| + unicodeStart = scanOffset;
|
| + }
|
| + lineFeedInMultiline();
|
| + } else if (next > 127) {
|
| + asciiOnlyString = false;
|
| + asciiOnlyLine = false;
|
| + }
|
| next = advance();
|
| }
|
| - return error(const SourceString("unterminated string literal"));
|
| + if (!asciiOnlyLine) handleUnicode(unicodeStart);
|
| + return error("unterminated string literal");
|
| }
|
|
|
| - int error(SourceString message) {
|
| - appendByteStringToken(BAD_INPUT_INFO, message);
|
| + int error(String message) {
|
| + appendStringToken(BAD_INPUT_INFO, message);
|
| return advance(); // Ensure progress.
|
| }
|
| +
|
| + void unmatchedBeginGroup(BeginGroupToken begin) {
|
| + String error = 'unmatched "${begin.stringValue}"';
|
| + Token close =
|
| + new StringToken.fromString(
|
| + BAD_INPUT_INFO, error, begin.charOffset, true);
|
| +
|
| + // We want to ensure that unmatched BeginGroupTokens are reported
|
| + // as errors. However, the rest of the parser assume the groups
|
| + // are well-balanced and will never look at the endGroup
|
| + // token. This is a nice property that allows us to skip quickly
|
| + // over correct code. By inserting an additional error token in
|
| + // the stream, we can keep ignoring endGroup tokens.
|
| + //
|
| + // [begin] --next--> [tail]
|
| + // [begin] --endG--> [close] --next--> [next] --next--> [tail]
|
| + //
|
| + // This allows the parser to skip from [begin] via endGroup to [close] and
|
| + // ignore the [close] token (assuming it's correct), then the error will be
|
| + // reported when parsing the [next] token.
|
| +
|
| + Token next = new StringToken.fromString(
|
| + BAD_INPUT_INFO, error, begin.charOffset, true);
|
| + begin.endGroup = close;
|
| + close.next = next;
|
| + next.next = begin.next;
|
| + }
|
| }
|
|
|