sdk/lib/_internal/compiler/implementation/scanner/scanner.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Unified Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« sdk/lib/_internal/compiler/implementation/js_emitter/code_emitter_task.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/parser.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/scanner_task.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

diff --git a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

index 16c238433f068319c29ef81d7ec4e29d53a80eae..f80a3e4a12f17e7accc3a40ea7fd3474a15b7853 100644

--- a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

+++ b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

@@ -6,75 +6,332 @@ part of scanner;

abstract class Scanner {

Token tokenize();

+ factory Scanner(SourceFile file, {bool includeComments: false}) {

+ if (file is Utf8BytesSourceFile) {

+ return new Utf8BytesScanner(file, includeComments: includeComments);

+ } else {

+ return new StringScanner(file, includeComments: includeComments);

+ }

}

-/**

- * Common base class for a Dart scanner.

- */

-abstract class AbstractScanner<T extends SourceString> implements Scanner {

+abstract class AbstractScanner implements Scanner {

+ final bool includeComments;

+ /**

+ * The string offset for the next token that will be created.

+ *

+ * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values

+ * are different. One string character can be encoded using multiple UTF-8

+ * bytes.

+ */

+ int tokenStart = -1;

+ /**

+ * A pointer to the token stream created by this scanner. The first token

+ * is a special token and not part of the source file. This is an

+ * implementation detail to avoid special cases in the scanner. This token

+ * is not exposed to clients of the scanner, which are expected to invoke

+ * [firstToken] to access the token stream.

+ */

+ final Token tokens = new SymbolToken(EOF_INFO, -1);

+ /**

+ * A pointer to the last scanned token.

+ */

+ Token tail;

+ /**

+ * The stack of open groups, e.g [: { ... ( .. :]

+ * Each BeginGroupToken has a pointer to the token where the group

+ * ends. This field is set when scanning the end group token.

+ */

+ Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>();

+ /**

+ * The source file that is being scanned. This field can be [:null:].

+ * If the source file is available, the scanner assigns its [:lineStarts:] and

+ * [:length:] fields at the end of [tokenize].

+ */

+ final SourceFile file;

+ final List<int> lineStarts = [0];

+ AbstractScanner(this.file, this.includeComments) {

+ this.tail = this.tokens;

+ }

+ /**

+ * Advances and returns the next character.

+ *

+ * If the next character is non-ASCII, then the returned value depends on the

+ * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while

+ * the [StringScanner] returns a UTF-16 code unit.

+ *

+ * The scanner ensures that [advance] is not invoked after it returned [$EOF].

+ * This allows implementations to omit bound checks if the data structure ends

+ * with '0'.

+ */

int advance();

- int nextByte();

/**

- * Returns the current character or byte depending on the underlying input

- * kind. For example, [StringScanner] operates on [String] and thus returns

- * characters (Unicode codepoints represented as int) whereas

- * [ByteArrayScanner] operates on byte arrays and thus returns bytes.

+ * Returns the current unicode character.

+ *

+ * If the current character is ASCII, then it is returned unchanged.

+ *

+ * The [Utf8BytesScanner] decodes the next unicode code point starting at the

+ * current position. Note that every unicode character is returned as a single

+ * code point, i.e., for '\u{1d11e}' it returns 119070, and the following

+ * [advance] returns the next character.

+ *

+ * The [StringScanner] returns the current character unchanged, which might

+ * be a surrogate character. In the case of '\u{1d11e}', it returns the first

+ * code unit 55348, and the following [advance] returns the second code unit

+ * 56606.

+ *

+ * Invoking [currentAsUnicode] multiple times is safe, i.e.,

+ * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].

+ */

+ int currentAsUnicode(int next);

+ /**

+ * Returns the character at the next position. Like in [advance], the

+ * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns

+ * a UTF-16 code unit.

int peek();

/**

+ * Notifies the scanner that unicode characters were detected in either a

+ * comment or a string literal between [startScanOffset] and the current

+ * scan offset.

+ */

+ void handleUnicode(int startScanOffset);

+ /**

+ * Returns the current scan offset.

+ *

+ * In the [Utf8BytesScanner] this is the offset into the byte list, in the

+ * [StringScanner] the offset in the source string.

+ */

+ int get scanOffset;

+ /**

+ * Returns the current string offset.

+ *

+ * In the [StringScanner] this is identical to the [scanOffset]. In the

+ * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.

+ */

+ int get stringOffset;

+ /**

+ * Returns the first token scanned by this [Scanner].

+ */

+ Token firstToken();

+ /**

+ * Returns the last token scanned by this [Scanner].

+ */

+ Token previousToken();

+ /**

+ * Notifies that a new token starts at current offset.

+ */

+ void beginToken() {

+ tokenStart = stringOffset;

+ }

+ /**

+ * Appends a substring from the scan offset [:start:] to the current

+ * [:scanOffset:] plus the [:extraOffset:]. For example, if the current

+ * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the

+ * substring [5,9).

+ *

+ * Note that [extraOffset] can only be used if the covered character(s) are

+ * known to be ASCII.

+ */

+ void appendSubstringToken(PrecedenceInfo info, int start,

+ bool asciiOnly, [int extraOffset]);

+ /**

+ * Appends a token whose kind is determined by [info] and content is defined

+ * by the String [value].

+ *

+ * This method is invoked for class names, field names, method names, types,

+ * etc.

+ */

+ void appendStringToken(PrecedenceInfo info, String value) {

+ tail.next = new StringToken.fromString(info, value, tokenStart, true);

+ tail = tail.next;

+ }

+ /**

+ * Appends a fixed token whose kind and content is determined by [info].

+ * Appends an *operator* token from [info].

+ *

+ * An operator token represents operators like ':', '.', ';', '&&', '==', '--',

+ * '=>', etc.

+ */

+ void appendPrecedenceToken(PrecedenceInfo info) {

+ tail.next = new SymbolToken(info, tokenStart);

+ tail = tail.next;

+ }

+ /**

* Appends a fixed token based on whether the current char is [choice] or not.

* If the current char is [choice] a fixed token whose kind and content

* is determined by [yes] is appended, otherwise a fixed token whose kind

* and content is determined by [no] is appended.

- int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);

+ int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) {

+ int next = advance();

+ if (identical(next, choice)) {

+ appendPrecedenceToken(yes);

+ return advance();

+ } else {

+ appendPrecedenceToken(no);

+ return next;

+ }

/**

- * Appends a fixed token whose kind and content is determined by [info].

+ * Appends a keyword token whose kind is determined by [keyword].

+ */

+ void appendKeywordToken(Keyword keyword) {

+ String syntax = keyword.syntax;

+ // Type parameters and arguments cannot contain 'this' or 'super'.

+ if (identical(syntax, 'this') || identical(syntax, 'super')) {

+ discardOpenLt();

+ }

+ tail.next = new KeywordToken(keyword, tokenStart);

+ tail = tail.next;

+ }

+ void appendEofToken() {

+ beginToken();

+ tail.next = new SymbolToken(EOF_INFO, tokenStart);

+ tail = tail.next;

+ // EOF points to itself so there's always infinite look-ahead.

+ tail.next = tail;

+ discardOpenLt();

+ while (!groupingStack.isEmpty) {

+ unmatchedBeginGroup(groupingStack.head);

+ groupingStack = groupingStack.tail;

+ }

+ /**

+ * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is

+ * not always invoked for [$SPACE] characters.

+ *

+ * This method is used by the scanners to track line breaks and create the

+ * [lineStarts] map.

- void appendPrecedenceToken(PrecedenceInfo info);

+ void appendWhiteSpace(int next) {

+ if (next == $LF && file != null) {

+ lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF.

+ }

/**

- * Appends a token whose kind is determined by [info] and content is [value].

+ * Notifies on [$LF] characters in multi-line comments or strings.

+ *

+ * This method is used by the scanners to track line breaks and create the

+ * [lineStarts] map.

- void appendStringToken(PrecedenceInfo info, String value);

+ void lineFeedInMultiline() {

+ if (file != null) {

+ lineStarts.add(stringOffset + 1);

+ }

/**

- * Appends a token whose kind is determined by [info] and content is defined

- * by the SourceString [value].

+ * Appends a token that begins a new group, represented by [info].

+ * Group begin tokens are '{', '(', '[' and '${'.

- void appendByteStringToken(PrecedenceInfo info, T value);

+ void appendBeginGroup(PrecedenceInfo info) {

+ Token token = new BeginGroupToken(info, tokenStart);

+ tail.next = token;

+ tail = tail.next;

+ // { ( [ ${ cannot appear inside type parameters / arguments.

+ if (!identical(info.kind, LT_TOKEN)) discardOpenLt();

+ groupingStack = groupingStack.prepend(token);

+ }

/**

- * Appends a keyword token whose kind is determined by [keyword].

+ * Appends a token that ends a group, represented by [info].

+ * It handles the group end tokens '}', ')' and ']'. The tokens '>' and

+ * '>>' are handled separately by [appendGt] and [appendGtGt].

- void appendKeywordToken(Keyword keyword);

- void appendWhiteSpace(int next);

- void appendEofToken();

+ int appendEndGroup(PrecedenceInfo info, int openKind) {

+ assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >>

+ appendPrecedenceToken(info);

+ // Don't report unmatched errors for <; it is also the less-than operator.

+ discardOpenLt();

+ if (groupingStack.isEmpty) {

+ return advance();

+ }

+ BeginGroupToken begin = groupingStack.head;

+ if (!identical(begin.kind, openKind)) {

+ if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) ||

+ !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) {

+ // Not ending string interpolation.

+ unmatchedBeginGroup(begin);

+ return advance();

+ }

+ // We're ending an interpolated expression.

+ begin.endGroup = tail;

+ groupingStack = groupingStack.tail;

+ // Using "start-of-text" to signal that we're back in string

+ // scanning mode.

+ return $STX;

+ }

+ begin.endGroup = tail;

+ groupingStack = groupingStack.tail;

+ return advance();

+ }

/**

- * Creates an ASCII SourceString whose content begins at the source byte

- * offset [start] and ends at [offset] bytes from the current byte offset of

- * the scanner. For example, if the current byte offset is 10,

- * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found

- * at the [0,9[ byte interval of the source text.

+ * Appends a token for '>'.

+ * This method does not issue unmatched errors, because > is also the

+ * greater-than operator. It does not necessarily have to close a group.

- T asciiString(int start, int offset);

- T utf8String(int start, int offset);

- Token firstToken();

- Token previousToken();

- void beginToken();

- void addToCharOffset(int offset);

- int get charOffset;

- int get byteOffset;

- void appendBeginGroup(PrecedenceInfo info, String value);

- int appendEndGroup(PrecedenceInfo info, String value, int openKind);

- void appendGt(PrecedenceInfo info, String value);

- void appendGtGt(PrecedenceInfo info, String value);

- void appendGtGtGt(PrecedenceInfo info, String value);

- void appendComment();

+ void appendGt(PrecedenceInfo info) {

+ appendPrecedenceToken(info);

+ if (groupingStack.isEmpty) return;

+ if (identical(groupingStack.head.kind, LT_TOKEN)) {

+ groupingStack.head.endGroup = tail;

+ groupingStack = groupingStack.tail;

+ }

+ /**

+ * Appends a token for '>>'.

+ * This method does not issue unmatched errors, because >> is also the

+ * shift operator. It does not necessarily have to close a group.

+ */

+ void appendGtGt(PrecedenceInfo info) {

+ appendPrecedenceToken(info);

+ if (groupingStack.isEmpty) return;

+ if (identical(groupingStack.head.kind, LT_TOKEN)) {

+ // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer

+ // '<', the inner '<' is left without endGroup.

+ groupingStack = groupingStack.tail;

+ }

+ if (groupingStack.isEmpty) return;

+ if (identical(groupingStack.head.kind, LT_TOKEN)) {

+ groupingStack.head.endGroup = tail;

+ groupingStack = groupingStack.tail;

+ }

+ void appendComment(start, bool asciiOnly) {

+ if (!includeComments) return;

+ appendSubstringToken(COMMENT_INFO, start, asciiOnly);

+ }

/**

* We call this method to discard '<' from the "grouping" stack

@@ -88,7 +345,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

* something which cannot possibly be part of a type

* parameter/argument list.

- void discardOpenLt();

+ void discardOpenLt() {

+ while (!groupingStack.isEmpty

+ && identical(groupingStack.head.kind, LT_TOKEN)) {

+ groupingStack = groupingStack.tail;

+ }

// TODO(ahe): Move this class to implementation.

@@ -98,6 +360,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

next = bigSwitch(next);

}

appendEofToken();

+ if (file != null) {

+ file.length = stringOffset;

+ // One additional line start at the end, see [SourceFile.lineStarts].

+ lineStarts.add(stringOffset + 1);

+ file.lineStarts = lineStarts;

+ }

return firstToken();

}

@@ -107,8 +377,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

|| identical(next, $LF) || identical(next, $CR)) {

appendWhiteSpace(next);

next = advance();

+ // Sequences of spaces are common, so advance through them fast.

while (identical(next, $SPACE)) {

- appendWhiteSpace(next);

+ // We don't invoke [:appendWhiteSpace(next):] here for efficiency,

+ // assuming that it does not do anything for space characters.

next = advance();

}

return next;

@@ -121,8 +393,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

return tokenizeKeywordOrIdentifier(next, true);

}

- if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$)) {

- return tokenizeIdentifier(next, byteOffset, true);

+ if (($A <= next && next <= $Z) ||

+ identical(next, $_) ||

+ identical(next, $$)) {

+ return tokenizeIdentifier(next, scanOffset, true);

}

if (identical(next, $LT)) {

@@ -187,12 +461,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

if (identical(next, $OPEN_PAREN)) {

- appendBeginGroup(OPEN_PAREN_INFO, "(");

+ appendBeginGroup(OPEN_PAREN_INFO);

return advance();

}

if (identical(next, $CLOSE_PAREN)) {

- return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN);

+ return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);

}

if (identical(next, $COMMA)) {

@@ -218,7 +492,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

if (identical(next, $CLOSE_SQUARE_BRACKET)) {

- return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]",

+ return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,

OPEN_SQUARE_BRACKET_TOKEN);

}

@@ -228,12 +502,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

if (identical(next, $OPEN_CURLY_BRACKET)) {

- appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{");

+ appendBeginGroup(OPEN_CURLY_BRACKET_INFO);

return advance();

}

if (identical(next, $CLOSE_CURLY_BRACKET)) {

- return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}",

+ return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,

OPEN_CURLY_BRACKET_TOKEN);

}

@@ -246,7 +520,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

if (identical(next, $DQ) || identical(next, $SQ)) {

- return tokenizeString(next, byteOffset, false);

+ return tokenizeString(next, scanOffset, false);

}

if (identical(next, $PERIOD)) {

@@ -268,7 +542,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

return $EOF;

}

if (next < 0x1f) {

- return error(new SourceString("unexpected character $next"));

+ return error("unexpected character $next");

+ }

+ if (next >= 128) {

+ next = currentAsUnicode(next);

}

// The following are non-ASCII characters.

@@ -278,16 +556,22 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

return advance();

}

- return tokenizeIdentifier(next, byteOffset, true);

+ return error("unexpected unicode character $next");

}

int tokenizeTag(int next) {

// # or #!.*[\n\r]

- if (byteOffset == 0) {

+ if (scanOffset == 0) {

if (identical(peek(), $BANG)) {

+ int start = scanOffset + 1;

+ bool asciiOnly = true;

do {

next = advance();

- } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF));

+ if (next > 127) asciiOnly = false;

+ } while (!identical(next, $LF) &&

+ !identical(next, $CR) &&

+ !identical(next, $EOF));

+ if (!asciiOnly) handleUnicode(start);

return next;

}

@@ -311,11 +595,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

next = advance();

if (identical(next, $CLOSE_SQUARE_BRACKET)) {

Token token = previousToken();

- if (token is KeywordToken && identical(token.value.stringValue, 'operator')) {

+ if (token is KeywordToken &&

+ identical((token as KeywordToken).keyword.syntax, 'operator')) {

return select($EQ, INDEX_EQ_INFO, INDEX_INFO);

}

- appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "[");

+ appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);

return next;

}

@@ -379,7 +664,6 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizePlus(int next) {

// + ++ +=

next = advance();

@@ -396,7 +680,9 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizeExclamation(int next) {

- // ! != !==

+ // ! !=

+ // !== is kept for user-friendly error reporting

next = advance();

if (identical(next, $EQ)) {

return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);

@@ -406,7 +692,8 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizeEquals(int next) {

- // = == ===

+ // = == =>

+ // === is kept for user-friendly error reporting

// Type parameters and arguments cannot contain any token that

// starts with '='.

@@ -424,7 +711,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizeGreaterThan(int next) {

- // > >= >> >>= >>> >>>=

+ // > >= >> >>=

next = advance();

if (identical($EQ, next)) {

appendPrecedenceToken(GT_EQ_INFO);

@@ -435,11 +722,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

appendPrecedenceToken(GT_GT_EQ_INFO);

return advance();

} else {

- appendGtGt(GT_GT_INFO, ">>");

+ appendGtGt(GT_GT_INFO);

return next;

}

} else {

- appendGt(GT_INFO, ">");

+ appendGt(GT_INFO);

return next;

}

@@ -453,13 +740,13 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

} else if (identical($LT, next)) {

return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);

} else {

- appendBeginGroup(LT_INFO, "<");

+ appendBeginGroup(LT_INFO);

return next;

}

int tokenizeNumber(int next) {

- int start = byteOffset;

+ int start = scanOffset;

while (true) {

next = advance();

if ($0 <= next && next <= $9) {

@@ -473,7 +760,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

return tokenizeFractionPart(advance(), start);

}

- appendByteStringToken(INT_INFO, asciiString(start, 0));

+ appendSubstringToken(INT_INFO, start, true);

return next;

}

@@ -482,14 +769,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

int tokenizeHexOrNumber(int next) {

int x = peek();

if (identical(x, $x) || identical(x, $X)) {

- advance();

- return tokenizeHex(x);

+ return tokenizeHex(next);

}

return tokenizeNumber(next);

}

int tokenizeHex(int next) {

- int start = byteOffset - 1;

+ int start = scanOffset;

+ next = advance(); // Advance past the $x or $X.

bool hasDigits = false;

while (true) {

next = advance();

@@ -499,16 +786,16 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

hasDigits = true;

} else {

if (!hasDigits) {

- return error(const SourceString("hex digit expected"));

+ return error("hex digit expected");

}

- appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0));

+ appendSubstringToken(HEXADECIMAL_INFO, start, true);

return next;

}

int tokenizeDotsOrNumber(int next) {

- int start = byteOffset;

+ int start = scanOffset;

next = advance();

if (($0 <= next && next <= $9)) {

return tokenizeFractionPart(next, start);

@@ -538,15 +825,18 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

next = advance();

}

if (!hasDigit) {

- appendByteStringToken(INT_INFO, asciiString(start, -1));

+ // Reduce offset, we already advanced to the token past the period.

+ appendSubstringToken(INT_INFO, start, true, -1);

+ // TODO(ahe): Wrong offset for the period. Cannot call beginToken because

+ // the scanner already advanced past the period.

if (identical($PERIOD, next)) {

return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);

}

- // TODO(ahe): Wrong offset for the period.

appendPrecedenceToken(PERIOD_INFO);

- return bigSwitch(next);

+ return next;

}

- appendByteStringToken(DOUBLE_INFO, asciiString(start, 0));

+ appendSubstringToken(DOUBLE_INFO, start, true);

return next;

}

@@ -560,7 +850,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

hasDigits = true;

} else {

if (!hasDigits) {

- return error(const SourceString("digit expected"));

+ return error("digit expected");

}

return next;

}

@@ -569,11 +859,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizeSlashOrComment(int next) {

+ int start = scanOffset;

next = advance();

if (identical($STAR, next)) {

- return tokenizeMultiLineComment(next);

+ return tokenizeMultiLineComment(next, start);

} else if (identical($SLASH, next)) {

- return tokenizeSingleLineComment(next);

+ return tokenizeSingleLineComment(next, start);

} else if (identical($EQ, next)) {

appendPrecedenceToken(SLASH_EQ_INFO);

return advance();

@@ -583,30 +874,41 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

- int tokenizeSingleLineComment(int next) {

+ int tokenizeSingleLineComment(int next, int start) {

+ bool asciiOnly = true;

while (true) {

next = advance();

- if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) {

- appendComment();

+ if (next > 127) asciiOnly = false;

+ if (identical($LF, next) ||

+ identical($CR, next) ||

+ identical($EOF, next)) {

+ if (!asciiOnly) handleUnicode(start);

+ appendComment(start, asciiOnly);

return next;

}

- int tokenizeMultiLineComment(int next) {

+ int tokenizeMultiLineComment(int next, int start) {

+ bool asciiOnlyComment = true; // Track if the entire comment is ASCII.

+ bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.

+ int unicodeStart = start;

int nesting = 1;

next = advance();

while (true) {

if (identical($EOF, next)) {

- // TODO(ahe): Report error.

+ if (!asciiOnlyLines) handleUnicode(unicodeStart);

+ appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");

return next;

} else if (identical($STAR, next)) {

next = advance();

if (identical($SLASH, next)) {

--nesting;

if (0 == nesting) {

+ if (!asciiOnlyLines) handleUnicode(unicodeStart);

next = advance();

- appendComment();

+ appendComment(start, asciiOnlyComment);

return next;

} else {

next = advance();

@@ -618,16 +920,30 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

next = advance();

++nesting;

}

+ } else if (identical(next, $LF)) {

+ if (!asciiOnlyLines) {

+ // Synchronize the string offset in the utf8 scanner.

+ handleUnicode(unicodeStart);

+ asciiOnlyLines = true;

+ unicodeStart = scanOffset;

+ }

+ lineFeedInMultiline();

+ next = advance();

} else {

+ if (next > 127) {

+ asciiOnlyLines = false;

+ asciiOnlyComment = false;

+ }

next = advance();

}

int tokenizeRawStringKeywordOrIdentifier(int next) {

+ // [next] is $r.

int nextnext = peek();

if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {

- int start = byteOffset;

+ int start = scanOffset;

next = advance();

return tokenizeString(next, start, true);

}

@@ -636,7 +952,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {

KeywordState state = KeywordState.KEYWORD_STATE;

- int start = byteOffset;

+ int start = scanOffset;

while (state != null && $a <= next && next <= $z) {

state = state.next(next);

next = advance();

@@ -649,17 +965,17 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

identical(next, $_) ||

identical(next, $$)) {

return tokenizeIdentifier(next, start, allowDollar);

- } else if (next < 128) {

+ } else {

appendKeywordToken(state.keyword);

return next;

- } else {

- return tokenizeIdentifier(next, start, allowDollar);

}

+ /**

+ * [allowDollar] can exclude '$', which is not allowed as part of a string

+ * interpolation identifier.

+ */

int tokenizeIdentifier(int next, int start, bool allowDollar) {

- bool isAscii = true;

while (true) {

if (($a <= next && next <= $z) ||

($A <= next && next <= $Z) ||

@@ -667,35 +983,21 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

identical(next, $_) ||

(identical(next, $$) && allowDollar)) {

next = advance();

- } else if ((next < 128) || (identical(next, $NBSP))) {

+ } else {

// Identifier ends here.

- if (start == byteOffset) {

- return error(const SourceString("expected identifier"));

- } else if (isAscii) {

- appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));

+ if (start == scanOffset) {

+ return error("expected identifier");

} else {

- appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1));

+ appendSubstringToken(IDENTIFIER_INFO, start, true);

}

return next;

- } else {

- int nonAsciiStart = byteOffset;

- do {

- next = nextByte();

- if (identical(next, $NBSP)) break;

- } while (next > 127);

- String string = utf8String(nonAsciiStart, -1).slowToString();

- isAscii = false;

- int byteLength = nonAsciiStart - byteOffset;

- addToCharOffset(string.length - byteLength);

}

int tokenizeAt(int next) {

- int start = byteOffset;

- next = advance();

appendPrecedenceToken(AT_INFO);

- return next;

+ return advance();

}

int tokenizeString(int next, int start, bool raw) {

@@ -708,7 +1010,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

return tokenizeMultiLineString(quoteChar, start, raw);

} else {

// Empty string.

- appendByteStringToken(STRING_INFO, utf8String(start, -1));

+ appendSubstringToken(STRING_INFO, start, true);

return next;

}

@@ -719,56 +1021,72 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

- static bool isHexDigit(int character) {

- if ($0 <= character && character <= $9) return true;

- character |= 0x20;

- return ($a <= character && character <= $f);

- }

+ /**

+ * [next] is the first character after the quote.

+ * [start] is the scanOffset of the quote.

+ *

+ * The token contains a substring of the source file, including the

+ * string quotes, backslashes for escaping. For interpolated strings,

+ * the parts before and after are separate tokens.

+ *

+ * "a $b c"

+ *

+ * gives StringToken("a $), StringToken(b) and StringToken( c").

+ */

int tokenizeSingleLineString(int next, int quoteChar, int start) {

+ bool asciiOnly = true;

while (!identical(next, quoteChar)) {

if (identical(next, $BACKSLASH)) {

next = advance();

} else if (identical(next, $$)) {

- next = tokenizeStringInterpolation(start);

- start = byteOffset;

+ if (!asciiOnly) handleUnicode(start);

+ next = tokenizeStringInterpolation(start, asciiOnly);

+ start = scanOffset;

+ asciiOnly = true;

continue;

}

if (next <= $CR

- && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) {

- return error(const SourceString("unterminated string literal"));

+ && (identical(next, $LF) ||

+ identical(next, $CR) ||

+ identical(next, $EOF))) {

+ if (!asciiOnly) handleUnicode(start);

+ return error("unterminated string literal");

}

+ if (next > 127) asciiOnly = false;

next = advance();

}

- appendByteStringToken(STRING_INFO, utf8String(start, 0));

- return advance();

+ if (!asciiOnly) handleUnicode(start);

+ // Advance past the quote character.

+ next = advance();

+ appendSubstringToken(STRING_INFO, start, asciiOnly);

+ return next;

}

- int tokenizeStringInterpolation(int start) {

- appendByteStringToken(STRING_INFO, utf8String(start, -1));

+ int tokenizeStringInterpolation(int start, bool asciiOnly) {

+ appendSubstringToken(STRING_INFO, start, asciiOnly);

beginToken(); // $ starts here.

int next = advance();

if (identical(next, $OPEN_CURLY_BRACKET)) {

- return tokenizeInterpolatedExpression(next, start);

+ return tokenizeInterpolatedExpression(next);

} else {

- return tokenizeInterpolatedIdentifier(next, start);

+ return tokenizeInterpolatedIdentifier(next);

}

- int tokenizeInterpolatedExpression(int next, int start) {

- appendBeginGroup(STRING_INTERPOLATION_INFO, "\${");

+ int tokenizeInterpolatedExpression(int next) {

+ appendBeginGroup(STRING_INTERPOLATION_INFO);

beginToken(); // The expression starts here.

- next = advance();

+ next = advance(); // Move past the curly bracket.

while (!identical(next, $EOF) && !identical(next, $STX)) {

next = bigSwitch(next);

}

if (identical(next, $EOF)) return next;

- next = advance();

+ next = advance(); // Move past the $STX.

beginToken(); // The string interpolation suffix starts here.

return next;

}

- int tokenizeInterpolatedIdentifier(int next, int start) {

+ int tokenizeInterpolatedIdentifier(int next) {

appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

beginToken(); // The identifier starts here.

next = tokenizeKeywordOrIdentifier(next, false);

@@ -777,23 +1095,45 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

}

int tokenizeSingleLineRawString(int next, int quoteChar, int start) {

- next = advance();

+ bool asciiOnly = true;

+ next = advance(); // Advance past the quote

while (next != $EOF) {

if (identical(next, quoteChar)) {

- appendByteStringToken(STRING_INFO, utf8String(start, 0));

- return advance();

+ if (!asciiOnly) handleUnicode(start);

+ next = advance();

+ appendSubstringToken(STRING_INFO, start, asciiOnly);

+ return next;

} else if (identical(next, $LF) || identical(next, $CR)) {

- return error(const SourceString("unterminated string literal"));

+ if (!asciiOnly) handleUnicode(start);

+ return error("unterminated string literal");

+ } else if (next > 127) {

+ asciiOnly = false;

}

next = advance();

}

- return error(const SourceString("unterminated string literal"));

+ if (!asciiOnly) handleUnicode(start);

+ return error("unterminated string literal");

}

int tokenizeMultiLineRawString(int quoteChar, int start) {

- int next = advance();

+ bool asciiOnlyString = true;

+ bool asciiOnlyLine = true;

+ int unicodeStart = start;

+ int next = advance(); // Advance past the (last) quote (of three).

outer: while (!identical(next, $EOF)) {

while (!identical(next, quoteChar)) {

+ if (identical(next, $LF)) {

+ if (!asciiOnlyLine) {

+ // Synchronize the string offset in the utf8 scanner.

+ handleUnicode(unicodeStart);

+ asciiOnlyLine = true;

+ unicodeStart = scanOffset;

+ }

+ lineFeedInMultiline();

+ } else if (next > 127) {

+ asciiOnlyLine = false;

+ asciiOnlyString = false;

+ }

next = advance();

if (identical(next, $EOF)) break outer;

}

@@ -801,21 +1141,31 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

if (identical(next, quoteChar)) {

next = advance();

if (identical(next, quoteChar)) {

- appendByteStringToken(STRING_INFO, utf8String(start, 0));

- return advance();

+ if (!asciiOnlyLine) handleUnicode(unicodeStart);

+ next = advance();

+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);

+ return next;

}

- return error(const SourceString("unterminated string literal"));

+ if (!asciiOnlyLine) handleUnicode(unicodeStart);

+ return error("unterminated string literal");

}

int tokenizeMultiLineString(int quoteChar, int start, bool raw) {

if (raw) return tokenizeMultiLineRawString(quoteChar, start);

- int next = advance();

+ bool asciiOnlyString = true;

+ bool asciiOnlyLine = true;

+ int unicodeStart = start;

+ int next = advance(); // Advance past the (last) quote (of three).

while (!identical(next, $EOF)) {

if (identical(next, $$)) {

- next = tokenizeStringInterpolation(start);

- start = byteOffset;

+ if (!asciiOnlyLine) handleUnicode(unicodeStart);

+ next = tokenizeStringInterpolation(start, asciiOnlyString);

+ start = scanOffset;

+ unicodeStart = start;

+ asciiOnlyString = true; // A new string token is created for the rest.

+ asciiOnlyLine = true;

continue;

}

if (identical(next, quoteChar)) {

@@ -823,8 +1173,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

if (identical(next, quoteChar)) {

next = advance();

if (identical(next, quoteChar)) {

- appendByteStringToken(STRING_INFO, utf8String(start, 0));

- return advance();

+ if (!asciiOnlyLine) handleUnicode(unicodeStart);

+ next = advance();

+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);

+ return next;

}

continue;

@@ -833,13 +1185,53 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {

next = advance();

if (identical(next, $EOF)) break;

}

+ if (identical(next, $LF)) {

+ if (!asciiOnlyLine) {

+ // Synchronize the string offset in the utf8 scanner.

+ handleUnicode(unicodeStart);

+ asciiOnlyLine = true;

+ unicodeStart = scanOffset;

+ }

+ lineFeedInMultiline();

+ } else if (next > 127) {

+ asciiOnlyString = false;

+ asciiOnlyLine = false;

+ }

next = advance();

}

- return error(const SourceString("unterminated string literal"));

+ if (!asciiOnlyLine) handleUnicode(unicodeStart);

+ return error("unterminated string literal");

}

- int error(SourceString message) {

- appendByteStringToken(BAD_INPUT_INFO, message);

+ int error(String message) {

+ appendStringToken(BAD_INPUT_INFO, message);

return advance(); // Ensure progress.

}

+ void unmatchedBeginGroup(BeginGroupToken begin) {

+ String error = 'unmatched "${begin.stringValue}"';

+ Token close =

+ new StringToken.fromString(

+ BAD_INPUT_INFO, error, begin.charOffset, true);

+ // We want to ensure that unmatched BeginGroupTokens are reported

+ // as errors. However, the rest of the parser assumes the groups

+ // are well-balanced and will never look at the endGroup

+ // token. This is a nice property that allows us to skip quickly

+ // over correct code. By inserting an additional error token in

+ // the stream, we can keep ignoring endGroup tokens.

+ //

+ // [begin] --next--> [tail]

+ // [begin] --endG--> [close] --next--> [next] --next--> [tail]

+ //

+ // This allows the parser to skip from [begin] via endGroup to [close] and

+ // ignore the [close] token (assuming it's correct); the error will then be

+ // reported when parsing the [next] token.

+ Token next = new StringToken.fromString(

+ BAD_INPUT_INFO, error, begin.charOffset, true);

+ begin.endGroup = close;

+ close.next = next;

+ next.next = begin.next;

+ }

}