Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2551)

Unified Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
diff --git a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
index 16c238433f068319c29ef81d7ec4e29d53a80eae..f80a3e4a12f17e7accc3a40ea7fd3474a15b7853 100644
--- a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
+++ b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart
@@ -6,75 +6,332 @@ part of scanner;
abstract class Scanner {
Token tokenize();
+
+ factory Scanner(SourceFile file, {bool includeComments: false}) {
+ if (file is Utf8BytesSourceFile) {
+ return new Utf8BytesScanner(file, includeComments: includeComments);
+ } else {
+ return new StringScanner(file, includeComments: includeComments);
+ }
+ }
}
-/**
- * Common base class for a Dart scanner.
- */
-abstract class AbstractScanner<T extends SourceString> implements Scanner {
+abstract class AbstractScanner implements Scanner {
+ final bool includeComments;
+
+ /**
+ * The string offset for the next token that will be created.
+ *
+ * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values
+ * are different. One string character can be encoded using multiple UTF-8
+ * bytes.
+ */
+ int tokenStart = -1;
+
+ /**
+ * A pointer to the token stream created by this scanner. The first token
+ * is a special token and not part of the source file. This is an
+ * implementation detail to avoid special cases in the scanner. This token
+ * is not exposed to clients of the scanner, which are expected to invoke
+ * [firstToken] to access the token stream.
+ */
+ final Token tokens = new SymbolToken(EOF_INFO, -1);
+
+ /**
+ * A pointer to the last scanned token.
+ */
+ Token tail;
+
+ /**
+ * The stack of open groups, e.g [: { ... ( .. :]
+ * Each BeginGroupToken has a pointer to the token where the group
+ * ends. This field is set when scanning the end group token.
+ */
+ Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>();
+
+ /**
+ * The source file that is being scanned. This field can be [:null:].
+ * If the source file is available, the scanner assigns its [:lineStarts:] and
+ * [:length:] fields at the end of [tokenize].
+ */
+ final SourceFile file;
+
+ final List<int> lineStarts = [0];
+
+ AbstractScanner(this.file, this.includeComments) {
+ this.tail = this.tokens;
+ }
+
+
+ /**
+ * Advances and returns the next character.
+ *
+ * If the next character is non-ASCII, then the returned value depends on the
+ * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
+ * the [StringScanner] returns a UTF-16 code unit.
+ *
+ * The scanner ensures that [advance] is not invoked after it returned [$EOF].
+ * This allows implementations to omit bound checks if the data structure ends
+ * with '0'.
+ */
int advance();
- int nextByte();
/**
- * Returns the current character or byte depending on the underlying input
- * kind. For example, [StringScanner] operates on [String] and thus returns
- * characters (Unicode codepoints represented as int) whereas
- * [ByteArrayScanner] operates on byte arrays and thus returns bytes.
+ * Returns the current unicode character.
+ *
+ * If the current character is ASCII, then it is returned unchanged.
+ *
+ * The [Utf8BytesScanner] decodes the next unicode code point starting at the
+ * current position. Note that every unicode character is returned as a single
+ * code point, i.e., for '\u{1d11e}' it returns 119070, and the following
+ * [advance] returns the next character.
+ *
+ * The [StringScanner] returns the current character unchanged, which might
+ * be a surrogate character. In the case of '\u{1d11e}', it returns the first
+ * code unit 55348, and the following [advance] returns the second code unit
+ * 56606.
+ *
+ * Invoking [currentAsUnicode] multiple times is safe, i.e.,
+ * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
+ */
+ int currentAsUnicode(int next);
+
+ /**
+ * Returns the character at the next position. Like in [advance], the
+ * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
+ * a UTF-16 code unit.
*/
int peek();
/**
+ * Notifies the scanner that unicode characters were detected in either a
+ * comment or a string literal between [startScanOffset] and the current
+ * scan offset.
+ */
+ void handleUnicode(int startScanOffset);
+
+ /**
+ * Returns the current scan offset.
+ *
+ * In the [Utf8BytesScanner] this is the offset into the byte list, in the
+ * [StringScanner] the offset in the source string.
+ */
+ int get scanOffset;
+
+ /**
+ * Returns the current string offset.
+ *
+ * In the [StringScanner] this is identical to the [scanOffset]. In the
+ * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
+ */
+ int get stringOffset;
+
+ /**
+ * Returns the first token scanned by this [Scanner].
+ */
+ Token firstToken();
+
+ /**
+ * Returns the last token scanned by this [Scanner].
+ */
+ Token previousToken();
+
+ /**
+ * Notifies that a new token starts at current offset.
+ */
+ void beginToken() {
+ tokenStart = stringOffset;
+ }
+
+ /**
+ * Appends a substring from the scan offset [:start:] to the current
+ * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
+ * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
+ * substring [5,9).
+ *
+ * Note that [extraOffset] can only be used if the covered character(s) are
+ * known to be ASCII.
+ */
+ void appendSubstringToken(PrecedenceInfo info, int start,
+ bool asciiOnly, [int extraOffset]);
+
+ /**
+ * Appends a token whose kind is determined by [info] and content is defined
+ * by the String [value].
+ *
+ * This method is invoked for class names, field names, method names, types,
+ * etc.
+ */
+ void appendStringToken(PrecedenceInfo info, String value) {
+ tail.next = new StringToken.fromString(info, value, tokenStart, true);
+ tail = tail.next;
+ }
+
+ /**
+ * Appends a fixed token whose kind and content is determined by [info].
+ * Appends an *operator* token from [info].
+ *
+ * An operator token represents operators like ':', '.', ';', '&&', '==', '--',
+ * '=>', etc.
+ */
+ void appendPrecedenceToken(PrecedenceInfo info) {
+ tail.next = new SymbolToken(info, tokenStart);
+ tail = tail.next;
+ }
+
+ /**
* Appends a fixed token based on whether the current char is [choice] or not.
* If the current char is [choice] a fixed token whose kind and content
* is determined by [yes] is appended, otherwise a fixed token whose kind
* and content is determined by [no] is appended.
*/
- int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
+ int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) {
+ int next = advance();
+ if (identical(next, choice)) {
+ appendPrecedenceToken(yes);
+ return advance();
+ } else {
+ appendPrecedenceToken(no);
+ return next;
+ }
+ }
/**
- * Appends a fixed token whose kind and content is determined by [info].
+ * Appends a keyword token whose kind is determined by [keyword].
+ */
+ void appendKeywordToken(Keyword keyword) {
+ String syntax = keyword.syntax;
+ // Type parameters and arguments cannot contain 'this' or 'super'.
+ if (identical(syntax, 'this') || identical(syntax, 'super')) {
+ discardOpenLt();
+ }
+ tail.next = new KeywordToken(keyword, tokenStart);
+ tail = tail.next;
+ }
+
+ void appendEofToken() {
+ beginToken();
+ tail.next = new SymbolToken(EOF_INFO, tokenStart);
+ tail = tail.next;
+ // EOF points to itself so there's always infinite look-ahead.
+ tail.next = tail;
+ discardOpenLt();
+ while (!groupingStack.isEmpty) {
+ unmatchedBeginGroup(groupingStack.head);
+ groupingStack = groupingStack.tail;
+ }
+ }
+
+ /**
+ * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is
+ * not always invoked for [$SPACE] characters.
+ *
+ * This method is used by the scanners to track line breaks and create the
+ * [lineStarts] map.
*/
- void appendPrecedenceToken(PrecedenceInfo info);
+ void appendWhiteSpace(int next) {
+ if (next == $LF && file != null) {
+ lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF.
+ }
+ }
/**
- * Appends a token whose kind is determined by [info] and content is [value].
+ * Notifies on [$LF] characters in multi-line comments or strings.
+ *
+ * This method is used by the scanners to track line breaks and create the
+ * [lineStarts] map.
*/
- void appendStringToken(PrecedenceInfo info, String value);
+ void lineFeedInMultiline() {
+ if (file != null) {
+ lineStarts.add(stringOffset + 1);
+ }
+ }
/**
- * Appends a token whose kind is determined by [info] and content is defined
- * by the SourceString [value].
+ * Appends a token that begins a new group, represented by [info].
+ * Group begin tokens are '{', '(', '[' and '${'.
*/
- void appendByteStringToken(PrecedenceInfo info, T value);
+ void appendBeginGroup(PrecedenceInfo info) {
+ Token token = new BeginGroupToken(info, tokenStart);
+ tail.next = token;
+ tail = tail.next;
+
+ // { ( [ ${ cannot appear inside a type parameters / arguments.
+ if (!identical(info.kind, LT_TOKEN)) discardOpenLt();
+ groupingStack = groupingStack.prepend(token);
+ }
/**
- * Appends a keyword token whose kind is determined by [keyword].
+ * Appends a token that ends a group, represented by [info].
+ * It handles the group end tokens '}', ')' and ']'. The tokens '>' and
+ * '>>' are handled separately by [appendGt] and [appendGtGt].
*/
- void appendKeywordToken(Keyword keyword);
- void appendWhiteSpace(int next);
- void appendEofToken();
+ int appendEndGroup(PrecedenceInfo info, int openKind) {
+ assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >>
+ appendPrecedenceToken(info);
+ // Don't report unmatched errors for <; it is also the less-than operator.
+ discardOpenLt();
+ if (groupingStack.isEmpty) {
+ return advance();
+ }
+ BeginGroupToken begin = groupingStack.head;
+ if (!identical(begin.kind, openKind)) {
+ if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) ||
+ !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) {
+ // Not ending string interpolation.
+ unmatchedBeginGroup(begin);
+ return advance();
+ }
+ // We're ending an interpolated expression.
+ begin.endGroup = tail;
+ groupingStack = groupingStack.tail;
+ // Using "start-of-text" to signal that we're back in string
+ // scanning mode.
+ return $STX;
+ }
+ begin.endGroup = tail;
+ groupingStack = groupingStack.tail;
+ return advance();
+ }
/**
- * Creates an ASCII SourceString whose content begins at the source byte
- * offset [start] and ends at [offset] bytes from the current byte offset of
- * the scanner. For example, if the current byte offset is 10,
- * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found
- * at the [0,9[ byte interval of the source text.
+ * Appends a token for '>'.
+ * This method does not issue unmatched errors, because > is also the
+ * greater-than operator. It does not necessarily have to close a group.
*/
- T asciiString(int start, int offset);
- T utf8String(int start, int offset);
- Token firstToken();
- Token previousToken();
- void beginToken();
- void addToCharOffset(int offset);
- int get charOffset;
- int get byteOffset;
- void appendBeginGroup(PrecedenceInfo info, String value);
- int appendEndGroup(PrecedenceInfo info, String value, int openKind);
- void appendGt(PrecedenceInfo info, String value);
- void appendGtGt(PrecedenceInfo info, String value);
- void appendGtGtGt(PrecedenceInfo info, String value);
- void appendComment();
+ void appendGt(PrecedenceInfo info) {
+ appendPrecedenceToken(info);
+ if (groupingStack.isEmpty) return;
+ if (identical(groupingStack.head.kind, LT_TOKEN)) {
+ groupingStack.head.endGroup = tail;
+ groupingStack = groupingStack.tail;
+ }
+ }
+
+ /**
+ * Appends a token for '>>'.
+ * This method does not issue unmatched errors, because >> is also the
+ * shift operator. It does not necessarily have to close a group.
+ */
+ void appendGtGt(PrecedenceInfo info) {
+ appendPrecedenceToken(info);
+ if (groupingStack.isEmpty) return;
+ if (identical(groupingStack.head.kind, LT_TOKEN)) {
+ // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer
+ // '<', the inner '<' is left without endGroup.
+ groupingStack = groupingStack.tail;
+ }
+ if (groupingStack.isEmpty) return;
+ if (identical(groupingStack.head.kind, LT_TOKEN)) {
+ groupingStack.head.endGroup = tail;
+ groupingStack = groupingStack.tail;
+ }
+ }
+
+ void appendComment(start, bool asciiOnly) {
+ if (!includeComments) return;
+ appendSubstringToken(COMMENT_INFO, start, asciiOnly);
+ }
/**
* We call this method to discard '<' from the "grouping" stack
@@ -88,7 +345,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
* something which cannot possibly be part of a type
* parameter/argument list.
*/
- void discardOpenLt();
+ void discardOpenLt() {
+ while (!groupingStack.isEmpty
+ && identical(groupingStack.head.kind, LT_TOKEN)) {
+ groupingStack = groupingStack.tail;
+ }
+ }
// TODO(ahe): Move this class to implementation.
@@ -98,6 +360,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
next = bigSwitch(next);
}
appendEofToken();
+
+ if (file != null) {
+ file.length = stringOffset;
+ // One additional line start at the end, see [SourceFile.lineStarts].
+ lineStarts.add(stringOffset + 1);
+ file.lineStarts = lineStarts;
+ }
+
return firstToken();
}
@@ -107,8 +377,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
|| identical(next, $LF) || identical(next, $CR)) {
appendWhiteSpace(next);
next = advance();
+ // Sequences of spaces are common, so advance through them fast.
while (identical(next, $SPACE)) {
- appendWhiteSpace(next);
+ // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
+ // assuming that it does not do anything for space characters.
next = advance();
}
return next;
@@ -121,8 +393,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
return tokenizeKeywordOrIdentifier(next, true);
}
- if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$)) {
- return tokenizeIdentifier(next, byteOffset, true);
+ if (($A <= next && next <= $Z) ||
+ identical(next, $_) ||
+ identical(next, $$)) {
+ return tokenizeIdentifier(next, scanOffset, true);
}
if (identical(next, $LT)) {
@@ -187,12 +461,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
if (identical(next, $OPEN_PAREN)) {
- appendBeginGroup(OPEN_PAREN_INFO, "(");
+ appendBeginGroup(OPEN_PAREN_INFO);
return advance();
}
if (identical(next, $CLOSE_PAREN)) {
- return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN);
+ return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
}
if (identical(next, $COMMA)) {
@@ -218,7 +492,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
if (identical(next, $CLOSE_SQUARE_BRACKET)) {
- return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]",
+ return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
OPEN_SQUARE_BRACKET_TOKEN);
}
@@ -228,12 +502,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
if (identical(next, $OPEN_CURLY_BRACKET)) {
- appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{");
+ appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
return advance();
}
if (identical(next, $CLOSE_CURLY_BRACKET)) {
- return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}",
+ return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
OPEN_CURLY_BRACKET_TOKEN);
}
@@ -246,7 +520,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
if (identical(next, $DQ) || identical(next, $SQ)) {
- return tokenizeString(next, byteOffset, false);
+ return tokenizeString(next, scanOffset, false);
}
if (identical(next, $PERIOD)) {
@@ -268,7 +542,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
return $EOF;
}
if (next < 0x1f) {
- return error(new SourceString("unexpected character $next"));
+ return error("unexpected character $next");
+ }
+
+ if (next >= 128) {
+ next = currentAsUnicode(next);
}
// The following are non-ASCII characters.
@@ -278,16 +556,22 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
return advance();
}
- return tokenizeIdentifier(next, byteOffset, true);
+ return error("unexpected unicode character $next");
}
int tokenizeTag(int next) {
// # or #!.*[\n\r]
- if (byteOffset == 0) {
+ if (scanOffset == 0) {
if (identical(peek(), $BANG)) {
+ int start = scanOffset + 1;
+ bool asciiOnly = true;
do {
next = advance();
- } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF));
+ if (next > 127) asciiOnly = false;
+ } while (!identical(next, $LF) &&
+ !identical(next, $CR) &&
+ !identical(next, $EOF));
+ if (!asciiOnly) handleUnicode(start);
return next;
}
}
@@ -311,11 +595,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
next = advance();
if (identical(next, $CLOSE_SQUARE_BRACKET)) {
Token token = previousToken();
- if (token is KeywordToken && identical(token.value.stringValue, 'operator')) {
+ if (token is KeywordToken &&
+ identical((token as KeywordToken).keyword.syntax, 'operator')) {
return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
}
}
- appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "[");
+ appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
return next;
}
@@ -379,7 +664,6 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
}
-
int tokenizePlus(int next) {
// + ++ +=
next = advance();
@@ -396,7 +680,9 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
int tokenizeExclamation(int next) {
- // ! != !==
+ // ! !=
+ // !== is kept for user-friendly error reporting
+
next = advance();
if (identical(next, $EQ)) {
return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
@@ -406,7 +692,8 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
int tokenizeEquals(int next) {
- // = == ===
+ // = == =>
+ // === is kept for user-friendly error reporting
// Type parameters and arguments cannot contain any token that
// starts with '='.
@@ -424,7 +711,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
int tokenizeGreaterThan(int next) {
- // > >= >> >>= >>> >>>=
+ // > >= >> >>=
next = advance();
if (identical($EQ, next)) {
appendPrecedenceToken(GT_EQ_INFO);
@@ -435,11 +722,11 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
appendPrecedenceToken(GT_GT_EQ_INFO);
return advance();
} else {
- appendGtGt(GT_GT_INFO, ">>");
+ appendGtGt(GT_GT_INFO);
return next;
}
} else {
- appendGt(GT_INFO, ">");
+ appendGt(GT_INFO);
return next;
}
}
@@ -453,13 +740,13 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
} else if (identical($LT, next)) {
return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
} else {
- appendBeginGroup(LT_INFO, "<");
+ appendBeginGroup(LT_INFO);
return next;
}
}
int tokenizeNumber(int next) {
- int start = byteOffset;
+ int start = scanOffset;
while (true) {
next = advance();
if ($0 <= next && next <= $9) {
@@ -473,7 +760,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
return tokenizeFractionPart(advance(), start);
}
}
- appendByteStringToken(INT_INFO, asciiString(start, 0));
+ appendSubstringToken(INT_INFO, start, true);
return next;
}
}
@@ -482,14 +769,14 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
int tokenizeHexOrNumber(int next) {
int x = peek();
if (identical(x, $x) || identical(x, $X)) {
- advance();
- return tokenizeHex(x);
+ return tokenizeHex(next);
}
return tokenizeNumber(next);
}
int tokenizeHex(int next) {
- int start = byteOffset - 1;
+ int start = scanOffset;
+ next = advance(); // Advance past the $x or $X.
bool hasDigits = false;
while (true) {
next = advance();
@@ -499,16 +786,16 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
hasDigits = true;
} else {
if (!hasDigits) {
- return error(const SourceString("hex digit expected"));
+ return error("hex digit expected");
}
- appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0));
+ appendSubstringToken(HEXADECIMAL_INFO, start, true);
return next;
}
}
}
int tokenizeDotsOrNumber(int next) {
- int start = byteOffset;
+ int start = scanOffset;
next = advance();
if (($0 <= next && next <= $9)) {
return tokenizeFractionPart(next, start);
@@ -538,15 +825,18 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
next = advance();
}
if (!hasDigit) {
- appendByteStringToken(INT_INFO, asciiString(start, -1));
+ // Reduce offset, we already advanced to the token past the period.
+ appendSubstringToken(INT_INFO, start, true, -1);
+
+ // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
+ // the scanner already advanced past the period.
if (identical($PERIOD, next)) {
return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
}
- // TODO(ahe): Wrong offset for the period.
appendPrecedenceToken(PERIOD_INFO);
- return bigSwitch(next);
+ return next;
}
- appendByteStringToken(DOUBLE_INFO, asciiString(start, 0));
+ appendSubstringToken(DOUBLE_INFO, start, true);
return next;
}
@@ -560,7 +850,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
hasDigits = true;
} else {
if (!hasDigits) {
- return error(const SourceString("digit expected"));
+ return error("digit expected");
}
return next;
}
@@ -569,11 +859,12 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
int tokenizeSlashOrComment(int next) {
+ int start = scanOffset;
next = advance();
if (identical($STAR, next)) {
- return tokenizeMultiLineComment(next);
+ return tokenizeMultiLineComment(next, start);
} else if (identical($SLASH, next)) {
- return tokenizeSingleLineComment(next);
+ return tokenizeSingleLineComment(next, start);
} else if (identical($EQ, next)) {
appendPrecedenceToken(SLASH_EQ_INFO);
return advance();
@@ -583,30 +874,41 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
}
- int tokenizeSingleLineComment(int next) {
+ int tokenizeSingleLineComment(int next, int start) {
+ bool asciiOnly = true;
while (true) {
next = advance();
- if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) {
- appendComment();
+ if (next > 127) asciiOnly = false;
+ if (identical($LF, next) ||
+ identical($CR, next) ||
+ identical($EOF, next)) {
+ if (!asciiOnly) handleUnicode(start);
+ appendComment(start, asciiOnly);
return next;
}
}
}
- int tokenizeMultiLineComment(int next) {
+
+ int tokenizeMultiLineComment(int next, int start) {
+ bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
+ bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
+ int unicodeStart = start;
int nesting = 1;
next = advance();
while (true) {
if (identical($EOF, next)) {
- // TODO(ahe): Report error.
+ if (!asciiOnlyLines) handleUnicode(unicodeStart);
+ appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");
return next;
} else if (identical($STAR, next)) {
next = advance();
if (identical($SLASH, next)) {
--nesting;
if (0 == nesting) {
+ if (!asciiOnlyLines) handleUnicode(unicodeStart);
next = advance();
- appendComment();
+ appendComment(start, asciiOnlyComment);
return next;
} else {
next = advance();
@@ -618,16 +920,30 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
next = advance();
++nesting;
}
+ } else if (identical(next, $LF)) {
+ if (!asciiOnlyLines) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLines = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ next = advance();
} else {
+ if (next > 127) {
+ asciiOnlyLines = false;
+ asciiOnlyComment = false;
+ }
next = advance();
}
}
}
int tokenizeRawStringKeywordOrIdentifier(int next) {
+ // [next] is $r.
int nextnext = peek();
if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
- int start = byteOffset;
+ int start = scanOffset;
next = advance();
return tokenizeString(next, start, true);
}
@@ -636,7 +952,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
KeywordState state = KeywordState.KEYWORD_STATE;
- int start = byteOffset;
+ int start = scanOffset;
while (state != null && $a <= next && next <= $z) {
state = state.next(next);
next = advance();
@@ -649,17 +965,17 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
identical(next, $_) ||
identical(next, $$)) {
return tokenizeIdentifier(next, start, allowDollar);
- } else if (next < 128) {
+ } else {
appendKeywordToken(state.keyword);
return next;
- } else {
- return tokenizeIdentifier(next, start, allowDollar);
}
}
+ /**
+ * [allowDollar] can exclude '$', which is not allowed as part of a string
+ * interpolation identifier.
+ */
int tokenizeIdentifier(int next, int start, bool allowDollar) {
- bool isAscii = true;
-
while (true) {
if (($a <= next && next <= $z) ||
($A <= next && next <= $Z) ||
@@ -667,35 +983,21 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
identical(next, $_) ||
(identical(next, $$) && allowDollar)) {
next = advance();
- } else if ((next < 128) || (identical(next, $NBSP))) {
+ } else {
// Identifier ends here.
- if (start == byteOffset) {
- return error(const SourceString("expected identifier"));
- } else if (isAscii) {
- appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));
+ if (start == scanOffset) {
+ return error("expected identifier");
} else {
- appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1));
+ appendSubstringToken(IDENTIFIER_INFO, start, true);
}
return next;
- } else {
- int nonAsciiStart = byteOffset;
- do {
- next = nextByte();
- if (identical(next, $NBSP)) break;
- } while (next > 127);
- String string = utf8String(nonAsciiStart, -1).slowToString();
- isAscii = false;
- int byteLength = nonAsciiStart - byteOffset;
- addToCharOffset(string.length - byteLength);
}
}
}
int tokenizeAt(int next) {
- int start = byteOffset;
- next = advance();
appendPrecedenceToken(AT_INFO);
- return next;
+ return advance();
}
int tokenizeString(int next, int start, bool raw) {
@@ -708,7 +1010,7 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
return tokenizeMultiLineString(quoteChar, start, raw);
} else {
// Empty string.
- appendByteStringToken(STRING_INFO, utf8String(start, -1));
+ appendSubstringToken(STRING_INFO, start, true);
return next;
}
}
@@ -719,56 +1021,72 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
}
- static bool isHexDigit(int character) {
- if ($0 <= character && character <= $9) return true;
- character |= 0x20;
- return ($a <= character && character <= $f);
- }
-
+ /**
+ * [next] is the first character after the quote.
+ * [start] is the scanOffset of the quote.
+ *
+ * The token contains a substring of the source file, including the
+ * string quotes, backslashes for escaping. For interpolated strings,
+ * the parts before and after are separate tokens.
+ *
+ * "a $b c"
+ *
+ * gives StringToken("a $), StringToken(b) and StringToken( c").
+ */
int tokenizeSingleLineString(int next, int quoteChar, int start) {
+ bool asciiOnly = true;
while (!identical(next, quoteChar)) {
if (identical(next, $BACKSLASH)) {
next = advance();
} else if (identical(next, $$)) {
- next = tokenizeStringInterpolation(start);
- start = byteOffset;
+ if (!asciiOnly) handleUnicode(start);
+ next = tokenizeStringInterpolation(start, asciiOnly);
+ start = scanOffset;
+ asciiOnly = true;
continue;
}
if (next <= $CR
- && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) {
- return error(const SourceString("unterminated string literal"));
+ && (identical(next, $LF) ||
+ identical(next, $CR) ||
+ identical(next, $EOF))) {
+ if (!asciiOnly) handleUnicode(start);
+ return error("unterminated string literal");
}
+ if (next > 127) asciiOnly = false;
next = advance();
}
- appendByteStringToken(STRING_INFO, utf8String(start, 0));
- return advance();
+ if (!asciiOnly) handleUnicode(start);
+ // Advance past the quote character.
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
+ return next;
}
- int tokenizeStringInterpolation(int start) {
- appendByteStringToken(STRING_INFO, utf8String(start, -1));
+ int tokenizeStringInterpolation(int start, bool asciiOnly) {
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
beginToken(); // $ starts here.
int next = advance();
if (identical(next, $OPEN_CURLY_BRACKET)) {
- return tokenizeInterpolatedExpression(next, start);
+ return tokenizeInterpolatedExpression(next);
} else {
- return tokenizeInterpolatedIdentifier(next, start);
+ return tokenizeInterpolatedIdentifier(next);
}
}
- int tokenizeInterpolatedExpression(int next, int start) {
- appendBeginGroup(STRING_INTERPOLATION_INFO, "\${");
+ int tokenizeInterpolatedExpression(int next) {
+ appendBeginGroup(STRING_INTERPOLATION_INFO);
beginToken(); // The expression starts here.
- next = advance();
+ next = advance(); // Move past the curly bracket.
while (!identical(next, $EOF) && !identical(next, $STX)) {
next = bigSwitch(next);
}
if (identical(next, $EOF)) return next;
- next = advance();
+ next = advance(); // Move past the $STX.
beginToken(); // The string interpolation suffix starts here.
return next;
}
- int tokenizeInterpolatedIdentifier(int next, int start) {
+ int tokenizeInterpolatedIdentifier(int next) {
appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
beginToken(); // The identifier starts here.
next = tokenizeKeywordOrIdentifier(next, false);
@@ -777,23 +1095,45 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
}
int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
- next = advance();
+ bool asciiOnly = true;
+ next = advance(); // Advance past the quote
while (next != $EOF) {
if (identical(next, quoteChar)) {
- appendByteStringToken(STRING_INFO, utf8String(start, 0));
- return advance();
+ if (!asciiOnly) handleUnicode(start);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnly);
+ return next;
} else if (identical(next, $LF) || identical(next, $CR)) {
- return error(const SourceString("unterminated string literal"));
+ if (!asciiOnly) handleUnicode(start);
+ return error("unterminated string literal");
+ } else if (next > 127) {
+ asciiOnly = false;
}
next = advance();
}
- return error(const SourceString("unterminated string literal"));
+ if (!asciiOnly) handleUnicode(start);
+ return error("unterminated string literal");
}
int tokenizeMultiLineRawString(int quoteChar, int start) {
- int next = advance();
+ bool asciiOnlyString = true;
+ bool asciiOnlyLine = true;
+ int unicodeStart = start;
+ int next = advance(); // Advance past the (last) quote (of three)
outer: while (!identical(next, $EOF)) {
while (!identical(next, quoteChar)) {
+ if (identical(next, $LF)) {
+ if (!asciiOnlyLine) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLine = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ } else if (next > 127) {
+ asciiOnlyLine = false;
+ asciiOnlyString = false;
+ }
next = advance();
if (identical(next, $EOF)) break outer;
}
@@ -801,21 +1141,31 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
if (identical(next, quoteChar)) {
next = advance();
if (identical(next, quoteChar)) {
- appendByteStringToken(STRING_INFO, utf8String(start, 0));
- return advance();
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);
+ return next;
}
}
}
- return error(const SourceString("unterminated string literal"));
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ return error("unterminated string literal");
}
int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
if (raw) return tokenizeMultiLineRawString(quoteChar, start);
- int next = advance();
+ bool asciiOnlyString = true;
+ bool asciiOnlyLine = true;
+ int unicodeStart = start;
+ int next = advance(); // Advance past the (last) quote (of three).
while (!identical(next, $EOF)) {
if (identical(next, $$)) {
- next = tokenizeStringInterpolation(start);
- start = byteOffset;
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = tokenizeStringInterpolation(start, asciiOnlyString);
+ start = scanOffset;
+ unicodeStart = start;
+ asciiOnlyString = true; // A new string token is created for the rest.
+ asciiOnlyLine = true;
continue;
}
if (identical(next, quoteChar)) {
@@ -823,8 +1173,10 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
if (identical(next, quoteChar)) {
next = advance();
if (identical(next, quoteChar)) {
- appendByteStringToken(STRING_INFO, utf8String(start, 0));
- return advance();
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ next = advance();
+ appendSubstringToken(STRING_INFO, start, asciiOnlyString);
+ return next;
}
}
continue;
@@ -833,13 +1185,53 @@ abstract class AbstractScanner<T extends SourceString> implements Scanner {
next = advance();
if (identical(next, $EOF)) break;
}
+ if (identical(next, $LF)) {
+ if (!asciiOnlyLine) {
+ // Synchronize the string offset in the utf8 scanner.
+ handleUnicode(unicodeStart);
+ asciiOnlyLine = true;
+ unicodeStart = scanOffset;
+ }
+ lineFeedInMultiline();
+ } else if (next > 127) {
+ asciiOnlyString = false;
+ asciiOnlyLine = false;
+ }
next = advance();
}
- return error(const SourceString("unterminated string literal"));
+ if (!asciiOnlyLine) handleUnicode(unicodeStart);
+ return error("unterminated string literal");
}
- int error(SourceString message) {
- appendByteStringToken(BAD_INPUT_INFO, message);
+ int error(String message) {
+ appendStringToken(BAD_INPUT_INFO, message);
return advance(); // Ensure progress.
}
+
+ void unmatchedBeginGroup(BeginGroupToken begin) {
+ String error = 'unmatched "${begin.stringValue}"';
+ Token close =
+ new StringToken.fromString(
+ BAD_INPUT_INFO, error, begin.charOffset, true);
+
+ // We want to ensure that unmatched BeginGroupTokens are reported
+ // as errors. However, the rest of the parser assumes the groups
+ // are well-balanced and will never look at the endGroup
+ // token. This is a nice property that allows us to skip quickly
+ // over correct code. By inserting an additional error token in
+ // the stream, we can keep ignoring endGroup tokens.
+ //
+ // [begin] --next--> [tail]
+ // [begin] --endG--> [close] --next--> [next] --next--> [tail]
+ //
+ // This allows the parser to skip from [begin] via endGroup to [close] and
+ // ignore the [close] token (assuming it's correct), then the error will be
+ // reported when parsing the [next] token.
+
+ Token next = new StringToken.fromString(
+ BAD_INPUT_INFO, error, begin.charOffset, true);
+ begin.endGroup = close;
+ close.next = next;
+ next.next = begin.next;
+ }
}

Powered by Google App Engine
This is Rietveld 408576698