sdk/lib/_internal/compiler/implementation/scanner/scanner.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Unified Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: fixes compiler tests Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« sdk/lib/_internal/compiler/implementation/scanner/array_based_scanner.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/parser.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/scanner_task.dart » ('j') | sdk/lib/_internal/compiler/implementation/scanner/scanner_task.dart » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

diff --git a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

index 16c238433f068319c29ef81d7ec4e29d53a80eae..771290b729ed65bd80ef6de53b5df25aae98ebbf 100644

--- a/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

+++ b/sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

@@ -6,88 +6,185 @@ part of scanner;

abstract class Scanner {

Token tokenize();

+ factory Scanner(SourceFile file, {bool includeComments: false}) {

+ if (file is Utf8BytesSourceFile) {

+ return new Utf8BytesScanner(file, includeComments: includeComments);

+ } else {

+ return new StringScanner(file, includeComments: includeComments);

+ }

}

-/**

- * Common base class for a Dart scanner.

- */

-abstract class AbstractScanner<T extends SourceString> implements Scanner {

- int advance();

- int nextByte();

+abstract class AbstractScanner implements Scanner {

+ final bool includeComments;

/**

- * Returns the current character or byte depending on the underlying input

- * kind. For example, [StringScanner] operates on [String] and thus returns

- * characters (Unicode codepoints represented as int) whereas

- * [ByteArrayScanner] operates on byte arrays and thus returns bytes.

+ * The string offset for the next token that will be created.

+ *

+ * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values

+ * are different. One string character can be encoded using multiple UTF-8

+ * bytes.

- int peek();

+ int tokenStart = -1;

/**

- * Appends a fixed token based on whether the current char is [choice] or not.

- * If the current char is [choice] a fixed token whose kind and content

- * is determined by [yes] is appended, otherwise a fixed token whose kind

- * and content is determined by [no] is appended.

+ * A pointer to the token stream created by this scanner. The first token

+ * is a special token and not part of the source file. This is an

+ * implementation detail to avoid special cases in the scanner. This token

+ * is not exposed to clients of the scanner, which are expected to invoke

+ * [firstToken] to access the token stream.

- int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);

+ final Token tokens = new SymbolToken(EOF_INFO, -1);

/**

- * Appends a fixed token whose kind and content is determined by [info].

+ * A pointer to the last scanned token.

- void appendPrecedenceToken(PrecedenceInfo info);

+ Token tail;

/**

- * Appends a token whose kind is determined by [info] and content is [value].

+ * The source file that is being scanned. This field can be [:null:].

+ * If the source file is available, the scanner assigns its [:lineStarts:] and

+ * [:length:] fields at the end of [tokenize].

- void appendStringToken(PrecedenceInfo info, String value);

+ final SourceFile file;

+ final List<int> lineStarts = [0];

ngeoffray 2013/10/18 10:19:37 <int>[0]

lukas 2013/10/24 16:48:36 Done.

+ AbstractScanner(this.file, this.includeComments) {

+ this.tail = this.tokens;

+ }

ngeoffray 2013/10/18 10:19:37 Extra line.

lukas 2013/10/24 16:48:36 Done.

+ /**

+ * Advances and returns the next character.

+ *

+ * If the next character is non-ASCII, then the returned value depends on the

+ * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while

+ * the [StringScanner] returns a UTF-16 code unit.

+ *

+ * The scanner ensures that [advance] is not invoked after it returned [$EOF].

+ * This allows implementations to omit bound checks if the data structure ends

+ * with '0'.

+ */

+ int advance();

/**

- * Appends a token whose kind is determined by [info] and content is defined

- * by the SourceString [value].

+ * Returns the current unicode character.

+ *

+ * If the current character is ASCII, then it is returned unchanged.

+ *

+ * The [Utf8BytesScanner] decodes the next unicode code point starting at the

+ * current position. Note that every unicode character is returned as a single

+ * code point, i.e., for '\u{1d11e}' it returns 119070, and the following

+ * [advance] returns the next character.

+ *

+ * The [StringScanner] returns the current character unchanged, which might

+ * be a surrogate character. In the case of '\u{1d11e}', it returns the first

+ * code unit 55348, and the following [advance] returns the second code unit

+ * 56606.

+ *

+ * Invoking [currentAsUnicode] multiple times is safe, i.e.,

ngeoffray 2013/10/18 10:19:37 i.e. -> that is

lukas 2013/10/24 16:48:36 Done.

+ * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].

- void appendByteStringToken(PrecedenceInfo info, T value);

+ int currentAsUnicode(int next);

/**

- * Appends a keyword token whose kind is determined by [keyword].

+ * Returns the character at the next position. Like in [advance], the

+ * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns

+ * a UTF-16 code unit.

- void appendKeywordToken(Keyword keyword);

- void appendWhiteSpace(int next);

- void appendEofToken();

+ int peek();

+ /**

+ * Notifies the scanner that unicode characters were detected in either a

+ * comment or a string literal between [startScanOffset] and the current

+ * scan offset.

+ */

+ void handleUnicode(int startScanOffset);

+ /**

+ * Returns the current scan offset.

+ *

+ * In the [Utf8BytesScanner] this is the offset into the byte list, in the

+ * [StringScanner] the offset in the source string.

+ */

+ int get scanOffset;

/**

- * Creates an ASCII SourceString whose content begins at the source byte

- * offset [start] and ends at [offset] bytes from the current byte offset of

- * the scanner. For example, if the current byte offset is 10,

- * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found

- * at the [0,9[ byte interval of the source text.

+ * Returns the current string offset.

+ *

+ * In the [StringScanner] this is identical to the [scanOffset]. In the

+ * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.

+ */

+ int get stringOffset;

+ /**

+ * Returns the first token scanned by this [Scanner].

- T asciiString(int start, int offset);

- T utf8String(int start, int offset);

Token firstToken();

+ /**

+ * Returns the last token scanned by this [Scanner].

+ */

Token previousToken();

- void beginToken();

- void addToCharOffset(int offset);

- int get charOffset;

- int get byteOffset;

- void appendBeginGroup(PrecedenceInfo info, String value);

- int appendEndGroup(PrecedenceInfo info, String value, int openKind);

- void appendGt(PrecedenceInfo info, String value);

- void appendGtGt(PrecedenceInfo info, String value);

- void appendGtGtGt(PrecedenceInfo info, String value);

- void appendComment();

/**

- * We call this method to discard '<' from the "grouping" stack

- * (maintained by subclasses).

- *

- * [PartialParser.skipExpression] relies on the fact that we do not

- * create groups for stuff like:

- * [:a = b < c, d = e > f:].

+ * Notifies that a new token starts at current offset.

+ */

+ void beginToken() {

+ tokenStart = stringOffset;

+ }

+ /**

+ * Appends a substring from the scan offset [:start:] to the current

+ * [:scanOffset:] plus the [:extraOffset:]. For example, if the current

+ * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the

+ * substring [5,9).

- * In other words, this method is called when the scanner recognizes

- * something which cannot possibly be part of a type

- * parameter/argument list.

+ * Note that [extraOffset] can only be used if the covered character(s) are

+ * known to be ASCII.

+ void appendSubstringToken(PrecedenceInfo info, int start,

+ bool asciiOnly, [int extraOffset]);