sdk/lib/_internal/compiler/implementation/scanner/token.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Unified Diff: sdk/lib/_internal/compiler/implementation/scanner/token.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: fixes compiler tests Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« sdk/lib/_internal/compiler/implementation/scanner/string_scanner.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/string_scanner.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart » ('j') | sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: sdk/lib/_internal/compiler/implementation/scanner/token.dart

diff --git a/sdk/lib/_internal/compiler/implementation/scanner/token.dart b/sdk/lib/_internal/compiler/implementation/scanner/token.dart

index a94a2d4410d952faf44216fb859e21b37993d466..d9909e3d79a16bdb529585466ce386aa7b12143d 100644

--- a/sdk/lib/_internal/compiler/implementation/scanner/token.dart

+++ b/sdk/lib/_internal/compiler/implementation/scanner/token.dart

@@ -79,36 +79,54 @@ const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1;

/**

* A token that doubles as a linked list.

-class Token implements Spannable {

- /**

- * The precedence info for this token. [info] determines the kind and the

- * precedence level of this token.

- */

- final PrecedenceInfo info;

+abstract class Token implements Spannable {

/**

* The character offset of the start of this token within the source text.

final int charOffset;

+ Token(this.charOffset);

/**

* The next token in the token stream.

Token next;

- Token(this.info, this.charOffset);

+ /**

+ * The precedence info for this token. [info] determines the kind and the

+ * precedence level of this token.

+ *

+ * Defined as getter to save a field in the [KeywordToken] subclass.

+ */

+ PrecedenceInfo get info;

- get value => info.value;

+ /**

+ * The string represented by this token, a substring of the source code.

+ *

+ * For [StringToken]s the value includes the quotes, explicit escapes, etc.

ngeoffray 2013/10/18 10:19:37 the value -> [value]

lukas 2013/10/24 16:48:36 Done.

+ *

+ */

+ String get value;

/**

- * Returns the string value for keywords and symbols. For instance 'class' for

- * the [CLASS] keyword token and '*' for a [Token] based on [STAR_INFO]. For

- * other tokens, such identifiers, strings, numbers, etc, [stringValue]

- * returns [:null:].

+ * For symbol and keyword tokens, returns the string value reprenseted by this

ngeoffray 2013/10/18 10:19:37 represented

lukas 2013/10/24 16:48:36 Done.

+ * token. For [StringToken]s this method returns [:null:].

+ *

+ * For [SymbolToken]s and [KeywordToken]s, the string value is a compile-time

+ * constant originating in the [PrecedenceInfo] or in the [Keyword] instance.

+ * This allows testing for keywords and symbols using [:identical:], e.g.,

+ * [:identical('class', token.stringValue):].

- * [stringValue] should only be used for testing keywords and symbols.

+ * Note that returning [:null:] for string tokens is important to identify

+ * symbols and keywords, we cannot use [value] instead. The string literal

+ * "$a($b"

+ * produces ..., SymbolToken($), StringToken(a), StringToken((), ...

+ *

+ * After parsing the identifier 'a', the parser tests for a function

+ * declaration using [:identical(next.stringValue, '('):], which (rightfully)

+ * returns false because stringValue returns [:null:].

- String get stringValue => info.value.stringValue;

+ String get stringValue;

/**

* The kind enum of this token as determined by its [info].

@@ -120,31 +138,32 @@ class Token implements Spannable {

int get precedence => info.precedence;

- bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);

+ /**

+ * True if this token is an identifier. Some keywords are allowed as identifiers,

+ * see implementaiton in [KeywordToken].

ngeoffray 2013/10/18 10:19:37 implementation

lukas 2013/10/24 16:48:36 Done.

+ */

+ bool isIdentifier();

/**

* Returns a textual representation of this token to be used for debugging

* purposes. The resulting string might contain information about the

* structure of the token, for example 'StringToken(foo)' for the identifier

- * token 'foo'. Use [slowToString] for the text actually parsed by the token.

- */

- String toString() => info.value.toString();

- /**

- * The text parsed by this token.

+ * token 'foo'.

+ *

+ * Use [value] for the text actually parsed by the token.

- String slowToString() => toString();

+ String toString();

/**

* The number of characters parsed by this token.

- int get slowCharCount {

+ int get charCount {

if (info == BAD_INPUT_INFO) {

// This is a token that wraps around an error message. Return 1

// instead of the size of the length of the error message.

return 1;

} else {

- return slowToString().length;

+ return value.length;

}

@@ -152,129 +171,230 @@ class Token implements Spannable {

}

/**

- * A keyword token.

+ * A symbol token represents the symbol in its precedence info.

ngeoffray 2013/10/18 10:19:37 symbol token -> [SymbolToken]

lukas 2013/10/24 16:48:36 Done.

+ * Also used for end of file with EOF_INFO.

-class KeywordToken extends Token {

- final Keyword value;

- String get stringValue => value.syntax;

+class SymbolToken extends Token {

- KeywordToken(Keyword value, int charOffset)

- : this.value = value, super(value.info, charOffset);

+ final PrecedenceInfo info;

+ SymbolToken(this.info, int charOffset) : super(charOffset);

+ String get value => info.value;

- bool isIdentifier() => value.isPseudo || value.isBuiltIn;

+ String get stringValue => info.value;

- String toString() => value.syntax;

+ bool isIdentifier() => false;

+ String toString() => "SymbolToken($value)";

}

/**

- * A String-valued token.

+ * A [BeginGroupToken] reprsents a symbol that may be the beginning of

ngeoffray 2013/10/18 10:19:37 represents

lukas 2013/10/24 16:48:36 Done.

+ * a pair of brackets, i.e., ( { [ < or ${

+ * The [endGroup] token points to the matching closing bracket in case

+ * it can be identified during scanning.

-class StringToken extends Token {

- final SourceString value;

- StringToken(PrecedenceInfo info, String value, int charOffset)

- : this.fromSource(info, new SourceString(value), charOffset);

+class BeginGroupToken extends SymbolToken {

+ Token endGroup;

- StringToken.fromSource(PrecedenceInfo info, this.value, int charOffset)

+ BeginGroupToken(PrecedenceInfo info, int charOffset)

: super(info, charOffset);

ngeoffray 2013/10/18 10:19:37 Fits in one line?

lukas 2013/10/24 16:48:36 No :)

- String toString() => "StringToken(${value.slowToString()})";

- String slowToString() => value.slowToString();

}

-abstract class SourceString extends IterableBase<int> {

- const factory SourceString(String string) = StringWrapper;

+/**

+ * A keyword token.

+ */

+class KeywordToken extends Token {

+ final Keyword keyword;

- static final Map<String, StringWrapper> canonicalizedValues =

- new Map<String, StringWrapper>();

+ KeywordToken(this.keyword, int charOffset) : super(charOffset);

- factory SourceString.fromSubstring(String string, int begin, int end) {

- var substring = string.substring(begin, end);

- return canonicalizedValues.putIfAbsent(

- substring, () => new StringWrapper(substring));

- }

+ PrecedenceInfo get info => keyword.info;

- void printOn(StringBuffer sb);

+ String get value => keyword.syntax;

- /** Gives a [SourceString] that is not including the [initial] first and

- * [terminal] last characters. This is only intended to be used to remove

- * quotes from string literals (including an initial '@' for raw strings).

- */

- SourceString copyWithoutQuotes(int initial, int terminal);

+ String get stringValue => keyword.syntax;

- String get stringValue;

+ bool isIdentifier() => keyword.isPseudo || keyword.isBuiltIn;

- String slowToString();

- bool get isEmpty;

- bool isPrivate();

+ String toString() => "KeywordToken($value)";

}

-class StringWrapper extends IterableBase<int> implements SourceString {

- final String stringValue;

+/**

+ * A String-valued token. Represents identifiers, string literals,

+ * number literals, comments and error tokens, using the corresponding

ngeoffray 2013/10/18 10:19:37 comments, and ...

lukas 2013/10/24 16:48:36 Done.

+ * precedence info.

+ */

+class StringToken extends Token {

+ /**

+ * The length threshold above which substring tokens are computed lazily.

+ *

+ * For string tokens that are substrings of the program source, the actual

+ * substring extraction is performed lazily. This is beneficial because

+ * not all scanned code is actually used. For unused parts, the substrings

+ * are never computed and allocated.

+ */

+ static const int LAZY_THRESHOLD = 4;

sra1 2013/10/22 19:52:31 How did you calculate this threshold?

lukas 2013/10/23 07:11:01 Short strings have a smaller footprint than a Comp

- const StringWrapper(this.stringValue);

+ var valueOrLazySubstring;

ngeoffray 2013/10/18 10:19:37 You could put the union type of this field in comm

lukas 2013/10/24 16:48:36 Done.

- int get hashCode => stringValue.hashCode;

+ final PrecedenceInfo info;

+ /**

+ * Creates a non-lazy string token. If [canonicalize] is true, the string

+ * is canonicalized before the token is created.

+ */

+ StringToken.fromString(this.info, String value, int charOffset,

+ [bool canonicalize = false])

ngeoffray 2013/10/18 10:19:37 Make it a named parameter? Easier when reading cal

lukas 2013/10/24 16:48:36 Done.

+ : valueOrLazySubstring = canonicalizedString(value, canonicalize),

+ super(charOffset);

- bool operator ==(other) {

- return other is SourceString && toString() == other.slowToString();

+ /**

+ * Creates a lazy string token. If [canonicalize] is true, the string

+ * is canonicalized before the token is created.

+ */

+ StringToken.fromSubstring(this.info, String data, int start, int end,

+ int charOffset, [bool canonicalize = false])

ngeoffray 2013/10/18 10:19:37 ditto

lukas 2013/10/24 16:48:36 Done.

+ : super(charOffset) {

+ int length = end - start;

+ if (length <= LAZY_THRESHOLD) {

+ valueOrLazySubstring = canonicalizedString(data.substring(start, end),

+ canonicalize);

ngeoffray 2013/10/18 10:19:37 indentation.

lukas 2013/10/24 16:48:36 Done.

+ } else {

+ valueOrLazySubstring =

+ new LazySubstring(data, start, length, canonicalize);

+ }

}

- Iterator<int> get iterator => new StringCodeIterator(stringValue);

+ /**

+ * Creates a lazy string token. If [asciiOnly] is false, the byte array

+ * is passed through a UTF-8 decoder.

+ */

+ StringToken.fromUtf8Bytes(this.info, List<int> data, int start, int end,

+ bool asciiOnly, int charOffset)

+ : super(charOffset) {

+ int length = end - start;

+ if (length <= LAZY_THRESHOLD) {

+ valueOrLazySubstring = decodeUtf8(data, start, end, asciiOnly);

+ } else {

+ valueOrLazySubstring = new LazySubstring(data, start, length, asciiOnly);

+ }

- void printOn(StringBuffer sb) {

- sb.write(stringValue);

+ String get value {

+ if (valueOrLazySubstring is String) {

+ return valueOrLazySubstring;

+ } else {

+ assert(valueOrLazySubstring is LazySubstring);

+ var data = valueOrLazySubstring.data;

+ int start = valueOrLazySubstring.start;

+ int end = start + valueOrLazySubstring.length;

+ if (data is String) {

+ valueOrLazySubstring = canonicalizedString(

+ data.substring(start, end), valueOrLazySubstring.boolValue);

+ } else {

+ valueOrLazySubstring = decodeUtf8(

+ data, start, end, valueOrLazySubstring.boolValue);

+ }

+ return valueOrLazySubstring;

+ }

}

- String toString() => stringValue;

+ String get stringValue => null;

- String slowToString() => stringValue;

+ bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);

- SourceString copyWithoutQuotes(int initial, int terminal) {

- assert(0 <= initial);

- assert(0 <= terminal);

- assert(initial + terminal <= stringValue.length);

- return new StringWrapper(

- stringValue.substring(initial, stringValue.length - terminal));

- }

+ String toString() => "StringToken($value)";

- bool get isEmpty => stringValue.isEmpty;

+ static final HashSet<String> canonicalizedSubstrings =

+ new HashSet();

ngeoffray 2013/10/18 10:19:37 HashSet<String>()

lukas 2013/10/24 16:48:36 Done.

- bool isPrivate() => !isEmpty && stringValue.codeUnitAt(0) == $_;

+ static String canonicalizedString(String s, bool canonicalize) {

+ if (!canonicalize) return s;

+ var result = canonicalizedSubstrings.lookup(s);

+ if (result != null) return result;

+ canonicalizedSubstrings.add(s);

+ return s;

+ }

+ static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {

+ var s;

+ if (asciiOnly) {

+ // getRange returns a lazy iterable; it does not copy the data.

+ s = new String.fromCharCodes(data.getRange(start, end));

+ } else {

+ // TODO(lry): this is measurably slow. Also sublist is copied eagerly.

+ var bytes = data.sublist(start, end);

+ s = UTF8.decode(bytes);

+ }

+ return canonicalizedString(s, true);

+ }

}

-class StringCodeIterator implements Iterator<int> {

- final String string;

- int index;

- final int end;

- int _current;

+/**

+ * This class represents the necessary information to compute a substring

+ * lazily. The substring can either originate in a string or in a [:List<int>:]

ngeoffray 2013/10/18 10:19:37 originate in -> originate from?

lukas 2013/10/24 16:48:36 Done.

+ * of UTF-8 bytes.

+ */

+abstract class LazySubstring {

+ /** The original data, either a string or a List<int> */

+ get data;

- StringCodeIterator(String string) :

- this.string = string, index = 0, end = string.length;

+ int get start;

+ int get length;

- StringCodeIterator.substring(this.string, this.index, this.end) {

- assert(0 <= index);

- assert(index <= end);

- assert(end <= string.length);

+ /**

+ * If this substring is based on a String, the boolean indicates whether the

ngeoffray 2013/10/18 10:19:37 the boolean -> [boolValue]

lukas 2013/10/24 16:48:36 Done.

+ * resulting substring should be canonicalized.

+ *

+ * For substrings based on a byte array, the boolean value is true if the

ngeoffray 2013/10/18 10:19:37 ditto

lukas 2013/10/24 16:48:36 Done.

+ * array only holds ASCII characters. The resulting substring will be

+ * canonicalized after decoding.

+ */

+ bool get boolValue;

+ LazySubstring.internal();

+ factory LazySubstring(data, int start, int length, bool b) {

+ // See comment on [CompactLazySubstring].

+ if (start < 0x100000 && length < 0x200) {

+ int fields = (start << 9);

+ fields = fields | length;

+ fields = fields << 1;

+ if (b) fields |= 1;

+ return new CompactLazySubstring(data, fields);

+ } else {

+ return new FullLazySubstring(data, start, length, b);

+ }

}

- int get current => _current;

+/**

+ * This class encodes [start], [length] and [boolValue] in a single

+ * 30 bit integer. It uses 20 bits for [start], which covers source files

+ * of 1M. [length] has 9 bits, which covers 512 characters.

ngeoffray 2013/10/18 10:19:37 1M -> 1MB.

lukas 2013/10/24 16:48:36 Done.

+ *

+ * The file html_dart2js.dart is currently around 1M.

ngeoffray 2013/10/18 10:19:37 1M -> 1MB

lukas 2013/10/24 16:48:36 Done.

+ */

+class CompactLazySubstring extends LazySubstring {

+ final data;

+ final int fields;

- bool moveNext() {

- _current = null;

- if (index >= end) return false;

- _current = string.codeUnitAt(index++);

- return true;

- }

+ CompactLazySubstring(this.data, this.fields) : super.internal();

+ int get start => fields >> 10;

+ int get length => (fields >> 1) & 0x1ff;

+ bool get boolValue => (fields & 1) == 1;

}

-class BeginGroupToken extends StringToken {

- Token endGroup;

- BeginGroupToken(PrecedenceInfo info, String value, int charOffset)

- : super(info, value, charOffset);

+class FullLazySubstring extends LazySubstring {

+ final data;

+ final int start;

+ final int length;

+ final bool boolValue;

+ FullLazySubstring(this.data, this.start, this.length, this.boolValue)

+ : super.internal();

}

bool isUserDefinableOperator(String value) {