Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(734)

Unified Diff: sdk/lib/_internal/compiler/implementation/scanner/token.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: fixes compiler tests Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sdk/lib/_internal/compiler/implementation/scanner/token.dart
diff --git a/sdk/lib/_internal/compiler/implementation/scanner/token.dart b/sdk/lib/_internal/compiler/implementation/scanner/token.dart
index a94a2d4410d952faf44216fb859e21b37993d466..d9909e3d79a16bdb529585466ce386aa7b12143d 100644
--- a/sdk/lib/_internal/compiler/implementation/scanner/token.dart
+++ b/sdk/lib/_internal/compiler/implementation/scanner/token.dart
@@ -79,36 +79,54 @@ const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1;
/**
* A token that doubles as a linked list.
*/
-class Token implements Spannable {
- /**
- * The precedence info for this token. [info] determines the kind and the
- * precedence level of this token.
- */
- final PrecedenceInfo info;
-
+abstract class Token implements Spannable {
/**
* The character offset of the start of this token within the source text.
*/
final int charOffset;
+ Token(this.charOffset);
+
/**
* The next token in the token stream.
*/
Token next;
- Token(this.info, this.charOffset);
+ /**
+ * The precedence info for this token. [info] determines the kind and the
+ * precedence level of this token.
+ *
+ * Defined as getter to save a field in the [KeywordToken] subclass.
+ */
+ PrecedenceInfo get info;
- get value => info.value;
+ /**
+ * The string represented by this token, a substring of the source code.
+ *
+ * For [StringToken]s the value includes the quotes, explicit escapes, etc.
ngeoffray 2013/10/18 10:19:37 the value -> [value]
lukas 2013/10/24 16:48:36 Done.
+ *
+ */
+ String get value;
/**
- * Returns the string value for keywords and symbols. For instance 'class' for
- * the [CLASS] keyword token and '*' for a [Token] based on [STAR_INFO]. For
- * other tokens, such identifiers, strings, numbers, etc, [stringValue]
- * returns [:null:].
+ * For symbol and keyword tokens, returns the string value reprenseted by this
ngeoffray 2013/10/18 10:19:37 represented
lukas 2013/10/24 16:48:36 Done.
+ * token. For [StringToken]s this method returns [:null:].
+ *
+ * For [SymbolToken]s and [KeywordToken]s, the string value is a compile-time
+ * constant originating in the [PrecedenceInfo] or in the [Keyword] instance.
+ * This allows testing for keywords and symbols using [:identical:], e.g.,
+ * [:identical('class', token.value):].
*
- * [stringValue] should only be used for testing keywords and symbols.
+ * Note that returning [:null:] for string tokens is important to identify
+ * symbols and keywords, we cannot use [value] instead. The string literal
+ * "$a($b"
+ * produces ..., SymbolToken($), StringToken(a), StringToken((), ...
+ *
+ * After parsing the identifier 'a', the parser tests for a function
+ * declaration using [:identical(next.stringValue, '('):], which (rightfully)
+ * returns false because stringValue returns [:null:].
*/
- String get stringValue => info.value.stringValue;
+ String get stringValue;
/**
* The kind enum of this token as determined by its [info].
@@ -120,31 +138,32 @@ class Token implements Spannable {
*/
int get precedence => info.precedence;
- bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);
+ /**
+ * True if this token is an identifier. Some keywords are allowed as identifiers,
+ * see implementaiton in [KeywordToken].
ngeoffray 2013/10/18 10:19:37 implementation
lukas 2013/10/24 16:48:36 Done.
+ */
+ bool isIdentifier();
/**
* Returns a textual representation of this token to be used for debugging
* purposes. The resulting string might contain information about the
* structure of the token, for example 'StringToken(foo)' for the identifier
- * token 'foo'. Use [slowToString] for the text actually parsed by the token.
- */
- String toString() => info.value.toString();
-
- /**
- * The text parsed by this token.
+ * token 'foo'.
+ *
+ * Use [value] for the text actually parsed by the token.
*/
- String slowToString() => toString();
+ String toString();
/**
* The number of characters parsed by this token.
*/
- int get slowCharCount {
+ int get charCount {
if (info == BAD_INPUT_INFO) {
// This is a token that wraps around an error message. Return 1
// instead of the size of the length of the error message.
return 1;
} else {
- return slowToString().length;
+ return value.length;
}
}
@@ -152,129 +171,230 @@ class Token implements Spannable {
}
/**
- * A keyword token.
+ * A symbol token represents the symbol in its precedence info.
ngeoffray 2013/10/18 10:19:37 symbol token -> [SymbolToken]
lukas 2013/10/24 16:48:36 Done.
+ * Also used for end of file with EOF_INFO.
*/
-class KeywordToken extends Token {
- final Keyword value;
- String get stringValue => value.syntax;
+class SymbolToken extends Token {
- KeywordToken(Keyword value, int charOffset)
- : this.value = value, super(value.info, charOffset);
+ final PrecedenceInfo info;
+
+ SymbolToken(this.info, int charOffset) : super(charOffset);
+
+ String get value => info.value;
- bool isIdentifier() => value.isPseudo || value.isBuiltIn;
+ String get stringValue => info.value;
- String toString() => value.syntax;
+ bool isIdentifier() => false;
+
+ String toString() => "SymbolToken($value)";
}
/**
- * A String-valued token.
+ * A [BeginGroupToken] reprsents a symbol that may be the beginning of
ngeoffray 2013/10/18 10:19:37 represents
lukas 2013/10/24 16:48:36 Done.
+ * a pair of brackets, i.e., ( { [ < or ${
+ * The [endGroup] token points to the matching closing bracket in case
+ * it can be identified during scanning.
*/
-class StringToken extends Token {
- final SourceString value;
-
- StringToken(PrecedenceInfo info, String value, int charOffset)
- : this.fromSource(info, new SourceString(value), charOffset);
+class BeginGroupToken extends SymbolToken {
+ Token endGroup;
- StringToken.fromSource(PrecedenceInfo info, this.value, int charOffset)
+ BeginGroupToken(PrecedenceInfo info, int charOffset)
: super(info, charOffset);
ngeoffray 2013/10/18 10:19:37 Fits in one line?
lukas 2013/10/24 16:48:36 No :)
-
- String toString() => "StringToken(${value.slowToString()})";
-
- String slowToString() => value.slowToString();
}
-abstract class SourceString extends IterableBase<int> {
- const factory SourceString(String string) = StringWrapper;
+/**
+ * A keyword token.
+ */
+class KeywordToken extends Token {
+ final Keyword keyword;
- static final Map<String, StringWrapper> canonicalizedValues =
- new Map<String, StringWrapper>();
+ KeywordToken(this.keyword, int charOffset) : super(charOffset);
- factory SourceString.fromSubstring(String string, int begin, int end) {
- var substring = string.substring(begin, end);
- return canonicalizedValues.putIfAbsent(
- substring, () => new StringWrapper(substring));
- }
+ PrecedenceInfo get info => keyword.info;
- void printOn(StringBuffer sb);
+ String get value => keyword.syntax;
- /** Gives a [SourceString] that is not including the [initial] first and
- * [terminal] last characters. This is only intended to be used to remove
- * quotes from string literals (including an initial '@' for raw strings).
- */
- SourceString copyWithoutQuotes(int initial, int terminal);
+ String get stringValue => keyword.syntax;
- String get stringValue;
+ bool isIdentifier() => keyword.isPseudo || keyword.isBuiltIn;
- String slowToString();
-
- bool get isEmpty;
-
- bool isPrivate();
+ String toString() => "KeywordToken($value)";
}
-class StringWrapper extends IterableBase<int> implements SourceString {
- final String stringValue;
+/**
+ * A String-valued token. Represents identifiers, string literals,
+ * number literals, comments and error tokens, using the corresponding
ngeoffray 2013/10/18 10:19:37 comments, and ...
lukas 2013/10/24 16:48:36 Done.
+ * precedence info.
+ */
+class StringToken extends Token {
+ /**
+ * The length threshold above which substring tokens are computed lazily.
+ *
+ * For string tokens that are substrings of the program source, the actual
+ * substring extraction is performed lazily. This is beneficial because
+ * not all scanned code is actually used. For unused parts, the substrings
+ * are never computed and allocated.
+ */
+ static const int LAZY_THRESHOLD = 4;
sra1 2013/10/22 19:52:31 How did you calculate this threshold?
lukas 2013/10/23 07:11:01 Short strings have a smaller footprint than a Comp
- const StringWrapper(this.stringValue);
+ var valueOrLazySubstring;
ngeoffray 2013/10/18 10:19:37 You could put the union type of this field in comm
lukas 2013/10/24 16:48:36 Done.
- int get hashCode => stringValue.hashCode;
+ final PrecedenceInfo info;
+
+ /**
+ * Creates a non-lazy string token. If [canonicalize] is true, the string
+ * is canonicalized before the token is created.
+ */
+ StringToken.fromString(this.info, String value, int charOffset,
+ [bool canonicalize = false])
ngeoffray 2013/10/18 10:19:37 Make it a named parameter? Easier when reading cal
lukas 2013/10/24 16:48:36 Done.
+ : valueOrLazySubstring = canonicalizedString(value, canonicalize),
+ super(charOffset);
- bool operator ==(other) {
- return other is SourceString && toString() == other.slowToString();
+ /**
+ * Creates a lazy string token. If [canonicalize] is true, the string
+ * is canonicalized before the token is created.
+ */
+ StringToken.fromSubstring(this.info, String data, int start, int end,
+ int charOffset, [bool canonicalize = false])
ngeoffray 2013/10/18 10:19:37 ditto
lukas 2013/10/24 16:48:36 Done.
+ : super(charOffset) {
+ int length = end - start;
+ if (length <= LAZY_THRESHOLD) {
+ valueOrLazySubstring = canonicalizedString(data.substring(start, end),
+ canonicalize);
ngeoffray 2013/10/18 10:19:37 indentation.
lukas 2013/10/24 16:48:36 Done.
+ } else {
+ valueOrLazySubstring =
+ new LazySubstring(data, start, length, canonicalize);
+ }
}
- Iterator<int> get iterator => new StringCodeIterator(stringValue);
+ /**
+ * Creates a lazy string token. If [asciiOnly] is false, the byte array
+ * is passed through a UTF-8 decoder.
+ */
+ StringToken.fromUtf8Bytes(this.info, List<int> data, int start, int end,
+ bool asciiOnly, int charOffset)
+ : super(charOffset) {
+ int length = end - start;
+ if (length <= LAZY_THRESHOLD) {
+ valueOrLazySubstring = decodeUtf8(data, start, end, asciiOnly);
+ } else {
+ valueOrLazySubstring = new LazySubstring(data, start, length, asciiOnly);
+ }
+ }
- void printOn(StringBuffer sb) {
- sb.write(stringValue);
+ String get value {
+ if (valueOrLazySubstring is String) {
+ return valueOrLazySubstring;
+ } else {
+ assert(valueOrLazySubstring is LazySubstring);
+ var data = valueOrLazySubstring.data;
+ int start = valueOrLazySubstring.start;
+ int end = start + valueOrLazySubstring.length;
+ if (data is String) {
+ valueOrLazySubstring = canonicalizedString(
+ data.substring(start, end), valueOrLazySubstring.boolValue);
+ } else {
+ valueOrLazySubstring = decodeUtf8(
+ data, start, end, valueOrLazySubstring.boolValue);
+ }
+ return valueOrLazySubstring;
+ }
}
- String toString() => stringValue;
+ String get stringValue => null;
- String slowToString() => stringValue;
+ bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);
- SourceString copyWithoutQuotes(int initial, int terminal) {
- assert(0 <= initial);
- assert(0 <= terminal);
- assert(initial + terminal <= stringValue.length);
- return new StringWrapper(
- stringValue.substring(initial, stringValue.length - terminal));
- }
+ String toString() => "StringToken($value)";
- bool get isEmpty => stringValue.isEmpty;
+ static final HashSet<String> canonicalizedSubstrings =
+ new HashSet();
ngeoffray 2013/10/18 10:19:37 HashSet<String>()
lukas 2013/10/24 16:48:36 Done.
- bool isPrivate() => !isEmpty && stringValue.codeUnitAt(0) == $_;
+ static String canonicalizedString(String s, bool canonicalize) {
+ if (!canonicalize) return s;
+ var result = canonicalizedSubstrings.lookup(s);
+ if (result != null) return result;
+ canonicalizedSubstrings.add(s);
+ return s;
+ }
+
+ static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
+ var s;
+ if (asciiOnly) {
+ // getRange returns a lazy Iterable, it does not copy the data.
+ s = new String.fromCharCodes(data.getRange(start, end));
+ } else {
+ // TODO(lry): this is measurably slow. Also sublist is copied eagerly.
+ var bytes = data.sublist(start, end);
+ s = UTF8.decode(bytes);
+ }
+ return canonicalizedString(s, true);
+ }
}
-class StringCodeIterator implements Iterator<int> {
- final String string;
- int index;
- final int end;
- int _current;
+/**
+ * This class represents the necessary information to compute a substring
+ * lazily. The substring can either originate in a string or in a [:List<int>:]
ngeoffray 2013/10/18 10:19:37 originate in -> originate from?
lukas 2013/10/24 16:48:36 Done.
+ * of UTF-8 bytes.
+ */
+abstract class LazySubstring {
+ /** The original data, either a string or a List<int> */
+ get data;
- StringCodeIterator(String string) :
- this.string = string, index = 0, end = string.length;
+ int get start;
+ int get length;
- StringCodeIterator.substring(this.string, this.index, this.end) {
- assert(0 <= index);
- assert(index <= end);
- assert(end <= string.length);
+ /**
+ * If this substring is based on a String, the boolean indicates whether the
ngeoffray 2013/10/18 10:19:37 the boolean -> [boolValue]
lukas 2013/10/24 16:48:36 Done.
+ * resulting substring should be canonicalized.
+ *
+ * For substrings based on a byte array, the boolean value is true if the
ngeoffray 2013/10/18 10:19:37 ditto
lukas 2013/10/24 16:48:36 Done.
+ * array only holds ASCII characters. The resulting substring will be
+ * canonicalized after decoding.
+ */
+ bool get boolValue;
+
+ LazySubstring.internal();
+
+ factory LazySubstring(data, int start, int length, bool b) {
+ // See comment on [CompactLazySubstring].
+ if (start < 0x100000 && length < 0x200) {
+ int fields = (start << 9);
+ fields = fields | length;
+ fields = fields << 1;
+ if (b) fields |= 1;
+ return new CompactLazySubstring(data, fields);
+ } else {
+ return new FullLazySubstring(data, start, length, b);
+ }
}
+}
- int get current => _current;
+/**
+ * This class encodes [start], [length] and [boolValue] in a single
+ * 30 bit integer. It uses 20 bits for [start], which covers source files
+ * of 1M. [length] has 9 bits, which covers 512 characters.
ngeoffray 2013/10/18 10:19:37 1M -> 1MB.
lukas 2013/10/24 16:48:36 Done.
+ *
+ * The file html_dart2js.dart is currently around 1M.
ngeoffray 2013/10/18 10:19:37 1M -> 1MB
lukas 2013/10/24 16:48:36 Done.
+ */
+class CompactLazySubstring extends LazySubstring {
+ final data;
+ final int fields;
- bool moveNext() {
- _current = null;
- if (index >= end) return false;
- _current = string.codeUnitAt(index++);
- return true;
- }
+ CompactLazySubstring(this.data, this.fields) : super.internal();
+
+ int get start => fields >> 10;
+ int get length => (fields >> 1) & 0x1ff;
+ bool get boolValue => (fields & 1) == 1;
}
-class BeginGroupToken extends StringToken {
- Token endGroup;
- BeginGroupToken(PrecedenceInfo info, String value, int charOffset)
- : super(info, value, charOffset);
+class FullLazySubstring extends LazySubstring {
+ final data;
+ final int start;
+ final int length;
+ final bool boolValue;
+ FullLazySubstring(this.data, this.start, this.length, this.boolValue)
+ : super.internal();
}
bool isUserDefinableOperator(String value) {

Powered by Google App Engine
This is Rietveld 408576698