Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/token.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: fixes compiler tests Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of scanner; 5 part of scanner;
6 6
7 const int EOF_TOKEN = 0; 7 const int EOF_TOKEN = 0;
8 8
9 const int KEYWORD_TOKEN = $k; 9 const int KEYWORD_TOKEN = $k;
10 const int IDENTIFIER_TOKEN = $a; 10 const int IDENTIFIER_TOKEN = $a;
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
72 const int TILDE_SLASH_TOKEN = TILDE_SLASH_EQ_TOKEN + 1; 72 const int TILDE_SLASH_TOKEN = TILDE_SLASH_EQ_TOKEN + 1;
73 const int PERCENT_EQ_TOKEN = TILDE_SLASH_TOKEN + 1; 73 const int PERCENT_EQ_TOKEN = TILDE_SLASH_TOKEN + 1;
74 const int GT_GT_TOKEN = PERCENT_EQ_TOKEN + 1; 74 const int GT_GT_TOKEN = PERCENT_EQ_TOKEN + 1;
75 const int CARET_EQ_TOKEN = GT_GT_TOKEN + 1; 75 const int CARET_EQ_TOKEN = GT_GT_TOKEN + 1;
76 const int COMMENT_TOKEN = CARET_EQ_TOKEN + 1; 76 const int COMMENT_TOKEN = CARET_EQ_TOKEN + 1;
77 const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1; 77 const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1;
78 78
79 /** 79 /**
80 * A token that doubles as a linked list. 80 * A token that doubles as a linked list.
81 */ 81 */
82 class Token implements Spannable { 82 abstract class Token implements Spannable {
83 /**
84 * The precedence info for this token. [info] determines the kind and the
85 * precedence level of this token.
86 */
87 final PrecedenceInfo info;
88
89 /** 83 /**
90 * The character offset of the start of this token within the source text. 84 * The character offset of the start of this token within the source text.
91 */ 85 */
92 final int charOffset; 86 final int charOffset;
93 87
88 Token(this.charOffset);
89
94 /** 90 /**
95 * The next token in the token stream. 91 * The next token in the token stream.
96 */ 92 */
97 Token next; 93 Token next;
98 94
99 Token(this.info, this.charOffset); 95 /**
100 96 * The precedence info for this token. [info] determines the kind and the
101 get value => info.value; 97 * precedence level of this token.
98 *
99 * Defined as getter to save a field in the [KeywordToken] subclass.
100 */
101 PrecedenceInfo get info;
102 102
103 /** 103 /**
104 * Returns the string value for keywords and symbols. For instance 'class' for 104 * The string represented by this token, a substring of the source code.
105 * the [CLASS] keyword token and '*' for a [Token] based on [STAR_INFO]. For
106 * other tokens, such identifiers, strings, numbers, etc, [stringValue]
107 * returns [:null:].
108 * 105 *
109 * [stringValue] should only be used for testing keywords and symbols. 106 * For [StringToken]s the value includes the quotes, explicit escapes, etc.
ngeoffray 2013/10/18 10:19:37 the value -> [value]
lukas 2013/10/24 16:48:36 Done.
107 *
110 */ 108 */
111 String get stringValue => info.value.stringValue; 109 String get value;
110
111 /**
112 * For symbol and keyword tokens, returns the string value reprenseted by this
ngeoffray 2013/10/18 10:19:37 represented
lukas 2013/10/24 16:48:36 Done.
113 * token. For [StringToken]s this method returns [:null:].
114 *
115 * For [SymbolToken]s and [KeywordToken]s, the string value is a compile-time
116 * constant originating in the [PrecedenceInfo] or in the [Keyword] instance.
117 * This allows testing for keywords and symbols using [:identical:], e.g.,
118 * [:identical('class', token.value):].
119 *
120 * Note that returning [:null:] for string tokens is important to identify
121 * symbols and keywords, we cannot use [value] instead. The string literal
122 * "$a($b"
123 * produces ..., SymbolToken($), StringToken(a), StringToken((), ...
124 *
125 * After parsing the identifier 'a', the parser tests for a function
126 * declaration using [:identical(next.stringValue, '('):], which (rightfully)
127 * returns false because stringValue returns [:null:].
128 */
129 String get stringValue;
112 130
113 /** 131 /**
114 * The kind enum of this token as determined by its [info]. 132 * The kind enum of this token as determined by its [info].
115 */ 133 */
116 int get kind => info.kind; 134 int get kind => info.kind;
117 135
118 /** 136 /**
119 * The precedence level for this token. 137 * The precedence level for this token.
120 */ 138 */
121 int get precedence => info.precedence; 139 int get precedence => info.precedence;
122 140
123 bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN); 141 /**
142 * True if this token is an identifier. Some keywords allowed as identifiers,
143 * see implementaiton in [KeywordToken].
ngeoffray 2013/10/18 10:19:37 implementation
lukas 2013/10/24 16:48:36 Done.
144 */
145 bool isIdentifier();
124 146
125 /** 147 /**
126 * Returns a textual representation of this token to be used for debugging 148 * Returns a textual representation of this token to be used for debugging
127 * purposes. The resulting string might contain information about the 149 * purposes. The resulting string might contain information about the
128 * structure of the token, for example 'StringToken(foo)' for the identifier 150 * structure of the token, for example 'StringToken(foo)' for the identifier
129 * token 'foo'. Use [slowToString] for the text actually parsed by the token. 151 * token 'foo'.
152 *
153 * Use [value] for the text actually parsed by the token.
130 */ 154 */
131 String toString() => info.value.toString(); 155 String toString();
132
133 /**
134 * The text parsed by this token.
135 */
136 String slowToString() => toString();
137 156
138 /** 157 /**
139 * The number of characters parsed by this token. 158 * The number of characters parsed by this token.
140 */ 159 */
141 int get slowCharCount { 160 int get charCount {
142 if (info == BAD_INPUT_INFO) { 161 if (info == BAD_INPUT_INFO) {
143 // This is a token that wraps around an error message. Return 1 162 // This is a token that wraps around an error message. Return 1
144 // instead of the size of the length of the error message. 163 // instead of the size of the length of the error message.
145 return 1; 164 return 1;
146 } else { 165 } else {
147 return slowToString().length; 166 return value.length;
148 } 167 }
149 } 168 }
150 169
151 int get hashCode => computeHashCode(charOffset, info, value); 170 int get hashCode => computeHashCode(charOffset, info, value);
152 } 171 }
153 172
154 /** 173 /**
174 * A symbol token represents the symbol in its precedence info.
ngeoffray 2013/10/18 10:19:37 symbol token -> [SymbolToken]
lukas 2013/10/24 16:48:36 Done.
175 * Also used for end of file with EOF_INFO.
176 */
177 class SymbolToken extends Token {
178
179 final PrecedenceInfo info;
180
181 SymbolToken(this.info, int charOffset) : super(charOffset);
182
183 String get value => info.value;
184
185 String get stringValue => info.value;
186
187 bool isIdentifier() => false;
188
189 String toString() => "SymbolToken($value)";
190 }
191
192 /**
193 * A [BeginGroupToken] reprsents a symbol that may be the beginning of
ngeoffray 2013/10/18 10:19:37 represents
lukas 2013/10/24 16:48:36 Done.
194 * a pair of brackets, i.e., ( { [ < or ${
195 * The [endGroup] token points to the matching closing bracket in case
196 * it can be identified during scanning.
197 */
198 class BeginGroupToken extends SymbolToken {
199 Token endGroup;
200
201 BeginGroupToken(PrecedenceInfo info, int charOffset)
202 : super(info, charOffset);
ngeoffray 2013/10/18 10:19:37 Fits in one line?
lukas 2013/10/24 16:48:36 No :)
203 }
204
205 /**
155 * A keyword token. 206 * A keyword token.
156 */ 207 */
157 class KeywordToken extends Token { 208 class KeywordToken extends Token {
158 final Keyword value; 209 final Keyword keyword;
159 String get stringValue => value.syntax; 210
160 211 KeywordToken(this.keyword, int charOffset) : super(charOffset);
161 KeywordToken(Keyword value, int charOffset) 212
162 : this.value = value, super(value.info, charOffset); 213 PrecedenceInfo get info => keyword.info;
163 214
164 bool isIdentifier() => value.isPseudo || value.isBuiltIn; 215 String get value => keyword.syntax;
165 216
166 String toString() => value.syntax; 217 String get stringValue => keyword.syntax;
167 } 218
168 219 bool isIdentifier() => keyword.isPseudo || keyword.isBuiltIn;
169 /** 220
170 * A String-valued token. 221 String toString() => "KeywordToken($value)";
222 }
223
224 /**
225 * A String-valued token. Represents identifiers, string literals,
226 * number literals, comments and error tokens, using the corresponding
ngeoffray 2013/10/18 10:19:37 comments, and ...
lukas 2013/10/24 16:48:36 Done.
227 * precedence info.
171 */ 228 */
172 class StringToken extends Token { 229 class StringToken extends Token {
173 final SourceString value; 230 /**
174 231 * The length threshold above which substring tokens are computed lazily.
175 StringToken(PrecedenceInfo info, String value, int charOffset) 232 *
176 : this.fromSource(info, new SourceString(value), charOffset); 233 * For string tokens that are substrings of the program source, the actual
177 234 * substring extraction is performed lazily. This is beneficial because
178 StringToken.fromSource(PrecedenceInfo info, this.value, int charOffset) 235 * not all scanned code is actually used. For unused parts, the substrings
179 : super(info, charOffset); 236 * are never computed and allocated.
180 237 */
181 String toString() => "StringToken(${value.slowToString()})"; 238 static const int LAZY_THRESHOLD = 4;
sra1 2013/10/22 19:52:31 How did you calculate this threshold?
lukas 2013/10/23 07:11:01 Short strings have a smaller footprint than a Comp
182 239
183 String slowToString() => value.slowToString(); 240 var valueOrLazySubstring;
ngeoffray 2013/10/18 10:19:37 You could put the union type of this field in comm
lukas 2013/10/24 16:48:36 Done.
184 } 241
185 242 final PrecedenceInfo info;
186 abstract class SourceString extends IterableBase<int> { 243
187 const factory SourceString(String string) = StringWrapper; 244 /**
188 245 * Creates a non-lazy string token. If [canonicalize] is true, the string
189 static final Map<String, StringWrapper> canonicalizedValues = 246 * is canonicalized before the token is created.
190 new Map<String, StringWrapper>(); 247 */
191 248 StringToken.fromString(this.info, String value, int charOffset,
192 factory SourceString.fromSubstring(String string, int begin, int end) { 249 [bool canonicalize = false])
ngeoffray 2013/10/18 10:19:37 Make it a named parameter? Easier when reading cal
lukas 2013/10/24 16:48:36 Done.
193 var substring = string.substring(begin, end); 250 : valueOrLazySubstring = canonicalizedString(value, canonicalize),
194 return canonicalizedValues.putIfAbsent( 251 super(charOffset);
195 substring, () => new StringWrapper(substring)); 252
196 } 253 /**
197 254 * Creates a lazy string token. If [canonicalize] is true, the string
198 void printOn(StringBuffer sb); 255 * is canonicalized before the token is created.
199 256 */
200 /** Gives a [SourceString] that is not including the [initial] first and 257 StringToken.fromSubstring(this.info, String data, int start, int end,
201 * [terminal] last characters. This is only intended to be used to remove 258 int charOffset, [bool canonicalize = false])
ngeoffray 2013/10/18 10:19:37 ditto
lukas 2013/10/24 16:48:36 Done.
202 * quotes from string literals (including an initial '@' for raw strings). 259 : super(charOffset) {
203 */ 260 int length = end - start;
204 SourceString copyWithoutQuotes(int initial, int terminal); 261 if (length <= LAZY_THRESHOLD) {
205 262 valueOrLazySubstring = canonicalizedString(data.substring(start, end),
206 String get stringValue; 263 canonicalize);
ngeoffray 2013/10/18 10:19:37 indentation.
lukas 2013/10/24 16:48:36 Done.
207 264 } else {
208 String slowToString(); 265 valueOrLazySubstring =
209 266 new LazySubstring(data, start, length, canonicalize);
210 bool get isEmpty; 267 }
211 268 }
212 bool isPrivate(); 269
213 } 270 /**
214 271 * Creates a lazy string token. If [asciiOnly] is false, the byte array
215 class StringWrapper extends IterableBase<int> implements SourceString { 272 * is passed through a UTF-8 decoder.
216 final String stringValue; 273 */
217 274 StringToken.fromUtf8Bytes(this.info, List<int> data, int start, int end,
218 const StringWrapper(this.stringValue); 275 bool asciiOnly, int charOffset)
219 276 : super(charOffset) {
220 int get hashCode => stringValue.hashCode; 277 int length = end - start;
221 278 if (length <= LAZY_THRESHOLD) {
222 bool operator ==(other) { 279 valueOrLazySubstring = decodeUtf8(data, start, end, asciiOnly);
223 return other is SourceString && toString() == other.slowToString(); 280 } else {
224 } 281 valueOrLazySubstring = new LazySubstring(data, start, length, asciiOnly);
225 282 }
226 Iterator<int> get iterator => new StringCodeIterator(stringValue); 283 }
227 284
228 void printOn(StringBuffer sb) { 285 String get value {
229 sb.write(stringValue); 286 if (valueOrLazySubstring is String) {
230 } 287 return valueOrLazySubstring;
231 288 } else {
232 String toString() => stringValue; 289 assert(valueOrLazySubstring is LazySubstring);
233 290 var data = valueOrLazySubstring.data;
234 String slowToString() => stringValue; 291 int start = valueOrLazySubstring.start;
235 292 int end = start + valueOrLazySubstring.length;
236 SourceString copyWithoutQuotes(int initial, int terminal) { 293 if (data is String) {
237 assert(0 <= initial); 294 valueOrLazySubstring = canonicalizedString(
238 assert(0 <= terminal); 295 data.substring(start, end), valueOrLazySubstring.boolValue);
239 assert(initial + terminal <= stringValue.length); 296 } else {
240 return new StringWrapper( 297 valueOrLazySubstring = decodeUtf8(
241 stringValue.substring(initial, stringValue.length - terminal)); 298 data, start, end, valueOrLazySubstring.boolValue);
242 } 299 }
243 300 return valueOrLazySubstring;
244 bool get isEmpty => stringValue.isEmpty; 301 }
245 302 }
246 bool isPrivate() => !isEmpty && stringValue.codeUnitAt(0) == $_; 303
247 } 304 String get stringValue => null;
248 305
249 class StringCodeIterator implements Iterator<int> { 306 bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);
250 final String string; 307
251 int index; 308 String toString() => "StringToken($value)";
252 final int end; 309
253 int _current; 310 static final HashSet<String> canonicalizedSubstrings =
254 311 new HashSet();
ngeoffray 2013/10/18 10:19:37 HashSet<String>()
lukas 2013/10/24 16:48:36 Done.
255 StringCodeIterator(String string) : 312
256 this.string = string, index = 0, end = string.length; 313 static String canonicalizedString(String s, bool canonicalize) {
257 314 if (!canonicalize) return s;
258 StringCodeIterator.substring(this.string, this.index, this.end) { 315 var result = canonicalizedSubstrings.lookup(s);
259 assert(0 <= index); 316 if (result != null) return result;
260 assert(index <= end); 317 canonicalizedSubstrings.add(s);
261 assert(end <= string.length); 318 return s;
262 } 319 }
263 320
264 int get current => _current; 321 static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
265 322 var s;
266 bool moveNext() { 323 if (asciiOnly) {
267 _current = null; 324 // getRange returns an iterator, it does not copy the data.
268 if (index >= end) return false; 325 s = new String.fromCharCodes(data.getRange(start, end));
269 _current = string.codeUnitAt(index++); 326 } else {
270 return true; 327 // TODO(lry): this is measurably slow. Also sublist is copied eagerly.
271 } 328 var bytes = data.sublist(start, end);
272 } 329 s = UTF8.decode(bytes);
273 330 }
274 class BeginGroupToken extends StringToken { 331 return canonicalizedString(s, true);
275 Token endGroup; 332 }
276 BeginGroupToken(PrecedenceInfo info, String value, int charOffset) 333 }
277 : super(info, value, charOffset); 334
335 /**
336 * This class represents the necessary information to compute a substring
337 * lazily. The substring can either originate in a string or in a [:List<int>:]
ngeoffray 2013/10/18 10:19:37 originate in -> originate from?
lukas 2013/10/24 16:48:36 Done.
338 * of UTF-8 bytes.
339 */
340 abstract class LazySubstring {
341 /** The original data, either a string or a List<int> */
342 get data;
343
344 int get start;
345 int get length;
346
347 /**
348 * If this substring is based on a String, the boolean indicates whether the
ngeoffray 2013/10/18 10:19:37 the boolean -> [boolValue]
lukas 2013/10/24 16:48:36 Done.
349 * resulting substring should be canonicalized.
350 *
351 * For substrings based on a byte array, the boolean value is true if the
ngeoffray 2013/10/18 10:19:37 ditto
lukas 2013/10/24 16:48:36 Done.
352 * array only holds ASCII characters. The resulting substring will be
353 * canonicalized after decoding.
354 */
355 bool get boolValue;
356
357 LazySubstring.internal();
358
359 factory LazySubstring(data, int start, int length, bool b) {
360 // See comment on [CompactLazySubstring].
361 if (start < 0x100000 && length < 0x200) {
362 int fields = (start << 9);
363 fields = fields | length;
364 fields = fields << 1;
365 if (b) fields |= 1;
366 return new CompactLazySubstring(data, fields);
367 } else {
368 return new FullLazySubstring(data, start, length, b);
369 }
370 }
371 }
372
373 /**
374 * This class encodes [start], [length] and [boolValue] in a single
375 * 30 bit integer. It uses 20 bits for [start], which covers source files
376 * of 1M. [length] has 9 bits, which covers 512 characters.
ngeoffray 2013/10/18 10:19:37 1M -> 1MB.
lukas 2013/10/24 16:48:36 Done.
377 *
378 * The file html_dart2js.dart is currently around 1M.
ngeoffray 2013/10/18 10:19:37 1M -> 1MB
lukas 2013/10/24 16:48:36 Done.
379 */
380 class CompactLazySubstring extends LazySubstring {
381 final data;
382 final int fields;
383
384 CompactLazySubstring(this.data, this.fields) : super.internal();
385
386 int get start => fields >> 10;
387 int get length => (fields >> 1) & 0x1ff;
388 bool get boolValue => (fields & 1) == 1;
389 }
390
391 class FullLazySubstring extends LazySubstring {
392 final data;
393 final int start;
394 final int length;
395 final bool boolValue;
396 FullLazySubstring(this.data, this.start, this.length, this.boolValue)
397 : super.internal();
278 } 398 }
279 399
280 bool isUserDefinableOperator(String value) { 400 bool isUserDefinableOperator(String value) {
281 return 401 return
282 isBinaryOperator(value) || 402 isBinaryOperator(value) ||
283 isMinusOperator(value) || 403 isMinusOperator(value) ||
284 isTernaryOperator(value) || 404 isTernaryOperator(value) ||
285 isUnaryOperator(value); 405 isUnaryOperator(value);
286 } 406 }
287 407
(...skipping 250 matching lines...) Expand 10 before | Expand all | Expand 10 after
538 658
539 const PrecedenceInfo STRING_INTERPOLATION_IDENTIFIER_INFO = 659 const PrecedenceInfo STRING_INTERPOLATION_IDENTIFIER_INFO =
540 const PrecedenceInfo('\$', 0, 660 const PrecedenceInfo('\$', 0,
541 STRING_INTERPOLATION_IDENTIFIER_TOKEN); 661 STRING_INTERPOLATION_IDENTIFIER_TOKEN);
542 662
543 const PrecedenceInfo HEXADECIMAL_INFO = 663 const PrecedenceInfo HEXADECIMAL_INFO =
544 const PrecedenceInfo('hexadecimal', 0, HEXADECIMAL_TOKEN); 664 const PrecedenceInfo('hexadecimal', 0, HEXADECIMAL_TOKEN);
545 665
546 const PrecedenceInfo COMMENT_INFO = 666 const PrecedenceInfo COMMENT_INFO =
547 const PrecedenceInfo('comment', 0, COMMENT_TOKEN); 667 const PrecedenceInfo('comment', 0, COMMENT_TOKEN);
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698