sdk/lib/_internal/compiler/implementation/scanner/token.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/token.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Re-add ArrayBasedScanner, minor fixes. Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« sdk/lib/_internal/compiler/implementation/scanner/scanner.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/string_scanner.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart » ('j') | sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of scanner;	5 part of scanner;

6	6

7 const int EOF_TOKEN = 0;	7 const int EOF_TOKEN = 0;

8	8

9 const int KEYWORD_TOKEN = $k;	9 const int KEYWORD_TOKEN = $k;

10 const int IDENTIFIER_TOKEN = $a;	10 const int IDENTIFIER_TOKEN = $a;

(...skipping 61 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
72 const int TILDE_SLASH_TOKEN = TILDE_SLASH_EQ_TOKEN + 1;	72 const int TILDE_SLASH_TOKEN = TILDE_SLASH_EQ_TOKEN + 1;

73 const int PERCENT_EQ_TOKEN = TILDE_SLASH_TOKEN + 1;	73 const int PERCENT_EQ_TOKEN = TILDE_SLASH_TOKEN + 1;

74 const int GT_GT_TOKEN = PERCENT_EQ_TOKEN + 1;	74 const int GT_GT_TOKEN = PERCENT_EQ_TOKEN + 1;

75 const int CARET_EQ_TOKEN = GT_GT_TOKEN + 1;	75 const int CARET_EQ_TOKEN = GT_GT_TOKEN + 1;

76 const int COMMENT_TOKEN = CARET_EQ_TOKEN + 1;	76 const int COMMENT_TOKEN = CARET_EQ_TOKEN + 1;

77 const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1;	77 const int STRING_INTERPOLATION_IDENTIFIER_TOKEN = COMMENT_TOKEN + 1;

78	78

79 /**	79 /**

80 * A token that doubles as a linked list.	80 * A token that doubles as a linked list.

81 */	81 */

82 class Token implements Spannable {	82 abstract class Token implements Spannable {

83 /**

84 * The precedence info for this token. [info] determines the kind and the

85 * precedence level of this token.

86 */

87 final PrecedenceInfo info;

88

89 /**	83 /**

90 * The character offset of the start of this token within the source text.	84 * The character offset of the start of this token within the source text.

91 */	85 */

92 final int charOffset;	86 final int charOffset;

93	87

	88 Token(this.charOffset);

	89

94 /**	90 /**

95 * The next token in the token stream.	91 * The next token in the token stream.

96 */	92 */

97 Token next;	93 Token next;

98	94

99 Token(this.info, this.charOffset);	95 /**

100	96 * The precedence info for this token. [info] determines the kind and the

101 get value => info.value;	97 * precedence level of this token.

	98 *

	99 * Defined as getter to save a field in the [KeywordToken] subclass.

	100 */

	101 PrecedenceInfo get info;

102	102

103 /**	103 /**

104 * Returns the string value for keywords and symbols. For instance 'class' for	104 * The string represented by this token, a substring of the source code.

105 * the [CLASS] keyword token and '*' for a [Token] based on [STAR_INFO]. For

106 * other tokens, such as identifiers, strings, numbers, etc, [stringValue]

107 * returns [:null:].

108 *	105 *

109 * [stringValue] should only be used for testing keywords and symbols.	106 * For [StringToken]s the value includes the quotes, explicit escapes, etc.

	107 *

110 */	108 */

111 String get stringValue => info.value.stringValue;	109 String get value;

	110

	111 /**

	112 * For symbol and keyword tokens, returns the string value represented by this

	113 * token. For [StringToken]s this method returns [:null:].

	114 *

	115 * For [SymbolToken]s and [KeywordToken]s, the string value is a compile-time

	116 * constant originating in the [PrecedenceInfo] or in the [Keyword] instance.

	117 * This allows testing for keywords and symbols using [:identical:], e.g.,

	118 * [:identical('class', token.value):].

	119 *

	120 * Note that returning [:null:] for string tokens is important to identify

	121 * symbols and keywords, we cannot use [value] instead. The string literal

	122 * "$a($b"

	123 * produces ..., SymbolToken($), StringToken(a), StringToken((), ...

	124 *

	125 * After parsing the identifier 'a', the parser tests for a function

	126 * declaration using [:identical(next.stringValue, '('):], which (rightfully)

	127 * returns false because stringValue returns [:null:].

	128 */

	129 String get stringValue;

112	130

113 /**	131 /**

114 * The kind enum of this token as determined by its [info].	132 * The kind enum of this token as determined by its [info].

115 */	133 */

116 int get kind => info.kind;	134 int get kind => info.kind;

117	135

118 /**	136 /**

119 * The precedence level for this token.	137 * The precedence level for this token.

120 */	138 */

121 int get precedence => info.precedence;	139 int get precedence => info.precedence;

122	140

123 bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);	141 /**

	142 * True if this token is an identifier. Some keywords are allowed as identifiers,

	143 * see implementation in [KeywordToken].

	144 */

	145 bool isIdentifier();

124	146

125 /**	147 /**

126 * Returns a textual representation of this token to be used for debugging	148 * Returns a textual representation of this token to be used for debugging

127 * purposes. The resulting string might contain information about the	149 * purposes. The resulting string might contain information about the

128 * structure of the token, for example 'StringToken(foo)' for the identifier	150 * structure of the token, for example 'StringToken(foo)' for the identifier

129 * token 'foo'. Use [slowToString] for the text actually parsed by the token.	151 * token 'foo'.

	152 *

	153 * Use [value] for the text actually parsed by the token.

130 */	154 */

131 String toString() => info.value.toString();	155 String toString();

132

133 /**

134 * The text parsed by this token.

135 */

136 String slowToString() => toString();

137	156

138 /**	157 /**

139 * The number of characters parsed by this token.	158 * The number of characters parsed by this token.

140 */	159 */

141 int get slowCharCount {	160 int get charCount {

142 if (info == BAD_INPUT_INFO) {	161 if (info == BAD_INPUT_INFO) {

143 // This is a token that wraps around an error message. Return 1	162 // This is a token that wraps around an error message. Return 1

144 // instead of the size of the length of the error message.	163 // instead of the size of the length of the error message.

145 return 1;	164 return 1;

146 } else {	165 } else {

147 return slowToString().length;	166 return value.length;

148 }	167 }

149 }	168 }

150	169

151 int get hashCode => computeHashCode(charOffset, info, value);	170 int get hashCode => computeHashCode(charOffset, info, value);

152 }	171 }

153	172

154 /**	173 /**

	174 * A symbol token represents the symbol in its precedence info.

	175 * Also used for end of file with EOF_INFO.

	176 */

	177 class SymbolToken extends Token {

	178

	179 final PrecedenceInfo info;

	180

	181 SymbolToken(this.info, int charOffset) : super(charOffset);

	182

	183 String get value => info.value;

	184

	185 String get stringValue => info.value;

	186

	187 bool isIdentifier() => false;

	188

	189 String toString() => "SymbolToken($value)";

	190 }

	191

	192 /**

	193 * A [BeginGroupToken] represents a symbol that may be the beginning of

	194 * a pair of brackets, i.e., ( { [ < or ${

	195 * The [endGroup] token points to the matching closing bracket in case

	196 * it can be identified during scanning.

	197 */

	198 class BeginGroupToken extends SymbolToken {

	199 Token endGroup;

	200

	201 BeginGroupToken(PrecedenceInfo info, int charOffset)

	202 : super(info, charOffset);

	203 }

	204

	205 /**

155 * A keyword token.	206 * A keyword token.

156 */	207 */

157 class KeywordToken extends Token {	208 class KeywordToken extends Token {

158 final Keyword value;	209 final Keyword keyword;

159 String get stringValue => value.syntax;	210

160	211 KeywordToken(this.keyword, int charOffset) : super(charOffset);

161 KeywordToken(Keyword value, int charOffset)	212

162 : this.value = value, super(value.info, charOffset);	213 PrecedenceInfo get info => keyword.info;

163	214

164 bool isIdentifier() => value.isPseudo \|\| value.isBuiltIn;	215 String get value => keyword.syntax;

165	216

166 String toString() => value.syntax;	217 String get stringValue => keyword.syntax;

167 }	218

168	219 bool isIdentifier() => keyword.isPseudo \|\| keyword.isBuiltIn;

169 /**	220

170 * A String-valued token.	221 String toString() => "KeywordToken($value)";

	222 }

	223

	224 /**

	225 * A String-valued token. Represents identifiers, string literals,

	226 * number literals, comments and error tokens, using the corresponding

	227 * precedence info.

171 */	228 */

172 class StringToken extends Token {	229 class StringToken extends Token {

173 final SourceString value;	230 /**

174	231 * The length thershold above which substring tokens are computed lazily.
	kasperl 2013/10/17 08:50:39: thershold -> threshold
	lukas 2013/10/17 17:49:34: Done.
175 StringToken(PrecedenceInfo info, String value, int charOffset)	232 *

176 : this.fromSource(info, new SourceString(value), charOffset);	233 * For string tokens that are substrings of the program source, the actual

177	234 * substring extraction is performed lazily. This is beneficial because

178 StringToken.fromSource(PrecedenceInfo info, this.value, int charOffset)	235 * not all scanned code is actually used. For unused parts, the substrings

179 : super(info, charOffset);	236 * are never computed and allocated.

180	237 */

181 String toString() => "StringToken(${value.slowToString()})";	238 static final int lazyThreshold = 4;
	kasperl 2013/10/17 08:50:39: static const int LAZY_THRESHOLD
	lukas 2013/10/17 17:49:34: Done.
182	239

183 String slowToString() => value.slowToString();	240 var valueOrSublist;
	kasperl 2013/10/17 08:50:39: Sublist -> LazyList?
	lukas 2013/10/17 17:49:34: Done.
184 }	241

185	242 final PrecedenceInfo info;

186 abstract class SourceString extends IterableBase<int> {	243

187 const factory SourceString(String string) = StringWrapper;	244 /**

188	245 * Creates a non-lazy string token. If [canonicalize] is true, the string

189 static final Map<String, StringWrapper> canonicalizedValues =	246 * is canonicalized before the token is created.

190 new Map<String, StringWrapper>();	247 */

191	248 StringToken.fromString(this.info, String value, int charOffset,

192 factory SourceString.fromSubstring(String string, int begin, int end) {	249 [bool canonicalize = false])

193 var substring = string.substring(begin, end);	250 : valueOrSublist = canonicalizedString(value, canonicalize),

194 return canonicalizedValues.putIfAbsent(	251 super(charOffset);

195 substring, () => new StringWrapper(substring));	252

196 }	253 /**

197	254 * Creates a lazy string token. If [canonicalize] is true, the string

198 void printOn(StringBuffer sb);	255 * is canonicalized before the token is created.

199	256 */

200 /** Gives a [SourceString] that is not including the [initial] first and	257 StringToken.fromSubstring(this.info, String data, int start, int end,

201 * [terminal] last characters. This is only intended to be used to remove	258 int charOffset, [bool canonicalize = false])

202 * quotes from string literals (including an initial '@' for raw strings).	259 : super(charOffset) {

203 */	260 int length = end - start;

204 SourceString copyWithoutQuotes(int initial, int terminal);	261 if (length <= lazyThreshold) {

205	262 valueOrSublist = canonicalizedString(data.substring(start, end),

206 String get stringValue;	263 canonicalize);

207	264 } else {

208 String slowToString();	265 valueOrSublist = new LazySubstring(data, start, length, canonicalize);

209	266 }

210 bool get isEmpty;	267 }

211	268

212 bool isPrivate();	269 /**

213 }	270 * Creates a lazy string token. If [asciiOnly] is false, the byte array

214	271 * is passed through a UTF-8 decoder.

215 class StringWrapper extends IterableBase<int> implements SourceString {	272 */

216 final String stringValue;	273 StringToken.fromUtf8Bytes(this.info, List<int> data, int start, int end,

217	274 bool asciiOnly, int charOffset)

218 const StringWrapper(this.stringValue);	275 : super(charOffset) {

219	276 int length = end - start;

220 int get hashCode => stringValue.hashCode;	277 if (length <= lazyThreshold) {

221	278 valueOrSublist = decodeUtf8(data, start, end, asciiOnly);

222 bool operator ==(other) {	279 } else {

223 return other is SourceString && toString() == other.slowToString();	280 valueOrSublist = new LazySubstring(data, start, length, asciiOnly);

224 }	281 }

225	282 }

226 Iterator<int> get iterator => new StringCodeIterator(stringValue);	283

227	284 String get value {

228 void printOn(StringBuffer sb) {	285 if (valueOrSublist is String) {

229 sb.write(stringValue);	286 return valueOrSublist;

230 }	287 } else {

231	288 if (valueOrSublist is LazySubstring) {
	kasperl 2013/10/17 08:50:39: Should this be an assert instead?
	lukas 2013/10/17 17:49:34: Done.
232 String toString() => stringValue;	289 var data = valueOrSublist.data;

233	290 int start = valueOrSublist.start;

234 String slowToString() => stringValue;	291 int end = start + valueOrSublist.length;

235	292 if (data is String) {

236 SourceString copyWithoutQuotes(int initial, int terminal) {	293 valueOrSublist = canonicalizedString(data.substring(start, end),

237 assert(0 <= initial);	294 valueOrSublist.boolValue);

238 assert(0 <= terminal);	295 } else {

239 assert(initial + terminal <= stringValue.length);	296 valueOrSublist = decodeUtf8(data, start, end,

240 return new StringWrapper(	297 valueOrSublist.boolValue);

241 stringValue.substring(initial, stringValue.length - terminal));	298 }

242 }	299 }

243	300 return valueOrSublist;

244 bool get isEmpty => stringValue.isEmpty;	301 }

245	302 }

246 bool isPrivate() => !isEmpty && stringValue.codeUnitAt(0) == $_;	303

247 }	304 String get stringValue => null;

248	305

249 class StringCodeIterator implements Iterator<int> {	306 bool isIdentifier() => identical(kind, IDENTIFIER_TOKEN);

250 final String string;	307

251 int index;	308 String toString() => "StringToken($value)";

252 final int end;	309

253 int _current;	310 // @lry replace by hash set after merging from svn trunk
	kasperl 2013/10/17 08:50:39: I guess this is done?
	lukas 2013/10/17 17:49:34: Done.
254	311 static final HashSet<String> canonicalizedSubstrings =

255 StringCodeIterator(String string) :	312 new HashSet();

256 this.string = string, index = 0, end = string.length;	313

257	314 static String canonicalizedString(String s, bool canonicalize) {

258 StringCodeIterator.substring(this.string, this.index, this.end) {	315 if (canonicalize) {
	kasperl 2013/10/17 08:50:39: I'd consider doing something like: `if (!canonicalize) return s; ...` to get rid of some of the nesting.
	lukas 2013/10/17 17:49:34: Done.
259 assert(0 <= index);	316 var result = canonicalizedSubstrings.lookup(s);

260 assert(index <= end);	317 if (result == null) {
	kasperl 2013/10/17 08:50:39: I'd consider: `if (result != null) return result;` to get rid of even more nesting.
	lukas 2013/10/17 17:49:34: Done.
261 assert(end <= string.length);	318 canonicalizedSubstrings.add(s);

262 }	319 return s;

263	320 } else {

264 int get current => _current;	321 return result;

265	322 }

266 bool moveNext() {	323 } else {

267 _current = null;	324 return s;

268 if (index >= end) return false;	325 }

269 _current = string.codeUnitAt(index++);	326 }

270 return true;	327

271 }	328 static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {

272 }	329 var s;

273	330 if (asciiOnly) {

274 class BeginGroupToken extends StringToken {	331 s = new String.fromCharCodes(data.getRange(start, end));
	kasperl 2013/10/17 08:50:39: Maybe add a comment that getRange doesn't copy the data, but it gives an iterable "view" of the range.
	lukas 2013/10/17 17:49:34: Done.
275 Token endGroup;	332 } else {

276 BeginGroupToken(PrecedenceInfo info, String value, int charOffset)	333 // TODO(lry), this is measurably slow. Also sublist is allocated eagerly.
	kasperl 2013/10/17 08:50:39: "TODO(lry)," -> "TODO(lry):". Instead of "allocated" you might use a word like "copy" or something.
	lukas 2013/10/17 17:49:34: Done.
277 : super(info, value, charOffset);	334 var bytes = data.sublist(start, end);

	335 s = UTF8.decode(bytes);

	336 }

	337 return canonicalizedString(s, true);

	338 }

	339 }

	340

	341 /**

	342 * This class represents the necessary information to compute a substring

	343 * lazily. The substring can either originate in a string or in a [:List<int>:]

	344 * of UTF-8 bytes.

	345 */

	346 abstract class LazySubstring {

	347 /** The original data, either a string or a List<int> */

	348 get data;

	349

	350 int get start;

	351 int get length;

	352

	353 /**

	354 * If this substring is based on a String, the boolean indicates whether the

	355 * resulting substring should be canonicalized.

	356 *

	357 * For substrings based on a byte array, the boolean value is true if the

	358 * array only holds ASCII characters. The resulting substring will be

	359 * canonicalized after decoding.

	360 */

	361 bool get boolValue;

	362

	363 LazySubstring.internal();

	364

	365 factory LazySubstring(data, int start, int length, bool b) {

	366 // See comment on [CompactLazySubstring].

	367 if (start < 0x100000 && length < 0x200) {

	368 int fields = (start << 9);

	369 fields = fields \| length;

	370 fields = fields << 1;

	371 if (b) fields \|= 1;

	372 return new CompactLazySubstring(data, fields);

	373 } else {

	374 return new FullLazySubstring(data, start, length, b);

	375 }

	376 }

	377 }

	378

	379 /**

	380 * This class encodes [start], [length] and [boolValue] in a single

	381 * 30 bit integer. It uses 20 bits for [start], which covers source files

	382 * of 1M. [length] has 9 bits, which covers 512 characters.

	383 *

	384 * The file html_dart2js.dart is currently around 1M.

	385 */

	386 class CompactLazySubstring extends LazySubstring {

	387 final data;

	388 final int fields;

	389

	390 CompactLazySubstring(this.data, this.fields) : super.internal();

	391

	392 int get start => fields >> 10;

	393 int get length => (fields >> 1) & 0x1ff;

	394 bool get boolValue => (fields & 1) == 1;

	395 }

	396

	397 class FullLazySubstring extends LazySubstring {

	398 final data;

	399 final int start;

	400 final int length;

	401 final bool boolValue;

	402 FullLazySubstring(this.data, this.start, this.length, this.boolValue)

	403 : super.internal();

278 }	404 }

279	405

280 bool isUserDefinableOperator(String value) {	406 bool isUserDefinableOperator(String value) {

281 return	407 return

282 isBinaryOperator(value) \|\|	408 isBinaryOperator(value) \|\|

283 isMinusOperator(value) \|\|	409 isMinusOperator(value) \|\|

284 isTernaryOperator(value) \|\|	410 isTernaryOperator(value) \|\|

285 isUnaryOperator(value);	411 isUnaryOperator(value);

286 }	412 }

287	413

(...skipping 250 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
538	664

539 const PrecedenceInfo STRING_INTERPOLATION_IDENTIFIER_INFO =	665 const PrecedenceInfo STRING_INTERPOLATION_IDENTIFIER_INFO =

540 const PrecedenceInfo('\$', 0,	666 const PrecedenceInfo('\$', 0,

541 STRING_INTERPOLATION_IDENTIFIER_TOKEN);	667 STRING_INTERPOLATION_IDENTIFIER_TOKEN);

542	668

543 const PrecedenceInfo HEXADECIMAL_INFO =	669 const PrecedenceInfo HEXADECIMAL_INFO =

544 const PrecedenceInfo('hexadecimal', 0, HEXADECIMAL_TOKEN);	670 const PrecedenceInfo('hexadecimal', 0, HEXADECIMAL_TOKEN);

545	671

546 const PrecedenceInfo COMMENT_INFO =	672 const PrecedenceInfo COMMENT_INFO =

547 const PrecedenceInfo('comment', 0, COMMENT_TOKEN);	673 const PrecedenceInfo('comment', 0, COMMENT_TOKEN);

OLD	NEW