Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(151)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Re-add ArrayBasedScanner, minor fixes. Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of scanner; 5 part of scanner;
6 6
7 abstract class Scanner { 7 abstract class Scanner {
8 Token tokenize(); 8 Token tokenize();
9
10 factory Scanner(SourceFile file, {bool includeComments: false}) {
11 if (file is Utf8BytesSourceFile) {
12 return new Utf8BytesScanner(file, includeComments: includeComments);
13 } else {
14 return new StringScanner(file, includeComments: includeComments);
15 }
16 }
9 } 17 }
10 18
11 /** 19 abstract class AbstractScanner implements Scanner {
12 * Common base class for a Dart scanner. 20 final bool includeComments;
13 */
14 abstract class AbstractScanner<T extends SourceString> implements Scanner {
15 int advance();
16 int nextByte();
17 21
18 /** 22 /**
19 * Returns the current character or byte depending on the underlying input 23 * The string offset for the next token that will be created.
20 * kind. For example, [StringScanner] operates on [String] and thus returns 24 *
21 * characters (Unicode codepoints represented as int) whereas 25 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values
22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. 26 * are different. One string character can be encoded using multiple UTF-8
27 * bytes.
28 */
29 int tokenStart = -1;
30
31 /**
32 * A pointer to the token stream created by this scanner. The first token
33 * is a special token and not part of the source file. This is an
34 * implementation detail to avoid special cases in the scanner. This token
35 * is not exposed to clients of the scanner, which are expected to invoke
36 * [firstToken] to access the token stream.
37 */
38 final Token tokens = new SymbolToken(EOF_INFO, -1);
39
40 /**
41 * A pointer to the last scanned token.
42 */
43 Token tail;
44
45 /**
46 * The source file that is being scanned. This field can be [:null:].
47 * If the source file is available, the scanner assigns its [:lineStarts:] and
48 * [:length:] fields at the end of [tokenize].
49 */
50 final SourceFile file;
51
52 final List<int> lineStarts = [0];
53
54 AbstractScanner(this.file, this.includeComments) {
55 this.tail = this.tokens;
56 }
57
58
59 /**
60 * Advances and returns the next character.
61 *
62 * If the next character is non-ASCII, then the returned value depends on the
63 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
64 * the [StringScanner] returns a UTF-16 code unit.
65 *
66 * The scanner ensures that [advance] is not invoked after it returned [$EOF].
67 * This allows implementations to omit bound checks if the data structure ends
68 * with '0'.
69 */
70 int advance();
71
72 /**
73 * Returns the current unicode character.
74 *
75 * If the current character is ASCII, then it is returned unchanged.
76 *
77 * The [Utf8BytesScanner] decodes the next unicode code point starting at the
78 * current position. Note that every unicode character is returned as a single
79 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following
80 * [advance] returns the next character.
81 *
82 * The [StringScanner] returns the current character unchanged, which might
83 * be a surrogate character. In the case of '\u{1d11e}', it returns the first
84 * code unit 55348, and the following [advance] returns the second code unit
85 * 56606.
86 *
87 * Invoking [currentAsUnicode] multiple times is safe, i.e.,
88 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
89 */
90 int currentAsUnicode(int next);
91
92 /**
93 * Returns the character at the next position. Like in [advance], the
94 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
95 * a UTF-16 code unit.
23 */ 96 */
24 int peek(); 97 int peek();
25 98
26 /** 99 /**
27 * Appends a fixed token based on whether the current char is [choice] or not. 100 * Notifies the scanner that unicode characters were detected in either a
28 * If the current char is [choice] a fixed token whose kind and content 101 * comment or a string literal between [startScanOffset] and the current
29 * is determined by [yes] is appended, otherwise a fixed token whose kind 102 * scan offset.
30 * and content is determined by [no] is appended.
31 */ 103 */
104 void handleUnicode(int startScanOffset);
105
106 /**
107 * Returns the current scan offset.
108 *
109 * In the [Utf8BytesScanner] this is the offset into the byte list, in the
110 * [StringScanner] the offset in the source string.
111 */
112 int get scanOffset;
113
114 /**
115 * Returns the current string offset.
116 *
117 * In the [StringScanner] this is identical to the [scanOffset]. In the
118 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
119 */
120 int get stringOffset;
121
122 /**
123 * Returns the first token scanned by this [Scanner].
124 */
125 Token firstToken();
126
127 /**
128 * Returns the last token scanned by this [Scanner].
129 */
130 Token previousToken();
131
132 /**
133 * Notifies that a new token starts at current offset.
134 */
135 void beginToken() {
136 tokenStart = stringOffset;
137 }
138
139 /**
140 * Appends a substring from the scan offset [:start:] to the current
141 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
142 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
143 * substring [5,9).
144 *
145 * Note that [extraOffset] can only be used if the covered character(s) are
146 * known to be ASCII.
147 */
148 void appendSubstringToken(PrecedenceInfo info, int start,
149 bool asciiOnly, [int extraOffset]);
150
151 /** Documentation in subclass [ArrayBasedScanner] */
kasperl 2013/10/17 08:50:39 Terminate these comments /** Documentation in subc
lukas 2013/10/17 17:49:34 Done.
152 void appendStringToken(PrecedenceInfo info, String value);
153
154 /** Documentation in subclass [ArrayBasedScanner] */
155 void appendPrecedenceToken(PrecedenceInfo info);
156
157 /** Documentation in subclass [ArrayBasedScanner] */
32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); 158 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
33 159
34 /** 160 /** Documentation in subclass [ArrayBasedScanner] */
35 * Appends a fixed token whose kind and content is determined by [info]. 161 void appendKeywordToken(Keyword keyword);
36 */
37 void appendPrecedenceToken(PrecedenceInfo info);
38 162
39 /** 163 /** Documentation in subclass [ArrayBasedScanner] */
40 * Appends a token whose kind is determined by [info] and content is [value].
41 */
42 void appendStringToken(PrecedenceInfo info, String value);
43
44 /**
45 * Appends a token whose kind is determined by [info] and content is defined
46 * by the SourceString [value].
47 */
48 void appendByteStringToken(PrecedenceInfo info, T value);
49
50 /**
51 * Appends a keyword token whose kind is determined by [keyword].
52 */
53 void appendKeywordToken(Keyword keyword);
54 void appendWhiteSpace(int next);
55 void appendEofToken(); 164 void appendEofToken();
56 165
57 /** 166 /** Documentation in subclass [ArrayBasedScanner] */
58 * Creates an ASCII SourceString whose content begins at the source byte 167 void appendWhiteSpace(int next);
59 * offset [start] and ends at [offset] bytes from the current byte offset of
60 * the scanner. For example, if the current byte offset is 10,
61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found
62 * at the [0,9[ byte interval of the source text.
63 */
64 T asciiString(int start, int offset);
65 T utf8String(int start, int offset);
66 Token firstToken();
67 Token previousToken();
68 void beginToken();
69 void addToCharOffset(int offset);
70 int get charOffset;
71 int get byteOffset;
72 void appendBeginGroup(PrecedenceInfo info, String value);
73 int appendEndGroup(PrecedenceInfo info, String value, int openKind);
74 void appendGt(PrecedenceInfo info, String value);
75 void appendGtGt(PrecedenceInfo info, String value);
76 void appendGtGtGt(PrecedenceInfo info, String value);
77 void appendComment();
78 168
79 /** 169 /** Documentation in subclass [ArrayBasedScanner] */
80 * We call this method to discard '<' from the "grouping" stack 170 void lineFeedInMultiline();
81 * (maintained by subclasses). 171
82 * 172 /** Documentation in subclass [ArrayBasedScanner] */
83 * [PartialParser.skipExpression] relies on the fact that we do not 173 void appendBeginGroup(PrecedenceInfo info);
84 * create groups for stuff like: 174
85 * [:a = b < c, d = e > f:]. 175 /** Documentation in subclass [ArrayBasedScanner] */
86 * 176 int appendEndGroup(PrecedenceInfo info, int openKind);
87 * In other words, this method is called when the scanner recognizes 177
88 * something which cannot possibly be part of a type 178 /** Documentation in subclass [ArrayBasedScanner] */
89 * parameter/argument list. 179 void appendGt(PrecedenceInfo info);
90 */ 180
181 /** Documentation in subclass [ArrayBasedScanner] */
182 void appendGtGt(PrecedenceInfo info);
183
184 /** Documentation in subclass [ArrayBasedScanner] */
185 void appendComment(start, bool asciiOnly);
186
187 /** Documentation in subclass [ArrayBasedScanner] */
91 void discardOpenLt(); 188 void discardOpenLt();
92 189
93 // TODO(ahe): Move this class to implementation. 190 // TODO(ahe): Move this class to implementation.
94 191
95 Token tokenize() { 192 Token tokenize() {
96 int next = advance(); 193 int next = advance();
97 while (!identical(next, $EOF)) { 194 while (!identical(next, $EOF)) {
98 next = bigSwitch(next); 195 next = bigSwitch(next);
99 } 196 }
100 appendEofToken(); 197 appendEofToken();
198
199 if (file != null) {
200 file.length = stringOffset;
201 // One additional line start at the end, see [SourceFile.lineStarts].
202 lineStarts.add(stringOffset + 1);
203 file.lineStarts = lineStarts;
204 }
205
101 return firstToken(); 206 return firstToken();
102 } 207 }
103 208
104 int bigSwitch(int next) { 209 int bigSwitch(int next) {
105 beginToken(); 210 beginToken();
106 if (identical(next, $SPACE) || identical(next, $TAB) 211 if (identical(next, $SPACE) || identical(next, $TAB)
107 || identical(next, $LF) || identical(next, $CR)) { 212 || identical(next, $LF) || identical(next, $CR)) {
108 appendWhiteSpace(next); 213 appendWhiteSpace(next);
109 next = advance(); 214 next = advance();
215 // Sequences of spaces are common, so advance through them fast.
110 while (identical(next, $SPACE)) { 216 while (identical(next, $SPACE)) {
111 appendWhiteSpace(next); 217 // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
218 // assuming that it does not do anything for space characters.
112 next = advance(); 219 next = advance();
113 } 220 }
114 return next; 221 return next;
115 } 222 }
116 223
117 if ($a <= next && next <= $z) { 224 if ($a <= next && next <= $z) {
118 if (identical($r, next)) { 225 if (identical($r, next)) {
119 return tokenizeRawStringKeywordOrIdentifier(next); 226 return tokenizeRawStringKeywordOrIdentifier(next);
120 } 227 }
121 return tokenizeKeywordOrIdentifier(next, true); 228 return tokenizeKeywordOrIdentifier(next, true);
122 } 229 }
123 230
124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$) ) { 231 if (($A <= next && next <= $Z) ||
125 return tokenizeIdentifier(next, byteOffset, true); 232 identical(next, $_) ||
233 identical(next, $$)) {
234 return tokenizeIdentifier(next, scanOffset, true);
126 } 235 }
127 236
128 if (identical(next, $LT)) { 237 if (identical(next, $LT)) {
129 return tokenizeLessThan(next); 238 return tokenizeLessThan(next);
130 } 239 }
131 240
132 if (identical(next, $GT)) { 241 if (identical(next, $GT)) {
133 return tokenizeGreaterThan(next); 242 return tokenizeGreaterThan(next);
134 } 243 }
135 244
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
180 if (identical(next, $BACKSLASH)) { 289 if (identical(next, $BACKSLASH)) {
181 appendPrecedenceToken(BACKSLASH_INFO); 290 appendPrecedenceToken(BACKSLASH_INFO);
182 return advance(); 291 return advance();
183 } 292 }
184 293
185 if (identical(next, $HASH)) { 294 if (identical(next, $HASH)) {
186 return tokenizeTag(next); 295 return tokenizeTag(next);
187 } 296 }
188 297
189 if (identical(next, $OPEN_PAREN)) { 298 if (identical(next, $OPEN_PAREN)) {
190 appendBeginGroup(OPEN_PAREN_INFO, "("); 299 appendBeginGroup(OPEN_PAREN_INFO);
191 return advance(); 300 return advance();
192 } 301 }
193 302
194 if (identical(next, $CLOSE_PAREN)) { 303 if (identical(next, $CLOSE_PAREN)) {
195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); 304 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
196 } 305 }
197 306
198 if (identical(next, $COMMA)) { 307 if (identical(next, $COMMA)) {
199 appendPrecedenceToken(COMMA_INFO); 308 appendPrecedenceToken(COMMA_INFO);
200 return advance(); 309 return advance();
201 } 310 }
202 311
203 if (identical(next, $COLON)) { 312 if (identical(next, $COLON)) {
204 appendPrecedenceToken(COLON_INFO); 313 appendPrecedenceToken(COLON_INFO);
205 return advance(); 314 return advance();
206 } 315 }
207 316
208 if (identical(next, $SEMICOLON)) { 317 if (identical(next, $SEMICOLON)) {
209 appendPrecedenceToken(SEMICOLON_INFO); 318 appendPrecedenceToken(SEMICOLON_INFO);
210 // Type parameters and arguments cannot contain semicolon. 319 // Type parameters and arguments cannot contain semicolon.
211 discardOpenLt(); 320 discardOpenLt();
212 return advance(); 321 return advance();
213 } 322 }
214 323
215 if (identical(next, $QUESTION)) { 324 if (identical(next, $QUESTION)) {
216 appendPrecedenceToken(QUESTION_INFO); 325 appendPrecedenceToken(QUESTION_INFO);
217 return advance(); 326 return advance();
218 } 327 }
219 328
220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 329 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", 330 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
222 OPEN_SQUARE_BRACKET_TOKEN); 331 OPEN_SQUARE_BRACKET_TOKEN);
223 } 332 }
224 333
225 if (identical(next, $BACKPING)) { 334 if (identical(next, $BACKPING)) {
226 appendPrecedenceToken(BACKPING_INFO); 335 appendPrecedenceToken(BACKPING_INFO);
227 return advance(); 336 return advance();
228 } 337 }
229 338
230 if (identical(next, $OPEN_CURLY_BRACKET)) { 339 if (identical(next, $OPEN_CURLY_BRACKET)) {
231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); 340 appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
232 return advance(); 341 return advance();
233 } 342 }
234 343
235 if (identical(next, $CLOSE_CURLY_BRACKET)) { 344 if (identical(next, $CLOSE_CURLY_BRACKET)) {
236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", 345 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
237 OPEN_CURLY_BRACKET_TOKEN); 346 OPEN_CURLY_BRACKET_TOKEN);
238 } 347 }
239 348
240 if (identical(next, $SLASH)) { 349 if (identical(next, $SLASH)) {
241 return tokenizeSlashOrComment(next); 350 return tokenizeSlashOrComment(next);
242 } 351 }
243 352
244 if (identical(next, $AT)) { 353 if (identical(next, $AT)) {
245 return tokenizeAt(next); 354 return tokenizeAt(next);
246 } 355 }
247 356
248 if (identical(next, $DQ) || identical(next, $SQ)) { 357 if (identical(next, $DQ) || identical(next, $SQ)) {
249 return tokenizeString(next, byteOffset, false); 358 return tokenizeString(next, scanOffset, false);
250 } 359 }
251 360
252 if (identical(next, $PERIOD)) { 361 if (identical(next, $PERIOD)) {
253 return tokenizeDotsOrNumber(next); 362 return tokenizeDotsOrNumber(next);
254 } 363 }
255 364
256 if (identical(next, $0)) { 365 if (identical(next, $0)) {
257 return tokenizeHexOrNumber(next); 366 return tokenizeHexOrNumber(next);
258 } 367 }
259 368
260 // TODO(ahe): Would a range check be faster? 369 // TODO(ahe): Would a range check be faster?
261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) 370 if (identical(next, $1) || identical(next, $2) || identical(next, $3)
262 || identical(next, $4) || identical(next, $5) || identical(next, $6) 371 || identical(next, $4) || identical(next, $5) || identical(next, $6)
263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { 372 || identical(next, $7) || identical(next, $8) || identical(next, $9)) {
264 return tokenizeNumber(next); 373 return tokenizeNumber(next);
265 } 374 }
266 375
267 if (identical(next, $EOF)) { 376 if (identical(next, $EOF)) {
268 return $EOF; 377 return $EOF;
269 } 378 }
270 if (next < 0x1f) { 379 if (next < 0x1f) {
271 return error(new SourceString("unexpected character $next")); 380 return error("unexpected character $next");
381 }
382
383 if (next >= 128) {
384 next = currentAsUnicode(next);
272 } 385 }
273 386
274 // The following are non-ASCII characters. 387 // The following are non-ASCII characters.
kasperl 2013/10/17 08:50:39 Can the check for $NBSP be guarded by the next >=
lukas 2013/10/17 17:49:34 Actually we can just remove the check for >= 128.
275 388
276 if (identical(next, $NBSP)) { 389 if (identical(next, $NBSP)) {
277 appendWhiteSpace(next); 390 appendWhiteSpace(next);
278 return advance(); 391 return advance();
279 } 392 }
280 393
281 return tokenizeIdentifier(next, byteOffset, true); 394 return error("unexpected unicode character $next");
282 } 395 }
283 396
284 int tokenizeTag(int next) { 397 int tokenizeTag(int next) {
285 // # or #!.*[\n\r] 398 // # or #!.*[\n\r]
286 if (byteOffset == 0) { 399 if (scanOffset == 0) {
287 if (identical(peek(), $BANG)) { 400 if (identical(peek(), $BANG)) {
401 int start = scanOffset + 1;
402 bool asciiOnly = true;
288 do { 403 do {
289 next = advance(); 404 next = advance();
290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(ne xt, $EOF)); 405 if (next > 127) asciiOnly = false;
406 } while (!identical(next, $LF) &&
407 !identical(next, $CR) &&
408 !identical(next, $EOF));
409 if (!asciiOnly) handleUnicode(start);
291 return next; 410 return next;
292 } 411 }
293 } 412 }
294 appendPrecedenceToken(HASH_INFO); 413 appendPrecedenceToken(HASH_INFO);
295 return advance(); 414 return advance();
296 } 415 }
297 416
298 int tokenizeTilde(int next) { 417 int tokenizeTilde(int next) {
299 // ~ ~/ ~/= 418 // ~ ~/ ~/=
300 next = advance(); 419 next = advance();
301 if (identical(next, $SLASH)) { 420 if (identical(next, $SLASH)) {
302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); 421 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
303 } else { 422 } else {
304 appendPrecedenceToken(TILDE_INFO); 423 appendPrecedenceToken(TILDE_INFO);
305 return next; 424 return next;
306 } 425 }
307 } 426 }
308 427
309 int tokenizeOpenSquareBracket(int next) { 428 int tokenizeOpenSquareBracket(int next) {
310 // [ [] []= 429 // [ [] []=
311 next = advance(); 430 next = advance();
312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 431 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
313 Token token = previousToken(); 432 Token token = previousToken();
314 if (token is KeywordToken && identical(token.value.stringValue, 'operator' )) { 433 if (token is KeywordToken &&
434 identical((token as KeywordToken).keyword.syntax, 'operator')) {
315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); 435 return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
316 } 436 }
317 } 437 }
318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); 438 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
319 return next; 439 return next;
320 } 440 }
321 441
322 int tokenizeCaret(int next) { 442 int tokenizeCaret(int next) {
323 // ^ ^= 443 // ^ ^=
324 return select($EQ, CARET_EQ_INFO, CARET_INFO); 444 return select($EQ, CARET_EQ_INFO, CARET_INFO);
325 } 445 }
326 446
327 int tokenizeBar(int next) { 447 int tokenizeBar(int next) {
328 // | || |= 448 // | || |=
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
372 return advance(); 492 return advance();
373 } else if (identical(next, $EQ)) { 493 } else if (identical(next, $EQ)) {
374 appendPrecedenceToken(MINUS_EQ_INFO); 494 appendPrecedenceToken(MINUS_EQ_INFO);
375 return advance(); 495 return advance();
376 } else { 496 } else {
377 appendPrecedenceToken(MINUS_INFO); 497 appendPrecedenceToken(MINUS_INFO);
378 return next; 498 return next;
379 } 499 }
380 } 500 }
381 501
382
383 int tokenizePlus(int next) { 502 int tokenizePlus(int next) {
384 // + ++ += 503 // + ++ +=
385 next = advance(); 504 next = advance();
386 if (identical($PLUS, next)) { 505 if (identical($PLUS, next)) {
387 appendPrecedenceToken(PLUS_PLUS_INFO); 506 appendPrecedenceToken(PLUS_PLUS_INFO);
388 return advance(); 507 return advance();
389 } else if (identical($EQ, next)) { 508 } else if (identical($EQ, next)) {
390 appendPrecedenceToken(PLUS_EQ_INFO); 509 appendPrecedenceToken(PLUS_EQ_INFO);
391 return advance(); 510 return advance();
392 } else { 511 } else {
393 appendPrecedenceToken(PLUS_INFO); 512 appendPrecedenceToken(PLUS_INFO);
394 return next; 513 return next;
395 } 514 }
396 } 515 }
397 516
398 int tokenizeExclamation(int next) { 517 int tokenizeExclamation(int next) {
399 // ! != !== 518 // ! !=
519 // !== is kept for user-friendly error reporting
kasperl 2013/10/17 08:50:39 Nit: I'd terminate the ... is kept ... comments wi
lukas 2013/10/17 17:49:34 Done.
520
400 next = advance(); 521 next = advance();
401 if (identical(next, $EQ)) { 522 if (identical(next, $EQ)) {
402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); 523 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
403 } 524 }
404 appendPrecedenceToken(BANG_INFO); 525 appendPrecedenceToken(BANG_INFO);
405 return next; 526 return next;
406 } 527 }
407 528
408 int tokenizeEquals(int next) { 529 int tokenizeEquals(int next) {
409 // = == === 530 // = == =>
531 // === is kept for user-friendly error reporting
410 532
411 // Type parameters and arguments cannot contain any token that 533 // Type parameters and arguments cannot contain any token that
412 // starts with '='. 534 // starts with '='.
413 discardOpenLt(); 535 discardOpenLt();
414 536
415 next = advance(); 537 next = advance();
416 if (identical(next, $EQ)) { 538 if (identical(next, $EQ)) {
417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); 539 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
418 } else if (identical(next, $GT)) { 540 } else if (identical(next, $GT)) {
419 appendPrecedenceToken(FUNCTION_INFO); 541 appendPrecedenceToken(FUNCTION_INFO);
420 return advance(); 542 return advance();
421 } 543 }
422 appendPrecedenceToken(EQ_INFO); 544 appendPrecedenceToken(EQ_INFO);
423 return next; 545 return next;
424 } 546 }
425 547
426 int tokenizeGreaterThan(int next) { 548 int tokenizeGreaterThan(int next) {
427 // > >= >> >>= >>> >>>= 549 // > >= >> >>=
428 next = advance(); 550 next = advance();
429 if (identical($EQ, next)) { 551 if (identical($EQ, next)) {
430 appendPrecedenceToken(GT_EQ_INFO); 552 appendPrecedenceToken(GT_EQ_INFO);
431 return advance(); 553 return advance();
432 } else if (identical($GT, next)) { 554 } else if (identical($GT, next)) {
433 next = advance(); 555 next = advance();
434 if (identical($EQ, next)) { 556 if (identical($EQ, next)) {
435 appendPrecedenceToken(GT_GT_EQ_INFO); 557 appendPrecedenceToken(GT_GT_EQ_INFO);
436 return advance(); 558 return advance();
437 } else { 559 } else {
438 appendGtGt(GT_GT_INFO, ">>"); 560 appendGtGt(GT_GT_INFO);
439 return next; 561 return next;
440 } 562 }
441 } else { 563 } else {
442 appendGt(GT_INFO, ">"); 564 appendGt(GT_INFO);
443 return next; 565 return next;
444 } 566 }
445 } 567 }
446 568
447 int tokenizeLessThan(int next) { 569 int tokenizeLessThan(int next) {
448 // < <= << <<= 570 // < <= << <<=
449 next = advance(); 571 next = advance();
450 if (identical($EQ, next)) { 572 if (identical($EQ, next)) {
451 appendPrecedenceToken(LT_EQ_INFO); 573 appendPrecedenceToken(LT_EQ_INFO);
452 return advance(); 574 return advance();
453 } else if (identical($LT, next)) { 575 } else if (identical($LT, next)) {
454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); 576 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
455 } else { 577 } else {
456 appendBeginGroup(LT_INFO, "<"); 578 appendBeginGroup(LT_INFO);
457 return next; 579 return next;
458 } 580 }
459 } 581 }
460 582
461 int tokenizeNumber(int next) { 583 int tokenizeNumber(int next) {
462 int start = byteOffset; 584 int start = scanOffset;
463 while (true) { 585 while (true) {
464 next = advance(); 586 next = advance();
465 if ($0 <= next && next <= $9) { 587 if ($0 <= next && next <= $9) {
466 continue; 588 continue;
467 } else if (identical(next, $e) || identical(next, $E)) { 589 } else if (identical(next, $e) || identical(next, $E)) {
468 return tokenizeFractionPart(next, start); 590 return tokenizeFractionPart(next, start);
469 } else { 591 } else {
470 if (identical(next, $PERIOD)) { 592 if (identical(next, $PERIOD)) {
471 int nextnext = peek(); 593 int nextnext = peek();
472 if ($0 <= nextnext && nextnext <= $9) { 594 if ($0 <= nextnext && nextnext <= $9) {
473 return tokenizeFractionPart(advance(), start); 595 return tokenizeFractionPart(advance(), start);
474 } 596 }
475 } 597 }
476 appendByteStringToken(INT_INFO, asciiString(start, 0)); 598 appendSubstringToken(INT_INFO, start, true);
477 return next; 599 return next;
478 } 600 }
479 } 601 }
480 } 602 }
481 603
482 int tokenizeHexOrNumber(int next) { 604 int tokenizeHexOrNumber(int next) {
483 int x = peek(); 605 int x = peek();
484 if (identical(x, $x) || identical(x, $X)) { 606 if (identical(x, $x) || identical(x, $X)) {
485 advance(); 607 return tokenizeHex(next);
486 return tokenizeHex(x);
487 } 608 }
488 return tokenizeNumber(next); 609 return tokenizeNumber(next);
489 } 610 }
490 611
491 int tokenizeHex(int next) { 612 int tokenizeHex(int next) {
492 int start = byteOffset - 1; 613 int start = scanOffset;
614 next = advance(); // Advance past the $x or $X.
493 bool hasDigits = false; 615 bool hasDigits = false;
494 while (true) { 616 while (true) {
495 next = advance(); 617 next = advance();
496 if (($0 <= next && next <= $9) 618 if (($0 <= next && next <= $9)
497 || ($A <= next && next <= $F) 619 || ($A <= next && next <= $F)
498 || ($a <= next && next <= $f)) { 620 || ($a <= next && next <= $f)) {
499 hasDigits = true; 621 hasDigits = true;
500 } else { 622 } else {
501 if (!hasDigits) { 623 if (!hasDigits) {
502 return error(const SourceString("hex digit expected")); 624 return error("hex digit expected");
503 } 625 }
504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); 626 appendSubstringToken(HEXADECIMAL_INFO, start, true);
505 return next; 627 return next;
506 } 628 }
507 } 629 }
508 } 630 }
509 631
510 int tokenizeDotsOrNumber(int next) { 632 int tokenizeDotsOrNumber(int next) {
511 int start = byteOffset; 633 int start = scanOffset;
512 next = advance(); 634 next = advance();
513 if (($0 <= next && next <= $9)) { 635 if (($0 <= next && next <= $9)) {
514 return tokenizeFractionPart(next, start); 636 return tokenizeFractionPart(next, start);
515 } else if (identical($PERIOD, next)) { 637 } else if (identical($PERIOD, next)) {
516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 638 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
517 } else { 639 } else {
518 appendPrecedenceToken(PERIOD_INFO); 640 appendPrecedenceToken(PERIOD_INFO);
519 return next; 641 return next;
520 } 642 }
521 } 643 }
522 644
523 int tokenizeFractionPart(int next, int start) { 645 int tokenizeFractionPart(int next, int start) {
524 bool done = false; 646 bool done = false;
525 bool hasDigit = false; 647 bool hasDigit = false;
526 LOOP: while (!done) { 648 LOOP: while (!done) {
527 if ($0 <= next && next <= $9) { 649 if ($0 <= next && next <= $9) {
528 hasDigit = true; 650 hasDigit = true;
529 } else if (identical($e, next) || identical($E, next)) { 651 } else if (identical($e, next) || identical($E, next)) {
530 hasDigit = true; 652 hasDigit = true;
531 next = tokenizeExponent(advance()); 653 next = tokenizeExponent(advance());
532 done = true; 654 done = true;
533 continue LOOP; 655 continue LOOP;
534 } else { 656 } else {
535 done = true; 657 done = true;
536 continue LOOP; 658 continue LOOP;
537 } 659 }
538 next = advance(); 660 next = advance();
539 } 661 }
540 if (!hasDigit) { 662 if (!hasDigit) {
541 appendByteStringToken(INT_INFO, asciiString(start, -1)); 663 // Reduce offset, we already advanced to the token past the period.
664 appendSubstringToken(INT_INFO, start, true, -1);
665
666 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
667 // the scanner already advanced past the period.
542 if (identical($PERIOD, next)) { 668 if (identical($PERIOD, next)) {
543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 669 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
544 } 670 }
545 // TODO(ahe): Wrong offset for the period.
546 appendPrecedenceToken(PERIOD_INFO); 671 appendPrecedenceToken(PERIOD_INFO);
547 return bigSwitch(next); 672 return next;
548 } 673 }
549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); 674 appendSubstringToken(DOUBLE_INFO, start, true);
550 return next; 675 return next;
551 } 676 }
552 677
553 int tokenizeExponent(int next) { 678 int tokenizeExponent(int next) {
554 if (identical(next, $PLUS) || identical(next, $MINUS)) { 679 if (identical(next, $PLUS) || identical(next, $MINUS)) {
555 next = advance(); 680 next = advance();
556 } 681 }
557 bool hasDigits = false; 682 bool hasDigits = false;
558 while (true) { 683 while (true) {
559 if ($0 <= next && next <= $9) { 684 if ($0 <= next && next <= $9) {
560 hasDigits = true; 685 hasDigits = true;
561 } else { 686 } else {
562 if (!hasDigits) { 687 if (!hasDigits) {
563 return error(const SourceString("digit expected")); 688 return error("digit expected");
564 } 689 }
565 return next; 690 return next;
566 } 691 }
567 next = advance(); 692 next = advance();
568 } 693 }
569 } 694 }
570 695
571 int tokenizeSlashOrComment(int next) { 696 int tokenizeSlashOrComment(int next) {
697 int start = scanOffset;
572 next = advance(); 698 next = advance();
573 if (identical($STAR, next)) { 699 if (identical($STAR, next)) {
574 return tokenizeMultiLineComment(next); 700 return tokenizeMultiLineComment(next, start);
575 } else if (identical($SLASH, next)) { 701 } else if (identical($SLASH, next)) {
576 return tokenizeSingleLineComment(next); 702 return tokenizeSingleLineComment(next, start);
577 } else if (identical($EQ, next)) { 703 } else if (identical($EQ, next)) {
578 appendPrecedenceToken(SLASH_EQ_INFO); 704 appendPrecedenceToken(SLASH_EQ_INFO);
579 return advance(); 705 return advance();
580 } else { 706 } else {
581 appendPrecedenceToken(SLASH_INFO); 707 appendPrecedenceToken(SLASH_INFO);
582 return next; 708 return next;
583 } 709 }
584 } 710 }
585 711
586 int tokenizeSingleLineComment(int next) { 712 int tokenizeSingleLineComment(int next, int start) {
713 bool asciiOnly = true;
587 while (true) { 714 while (true) {
588 next = advance(); 715 next = advance();
589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) { 716 if (next > 127) asciiOnly = false;
590 appendComment(); 717 if (identical($LF, next) ||
718 identical($CR, next) ||
719 identical($EOF, next)) {
720 if (!asciiOnly) handleUnicode(start);
721 appendComment(start, asciiOnly);
591 return next; 722 return next;
592 } 723 }
593 } 724 }
594 } 725 }
595 726
596 int tokenizeMultiLineComment(int next) { 727
728 int tokenizeMultiLineComment(int next, int start) {
729 bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
730 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
731 int unicodeStart = start;
597 int nesting = 1; 732 int nesting = 1;
598 next = advance(); 733 next = advance();
599 while (true) { 734 while (true) {
600 if (identical($EOF, next)) { 735 if (identical($EOF, next)) {
601 // TODO(ahe): Report error. 736 if (!asciiOnlyLines) handleUnicode(unicodeStart);
737 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");
602 return next; 738 return next;
603 } else if (identical($STAR, next)) { 739 } else if (identical($STAR, next)) {
604 next = advance(); 740 next = advance();
605 if (identical($SLASH, next)) { 741 if (identical($SLASH, next)) {
606 --nesting; 742 --nesting;
607 if (0 == nesting) { 743 if (0 == nesting) {
744 if (!asciiOnlyLines) handleUnicode(unicodeStart);
608 next = advance(); 745 next = advance();
609 appendComment(); 746 appendComment(start, asciiOnlyComment);
610 return next; 747 return next;
611 } else { 748 } else {
612 next = advance(); 749 next = advance();
613 } 750 }
614 } 751 }
615 } else if (identical($SLASH, next)) { 752 } else if (identical($SLASH, next)) {
616 next = advance(); 753 next = advance();
617 if (identical($STAR, next)) { 754 if (identical($STAR, next)) {
618 next = advance(); 755 next = advance();
619 ++nesting; 756 ++nesting;
620 } 757 }
758 } else if (identical(next, $LF)) {
759 if (!asciiOnlyLines) {
760 // Synchronize the string offset in the utf8 scanner.
761 handleUnicode(unicodeStart);
762 asciiOnlyLines = true;
763 unicodeStart = scanOffset;
764 }
765 lineFeedInMultiline();
766 next = advance();
621 } else { 767 } else {
768 if (next > 127) {
769 asciiOnlyLines = false;
770 asciiOnlyComment = false;
771 }
622 next = advance(); 772 next = advance();
623 } 773 }
624 } 774 }
625 } 775 }
626 776
627 int tokenizeRawStringKeywordOrIdentifier(int next) { 777 int tokenizeRawStringKeywordOrIdentifier(int next) {
778 // [next] is $r.
628 int nextnext = peek(); 779 int nextnext = peek();
629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { 780 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
630 int start = byteOffset; 781 int start = scanOffset;
631 next = advance(); 782 next = advance();
632 return tokenizeString(next, start, true); 783 return tokenizeString(next, start, true);
633 } 784 }
634 return tokenizeKeywordOrIdentifier(next, true); 785 return tokenizeKeywordOrIdentifier(next, true);
635 } 786 }
636 787
637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { 788 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
638 KeywordState state = KeywordState.KEYWORD_STATE; 789 KeywordState state = KeywordState.KEYWORD_STATE;
639 int start = byteOffset; 790 int start = scanOffset;
640 while (state != null && $a <= next && next <= $z) { 791 while (state != null && $a <= next && next <= $z) {
641 state = state.next(next); 792 state = state.next(next);
642 next = advance(); 793 next = advance();
643 } 794 }
644 if (state == null || state.keyword == null) { 795 if (state == null || state.keyword == null) {
645 return tokenizeIdentifier(next, start, allowDollar); 796 return tokenizeIdentifier(next, start, allowDollar);
646 } 797 }
647 if (($A <= next && next <= $Z) || 798 if (($A <= next && next <= $Z) ||
648 ($0 <= next && next <= $9) || 799 ($0 <= next && next <= $9) ||
649 identical(next, $_) || 800 identical(next, $_) ||
650 identical(next, $$)) { 801 identical(next, $$)) {
651 return tokenizeIdentifier(next, start, allowDollar); 802 return tokenizeIdentifier(next, start, allowDollar);
652 } else if (next < 128) { 803 } else {
653 appendKeywordToken(state.keyword); 804 appendKeywordToken(state.keyword);
654 return next; 805 return next;
655 } else {
656 return tokenizeIdentifier(next, start, allowDollar);
657 } 806 }
658 } 807 }
659 808
809 /**
810 * [allowDollar] can exclude '$', which is not allowed as part of a string
811 * interpolation identifier.
812 */
660 int tokenizeIdentifier(int next, int start, bool allowDollar) { 813 int tokenizeIdentifier(int next, int start, bool allowDollar) {
661 bool isAscii = true;
662
663 while (true) { 814 while (true) {
664 if (($a <= next && next <= $z) || 815 if (($a <= next && next <= $z) ||
665 ($A <= next && next <= $Z) || 816 ($A <= next && next <= $Z) ||
666 ($0 <= next && next <= $9) || 817 ($0 <= next && next <= $9) ||
667 identical(next, $_) || 818 identical(next, $_) ||
668 (identical(next, $$) && allowDollar)) { 819 (identical(next, $$) && allowDollar)) {
669 next = advance(); 820 next = advance();
670 } else if ((next < 128) || (identical(next, $NBSP))) { 821 } else {
671 // Identifier ends here. 822 // Identifier ends here.
672 if (start == byteOffset) { 823 if (start == scanOffset) {
673 return error(const SourceString("expected identifier")); 824 return error("expected identifier");
674 } else if (isAscii) {
675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));
676 } else { 825 } else {
677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); 826 appendSubstringToken(IDENTIFIER_INFO, start, true);
678 } 827 }
679 return next; 828 return next;
680 } else {
681 int nonAsciiStart = byteOffset;
682 do {
683 next = nextByte();
684 if (identical(next, $NBSP)) break;
685 } while (next > 127);
686 String string = utf8String(nonAsciiStart, -1).slowToString();
687 isAscii = false;
688 int byteLength = nonAsciiStart - byteOffset;
689 addToCharOffset(string.length - byteLength);
690 } 829 }
691 } 830 }
692 } 831 }
693 832
694 int tokenizeAt(int next) { 833 int tokenizeAt(int next) {
695 int start = byteOffset;
696 next = advance();
697 appendPrecedenceToken(AT_INFO); 834 appendPrecedenceToken(AT_INFO);
698 return next; 835 return advance();
699 } 836 }
700 837
701 int tokenizeString(int next, int start, bool raw) { 838 int tokenizeString(int next, int start, bool raw) {
702 int quoteChar = next; 839 int quoteChar = next;
703 next = advance(); 840 next = advance();
704 if (identical(quoteChar, next)) { 841 if (identical(quoteChar, next)) {
705 next = advance(); 842 next = advance();
706 if (identical(quoteChar, next)) { 843 if (identical(quoteChar, next)) {
707 // Multiline string. 844 // Multiline string.
708 return tokenizeMultiLineString(quoteChar, start, raw); 845 return tokenizeMultiLineString(quoteChar, start, raw);
709 } else { 846 } else {
710 // Empty string. 847 // Empty string.
711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 848 appendSubstringToken(STRING_INFO, start, true);
712 return next; 849 return next;
713 } 850 }
714 } 851 }
715 if (raw) { 852 if (raw) {
716 return tokenizeSingleLineRawString(next, quoteChar, start); 853 return tokenizeSingleLineRawString(next, quoteChar, start);
717 } else { 854 } else {
718 return tokenizeSingleLineString(next, quoteChar, start); 855 return tokenizeSingleLineString(next, quoteChar, start);
719 } 856 }
720 } 857 }
721 858
722 static bool isHexDigit(int character) { 859 /**
723 if ($0 <= character && character <= $9) return true; 860 * [next] is the first character after the quote.
724 character |= 0x20; 861 * [start] is the scanOffset of the quote.
725 return ($a <= character && character <= $f); 862 *
726 } 863 * The token contains a substring of the source file, including the
727 864 * string quotes, backslashes for escaping. For interpolated strings,
865 * the parts before and after are separate tokens.
866 *
867 * "a $b c"
868 *
869 * gives StringToken("a $), StringToken(b) and StringToken( c").
870 */
728 int tokenizeSingleLineString(int next, int quoteChar, int start) { 871 int tokenizeSingleLineString(int next, int quoteChar, int start) {
872 bool asciiOnly = true;
729 while (!identical(next, quoteChar)) { 873 while (!identical(next, quoteChar)) {
730 if (identical(next, $BACKSLASH)) { 874 if (identical(next, $BACKSLASH)) {
731 next = advance(); 875 next = advance();
732 } else if (identical(next, $$)) { 876 } else if (identical(next, $$)) {
733 next = tokenizeStringInterpolation(start); 877 if (!asciiOnly) handleUnicode(start);
734 start = byteOffset; 878 next = tokenizeStringInterpolation(start, asciiOnly);
879 start = scanOffset;
880 asciiOnly = true;
735 continue; 881 continue;
736 } 882 }
737 if (next <= $CR 883 if (next <= $CR
738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) { 884 && (identical(next, $LF) ||
739 return error(const SourceString("unterminated string literal")); 885 identical(next, $CR) ||
886 identical(next, $EOF))) {
887 if (!asciiOnly) handleUnicode(start);
888 return error("unterminated string literal");
740 } 889 }
890 if (next > 127) asciiOnly = false;
741 next = advance(); 891 next = advance();
742 } 892 }
743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 893 if (!asciiOnly) handleUnicode(start);
744 return advance(); 894 // Advance past the quote character.
895 next = advance();
896 appendSubstringToken(STRING_INFO, start, asciiOnly);
897 return next;
745 } 898 }
746 899
747 int tokenizeStringInterpolation(int start) { 900 int tokenizeStringInterpolation(int start, bool asciiOnly) {
748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 901 appendSubstringToken(STRING_INFO, start, asciiOnly);
749 beginToken(); // $ starts here. 902 beginToken(); // $ starts here.
750 int next = advance(); 903 int next = advance();
751 if (identical(next, $OPEN_CURLY_BRACKET)) { 904 if (identical(next, $OPEN_CURLY_BRACKET)) {
752 return tokenizeInterpolatedExpression(next, start); 905 return tokenizeInterpolatedExpression(next);
753 } else { 906 } else {
754 return tokenizeInterpolatedIdentifier(next, start); 907 return tokenizeInterpolatedIdentifier(next);
755 } 908 }
756 } 909 }
757 910
758 int tokenizeInterpolatedExpression(int next, int start) { 911 int tokenizeInterpolatedExpression(int next) {
759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); 912 appendBeginGroup(STRING_INTERPOLATION_INFO);
760 beginToken(); // The expression starts here. 913 beginToken(); // The expression starts here.
761 next = advance(); 914 next = advance(); // Move past the curly bracket.
762 while (!identical(next, $EOF) && !identical(next, $STX)) { 915 while (!identical(next, $EOF) && !identical(next, $STX)) {
763 next = bigSwitch(next); 916 next = bigSwitch(next);
764 } 917 }
765 if (identical(next, $EOF)) return next; 918 if (identical(next, $EOF)) return next;
766 next = advance(); 919 next = advance(); // Move past the $STX.
767 beginToken(); // The string interpolation suffix starts here. 920 beginToken(); // The string interpolation suffix starts here.
768 return next; 921 return next;
769 } 922 }
770 923
771 int tokenizeInterpolatedIdentifier(int next, int start) { 924 int tokenizeInterpolatedIdentifier(int next) {
772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); 925 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
773 beginToken(); // The identifier starts here. 926 beginToken(); // The identifier starts here.
774 next = tokenizeKeywordOrIdentifier(next, false); 927 next = tokenizeKeywordOrIdentifier(next, false);
775 beginToken(); // The string interpolation suffix starts here. 928 beginToken(); // The string interpolation suffix starts here.
776 return next; 929 return next;
777 } 930 }
778 931
779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { 932 int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
780 next = advance(); 933 bool asciiOnly = true;
934 next = advance(); // Advance past the quote
781 while (next != $EOF) { 935 while (next != $EOF) {
782 if (identical(next, quoteChar)) { 936 if (identical(next, quoteChar)) {
783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 937 if (!asciiOnly) handleUnicode(start);
784 return advance(); 938 next = advance();
939 appendSubstringToken(STRING_INFO, start, asciiOnly);
940 return next;
785 } else if (identical(next, $LF) || identical(next, $CR)) { 941 } else if (identical(next, $LF) || identical(next, $CR)) {
786 return error(const SourceString("unterminated string literal")); 942 if (!asciiOnly) handleUnicode(start);
943 return error("unterminated string literal");
944 } else if (next > 127) {
945 asciiOnly = false;
787 } 946 }
788 next = advance(); 947 next = advance();
789 } 948 }
790 return error(const SourceString("unterminated string literal")); 949 if (!asciiOnly) handleUnicode(start);
950 return error("unterminated string literal");
791 } 951 }
792 952
793 int tokenizeMultiLineRawString(int quoteChar, int start) { 953 int tokenizeMultiLineRawString(int quoteChar, int start) {
794 int next = advance(); 954 bool asciiOnlyString = true;
955 bool asciiOnlyLine = true;
956 int unicodeStart = start;
957 int next = advance(); // Advance past the (last) quote (of three)
795 outer: while (!identical(next, $EOF)) { 958 outer: while (!identical(next, $EOF)) {
796 while (!identical(next, quoteChar)) { 959 while (!identical(next, quoteChar)) {
960 if (identical(next, $LF)) {
961 if (!asciiOnlyLine) {
962 // Synchronize the string offset in the utf8 scanner.
963 handleUnicode(unicodeStart);
964 asciiOnlyLine = true;
965 unicodeStart = scanOffset;
966 }
967 lineFeedInMultiline();
968 } else if (next > 127) {
969 asciiOnlyLine = false;
970 asciiOnlyString = false;
971 }
797 next = advance(); 972 next = advance();
798 if (identical(next, $EOF)) break outer; 973 if (identical(next, $EOF)) break outer;
799 } 974 }
800 next = advance(); 975 next = advance();
801 if (identical(next, quoteChar)) { 976 if (identical(next, quoteChar)) {
802 next = advance(); 977 next = advance();
803 if (identical(next, quoteChar)) { 978 if (identical(next, quoteChar)) {
804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 979 if (!asciiOnlyLine) handleUnicode(unicodeStart);
805 return advance(); 980 next = advance();
981 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
982 return next;
806 } 983 }
807 } 984 }
808 } 985 }
809 return error(const SourceString("unterminated string literal")); 986 if (!asciiOnlyLine) handleUnicode(unicodeStart);
987 return error("unterminated string literal");
810 } 988 }
811 989
812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { 990 int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); 991 if (raw) return tokenizeMultiLineRawString(quoteChar, start);
814 int next = advance(); 992 bool asciiOnlyString = true;
993 bool asciiOnlyLine = true;
994 int unicodeStart = start;
995 int next = advance(); // Advance past the (last) quote (of three).
815 while (!identical(next, $EOF)) { 996 while (!identical(next, $EOF)) {
816 if (identical(next, $$)) { 997 if (identical(next, $$)) {
817 next = tokenizeStringInterpolation(start); 998 if (!asciiOnlyLine) handleUnicode(unicodeStart);
818 start = byteOffset; 999 next = tokenizeStringInterpolation(start, asciiOnlyString);
1000 start = scanOffset;
1001 unicodeStart = start;
1002 asciiOnlyString = true; // A new string token is created for the rest.
1003 asciiOnlyLine = true;
819 continue; 1004 continue;
820 } 1005 }
821 if (identical(next, quoteChar)) { 1006 if (identical(next, quoteChar)) {
822 next = advance(); 1007 next = advance();
823 if (identical(next, quoteChar)) { 1008 if (identical(next, quoteChar)) {
824 next = advance(); 1009 next = advance();
825 if (identical(next, quoteChar)) { 1010 if (identical(next, quoteChar)) {
826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1011 if (!asciiOnlyLine) handleUnicode(unicodeStart);
827 return advance(); 1012 next = advance();
1013 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
1014 return next;
828 } 1015 }
829 } 1016 }
830 continue; 1017 continue;
831 } 1018 }
832 if (identical(next, $BACKSLASH)) { 1019 if (identical(next, $BACKSLASH)) {
833 next = advance(); 1020 next = advance();
834 if (identical(next, $EOF)) break; 1021 if (identical(next, $EOF)) break;
835 } 1022 }
1023 if (identical(next, $LF)) {
1024 if (!asciiOnlyLine) {
1025 // Synchronize the string offset in the utf8 scanner.
1026 handleUnicode(unicodeStart);
1027 asciiOnlyLine = true;
1028 unicodeStart = scanOffset;
1029 }
1030 lineFeedInMultiline();
1031 } else if (next > 127) {
1032 asciiOnlyString = false;
1033 asciiOnlyLine = false;
1034 }
836 next = advance(); 1035 next = advance();
837 } 1036 }
838 return error(const SourceString("unterminated string literal")); 1037 if (!asciiOnlyLine) handleUnicode(unicodeStart);
1038 return error("unterminated string literal");
839 } 1039 }
840 1040
841 int error(SourceString message) { 1041 int error(String message) {
842 appendByteStringToken(BAD_INPUT_INFO, message); 1042 appendStringToken(BAD_INPUT_INFO, message);
843 return advance(); // Ensure progress. 1043 return advance(); // Ensure progress.
844 } 1044 }
1045
1046 void unmatchedBeginGroup(BeginGroupToken begin) {
1047 String error = 'unmatched "${begin.stringValue}"';
1048 Token close =
1049 new StringToken.fromString(
1050 BAD_INPUT_INFO, error, begin.charOffset, true);
1051
1052 // We want to ensure that unmatched BeginGroupTokens are reported
1053 // as errors. However, the rest of the parser assumes the groups
1054 // are well-balanced and will never look at the endGroup
1055 // token. This is a nice property that allows us to skip quickly
1056 // over correct code. By inserting an additional error token in
1057 // the stream, we can keep ignoring endGroup tokens.
1058 //
1059 // [begin] --next--> [tail]
1060 // [begin] --endG--> [close] --next--> [next] --next--> [tail]
1061 //
1062 // This allows the parser to skip from [begin] via endGroup to [close] and
1063 // ignore the [close] token (assuming it's correct), then the error will be
1064 // reported when parsing the [next] token.
1065
1066 Token next = new StringToken.fromString(
1067 BAD_INPUT_INFO, error, begin.charOffset, true);
1068 begin.endGroup = close;
1069 close.next = next;
1070 next.next = begin.next;
1071 }
845 } 1072 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698