Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(706)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: fixes compiler tests Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of scanner; 5 part of scanner;
6 6
7 abstract class Scanner { 7 abstract class Scanner {
8 Token tokenize(); 8 Token tokenize();
9
10 factory Scanner(SourceFile file, {bool includeComments: false}) {
11 if (file is Utf8BytesSourceFile) {
12 return new Utf8BytesScanner(file, includeComments: includeComments);
13 } else {
14 return new StringScanner(file, includeComments: includeComments);
15 }
16 }
9 } 17 }
10 18
11 /** 19 abstract class AbstractScanner implements Scanner {
12 * Common base class for a Dart scanner. 20 final bool includeComments;
13 */
14 abstract class AbstractScanner<T extends SourceString> implements Scanner {
15 int advance();
16 int nextByte();
17 21
18 /** 22 /**
19 * Returns the current character or byte depending on the underlying input 23 * The string offset for the next token that will be created.
20 * kind. For example, [StringScanner] operates on [String] and thus returns 24 *
21 * characters (Unicode codepoints represented as int) whereas 25 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values
22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. 26 * are different. One string character can be encoded using multiple UTF-8
27 * bytes.
28 */
29 int tokenStart = -1;
30
31 /**
32 * A pointer to the token stream created by this scanner. The first token
33 * is a special token and not part of the source file. This is an
34 * implementation detail to avoid special cases in the scanner. This token
35 * is not exposed to clients of the scanner, which are expected to invoke
36 * [firstToken] to access the token stream.
37 */
38 final Token tokens = new SymbolToken(EOF_INFO, -1);
39
40 /**
41 * A pointer to the last scanned token.
42 */
43 Token tail;
44
45 /**
46 * The source file that is being scanned. This field can be [:null:].
47 * If the source file is available, the scanner assigns its [:lineStarts:] and
48 * [:length:] fields at the end of [tokenize].
49 */
50 final SourceFile file;
51
52 final List<int> lineStarts = [0];
ngeoffray 2013/10/18 10:19:37 <int>[0]
lukas 2013/10/24 16:48:36 Done.
53
54 AbstractScanner(this.file, this.includeComments) {
55 this.tail = this.tokens;
56 }
57
58
ngeoffray 2013/10/18 10:19:37 Extra line.
lukas 2013/10/24 16:48:36 Done.
59 /**
60 * Advances and returns the next character.
61 *
62 * If the next character is non-ASCII, then the returned value depends on the
63 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
64 * the [StringScanner] returns a UTF-16 code unit.
65 *
66 * The scanner ensures that [advance] is not invoked after it returned [$EOF].
67 * This allows implementations to omit bound checks if the data structure ends
68 * with '0'.
69 */
70 int advance();
71
72 /**
73 * Returns the current unicode character.
74 *
75 * If the current character is ASCII, then it is returned unchanged.
76 *
77 * The [Utf8BytesScanner] decodes the next unicode code point starting at the
78 * current position. Note that every unicode character is returned as a single
79 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following
80 * [advance] returns the next character.
81 *
82 * The [StringScanner] returns the current character unchanged, which might
83 * be a surrogate character. In the case of '\u{1d11e}', it returns the first
84 * code unit 55348, and the following [advance] returns the second code unit
85 * 56606.
86 *
87 * Invoking [currentAsUnicode] multiple times is safe, i.e.,
ngeoffray 2013/10/18 10:19:37 i.e. -> that is
lukas 2013/10/24 16:48:36 Done.
88 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
89 */
90 int currentAsUnicode(int next);
91
92 /**
93 * Returns the character at the next position. Like in [advance], the
94 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
95 * a UTF-16 code unit.
23 */ 96 */
24 int peek(); 97 int peek();
25 98
26 /** 99 /**
27 * Appends a fixed token based on whether the current char is [choice] or not. 100 * Notifies the scanner that unicode characters were detected in either a
28 * If the current char is [choice] a fixed token whose kind and content 101 * comment or a string literal between [startScanOffset] and the current
29 * is determined by [yes] is appended, otherwise a fixed token whose kind 102 * scan offset.
30 * and content is determined by [no] is appended.
31 */ 103 */
104 void handleUnicode(int startScanOffset);
105
106 /**
107 * Returns the current scan offset.
108 *
109 * In the [Utf8BytesScanner] this is the offset into the byte list, in the
110 * [StringScanner] the offset in the source string.
111 */
112 int get scanOffset;
113
114 /**
115 * Returns the current string offset.
116 *
117 * In the [StringScanner] this is identical to the [scanOffset]. In the
118 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
119 */
120 int get stringOffset;
121
122 /**
123 * Returns the first token scanned by this [Scanner].
124 */
125 Token firstToken();
126
127 /**
128 * Returns the last token scanned by this [Scanner].
129 */
130 Token previousToken();
131
132 /**
133 * Notifies that a new token starts at current offset.
134 */
135 void beginToken() {
136 tokenStart = stringOffset;
137 }
138
139 /**
140 * Appends a substring from the scan offset [:start:] to the current
141 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
142 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
143 * substring [5,9).
144 *
145 * Note that [extraOffset] can only be used if the covered character(s) are
146 * known to be ASCII.
147 */
148 void appendSubstringToken(PrecedenceInfo info, int start,
149 bool asciiOnly, [int extraOffset]);
150
151 /** Documentation in subclass [ArrayBasedScanner]. */
152 void appendStringToken(PrecedenceInfo info, String value);
153
154 /** Documentation in subclass [ArrayBasedScanner]. */
155 void appendPrecedenceToken(PrecedenceInfo info);
156
157 /** Documentation in subclass [ArrayBasedScanner]. */
32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); 158 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);
33 159
34 /** 160 /** Documentation in subclass [ArrayBasedScanner]. */
35 * Appends a fixed token whose kind and content is determined by [info]. 161 void appendKeywordToken(Keyword keyword);
36 */
37 void appendPrecedenceToken(PrecedenceInfo info);
38 162
39 /** 163 /** Documentation in subclass [ArrayBasedScanner]. */
40 * Appends a token whose kind is determined by [info] and content is [value].
41 */
42 void appendStringToken(PrecedenceInfo info, String value);
43
44 /**
45 * Appends a token whose kind is determined by [info] and content is defined
46 * by the SourceString [value].
47 */
48 void appendByteStringToken(PrecedenceInfo info, T value);
49
50 /**
51 * Appends a keyword token whose kind is determined by [keyword].
52 */
53 void appendKeywordToken(Keyword keyword);
54 void appendWhiteSpace(int next);
55 void appendEofToken(); 164 void appendEofToken();
56 165
57 /** 166 /** Documentation in subclass [ArrayBasedScanner]. */
ngeoffray 2013/10/18 10:19:37 So do the following methods only apply to the Arra
lukas 2013/10/24 16:48:36 I just decided to put the documentation together w
58 * Creates an ASCII SourceString whose content begins at the source byte 167 void appendWhiteSpace(int next);
59 * offset [start] and ends at [offset] bytes from the current byte offset of
60 * the scanner. For example, if the current byte offset is 10,
61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found
62 * at the [0,9[ byte interval of the source text.
63 */
64 T asciiString(int start, int offset);
65 T utf8String(int start, int offset);
66 Token firstToken();
67 Token previousToken();
68 void beginToken();
69 void addToCharOffset(int offset);
70 int get charOffset;
71 int get byteOffset;
72 void appendBeginGroup(PrecedenceInfo info, String value);
73 int appendEndGroup(PrecedenceInfo info, String value, int openKind);
74 void appendGt(PrecedenceInfo info, String value);
75 void appendGtGt(PrecedenceInfo info, String value);
76 void appendGtGtGt(PrecedenceInfo info, String value);
77 void appendComment();
78 168
79 /** 169 /** Documentation in subclass [ArrayBasedScanner]. */
80 * We call this method to discard '<' from the "grouping" stack 170 void lineFeedInMultiline();
81 * (maintained by subclasses). 171
82 * 172 /** Documentation in subclass [ArrayBasedScanner]. */
83 * [PartialParser.skipExpression] relies on the fact that we do not 173 void appendBeginGroup(PrecedenceInfo info);
84 * create groups for stuff like: 174
85 * [:a = b < c, d = e > f:]. 175 /** Documentation in subclass [ArrayBasedScanner]. */
86 * 176 int appendEndGroup(PrecedenceInfo info, int openKind);
87 * In other words, this method is called when the scanner recognizes 177
88 * something which cannot possibly be part of a type 178 /** Documentation in subclass [ArrayBasedScanner]. */
89 * parameter/argument list. 179 void appendGt(PrecedenceInfo info);
90 */ 180
181 /** Documentation in subclass [ArrayBasedScanner]. */
182 void appendGtGt(PrecedenceInfo info);
183
184 /** Documentation in subclass [ArrayBasedScanner]. */
185 void appendComment(start, bool asciiOnly);
186
187 /** Documentation in subclass [ArrayBasedScanner]. */
91 void discardOpenLt(); 188 void discardOpenLt();
92 189
93 // TODO(ahe): Move this class to implementation. 190 // TODO(ahe): Move this class to implementation.
94 191
95 Token tokenize() { 192 Token tokenize() {
96 int next = advance(); 193 int next = advance();
97 while (!identical(next, $EOF)) { 194 while (!identical(next, $EOF)) {
98 next = bigSwitch(next); 195 next = bigSwitch(next);
99 } 196 }
100 appendEofToken(); 197 appendEofToken();
198
199 if (file != null) {
200 file.length = stringOffset;
201 // One additional line start at the end, see [SourceFile.lineStarts].
202 lineStarts.add(stringOffset + 1);
203 file.lineStarts = lineStarts;
204 }
205
101 return firstToken(); 206 return firstToken();
102 } 207 }
103 208
104 int bigSwitch(int next) { 209 int bigSwitch(int next) {
105 beginToken(); 210 beginToken();
106 if (identical(next, $SPACE) || identical(next, $TAB) 211 if (identical(next, $SPACE) || identical(next, $TAB)
107 || identical(next, $LF) || identical(next, $CR)) { 212 || identical(next, $LF) || identical(next, $CR)) {
108 appendWhiteSpace(next); 213 appendWhiteSpace(next);
109 next = advance(); 214 next = advance();
215 // Sequences of spaces are common, so advance through them fast.
110 while (identical(next, $SPACE)) { 216 while (identical(next, $SPACE)) {
111 appendWhiteSpace(next); 217 // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
218 // assuming that it does not do anything for space characters.
112 next = advance(); 219 next = advance();
113 } 220 }
114 return next; 221 return next;
115 } 222 }
116 223
117 if ($a <= next && next <= $z) { 224 if ($a <= next && next <= $z) {
118 if (identical($r, next)) { 225 if (identical($r, next)) {
119 return tokenizeRawStringKeywordOrIdentifier(next); 226 return tokenizeRawStringKeywordOrIdentifier(next);
120 } 227 }
121 return tokenizeKeywordOrIdentifier(next, true); 228 return tokenizeKeywordOrIdentifier(next, true);
122 } 229 }
123 230
124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$) ) { 231 if (($A <= next && next <= $Z) ||
125 return tokenizeIdentifier(next, byteOffset, true); 232 identical(next, $_) ||
233 identical(next, $$)) {
234 return tokenizeIdentifier(next, scanOffset, true);
126 } 235 }
127 236
128 if (identical(next, $LT)) { 237 if (identical(next, $LT)) {
129 return tokenizeLessThan(next); 238 return tokenizeLessThan(next);
130 } 239 }
131 240
132 if (identical(next, $GT)) { 241 if (identical(next, $GT)) {
133 return tokenizeGreaterThan(next); 242 return tokenizeGreaterThan(next);
134 } 243 }
135 244
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
180 if (identical(next, $BACKSLASH)) { 289 if (identical(next, $BACKSLASH)) {
181 appendPrecedenceToken(BACKSLASH_INFO); 290 appendPrecedenceToken(BACKSLASH_INFO);
182 return advance(); 291 return advance();
183 } 292 }
184 293
185 if (identical(next, $HASH)) { 294 if (identical(next, $HASH)) {
186 return tokenizeTag(next); 295 return tokenizeTag(next);
187 } 296 }
188 297
189 if (identical(next, $OPEN_PAREN)) { 298 if (identical(next, $OPEN_PAREN)) {
190 appendBeginGroup(OPEN_PAREN_INFO, "("); 299 appendBeginGroup(OPEN_PAREN_INFO);
191 return advance(); 300 return advance();
192 } 301 }
193 302
194 if (identical(next, $CLOSE_PAREN)) { 303 if (identical(next, $CLOSE_PAREN)) {
195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); 304 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
196 } 305 }
197 306
198 if (identical(next, $COMMA)) { 307 if (identical(next, $COMMA)) {
199 appendPrecedenceToken(COMMA_INFO); 308 appendPrecedenceToken(COMMA_INFO);
200 return advance(); 309 return advance();
201 } 310 }
202 311
203 if (identical(next, $COLON)) { 312 if (identical(next, $COLON)) {
204 appendPrecedenceToken(COLON_INFO); 313 appendPrecedenceToken(COLON_INFO);
205 return advance(); 314 return advance();
206 } 315 }
207 316
208 if (identical(next, $SEMICOLON)) { 317 if (identical(next, $SEMICOLON)) {
209 appendPrecedenceToken(SEMICOLON_INFO); 318 appendPrecedenceToken(SEMICOLON_INFO);
210 // Type parameters and arguments cannot contain semicolon. 319 // Type parameters and arguments cannot contain semicolon.
211 discardOpenLt(); 320 discardOpenLt();
212 return advance(); 321 return advance();
213 } 322 }
214 323
215 if (identical(next, $QUESTION)) { 324 if (identical(next, $QUESTION)) {
216 appendPrecedenceToken(QUESTION_INFO); 325 appendPrecedenceToken(QUESTION_INFO);
217 return advance(); 326 return advance();
218 } 327 }
219 328
220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 329 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", 330 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
222 OPEN_SQUARE_BRACKET_TOKEN); 331 OPEN_SQUARE_BRACKET_TOKEN);
223 } 332 }
224 333
225 if (identical(next, $BACKPING)) { 334 if (identical(next, $BACKPING)) {
226 appendPrecedenceToken(BACKPING_INFO); 335 appendPrecedenceToken(BACKPING_INFO);
227 return advance(); 336 return advance();
228 } 337 }
229 338
230 if (identical(next, $OPEN_CURLY_BRACKET)) { 339 if (identical(next, $OPEN_CURLY_BRACKET)) {
231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); 340 appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
232 return advance(); 341 return advance();
233 } 342 }
234 343
235 if (identical(next, $CLOSE_CURLY_BRACKET)) { 344 if (identical(next, $CLOSE_CURLY_BRACKET)) {
236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", 345 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
237 OPEN_CURLY_BRACKET_TOKEN); 346 OPEN_CURLY_BRACKET_TOKEN);
238 } 347 }
239 348
240 if (identical(next, $SLASH)) { 349 if (identical(next, $SLASH)) {
241 return tokenizeSlashOrComment(next); 350 return tokenizeSlashOrComment(next);
242 } 351 }
243 352
244 if (identical(next, $AT)) { 353 if (identical(next, $AT)) {
245 return tokenizeAt(next); 354 return tokenizeAt(next);
246 } 355 }
247 356
248 if (identical(next, $DQ) || identical(next, $SQ)) { 357 if (identical(next, $DQ) || identical(next, $SQ)) {
249 return tokenizeString(next, byteOffset, false); 358 return tokenizeString(next, scanOffset, false);
250 } 359 }
251 360
252 if (identical(next, $PERIOD)) { 361 if (identical(next, $PERIOD)) {
253 return tokenizeDotsOrNumber(next); 362 return tokenizeDotsOrNumber(next);
254 } 363 }
255 364
256 if (identical(next, $0)) { 365 if (identical(next, $0)) {
257 return tokenizeHexOrNumber(next); 366 return tokenizeHexOrNumber(next);
258 } 367 }
259 368
260 // TODO(ahe): Would a range check be faster? 369 // TODO(ahe): Would a range check be faster?
261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) 370 if (identical(next, $1) || identical(next, $2) || identical(next, $3)
262 || identical(next, $4) || identical(next, $5) || identical(next, $6) 371 || identical(next, $4) || identical(next, $5) || identical(next, $6)
263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { 372 || identical(next, $7) || identical(next, $8) || identical(next, $9)) {
264 return tokenizeNumber(next); 373 return tokenizeNumber(next);
265 } 374 }
266 375
267 if (identical(next, $EOF)) { 376 if (identical(next, $EOF)) {
268 return $EOF; 377 return $EOF;
269 } 378 }
270 if (next < 0x1f) { 379 if (next < 0x1f) {
271 return error(new SourceString("unexpected character $next")); 380 return error("unexpected character $next");
272 } 381 }
273 382
383 next = currentAsUnicode(next);
384
274 // The following are non-ASCII characters. 385 // The following are non-ASCII characters.
275 386
276 if (identical(next, $NBSP)) { 387 if (identical(next, $NBSP)) {
277 appendWhiteSpace(next); 388 appendWhiteSpace(next);
278 return advance(); 389 return advance();
279 } 390 }
280 391
281 return tokenizeIdentifier(next, byteOffset, true); 392 return error("unexpected unicode character $next");
282 } 393 }
283 394
284 int tokenizeTag(int next) { 395 int tokenizeTag(int next) {
285 // # or #!.*[\n\r] 396 // # or #!.*[\n\r]
286 if (byteOffset == 0) { 397 if (scanOffset == 0) {
287 if (identical(peek(), $BANG)) { 398 if (identical(peek(), $BANG)) {
399 int start = scanOffset + 1;
400 bool asciiOnly = true;
288 do { 401 do {
289 next = advance(); 402 next = advance();
290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(ne xt, $EOF)); 403 if (next > 127) asciiOnly = false;
404 } while (!identical(next, $LF) &&
405 !identical(next, $CR) &&
406 !identical(next, $EOF));
407 if (!asciiOnly) handleUnicode(start);
291 return next; 408 return next;
292 } 409 }
293 } 410 }
294 appendPrecedenceToken(HASH_INFO); 411 appendPrecedenceToken(HASH_INFO);
295 return advance(); 412 return advance();
296 } 413 }
297 414
298 int tokenizeTilde(int next) { 415 int tokenizeTilde(int next) {
299 // ~ ~/ ~/= 416 // ~ ~/ ~/=
300 next = advance(); 417 next = advance();
301 if (identical(next, $SLASH)) { 418 if (identical(next, $SLASH)) {
302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); 419 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
303 } else { 420 } else {
304 appendPrecedenceToken(TILDE_INFO); 421 appendPrecedenceToken(TILDE_INFO);
305 return next; 422 return next;
306 } 423 }
307 } 424 }
308 425
309 int tokenizeOpenSquareBracket(int next) { 426 int tokenizeOpenSquareBracket(int next) {
310 // [ [] []= 427 // [ [] []=
311 next = advance(); 428 next = advance();
312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 429 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
313 Token token = previousToken(); 430 Token token = previousToken();
314 if (token is KeywordToken && identical(token.value.stringValue, 'operator' )) { 431 if (token is KeywordToken &&
432 identical((token as KeywordToken).keyword.syntax, 'operator')) {
315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); 433 return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
316 } 434 }
317 } 435 }
318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); 436 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
319 return next; 437 return next;
320 } 438 }
321 439
322 int tokenizeCaret(int next) { 440 int tokenizeCaret(int next) {
323 // ^ ^= 441 // ^ ^=
324 return select($EQ, CARET_EQ_INFO, CARET_INFO); 442 return select($EQ, CARET_EQ_INFO, CARET_INFO);
325 } 443 }
326 444
327 int tokenizeBar(int next) { 445 int tokenizeBar(int next) {
328 // | || |= 446 // | || |=
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
372 return advance(); 490 return advance();
373 } else if (identical(next, $EQ)) { 491 } else if (identical(next, $EQ)) {
374 appendPrecedenceToken(MINUS_EQ_INFO); 492 appendPrecedenceToken(MINUS_EQ_INFO);
375 return advance(); 493 return advance();
376 } else { 494 } else {
377 appendPrecedenceToken(MINUS_INFO); 495 appendPrecedenceToken(MINUS_INFO);
378 return next; 496 return next;
379 } 497 }
380 } 498 }
381 499
382
383 int tokenizePlus(int next) { 500 int tokenizePlus(int next) {
384 // + ++ += 501 // + ++ +=
385 next = advance(); 502 next = advance();
386 if (identical($PLUS, next)) { 503 if (identical($PLUS, next)) {
387 appendPrecedenceToken(PLUS_PLUS_INFO); 504 appendPrecedenceToken(PLUS_PLUS_INFO);
388 return advance(); 505 return advance();
389 } else if (identical($EQ, next)) { 506 } else if (identical($EQ, next)) {
390 appendPrecedenceToken(PLUS_EQ_INFO); 507 appendPrecedenceToken(PLUS_EQ_INFO);
391 return advance(); 508 return advance();
392 } else { 509 } else {
393 appendPrecedenceToken(PLUS_INFO); 510 appendPrecedenceToken(PLUS_INFO);
394 return next; 511 return next;
395 } 512 }
396 } 513 }
397 514
398 int tokenizeExclamation(int next) { 515 int tokenizeExclamation(int next) {
399 // ! != !== 516 // ! !=
517 // !== is kept for user-friendly error reporting.
518
400 next = advance(); 519 next = advance();
401 if (identical(next, $EQ)) { 520 if (identical(next, $EQ)) {
402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); 521 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
403 } 522 }
404 appendPrecedenceToken(BANG_INFO); 523 appendPrecedenceToken(BANG_INFO);
405 return next; 524 return next;
406 } 525 }
407 526
408 int tokenizeEquals(int next) { 527 int tokenizeEquals(int next) {
409 // = == === 528 // = == =>
529 // === is kept for user-friendly error reporting.
410 530
411 // Type parameters and arguments cannot contain any token that 531 // Type parameters and arguments cannot contain any token that
412 // starts with '='. 532 // starts with '='.
413 discardOpenLt(); 533 discardOpenLt();
414 534
415 next = advance(); 535 next = advance();
416 if (identical(next, $EQ)) { 536 if (identical(next, $EQ)) {
417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); 537 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
418 } else if (identical(next, $GT)) { 538 } else if (identical(next, $GT)) {
419 appendPrecedenceToken(FUNCTION_INFO); 539 appendPrecedenceToken(FUNCTION_INFO);
420 return advance(); 540 return advance();
421 } 541 }
422 appendPrecedenceToken(EQ_INFO); 542 appendPrecedenceToken(EQ_INFO);
423 return next; 543 return next;
424 } 544 }
425 545
426 int tokenizeGreaterThan(int next) { 546 int tokenizeGreaterThan(int next) {
427 // > >= >> >>= >>> >>>= 547 // > >= >> >>=
428 next = advance(); 548 next = advance();
429 if (identical($EQ, next)) { 549 if (identical($EQ, next)) {
430 appendPrecedenceToken(GT_EQ_INFO); 550 appendPrecedenceToken(GT_EQ_INFO);
431 return advance(); 551 return advance();
432 } else if (identical($GT, next)) { 552 } else if (identical($GT, next)) {
433 next = advance(); 553 next = advance();
434 if (identical($EQ, next)) { 554 if (identical($EQ, next)) {
435 appendPrecedenceToken(GT_GT_EQ_INFO); 555 appendPrecedenceToken(GT_GT_EQ_INFO);
436 return advance(); 556 return advance();
437 } else { 557 } else {
438 appendGtGt(GT_GT_INFO, ">>"); 558 appendGtGt(GT_GT_INFO);
439 return next; 559 return next;
440 } 560 }
441 } else { 561 } else {
442 appendGt(GT_INFO, ">"); 562 appendGt(GT_INFO);
443 return next; 563 return next;
444 } 564 }
445 } 565 }
446 566
447 int tokenizeLessThan(int next) { 567 int tokenizeLessThan(int next) {
448 // < <= << <<= 568 // < <= << <<=
449 next = advance(); 569 next = advance();
450 if (identical($EQ, next)) { 570 if (identical($EQ, next)) {
451 appendPrecedenceToken(LT_EQ_INFO); 571 appendPrecedenceToken(LT_EQ_INFO);
452 return advance(); 572 return advance();
453 } else if (identical($LT, next)) { 573 } else if (identical($LT, next)) {
454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); 574 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
455 } else { 575 } else {
456 appendBeginGroup(LT_INFO, "<"); 576 appendBeginGroup(LT_INFO);
457 return next; 577 return next;
458 } 578 }
459 } 579 }
460 580
461 int tokenizeNumber(int next) { 581 int tokenizeNumber(int next) {
462 int start = byteOffset; 582 int start = scanOffset;
463 while (true) { 583 while (true) {
464 next = advance(); 584 next = advance();
465 if ($0 <= next && next <= $9) { 585 if ($0 <= next && next <= $9) {
466 continue; 586 continue;
467 } else if (identical(next, $e) || identical(next, $E)) { 587 } else if (identical(next, $e) || identical(next, $E)) {
468 return tokenizeFractionPart(next, start); 588 return tokenizeFractionPart(next, start);
469 } else { 589 } else {
470 if (identical(next, $PERIOD)) { 590 if (identical(next, $PERIOD)) {
471 int nextnext = peek(); 591 int nextnext = peek();
472 if ($0 <= nextnext && nextnext <= $9) { 592 if ($0 <= nextnext && nextnext <= $9) {
473 return tokenizeFractionPart(advance(), start); 593 return tokenizeFractionPart(advance(), start);
474 } 594 }
475 } 595 }
476 appendByteStringToken(INT_INFO, asciiString(start, 0)); 596 appendSubstringToken(INT_INFO, start, true);
477 return next; 597 return next;
478 } 598 }
479 } 599 }
480 } 600 }
481 601
482 int tokenizeHexOrNumber(int next) { 602 int tokenizeHexOrNumber(int next) {
483 int x = peek(); 603 int x = peek();
484 if (identical(x, $x) || identical(x, $X)) { 604 if (identical(x, $x) || identical(x, $X)) {
485 advance(); 605 return tokenizeHex(next);
486 return tokenizeHex(x);
487 } 606 }
488 return tokenizeNumber(next); 607 return tokenizeNumber(next);
489 } 608 }
490 609
491 int tokenizeHex(int next) { 610 int tokenizeHex(int next) {
492 int start = byteOffset - 1; 611 int start = scanOffset;
612 next = advance(); // Advance past the $x or $X.
493 bool hasDigits = false; 613 bool hasDigits = false;
494 while (true) { 614 while (true) {
495 next = advance(); 615 next = advance();
496 if (($0 <= next && next <= $9) 616 if (($0 <= next && next <= $9)
497 || ($A <= next && next <= $F) 617 || ($A <= next && next <= $F)
498 || ($a <= next && next <= $f)) { 618 || ($a <= next && next <= $f)) {
499 hasDigits = true; 619 hasDigits = true;
500 } else { 620 } else {
501 if (!hasDigits) { 621 if (!hasDigits) {
502 return error(const SourceString("hex digit expected")); 622 return error("hex digit expected");
503 } 623 }
504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); 624 appendSubstringToken(HEXADECIMAL_INFO, start, true);
505 return next; 625 return next;
506 } 626 }
507 } 627 }
508 } 628 }
509 629
510 int tokenizeDotsOrNumber(int next) { 630 int tokenizeDotsOrNumber(int next) {
511 int start = byteOffset; 631 int start = scanOffset;
512 next = advance(); 632 next = advance();
513 if (($0 <= next && next <= $9)) { 633 if (($0 <= next && next <= $9)) {
514 return tokenizeFractionPart(next, start); 634 return tokenizeFractionPart(next, start);
515 } else if (identical($PERIOD, next)) { 635 } else if (identical($PERIOD, next)) {
516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 636 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
517 } else { 637 } else {
518 appendPrecedenceToken(PERIOD_INFO); 638 appendPrecedenceToken(PERIOD_INFO);
519 return next; 639 return next;
520 } 640 }
521 } 641 }
522 642
523 int tokenizeFractionPart(int next, int start) { 643 int tokenizeFractionPart(int next, int start) {
524 bool done = false; 644 bool done = false;
525 bool hasDigit = false; 645 bool hasDigit = false;
526 LOOP: while (!done) { 646 LOOP: while (!done) {
527 if ($0 <= next && next <= $9) { 647 if ($0 <= next && next <= $9) {
528 hasDigit = true; 648 hasDigit = true;
529 } else if (identical($e, next) || identical($E, next)) { 649 } else if (identical($e, next) || identical($E, next)) {
530 hasDigit = true; 650 hasDigit = true;
531 next = tokenizeExponent(advance()); 651 next = tokenizeExponent(advance());
532 done = true; 652 done = true;
533 continue LOOP; 653 continue LOOP;
534 } else { 654 } else {
535 done = true; 655 done = true;
536 continue LOOP; 656 continue LOOP;
537 } 657 }
538 next = advance(); 658 next = advance();
539 } 659 }
540 if (!hasDigit) { 660 if (!hasDigit) {
541 appendByteStringToken(INT_INFO, asciiString(start, -1)); 661 // Reduce offset, we already advanced to the token past the period.
662 appendSubstringToken(INT_INFO, start, true, -1);
663
664 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
665 // the scanner already advanced past the period.
542 if (identical($PERIOD, next)) { 666 if (identical($PERIOD, next)) {
543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 667 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
544 } 668 }
545 // TODO(ahe): Wrong offset for the period.
546 appendPrecedenceToken(PERIOD_INFO); 669 appendPrecedenceToken(PERIOD_INFO);
547 return bigSwitch(next); 670 return next;
548 } 671 }
549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); 672 appendSubstringToken(DOUBLE_INFO, start, true);
550 return next; 673 return next;
551 } 674 }
552 675
553 int tokenizeExponent(int next) { 676 int tokenizeExponent(int next) {
554 if (identical(next, $PLUS) || identical(next, $MINUS)) { 677 if (identical(next, $PLUS) || identical(next, $MINUS)) {
555 next = advance(); 678 next = advance();
556 } 679 }
557 bool hasDigits = false; 680 bool hasDigits = false;
558 while (true) { 681 while (true) {
559 if ($0 <= next && next <= $9) { 682 if ($0 <= next && next <= $9) {
560 hasDigits = true; 683 hasDigits = true;
561 } else { 684 } else {
562 if (!hasDigits) { 685 if (!hasDigits) {
563 return error(const SourceString("digit expected")); 686 return error("digit expected");
564 } 687 }
565 return next; 688 return next;
566 } 689 }
567 next = advance(); 690 next = advance();
568 } 691 }
569 } 692 }
570 693
571 int tokenizeSlashOrComment(int next) { 694 int tokenizeSlashOrComment(int next) {
695 int start = scanOffset;
572 next = advance(); 696 next = advance();
573 if (identical($STAR, next)) { 697 if (identical($STAR, next)) {
574 return tokenizeMultiLineComment(next); 698 return tokenizeMultiLineComment(next, start);
575 } else if (identical($SLASH, next)) { 699 } else if (identical($SLASH, next)) {
576 return tokenizeSingleLineComment(next); 700 return tokenizeSingleLineComment(next, start);
577 } else if (identical($EQ, next)) { 701 } else if (identical($EQ, next)) {
578 appendPrecedenceToken(SLASH_EQ_INFO); 702 appendPrecedenceToken(SLASH_EQ_INFO);
579 return advance(); 703 return advance();
580 } else { 704 } else {
581 appendPrecedenceToken(SLASH_INFO); 705 appendPrecedenceToken(SLASH_INFO);
582 return next; 706 return next;
583 } 707 }
584 } 708 }
585 709
586 int tokenizeSingleLineComment(int next) { 710 int tokenizeSingleLineComment(int next, int start) {
711 bool asciiOnly = true;
587 while (true) { 712 while (true) {
588 next = advance(); 713 next = advance();
589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) { 714 if (next > 127) asciiOnly = false;
590 appendComment(); 715 if (identical($LF, next) ||
716 identical($CR, next) ||
717 identical($EOF, next)) {
718 if (!asciiOnly) handleUnicode(start);
719 appendComment(start, asciiOnly);
591 return next; 720 return next;
592 } 721 }
593 } 722 }
594 } 723 }
595 724
596 int tokenizeMultiLineComment(int next) { 725
726 int tokenizeMultiLineComment(int next, int start) {
727 bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
728 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
729 int unicodeStart = start;
597 int nesting = 1; 730 int nesting = 1;
598 next = advance(); 731 next = advance();
599 while (true) { 732 while (true) {
600 if (identical($EOF, next)) { 733 if (identical($EOF, next)) {
601 // TODO(ahe): Report error. 734 if (!asciiOnlyLines) handleUnicode(unicodeStart);
735 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");
602 return next; 736 return next;
603 } else if (identical($STAR, next)) { 737 } else if (identical($STAR, next)) {
604 next = advance(); 738 next = advance();
605 if (identical($SLASH, next)) { 739 if (identical($SLASH, next)) {
606 --nesting; 740 --nesting;
607 if (0 == nesting) { 741 if (0 == nesting) {
742 if (!asciiOnlyLines) handleUnicode(unicodeStart);
608 next = advance(); 743 next = advance();
609 appendComment(); 744 appendComment(start, asciiOnlyComment);
610 return next; 745 return next;
611 } else { 746 } else {
612 next = advance(); 747 next = advance();
613 } 748 }
614 } 749 }
615 } else if (identical($SLASH, next)) { 750 } else if (identical($SLASH, next)) {
616 next = advance(); 751 next = advance();
617 if (identical($STAR, next)) { 752 if (identical($STAR, next)) {
618 next = advance(); 753 next = advance();
619 ++nesting; 754 ++nesting;
620 } 755 }
756 } else if (identical(next, $LF)) {
757 if (!asciiOnlyLines) {
758 // Synchronize the string offset in the utf8 scanner.
759 handleUnicode(unicodeStart);
760 asciiOnlyLines = true;
761 unicodeStart = scanOffset;
762 }
763 lineFeedInMultiline();
764 next = advance();
621 } else { 765 } else {
766 if (next > 127) {
767 asciiOnlyLines = false;
768 asciiOnlyComment = false;
769 }
622 next = advance(); 770 next = advance();
623 } 771 }
624 } 772 }
625 } 773 }
626 774
627 int tokenizeRawStringKeywordOrIdentifier(int next) { 775 int tokenizeRawStringKeywordOrIdentifier(int next) {
776 // [next] is $r.
628 int nextnext = peek(); 777 int nextnext = peek();
629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { 778 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
630 int start = byteOffset; 779 int start = scanOffset;
631 next = advance(); 780 next = advance();
632 return tokenizeString(next, start, true); 781 return tokenizeString(next, start, true);
633 } 782 }
634 return tokenizeKeywordOrIdentifier(next, true); 783 return tokenizeKeywordOrIdentifier(next, true);
635 } 784 }
636 785
637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { 786 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
638 KeywordState state = KeywordState.KEYWORD_STATE; 787 KeywordState state = KeywordState.KEYWORD_STATE;
639 int start = byteOffset; 788 int start = scanOffset;
640 while (state != null && $a <= next && next <= $z) { 789 while (state != null && $a <= next && next <= $z) {
641 state = state.next(next); 790 state = state.next(next);
642 next = advance(); 791 next = advance();
643 } 792 }
644 if (state == null || state.keyword == null) { 793 if (state == null || state.keyword == null) {
645 return tokenizeIdentifier(next, start, allowDollar); 794 return tokenizeIdentifier(next, start, allowDollar);
646 } 795 }
647 if (($A <= next && next <= $Z) || 796 if (($A <= next && next <= $Z) ||
648 ($0 <= next && next <= $9) || 797 ($0 <= next && next <= $9) ||
649 identical(next, $_) || 798 identical(next, $_) ||
650 identical(next, $$)) { 799 identical(next, $$)) {
651 return tokenizeIdentifier(next, start, allowDollar); 800 return tokenizeIdentifier(next, start, allowDollar);
652 } else if (next < 128) { 801 } else {
653 appendKeywordToken(state.keyword); 802 appendKeywordToken(state.keyword);
654 return next; 803 return next;
655 } else {
656 return tokenizeIdentifier(next, start, allowDollar);
657 } 804 }
658 } 805 }
659 806
807 /**
808 * [allowDollar] can exclude '$', which is not allowed as part of a string
809 * interpolation identifier.
810 */
660 int tokenizeIdentifier(int next, int start, bool allowDollar) { 811 int tokenizeIdentifier(int next, int start, bool allowDollar) {
661 bool isAscii = true;
662
663 while (true) { 812 while (true) {
664 if (($a <= next && next <= $z) || 813 if (($a <= next && next <= $z) ||
665 ($A <= next && next <= $Z) || 814 ($A <= next && next <= $Z) ||
666 ($0 <= next && next <= $9) || 815 ($0 <= next && next <= $9) ||
667 identical(next, $_) || 816 identical(next, $_) ||
668 (identical(next, $$) && allowDollar)) { 817 (identical(next, $$) && allowDollar)) {
669 next = advance(); 818 next = advance();
670 } else if ((next < 128) || (identical(next, $NBSP))) { 819 } else {
671 // Identifier ends here. 820 // Identifier ends here.
672 if (start == byteOffset) { 821 if (start == scanOffset) {
673 return error(const SourceString("expected identifier")); 822 return error("expected identifier");
674 } else if (isAscii) {
675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));
676 } else { 823 } else {
677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); 824 appendSubstringToken(IDENTIFIER_INFO, start, true);
678 } 825 }
679 return next; 826 return next;
680 } else {
681 int nonAsciiStart = byteOffset;
682 do {
683 next = nextByte();
684 if (identical(next, $NBSP)) break;
685 } while (next > 127);
686 String string = utf8String(nonAsciiStart, -1).slowToString();
687 isAscii = false;
688 int byteLength = nonAsciiStart - byteOffset;
689 addToCharOffset(string.length - byteLength);
690 } 827 }
691 } 828 }
692 } 829 }
693 830
694 int tokenizeAt(int next) { 831 int tokenizeAt(int next) {
695 int start = byteOffset;
696 next = advance();
697 appendPrecedenceToken(AT_INFO); 832 appendPrecedenceToken(AT_INFO);
698 return next; 833 return advance();
699 } 834 }
700 835
701 int tokenizeString(int next, int start, bool raw) { 836 int tokenizeString(int next, int start, bool raw) {
702 int quoteChar = next; 837 int quoteChar = next;
703 next = advance(); 838 next = advance();
704 if (identical(quoteChar, next)) { 839 if (identical(quoteChar, next)) {
705 next = advance(); 840 next = advance();
706 if (identical(quoteChar, next)) { 841 if (identical(quoteChar, next)) {
707 // Multiline string. 842 // Multiline string.
708 return tokenizeMultiLineString(quoteChar, start, raw); 843 return tokenizeMultiLineString(quoteChar, start, raw);
709 } else { 844 } else {
710 // Empty string. 845 // Empty string.
711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 846 appendSubstringToken(STRING_INFO, start, true);
712 return next; 847 return next;
713 } 848 }
714 } 849 }
715 if (raw) { 850 if (raw) {
716 return tokenizeSingleLineRawString(next, quoteChar, start); 851 return tokenizeSingleLineRawString(next, quoteChar, start);
717 } else { 852 } else {
718 return tokenizeSingleLineString(next, quoteChar, start); 853 return tokenizeSingleLineString(next, quoteChar, start);
719 } 854 }
720 } 855 }
721 856
722 static bool isHexDigit(int character) { 857 /**
723 if ($0 <= character && character <= $9) return true; 858 * [next] is the first character after the qoute.
ngeoffray 2013/10/18 10:19:37 qoute -> quote
lukas 2013/10/24 16:48:36 Done.
724 character |= 0x20; 859 * [start] is the scanOffset of the quote.
725 return ($a <= character && character <= $f); 860 *
726 } 861 * The token contains a substring of the source file, including the
727 862 * string quotes, backslashes for escaping. For interpolated strings,
863 * the parts before and after are separate tokens.
864 *
865 * "a $b c"
866 *
867 * gives StringToken("a $), StringToken(b) and StringToken( c").
868 */
728 int tokenizeSingleLineString(int next, int quoteChar, int start) { 869 int tokenizeSingleLineString(int next, int quoteChar, int start) {
870 bool asciiOnly = true;
729 while (!identical(next, quoteChar)) { 871 while (!identical(next, quoteChar)) {
730 if (identical(next, $BACKSLASH)) { 872 if (identical(next, $BACKSLASH)) {
731 next = advance(); 873 next = advance();
732 } else if (identical(next, $$)) { 874 } else if (identical(next, $$)) {
733 next = tokenizeStringInterpolation(start); 875 if (!asciiOnly) handleUnicode(start);
734 start = byteOffset; 876 next = tokenizeStringInterpolation(start, asciiOnly);
877 start = scanOffset;
878 asciiOnly = true;
735 continue; 879 continue;
736 } 880 }
737 if (next <= $CR 881 if (next <= $CR
738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) { 882 && (identical(next, $LF) ||
739 return error(const SourceString("unterminated string literal")); 883 identical(next, $CR) ||
884 identical(next, $EOF))) {
885 if (!asciiOnly) handleUnicode(start);
886 return error("unterminated string literal");
740 } 887 }
888 if (next > 127) asciiOnly = false;
741 next = advance(); 889 next = advance();
742 } 890 }
743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 891 if (!asciiOnly) handleUnicode(start);
744 return advance(); 892 // Advance past the quote character.
893 next = advance();
894 appendSubstringToken(STRING_INFO, start, asciiOnly);
895 return next;
745 } 896 }
746 897
747 int tokenizeStringInterpolation(int start) { 898 int tokenizeStringInterpolation(int start, bool asciiOnly) {
748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 899 appendSubstringToken(STRING_INFO, start, asciiOnly);
749 beginToken(); // $ starts here. 900 beginToken(); // $ starts here.
750 int next = advance(); 901 int next = advance();
751 if (identical(next, $OPEN_CURLY_BRACKET)) { 902 if (identical(next, $OPEN_CURLY_BRACKET)) {
752 return tokenizeInterpolatedExpression(next, start); 903 return tokenizeInterpolatedExpression(next);
753 } else { 904 } else {
754 return tokenizeInterpolatedIdentifier(next, start); 905 return tokenizeInterpolatedIdentifier(next);
755 } 906 }
756 } 907 }
757 908
758 int tokenizeInterpolatedExpression(int next, int start) { 909 int tokenizeInterpolatedExpression(int next) {
759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); 910 appendBeginGroup(STRING_INTERPOLATION_INFO);
760 beginToken(); // The expression starts here. 911 beginToken(); // The expression starts here.
761 next = advance(); 912 next = advance(); // Move past the curly bracket.
762 while (!identical(next, $EOF) && !identical(next, $STX)) { 913 while (!identical(next, $EOF) && !identical(next, $STX)) {
763 next = bigSwitch(next); 914 next = bigSwitch(next);
764 } 915 }
765 if (identical(next, $EOF)) return next; 916 if (identical(next, $EOF)) return next;
766 next = advance(); 917 next = advance(); // Move past the $STX.
767 beginToken(); // The string interpolation suffix starts here. 918 beginToken(); // The string interpolation suffix starts here.
768 return next; 919 return next;
769 } 920 }
770 921
771 int tokenizeInterpolatedIdentifier(int next, int start) { 922 int tokenizeInterpolatedIdentifier(int next) {
772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); 923 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
773 beginToken(); // The identifier starts here. 924 beginToken(); // The identifier starts here.
774 next = tokenizeKeywordOrIdentifier(next, false); 925 next = tokenizeKeywordOrIdentifier(next, false);
775 beginToken(); // The string interpolation suffix starts here. 926 beginToken(); // The string interpolation suffix starts here.
776 return next; 927 return next;
777 } 928 }
778 929
779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { 930 int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
780 next = advance(); 931 bool asciiOnly = true;
932 next = advance(); // Advance past the quote
ngeoffray 2013/10/18 10:19:37 Missing .
lukas 2013/10/24 16:48:36 Done.
781 while (next != $EOF) { 933 while (next != $EOF) {
782 if (identical(next, quoteChar)) { 934 if (identical(next, quoteChar)) {
783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 935 if (!asciiOnly) handleUnicode(start);
784 return advance(); 936 next = advance();
937 appendSubstringToken(STRING_INFO, start, asciiOnly);
938 return next;
785 } else if (identical(next, $LF) || identical(next, $CR)) { 939 } else if (identical(next, $LF) || identical(next, $CR)) {
786 return error(const SourceString("unterminated string literal")); 940 if (!asciiOnly) handleUnicode(start);
941 return error("unterminated string literal");
942 } else if (next > 127) {
943 asciiOnly = false;
787 } 944 }
788 next = advance(); 945 next = advance();
789 } 946 }
790 return error(const SourceString("unterminated string literal")); 947 if (!asciiOnly) handleUnicode(start);
948 return error("unterminated string literal");
791 } 949 }
792 950
793 int tokenizeMultiLineRawString(int quoteChar, int start) { 951 int tokenizeMultiLineRawString(int quoteChar, int start) {
794 int next = advance(); 952 bool asciiOnlyString = true;
953 bool asciiOnlyLine = true;
954 int unicodeStart = start;
955 int next = advance(); // Advance past the (last) quote (of three)
ngeoffray 2013/10/18 10:19:37 Missing .
lukas 2013/10/24 16:48:36 Done.
795 outer: while (!identical(next, $EOF)) { 956 outer: while (!identical(next, $EOF)) {
796 while (!identical(next, quoteChar)) { 957 while (!identical(next, quoteChar)) {
958 if (identical(next, $LF)) {
959 if (!asciiOnlyLine) {
960 // Synchronize the string offset in the utf8 scanner.
961 handleUnicode(unicodeStart);
962 asciiOnlyLine = true;
963 unicodeStart = scanOffset;
964 }
965 lineFeedInMultiline();
966 } else if (next > 127) {
967 asciiOnlyLine = false;
968 asciiOnlyString = false;
969 }
797 next = advance(); 970 next = advance();
798 if (identical(next, $EOF)) break outer; 971 if (identical(next, $EOF)) break outer;
799 } 972 }
800 next = advance(); 973 next = advance();
801 if (identical(next, quoteChar)) { 974 if (identical(next, quoteChar)) {
802 next = advance(); 975 next = advance();
803 if (identical(next, quoteChar)) { 976 if (identical(next, quoteChar)) {
804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 977 if (!asciiOnlyLine) handleUnicode(unicodeStart);
805 return advance(); 978 next = advance();
979 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
980 return next;
806 } 981 }
807 } 982 }
808 } 983 }
809 return error(const SourceString("unterminated string literal")); 984 if (!asciiOnlyLine) handleUnicode(unicodeStart);
985 return error("unterminated string literal");
810 } 986 }
811 987
812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { 988 int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); 989 if (raw) return tokenizeMultiLineRawString(quoteChar, start);
814 int next = advance(); 990 bool asciiOnlyString = true;
991 bool asciiOnlyLine = true;
992 int unicodeStart = start;
993 int next = advance(); // Advance past the (last) quote (of three).
815 while (!identical(next, $EOF)) { 994 while (!identical(next, $EOF)) {
816 if (identical(next, $$)) { 995 if (identical(next, $$)) {
817 next = tokenizeStringInterpolation(start); 996 if (!asciiOnlyLine) handleUnicode(unicodeStart);
818 start = byteOffset; 997 next = tokenizeStringInterpolation(start, asciiOnlyString);
998 start = scanOffset;
999 unicodeStart = start;
1000 asciiOnlyString = true; // A new string token is created for the rest.
1001 asciiOnlyLine = true;
819 continue; 1002 continue;
820 } 1003 }
821 if (identical(next, quoteChar)) { 1004 if (identical(next, quoteChar)) {
822 next = advance(); 1005 next = advance();
823 if (identical(next, quoteChar)) { 1006 if (identical(next, quoteChar)) {
824 next = advance(); 1007 next = advance();
825 if (identical(next, quoteChar)) { 1008 if (identical(next, quoteChar)) {
826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1009 if (!asciiOnlyLine) handleUnicode(unicodeStart);
827 return advance(); 1010 next = advance();
1011 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
1012 return next;
828 } 1013 }
829 } 1014 }
830 continue; 1015 continue;
831 } 1016 }
832 if (identical(next, $BACKSLASH)) { 1017 if (identical(next, $BACKSLASH)) {
833 next = advance(); 1018 next = advance();
834 if (identical(next, $EOF)) break; 1019 if (identical(next, $EOF)) break;
835 } 1020 }
1021 if (identical(next, $LF)) {
1022 if (!asciiOnlyLine) {
1023 // Synchronize the string offset in the utf8 scanner.
1024 handleUnicode(unicodeStart);
1025 asciiOnlyLine = true;
1026 unicodeStart = scanOffset;
1027 }
1028 lineFeedInMultiline();
1029 } else if (next > 127) {
1030 asciiOnlyString = false;
1031 asciiOnlyLine = false;
1032 }
836 next = advance(); 1033 next = advance();
837 } 1034 }
838 return error(const SourceString("unterminated string literal")); 1035 if (!asciiOnlyLine) handleUnicode(unicodeStart);
1036 return error("unterminated string literal");
839 } 1037 }
840 1038
841 int error(SourceString message) { 1039 int error(String message) {
842 appendByteStringToken(BAD_INPUT_INFO, message); 1040 appendStringToken(BAD_INPUT_INFO, message);
843 return advance(); // Ensure progress. 1041 return advance(); // Ensure progress.
844 } 1042 }
1043
1044 void unmatchedBeginGroup(BeginGroupToken begin) {
1045 String error = 'unmatched "${begin.stringValue}"';
1046 Token close =
1047 new StringToken.fromString(
1048 BAD_INPUT_INFO, error, begin.charOffset, true);
1049
1050 // We want to ensure that unmatched BeginGroupTokens are reported
1051 // as errors. However, the rest of the parser assume the groups
ngeoffray 2013/10/18 10:19:37 assume -> assumes
lukas 2013/10/24 16:48:36 Done.
1052 // are well-balanced and will never look at the endGroup
1053 // token. This is a nice property that allows us to skip quickly
1054 // over correct code. By inserting an additional error token in
1055 // the stream, we can keep ignoring endGroup tokens.
1056 //
1057 // [begin] --next--> [tail]
1058 // [begin] --endG--> [close] --next--> [next] --next--> [tail]
1059 //
1060 // This allows the parser to skip from [begin] via endGroup to [close] and
1061 // ignore the [close] token (assuming it's correct), then the error will be
1062 // reported when parsing the [next] token.
1063
1064 Token next = new StringToken.fromString(
1065 BAD_INPUT_INFO, error, begin.charOffset, true);
1066 begin.endGroup = close;
1067 close.next = next;
1068 next.next = begin.next;
1069 }
845 } 1070 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698