Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(383)

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of scanner; 5 part of scanner;
6 6
7 abstract class Scanner { 7 abstract class Scanner {
8 Token tokenize(); 8 Token tokenize();
9
10 factory Scanner(SourceFile file, {bool includeComments: false}) {
11 if (file is Utf8BytesSourceFile) {
12 return new Utf8BytesScanner(file, includeComments: includeComments);
13 } else {
14 return new StringScanner(file, includeComments: includeComments);
15 }
16 }
9 } 17 }
10 18
11 /** 19 abstract class AbstractScanner implements Scanner {
12 * Common base class for a Dart scanner. 20 final bool includeComments;
13 */ 21
14 abstract class AbstractScanner<T extends SourceString> implements Scanner { 22 /**
23 * The string offset for the next token that will be created.
24 *
25 * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values
26 * are different. One string character can be encoded using multiple UTF-8
27 * bytes.
28 */
29 int tokenStart = -1;
30
31 /**
32 * A pointer to the token stream created by this scanner. The first token
33 * is a special token and not part of the source file. This is an
34 * implementation detail to avoid special cases in the scanner. This token
35 * is not exposed to clients of the scanner, which are expected to invoke
36 * [firstToken] to access the token stream.
37 */
38 final Token tokens = new SymbolToken(EOF_INFO, -1);
39
40 /**
41 * A pointer to the last scanned token.
42 */
43 Token tail;
44
45 /**
46 * The stack of open groups, e.g [: { ... ( .. :]
47 * Each BeginGroupToken has a pointer to the token where the group
48 * ends. This field is set when scanning the end group token.
49 */
50 Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>();
51
52 /**
53 * The source file that is being scanned. This field can be [:null:].
54 * If the source file is available, the scanner assigns its [:lineStarts:] and
55 * [:length:] fields at the end of [tokenize].
56 */
57 final SourceFile file;
58
59 final List<int> lineStarts = [0];
60
61 AbstractScanner(this.file, this.includeComments) {
62 this.tail = this.tokens;
63 }
64
65
66 /**
67 * Advances and returns the next character.
68 *
69 * If the next character is non-ASCII, then the returned value depends on the
70 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while
71 * the [StringScanner] returns a UTF-16 code unit.
72 *
73 * The scanner ensures that [advance] is not invoked after it returned [$EOF].
74 * This allows implementations to omit bound checks if the data structure ends
75 * with '0'.
76 */
15 int advance(); 77 int advance();
16 int nextByte(); 78
17 79 /**
18 /** 80 * Returns the current unicode character.
19 * Returns the current character or byte depending on the underlying input 81 *
20 * kind. For example, [StringScanner] operates on [String] and thus returns 82 * If the current character is ASCII, then it is returned unchanged.
21 * characters (Unicode codepoints represented as int) whereas 83 *
22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. 84 * The [Utf8BytesScanner] decodes the next unicode code point starting at the
85 * current position. Note that every unicode character is returned as a single
86 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following
87 * [advance] returns the next character.
88 *
89 * The [StringScanner] returns the current character unchanged, which might
90 * be a surrogate character. In the case of '\u{1d11e}', it returns the first
91 * code unit 55348, and the following [advance] returns the second code unit
92 * 56606.
93 *
94 * Invoking [currentAsUnicode] multiple times is safe, i.e.,
95 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].
96 */
97 int currentAsUnicode(int next);
98
99 /**
100 * Returns the character at the next position. Like in [advance], the
101 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns
102 * a UTF-16 code unit.
23 */ 103 */
24 int peek(); 104 int peek();
25 105
26 /** 106 /**
107 * Notifies the scanner that unicode characters were detected in either a
108 * comment or a string literal between [startScanOffset] and the current
109 * scan offset.
110 */
111 void handleUnicode(int startScanOffset);
112
113 /**
114 * Returns the current scan offset.
115 *
116 * In the [Utf8BytesScanner] this is the offset into the byte list, in the
117 * [StringScanner] the offset in the source string.
118 */
119 int get scanOffset;
120
121 /**
122 * Returns the current string offset.
123 *
124 * In the [StringScanner] this is identical to the [scanOffset]. In the
125 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.
126 */
127 int get stringOffset;
128
129 /**
130 * Returns the first token scanned by this [Scanner].
131 */
132 Token firstToken();
133
134 /**
135 * Returns the last token scanned by this [Scanner].
136 */
137 Token previousToken();
138
139 /**
140 * Notifies that a new token starts at current offset.
141 */
142 void beginToken() {
143 tokenStart = stringOffset;
144 }
145
146 /**
147 * Appends a substring from the scan offset [:start:] to the current
148 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current
149 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the
150 * substring [5,9).
151 *
152 * Note that [extraOffset] can only be used if the covered character(s) are
153 * known to be ASCII.
154 */
155 void appendSubstringToken(PrecedenceInfo info, int start,
156 bool asciiOnly, [int extraOffset]);
157
158 /**
159 * Appends a token whose kind is determined by [info] and content is defined
160 * by the String [value].
161 *
162 * This method is invoked for class names, field names, method names, types,
163 * etc.
164 */
165 void appendStringToken(PrecedenceInfo info, String value) {
166 tail.next = new StringToken.fromString(info, value, tokenStart, true);
167 tail = tail.next;
168 }
169
170 /**
171 * Appends a fixed token whose kind and content is determined by [info].
172 * Appends an *operator* token from [info].
173 *
174 * An operator token represents operators like ':', '.', ';', '&&', '==', '--',
175 * '=>', etc.
176 */
177 void appendPrecedenceToken(PrecedenceInfo info) {
178 tail.next = new SymbolToken(info, tokenStart);
179 tail = tail.next;
180 }
181
182 /**
27 * Appends a fixed token based on whether the current char is [choice] or not. 183 * Appends a fixed token based on whether the current char is [choice] or not.
28 * If the current char is [choice] a fixed token whose kind and content 184 * If the current char is [choice] a fixed token whose kind and content
29 * is determined by [yes] is appended, otherwise a fixed token whose kind 185 * is determined by [yes] is appended, otherwise a fixed token whose kind
30 * and content is determined by [no] is appended. 186 * and content is determined by [no] is appended.
31 */ 187 */
32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); 188 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) {
33 189 int next = advance();
34 /** 190 if (identical(next, choice)) {
35 * Appends a fixed token whose kind and content is determined by [info]. 191 appendPrecedenceToken(yes);
36 */ 192 return advance();
37 void appendPrecedenceToken(PrecedenceInfo info); 193 } else {
38 194 appendPrecedenceToken(no);
39 /** 195 return next;
40 * Appends a token whose kind is determined by [info] and content is [value]. 196 }
41 */ 197 }
42 void appendStringToken(PrecedenceInfo info, String value);
43
44 /**
45 * Appends a token whose kind is determined by [info] and content is defined
46 * by the SourceString [value].
47 */
48 void appendByteStringToken(PrecedenceInfo info, T value);
49 198
50 /** 199 /**
51 * Appends a keyword token whose kind is determined by [keyword]. 200 * Appends a keyword token whose kind is determined by [keyword].
52 */ 201 */
53 void appendKeywordToken(Keyword keyword); 202 void appendKeywordToken(Keyword keyword) {
54 void appendWhiteSpace(int next); 203 String syntax = keyword.syntax;
55 void appendEofToken(); 204 // Type parameters and arguments cannot contain 'this' or 'super'.
56 205 if (identical(syntax, 'this') || identical(syntax, 'super')) {
57 /** 206 discardOpenLt();
58 * Creates an ASCII SourceString whose content begins at the source byte 207 }
59 * offset [start] and ends at [offset] bytes from the current byte offset of 208 tail.next = new KeywordToken(keyword, tokenStart);
60 * the scanner. For example, if the current byte offset is 10, 209 tail = tail.next;
61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found 210 }
62 * at the [0,9[ byte interval of the source text. 211
63 */ 212 void appendEofToken() {
64 T asciiString(int start, int offset); 213 beginToken();
65 T utf8String(int start, int offset); 214 tail.next = new SymbolToken(EOF_INFO, tokenStart);
66 Token firstToken(); 215 tail = tail.next;
67 Token previousToken(); 216 // EOF points to itself so there's always infinite look-ahead.
68 void beginToken(); 217 tail.next = tail;
69 void addToCharOffset(int offset); 218 discardOpenLt();
70 int get charOffset; 219 while (!groupingStack.isEmpty) {
71 int get byteOffset; 220 unmatchedBeginGroup(groupingStack.head);
72 void appendBeginGroup(PrecedenceInfo info, String value); 221 groupingStack = groupingStack.tail;
73 int appendEndGroup(PrecedenceInfo info, String value, int openKind); 222 }
74 void appendGt(PrecedenceInfo info, String value); 223 }
75 void appendGtGt(PrecedenceInfo info, String value); 224
76 void appendGtGtGt(PrecedenceInfo info, String value); 225 /**
77 void appendComment(); 226 * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is
227 * not always invoked for [$SPACE] characters.
228 *
229 * This method is used by the scanners to track line breaks and create the
230 * [lineStarts] map.
231 */
232 void appendWhiteSpace(int next) {
233 if (next == $LF && file != null) {
234 lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF.
235 }
236 }
237
238 /**
239 * Notifies on [$LF] characters in multi-line comments or strings.
240 *
241 * This method is used by the scanners to track line breaks and create the
242 * [lineStarts] map.
243 */
244 void lineFeedInMultiline() {
245 if (file != null) {
246 lineStarts.add(stringOffset + 1);
247 }
248 }
249
250 /**
251 * Appends a token that begins a new group, represented by [info].
252 * Group begin tokens are '{', '(', '[' and '${'.
253 */
254 void appendBeginGroup(PrecedenceInfo info) {
255 Token token = new BeginGroupToken(info, tokenStart);
256 tail.next = token;
257 tail = tail.next;
258
259 // { ( [ ${ cannot appear inside a type parameters / arguments.
260 if (!identical(info.kind, LT_TOKEN)) discardOpenLt();
261 groupingStack = groupingStack.prepend(token);
262 }
263
264 /**
265 * Appends a token that ends a group, represented by [info].
266 * It handles the group end tokens '}', ')' and ']'. The tokens '>' and
267 * '>>' are handled separately by [appendGt] and [appendGtGt].
268 */
269 int appendEndGroup(PrecedenceInfo info, int openKind) {
270 assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >>
271 appendPrecedenceToken(info);
272 // Don't report unmatched errors for <; it is also the less-than operator.
273 discardOpenLt();
274 if (groupingStack.isEmpty) {
275 return advance();
276 }
277 BeginGroupToken begin = groupingStack.head;
278 if (!identical(begin.kind, openKind)) {
279 if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) ||
280 !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) {
281 // Not ending string interpolation.
282 unmatchedBeginGroup(begin);
283 return advance();
284 }
285 // We're ending an interpolated expression.
286 begin.endGroup = tail;
287 groupingStack = groupingStack.tail;
288 // Using "start-of-text" to signal that we're back in string
289 // scanning mode.
290 return $STX;
291 }
292 begin.endGroup = tail;
293 groupingStack = groupingStack.tail;
294 return advance();
295 }
296
297 /**
298 * Appends a token for '>'.
299 * This method does not issue unmatched errors, because > is also the
300 * greater-than operator. It does not necessarily have to close a group.
301 */
302 void appendGt(PrecedenceInfo info) {
303 appendPrecedenceToken(info);
304 if (groupingStack.isEmpty) return;
305 if (identical(groupingStack.head.kind, LT_TOKEN)) {
306 groupingStack.head.endGroup = tail;
307 groupingStack = groupingStack.tail;
308 }
309 }
310
311 /**
312 * Appends a token for '>>'.
313 * This method does not issue unmatched errors, because >> is also the
314 * shift operator. It does not necessarily have to close a group.
315 */
316 void appendGtGt(PrecedenceInfo info) {
317 appendPrecedenceToken(info);
318 if (groupingStack.isEmpty) return;
319 if (identical(groupingStack.head.kind, LT_TOKEN)) {
320 // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer
321 // '<', the inner '<' is left without endGroup.
322 groupingStack = groupingStack.tail;
323 }
324 if (groupingStack.isEmpty) return;
325 if (identical(groupingStack.head.kind, LT_TOKEN)) {
326 groupingStack.head.endGroup = tail;
327 groupingStack = groupingStack.tail;
328 }
329 }
330
331 void appendComment(start, bool asciiOnly) {
332 if (!includeComments) return;
333 appendSubstringToken(COMMENT_INFO, start, asciiOnly);
334 }
78 335
79 /** 336 /**
80 * We call this method to discard '<' from the "grouping" stack 337 * We call this method to discard '<' from the "grouping" stack
81 * (maintained by subclasses). 338 * (maintained by subclasses).
82 * 339 *
83 * [PartialParser.skipExpression] relies on the fact that we do not 340 * [PartialParser.skipExpression] relies on the fact that we do not
84 * create groups for stuff like: 341 * create groups for stuff like:
85 * [:a = b < c, d = e > f:]. 342 * [:a = b < c, d = e > f:].
86 * 343 *
87 * In other words, this method is called when the scanner recognizes 344 * In other words, this method is called when the scanner recognizes
88 * something which cannot possibly be part of a type 345 * something which cannot possibly be part of a type
89 * parameter/argument list. 346 * parameter/argument list.
90 */ 347 */
91 void discardOpenLt(); 348 void discardOpenLt() {
349 while (!groupingStack.isEmpty
350 && identical(groupingStack.head.kind, LT_TOKEN)) {
351 groupingStack = groupingStack.tail;
352 }
353 }
92 354
93 // TODO(ahe): Move this class to implementation. 355 // TODO(ahe): Move this class to implementation.
94 356
95 Token tokenize() { 357 Token tokenize() {
96 int next = advance(); 358 int next = advance();
97 while (!identical(next, $EOF)) { 359 while (!identical(next, $EOF)) {
98 next = bigSwitch(next); 360 next = bigSwitch(next);
99 } 361 }
100 appendEofToken(); 362 appendEofToken();
363
364 if (file != null) {
365 file.length = stringOffset;
366 // One additional line start at the end, see [SourceFile.lineStarts].
367 lineStarts.add(stringOffset + 1);
368 file.lineStarts = lineStarts;
369 }
370
101 return firstToken(); 371 return firstToken();
102 } 372 }
103 373
104 int bigSwitch(int next) { 374 int bigSwitch(int next) {
105 beginToken(); 375 beginToken();
106 if (identical(next, $SPACE) || identical(next, $TAB) 376 if (identical(next, $SPACE) || identical(next, $TAB)
107 || identical(next, $LF) || identical(next, $CR)) { 377 || identical(next, $LF) || identical(next, $CR)) {
108 appendWhiteSpace(next); 378 appendWhiteSpace(next);
109 next = advance(); 379 next = advance();
380 // Sequences of spaces are common, so advance through them fast.
110 while (identical(next, $SPACE)) { 381 while (identical(next, $SPACE)) {
111 appendWhiteSpace(next); 382 // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
383 // assuming that it does not do anything for space characters.
112 next = advance(); 384 next = advance();
113 } 385 }
114 return next; 386 return next;
115 } 387 }
116 388
117 if ($a <= next && next <= $z) { 389 if ($a <= next && next <= $z) {
118 if (identical($r, next)) { 390 if (identical($r, next)) {
119 return tokenizeRawStringKeywordOrIdentifier(next); 391 return tokenizeRawStringKeywordOrIdentifier(next);
120 } 392 }
121 return tokenizeKeywordOrIdentifier(next, true); 393 return tokenizeKeywordOrIdentifier(next, true);
122 } 394 }
123 395
124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$) ) { 396 if (($A <= next && next <= $Z) ||
125 return tokenizeIdentifier(next, byteOffset, true); 397 identical(next, $_) ||
398 identical(next, $$)) {
399 return tokenizeIdentifier(next, scanOffset, true);
126 } 400 }
127 401
128 if (identical(next, $LT)) { 402 if (identical(next, $LT)) {
129 return tokenizeLessThan(next); 403 return tokenizeLessThan(next);
130 } 404 }
131 405
132 if (identical(next, $GT)) { 406 if (identical(next, $GT)) {
133 return tokenizeGreaterThan(next); 407 return tokenizeGreaterThan(next);
134 } 408 }
135 409
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
180 if (identical(next, $BACKSLASH)) { 454 if (identical(next, $BACKSLASH)) {
181 appendPrecedenceToken(BACKSLASH_INFO); 455 appendPrecedenceToken(BACKSLASH_INFO);
182 return advance(); 456 return advance();
183 } 457 }
184 458
185 if (identical(next, $HASH)) { 459 if (identical(next, $HASH)) {
186 return tokenizeTag(next); 460 return tokenizeTag(next);
187 } 461 }
188 462
189 if (identical(next, $OPEN_PAREN)) { 463 if (identical(next, $OPEN_PAREN)) {
190 appendBeginGroup(OPEN_PAREN_INFO, "("); 464 appendBeginGroup(OPEN_PAREN_INFO);
191 return advance(); 465 return advance();
192 } 466 }
193 467
194 if (identical(next, $CLOSE_PAREN)) { 468 if (identical(next, $CLOSE_PAREN)) {
195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); 469 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
196 } 470 }
197 471
198 if (identical(next, $COMMA)) { 472 if (identical(next, $COMMA)) {
199 appendPrecedenceToken(COMMA_INFO); 473 appendPrecedenceToken(COMMA_INFO);
200 return advance(); 474 return advance();
201 } 475 }
202 476
203 if (identical(next, $COLON)) { 477 if (identical(next, $COLON)) {
204 appendPrecedenceToken(COLON_INFO); 478 appendPrecedenceToken(COLON_INFO);
205 return advance(); 479 return advance();
206 } 480 }
207 481
208 if (identical(next, $SEMICOLON)) { 482 if (identical(next, $SEMICOLON)) {
209 appendPrecedenceToken(SEMICOLON_INFO); 483 appendPrecedenceToken(SEMICOLON_INFO);
210 // Type parameters and arguments cannot contain semicolon. 484 // Type parameters and arguments cannot contain semicolon.
211 discardOpenLt(); 485 discardOpenLt();
212 return advance(); 486 return advance();
213 } 487 }
214 488
215 if (identical(next, $QUESTION)) { 489 if (identical(next, $QUESTION)) {
216 appendPrecedenceToken(QUESTION_INFO); 490 appendPrecedenceToken(QUESTION_INFO);
217 return advance(); 491 return advance();
218 } 492 }
219 493
220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 494 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", 495 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
222 OPEN_SQUARE_BRACKET_TOKEN); 496 OPEN_SQUARE_BRACKET_TOKEN);
223 } 497 }
224 498
225 if (identical(next, $BACKPING)) { 499 if (identical(next, $BACKPING)) {
226 appendPrecedenceToken(BACKPING_INFO); 500 appendPrecedenceToken(BACKPING_INFO);
227 return advance(); 501 return advance();
228 } 502 }
229 503
230 if (identical(next, $OPEN_CURLY_BRACKET)) { 504 if (identical(next, $OPEN_CURLY_BRACKET)) {
231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); 505 appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
232 return advance(); 506 return advance();
233 } 507 }
234 508
235 if (identical(next, $CLOSE_CURLY_BRACKET)) { 509 if (identical(next, $CLOSE_CURLY_BRACKET)) {
236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", 510 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
237 OPEN_CURLY_BRACKET_TOKEN); 511 OPEN_CURLY_BRACKET_TOKEN);
238 } 512 }
239 513
240 if (identical(next, $SLASH)) { 514 if (identical(next, $SLASH)) {
241 return tokenizeSlashOrComment(next); 515 return tokenizeSlashOrComment(next);
242 } 516 }
243 517
244 if (identical(next, $AT)) { 518 if (identical(next, $AT)) {
245 return tokenizeAt(next); 519 return tokenizeAt(next);
246 } 520 }
247 521
248 if (identical(next, $DQ) || identical(next, $SQ)) { 522 if (identical(next, $DQ) || identical(next, $SQ)) {
249 return tokenizeString(next, byteOffset, false); 523 return tokenizeString(next, scanOffset, false);
250 } 524 }
251 525
252 if (identical(next, $PERIOD)) { 526 if (identical(next, $PERIOD)) {
253 return tokenizeDotsOrNumber(next); 527 return tokenizeDotsOrNumber(next);
254 } 528 }
255 529
256 if (identical(next, $0)) { 530 if (identical(next, $0)) {
257 return tokenizeHexOrNumber(next); 531 return tokenizeHexOrNumber(next);
258 } 532 }
259 533
260 // TODO(ahe): Would a range check be faster? 534 // TODO(ahe): Would a range check be faster?
261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) 535 if (identical(next, $1) || identical(next, $2) || identical(next, $3)
262 || identical(next, $4) || identical(next, $5) || identical(next, $6) 536 || identical(next, $4) || identical(next, $5) || identical(next, $6)
263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { 537 || identical(next, $7) || identical(next, $8) || identical(next, $9)) {
264 return tokenizeNumber(next); 538 return tokenizeNumber(next);
265 } 539 }
266 540
267 if (identical(next, $EOF)) { 541 if (identical(next, $EOF)) {
268 return $EOF; 542 return $EOF;
269 } 543 }
270 if (next < 0x1f) { 544 if (next < 0x1f) {
271 return error(new SourceString("unexpected character $next")); 545 return error("unexpected character $next");
546 }
547
548 if (next >= 128) {
549 next = currentAsUnicode(next);
272 } 550 }
273 551
274 // The following are non-ASCII characters. 552 // The following are non-ASCII characters.
275 553
276 if (identical(next, $NBSP)) { 554 if (identical(next, $NBSP)) {
277 appendWhiteSpace(next); 555 appendWhiteSpace(next);
278 return advance(); 556 return advance();
279 } 557 }
280 558
281 return tokenizeIdentifier(next, byteOffset, true); 559 return error("unexpected unicode character $next");
282 } 560 }
283 561
284 int tokenizeTag(int next) { 562 int tokenizeTag(int next) {
285 // # or #!.*[\n\r] 563 // # or #!.*[\n\r]
286 if (byteOffset == 0) { 564 if (scanOffset == 0) {
287 if (identical(peek(), $BANG)) { 565 if (identical(peek(), $BANG)) {
566 int start = scanOffset + 1;
567 bool asciiOnly = true;
288 do { 568 do {
289 next = advance(); 569 next = advance();
290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(ne xt, $EOF)); 570 if (next > 127) asciiOnly = false;
571 } while (!identical(next, $LF) &&
572 !identical(next, $CR) &&
573 !identical(next, $EOF));
574 if (!asciiOnly) handleUnicode(start);
291 return next; 575 return next;
292 } 576 }
293 } 577 }
294 appendPrecedenceToken(HASH_INFO); 578 appendPrecedenceToken(HASH_INFO);
295 return advance(); 579 return advance();
296 } 580 }
297 581
298 int tokenizeTilde(int next) { 582 int tokenizeTilde(int next) {
299 // ~ ~/ ~/= 583 // ~ ~/ ~/=
300 next = advance(); 584 next = advance();
301 if (identical(next, $SLASH)) { 585 if (identical(next, $SLASH)) {
302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); 586 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
303 } else { 587 } else {
304 appendPrecedenceToken(TILDE_INFO); 588 appendPrecedenceToken(TILDE_INFO);
305 return next; 589 return next;
306 } 590 }
307 } 591 }
308 592
309 int tokenizeOpenSquareBracket(int next) { 593 int tokenizeOpenSquareBracket(int next) {
310 // [ [] []= 594 // [ [] []=
311 next = advance(); 595 next = advance();
312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { 596 if (identical(next, $CLOSE_SQUARE_BRACKET)) {
313 Token token = previousToken(); 597 Token token = previousToken();
314 if (token is KeywordToken && identical(token.value.stringValue, 'operator' )) { 598 if (token is KeywordToken &&
599 identical((token as KeywordToken).keyword.syntax, 'operator')) {
315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); 600 return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
316 } 601 }
317 } 602 }
318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); 603 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
319 return next; 604 return next;
320 } 605 }
321 606
322 int tokenizeCaret(int next) { 607 int tokenizeCaret(int next) {
323 // ^ ^= 608 // ^ ^=
324 return select($EQ, CARET_EQ_INFO, CARET_INFO); 609 return select($EQ, CARET_EQ_INFO, CARET_INFO);
325 } 610 }
326 611
327 int tokenizeBar(int next) { 612 int tokenizeBar(int next) {
328 // | || |= 613 // | || |=
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
372 return advance(); 657 return advance();
373 } else if (identical(next, $EQ)) { 658 } else if (identical(next, $EQ)) {
374 appendPrecedenceToken(MINUS_EQ_INFO); 659 appendPrecedenceToken(MINUS_EQ_INFO);
375 return advance(); 660 return advance();
376 } else { 661 } else {
377 appendPrecedenceToken(MINUS_INFO); 662 appendPrecedenceToken(MINUS_INFO);
378 return next; 663 return next;
379 } 664 }
380 } 665 }
381 666
382
383 int tokenizePlus(int next) { 667 int tokenizePlus(int next) {
384 // + ++ += 668 // + ++ +=
385 next = advance(); 669 next = advance();
386 if (identical($PLUS, next)) { 670 if (identical($PLUS, next)) {
387 appendPrecedenceToken(PLUS_PLUS_INFO); 671 appendPrecedenceToken(PLUS_PLUS_INFO);
388 return advance(); 672 return advance();
389 } else if (identical($EQ, next)) { 673 } else if (identical($EQ, next)) {
390 appendPrecedenceToken(PLUS_EQ_INFO); 674 appendPrecedenceToken(PLUS_EQ_INFO);
391 return advance(); 675 return advance();
392 } else { 676 } else {
393 appendPrecedenceToken(PLUS_INFO); 677 appendPrecedenceToken(PLUS_INFO);
394 return next; 678 return next;
395 } 679 }
396 } 680 }
397 681
398 int tokenizeExclamation(int next) { 682 int tokenizeExclamation(int next) {
399 // ! != !== 683 // ! !=
684 // !== is kept for user-friendly error reporting
685
400 next = advance(); 686 next = advance();
401 if (identical(next, $EQ)) { 687 if (identical(next, $EQ)) {
402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); 688 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
403 } 689 }
404 appendPrecedenceToken(BANG_INFO); 690 appendPrecedenceToken(BANG_INFO);
405 return next; 691 return next;
406 } 692 }
407 693
408 int tokenizeEquals(int next) { 694 int tokenizeEquals(int next) {
409 // = == === 695 // = == =>
696 // === is kept for user-friendly error reporting
410 697
411 // Type parameters and arguments cannot contain any token that 698 // Type parameters and arguments cannot contain any token that
412 // starts with '='. 699 // starts with '='.
413 discardOpenLt(); 700 discardOpenLt();
414 701
415 next = advance(); 702 next = advance();
416 if (identical(next, $EQ)) { 703 if (identical(next, $EQ)) {
417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); 704 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
418 } else if (identical(next, $GT)) { 705 } else if (identical(next, $GT)) {
419 appendPrecedenceToken(FUNCTION_INFO); 706 appendPrecedenceToken(FUNCTION_INFO);
420 return advance(); 707 return advance();
421 } 708 }
422 appendPrecedenceToken(EQ_INFO); 709 appendPrecedenceToken(EQ_INFO);
423 return next; 710 return next;
424 } 711 }
425 712
426 int tokenizeGreaterThan(int next) { 713 int tokenizeGreaterThan(int next) {
427 // > >= >> >>= >>> >>>= 714 // > >= >> >>=
428 next = advance(); 715 next = advance();
429 if (identical($EQ, next)) { 716 if (identical($EQ, next)) {
430 appendPrecedenceToken(GT_EQ_INFO); 717 appendPrecedenceToken(GT_EQ_INFO);
431 return advance(); 718 return advance();
432 } else if (identical($GT, next)) { 719 } else if (identical($GT, next)) {
433 next = advance(); 720 next = advance();
434 if (identical($EQ, next)) { 721 if (identical($EQ, next)) {
435 appendPrecedenceToken(GT_GT_EQ_INFO); 722 appendPrecedenceToken(GT_GT_EQ_INFO);
436 return advance(); 723 return advance();
437 } else { 724 } else {
438 appendGtGt(GT_GT_INFO, ">>"); 725 appendGtGt(GT_GT_INFO);
439 return next; 726 return next;
440 } 727 }
441 } else { 728 } else {
442 appendGt(GT_INFO, ">"); 729 appendGt(GT_INFO);
443 return next; 730 return next;
444 } 731 }
445 } 732 }
446 733
447 int tokenizeLessThan(int next) { 734 int tokenizeLessThan(int next) {
448 // < <= << <<= 735 // < <= << <<=
449 next = advance(); 736 next = advance();
450 if (identical($EQ, next)) { 737 if (identical($EQ, next)) {
451 appendPrecedenceToken(LT_EQ_INFO); 738 appendPrecedenceToken(LT_EQ_INFO);
452 return advance(); 739 return advance();
453 } else if (identical($LT, next)) { 740 } else if (identical($LT, next)) {
454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); 741 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
455 } else { 742 } else {
456 appendBeginGroup(LT_INFO, "<"); 743 appendBeginGroup(LT_INFO);
457 return next; 744 return next;
458 } 745 }
459 } 746 }
460 747
461 int tokenizeNumber(int next) { 748 int tokenizeNumber(int next) {
462 int start = byteOffset; 749 int start = scanOffset;
463 while (true) { 750 while (true) {
464 next = advance(); 751 next = advance();
465 if ($0 <= next && next <= $9) { 752 if ($0 <= next && next <= $9) {
466 continue; 753 continue;
467 } else if (identical(next, $e) || identical(next, $E)) { 754 } else if (identical(next, $e) || identical(next, $E)) {
468 return tokenizeFractionPart(next, start); 755 return tokenizeFractionPart(next, start);
469 } else { 756 } else {
470 if (identical(next, $PERIOD)) { 757 if (identical(next, $PERIOD)) {
471 int nextnext = peek(); 758 int nextnext = peek();
472 if ($0 <= nextnext && nextnext <= $9) { 759 if ($0 <= nextnext && nextnext <= $9) {
473 return tokenizeFractionPart(advance(), start); 760 return tokenizeFractionPart(advance(), start);
474 } 761 }
475 } 762 }
476 appendByteStringToken(INT_INFO, asciiString(start, 0)); 763 appendSubstringToken(INT_INFO, start, true);
477 return next; 764 return next;
478 } 765 }
479 } 766 }
480 } 767 }
481 768
482 int tokenizeHexOrNumber(int next) { 769 int tokenizeHexOrNumber(int next) {
483 int x = peek(); 770 int x = peek();
484 if (identical(x, $x) || identical(x, $X)) { 771 if (identical(x, $x) || identical(x, $X)) {
485 advance(); 772 return tokenizeHex(next);
486 return tokenizeHex(x);
487 } 773 }
488 return tokenizeNumber(next); 774 return tokenizeNumber(next);
489 } 775 }
490 776
491 int tokenizeHex(int next) { 777 int tokenizeHex(int next) {
492 int start = byteOffset - 1; 778 int start = scanOffset;
779 next = advance(); // Advance past the $x or $X.
493 bool hasDigits = false; 780 bool hasDigits = false;
494 while (true) { 781 while (true) {
495 next = advance(); 782 next = advance();
496 if (($0 <= next && next <= $9) 783 if (($0 <= next && next <= $9)
497 || ($A <= next && next <= $F) 784 || ($A <= next && next <= $F)
498 || ($a <= next && next <= $f)) { 785 || ($a <= next && next <= $f)) {
499 hasDigits = true; 786 hasDigits = true;
500 } else { 787 } else {
501 if (!hasDigits) { 788 if (!hasDigits) {
502 return error(const SourceString("hex digit expected")); 789 return error("hex digit expected");
503 } 790 }
504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); 791 appendSubstringToken(HEXADECIMAL_INFO, start, true);
505 return next; 792 return next;
506 } 793 }
507 } 794 }
508 } 795 }
509 796
510 int tokenizeDotsOrNumber(int next) { 797 int tokenizeDotsOrNumber(int next) {
511 int start = byteOffset; 798 int start = scanOffset;
512 next = advance(); 799 next = advance();
513 if (($0 <= next && next <= $9)) { 800 if (($0 <= next && next <= $9)) {
514 return tokenizeFractionPart(next, start); 801 return tokenizeFractionPart(next, start);
515 } else if (identical($PERIOD, next)) { 802 } else if (identical($PERIOD, next)) {
516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 803 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
517 } else { 804 } else {
518 appendPrecedenceToken(PERIOD_INFO); 805 appendPrecedenceToken(PERIOD_INFO);
519 return next; 806 return next;
520 } 807 }
521 } 808 }
522 809
523 int tokenizeFractionPart(int next, int start) { 810 int tokenizeFractionPart(int next, int start) {
524 bool done = false; 811 bool done = false;
525 bool hasDigit = false; 812 bool hasDigit = false;
526 LOOP: while (!done) { 813 LOOP: while (!done) {
527 if ($0 <= next && next <= $9) { 814 if ($0 <= next && next <= $9) {
528 hasDigit = true; 815 hasDigit = true;
529 } else if (identical($e, next) || identical($E, next)) { 816 } else if (identical($e, next) || identical($E, next)) {
530 hasDigit = true; 817 hasDigit = true;
531 next = tokenizeExponent(advance()); 818 next = tokenizeExponent(advance());
532 done = true; 819 done = true;
533 continue LOOP; 820 continue LOOP;
534 } else { 821 } else {
535 done = true; 822 done = true;
536 continue LOOP; 823 continue LOOP;
537 } 824 }
538 next = advance(); 825 next = advance();
539 } 826 }
540 if (!hasDigit) { 827 if (!hasDigit) {
541 appendByteStringToken(INT_INFO, asciiString(start, -1)); 828 // Reduce offset, we already advanced to the token past the period.
829 appendSubstringToken(INT_INFO, start, true, -1);
830
831 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
832 // the scanner already advanced past the period.
542 if (identical($PERIOD, next)) { 833 if (identical($PERIOD, next)) {
543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); 834 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
544 } 835 }
545 // TODO(ahe): Wrong offset for the period.
546 appendPrecedenceToken(PERIOD_INFO); 836 appendPrecedenceToken(PERIOD_INFO);
547 return bigSwitch(next); 837 return next;
548 } 838 }
549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); 839 appendSubstringToken(DOUBLE_INFO, start, true);
550 return next; 840 return next;
551 } 841 }
552 842
553 int tokenizeExponent(int next) { 843 int tokenizeExponent(int next) {
554 if (identical(next, $PLUS) || identical(next, $MINUS)) { 844 if (identical(next, $PLUS) || identical(next, $MINUS)) {
555 next = advance(); 845 next = advance();
556 } 846 }
557 bool hasDigits = false; 847 bool hasDigits = false;
558 while (true) { 848 while (true) {
559 if ($0 <= next && next <= $9) { 849 if ($0 <= next && next <= $9) {
560 hasDigits = true; 850 hasDigits = true;
561 } else { 851 } else {
562 if (!hasDigits) { 852 if (!hasDigits) {
563 return error(const SourceString("digit expected")); 853 return error("digit expected");
564 } 854 }
565 return next; 855 return next;
566 } 856 }
567 next = advance(); 857 next = advance();
568 } 858 }
569 } 859 }
570 860
571 int tokenizeSlashOrComment(int next) { 861 int tokenizeSlashOrComment(int next) {
862 int start = scanOffset;
572 next = advance(); 863 next = advance();
573 if (identical($STAR, next)) { 864 if (identical($STAR, next)) {
574 return tokenizeMultiLineComment(next); 865 return tokenizeMultiLineComment(next, start);
575 } else if (identical($SLASH, next)) { 866 } else if (identical($SLASH, next)) {
576 return tokenizeSingleLineComment(next); 867 return tokenizeSingleLineComment(next, start);
577 } else if (identical($EQ, next)) { 868 } else if (identical($EQ, next)) {
578 appendPrecedenceToken(SLASH_EQ_INFO); 869 appendPrecedenceToken(SLASH_EQ_INFO);
579 return advance(); 870 return advance();
580 } else { 871 } else {
581 appendPrecedenceToken(SLASH_INFO); 872 appendPrecedenceToken(SLASH_INFO);
582 return next; 873 return next;
583 } 874 }
584 } 875 }
585 876
586 int tokenizeSingleLineComment(int next) { 877 int tokenizeSingleLineComment(int next, int start) {
878 bool asciiOnly = true;
587 while (true) { 879 while (true) {
588 next = advance(); 880 next = advance();
589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) { 881 if (next > 127) asciiOnly = false;
590 appendComment(); 882 if (identical($LF, next) ||
883 identical($CR, next) ||
884 identical($EOF, next)) {
885 if (!asciiOnly) handleUnicode(start);
886 appendComment(start, asciiOnly);
591 return next; 887 return next;
592 } 888 }
593 } 889 }
594 } 890 }
595 891
596 int tokenizeMultiLineComment(int next) { 892
893 int tokenizeMultiLineComment(int next, int start) {
894 bool asciiOnlyComment = true; // Track if the entire comment is ASCII.
895 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode.
896 int unicodeStart = start;
597 int nesting = 1; 897 int nesting = 1;
598 next = advance(); 898 next = advance();
599 while (true) { 899 while (true) {
600 if (identical($EOF, next)) { 900 if (identical($EOF, next)) {
601 // TODO(ahe): Report error. 901 if (!asciiOnlyLines) handleUnicode(unicodeStart);
902 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment");
602 return next; 903 return next;
603 } else if (identical($STAR, next)) { 904 } else if (identical($STAR, next)) {
604 next = advance(); 905 next = advance();
605 if (identical($SLASH, next)) { 906 if (identical($SLASH, next)) {
606 --nesting; 907 --nesting;
607 if (0 == nesting) { 908 if (0 == nesting) {
909 if (!asciiOnlyLines) handleUnicode(unicodeStart);
608 next = advance(); 910 next = advance();
609 appendComment(); 911 appendComment(start, asciiOnlyComment);
610 return next; 912 return next;
611 } else { 913 } else {
612 next = advance(); 914 next = advance();
613 } 915 }
614 } 916 }
615 } else if (identical($SLASH, next)) { 917 } else if (identical($SLASH, next)) {
616 next = advance(); 918 next = advance();
617 if (identical($STAR, next)) { 919 if (identical($STAR, next)) {
618 next = advance(); 920 next = advance();
619 ++nesting; 921 ++nesting;
620 } 922 }
923 } else if (identical(next, $LF)) {
924 if (!asciiOnlyLines) {
925 // Synchronize the string offset in the utf8 scanner.
926 handleUnicode(unicodeStart);
927 asciiOnlyLines = true;
928 unicodeStart = scanOffset;
929 }
930 lineFeedInMultiline();
931 next = advance();
621 } else { 932 } else {
933 if (next > 127) {
934 asciiOnlyLines = false;
935 asciiOnlyComment = false;
936 }
622 next = advance(); 937 next = advance();
623 } 938 }
624 } 939 }
625 } 940 }
626 941
627 int tokenizeRawStringKeywordOrIdentifier(int next) { 942 int tokenizeRawStringKeywordOrIdentifier(int next) {
943 // [next] is $r.
628 int nextnext = peek(); 944 int nextnext = peek();
629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { 945 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) {
630 int start = byteOffset; 946 int start = scanOffset;
631 next = advance(); 947 next = advance();
632 return tokenizeString(next, start, true); 948 return tokenizeString(next, start, true);
633 } 949 }
634 return tokenizeKeywordOrIdentifier(next, true); 950 return tokenizeKeywordOrIdentifier(next, true);
635 } 951 }
636 952
637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { 953 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
638 KeywordState state = KeywordState.KEYWORD_STATE; 954 KeywordState state = KeywordState.KEYWORD_STATE;
639 int start = byteOffset; 955 int start = scanOffset;
640 while (state != null && $a <= next && next <= $z) { 956 while (state != null && $a <= next && next <= $z) {
641 state = state.next(next); 957 state = state.next(next);
642 next = advance(); 958 next = advance();
643 } 959 }
644 if (state == null || state.keyword == null) { 960 if (state == null || state.keyword == null) {
645 return tokenizeIdentifier(next, start, allowDollar); 961 return tokenizeIdentifier(next, start, allowDollar);
646 } 962 }
647 if (($A <= next && next <= $Z) || 963 if (($A <= next && next <= $Z) ||
648 ($0 <= next && next <= $9) || 964 ($0 <= next && next <= $9) ||
649 identical(next, $_) || 965 identical(next, $_) ||
650 identical(next, $$)) { 966 identical(next, $$)) {
651 return tokenizeIdentifier(next, start, allowDollar); 967 return tokenizeIdentifier(next, start, allowDollar);
652 } else if (next < 128) { 968 } else {
653 appendKeywordToken(state.keyword); 969 appendKeywordToken(state.keyword);
654 return next; 970 return next;
655 } else {
656 return tokenizeIdentifier(next, start, allowDollar);
657 } 971 }
658 } 972 }
659 973
974 /**
975 * [allowDollar] can exclude '$', which is not allowed as part of a string
976 * interpolation identifier.
977 */
660 int tokenizeIdentifier(int next, int start, bool allowDollar) { 978 int tokenizeIdentifier(int next, int start, bool allowDollar) {
661 bool isAscii = true;
662
663 while (true) { 979 while (true) {
664 if (($a <= next && next <= $z) || 980 if (($a <= next && next <= $z) ||
665 ($A <= next && next <= $Z) || 981 ($A <= next && next <= $Z) ||
666 ($0 <= next && next <= $9) || 982 ($0 <= next && next <= $9) ||
667 identical(next, $_) || 983 identical(next, $_) ||
668 (identical(next, $$) && allowDollar)) { 984 (identical(next, $$) && allowDollar)) {
669 next = advance(); 985 next = advance();
670 } else if ((next < 128) || (identical(next, $NBSP))) { 986 } else {
671 // Identifier ends here. 987 // Identifier ends here.
672 if (start == byteOffset) { 988 if (start == scanOffset) {
673 return error(const SourceString("expected identifier")); 989 return error("expected identifier");
674 } else if (isAscii) {
675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0));
676 } else { 990 } else {
677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); 991 appendSubstringToken(IDENTIFIER_INFO, start, true);
678 } 992 }
679 return next; 993 return next;
680 } else {
681 int nonAsciiStart = byteOffset;
682 do {
683 next = nextByte();
684 if (identical(next, $NBSP)) break;
685 } while (next > 127);
686 String string = utf8String(nonAsciiStart, -1).slowToString();
687 isAscii = false;
688 int byteLength = nonAsciiStart - byteOffset;
689 addToCharOffset(string.length - byteLength);
690 } 994 }
691 } 995 }
692 } 996 }
693 997
694 int tokenizeAt(int next) { 998 int tokenizeAt(int next) {
695 int start = byteOffset;
696 next = advance();
697 appendPrecedenceToken(AT_INFO); 999 appendPrecedenceToken(AT_INFO);
698 return next; 1000 return advance();
699 } 1001 }
700 1002
701 int tokenizeString(int next, int start, bool raw) { 1003 int tokenizeString(int next, int start, bool raw) {
702 int quoteChar = next; 1004 int quoteChar = next;
703 next = advance(); 1005 next = advance();
704 if (identical(quoteChar, next)) { 1006 if (identical(quoteChar, next)) {
705 next = advance(); 1007 next = advance();
706 if (identical(quoteChar, next)) { 1008 if (identical(quoteChar, next)) {
707 // Multiline string. 1009 // Multiline string.
708 return tokenizeMultiLineString(quoteChar, start, raw); 1010 return tokenizeMultiLineString(quoteChar, start, raw);
709 } else { 1011 } else {
710 // Empty string. 1012 // Empty string.
711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 1013 appendSubstringToken(STRING_INFO, start, true);
712 return next; 1014 return next;
713 } 1015 }
714 } 1016 }
715 if (raw) { 1017 if (raw) {
716 return tokenizeSingleLineRawString(next, quoteChar, start); 1018 return tokenizeSingleLineRawString(next, quoteChar, start);
717 } else { 1019 } else {
718 return tokenizeSingleLineString(next, quoteChar, start); 1020 return tokenizeSingleLineString(next, quoteChar, start);
719 } 1021 }
720 } 1022 }
721 1023
722 static bool isHexDigit(int character) { 1024 /**
723 if ($0 <= character && character <= $9) return true; 1025 * [next] is the first character after the quote.
724 character |= 0x20; 1026 * [start] is the scanOffset of the quote.
725 return ($a <= character && character <= $f); 1027 *
726 } 1028 * The token contains a substring of the source file, including the
727 1029 * string quotes, backslashes for escaping. For interpolated strings,
1030 * the parts before and after are separate tokens.
1031 *
1032 * "a $b c"
1033 *
1034 * gives StringToken("a $), StringToken(b) and StringToken( c").
1035 */
728 int tokenizeSingleLineString(int next, int quoteChar, int start) { 1036 int tokenizeSingleLineString(int next, int quoteChar, int start) {
1037 bool asciiOnly = true;
729 while (!identical(next, quoteChar)) { 1038 while (!identical(next, quoteChar)) {
730 if (identical(next, $BACKSLASH)) { 1039 if (identical(next, $BACKSLASH)) {
731 next = advance(); 1040 next = advance();
732 } else if (identical(next, $$)) { 1041 } else if (identical(next, $$)) {
733 next = tokenizeStringInterpolation(start); 1042 if (!asciiOnly) handleUnicode(start);
734 start = byteOffset; 1043 next = tokenizeStringInterpolation(start, asciiOnly);
1044 start = scanOffset;
1045 asciiOnly = true;
735 continue; 1046 continue;
736 } 1047 }
737 if (next <= $CR 1048 if (next <= $CR
738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $EOF))) { 1049 && (identical(next, $LF) ||
739 return error(const SourceString("unterminated string literal")); 1050 identical(next, $CR) ||
1051 identical(next, $EOF))) {
1052 if (!asciiOnly) handleUnicode(start);
1053 return error("unterminated string literal");
740 } 1054 }
1055 if (next > 127) asciiOnly = false;
741 next = advance(); 1056 next = advance();
742 } 1057 }
743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1058 if (!asciiOnly) handleUnicode(start);
744 return advance(); 1059 // Advance past the quote character.
1060 next = advance();
1061 appendSubstringToken(STRING_INFO, start, asciiOnly);
1062 return next;
745 } 1063 }
746 1064
747 int tokenizeStringInterpolation(int start) { 1065 int tokenizeStringInterpolation(int start, bool asciiOnly) {
748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); 1066 appendSubstringToken(STRING_INFO, start, asciiOnly);
749 beginToken(); // $ starts here. 1067 beginToken(); // $ starts here.
750 int next = advance(); 1068 int next = advance();
751 if (identical(next, $OPEN_CURLY_BRACKET)) { 1069 if (identical(next, $OPEN_CURLY_BRACKET)) {
752 return tokenizeInterpolatedExpression(next, start); 1070 return tokenizeInterpolatedExpression(next);
753 } else { 1071 } else {
754 return tokenizeInterpolatedIdentifier(next, start); 1072 return tokenizeInterpolatedIdentifier(next);
755 } 1073 }
756 } 1074 }
757 1075
758 int tokenizeInterpolatedExpression(int next, int start) { 1076 int tokenizeInterpolatedExpression(int next) {
759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); 1077 appendBeginGroup(STRING_INTERPOLATION_INFO);
760 beginToken(); // The expression starts here. 1078 beginToken(); // The expression starts here.
761 next = advance(); 1079 next = advance(); // Move past the curly bracket.
762 while (!identical(next, $EOF) && !identical(next, $STX)) { 1080 while (!identical(next, $EOF) && !identical(next, $STX)) {
763 next = bigSwitch(next); 1081 next = bigSwitch(next);
764 } 1082 }
765 if (identical(next, $EOF)) return next; 1083 if (identical(next, $EOF)) return next;
766 next = advance(); 1084 next = advance(); // Move past the $STX.
767 beginToken(); // The string interpolation suffix starts here. 1085 beginToken(); // The string interpolation suffix starts here.
768 return next; 1086 return next;
769 } 1087 }
770 1088
771 int tokenizeInterpolatedIdentifier(int next, int start) { 1089 int tokenizeInterpolatedIdentifier(int next) {
772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); 1090 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);
773 beginToken(); // The identifier starts here. 1091 beginToken(); // The identifier starts here.
774 next = tokenizeKeywordOrIdentifier(next, false); 1092 next = tokenizeKeywordOrIdentifier(next, false);
775 beginToken(); // The string interpolation suffix starts here. 1093 beginToken(); // The string interpolation suffix starts here.
776 return next; 1094 return next;
777 } 1095 }
778 1096
779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { 1097 int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
780 next = advance(); 1098 bool asciiOnly = true;
1099 next = advance(); // Advance past the quote
781 while (next != $EOF) { 1100 while (next != $EOF) {
782 if (identical(next, quoteChar)) { 1101 if (identical(next, quoteChar)) {
783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1102 if (!asciiOnly) handleUnicode(start);
784 return advance(); 1103 next = advance();
1104 appendSubstringToken(STRING_INFO, start, asciiOnly);
1105 return next;
785 } else if (identical(next, $LF) || identical(next, $CR)) { 1106 } else if (identical(next, $LF) || identical(next, $CR)) {
786 return error(const SourceString("unterminated string literal")); 1107 if (!asciiOnly) handleUnicode(start);
1108 return error("unterminated string literal");
1109 } else if (next > 127) {
1110 asciiOnly = false;
787 } 1111 }
788 next = advance(); 1112 next = advance();
789 } 1113 }
790 return error(const SourceString("unterminated string literal")); 1114 if (!asciiOnly) handleUnicode(start);
1115 return error("unterminated string literal");
791 } 1116 }
792 1117
793 int tokenizeMultiLineRawString(int quoteChar, int start) { 1118 int tokenizeMultiLineRawString(int quoteChar, int start) {
794 int next = advance(); 1119 bool asciiOnlyString = true;
1120 bool asciiOnlyLine = true;
1121 int unicodeStart = start;
1122 int next = advance(); // Advance past the (last) quote (of three)
795 outer: while (!identical(next, $EOF)) { 1123 outer: while (!identical(next, $EOF)) {
796 while (!identical(next, quoteChar)) { 1124 while (!identical(next, quoteChar)) {
1125 if (identical(next, $LF)) {
1126 if (!asciiOnlyLine) {
1127 // Synchronize the string offset in the utf8 scanner.
1128 handleUnicode(unicodeStart);
1129 asciiOnlyLine = true;
1130 unicodeStart = scanOffset;
1131 }
1132 lineFeedInMultiline();
1133 } else if (next > 127) {
1134 asciiOnlyLine = false;
1135 asciiOnlyString = false;
1136 }
797 next = advance(); 1137 next = advance();
798 if (identical(next, $EOF)) break outer; 1138 if (identical(next, $EOF)) break outer;
799 } 1139 }
800 next = advance(); 1140 next = advance();
801 if (identical(next, quoteChar)) { 1141 if (identical(next, quoteChar)) {
802 next = advance(); 1142 next = advance();
803 if (identical(next, quoteChar)) { 1143 if (identical(next, quoteChar)) {
804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1144 if (!asciiOnlyLine) handleUnicode(unicodeStart);
805 return advance(); 1145 next = advance();
1146 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
1147 return next;
806 } 1148 }
807 } 1149 }
808 } 1150 }
809 return error(const SourceString("unterminated string literal")); 1151 if (!asciiOnlyLine) handleUnicode(unicodeStart);
1152 return error("unterminated string literal");
810 } 1153 }
811 1154
812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { 1155 int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); 1156 if (raw) return tokenizeMultiLineRawString(quoteChar, start);
814 int next = advance(); 1157 bool asciiOnlyString = true;
1158 bool asciiOnlyLine = true;
1159 int unicodeStart = start;
1160 int next = advance(); // Advance past the (last) quote (of three).
815 while (!identical(next, $EOF)) { 1161 while (!identical(next, $EOF)) {
816 if (identical(next, $$)) { 1162 if (identical(next, $$)) {
817 next = tokenizeStringInterpolation(start); 1163 if (!asciiOnlyLine) handleUnicode(unicodeStart);
818 start = byteOffset; 1164 next = tokenizeStringInterpolation(start, asciiOnlyString);
1165 start = scanOffset;
1166 unicodeStart = start;
1167 asciiOnlyString = true; // A new string token is created for the rest.
1168 asciiOnlyLine = true;
819 continue; 1169 continue;
820 } 1170 }
821 if (identical(next, quoteChar)) { 1171 if (identical(next, quoteChar)) {
822 next = advance(); 1172 next = advance();
823 if (identical(next, quoteChar)) { 1173 if (identical(next, quoteChar)) {
824 next = advance(); 1174 next = advance();
825 if (identical(next, quoteChar)) { 1175 if (identical(next, quoteChar)) {
826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); 1176 if (!asciiOnlyLine) handleUnicode(unicodeStart);
827 return advance(); 1177 next = advance();
1178 appendSubstringToken(STRING_INFO, start, asciiOnlyString);
1179 return next;
828 } 1180 }
829 } 1181 }
830 continue; 1182 continue;
831 } 1183 }
832 if (identical(next, $BACKSLASH)) { 1184 if (identical(next, $BACKSLASH)) {
833 next = advance(); 1185 next = advance();
834 if (identical(next, $EOF)) break; 1186 if (identical(next, $EOF)) break;
835 } 1187 }
1188 if (identical(next, $LF)) {
1189 if (!asciiOnlyLine) {
1190 // Synchronize the string offset in the utf8 scanner.
1191 handleUnicode(unicodeStart);
1192 asciiOnlyLine = true;
1193 unicodeStart = scanOffset;
1194 }
1195 lineFeedInMultiline();
1196 } else if (next > 127) {
1197 asciiOnlyString = false;
1198 asciiOnlyLine = false;
1199 }
836 next = advance(); 1200 next = advance();
837 } 1201 }
838 return error(const SourceString("unterminated string literal")); 1202 if (!asciiOnlyLine) handleUnicode(unicodeStart);
1203 return error("unterminated string literal");
839 } 1204 }
840 1205
841 int error(SourceString message) { 1206 int error(String message) {
842 appendByteStringToken(BAD_INPUT_INFO, message); 1207 appendStringToken(BAD_INPUT_INFO, message);
843 return advance(); // Ensure progress. 1208 return advance(); // Ensure progress.
844 } 1209 }
1210
1211 void unmatchedBeginGroup(BeginGroupToken begin) {
1212 String error = 'unmatched "${begin.stringValue}"';
1213 Token close =
1214 new StringToken.fromString(
1215 BAD_INPUT_INFO, error, begin.charOffset, true);
1216
1217 // We want to ensure that unmatched BeginGroupTokens are reported
1218 // as errors. However, the rest of the parser assumes the groups
1219 // are well-balanced and will never look at the endGroup
1220 // token. This is a nice property that allows us to skip quickly
1221 // over correct code. By inserting an additional error token in
1222 // the stream, we can keep ignoring endGroup tokens.
1223 //
1224 // [begin] --next--> [tail]
1225 // [begin] --endG--> [close] --next--> [next] --next--> [tail]
1226 //
1227 // This allows the parser to skip from [begin] via endGroup to [close] and
1228 // ignore the [close] token (assuming it's correct), then the error will be
1229 // reported when parsing the [next] token.
1230
1231 Token next = new StringToken.fromString(
1232 BAD_INPUT_INFO, error, begin.charOffset, true);
1233 begin.endGroup = close;
1234 close.next = next;
1235 next.next = begin.next;
1236 }
845 } 1237 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698