OLD | NEW |
---|---|
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of scanner; | 5 part of scanner; |
6 | 6 |
7 abstract class Scanner { | 7 abstract class Scanner { |
8 Token tokenize(); | 8 Token tokenize(); |
9 | |
10 factory Scanner(SourceFile file, {bool includeComments: false}) { | |
11 if (file is Utf8BytesSourceFile) { | |
12 return new Utf8BytesScanner(file, includeComments: includeComments); | |
13 } else { | |
14 return new StringScanner(file, includeComments: includeComments); | |
15 } | |
16 } | |
9 } | 17 } |
10 | 18 |
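For orientation, here is a minimal sketch of driving the scanner once the factory above is removed. The StringScanner constructor shape is assumed from the deleted factory, and the Token members (info, next, charOffset) are assumed from their uses later in this file; this is an illustration, not part of the patch.

// Hedged usage sketch; constructor and Token members are assumptions.
void scanAndPrint(SourceFile file) {
  Scanner scanner = new StringScanner(file, includeComments: true);
  Token token = scanner.tokenize();
  for (Token t = token; !identical(t.info, EOF_INFO); t = t.next) {
    print('${t.charOffset}: $t');
  }
}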
11 /** | 19 abstract class AbstractScanner implements Scanner { |
12 * Common base class for a Dart scanner. | 20 final bool includeComments; |
13 */ | |
14 abstract class AbstractScanner<T extends SourceString> implements Scanner { | |
15 int advance(); | |
16 int nextByte(); | |
17 | 21 |
18 /** | 22 /** |
19 * Returns the current character or byte depending on the underlying input | 23 * The string offset for the next token that will be created. |
20 * kind. For example, [StringScanner] operates on [String] and thus returns | 24 * |
21 * characters (Unicode codepoints represented as int) whereas | 25 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values |
22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. | 26 * are different. One string character can be encoded using multiple UTF-8 |
27 * bytes. | |
28 */ | |
29 int tokenStart = -1; | |
30 | |
31 /** | |
32 * A pointer to the token stream created by this scanner. The first token | |
33 * is a special token and not part of the source file. This is an | |
34 * implementation detail to avoid special cases in the scanner. This token | |
35 * is not exposed to clients of the scanner, which are expected to invoke | |
36 * [firstToken] to access the token stream. | |
37 */ | |
38 final Token tokens = new SymbolToken(EOF_INFO, -1); | |
39 | |
40 /** | |
41 * A pointer to the last scanned token. | |
42 */ | |
43 Token tail; | |
44 | |
45 /** | |
46 * The source file that is being scanned. This field can be [:null:]. | |
47 * If the source file is available, the scanner assigns its [:lineStarts:] and | |
48 * [:length:] fields at the end of [tokenize]. | |
49 */ | |
50 final SourceFile file; | |
51 | |
52 final List<int> lineStarts = [0]; | |
ngeoffray
2013/10/18 10:19:37
<int>[0]
lukas
2013/10/24 16:48:36
Done.
| |
53 | |
54 AbstractScanner(this.file, this.includeComments) { | |
55 this.tail = this.tokens; | |
56 } | |
57 | |
58 | |
ngeoffray
2013/10/18 10:19:37
Extra line.
lukas
2013/10/24 16:48:36
Done.
| |
59 /** | |
60 * Advances and returns the next character. | |
61 * | |
62 * If the next character is non-ASCII, then the returned value depends on the | |
63 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while | |
64 * the [StringScanner] returns a UTF-16 code unit. | |
65 * | |
66 * The scanner ensures that [advance] is not invoked after it returned [$EOF]. | |
67 * This allows implementations to omit bound checks if the data structure ends | |
68 * with '0'. | |
69 */ | |
70 int advance(); | |
71 | |
72 /** | |
73 * Returns the current unicode character. | |
74 * | |
75 * If the current character is ASCII, then it is returned unchanged. | |
76 * | |
77 * The [Utf8BytesScanner] decodes the next unicode code point starting at the | |
78 * current position. Note that every unicode character is returned as a single | |
79 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following | |
80 * [advance] returns the next character. | |
81 * | |
82 * The [StringScanner] returns the current character unchanged, which might | |
83 * be a surrogate character. In the case of '\u{1d11e}', it returns the first | |
84 * code unit 55348, and the following [advance] returns the second code unit | |
85 * 56606. | |
86 * | |
87 * Invoking [currentAsUnicode] multiple times is safe, i.e., | |
ngeoffray
2013/10/18 10:19:37
i.e. -> that is
lukas
2013/10/24 16:48:36
Done.
| |
88 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):]. | |
89 */ | |
90 int currentAsUnicode(int next); | |
91 | |
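The '\u{1d11e}' example in the doc comment above can be checked with a standalone snippet (plain Dart, not scanner code), showing the two views the comment describes:

// Standalone illustration of the currentAsUnicode doc comment.
void main() {
  String s = '\u{1d11e}';
  print(s.codeUnits);      // [55348, 56606] -- UTF-16 code units, the StringScanner view
  print(s.runes.toList()); // [119070]       -- one code point, the Utf8BytesScanner view
}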
92 /** | |
93 * Returns the character at the next position. Like in [advance], the | |
94 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns | |
95 * a UTF-16 code unit. | |
23 */ | 96 */ |
24 int peek(); | 97 int peek(); |
25 | 98 |
26 /** | 99 /** |
27 * Appends a fixed token based on whether the current char is [choice] or not. | 100 * Notifies the scanner that unicode characters were detected in either a |
28 * If the current char is [choice] a fixed token whose kind and content | 101 * comment or a string literal between [startScanOffset] and the current |
29 * is determined by [yes] is appended, otherwise a fixed token whose kind | 102 * scan offset. |
30 * and content is determined by [no] is appended. | |
31 */ | 103 */ |
104 void handleUnicode(int startScanOffset); | |
105 | |
106 /** | |
107 * Returns the current scan offset. | |
108 * | |
109 * In the [Utf8BytesScanner] this is the offset into the byte list, in the | |
110 * [StringScanner] the offset in the source string. | |
111 */ | |
112 int get scanOffset; | |
113 | |
114 /** | |
115 * Returns the current string offset. | |
116 * | |
117 * In the [StringScanner] this is identical to the [scanOffset]. In the | |
118 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters. | |
119 */ | |
120 int get stringOffset; | |
121 | |
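To make the scanOffset/stringOffset distinction concrete, a standalone sketch; the dart:convert import is an assumption about the surrounding SDK, not something this file uses:

import 'dart:convert' show UTF8;

void main() {
  String s = 'é';               // one string character
  print(UTF8.encode(s).length); // 2 -- UTF-8 bytes, what scanOffset advances over
  print(s.length);              // 1 -- string characters, what stringOffset reports
}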
122 /** | |
123 * Returns the first token scanned by this [Scanner]. | |
124 */ | |
125 Token firstToken(); | |
126 | |
127 /** | |
128 * Returns the last token scanned by this [Scanner]. | |
129 */ | |
130 Token previousToken(); | |
131 | |
132 /** | |
133 * Notifies that a new token starts at current offset. | |
134 */ | |
135 void beginToken() { | |
136 tokenStart = stringOffset; | |
137 } | |
138 | |
139 /** | |
140 * Appends a substring from the scan offset [:start:] to the current | |
141 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current | |
142 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the | |
143 * substring [5,9). | |
144 * | |
145 * Note that [extraOffset] can only be used if the covered character(s) are | |
146 * known to be ASCII. | |
147 */ | |
148 void appendSubstringToken(PrecedenceInfo info, int start, | |
149 bool asciiOnly, [int extraOffset]); | |
150 | |
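A standalone sketch of the offset arithmetic described in the doc comment above; coveredText is a hypothetical helper, not part of the scanner:

// Hypothetical helper showing which characters a call like
// appendSubstringToken(info, start, true, extraOffset) would cover.
String coveredText(String source, int start, int scanOffset, [int extraOffset = 0]) {
  return source.substring(start, scanOffset + extraOffset);
}

void main() {
  String source = '0123456789abc';
  print(coveredText(source, 5, 10));     // '56789' -- offsets [5,10)
  print(coveredText(source, 5, 10, -1)); // '5678'  -- offsets [5,9), as in the example
}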
151 /** Documentation in subclass [ArrayBasedScanner]. */ | |
152 void appendStringToken(PrecedenceInfo info, String value); | |
153 | |
154 /** Documentation in subclass [ArrayBasedScanner]. */ | |
155 void appendPrecedenceToken(PrecedenceInfo info); | |
156 | |
157 /** Documentation in subclass [ArrayBasedScanner]. */ | |
32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); | 158 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); |
33 | 159 |
34 /** | 160 /** Documentation in subclass [ArrayBasedScanner]. */ |
35 * Appends a fixed token whose kind and content is determined by [info]. | 161 void appendKeywordToken(Keyword keyword); |
36 */ | |
37 void appendPrecedenceToken(PrecedenceInfo info); | |
38 | 162 |
39 /** | 163 /** Documentation in subclass [ArrayBasedScanner]. */ |
40 * Appends a token whose kind is determined by [info] and content is [value]. | |
41 */ | |
42 void appendStringToken(PrecedenceInfo info, String value); | |
43 | |
44 /** | |
45 * Appends a token whose kind is determined by [info] and content is defined | |
46 * by the SourceString [value]. | |
47 */ | |
48 void appendByteStringToken(PrecedenceInfo info, T value); | |
49 | |
50 /** | |
51 * Appends a keyword token whose kind is determined by [keyword]. | |
52 */ | |
53 void appendKeywordToken(Keyword keyword); | |
54 void appendWhiteSpace(int next); | |
55 void appendEofToken(); | 164 void appendEofToken(); |
56 | 165 |
57 /** | 166 /** Documentation in subclass [ArrayBasedScanner]. */ |
ngeoffray
2013/10/18 10:19:37
So do the following methods only apply to the Arra
lukas
2013/10/24 16:48:36
I just decided to put the documentation together w
| |
58 * Creates an ASCII SourceString whose content begins at the source byte | 167 void appendWhiteSpace(int next); |
59 * offset [start] and ends at [offset] bytes from the current byte offset of | |
60 * the scanner. For example, if the current byte offset is 10, | |
61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found | |
62 * at the [0,9[ byte interval of the source text. | |
63 */ | |
64 T asciiString(int start, int offset); | |
65 T utf8String(int start, int offset); | |
66 Token firstToken(); | |
67 Token previousToken(); | |
68 void beginToken(); | |
69 void addToCharOffset(int offset); | |
70 int get charOffset; | |
71 int get byteOffset; | |
72 void appendBeginGroup(PrecedenceInfo info, String value); | |
73 int appendEndGroup(PrecedenceInfo info, String value, int openKind); | |
74 void appendGt(PrecedenceInfo info, String value); | |
75 void appendGtGt(PrecedenceInfo info, String value); | |
76 void appendGtGtGt(PrecedenceInfo info, String value); | |
77 void appendComment(); | |
78 | 168 |
79 /** | 169 /** Documentation in subclass [ArrayBasedScanner]. */ |
80 * We call this method to discard '<' from the "grouping" stack | 170 void lineFeedInMultiline(); |
81 * (maintained by subclasses). | 171 |
82 * | 172 /** Documentation in subclass [ArrayBasedScanner]. */ |
83 * [PartialParser.skipExpression] relies on the fact that we do not | 173 void appendBeginGroup(PrecedenceInfo info); |
84 * create groups for stuff like: | 174 |
85 * [:a = b < c, d = e > f:]. | 175 /** Documentation in subclass [ArrayBasedScanner]. */ |
86 * | 176 int appendEndGroup(PrecedenceInfo info, int openKind); |
87 * In other words, this method is called when the scanner recognizes | 177 |
88 * something which cannot possibly be part of a type | 178 /** Documentation in subclass [ArrayBasedScanner]. */ |
89 * parameter/argument list. | 179 void appendGt(PrecedenceInfo info); |
90 */ | 180 |
181 /** Documentation in subclass [ArrayBasedScanner]. */ | |
182 void appendGtGt(PrecedenceInfo info); | |
183 | |
184 /** Documentation in subclass [ArrayBasedScanner]. */ | |
185 void appendComment(start, bool asciiOnly); | |
186 | |
187 /** Documentation in subclass [ArrayBasedScanner]. */ | |
91 void discardOpenLt(); | 188 void discardOpenLt(); |
92 | 189 |
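A much-simplified model of the '<' grouping stack described above; the real stack lives in ArrayBasedScanner, so this only illustrates why tokens such as '=' and ';' discard the candidate group:

// Simplified model only; not the real ArrayBasedScanner state.
void main() {
  List<String> openLt = [];
  // Scanning "a = b < c, d = e > f":
  openLt.add('<');                          // tokenizeLessThan pushes a candidate group.
  if (!openLt.isEmpty) openLt.removeLast(); // the '=' in "d = e" calls discardOpenLt().
  // The later '>' is therefore scanned as a plain operator, not as an end group.
  print(openLt); // []
}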
93 // TODO(ahe): Move this class to implementation. | 190 // TODO(ahe): Move this class to implementation. |
94 | 191 |
95 Token tokenize() { | 192 Token tokenize() { |
96 int next = advance(); | 193 int next = advance(); |
97 while (!identical(next, $EOF)) { | 194 while (!identical(next, $EOF)) { |
98 next = bigSwitch(next); | 195 next = bigSwitch(next); |
99 } | 196 } |
100 appendEofToken(); | 197 appendEofToken(); |
198 | |
199 if (file != null) { | |
200 file.length = stringOffset; | |
201 // One additional line start at the end, see [SourceFile.lineStarts]. | |
202 lineStarts.add(stringOffset + 1); | |
203 file.lineStarts = lineStarts; | |
204 } | |
205 | |
101 return firstToken(); | 206 return firstToken(); |
102 } | 207 } |
103 | 208 |
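The lineStarts list filled in at the end of tokenize() is consumed by SourceFile elsewhere; a standalone sketch of the kind of lookup it enables (helper name hypothetical, linear scan for clarity):

// Hypothetical helper: maps a string offset to a 0-based line number
// using a lineStarts table like the one built above.
int lineOf(List<int> lineStarts, int offset) {
  int line = 0;
  while (line + 1 < lineStarts.length && lineStarts[line + 1] <= offset) {
    line++;
  }
  return line;
}

void main() {
  List<int> lineStarts = [0, 10, 25]; // line 0 starts at offset 0, line 1 at 10, ...
  print(lineOf(lineStarts, 3));  // 0
  print(lineOf(lineStarts, 12)); // 1
}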
104 int bigSwitch(int next) { | 209 int bigSwitch(int next) { |
105 beginToken(); | 210 beginToken(); |
106 if (identical(next, $SPACE) || identical(next, $TAB) | 211 if (identical(next, $SPACE) || identical(next, $TAB) |
107 || identical(next, $LF) || identical(next, $CR)) { | 212 || identical(next, $LF) || identical(next, $CR)) { |
108 appendWhiteSpace(next); | 213 appendWhiteSpace(next); |
109 next = advance(); | 214 next = advance(); |
215 // Sequences of spaces are common, so advance through them fast. | |
110 while (identical(next, $SPACE)) { | 216 while (identical(next, $SPACE)) { |
111 appendWhiteSpace(next); | 217 // We don't invoke [:appendWhiteSpace(next):] here for efficiency, |
218 // assuming that it does not do anything for space characters. | |
112 next = advance(); | 219 next = advance(); |
113 } | 220 } |
114 return next; | 221 return next; |
115 } | 222 } |
116 | 223 |
117 if ($a <= next && next <= $z) { | 224 if ($a <= next && next <= $z) { |
118 if (identical($r, next)) { | 225 if (identical($r, next)) { |
119 return tokenizeRawStringKeywordOrIdentifier(next); | 226 return tokenizeRawStringKeywordOrIdentifier(next); |
120 } | 227 } |
121 return tokenizeKeywordOrIdentifier(next, true); | 228 return tokenizeKeywordOrIdentifier(next, true); |
122 } | 229 } |
123 | 230 |
124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$)) { | 231 if (($A <= next && next <= $Z) ||
125 return tokenizeIdentifier(next, byteOffset, true); | 232 identical(next, $_) || |
233 identical(next, $$)) { | |
234 return tokenizeIdentifier(next, scanOffset, true); | |
126 } | 235 } |
127 | 236 |
128 if (identical(next, $LT)) { | 237 if (identical(next, $LT)) { |
129 return tokenizeLessThan(next); | 238 return tokenizeLessThan(next); |
130 } | 239 } |
131 | 240 |
132 if (identical(next, $GT)) { | 241 if (identical(next, $GT)) { |
133 return tokenizeGreaterThan(next); | 242 return tokenizeGreaterThan(next); |
134 } | 243 } |
135 | 244 |
(...skipping 44 matching lines...) | |
180 if (identical(next, $BACKSLASH)) { | 289 if (identical(next, $BACKSLASH)) { |
181 appendPrecedenceToken(BACKSLASH_INFO); | 290 appendPrecedenceToken(BACKSLASH_INFO); |
182 return advance(); | 291 return advance(); |
183 } | 292 } |
184 | 293 |
185 if (identical(next, $HASH)) { | 294 if (identical(next, $HASH)) { |
186 return tokenizeTag(next); | 295 return tokenizeTag(next); |
187 } | 296 } |
188 | 297 |
189 if (identical(next, $OPEN_PAREN)) { | 298 if (identical(next, $OPEN_PAREN)) { |
190 appendBeginGroup(OPEN_PAREN_INFO, "("); | 299 appendBeginGroup(OPEN_PAREN_INFO); |
191 return advance(); | 300 return advance(); |
192 } | 301 } |
193 | 302 |
194 if (identical(next, $CLOSE_PAREN)) { | 303 if (identical(next, $CLOSE_PAREN)) { |
195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); | 304 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN); |
196 } | 305 } |
197 | 306 |
198 if (identical(next, $COMMA)) { | 307 if (identical(next, $COMMA)) { |
199 appendPrecedenceToken(COMMA_INFO); | 308 appendPrecedenceToken(COMMA_INFO); |
200 return advance(); | 309 return advance(); |
201 } | 310 } |
202 | 311 |
203 if (identical(next, $COLON)) { | 312 if (identical(next, $COLON)) { |
204 appendPrecedenceToken(COLON_INFO); | 313 appendPrecedenceToken(COLON_INFO); |
205 return advance(); | 314 return advance(); |
206 } | 315 } |
207 | 316 |
208 if (identical(next, $SEMICOLON)) { | 317 if (identical(next, $SEMICOLON)) { |
209 appendPrecedenceToken(SEMICOLON_INFO); | 318 appendPrecedenceToken(SEMICOLON_INFO); |
210 // Type parameters and arguments cannot contain semicolon. | 319 // Type parameters and arguments cannot contain semicolon. |
211 discardOpenLt(); | 320 discardOpenLt(); |
212 return advance(); | 321 return advance(); |
213 } | 322 } |
214 | 323 |
215 if (identical(next, $QUESTION)) { | 324 if (identical(next, $QUESTION)) { |
216 appendPrecedenceToken(QUESTION_INFO); | 325 appendPrecedenceToken(QUESTION_INFO); |
217 return advance(); | 326 return advance(); |
218 } | 327 } |
219 | 328 |
220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 329 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", | 330 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, |
222 OPEN_SQUARE_BRACKET_TOKEN); | 331 OPEN_SQUARE_BRACKET_TOKEN); |
223 } | 332 } |
224 | 333 |
225 if (identical(next, $BACKPING)) { | 334 if (identical(next, $BACKPING)) { |
226 appendPrecedenceToken(BACKPING_INFO); | 335 appendPrecedenceToken(BACKPING_INFO); |
227 return advance(); | 336 return advance(); |
228 } | 337 } |
229 | 338 |
230 if (identical(next, $OPEN_CURLY_BRACKET)) { | 339 if (identical(next, $OPEN_CURLY_BRACKET)) { |
231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); | 340 appendBeginGroup(OPEN_CURLY_BRACKET_INFO); |
232 return advance(); | 341 return advance(); |
233 } | 342 } |
234 | 343 |
235 if (identical(next, $CLOSE_CURLY_BRACKET)) { | 344 if (identical(next, $CLOSE_CURLY_BRACKET)) { |
236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", | 345 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, |
237 OPEN_CURLY_BRACKET_TOKEN); | 346 OPEN_CURLY_BRACKET_TOKEN); |
238 } | 347 } |
239 | 348 |
240 if (identical(next, $SLASH)) { | 349 if (identical(next, $SLASH)) { |
241 return tokenizeSlashOrComment(next); | 350 return tokenizeSlashOrComment(next); |
242 } | 351 } |
243 | 352 |
244 if (identical(next, $AT)) { | 353 if (identical(next, $AT)) { |
245 return tokenizeAt(next); | 354 return tokenizeAt(next); |
246 } | 355 } |
247 | 356 |
248 if (identical(next, $DQ) || identical(next, $SQ)) { | 357 if (identical(next, $DQ) || identical(next, $SQ)) { |
249 return tokenizeString(next, byteOffset, false); | 358 return tokenizeString(next, scanOffset, false); |
250 } | 359 } |
251 | 360 |
252 if (identical(next, $PERIOD)) { | 361 if (identical(next, $PERIOD)) { |
253 return tokenizeDotsOrNumber(next); | 362 return tokenizeDotsOrNumber(next); |
254 } | 363 } |
255 | 364 |
256 if (identical(next, $0)) { | 365 if (identical(next, $0)) { |
257 return tokenizeHexOrNumber(next); | 366 return tokenizeHexOrNumber(next); |
258 } | 367 } |
259 | 368 |
260 // TODO(ahe): Would a range check be faster? | 369 // TODO(ahe): Would a range check be faster? |
261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) | 370 if (identical(next, $1) || identical(next, $2) || identical(next, $3) |
262 || identical(next, $4) || identical(next, $5) || identical(next, $6) | 371 || identical(next, $4) || identical(next, $5) || identical(next, $6) |
263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { | 372 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { |
264 return tokenizeNumber(next); | 373 return tokenizeNumber(next); |
265 } | 374 } |
266 | 375 |
267 if (identical(next, $EOF)) { | 376 if (identical(next, $EOF)) { |
268 return $EOF; | 377 return $EOF; |
269 } | 378 } |
270 if (next < 0x1f) { | 379 if (next < 0x1f) { |
271 return error(new SourceString("unexpected character $next")); | 380 return error("unexpected character $next"); |
272 } | 381 } |
273 | 382 |
383 next = currentAsUnicode(next); | |
384 | |
274 // The following are non-ASCII characters. | 385 // The following are non-ASCII characters. |
275 | 386 |
276 if (identical(next, $NBSP)) { | 387 if (identical(next, $NBSP)) { |
277 appendWhiteSpace(next); | 388 appendWhiteSpace(next); |
278 return advance(); | 389 return advance(); |
279 } | 390 } |
280 | 391 |
281 return tokenizeIdentifier(next, byteOffset, true); | 392 return error("unexpected unicode character $next"); |
282 } | 393 } |
283 | 394 |
284 int tokenizeTag(int next) { | 395 int tokenizeTag(int next) { |
285 // # or #!.*[\n\r] | 396 // # or #!.*[\n\r] |
286 if (byteOffset == 0) { | 397 if (scanOffset == 0) { |
287 if (identical(peek(), $BANG)) { | 398 if (identical(peek(), $BANG)) { |
399 int start = scanOffset + 1; | |
400 bool asciiOnly = true; | |
288 do { | 401 do { |
289 next = advance(); | 402 next = advance(); |
290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(next, $EOF)); | 403 if (next > 127) asciiOnly = false;
404 } while (!identical(next, $LF) && | |
405 !identical(next, $CR) && | |
406 !identical(next, $EOF)); | |
407 if (!asciiOnly) handleUnicode(start); | |
291 return next; | 408 return next; |
292 } | 409 } |
293 } | 410 } |
294 appendPrecedenceToken(HASH_INFO); | 411 appendPrecedenceToken(HASH_INFO); |
295 return advance(); | 412 return advance(); |
296 } | 413 } |
297 | 414 |
298 int tokenizeTilde(int next) { | 415 int tokenizeTilde(int next) { |
299 // ~ ~/ ~/= | 416 // ~ ~/ ~/= |
300 next = advance(); | 417 next = advance(); |
301 if (identical(next, $SLASH)) { | 418 if (identical(next, $SLASH)) { |
302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); | 419 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); |
303 } else { | 420 } else { |
304 appendPrecedenceToken(TILDE_INFO); | 421 appendPrecedenceToken(TILDE_INFO); |
305 return next; | 422 return next; |
306 } | 423 } |
307 } | 424 } |
308 | 425 |
309 int tokenizeOpenSquareBracket(int next) { | 426 int tokenizeOpenSquareBracket(int next) { |
310 // [ [] []= | 427 // [ [] []= |
311 next = advance(); | 428 next = advance(); |
312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 429 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
313 Token token = previousToken(); | 430 Token token = previousToken(); |
314 if (token is KeywordToken && identical(token.value.stringValue, 'operator')) { | 431 if (token is KeywordToken &&
432 identical((token as KeywordToken).keyword.syntax, 'operator')) { | |
315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); | 433 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); |
316 } | 434 } |
317 } | 435 } |
318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); | 436 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO); |
319 return next; | 437 return next; |
320 } | 438 } |
321 | 439 |
322 int tokenizeCaret(int next) { | 440 int tokenizeCaret(int next) { |
323 // ^ ^= | 441 // ^ ^= |
324 return select($EQ, CARET_EQ_INFO, CARET_INFO); | 442 return select($EQ, CARET_EQ_INFO, CARET_INFO); |
325 } | 443 } |
326 | 444 |
327 int tokenizeBar(int next) { | 445 int tokenizeBar(int next) { |
328 // | || |= | 446 // | || |= |
(...skipping 43 matching lines...) | |
372 return advance(); | 490 return advance(); |
373 } else if (identical(next, $EQ)) { | 491 } else if (identical(next, $EQ)) { |
374 appendPrecedenceToken(MINUS_EQ_INFO); | 492 appendPrecedenceToken(MINUS_EQ_INFO); |
375 return advance(); | 493 return advance(); |
376 } else { | 494 } else { |
377 appendPrecedenceToken(MINUS_INFO); | 495 appendPrecedenceToken(MINUS_INFO); |
378 return next; | 496 return next; |
379 } | 497 } |
380 } | 498 } |
381 | 499 |
382 | |
383 int tokenizePlus(int next) { | 500 int tokenizePlus(int next) { |
384 // + ++ += | 501 // + ++ += |
385 next = advance(); | 502 next = advance(); |
386 if (identical($PLUS, next)) { | 503 if (identical($PLUS, next)) { |
387 appendPrecedenceToken(PLUS_PLUS_INFO); | 504 appendPrecedenceToken(PLUS_PLUS_INFO); |
388 return advance(); | 505 return advance(); |
389 } else if (identical($EQ, next)) { | 506 } else if (identical($EQ, next)) { |
390 appendPrecedenceToken(PLUS_EQ_INFO); | 507 appendPrecedenceToken(PLUS_EQ_INFO); |
391 return advance(); | 508 return advance(); |
392 } else { | 509 } else { |
393 appendPrecedenceToken(PLUS_INFO); | 510 appendPrecedenceToken(PLUS_INFO); |
394 return next; | 511 return next; |
395 } | 512 } |
396 } | 513 } |
397 | 514 |
398 int tokenizeExclamation(int next) { | 515 int tokenizeExclamation(int next) { |
399 // ! != !== | 516 // ! != |
517 // !== is kept for user-friendly error reporting. | |
518 | |
400 next = advance(); | 519 next = advance(); |
401 if (identical(next, $EQ)) { | 520 if (identical(next, $EQ)) { |
402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); | 521 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); |
403 } | 522 } |
404 appendPrecedenceToken(BANG_INFO); | 523 appendPrecedenceToken(BANG_INFO); |
405 return next; | 524 return next; |
406 } | 525 } |
407 | 526 |
408 int tokenizeEquals(int next) { | 527 int tokenizeEquals(int next) { |
409 // = == === | 528 // = == => |
529 // === is kept for user-friendly error reporting. | |
410 | 530 |
411 // Type parameters and arguments cannot contain any token that | 531 // Type parameters and arguments cannot contain any token that |
412 // starts with '='. | 532 // starts with '='. |
413 discardOpenLt(); | 533 discardOpenLt(); |
414 | 534 |
415 next = advance(); | 535 next = advance(); |
416 if (identical(next, $EQ)) { | 536 if (identical(next, $EQ)) { |
417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); | 537 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); |
418 } else if (identical(next, $GT)) { | 538 } else if (identical(next, $GT)) { |
419 appendPrecedenceToken(FUNCTION_INFO); | 539 appendPrecedenceToken(FUNCTION_INFO); |
420 return advance(); | 540 return advance(); |
421 } | 541 } |
422 appendPrecedenceToken(EQ_INFO); | 542 appendPrecedenceToken(EQ_INFO); |
423 return next; | 543 return next; |
424 } | 544 } |
425 | 545 |
426 int tokenizeGreaterThan(int next) { | 546 int tokenizeGreaterThan(int next) { |
427 // > >= >> >>= >>> >>>= | 547 // > >= >> >>= |
428 next = advance(); | 548 next = advance(); |
429 if (identical($EQ, next)) { | 549 if (identical($EQ, next)) { |
430 appendPrecedenceToken(GT_EQ_INFO); | 550 appendPrecedenceToken(GT_EQ_INFO); |
431 return advance(); | 551 return advance(); |
432 } else if (identical($GT, next)) { | 552 } else if (identical($GT, next)) { |
433 next = advance(); | 553 next = advance(); |
434 if (identical($EQ, next)) { | 554 if (identical($EQ, next)) { |
435 appendPrecedenceToken(GT_GT_EQ_INFO); | 555 appendPrecedenceToken(GT_GT_EQ_INFO); |
436 return advance(); | 556 return advance(); |
437 } else { | 557 } else { |
438 appendGtGt(GT_GT_INFO, ">>"); | 558 appendGtGt(GT_GT_INFO); |
439 return next; | 559 return next; |
440 } | 560 } |
441 } else { | 561 } else { |
442 appendGt(GT_INFO, ">"); | 562 appendGt(GT_INFO); |
443 return next; | 563 return next; |
444 } | 564 } |
445 } | 565 } |
446 | 566 |
447 int tokenizeLessThan(int next) { | 567 int tokenizeLessThan(int next) { |
448 // < <= << <<= | 568 // < <= << <<= |
449 next = advance(); | 569 next = advance(); |
450 if (identical($EQ, next)) { | 570 if (identical($EQ, next)) { |
451 appendPrecedenceToken(LT_EQ_INFO); | 571 appendPrecedenceToken(LT_EQ_INFO); |
452 return advance(); | 572 return advance(); |
453 } else if (identical($LT, next)) { | 573 } else if (identical($LT, next)) { |
454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); | 574 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); |
455 } else { | 575 } else { |
456 appendBeginGroup(LT_INFO, "<"); | 576 appendBeginGroup(LT_INFO); |
457 return next; | 577 return next; |
458 } | 578 } |
459 } | 579 } |
460 | 580 |
461 int tokenizeNumber(int next) { | 581 int tokenizeNumber(int next) { |
462 int start = byteOffset; | 582 int start = scanOffset; |
463 while (true) { | 583 while (true) { |
464 next = advance(); | 584 next = advance(); |
465 if ($0 <= next && next <= $9) { | 585 if ($0 <= next && next <= $9) { |
466 continue; | 586 continue; |
467 } else if (identical(next, $e) || identical(next, $E)) { | 587 } else if (identical(next, $e) || identical(next, $E)) { |
468 return tokenizeFractionPart(next, start); | 588 return tokenizeFractionPart(next, start); |
469 } else { | 589 } else { |
470 if (identical(next, $PERIOD)) { | 590 if (identical(next, $PERIOD)) { |
471 int nextnext = peek(); | 591 int nextnext = peek(); |
472 if ($0 <= nextnext && nextnext <= $9) { | 592 if ($0 <= nextnext && nextnext <= $9) { |
473 return tokenizeFractionPart(advance(), start); | 593 return tokenizeFractionPart(advance(), start); |
474 } | 594 } |
475 } | 595 } |
476 appendByteStringToken(INT_INFO, asciiString(start, 0)); | 596 appendSubstringToken(INT_INFO, start, true); |
477 return next; | 597 return next; |
478 } | 598 } |
479 } | 599 } |
480 } | 600 } |
481 | 601 |
482 int tokenizeHexOrNumber(int next) { | 602 int tokenizeHexOrNumber(int next) { |
483 int x = peek(); | 603 int x = peek(); |
484 if (identical(x, $x) || identical(x, $X)) { | 604 if (identical(x, $x) || identical(x, $X)) { |
485 advance(); | 605 return tokenizeHex(next); |
486 return tokenizeHex(x); | |
487 } | 606 } |
488 return tokenizeNumber(next); | 607 return tokenizeNumber(next); |
489 } | 608 } |
490 | 609 |
491 int tokenizeHex(int next) { | 610 int tokenizeHex(int next) { |
492 int start = byteOffset - 1; | 611 int start = scanOffset; |
612 next = advance(); // Advance past the $x or $X. | |
493 bool hasDigits = false; | 613 bool hasDigits = false; |
494 while (true) { | 614 while (true) { |
495 next = advance(); | 615 next = advance(); |
496 if (($0 <= next && next <= $9) | 616 if (($0 <= next && next <= $9) |
497 || ($A <= next && next <= $F) | 617 || ($A <= next && next <= $F) |
498 || ($a <= next && next <= $f)) { | 618 || ($a <= next && next <= $f)) { |
499 hasDigits = true; | 619 hasDigits = true; |
500 } else { | 620 } else { |
501 if (!hasDigits) { | 621 if (!hasDigits) { |
502 return error(const SourceString("hex digit expected")); | 622 return error("hex digit expected"); |
503 } | 623 } |
504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); | 624 appendSubstringToken(HEXADECIMAL_INFO, start, true); |
505 return next; | 625 return next; |
506 } | 626 } |
507 } | 627 } |
508 } | 628 } |
509 | 629 |
510 int tokenizeDotsOrNumber(int next) { | 630 int tokenizeDotsOrNumber(int next) { |
511 int start = byteOffset; | 631 int start = scanOffset; |
512 next = advance(); | 632 next = advance(); |
513 if (($0 <= next && next <= $9)) { | 633 if (($0 <= next && next <= $9)) { |
514 return tokenizeFractionPart(next, start); | 634 return tokenizeFractionPart(next, start); |
515 } else if (identical($PERIOD, next)) { | 635 } else if (identical($PERIOD, next)) { |
516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 636 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
517 } else { | 637 } else { |
518 appendPrecedenceToken(PERIOD_INFO); | 638 appendPrecedenceToken(PERIOD_INFO); |
519 return next; | 639 return next; |
520 } | 640 } |
521 } | 641 } |
522 | 642 |
523 int tokenizeFractionPart(int next, int start) { | 643 int tokenizeFractionPart(int next, int start) { |
524 bool done = false; | 644 bool done = false; |
525 bool hasDigit = false; | 645 bool hasDigit = false; |
526 LOOP: while (!done) { | 646 LOOP: while (!done) { |
527 if ($0 <= next && next <= $9) { | 647 if ($0 <= next && next <= $9) { |
528 hasDigit = true; | 648 hasDigit = true; |
529 } else if (identical($e, next) || identical($E, next)) { | 649 } else if (identical($e, next) || identical($E, next)) { |
530 hasDigit = true; | 650 hasDigit = true; |
531 next = tokenizeExponent(advance()); | 651 next = tokenizeExponent(advance()); |
532 done = true; | 652 done = true; |
533 continue LOOP; | 653 continue LOOP; |
534 } else { | 654 } else { |
535 done = true; | 655 done = true; |
536 continue LOOP; | 656 continue LOOP; |
537 } | 657 } |
538 next = advance(); | 658 next = advance(); |
539 } | 659 } |
540 if (!hasDigit) { | 660 if (!hasDigit) { |
541 appendByteStringToken(INT_INFO, asciiString(start, -1)); | 661 // Reduce the offset; the scanner has already advanced past the period. |
662 appendSubstringToken(INT_INFO, start, true, -1); | |
663 | |
664 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because | |
665 // the scanner already advanced past the period. | |
542 if (identical($PERIOD, next)) { | 666 if (identical($PERIOD, next)) { |
543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 667 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
544 } | 668 } |
545 // TODO(ahe): Wrong offset for the period. | |
546 appendPrecedenceToken(PERIOD_INFO); | 669 appendPrecedenceToken(PERIOD_INFO); |
547 return bigSwitch(next); | 670 return next; |
548 } | 671 } |
549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); | 672 appendSubstringToken(DOUBLE_INFO, start, true); |
550 return next; | 673 return next; |
551 } | 674 } |
552 | 675 |
553 int tokenizeExponent(int next) { | 676 int tokenizeExponent(int next) { |
554 if (identical(next, $PLUS) || identical(next, $MINUS)) { | 677 if (identical(next, $PLUS) || identical(next, $MINUS)) { |
555 next = advance(); | 678 next = advance(); |
556 } | 679 } |
557 bool hasDigits = false; | 680 bool hasDigits = false; |
558 while (true) { | 681 while (true) { |
559 if ($0 <= next && next <= $9) { | 682 if ($0 <= next && next <= $9) { |
560 hasDigits = true; | 683 hasDigits = true; |
561 } else { | 684 } else { |
562 if (!hasDigits) { | 685 if (!hasDigits) { |
563 return error(const SourceString("digit expected")); | 686 return error("digit expected"); |
564 } | 687 } |
565 return next; | 688 return next; |
566 } | 689 } |
567 next = advance(); | 690 next = advance(); |
568 } | 691 } |
569 } | 692 } |
570 | 693 |
571 int tokenizeSlashOrComment(int next) { | 694 int tokenizeSlashOrComment(int next) { |
695 int start = scanOffset; | |
572 next = advance(); | 696 next = advance(); |
573 if (identical($STAR, next)) { | 697 if (identical($STAR, next)) { |
574 return tokenizeMultiLineComment(next); | 698 return tokenizeMultiLineComment(next, start); |
575 } else if (identical($SLASH, next)) { | 699 } else if (identical($SLASH, next)) { |
576 return tokenizeSingleLineComment(next); | 700 return tokenizeSingleLineComment(next, start); |
577 } else if (identical($EQ, next)) { | 701 } else if (identical($EQ, next)) { |
578 appendPrecedenceToken(SLASH_EQ_INFO); | 702 appendPrecedenceToken(SLASH_EQ_INFO); |
579 return advance(); | 703 return advance(); |
580 } else { | 704 } else { |
581 appendPrecedenceToken(SLASH_INFO); | 705 appendPrecedenceToken(SLASH_INFO); |
582 return next; | 706 return next; |
583 } | 707 } |
584 } | 708 } |
585 | 709 |
586 int tokenizeSingleLineComment(int next) { | 710 int tokenizeSingleLineComment(int next, int start) { |
711 bool asciiOnly = true; | |
587 while (true) { | 712 while (true) { |
588 next = advance(); | 713 next = advance(); |
589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next)) { | 714 if (next > 127) asciiOnly = false; |
590 appendComment(); | 715 if (identical($LF, next) || |
716 identical($CR, next) || | |
717 identical($EOF, next)) { | |
718 if (!asciiOnly) handleUnicode(start); | |
719 appendComment(start, asciiOnly); | |
591 return next; | 720 return next; |
592 } | 721 } |
593 } | 722 } |
594 } | 723 } |
595 | 724 |
596 int tokenizeMultiLineComment(int next) { | 725 |
726 int tokenizeMultiLineComment(int next, int start) { | |
727 bool asciiOnlyComment = true; // Track if the entire comment is ASCII. | |
728 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode. | |
729 int unicodeStart = start; | |
597 int nesting = 1; | 730 int nesting = 1; |
598 next = advance(); | 731 next = advance(); |
599 while (true) { | 732 while (true) { |
600 if (identical($EOF, next)) { | 733 if (identical($EOF, next)) { |
601 // TODO(ahe): Report error. | 734 if (!asciiOnlyLines) handleUnicode(unicodeStart); |
735 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment"); | |
602 return next; | 736 return next; |
603 } else if (identical($STAR, next)) { | 737 } else if (identical($STAR, next)) { |
604 next = advance(); | 738 next = advance(); |
605 if (identical($SLASH, next)) { | 739 if (identical($SLASH, next)) { |
606 --nesting; | 740 --nesting; |
607 if (0 == nesting) { | 741 if (0 == nesting) { |
742 if (!asciiOnlyLines) handleUnicode(unicodeStart); | |
608 next = advance(); | 743 next = advance(); |
609 appendComment(); | 744 appendComment(start, asciiOnlyComment); |
610 return next; | 745 return next; |
611 } else { | 746 } else { |
612 next = advance(); | 747 next = advance(); |
613 } | 748 } |
614 } | 749 } |
615 } else if (identical($SLASH, next)) { | 750 } else if (identical($SLASH, next)) { |
616 next = advance(); | 751 next = advance(); |
617 if (identical($STAR, next)) { | 752 if (identical($STAR, next)) { |
618 next = advance(); | 753 next = advance(); |
619 ++nesting; | 754 ++nesting; |
620 } | 755 } |
756 } else if (identical(next, $LF)) { | |
757 if (!asciiOnlyLines) { | |
758 // Synchronize the string offset in the utf8 scanner. | |
759 handleUnicode(unicodeStart); | |
760 asciiOnlyLines = true; | |
761 unicodeStart = scanOffset; | |
762 } | |
763 lineFeedInMultiline(); | |
764 next = advance(); | |
621 } else { | 765 } else { |
766 if (next > 127) { | |
767 asciiOnlyLines = false; | |
768 asciiOnlyComment = false; | |
769 } | |
622 next = advance(); | 770 next = advance(); |
623 } | 771 } |
624 } | 772 } |
625 } | 773 } |
626 | 774 |
627 int tokenizeRawStringKeywordOrIdentifier(int next) { | 775 int tokenizeRawStringKeywordOrIdentifier(int next) { |
776 // [next] is $r. | |
628 int nextnext = peek(); | 777 int nextnext = peek(); |
629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { | 778 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { |
630 int start = byteOffset; | 779 int start = scanOffset; |
631 next = advance(); | 780 next = advance(); |
632 return tokenizeString(next, start, true); | 781 return tokenizeString(next, start, true); |
633 } | 782 } |
634 return tokenizeKeywordOrIdentifier(next, true); | 783 return tokenizeKeywordOrIdentifier(next, true); |
635 } | 784 } |
636 | 785 |
637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { | 786 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { |
638 KeywordState state = KeywordState.KEYWORD_STATE; | 787 KeywordState state = KeywordState.KEYWORD_STATE; |
639 int start = byteOffset; | 788 int start = scanOffset; |
640 while (state != null && $a <= next && next <= $z) { | 789 while (state != null && $a <= next && next <= $z) { |
641 state = state.next(next); | 790 state = state.next(next); |
642 next = advance(); | 791 next = advance(); |
643 } | 792 } |
644 if (state == null || state.keyword == null) { | 793 if (state == null || state.keyword == null) { |
645 return tokenizeIdentifier(next, start, allowDollar); | 794 return tokenizeIdentifier(next, start, allowDollar); |
646 } | 795 } |
647 if (($A <= next && next <= $Z) || | 796 if (($A <= next && next <= $Z) || |
648 ($0 <= next && next <= $9) || | 797 ($0 <= next && next <= $9) || |
649 identical(next, $_) || | 798 identical(next, $_) || |
650 identical(next, $$)) { | 799 identical(next, $$)) { |
651 return tokenizeIdentifier(next, start, allowDollar); | 800 return tokenizeIdentifier(next, start, allowDollar); |
652 } else if (next < 128) { | 801 } else { |
653 appendKeywordToken(state.keyword); | 802 appendKeywordToken(state.keyword); |
654 return next; | 803 return next; |
655 } else { | |
656 return tokenizeIdentifier(next, start, allowDollar); | |
657 } | 804 } |
658 } | 805 } |
659 | 806 |
807 /** | |
808 * [allowDollar] can exclude '$', which is not allowed as part of a string | |
809 * interpolation identifier. | |
810 */ | |
660 int tokenizeIdentifier(int next, int start, bool allowDollar) { | 811 int tokenizeIdentifier(int next, int start, bool allowDollar) { |
661 bool isAscii = true; | |
662 | |
663 while (true) { | 812 while (true) { |
664 if (($a <= next && next <= $z) || | 813 if (($a <= next && next <= $z) || |
665 ($A <= next && next <= $Z) || | 814 ($A <= next && next <= $Z) || |
666 ($0 <= next && next <= $9) || | 815 ($0 <= next && next <= $9) || |
667 identical(next, $_) || | 816 identical(next, $_) || |
668 (identical(next, $$) && allowDollar)) { | 817 (identical(next, $$) && allowDollar)) { |
669 next = advance(); | 818 next = advance(); |
670 } else if ((next < 128) || (identical(next, $NBSP))) { | 819 } else { |
671 // Identifier ends here. | 820 // Identifier ends here. |
672 if (start == byteOffset) { | 821 if (start == scanOffset) { |
673 return error(const SourceString("expected identifier")); | 822 return error("expected identifier"); |
674 } else if (isAscii) { | |
675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0)); | |
676 } else { | 823 } else { |
677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); | 824 appendSubstringToken(IDENTIFIER_INFO, start, true); |
678 } | 825 } |
679 return next; | 826 return next; |
680 } else { | |
681 int nonAsciiStart = byteOffset; | |
682 do { | |
683 next = nextByte(); | |
684 if (identical(next, $NBSP)) break; | |
685 } while (next > 127); | |
686 String string = utf8String(nonAsciiStart, -1).slowToString(); | |
687 isAscii = false; | |
688 int byteLength = nonAsciiStart - byteOffset; | |
689 addToCharOffset(string.length - byteLength); | |
690 } | 827 } |
691 } | 828 } |
692 } | 829 } |
693 | 830 |
694 int tokenizeAt(int next) { | 831 int tokenizeAt(int next) { |
695 int start = byteOffset; | |
696 next = advance(); | |
697 appendPrecedenceToken(AT_INFO); | 832 appendPrecedenceToken(AT_INFO); |
698 return next; | 833 return advance(); |
699 } | 834 } |
700 | 835 |
701 int tokenizeString(int next, int start, bool raw) { | 836 int tokenizeString(int next, int start, bool raw) { |
702 int quoteChar = next; | 837 int quoteChar = next; |
703 next = advance(); | 838 next = advance(); |
704 if (identical(quoteChar, next)) { | 839 if (identical(quoteChar, next)) { |
705 next = advance(); | 840 next = advance(); |
706 if (identical(quoteChar, next)) { | 841 if (identical(quoteChar, next)) { |
707 // Multiline string. | 842 // Multiline string. |
708 return tokenizeMultiLineString(quoteChar, start, raw); | 843 return tokenizeMultiLineString(quoteChar, start, raw); |
709 } else { | 844 } else { |
710 // Empty string. | 845 // Empty string. |
711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 846 appendSubstringToken(STRING_INFO, start, true); |
712 return next; | 847 return next; |
713 } | 848 } |
714 } | 849 } |
715 if (raw) { | 850 if (raw) { |
716 return tokenizeSingleLineRawString(next, quoteChar, start); | 851 return tokenizeSingleLineRawString(next, quoteChar, start); |
717 } else { | 852 } else { |
718 return tokenizeSingleLineString(next, quoteChar, start); | 853 return tokenizeSingleLineString(next, quoteChar, start); |
719 } | 854 } |
720 } | 855 } |
721 | 856 |
722 static bool isHexDigit(int character) { | 857 /** |
723 if ($0 <= character && character <= $9) return true; | 858 * [next] is the first character after the qoute. |
ngeoffray
2013/10/18 10:19:37
qoute -> quote
lukas
2013/10/24 16:48:36
Done.
| |
724 character |= 0x20; | 859 * [start] is the scanOffset of the quote. |
725 return ($a <= character && character <= $f); | 860 * |
726 } | 861 * The token contains a substring of the source file, including the |
727 | 862 * string quotes, backslashes for escaping. For interpolated strings, |
863 * the parts before and after are separate tokens. | |
864 * | |
865 * "a $b c" | |
866 * | |
867 * gives StringToken("a $), StringToken(b) and StringToken( c"). | |
868 */ | |
728 int tokenizeSingleLineString(int next, int quoteChar, int start) { | 869 int tokenizeSingleLineString(int next, int quoteChar, int start) { |
870 bool asciiOnly = true; | |
729 while (!identical(next, quoteChar)) { | 871 while (!identical(next, quoteChar)) { |
730 if (identical(next, $BACKSLASH)) { | 872 if (identical(next, $BACKSLASH)) { |
731 next = advance(); | 873 next = advance(); |
732 } else if (identical(next, $$)) { | 874 } else if (identical(next, $$)) { |
733 next = tokenizeStringInterpolation(start); | 875 if (!asciiOnly) handleUnicode(start); |
734 start = byteOffset; | 876 next = tokenizeStringInterpolation(start, asciiOnly); |
877 start = scanOffset; | |
878 asciiOnly = true; | |
735 continue; | 879 continue; |
736 } | 880 } |
737 if (next <= $CR | 881 if (next <= $CR |
738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $E OF))) { | 882 && (identical(next, $LF) || |
739 return error(const SourceString("unterminated string literal")); | 883 identical(next, $CR) || |
884 identical(next, $EOF))) { | |
885 if (!asciiOnly) handleUnicode(start); | |
886 return error("unterminated string literal"); | |
740 } | 887 } |
888 if (next > 127) asciiOnly = false; | |
741 next = advance(); | 889 next = advance(); |
742 } | 890 } |
743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 891 if (!asciiOnly) handleUnicode(start); |
744 return advance(); | 892 // Advance past the quote character. |
893 next = advance(); | |
894 appendSubstringToken(STRING_INFO, start, asciiOnly); | |
895 return next; | |
745 } | 896 } |
746 | 897 |
747 int tokenizeStringInterpolation(int start) { | 898 int tokenizeStringInterpolation(int start, bool asciiOnly) { |
748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 899 appendSubstringToken(STRING_INFO, start, asciiOnly); |
749 beginToken(); // $ starts here. | 900 beginToken(); // $ starts here. |
750 int next = advance(); | 901 int next = advance(); |
751 if (identical(next, $OPEN_CURLY_BRACKET)) { | 902 if (identical(next, $OPEN_CURLY_BRACKET)) { |
752 return tokenizeInterpolatedExpression(next, start); | 903 return tokenizeInterpolatedExpression(next); |
753 } else { | 904 } else { |
754 return tokenizeInterpolatedIdentifier(next, start); | 905 return tokenizeInterpolatedIdentifier(next); |
755 } | 906 } |
756 } | 907 } |
757 | 908 |
758 int tokenizeInterpolatedExpression(int next, int start) { | 909 int tokenizeInterpolatedExpression(int next) { |
759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); | 910 appendBeginGroup(STRING_INTERPOLATION_INFO); |
760 beginToken(); // The expression starts here. | 911 beginToken(); // The expression starts here. |
761 next = advance(); | 912 next = advance(); // Move past the curly bracket. |
762 while (!identical(next, $EOF) && !identical(next, $STX)) { | 913 while (!identical(next, $EOF) && !identical(next, $STX)) { |
763 next = bigSwitch(next); | 914 next = bigSwitch(next); |
764 } | 915 } |
765 if (identical(next, $EOF)) return next; | 916 if (identical(next, $EOF)) return next; |
766 next = advance(); | 917 next = advance(); // Move past the $STX. |
767 beginToken(); // The string interpolation suffix starts here. | 918 beginToken(); // The string interpolation suffix starts here. |
768 return next; | 919 return next; |
769 } | 920 } |
770 | 921 |
771 int tokenizeInterpolatedIdentifier(int next, int start) { | 922 int tokenizeInterpolatedIdentifier(int next) { |
772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); | 923 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); |
773 beginToken(); // The identifier starts here. | 924 beginToken(); // The identifier starts here. |
774 next = tokenizeKeywordOrIdentifier(next, false); | 925 next = tokenizeKeywordOrIdentifier(next, false); |
775 beginToken(); // The string interpolation suffix starts here. | 926 beginToken(); // The string interpolation suffix starts here. |
776 return next; | 927 return next; |
777 } | 928 } |
778 | 929 |
779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { | 930 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { |
780 next = advance(); | 931 bool asciiOnly = true; |
932 next = advance(); // Advance past the quote | |
ngeoffray
2013/10/18 10:19:37
Missing .
lukas
2013/10/24 16:48:36
Done.
| |
781 while (next != $EOF) { | 933 while (next != $EOF) { |
782 if (identical(next, quoteChar)) { | 934 if (identical(next, quoteChar)) { |
783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 935 if (!asciiOnly) handleUnicode(start); |
784 return advance(); | 936 next = advance(); |
937 appendSubstringToken(STRING_INFO, start, asciiOnly); | |
938 return next; | |
785 } else if (identical(next, $LF) || identical(next, $CR)) { | 939 } else if (identical(next, $LF) || identical(next, $CR)) { |
786 return error(const SourceString("unterminated string literal")); | 940 if (!asciiOnly) handleUnicode(start); |
941 return error("unterminated string literal"); | |
942 } else if (next > 127) { | |
943 asciiOnly = false; | |
787 } | 944 } |
788 next = advance(); | 945 next = advance(); |
789 } | 946 } |
790 return error(const SourceString("unterminated string literal")); | 947 if (!asciiOnly) handleUnicode(start); |
948 return error("unterminated string literal"); | |
791 } | 949 } |
792 | 950 |
793 int tokenizeMultiLineRawString(int quoteChar, int start) { | 951 int tokenizeMultiLineRawString(int quoteChar, int start) { |
794 int next = advance(); | 952 bool asciiOnlyString = true; |
953 bool asciiOnlyLine = true; | |
954 int unicodeStart = start; | |
955 int next = advance(); // Advance past the (last) quote (of three) | |
ngeoffray
2013/10/18 10:19:37
Missing .
lukas
2013/10/24 16:48:36
Done.
| |
795 outer: while (!identical(next, $EOF)) { | 956 outer: while (!identical(next, $EOF)) { |
796 while (!identical(next, quoteChar)) { | 957 while (!identical(next, quoteChar)) { |
958 if (identical(next, $LF)) { | |
959 if (!asciiOnlyLine) { | |
960 // Synchronize the string offset in the utf8 scanner. | |
961 handleUnicode(unicodeStart); | |
962 asciiOnlyLine = true; | |
963 unicodeStart = scanOffset; | |
964 } | |
965 lineFeedInMultiline(); | |
966 } else if (next > 127) { | |
967 asciiOnlyLine = false; | |
968 asciiOnlyString = false; | |
969 } | |
797 next = advance(); | 970 next = advance(); |
798 if (identical(next, $EOF)) break outer; | 971 if (identical(next, $EOF)) break outer; |
799 } | 972 } |
800 next = advance(); | 973 next = advance(); |
801 if (identical(next, quoteChar)) { | 974 if (identical(next, quoteChar)) { |
802 next = advance(); | 975 next = advance(); |
803 if (identical(next, quoteChar)) { | 976 if (identical(next, quoteChar)) { |
804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 977 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
805 return advance(); | 978 next = advance(); |
979 appendSubstringToken(STRING_INFO, start, asciiOnlyString); | |
980 return next; | |
806 } | 981 } |
807 } | 982 } |
808 } | 983 } |
809 return error(const SourceString("unterminated string literal")); | 984 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
985 return error("unterminated string literal"); | |
810 } | 986 } |
811 | 987 |
812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { | 988 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { |
813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); | 989 if (raw) return tokenizeMultiLineRawString(quoteChar, start); |
814 int next = advance(); | 990 bool asciiOnlyString = true; |
991 bool asciiOnlyLine = true; | |
992 int unicodeStart = start; | |
993 int next = advance(); // Advance past the (last) quote (of three). | |
815 while (!identical(next, $EOF)) { | 994 while (!identical(next, $EOF)) { |
816 if (identical(next, $$)) { | 995 if (identical(next, $$)) { |
817 next = tokenizeStringInterpolation(start); | 996 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
818 start = byteOffset; | 997 next = tokenizeStringInterpolation(start, asciiOnlyString); |
998 start = scanOffset; | |
999 unicodeStart = start; | |
1000 asciiOnlyString = true; // A new string token is created for the rest. | |
1001 asciiOnlyLine = true; | |
819 continue; | 1002 continue; |
820 } | 1003 } |
821 if (identical(next, quoteChar)) { | 1004 if (identical(next, quoteChar)) { |
822 next = advance(); | 1005 next = advance(); |
823 if (identical(next, quoteChar)) { | 1006 if (identical(next, quoteChar)) { |
824 next = advance(); | 1007 next = advance(); |
825 if (identical(next, quoteChar)) { | 1008 if (identical(next, quoteChar)) { |
826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1009 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
827 return advance(); | 1010 next = advance(); |
1011 appendSubstringToken(STRING_INFO, start, asciiOnlyString); | |
1012 return next; | |
828 } | 1013 } |
829 } | 1014 } |
830 continue; | 1015 continue; |
831 } | 1016 } |
832 if (identical(next, $BACKSLASH)) { | 1017 if (identical(next, $BACKSLASH)) { |
833 next = advance(); | 1018 next = advance(); |
834 if (identical(next, $EOF)) break; | 1019 if (identical(next, $EOF)) break; |
835 } | 1020 } |
1021 if (identical(next, $LF)) { | |
1022 if (!asciiOnlyLine) { | |
1023 // Synchronize the string offset in the utf8 scanner. | |
1024 handleUnicode(unicodeStart); | |
1025 asciiOnlyLine = true; | |
1026 unicodeStart = scanOffset; | |
1027 } | |
1028 lineFeedInMultiline(); | |
1029 } else if (next > 127) { | |
1030 asciiOnlyString = false; | |
1031 asciiOnlyLine = false; | |
1032 } | |
836 next = advance(); | 1033 next = advance(); |
837 } | 1034 } |
838 return error(const SourceString("unterminated string literal")); | 1035 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
1036 return error("unterminated string literal"); | |
839 } | 1037 } |
840 | 1038 |
841 int error(SourceString message) { | 1039 int error(String message) { |
842 appendByteStringToken(BAD_INPUT_INFO, message); | 1040 appendStringToken(BAD_INPUT_INFO, message); |
843 return advance(); // Ensure progress. | 1041 return advance(); // Ensure progress. |
844 } | 1042 } |
1043 | |
1044 void unmatchedBeginGroup(BeginGroupToken begin) { | |
1045 String error = 'unmatched "${begin.stringValue}"'; | |
1046 Token close = | |
1047 new StringToken.fromString( | |
1048 BAD_INPUT_INFO, error, begin.charOffset, true); | |
1049 | |
1050 // We want to ensure that unmatched BeginGroupTokens are reported | |
1051 // as errors. However, the rest of the parser assume the groups | |
ngeoffray
2013/10/18 10:19:37
assume -> assumes
lukas
2013/10/24 16:48:36
Done.
| |
1052 // are well-balanced and will never look at the endGroup | |
1053 // token. This is a nice property that allows us to skip quickly | |
1054 // over correct code. By inserting an additional error token in | |
1055 // the stream, we can keep ignoring endGroup tokens. | |
1056 // | |
1057 // [begin] --next--> [tail] | |
1058 // [begin] --endG--> [close] --next--> [next] --next--> [tail] | |
1059 // | |
1060 // This allows the parser to skip from [begin] via endGroup to [close] and | |
1061 // ignore the [close] token (assuming it's correct), then the error will be | |
1062 // reported when parsing the [next] token. | |
1063 | |
1064 Token next = new StringToken.fromString( | |
1065 BAD_INPUT_INFO, error, begin.charOffset, true); | |
1066 begin.endGroup = close; | |
1067 close.next = next; | |
1068 next.next = begin.next; | |
1069 } | |
845 } | 1070 } |