OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of scanner; | 5 part of scanner; |
6 | 6 |
7 abstract class Scanner { | 7 abstract class Scanner { |
8 Token tokenize(); | 8 Token tokenize(); |
| 9 |
| 10 factory Scanner(SourceFile file, {bool includeComments: false}) { |
| 11 if (file is Utf8BytesSourceFile) { |
| 12 return new Utf8BytesScanner(file, includeComments: includeComments); |
| 13 } else { |
| 14 return new StringScanner(file, includeComments: includeComments); |
| 15 } |
| 16 } |
9 } | 17 } |
10 | 18 |
11 /** | 19 abstract class AbstractScanner implements Scanner { |
12 * Common base class for a Dart scanner. | 20 final bool includeComments; |
13 */ | 21 |
14 abstract class AbstractScanner<T extends SourceString> implements Scanner { | 22 /** |
| 23 * The string offset for the next token that will be created. |
| 24 * |
| 25 * Note that in the [Utf8BytesScanner], string offsets and [scanOffset] values |
| 26 * are different. One string character can be encoded using multiple UTF-8 |
| 27 * bytes. |
| 28 */ |
| 29 int tokenStart = -1; |
| 30 |
| 31 /** |
| 32 * A pointer to the token stream created by this scanner. The first token |
| 33 * is a special token and not part of the source file. This is an |
| 34 * implementation detail to avoid special cases in the scanner. This token |
| 35 * is not exposed to clients of the scanner, which are expected to invoke |
| 36 * [firstToken] to access the token stream. |
| 37 */ |
| 38 final Token tokens = new SymbolToken(EOF_INFO, -1); |
| 39 |
| 40 /** |
| 41 * A pointer to the last scanned token. |
| 42 */ |
| 43 Token tail; |
| 44 |
| 45 /** |
| 46 * The stack of open groups, e.g [: { ... ( .. :] |
| 47 * Each BeginGroupToken has a pointer to the token where the group |
| 48 * ends. This field is set when scanning the end group token. |
| 49 */ |
| 50 Link<BeginGroupToken> groupingStack = const Link<BeginGroupToken>(); |
| 51 |
| 52 /** |
| 53 * The source file that is being scanned. This field can be [:null:]. |
| 54 * If the source file is available, the scanner assigns its [:lineStarts:] and |
| 55 * [:length:] fields at the end of [tokenize]. |
| 56 */ |
| 57 final SourceFile file; |
| 58 |
| 59 final List<int> lineStarts = [0]; |
| 60 |
| 61 AbstractScanner(this.file, this.includeComments) { |
| 62 this.tail = this.tokens; |
| 63 } |
| 64 |
| 65 |
| 66 /** |
| 67 * Advances and returns the next character. |
| 68 * |
| 69 * If the next character is non-ASCII, then the returned value depends on the |
| 70 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while |
| 71 * the [StringScanner] returns a UTF-16 code unit. |
| 72 * |
| 73 * The scanner ensures that [advance] is not invoked after it returned [$EOF]. |
| 74 * This allows implementations to omit bound checks if the data structure ends |
| 75 * with '0'. |
| 76 */ |
15 int advance(); | 77 int advance(); |
16 int nextByte(); | 78 |
17 | 79 /** |
18 /** | 80 * Returns the current unicode character. |
19 * Returns the current character or byte depending on the underlying input | 81 * |
20 * kind. For example, [StringScanner] operates on [String] and thus returns | 82 * If the current character is ASCII, then it is returned unchanged. |
21 * characters (Unicode codepoints represented as int) whereas | 83 * |
22 * [ByteArrayScanner] operates on byte arrays and thus returns bytes. | 84 * The [Utf8BytesScanner] decodes the next unicode code point starting at the |
| 85 * current position. Note that every unicode character is returned as a single |
| 86 * code point, i.e., for '\u{1d11e}' it returns 119070, and the following |
| 87 * [advance] returns the next character. |
| 88 * |
| 89 * The [StringScanner] returns the current character unchanged, which might |
| 90 * be a surrogate character. In the case of '\u{1d11e}', it returns the first |
| 91 * code unit 55348, and the following [advance] returns the second code unit |
| 92 * 56606. |
| 93 * |
| 94 * Invoking [currentAsUnicode] multiple times is safe, i.e., |
| 95 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):]. |
| 96 */ |
| 97 int currentAsUnicode(int next); |
| 98 |
| 99 /** |
| 100 * Returns the character at the next position. Like in [advance], the |
| 101 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns |
| 102 * a UTF-16 code unit. |
23 */ | 103 */ |
24 int peek(); | 104 int peek(); |
25 | 105 |
26 /** | 106 /** |
| 107 * Notifies the scanner that unicode characters were detected in either a |
| 108 * comment or a string literal between [startScanOffset] and the current |
| 109 * scan offset. |
| 110 */ |
| 111 void handleUnicode(int startScanOffset); |
| 112 |
| 113 /** |
| 114 * Returns the current scan offset. |
| 115 * |
| 116 * In the [Utf8BytesScanner] this is the offset into the byte list, in the |
| 117 * [StringScanner] the offset in the source string. |
| 118 */ |
| 119 int get scanOffset; |
| 120 |
| 121 /** |
| 122 * Returns the current string offset. |
| 123 * |
| 124 * In the [StringScanner] this is identical to the [scanOffset]. In the |
| 125 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters. |
| 126 */ |
| 127 int get stringOffset; |
| 128 |
| 129 /** |
| 130 * Returns the first token scanned by this [Scanner]. |
| 131 */ |
| 132 Token firstToken(); |
| 133 |
| 134 /** |
| 135 * Returns the last token scanned by this [Scanner]. |
| 136 */ |
| 137 Token previousToken(); |
| 138 |
| 139 /** |
| 140 * Notifies that a new token starts at current offset. |
| 141 */ |
| 142 void beginToken() { |
| 143 tokenStart = stringOffset; |
| 144 } |
| 145 |
| 146 /** |
| 147 * Appends a substring from the scan offset [:start:] to the current |
| 148 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current |
| 149 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the |
| 150 * substring string [5,9). |
| 151 * |
| 152 * Note that [extraOffset] can only be used if the covered character(s) are |
| 153 * known to be ASCII. |
| 154 */ |
| 155 void appendSubstringToken(PrecedenceInfo info, int start, |
| 156 bool asciiOnly, [int extraOffset]); |
| 157 |
| 158 /** |
| 159 * Appends a token whose kind is determined by [info] and content is defined |
| 160 * by the String [value]. |
| 161 * |
| 162 * This method is invoked for class names, field names, method names, types, |
| 163 * etc. |
| 164 */ |
| 165 void appendStringToken(PrecedenceInfo info, String value) { |
| 166 tail.next = new StringToken.fromString(info, value, tokenStart, true); |
| 167 tail = tail.next; |
| 168 } |
| 169 |
| 170 /** |
| 171 * Appends a fixed token whose kind and content is determined by [info]. |
| 172 * Appends an *operator* token from [info]. |
| 173 * |
| 174 * An operator token represents operators like ':', '.', ';', '&&', '==', '--', |
| 175 * '=>', etc. |
| 176 */ |
| 177 void appendPrecedenceToken(PrecedenceInfo info) { |
| 178 tail.next = new SymbolToken(info, tokenStart); |
| 179 tail = tail.next; |
| 180 } |
| 181 |
| 182 /** |
27 * Appends a fixed token based on whether the current char is [choice] or not. | 183 * Appends a fixed token based on whether the current char is [choice] or not. |
28 * If the current char is [choice] a fixed token whose kind and content | 184 * If the current char is [choice] a fixed token whose kind and content |
29 * is determined by [yes] is appended, otherwise a fixed token whose kind | 185 * is determined by [yes] is appended, otherwise a fixed token whose kind |
30 * and content is determined by [no] is appended. | 186 * and content is determined by [no] is appended. |
31 */ | 187 */ |
32 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); | 188 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no) { |
33 | 189 int next = advance(); |
34 /** | 190 if (identical(next, choice)) { |
35 * Appends a fixed token whose kind and content is determined by [info]. | 191 appendPrecedenceToken(yes); |
36 */ | 192 return advance(); |
37 void appendPrecedenceToken(PrecedenceInfo info); | 193 } else { |
38 | 194 appendPrecedenceToken(no); |
39 /** | 195 return next; |
40 * Appends a token whose kind is determined by [info] and content is [value]. | 196 } |
41 */ | 197 } |
42 void appendStringToken(PrecedenceInfo info, String value); | |
43 | |
44 /** | |
45 * Appends a token whose kind is determined by [info] and content is defined | |
46 * by the SourceString [value]. | |
47 */ | |
48 void appendByteStringToken(PrecedenceInfo info, T value); | |
49 | 198 |
50 /** | 199 /** |
51 * Appends a keyword token whose kind is determined by [keyword]. | 200 * Appends a keyword token whose kind is determined by [keyword]. |
52 */ | 201 */ |
53 void appendKeywordToken(Keyword keyword); | 202 void appendKeywordToken(Keyword keyword) { |
54 void appendWhiteSpace(int next); | 203 String syntax = keyword.syntax; |
55 void appendEofToken(); | 204 // Type parameters and arguments cannot contain 'this' or 'super'. |
56 | 205 if (identical(syntax, 'this') || identical(syntax, 'super')) { |
57 /** | 206 discardOpenLt(); |
58 * Creates an ASCII SourceString whose content begins at the source byte | 207 } |
59 * offset [start] and ends at [offset] bytes from the current byte offset of | 208 tail.next = new KeywordToken(keyword, tokenStart); |
60 * the scanner. For example, if the current byte offset is 10, | 209 tail = tail.next; |
61 * [:asciiString(0,-1):] creates an ASCII SourceString whose content is found | 210 } |
62 * at the [0,9[ byte interval of the source text. | 211 |
63 */ | 212 void appendEofToken() { |
64 T asciiString(int start, int offset); | 213 beginToken(); |
65 T utf8String(int start, int offset); | 214 tail.next = new SymbolToken(EOF_INFO, tokenStart); |
66 Token firstToken(); | 215 tail = tail.next; |
67 Token previousToken(); | 216 // EOF points to itself so there's always infinite look-ahead. |
68 void beginToken(); | 217 tail.next = tail; |
69 void addToCharOffset(int offset); | 218 discardOpenLt(); |
70 int get charOffset; | 219 while (!groupingStack.isEmpty) { |
71 int get byteOffset; | 220 unmatchedBeginGroup(groupingStack.head); |
72 void appendBeginGroup(PrecedenceInfo info, String value); | 221 groupingStack = groupingStack.tail; |
73 int appendEndGroup(PrecedenceInfo info, String value, int openKind); | 222 } |
74 void appendGt(PrecedenceInfo info, String value); | 223 } |
75 void appendGtGt(PrecedenceInfo info, String value); | 224 |
76 void appendGtGtGt(PrecedenceInfo info, String value); | 225 /** |
77 void appendComment(); | 226 * Notifies scanning a whitespace character. Note that [appendWhiteSpace] is |
| 227 * not always invoked for [$SPACE] characters. |
| 228 * |
| 229 * This method is used by the scanners to track line breaks and create the |
| 230 * [lineStarts] map. |
| 231 */ |
| 232 void appendWhiteSpace(int next) { |
| 233 if (next == $LF && file != null) { |
| 234 lineStarts.add(stringOffset + 1); // +1, the line starts after the $LF. |
| 235 } |
| 236 } |
| 237 |
| 238 /** |
| 239 * Notifies on [$LF] characters in multi-line comments or strings. |
| 240 * |
| 241 * This method is used by the scanners to track line breaks and create the |
| 242 * [lineStarts] map. |
| 243 */ |
| 244 void lineFeedInMultiline() { |
| 245 if (file != null) { |
| 246 lineStarts.add(stringOffset + 1); |
| 247 } |
| 248 } |
| 249 |
| 250 /** |
| 251 * Appends a token that begins a new group, represented by [value]. |
| 252 * Group begin tokens are '{', '(', '[' and '${'. |
| 253 */ |
| 254 void appendBeginGroup(PrecedenceInfo info) { |
| 255 Token token = new BeginGroupToken(info, tokenStart); |
| 256 tail.next = token; |
| 257 tail = tail.next; |
| 258 |
| 259 // { ( [ ${ cannot appear inside a type parameters / arguments. |
| 260 if (!identical(info.kind, LT_TOKEN)) discardOpenLt(); |
| 261 groupingStack = groupingStack.prepend(token); |
| 262 } |
| 263 |
| 264 /** |
| 265 * Appends a token that ends a group, represented by [value]. |
| 266 * It handles the group end tokens '}', ')' and ']'. The tokens '>' and |
| 267 * '>>' are handled separately by [appendGt] and [appendGtGt]. |
| 268 */ |
| 269 int appendEndGroup(PrecedenceInfo info, int openKind) { |
| 270 assert(!identical(openKind, LT_TOKEN)); // openKind is < for > and >> |
| 271 appendPrecedenceToken(info); |
| 272 // Don't report unmatched errors for <; it is also the less-than operator. |
| 273 discardOpenLt(); |
| 274 if (groupingStack.isEmpty) { |
| 275 return advance(); |
| 276 } |
| 277 BeginGroupToken begin = groupingStack.head; |
| 278 if (!identical(begin.kind, openKind)) { |
| 279 if (!identical(openKind, OPEN_CURLY_BRACKET_TOKEN) || |
| 280 !identical(begin.kind, STRING_INTERPOLATION_TOKEN)) { |
| 281 // Not ending string interpolation. |
| 282 unmatchedBeginGroup(begin); |
| 283 return advance(); |
| 284 } |
| 285 // We're ending an interpolated expression. |
| 286 begin.endGroup = tail; |
| 287 groupingStack = groupingStack.tail; |
| 288 // Using "start-of-text" to signal that we're back in string |
| 289 // scanning mode. |
| 290 return $STX; |
| 291 } |
| 292 begin.endGroup = tail; |
| 293 groupingStack = groupingStack.tail; |
| 294 return advance(); |
| 295 } |
| 296 |
| 297 /** |
| 298 * Appends a token for '>'. |
| 299 * This method does not issue unmatched errors, because > is also the |
| 300 * greater-than operator. It does not necessarily have to close a group. |
| 301 */ |
| 302 void appendGt(PrecedenceInfo info) { |
| 303 appendPrecedenceToken(info); |
| 304 if (groupingStack.isEmpty) return; |
| 305 if (identical(groupingStack.head.kind, LT_TOKEN)) { |
| 306 groupingStack.head.endGroup = tail; |
| 307 groupingStack = groupingStack.tail; |
| 308 } |
| 309 } |
| 310 |
| 311 /** |
| 312 * Appends a token for '>>'. |
| 313 * This method does not issue unmatched errors, because >> is also the |
| 314 * shift operator. It does not necessarily have to close a group. |
| 315 */ |
| 316 void appendGtGt(PrecedenceInfo info) { |
| 317 appendPrecedenceToken(info); |
| 318 if (groupingStack.isEmpty) return; |
| 319 if (identical(groupingStack.head.kind, LT_TOKEN)) { |
| 320 // Don't assign endGroup: in "T<U<V>>", the '>>' token closes the outer |
| 321 // '<', the inner '<' is left without endGroup. |
| 322 groupingStack = groupingStack.tail; |
| 323 } |
| 324 if (groupingStack.isEmpty) return; |
| 325 if (identical(groupingStack.head.kind, LT_TOKEN)) { |
| 326 groupingStack.head.endGroup = tail; |
| 327 groupingStack = groupingStack.tail; |
| 328 } |
| 329 } |
| 330 |
| 331 void appendComment(start, bool asciiOnly) { |
| 332 if (!includeComments) return; |
| 333 appendSubstringToken(COMMENT_INFO, start, asciiOnly); |
| 334 } |
78 | 335 |
79 /** | 336 /** |
80 * We call this method to discard '<' from the "grouping" stack | 337 * We call this method to discard '<' from the "grouping" stack |
81 * (maintained by subclasses). | 338 * (maintained by subclasses). |
82 * | 339 * |
83 * [PartialParser.skipExpression] relies on the fact that we do not | 340 * [PartialParser.skipExpression] relies on the fact that we do not |
84 * create groups for stuff like: | 341 * create groups for stuff like: |
85 * [:a = b < c, d = e > f:]. | 342 * [:a = b < c, d = e > f:]. |
86 * | 343 * |
87 * In other words, this method is called when the scanner recognizes | 344 * In other words, this method is called when the scanner recognizes |
88 * something which cannot possibly be part of a type | 345 * something which cannot possibly be part of a type |
89 * parameter/argument list. | 346 * parameter/argument list. |
90 */ | 347 */ |
91 void discardOpenLt(); | 348 void discardOpenLt() { |
| 349 while (!groupingStack.isEmpty |
| 350 && identical(groupingStack.head.kind, LT_TOKEN)) { |
| 351 groupingStack = groupingStack.tail; |
| 352 } |
| 353 } |
92 | 354 |
93 // TODO(ahe): Move this class to implementation. | 355 // TODO(ahe): Move this class to implementation. |
94 | 356 |
95 Token tokenize() { | 357 Token tokenize() { |
96 int next = advance(); | 358 int next = advance(); |
97 while (!identical(next, $EOF)) { | 359 while (!identical(next, $EOF)) { |
98 next = bigSwitch(next); | 360 next = bigSwitch(next); |
99 } | 361 } |
100 appendEofToken(); | 362 appendEofToken(); |
| 363 |
| 364 if (file != null) { |
| 365 file.length = stringOffset; |
| 366 // One additional line start at the end, see [SourceFile.lineStarts]. |
| 367 lineStarts.add(stringOffset + 1); |
| 368 file.lineStarts = lineStarts; |
| 369 } |
| 370 |
101 return firstToken(); | 371 return firstToken(); |
102 } | 372 } |
103 | 373 |
104 int bigSwitch(int next) { | 374 int bigSwitch(int next) { |
105 beginToken(); | 375 beginToken(); |
106 if (identical(next, $SPACE) || identical(next, $TAB) | 376 if (identical(next, $SPACE) || identical(next, $TAB) |
107 || identical(next, $LF) || identical(next, $CR)) { | 377 || identical(next, $LF) || identical(next, $CR)) { |
108 appendWhiteSpace(next); | 378 appendWhiteSpace(next); |
109 next = advance(); | 379 next = advance(); |
| 380 // Sequences of spaces are common, so advance through them fast. |
110 while (identical(next, $SPACE)) { | 381 while (identical(next, $SPACE)) { |
111 appendWhiteSpace(next); | 382 // We don't invoke [:appendWhiteSpace(next):] here for efficiency, |
| 383 // assuming that it does not do anything for space characters. |
112 next = advance(); | 384 next = advance(); |
113 } | 385 } |
114 return next; | 386 return next; |
115 } | 387 } |
116 | 388 |
117 if ($a <= next && next <= $z) { | 389 if ($a <= next && next <= $z) { |
118 if (identical($r, next)) { | 390 if (identical($r, next)) { |
119 return tokenizeRawStringKeywordOrIdentifier(next); | 391 return tokenizeRawStringKeywordOrIdentifier(next); |
120 } | 392 } |
121 return tokenizeKeywordOrIdentifier(next, true); | 393 return tokenizeKeywordOrIdentifier(next, true); |
122 } | 394 } |
123 | 395 |
124 if (($A <= next && next <= $Z) || identical(next, $_) || identical(next, $$)
) { | 396 if (($A <= next && next <= $Z) || |
125 return tokenizeIdentifier(next, byteOffset, true); | 397 identical(next, $_) || |
| 398 identical(next, $$)) { |
| 399 return tokenizeIdentifier(next, scanOffset, true); |
126 } | 400 } |
127 | 401 |
128 if (identical(next, $LT)) { | 402 if (identical(next, $LT)) { |
129 return tokenizeLessThan(next); | 403 return tokenizeLessThan(next); |
130 } | 404 } |
131 | 405 |
132 if (identical(next, $GT)) { | 406 if (identical(next, $GT)) { |
133 return tokenizeGreaterThan(next); | 407 return tokenizeGreaterThan(next); |
134 } | 408 } |
135 | 409 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
180 if (identical(next, $BACKSLASH)) { | 454 if (identical(next, $BACKSLASH)) { |
181 appendPrecedenceToken(BACKSLASH_INFO); | 455 appendPrecedenceToken(BACKSLASH_INFO); |
182 return advance(); | 456 return advance(); |
183 } | 457 } |
184 | 458 |
185 if (identical(next, $HASH)) { | 459 if (identical(next, $HASH)) { |
186 return tokenizeTag(next); | 460 return tokenizeTag(next); |
187 } | 461 } |
188 | 462 |
189 if (identical(next, $OPEN_PAREN)) { | 463 if (identical(next, $OPEN_PAREN)) { |
190 appendBeginGroup(OPEN_PAREN_INFO, "("); | 464 appendBeginGroup(OPEN_PAREN_INFO); |
191 return advance(); | 465 return advance(); |
192 } | 466 } |
193 | 467 |
194 if (identical(next, $CLOSE_PAREN)) { | 468 if (identical(next, $CLOSE_PAREN)) { |
195 return appendEndGroup(CLOSE_PAREN_INFO, ")", OPEN_PAREN_TOKEN); | 469 return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN); |
196 } | 470 } |
197 | 471 |
198 if (identical(next, $COMMA)) { | 472 if (identical(next, $COMMA)) { |
199 appendPrecedenceToken(COMMA_INFO); | 473 appendPrecedenceToken(COMMA_INFO); |
200 return advance(); | 474 return advance(); |
201 } | 475 } |
202 | 476 |
203 if (identical(next, $COLON)) { | 477 if (identical(next, $COLON)) { |
204 appendPrecedenceToken(COLON_INFO); | 478 appendPrecedenceToken(COLON_INFO); |
205 return advance(); | 479 return advance(); |
206 } | 480 } |
207 | 481 |
208 if (identical(next, $SEMICOLON)) { | 482 if (identical(next, $SEMICOLON)) { |
209 appendPrecedenceToken(SEMICOLON_INFO); | 483 appendPrecedenceToken(SEMICOLON_INFO); |
210 // Type parameters and arguments cannot contain semicolon. | 484 // Type parameters and arguments cannot contain semicolon. |
211 discardOpenLt(); | 485 discardOpenLt(); |
212 return advance(); | 486 return advance(); |
213 } | 487 } |
214 | 488 |
215 if (identical(next, $QUESTION)) { | 489 if (identical(next, $QUESTION)) { |
216 appendPrecedenceToken(QUESTION_INFO); | 490 appendPrecedenceToken(QUESTION_INFO); |
217 return advance(); | 491 return advance(); |
218 } | 492 } |
219 | 493 |
220 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 494 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
221 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, "]", | 495 return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO, |
222 OPEN_SQUARE_BRACKET_TOKEN); | 496 OPEN_SQUARE_BRACKET_TOKEN); |
223 } | 497 } |
224 | 498 |
225 if (identical(next, $BACKPING)) { | 499 if (identical(next, $BACKPING)) { |
226 appendPrecedenceToken(BACKPING_INFO); | 500 appendPrecedenceToken(BACKPING_INFO); |
227 return advance(); | 501 return advance(); |
228 } | 502 } |
229 | 503 |
230 if (identical(next, $OPEN_CURLY_BRACKET)) { | 504 if (identical(next, $OPEN_CURLY_BRACKET)) { |
231 appendBeginGroup(OPEN_CURLY_BRACKET_INFO, "{"); | 505 appendBeginGroup(OPEN_CURLY_BRACKET_INFO); |
232 return advance(); | 506 return advance(); |
233 } | 507 } |
234 | 508 |
235 if (identical(next, $CLOSE_CURLY_BRACKET)) { | 509 if (identical(next, $CLOSE_CURLY_BRACKET)) { |
236 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, "}", | 510 return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, |
237 OPEN_CURLY_BRACKET_TOKEN); | 511 OPEN_CURLY_BRACKET_TOKEN); |
238 } | 512 } |
239 | 513 |
240 if (identical(next, $SLASH)) { | 514 if (identical(next, $SLASH)) { |
241 return tokenizeSlashOrComment(next); | 515 return tokenizeSlashOrComment(next); |
242 } | 516 } |
243 | 517 |
244 if (identical(next, $AT)) { | 518 if (identical(next, $AT)) { |
245 return tokenizeAt(next); | 519 return tokenizeAt(next); |
246 } | 520 } |
247 | 521 |
248 if (identical(next, $DQ) || identical(next, $SQ)) { | 522 if (identical(next, $DQ) || identical(next, $SQ)) { |
249 return tokenizeString(next, byteOffset, false); | 523 return tokenizeString(next, scanOffset, false); |
250 } | 524 } |
251 | 525 |
252 if (identical(next, $PERIOD)) { | 526 if (identical(next, $PERIOD)) { |
253 return tokenizeDotsOrNumber(next); | 527 return tokenizeDotsOrNumber(next); |
254 } | 528 } |
255 | 529 |
256 if (identical(next, $0)) { | 530 if (identical(next, $0)) { |
257 return tokenizeHexOrNumber(next); | 531 return tokenizeHexOrNumber(next); |
258 } | 532 } |
259 | 533 |
260 // TODO(ahe): Would a range check be faster? | 534 // TODO(ahe): Would a range check be faster? |
261 if (identical(next, $1) || identical(next, $2) || identical(next, $3) | 535 if (identical(next, $1) || identical(next, $2) || identical(next, $3) |
262 || identical(next, $4) || identical(next, $5) || identical(next, $6) | 536 || identical(next, $4) || identical(next, $5) || identical(next, $6) |
263 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { | 537 || identical(next, $7) || identical(next, $8) || identical(next, $9)) { |
264 return tokenizeNumber(next); | 538 return tokenizeNumber(next); |
265 } | 539 } |
266 | 540 |
267 if (identical(next, $EOF)) { | 541 if (identical(next, $EOF)) { |
268 return $EOF; | 542 return $EOF; |
269 } | 543 } |
270 if (next < 0x1f) { | 544 if (next < 0x1f) { |
271 return error(new SourceString("unexpected character $next")); | 545 return error("unexpected character $next"); |
| 546 } |
| 547 |
| 548 if (next >= 128) { |
| 549 next = currentAsUnicode(next); |
272 } | 550 } |
273 | 551 |
274 // The following are non-ASCII characters. | 552 // The following are non-ASCII characters. |
275 | 553 |
276 if (identical(next, $NBSP)) { | 554 if (identical(next, $NBSP)) { |
277 appendWhiteSpace(next); | 555 appendWhiteSpace(next); |
278 return advance(); | 556 return advance(); |
279 } | 557 } |
280 | 558 |
281 return tokenizeIdentifier(next, byteOffset, true); | 559 return error("unexpected unicode character $next"); |
282 } | 560 } |
283 | 561 |
284 int tokenizeTag(int next) { | 562 int tokenizeTag(int next) { |
285 // # or #!.*[\n\r] | 563 // # or #!.*[\n\r] |
286 if (byteOffset == 0) { | 564 if (scanOffset == 0) { |
287 if (identical(peek(), $BANG)) { | 565 if (identical(peek(), $BANG)) { |
| 566 int start = scanOffset + 1; |
| 567 bool asciiOnly = true; |
288 do { | 568 do { |
289 next = advance(); | 569 next = advance(); |
290 } while (!identical(next, $LF) && !identical(next, $CR) && !identical(ne
xt, $EOF)); | 570 if (next > 127) asciiOnly = false; |
| 571 } while (!identical(next, $LF) && |
| 572 !identical(next, $CR) && |
| 573 !identical(next, $EOF)); |
| 574 if (!asciiOnly) handleUnicode(start); |
291 return next; | 575 return next; |
292 } | 576 } |
293 } | 577 } |
294 appendPrecedenceToken(HASH_INFO); | 578 appendPrecedenceToken(HASH_INFO); |
295 return advance(); | 579 return advance(); |
296 } | 580 } |
297 | 581 |
298 int tokenizeTilde(int next) { | 582 int tokenizeTilde(int next) { |
299 // ~ ~/ ~/= | 583 // ~ ~/ ~/= |
300 next = advance(); | 584 next = advance(); |
301 if (identical(next, $SLASH)) { | 585 if (identical(next, $SLASH)) { |
302 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); | 586 return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO); |
303 } else { | 587 } else { |
304 appendPrecedenceToken(TILDE_INFO); | 588 appendPrecedenceToken(TILDE_INFO); |
305 return next; | 589 return next; |
306 } | 590 } |
307 } | 591 } |
308 | 592 |
309 int tokenizeOpenSquareBracket(int next) { | 593 int tokenizeOpenSquareBracket(int next) { |
310 // [ [] []= | 594 // [ [] []= |
311 next = advance(); | 595 next = advance(); |
312 if (identical(next, $CLOSE_SQUARE_BRACKET)) { | 596 if (identical(next, $CLOSE_SQUARE_BRACKET)) { |
313 Token token = previousToken(); | 597 Token token = previousToken(); |
314 if (token is KeywordToken && identical(token.value.stringValue, 'operator'
)) { | 598 if (token is KeywordToken && |
| 599 identical((token as KeywordToken).keyword.syntax, 'operator')) { |
315 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); | 600 return select($EQ, INDEX_EQ_INFO, INDEX_INFO); |
316 } | 601 } |
317 } | 602 } |
318 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO, "["); | 603 appendBeginGroup(OPEN_SQUARE_BRACKET_INFO); |
319 return next; | 604 return next; |
320 } | 605 } |
321 | 606 |
322 int tokenizeCaret(int next) { | 607 int tokenizeCaret(int next) { |
323 // ^ ^= | 608 // ^ ^= |
324 return select($EQ, CARET_EQ_INFO, CARET_INFO); | 609 return select($EQ, CARET_EQ_INFO, CARET_INFO); |
325 } | 610 } |
326 | 611 |
327 int tokenizeBar(int next) { | 612 int tokenizeBar(int next) { |
328 // | || |= | 613 // | || |= |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
372 return advance(); | 657 return advance(); |
373 } else if (identical(next, $EQ)) { | 658 } else if (identical(next, $EQ)) { |
374 appendPrecedenceToken(MINUS_EQ_INFO); | 659 appendPrecedenceToken(MINUS_EQ_INFO); |
375 return advance(); | 660 return advance(); |
376 } else { | 661 } else { |
377 appendPrecedenceToken(MINUS_INFO); | 662 appendPrecedenceToken(MINUS_INFO); |
378 return next; | 663 return next; |
379 } | 664 } |
380 } | 665 } |
381 | 666 |
382 | |
383 int tokenizePlus(int next) { | 667 int tokenizePlus(int next) { |
384 // + ++ += | 668 // + ++ += |
385 next = advance(); | 669 next = advance(); |
386 if (identical($PLUS, next)) { | 670 if (identical($PLUS, next)) { |
387 appendPrecedenceToken(PLUS_PLUS_INFO); | 671 appendPrecedenceToken(PLUS_PLUS_INFO); |
388 return advance(); | 672 return advance(); |
389 } else if (identical($EQ, next)) { | 673 } else if (identical($EQ, next)) { |
390 appendPrecedenceToken(PLUS_EQ_INFO); | 674 appendPrecedenceToken(PLUS_EQ_INFO); |
391 return advance(); | 675 return advance(); |
392 } else { | 676 } else { |
393 appendPrecedenceToken(PLUS_INFO); | 677 appendPrecedenceToken(PLUS_INFO); |
394 return next; | 678 return next; |
395 } | 679 } |
396 } | 680 } |
397 | 681 |
398 int tokenizeExclamation(int next) { | 682 int tokenizeExclamation(int next) { |
399 // ! != !== | 683 // ! != |
| 684 // !== is kept for user-friendly error reporting |
| 685 |
400 next = advance(); | 686 next = advance(); |
401 if (identical(next, $EQ)) { | 687 if (identical(next, $EQ)) { |
402 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); | 688 return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO); |
403 } | 689 } |
404 appendPrecedenceToken(BANG_INFO); | 690 appendPrecedenceToken(BANG_INFO); |
405 return next; | 691 return next; |
406 } | 692 } |
407 | 693 |
408 int tokenizeEquals(int next) { | 694 int tokenizeEquals(int next) { |
409 // = == === | 695 // = == => |
| 696 // === is kept for user-friendly error reporting |
410 | 697 |
411 // Type parameters and arguments cannot contain any token that | 698 // Type parameters and arguments cannot contain any token that |
412 // starts with '='. | 699 // starts with '='. |
413 discardOpenLt(); | 700 discardOpenLt(); |
414 | 701 |
415 next = advance(); | 702 next = advance(); |
416 if (identical(next, $EQ)) { | 703 if (identical(next, $EQ)) { |
417 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); | 704 return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO); |
418 } else if (identical(next, $GT)) { | 705 } else if (identical(next, $GT)) { |
419 appendPrecedenceToken(FUNCTION_INFO); | 706 appendPrecedenceToken(FUNCTION_INFO); |
420 return advance(); | 707 return advance(); |
421 } | 708 } |
422 appendPrecedenceToken(EQ_INFO); | 709 appendPrecedenceToken(EQ_INFO); |
423 return next; | 710 return next; |
424 } | 711 } |
425 | 712 |
426 int tokenizeGreaterThan(int next) { | 713 int tokenizeGreaterThan(int next) { |
427 // > >= >> >>= >>> >>>= | 714 // > >= >> >>= |
428 next = advance(); | 715 next = advance(); |
429 if (identical($EQ, next)) { | 716 if (identical($EQ, next)) { |
430 appendPrecedenceToken(GT_EQ_INFO); | 717 appendPrecedenceToken(GT_EQ_INFO); |
431 return advance(); | 718 return advance(); |
432 } else if (identical($GT, next)) { | 719 } else if (identical($GT, next)) { |
433 next = advance(); | 720 next = advance(); |
434 if (identical($EQ, next)) { | 721 if (identical($EQ, next)) { |
435 appendPrecedenceToken(GT_GT_EQ_INFO); | 722 appendPrecedenceToken(GT_GT_EQ_INFO); |
436 return advance(); | 723 return advance(); |
437 } else { | 724 } else { |
438 appendGtGt(GT_GT_INFO, ">>"); | 725 appendGtGt(GT_GT_INFO); |
439 return next; | 726 return next; |
440 } | 727 } |
441 } else { | 728 } else { |
442 appendGt(GT_INFO, ">"); | 729 appendGt(GT_INFO); |
443 return next; | 730 return next; |
444 } | 731 } |
445 } | 732 } |
446 | 733 |
447 int tokenizeLessThan(int next) { | 734 int tokenizeLessThan(int next) { |
448 // < <= << <<= | 735 // < <= << <<= |
449 next = advance(); | 736 next = advance(); |
450 if (identical($EQ, next)) { | 737 if (identical($EQ, next)) { |
451 appendPrecedenceToken(LT_EQ_INFO); | 738 appendPrecedenceToken(LT_EQ_INFO); |
452 return advance(); | 739 return advance(); |
453 } else if (identical($LT, next)) { | 740 } else if (identical($LT, next)) { |
454 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); | 741 return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO); |
455 } else { | 742 } else { |
456 appendBeginGroup(LT_INFO, "<"); | 743 appendBeginGroup(LT_INFO); |
457 return next; | 744 return next; |
458 } | 745 } |
459 } | 746 } |
460 | 747 |
461 int tokenizeNumber(int next) { | 748 int tokenizeNumber(int next) { |
462 int start = byteOffset; | 749 int start = scanOffset; |
463 while (true) { | 750 while (true) { |
464 next = advance(); | 751 next = advance(); |
465 if ($0 <= next && next <= $9) { | 752 if ($0 <= next && next <= $9) { |
466 continue; | 753 continue; |
467 } else if (identical(next, $e) || identical(next, $E)) { | 754 } else if (identical(next, $e) || identical(next, $E)) { |
468 return tokenizeFractionPart(next, start); | 755 return tokenizeFractionPart(next, start); |
469 } else { | 756 } else { |
470 if (identical(next, $PERIOD)) { | 757 if (identical(next, $PERIOD)) { |
471 int nextnext = peek(); | 758 int nextnext = peek(); |
472 if ($0 <= nextnext && nextnext <= $9) { | 759 if ($0 <= nextnext && nextnext <= $9) { |
473 return tokenizeFractionPart(advance(), start); | 760 return tokenizeFractionPart(advance(), start); |
474 } | 761 } |
475 } | 762 } |
476 appendByteStringToken(INT_INFO, asciiString(start, 0)); | 763 appendSubstringToken(INT_INFO, start, true); |
477 return next; | 764 return next; |
478 } | 765 } |
479 } | 766 } |
480 } | 767 } |
481 | 768 |
482 int tokenizeHexOrNumber(int next) { | 769 int tokenizeHexOrNumber(int next) { |
483 int x = peek(); | 770 int x = peek(); |
484 if (identical(x, $x) || identical(x, $X)) { | 771 if (identical(x, $x) || identical(x, $X)) { |
485 advance(); | 772 return tokenizeHex(next); |
486 return tokenizeHex(x); | |
487 } | 773 } |
488 return tokenizeNumber(next); | 774 return tokenizeNumber(next); |
489 } | 775 } |
490 | 776 |
491 int tokenizeHex(int next) { | 777 int tokenizeHex(int next) { |
492 int start = byteOffset - 1; | 778 int start = scanOffset; |
| 779 next = advance(); // Advance past the $x or $X. |
493 bool hasDigits = false; | 780 bool hasDigits = false; |
494 while (true) { | 781 while (true) { |
495 next = advance(); | 782 next = advance(); |
496 if (($0 <= next && next <= $9) | 783 if (($0 <= next && next <= $9) |
497 || ($A <= next && next <= $F) | 784 || ($A <= next && next <= $F) |
498 || ($a <= next && next <= $f)) { | 785 || ($a <= next && next <= $f)) { |
499 hasDigits = true; | 786 hasDigits = true; |
500 } else { | 787 } else { |
501 if (!hasDigits) { | 788 if (!hasDigits) { |
502 return error(const SourceString("hex digit expected")); | 789 return error("hex digit expected"); |
503 } | 790 } |
504 appendByteStringToken(HEXADECIMAL_INFO, asciiString(start, 0)); | 791 appendSubstringToken(HEXADECIMAL_INFO, start, true); |
505 return next; | 792 return next; |
506 } | 793 } |
507 } | 794 } |
508 } | 795 } |
509 | 796 |
510 int tokenizeDotsOrNumber(int next) { | 797 int tokenizeDotsOrNumber(int next) { |
511 int start = byteOffset; | 798 int start = scanOffset; |
512 next = advance(); | 799 next = advance(); |
513 if (($0 <= next && next <= $9)) { | 800 if (($0 <= next && next <= $9)) { |
514 return tokenizeFractionPart(next, start); | 801 return tokenizeFractionPart(next, start); |
515 } else if (identical($PERIOD, next)) { | 802 } else if (identical($PERIOD, next)) { |
516 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 803 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
517 } else { | 804 } else { |
518 appendPrecedenceToken(PERIOD_INFO); | 805 appendPrecedenceToken(PERIOD_INFO); |
519 return next; | 806 return next; |
520 } | 807 } |
521 } | 808 } |
522 | 809 |
523 int tokenizeFractionPart(int next, int start) { | 810 int tokenizeFractionPart(int next, int start) { |
524 bool done = false; | 811 bool done = false; |
525 bool hasDigit = false; | 812 bool hasDigit = false; |
526 LOOP: while (!done) { | 813 LOOP: while (!done) { |
527 if ($0 <= next && next <= $9) { | 814 if ($0 <= next && next <= $9) { |
528 hasDigit = true; | 815 hasDigit = true; |
529 } else if (identical($e, next) || identical($E, next)) { | 816 } else if (identical($e, next) || identical($E, next)) { |
530 hasDigit = true; | 817 hasDigit = true; |
531 next = tokenizeExponent(advance()); | 818 next = tokenizeExponent(advance()); |
532 done = true; | 819 done = true; |
533 continue LOOP; | 820 continue LOOP; |
534 } else { | 821 } else { |
535 done = true; | 822 done = true; |
536 continue LOOP; | 823 continue LOOP; |
537 } | 824 } |
538 next = advance(); | 825 next = advance(); |
539 } | 826 } |
540 if (!hasDigit) { | 827 if (!hasDigit) { |
541 appendByteStringToken(INT_INFO, asciiString(start, -1)); | 828 // Reduce offset, we already advanced to the token past the period. |
| 829 appendSubstringToken(INT_INFO, start, true, -1); |
| 830 |
| 831 // TODO(ahe): Wrong offset for the period. Cannot call beginToken because |
| 832 // the scanner already advanced past the period. |
542 if (identical($PERIOD, next)) { | 833 if (identical($PERIOD, next)) { |
543 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); | 834 return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO); |
544 } | 835 } |
545 // TODO(ahe): Wrong offset for the period. | |
546 appendPrecedenceToken(PERIOD_INFO); | 836 appendPrecedenceToken(PERIOD_INFO); |
547 return bigSwitch(next); | 837 return next; |
548 } | 838 } |
549 appendByteStringToken(DOUBLE_INFO, asciiString(start, 0)); | 839 appendSubstringToken(DOUBLE_INFO, start, true); |
550 return next; | 840 return next; |
551 } | 841 } |
552 | 842 |
553 int tokenizeExponent(int next) { | 843 int tokenizeExponent(int next) { |
554 if (identical(next, $PLUS) || identical(next, $MINUS)) { | 844 if (identical(next, $PLUS) || identical(next, $MINUS)) { |
555 next = advance(); | 845 next = advance(); |
556 } | 846 } |
557 bool hasDigits = false; | 847 bool hasDigits = false; |
558 while (true) { | 848 while (true) { |
559 if ($0 <= next && next <= $9) { | 849 if ($0 <= next && next <= $9) { |
560 hasDigits = true; | 850 hasDigits = true; |
561 } else { | 851 } else { |
562 if (!hasDigits) { | 852 if (!hasDigits) { |
563 return error(const SourceString("digit expected")); | 853 return error("digit expected"); |
564 } | 854 } |
565 return next; | 855 return next; |
566 } | 856 } |
567 next = advance(); | 857 next = advance(); |
568 } | 858 } |
569 } | 859 } |
570 | 860 |
571 int tokenizeSlashOrComment(int next) { | 861 int tokenizeSlashOrComment(int next) { |
| 862 int start = scanOffset; |
572 next = advance(); | 863 next = advance(); |
573 if (identical($STAR, next)) { | 864 if (identical($STAR, next)) { |
574 return tokenizeMultiLineComment(next); | 865 return tokenizeMultiLineComment(next, start); |
575 } else if (identical($SLASH, next)) { | 866 } else if (identical($SLASH, next)) { |
576 return tokenizeSingleLineComment(next); | 867 return tokenizeSingleLineComment(next, start); |
577 } else if (identical($EQ, next)) { | 868 } else if (identical($EQ, next)) { |
578 appendPrecedenceToken(SLASH_EQ_INFO); | 869 appendPrecedenceToken(SLASH_EQ_INFO); |
579 return advance(); | 870 return advance(); |
580 } else { | 871 } else { |
581 appendPrecedenceToken(SLASH_INFO); | 872 appendPrecedenceToken(SLASH_INFO); |
582 return next; | 873 return next; |
583 } | 874 } |
584 } | 875 } |
585 | 876 |
586 int tokenizeSingleLineComment(int next) { | 877 int tokenizeSingleLineComment(int next, int start) { |
| 878 bool asciiOnly = true; |
587 while (true) { | 879 while (true) { |
588 next = advance(); | 880 next = advance(); |
589 if (identical($LF, next) || identical($CR, next) || identical($EOF, next))
{ | 881 if (next > 127) asciiOnly = false; |
590 appendComment(); | 882 if (identical($LF, next) || |
| 883 identical($CR, next) || |
| 884 identical($EOF, next)) { |
| 885 if (!asciiOnly) handleUnicode(start); |
| 886 appendComment(start, asciiOnly); |
591 return next; | 887 return next; |
592 } | 888 } |
593 } | 889 } |
594 } | 890 } |
595 | 891 |
596 int tokenizeMultiLineComment(int next) { | 892 |
| 893 int tokenizeMultiLineComment(int next, int start) { |
| 894 bool asciiOnlyComment = true; // Track if the entire comment is ASCII. |
| 895 bool asciiOnlyLines = true; // Track ASCII since the last handleUnicode. |
| 896 int unicodeStart = start; |
597 int nesting = 1; | 897 int nesting = 1; |
598 next = advance(); | 898 next = advance(); |
599 while (true) { | 899 while (true) { |
600 if (identical($EOF, next)) { | 900 if (identical($EOF, next)) { |
601 // TODO(ahe): Report error. | 901 if (!asciiOnlyLines) handleUnicode(unicodeStart); |
| 902 appendStringToken(BAD_INPUT_INFO, "unterminated multi-line comment"); |
602 return next; | 903 return next; |
603 } else if (identical($STAR, next)) { | 904 } else if (identical($STAR, next)) { |
604 next = advance(); | 905 next = advance(); |
605 if (identical($SLASH, next)) { | 906 if (identical($SLASH, next)) { |
606 --nesting; | 907 --nesting; |
607 if (0 == nesting) { | 908 if (0 == nesting) { |
| 909 if (!asciiOnlyLines) handleUnicode(unicodeStart); |
608 next = advance(); | 910 next = advance(); |
609 appendComment(); | 911 appendComment(start, asciiOnlyComment); |
610 return next; | 912 return next; |
611 } else { | 913 } else { |
612 next = advance(); | 914 next = advance(); |
613 } | 915 } |
614 } | 916 } |
615 } else if (identical($SLASH, next)) { | 917 } else if (identical($SLASH, next)) { |
616 next = advance(); | 918 next = advance(); |
617 if (identical($STAR, next)) { | 919 if (identical($STAR, next)) { |
618 next = advance(); | 920 next = advance(); |
619 ++nesting; | 921 ++nesting; |
620 } | 922 } |
| 923 } else if (identical(next, $LF)) { |
| 924 if (!asciiOnlyLines) { |
| 925 // Synchronize the string offset in the utf8 scanner. |
| 926 handleUnicode(unicodeStart); |
| 927 asciiOnlyLines = true; |
| 928 unicodeStart = scanOffset; |
| 929 } |
| 930 lineFeedInMultiline(); |
| 931 next = advance(); |
621 } else { | 932 } else { |
| 933 if (next > 127) { |
| 934 asciiOnlyLines = false; |
| 935 asciiOnlyComment = false; |
| 936 } |
622 next = advance(); | 937 next = advance(); |
623 } | 938 } |
624 } | 939 } |
625 } | 940 } |
626 | 941 |
627 int tokenizeRawStringKeywordOrIdentifier(int next) { | 942 int tokenizeRawStringKeywordOrIdentifier(int next) { |
| 943 // [next] is $r. |
628 int nextnext = peek(); | 944 int nextnext = peek(); |
629 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { | 945 if (identical(nextnext, $DQ) || identical(nextnext, $SQ)) { |
630 int start = byteOffset; | 946 int start = scanOffset; |
631 next = advance(); | 947 next = advance(); |
632 return tokenizeString(next, start, true); | 948 return tokenizeString(next, start, true); |
633 } | 949 } |
634 return tokenizeKeywordOrIdentifier(next, true); | 950 return tokenizeKeywordOrIdentifier(next, true); |
635 } | 951 } |
636 | 952 |
637 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { | 953 int tokenizeKeywordOrIdentifier(int next, bool allowDollar) { |
638 KeywordState state = KeywordState.KEYWORD_STATE; | 954 KeywordState state = KeywordState.KEYWORD_STATE; |
639 int start = byteOffset; | 955 int start = scanOffset; |
640 while (state != null && $a <= next && next <= $z) { | 956 while (state != null && $a <= next && next <= $z) { |
641 state = state.next(next); | 957 state = state.next(next); |
642 next = advance(); | 958 next = advance(); |
643 } | 959 } |
644 if (state == null || state.keyword == null) { | 960 if (state == null || state.keyword == null) { |
645 return tokenizeIdentifier(next, start, allowDollar); | 961 return tokenizeIdentifier(next, start, allowDollar); |
646 } | 962 } |
647 if (($A <= next && next <= $Z) || | 963 if (($A <= next && next <= $Z) || |
648 ($0 <= next && next <= $9) || | 964 ($0 <= next && next <= $9) || |
649 identical(next, $_) || | 965 identical(next, $_) || |
650 identical(next, $$)) { | 966 identical(next, $$)) { |
651 return tokenizeIdentifier(next, start, allowDollar); | 967 return tokenizeIdentifier(next, start, allowDollar); |
652 } else if (next < 128) { | 968 } else { |
653 appendKeywordToken(state.keyword); | 969 appendKeywordToken(state.keyword); |
654 return next; | 970 return next; |
655 } else { | |
656 return tokenizeIdentifier(next, start, allowDollar); | |
657 } | 971 } |
658 } | 972 } |
659 | 973 |
| 974 /** |
| 975 * [allowDollar] can exclude '$', which is not allowed as part of a string |
| 976 * interpolation identifier. |
| 977 */ |
660 int tokenizeIdentifier(int next, int start, bool allowDollar) { | 978 int tokenizeIdentifier(int next, int start, bool allowDollar) { |
661 bool isAscii = true; | |
662 | |
663 while (true) { | 979 while (true) { |
664 if (($a <= next && next <= $z) || | 980 if (($a <= next && next <= $z) || |
665 ($A <= next && next <= $Z) || | 981 ($A <= next && next <= $Z) || |
666 ($0 <= next && next <= $9) || | 982 ($0 <= next && next <= $9) || |
667 identical(next, $_) || | 983 identical(next, $_) || |
668 (identical(next, $$) && allowDollar)) { | 984 (identical(next, $$) && allowDollar)) { |
669 next = advance(); | 985 next = advance(); |
670 } else if ((next < 128) || (identical(next, $NBSP))) { | 986 } else { |
671 // Identifier ends here. | 987 // Identifier ends here. |
672 if (start == byteOffset) { | 988 if (start == scanOffset) { |
673 return error(const SourceString("expected identifier")); | 989 return error("expected identifier"); |
674 } else if (isAscii) { | |
675 appendByteStringToken(IDENTIFIER_INFO, asciiString(start, 0)); | |
676 } else { | 990 } else { |
677 appendByteStringToken(BAD_INPUT_INFO, utf8String(start, -1)); | 991 appendSubstringToken(IDENTIFIER_INFO, start, true); |
678 } | 992 } |
679 return next; | 993 return next; |
680 } else { | |
681 int nonAsciiStart = byteOffset; | |
682 do { | |
683 next = nextByte(); | |
684 if (identical(next, $NBSP)) break; | |
685 } while (next > 127); | |
686 String string = utf8String(nonAsciiStart, -1).slowToString(); | |
687 isAscii = false; | |
688 int byteLength = nonAsciiStart - byteOffset; | |
689 addToCharOffset(string.length - byteLength); | |
690 } | 994 } |
691 } | 995 } |
692 } | 996 } |
693 | 997 |
694 int tokenizeAt(int next) { | 998 int tokenizeAt(int next) { |
695 int start = byteOffset; | |
696 next = advance(); | |
697 appendPrecedenceToken(AT_INFO); | 999 appendPrecedenceToken(AT_INFO); |
698 return next; | 1000 return advance(); |
699 } | 1001 } |
700 | 1002 |
701 int tokenizeString(int next, int start, bool raw) { | 1003 int tokenizeString(int next, int start, bool raw) { |
702 int quoteChar = next; | 1004 int quoteChar = next; |
703 next = advance(); | 1005 next = advance(); |
704 if (identical(quoteChar, next)) { | 1006 if (identical(quoteChar, next)) { |
705 next = advance(); | 1007 next = advance(); |
706 if (identical(quoteChar, next)) { | 1008 if (identical(quoteChar, next)) { |
707 // Multiline string. | 1009 // Multiline string. |
708 return tokenizeMultiLineString(quoteChar, start, raw); | 1010 return tokenizeMultiLineString(quoteChar, start, raw); |
709 } else { | 1011 } else { |
710 // Empty string. | 1012 // Empty string. |
711 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 1013 appendSubstringToken(STRING_INFO, start, true); |
712 return next; | 1014 return next; |
713 } | 1015 } |
714 } | 1016 } |
715 if (raw) { | 1017 if (raw) { |
716 return tokenizeSingleLineRawString(next, quoteChar, start); | 1018 return tokenizeSingleLineRawString(next, quoteChar, start); |
717 } else { | 1019 } else { |
718 return tokenizeSingleLineString(next, quoteChar, start); | 1020 return tokenizeSingleLineString(next, quoteChar, start); |
719 } | 1021 } |
720 } | 1022 } |
721 | 1023 |
722 static bool isHexDigit(int character) { | 1024 /** |
 723 if ($0 <= character && character <= $9) return true; | 1025 * [next] is the first character after the quote. |
724 character |= 0x20; | 1026 * [start] is the scanOffset of the quote. |
725 return ($a <= character && character <= $f); | 1027 * |
726 } | 1028 * The token contains a substring of the source file, including the |
727 | 1029 * string quotes, backslashes for escaping. For interpolated strings, |
| 1030 * the parts before and after are separate tokens. |
| 1031 * |
| 1032 * "a $b c" |
| 1033 * |
| 1034 * gives StringToken("a $), StringToken(b) and StringToken( c"). |
| 1035 */ |
728 int tokenizeSingleLineString(int next, int quoteChar, int start) { | 1036 int tokenizeSingleLineString(int next, int quoteChar, int start) { |
| 1037 bool asciiOnly = true; |
729 while (!identical(next, quoteChar)) { | 1038 while (!identical(next, quoteChar)) { |
730 if (identical(next, $BACKSLASH)) { | 1039 if (identical(next, $BACKSLASH)) { |
731 next = advance(); | 1040 next = advance(); |
732 } else if (identical(next, $$)) { | 1041 } else if (identical(next, $$)) { |
733 next = tokenizeStringInterpolation(start); | 1042 if (!asciiOnly) handleUnicode(start); |
734 start = byteOffset; | 1043 next = tokenizeStringInterpolation(start, asciiOnly); |
| 1044 start = scanOffset; |
| 1045 asciiOnly = true; |
735 continue; | 1046 continue; |
736 } | 1047 } |
737 if (next <= $CR | 1048 if (next <= $CR |
738 && (identical(next, $LF) || identical(next, $CR) || identical(next, $E
OF))) { | 1049 && (identical(next, $LF) || |
739 return error(const SourceString("unterminated string literal")); | 1050 identical(next, $CR) || |
| 1051 identical(next, $EOF))) { |
| 1052 if (!asciiOnly) handleUnicode(start); |
| 1053 return error("unterminated string literal"); |
740 } | 1054 } |
| 1055 if (next > 127) asciiOnly = false; |
741 next = advance(); | 1056 next = advance(); |
742 } | 1057 } |
743 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1058 if (!asciiOnly) handleUnicode(start); |
744 return advance(); | 1059 // Advance past the quote character. |
| 1060 next = advance(); |
| 1061 appendSubstringToken(STRING_INFO, start, asciiOnly); |
| 1062 return next; |
745 } | 1063 } |
746 | 1064 |
747 int tokenizeStringInterpolation(int start) { | 1065 int tokenizeStringInterpolation(int start, bool asciiOnly) { |
748 appendByteStringToken(STRING_INFO, utf8String(start, -1)); | 1066 appendSubstringToken(STRING_INFO, start, asciiOnly); |
749 beginToken(); // $ starts here. | 1067 beginToken(); // $ starts here. |
750 int next = advance(); | 1068 int next = advance(); |
751 if (identical(next, $OPEN_CURLY_BRACKET)) { | 1069 if (identical(next, $OPEN_CURLY_BRACKET)) { |
752 return tokenizeInterpolatedExpression(next, start); | 1070 return tokenizeInterpolatedExpression(next); |
753 } else { | 1071 } else { |
754 return tokenizeInterpolatedIdentifier(next, start); | 1072 return tokenizeInterpolatedIdentifier(next); |
755 } | 1073 } |
756 } | 1074 } |
757 | 1075 |
758 int tokenizeInterpolatedExpression(int next, int start) { | 1076 int tokenizeInterpolatedExpression(int next) { |
759 appendBeginGroup(STRING_INTERPOLATION_INFO, "\${"); | 1077 appendBeginGroup(STRING_INTERPOLATION_INFO); |
760 beginToken(); // The expression starts here. | 1078 beginToken(); // The expression starts here. |
761 next = advance(); | 1079 next = advance(); // Move past the curly bracket. |
762 while (!identical(next, $EOF) && !identical(next, $STX)) { | 1080 while (!identical(next, $EOF) && !identical(next, $STX)) { |
763 next = bigSwitch(next); | 1081 next = bigSwitch(next); |
764 } | 1082 } |
765 if (identical(next, $EOF)) return next; | 1083 if (identical(next, $EOF)) return next; |
766 next = advance(); | 1084 next = advance(); // Move past the $STX. |
767 beginToken(); // The string interpolation suffix starts here. | 1085 beginToken(); // The string interpolation suffix starts here. |
768 return next; | 1086 return next; |
769 } | 1087 } |
770 | 1088 |
771 int tokenizeInterpolatedIdentifier(int next, int start) { | 1089 int tokenizeInterpolatedIdentifier(int next) { |
772 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); | 1090 appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO); |
773 beginToken(); // The identifier starts here. | 1091 beginToken(); // The identifier starts here. |
774 next = tokenizeKeywordOrIdentifier(next, false); | 1092 next = tokenizeKeywordOrIdentifier(next, false); |
775 beginToken(); // The string interpolation suffix starts here. | 1093 beginToken(); // The string interpolation suffix starts here. |
776 return next; | 1094 return next; |
777 } | 1095 } |
778 | 1096 |
779 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { | 1097 int tokenizeSingleLineRawString(int next, int quoteChar, int start) { |
780 next = advance(); | 1098 bool asciiOnly = true; |
| 1099 next = advance(); // Advance past the quote |
781 while (next != $EOF) { | 1100 while (next != $EOF) { |
782 if (identical(next, quoteChar)) { | 1101 if (identical(next, quoteChar)) { |
783 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1102 if (!asciiOnly) handleUnicode(start); |
784 return advance(); | 1103 next = advance(); |
| 1104 appendSubstringToken(STRING_INFO, start, asciiOnly); |
| 1105 return next; |
785 } else if (identical(next, $LF) || identical(next, $CR)) { | 1106 } else if (identical(next, $LF) || identical(next, $CR)) { |
786 return error(const SourceString("unterminated string literal")); | 1107 if (!asciiOnly) handleUnicode(start); |
| 1108 return error("unterminated string literal"); |
| 1109 } else if (next > 127) { |
| 1110 asciiOnly = false; |
787 } | 1111 } |
788 next = advance(); | 1112 next = advance(); |
789 } | 1113 } |
790 return error(const SourceString("unterminated string literal")); | 1114 if (!asciiOnly) handleUnicode(start); |
| 1115 return error("unterminated string literal"); |
791 } | 1116 } |
792 | 1117 |
793 int tokenizeMultiLineRawString(int quoteChar, int start) { | 1118 int tokenizeMultiLineRawString(int quoteChar, int start) { |
794 int next = advance(); | 1119 bool asciiOnlyString = true; |
| 1120 bool asciiOnlyLine = true; |
| 1121 int unicodeStart = start; |
| 1122 int next = advance(); // Advance past the (last) quote (of three) |
795 outer: while (!identical(next, $EOF)) { | 1123 outer: while (!identical(next, $EOF)) { |
796 while (!identical(next, quoteChar)) { | 1124 while (!identical(next, quoteChar)) { |
| 1125 if (identical(next, $LF)) { |
| 1126 if (!asciiOnlyLine) { |
| 1127 // Synchronize the string offset in the utf8 scanner. |
| 1128 handleUnicode(unicodeStart); |
| 1129 asciiOnlyLine = true; |
| 1130 unicodeStart = scanOffset; |
| 1131 } |
| 1132 lineFeedInMultiline(); |
| 1133 } else if (next > 127) { |
| 1134 asciiOnlyLine = false; |
| 1135 asciiOnlyString = false; |
| 1136 } |
797 next = advance(); | 1137 next = advance(); |
798 if (identical(next, $EOF)) break outer; | 1138 if (identical(next, $EOF)) break outer; |
799 } | 1139 } |
800 next = advance(); | 1140 next = advance(); |
801 if (identical(next, quoteChar)) { | 1141 if (identical(next, quoteChar)) { |
802 next = advance(); | 1142 next = advance(); |
803 if (identical(next, quoteChar)) { | 1143 if (identical(next, quoteChar)) { |
804 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1144 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
805 return advance(); | 1145 next = advance(); |
| 1146 appendSubstringToken(STRING_INFO, start, asciiOnlyString); |
| 1147 return next; |
806 } | 1148 } |
807 } | 1149 } |
808 } | 1150 } |
809 return error(const SourceString("unterminated string literal")); | 1151 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 1152 return error("unterminated string literal"); |
810 } | 1153 } |
811 | 1154 |
812 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { | 1155 int tokenizeMultiLineString(int quoteChar, int start, bool raw) { |
813 if (raw) return tokenizeMultiLineRawString(quoteChar, start); | 1156 if (raw) return tokenizeMultiLineRawString(quoteChar, start); |
814 int next = advance(); | 1157 bool asciiOnlyString = true; |
| 1158 bool asciiOnlyLine = true; |
| 1159 int unicodeStart = start; |
| 1160 int next = advance(); // Advance past the (last) quote (of three). |
815 while (!identical(next, $EOF)) { | 1161 while (!identical(next, $EOF)) { |
816 if (identical(next, $$)) { | 1162 if (identical(next, $$)) { |
817 next = tokenizeStringInterpolation(start); | 1163 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
818 start = byteOffset; | 1164 next = tokenizeStringInterpolation(start, asciiOnlyString); |
| 1165 start = scanOffset; |
| 1166 unicodeStart = start; |
| 1167 asciiOnlyString = true; // A new string token is created for the rest. |
| 1168 asciiOnlyLine = true; |
819 continue; | 1169 continue; |
820 } | 1170 } |
821 if (identical(next, quoteChar)) { | 1171 if (identical(next, quoteChar)) { |
822 next = advance(); | 1172 next = advance(); |
823 if (identical(next, quoteChar)) { | 1173 if (identical(next, quoteChar)) { |
824 next = advance(); | 1174 next = advance(); |
825 if (identical(next, quoteChar)) { | 1175 if (identical(next, quoteChar)) { |
826 appendByteStringToken(STRING_INFO, utf8String(start, 0)); | 1176 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
827 return advance(); | 1177 next = advance(); |
| 1178 appendSubstringToken(STRING_INFO, start, asciiOnlyString); |
| 1179 return next; |
828 } | 1180 } |
829 } | 1181 } |
830 continue; | 1182 continue; |
831 } | 1183 } |
832 if (identical(next, $BACKSLASH)) { | 1184 if (identical(next, $BACKSLASH)) { |
833 next = advance(); | 1185 next = advance(); |
834 if (identical(next, $EOF)) break; | 1186 if (identical(next, $EOF)) break; |
835 } | 1187 } |
| 1188 if (identical(next, $LF)) { |
| 1189 if (!asciiOnlyLine) { |
| 1190 // Synchronize the string offset in the utf8 scanner. |
| 1191 handleUnicode(unicodeStart); |
| 1192 asciiOnlyLine = true; |
| 1193 unicodeStart = scanOffset; |
| 1194 } |
| 1195 lineFeedInMultiline(); |
| 1196 } else if (next > 127) { |
| 1197 asciiOnlyString = false; |
| 1198 asciiOnlyLine = false; |
| 1199 } |
836 next = advance(); | 1200 next = advance(); |
837 } | 1201 } |
838 return error(const SourceString("unterminated string literal")); | 1202 if (!asciiOnlyLine) handleUnicode(unicodeStart); |
| 1203 return error("unterminated string literal"); |
839 } | 1204 } |
840 | 1205 |
841 int error(SourceString message) { | 1206 int error(String message) { |
842 appendByteStringToken(BAD_INPUT_INFO, message); | 1207 appendStringToken(BAD_INPUT_INFO, message); |
843 return advance(); // Ensure progress. | 1208 return advance(); // Ensure progress. |
844 } | 1209 } |
| 1210 |
| 1211 void unmatchedBeginGroup(BeginGroupToken begin) { |
| 1212 String error = 'unmatched "${begin.stringValue}"'; |
| 1213 Token close = |
| 1214 new StringToken.fromString( |
| 1215 BAD_INPUT_INFO, error, begin.charOffset, true); |
| 1216 |
| 1217 // We want to ensure that unmatched BeginGroupTokens are reported |
| 1218 // as errors. However, the rest of the parser assume the groups |
| 1219 // are well-balanced and will never look at the endGroup |
| 1220 // token. This is a nice property that allows us to skip quickly |
| 1221 // over correct code. By inserting an additional error token in |
| 1222 // the stream, we can keep ignoring endGroup tokens. |
| 1223 // |
| 1224 // [begin] --next--> [tail] |
| 1225 // [begin] --endG--> [close] --next--> [next] --next--> [tail] |
| 1226 // |
| 1227 // This allows the parser to skip from [begin] via endGroup to [close] and |
| 1228 // ignore the [close] token (assuming it's correct), then the error will be |
| 1229 // reported when parsing the [next] token. |
| 1230 |
| 1231 Token next = new StringToken.fromString( |
| 1232 BAD_INPUT_INFO, error, begin.charOffset, true); |
| 1233 begin.endGroup = close; |
| 1234 close.next = next; |
| 1235 next.next = begin.next; |
| 1236 } |
845 } | 1237 } |
OLD | NEW |