| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 part of scanner; | |
| 6 | |
/// Public entry point of the scanner library. The factory constructor picks
/// the concrete scanner that matches the representation of the source file.
abstract class Scanner {
  /// Tokenizes the source and returns the resulting token stream.
  Token tokenize();

  /// Creates a [Utf8BytesScanner] when [file] is a [Utf8BytesSourceFile] and
  /// a [StringScanner] for every other kind of [SourceFile].
  factory Scanner(SourceFile file, {bool includeComments: false}) {
    return (file is Utf8BytesSourceFile)
        ? new Utf8BytesScanner(file, includeComments: includeComments)
        : new StringScanner(file, includeComments: includeComments);
  }
}
| 18 | |
| 19 abstract class AbstractScanner implements Scanner { | |
| 20 // TODO(ahe): Move this class to implementation. | |
| 21 | |
| 22 final bool includeComments; | |
| 23 | |
| 24 /** | |
| 25 * The string offset for the next token that will be created. | |
| 26 * | |
| 27 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values | |
| 28 * are different. One string character can be encoded using multiple UTF-8 | |
| 29 * bytes. | |
| 30 */ | |
| 31 int tokenStart = -1; | |
| 32 | |
| 33 /** | |
| 34 * A pointer to the token stream created by this scanner. The first token | |
| 35 * is a special token and not part of the source file. This is an | |
| 36 * implementation detail to avoid special cases in the scanner. This token | |
| 37 * is not exposed to clients of the scanner, which are expected to invoke | |
| 38 * [firstToken] to access the token stream. | |
| 39 */ | |
| 40 final Token tokens = new SymbolToken(EOF_INFO, -1); | |
| 41 | |
| 42 /** | |
| 43 * A pointer to the last scanned token. | |
| 44 */ | |
| 45 Token tail; | |
| 46 | |
| 47 /** | |
| 48 * The source file that is being scanned. This field can be [:null:]. | |
| 49 * If the source file is available, the scanner assigns its [:lineStarts:] and | |
| 50 * [:length:] fields at the end of [tokenize]. | |
| 51 */ | |
| 52 final SourceFile file; | |
| 53 | |
| 54 final List<int> lineStarts = <int>[0]; | |
| 55 | |
/// Creates a scanner for [file]. The token list initially holds only the
/// sentinel stored in [tokens], so [tail] starts out pointing at it.
AbstractScanner(this.file, this.includeComments) {
  tail = tokens;
}
| 59 | |
| 60 /** | |
| 61 * Advances and returns the next character. | |
| 62 * | |
| 63 * If the next character is non-ASCII, then the returned value depends on the | |
| 64 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while | |
| 65 * the [StringScanner] returns a UTF-16 code unit. | |
| 66 * | |
| 67 * The scanner ensures that [advance] is not invoked after it returned [$EOF]. | |
| 68 * This allows implementations to omit bound checks if the data structure ends | |
| 69 * with '0'. | |
| 70 */ | |
| 71 int advance(); | |
| 72 | |
| 73 /** | |
| 74 * Returns the current unicode character. | |
| 75 * | |
| 76 * If the current character is ASCII, then it is returned unchanged. | |
| 77 * | |
| 78 * The [Utf8BytesScanner] decodes the next unicode code point starting at the | |
| 79 * current position. Note that every unicode character is returned as a single | |
| 80 * code point, that is, for '\u{1d11e}' it returns 119070, and the following | |
| 81 * [advance] returns the next character. | |
| 82 * | |
| 83 * The [StringScanner] returns the current character unchanged, which might | |
| 84 * be a surrogate character. In the case of '\u{1d11e}', it returns the first | |
| 85 * code unit 55348, and the following [advance] returns the second code unit | |
| 86 * 56606. | |
| 87 * | |
| 88 * Invoking [currentAsUnicode] multiple times is safe, i.e., | |
| 89 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):]. | |
| 90 */ | |
| 91 int currentAsUnicode(int next); | |
| 92 | |
| 93 /** | |
| 94 * Returns the character at the next position. Like in [advance], the | |
| 95 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns | |
| 96 * a UTF-16 code unit. | |
| 97 */ | |
| 98 int peek(); | |
| 99 | |
| 100 /** | |
| 101 * Notifies the scanner that unicode characters were detected in either a | |
| 102 * comment or a string literal between [startScanOffset] and the current | |
| 103 * scan offset. | |
| 104 */ | |
| 105 void handleUnicode(int startScanOffset); | |
| 106 | |
| 107 /** | |
| 108 * Returns the current scan offset. | |
| 109 * | |
| 110 * In the [Utf8BytesScanner] this is the offset into the byte list, in the | |
| 111 * [StringScanner] the offset in the source string. | |
| 112 */ | |
| 113 int get scanOffset; | |
| 114 | |
| 115 /** | |
| 116 * Returns the current string offset. | |
| 117 * | |
| 118 * In the [StringScanner] this is identical to the [scanOffset]. In the | |
| 119 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters. | |
| 120 */ | |
| 121 int get stringOffset; | |
| 122 | |
| 123 /** | |
| 124 * Returns the first token scanned by this [Scanner]. | |
| 125 */ | |
| 126 Token firstToken(); | |
| 127 | |
| 128 /** | |
| 129 * Returns the last token scanned by this [Scanner]. | |
| 130 */ | |
| 131 Token previousToken(); | |
| 132 | |
/// Marks the start of a new token by recording the current [stringOffset]
/// in [tokenStart].
void beginToken() {
  this.tokenStart = this.stringOffset;
}
| 139 | |
| 140 /** | |
| 141 * Appends a substring from the scan offset [:start:] to the current | |
| 142 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current | |
| 143 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the | |
| 144 * substring [5,9). | |
| 145 * | |
| 146 * Note that [extraOffset] can only be used if the covered character(s) are | |
| 147 * known to be ASCII. | |
| 148 */ | |
| 149 void appendSubstringToken(PrecedenceInfo info, int start, | |
| 150 bool asciiOnly, [int extraOffset]); | |
| 151 | |
| 152 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 153 void appendPrecedenceToken(PrecedenceInfo info); | |
| 154 | |
| 155 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 156 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no); | |
| 157 | |
| 158 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 159 void appendKeywordToken(Keyword keyword); | |
| 160 | |
| 161 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 162 void appendEofToken(); | |
| 163 | |
| 164 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 165 void appendWhiteSpace(int next); | |
| 166 | |
| 167 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 168 void lineFeedInMultiline(); | |
| 169 | |
| 170 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 171 void appendBeginGroup(PrecedenceInfo info); | |
| 172 | |
| 173 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 174 int appendEndGroup(PrecedenceInfo info, int openKind); | |
| 175 | |
| 176 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 177 void appendGt(PrecedenceInfo info); | |
| 178 | |
| 179 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 180 void appendGtGt(PrecedenceInfo info); | |
| 181 | |
| 182 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 183 void appendComment(start, bool asciiOnly); | |
| 184 | |
| 185 /// Append [token] to the token stream. | |
| 186 void appendErrorToken(ErrorToken token); | |
| 187 | |
| 188 /** Documentation in subclass [ArrayBasedScanner]. */ | |
| 189 void discardOpenLt(); | |
| 190 | |
| 191 /// Return true when at EOF. | |
| 192 bool atEndOfFile(); | |
| 193 | |
/// Scans the whole input and returns [firstToken].
///
/// An `$EOF` character that shows up before [atEndOfFile] is true is passed
/// to [unexpected] and scanning resumes behind it. When a [file] is attached,
/// its length and line starts are filled in once scanning has finished.
Token tokenize() {
  while (!atEndOfFile()) {
    int character = advance();
    while (!identical(character, $EOF)) {
      character = bigSwitch(character);
    }
    if (!atEndOfFile()) {
      // A stray $EOF in the middle of the input.
      unexpected($EOF);
    } else {
      appendEofToken();
    }
  }

  if (file == null) return firstToken();

  file.length = stringOffset;
  // One additional line start at the end, see [SourceFile.lineStarts].
  lineStarts.add(stringOffset + 1);
  file.lineStarts = lineStarts;
  return firstToken();
}
| 216 | |
/// Scans the token (or whitespace) that starts with [next] and returns the
/// character at which scanning has to continue.
///
/// The method is one long chain of comparisons on [next]: whitespace,
/// identifiers and keywords, operators, grouping characters, strings and
/// numbers are each handed to a dedicated helper. `$EOF` is returned
/// unchanged, and anything that cannot start a token goes to [unexpected].
int bigSwitch(int next) {
  beginToken();
  if (identical(next, $SPACE) || identical(next, $TAB)
      || identical(next, $LF) || identical(next, $CR)) {
    appendWhiteSpace(next);
    next = advance();
    // Sequences of spaces are common, so advance through them fast.
    while (identical(next, $SPACE)) {
      // We don't invoke [:appendWhiteSpace(next):] here for efficiency,
      // assuming that it does not do anything for space characters.
      next = advance();
    }
    return next;
  }

  if ($a <= next && next <= $z) {
    // Only a lower-case 'r' can introduce a raw string.
    if (identical($r, next)) {
      return tokenizeRawStringKeywordOrIdentifier(next);
    }
    return tokenizeKeywordOrIdentifier(next, true);
  }

  if (($A <= next && next <= $Z) ||
      identical(next, $_) ||
      identical(next, $$)) {
    return tokenizeIdentifier(next, scanOffset, true);
  }

  if (identical(next, $LT)) {
    return tokenizeLessThan(next);
  }

  if (identical(next, $GT)) {
    return tokenizeGreaterThan(next);
  }

  if (identical(next, $EQ)) {
    return tokenizeEquals(next);
  }

  if (identical(next, $BANG)) {
    return tokenizeExclamation(next);
  }

  if (identical(next, $PLUS)) {
    return tokenizePlus(next);
  }

  if (identical(next, $MINUS)) {
    return tokenizeMinus(next);
  }

  if (identical(next, $STAR)) {
    return tokenizeMultiply(next);
  }

  if (identical(next, $PERCENT)) {
    return tokenizePercent(next);
  }

  if (identical(next, $AMPERSAND)) {
    return tokenizeAmpersand(next);
  }

  if (identical(next, $BAR)) {
    return tokenizeBar(next);
  }

  if (identical(next, $CARET)) {
    return tokenizeCaret(next);
  }

  if (identical(next, $OPEN_SQUARE_BRACKET)) {
    return tokenizeOpenSquareBracket(next);
  }

  if (identical(next, $TILDE)) {
    return tokenizeTilde(next);
  }

  if (identical(next, $BACKSLASH)) {
    appendPrecedenceToken(BACKSLASH_INFO);
    return advance();
  }

  if (identical(next, $HASH)) {
    return tokenizeTag(next);
  }

  if (identical(next, $OPEN_PAREN)) {
    appendBeginGroup(OPEN_PAREN_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_PAREN)) {
    return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
  }

  if (identical(next, $COMMA)) {
    appendPrecedenceToken(COMMA_INFO);
    return advance();
  }

  if (identical(next, $COLON)) {
    appendPrecedenceToken(COLON_INFO);
    return advance();
  }

  if (identical(next, $SEMICOLON)) {
    appendPrecedenceToken(SEMICOLON_INFO);
    // Type parameters and arguments cannot contain semicolon.
    discardOpenLt();
    return advance();
  }

  if (identical(next, $QUESTION)) {
    appendPrecedenceToken(QUESTION_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    return appendEndGroup(CLOSE_SQUARE_BRACKET_INFO,
                          OPEN_SQUARE_BRACKET_TOKEN);
  }

  if (identical(next, $BACKPING)) {
    appendPrecedenceToken(BACKPING_INFO);
    return advance();
  }

  if (identical(next, $OPEN_CURLY_BRACKET)) {
    appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
    return advance();
  }

  if (identical(next, $CLOSE_CURLY_BRACKET)) {
    return appendEndGroup(CLOSE_CURLY_BRACKET_INFO,
                          OPEN_CURLY_BRACKET_TOKEN);
  }

  if (identical(next, $SLASH)) {
    return tokenizeSlashOrComment(next);
  }

  if (identical(next, $AT)) {
    return tokenizeAt(next);
  }

  if (identical(next, $DQ) || identical(next, $SQ)) {
    return tokenizeString(next, scanOffset, false);
  }

  if (identical(next, $PERIOD)) {
    return tokenizeDotsOrNumber(next);
  }

  // A leading zero may introduce a hexadecimal literal.
  if (identical(next, $0)) {
    return tokenizeHexOrNumber(next);
  }

  // Same range test as the digit checks in the number helpers; '0' was
  // handled just above.
  if ($1 <= next && next <= $9) {
    return tokenizeNumber(next);
  }

  if (identical(next, $EOF)) {
    return $EOF;
  }
  if (next < 0x1f) {
    return unexpected(next);
  }

  next = currentAsUnicode(next);

  // The following are non-ASCII characters.

  if (identical(next, $NBSP)) {
    appendWhiteSpace(next);
    return advance();
  }

  return unexpected(next);
}
| 402 | |
/// Scans `#`, or skips a `#!` line when it starts at scan offset 0.
///
/// For the `#!` case no token is appended here; the characters up to the
/// first `$LF`, `$CR` or `$EOF` are consumed and that terminator is returned.
int tokenizeTag(int next) {
  // # or #!.*[\n\r]
  if (scanOffset == 0 && identical(peek(), $BANG)) {
    int start = scanOffset + 1;
    bool sawNonAscii = false;
    while (true) {
      next = advance();
      if (next > 127) sawNonAscii = true;
      if (identical(next, $LF) ||
          identical(next, $CR) ||
          identical(next, $EOF)) {
        break;
      }
    }
    if (sawNonAscii) handleUnicode(start);
    return next;
  }
  appendPrecedenceToken(HASH_INFO);
  return advance();
}
| 422 | |
/// Scans `~`, `~/` or `~/=`.
///
/// A lone `~` is emitted directly; once a `/` has been seen, [select] decides
/// between [TILDE_SLASH_EQ_INFO] and [TILDE_SLASH_INFO].
int tokenizeTilde(int next) {
  next = advance();
  if (!identical(next, $SLASH)) {
    appendPrecedenceToken(TILDE_INFO);
    return next;
  }
  return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
}
| 433 | |
/// Scans `[`, `[]` or `[]=`.
///
/// `[]` and `[]=` are only treated as one token when the previous token is
/// the keyword `operator` or a `#` symbol. Otherwise `[` opens a group and
/// the character after it is returned.
int tokenizeOpenSquareBracket(int next) {
  next = advance();
  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    Token previous = previousToken();
    final bool afterOperatorKeyword =
        previous is KeywordToken && previous.keyword.syntax == 'operator';
    if (afterOperatorKeyword ||
        (previous is SymbolToken && previous.info == HASH_INFO)) {
      return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
    }
  }
  appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
  return next;
}
| 447 | |
/// Scans `^` or `^=`; [select] chooses between the two token kinds with
/// `$EQ` as the deciding character.
int tokenizeCaret(int next) => select($EQ, CARET_EQ_INFO, CARET_INFO);
| 452 | |
/// Scans `|`, `||` or `|=` and returns the first character after the token.
int tokenizeBar(int next) {
  next = advance();
  if (identical(next, $BAR)) {
    appendPrecedenceToken(BAR_BAR_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(BAR_EQ_INFO);
    return advance();
  }
  // A single '|': [next] already belongs to the following token.
  appendPrecedenceToken(BAR_INFO);
  return next;
}
| 467 | |
/// Scans `&`, `&&` or `&=` and returns the first character after the token.
int tokenizeAmpersand(int next) {
  next = advance();
  if (identical(next, $AMPERSAND)) {
    appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(AMPERSAND_EQ_INFO);
    return advance();
  }
  // A single '&': [next] already belongs to the following token.
  appendPrecedenceToken(AMPERSAND_INFO);
  return next;
}
| 482 | |
/// Scans `%` or `%=`; [select] chooses between the two token kinds with
/// `$EQ` as the deciding character.
int tokenizePercent(int next) => select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);
| 487 | |
/// Scans `*` or `*=`; [select] chooses between the two token kinds with
/// `$EQ` as the deciding character.
int tokenizeMultiply(int next) => select($EQ, STAR_EQ_INFO, STAR_INFO);
| 492 | |
/// Scans `-`, `--` or `-=` and returns the first character after the token.
int tokenizeMinus(int next) {
  next = advance();
  if (identical(next, $MINUS)) {
    appendPrecedenceToken(MINUS_MINUS_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    appendPrecedenceToken(MINUS_EQ_INFO);
    return advance();
  }
  // A single '-': [next] already belongs to the following token.
  appendPrecedenceToken(MINUS_INFO);
  return next;
}
| 507 | |
/// Scans `+`, `++` or `+=` and returns the first character after the token.
int tokenizePlus(int next) {
  next = advance();
  if (identical($PLUS, next)) {
    appendPrecedenceToken(PLUS_PLUS_INFO);
    return advance();
  }
  if (identical($EQ, next)) {
    appendPrecedenceToken(PLUS_EQ_INFO);
    return advance();
  }
  // A single '+': [next] already belongs to the following token.
  appendPrecedenceToken(PLUS_INFO);
  return next;
}
| 522 | |
/// Scans `!` or `!=`.
///
/// `!==` is still recognized so that a user-friendly error can be reported
/// for it.
int tokenizeExclamation(int next) {
  next = advance();
  if (!identical(next, $EQ)) {
    appendPrecedenceToken(BANG_INFO);
    return next;
  }
  return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
}
| 534 | |
/// Scans `=`, `==` or `=>`.
///
/// `===` is still recognized so that a user-friendly error can be reported
/// for it.
int tokenizeEquals(int next) {
  // Type parameters and arguments cannot contain any token that starts
  // with '=', so pending '<' candidates are dropped first.
  discardOpenLt();

  next = advance();
  if (identical(next, $GT)) {
    appendPrecedenceToken(FUNCTION_INFO);
    return advance();
  }
  if (identical(next, $EQ)) {
    return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
  }
  appendPrecedenceToken(EQ_INFO);
  return next;
}
| 553 | |
/// Scans `>`, `>=`, `>>` or `>>=`.
///
/// `>` and `>>` go through [appendGt] and [appendGtGt] rather than
/// [appendPrecedenceToken].
int tokenizeGreaterThan(int next) {
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_EQ_INFO);
    return advance();
  }
  if (!identical($GT, next)) {
    appendGt(GT_INFO);
    return next;
  }
  // Two '>' seen so far.
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_GT_EQ_INFO);
    return advance();
  }
  appendGtGt(GT_GT_INFO);
  return next;
}
| 574 | |
/// Scans `<`, `<=`, `<<` or `<<=`.
///
/// A lone `<` is appended with [appendBeginGroup].
int tokenizeLessThan(int next) {
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(LT_EQ_INFO);
    return advance();
  }
  if (identical($LT, next)) {
    return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
  }
  appendBeginGroup(LT_INFO);
  return next;
}
| 588 | |
/// Scans a decimal number whose first digit is [next].
///
/// After the integer digits, an `e`/`E` or a `.` that is followed by a digit
/// hands over to [tokenizeFractionPart]. Otherwise an [INT_INFO] token is
/// appended and the character after the digits is returned.
int tokenizeNumber(int next) {
  final int start = scanOffset;
  do {
    next = advance();
  } while ($0 <= next && next <= $9);

  if (identical(next, $e) || identical(next, $E)) {
    return tokenizeFractionPart(next, start);
  }
  if (identical(next, $PERIOD)) {
    // Only treat the period as a decimal point when a digit follows it.
    int lookahead = peek();
    if ($0 <= lookahead && lookahead <= $9) {
      return tokenizeFractionPart(advance(), start);
    }
  }
  appendSubstringToken(INT_INFO, start, true);
  return next;
}
| 610 | |
/// Scans a number that starts with `0`: a hexadecimal literal when the next
/// character is `x` or `X`, a decimal number otherwise.
int tokenizeHexOrNumber(int next) {
  final int marker = peek();
  final bool isHex = identical(marker, $x) || identical(marker, $X);
  return isHex ? tokenizeHex(next) : tokenizeNumber(next);
}
| 618 | |
/// Scans a hexadecimal literal; the scanner is positioned on the leading `0`.
///
/// Appends a [HEXADECIMAL_INFO] token when at least one hex digit follows the
/// `x`/`X`, and calls [unterminated] for a bare `0x` otherwise. Either way
/// the first character that is not a hex digit is returned.
int tokenizeHex(int next) {
  final int start = scanOffset;
  advance(); // Advance past the $x or $X.
  bool sawDigit = false;
  next = advance();
  while (($0 <= next && next <= $9) ||
         ($A <= next && next <= $F) ||
         ($a <= next && next <= $f)) {
    sawDigit = true;
    next = advance();
  }
  if (sawDigit) {
    appendSubstringToken(HEXADECIMAL_INFO, start, true);
  } else {
    unterminated('0x', shouldAdvance: false);
  }
  return next;
}
| 640 | |
/// Scans a token that starts with `.`: a fraction such as `.5`, the `..` or
/// `...` tokens, or a single period.
int tokenizeDotsOrNumber(int next) {
  final int start = scanOffset;
  next = advance();
  if ($0 <= next && next <= $9) {
    return tokenizeFractionPart(next, start);
  }
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}
| 653 | |
/// Scans the fraction and/or exponent of a number that began at [start];
/// [next] is the first character of that part.
///
/// Appends a [DOUBLE_INFO] token on success. An exponent without digits is
/// reported through [unterminated]. If neither a digit nor an exponent is
/// found, an [INT_INFO] token is appended for the text before the period,
/// followed by the period token(s).
int tokenizeFractionPart(int next, int start) {
  bool sawDigit = false;
  while ($0 <= next && next <= $9) {
    sawDigit = true;
    next = advance();
  }

  if (identical($e, next) || identical($E, next)) {
    // An exponent counts as "digits" for the purpose of the check below.
    sawDigit = true;
    next = advance();
    if (identical(next, $PLUS) || identical(next, $MINUS)) {
      next = advance();
    }
    bool sawExponentDigit = false;
    while ($0 <= next && next <= $9) {
      sawExponentDigit = true;
      next = advance();
    }
    if (!sawExponentDigit) {
      unterminated('1e', shouldAdvance: false);
      return next;
    }
  }

  if (sawDigit) {
    appendSubstringToken(DOUBLE_INFO, start, true);
    return next;
  }

  // Reduce offset, we already advanced to the token past the period.
  appendSubstringToken(INT_INFO, start, true, -1);

  // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
  // the scanner already advanced past the period.
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}
| 703 | |
/// Scans a token that starts with `/`: a multi-line comment, a single-line
/// comment, `/=` or a plain `/`.
int tokenizeSlashOrComment(int next) {
  final int start = scanOffset;
  next = advance();
  if (identical($STAR, next)) {
    return tokenizeMultiLineComment(next, start);
  }
  if (identical($SLASH, next)) {
    return tokenizeSingleLineComment(next, start);
  }
  if (identical($EQ, next)) {
    appendPrecedenceToken(SLASH_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(SLASH_INFO);
  return next;
}
| 719 | |
/// Scans a `//` comment that began at scan offset [start].
///
/// Consumes characters up to the first `$LF`, `$CR` or `$EOF`, passes the
/// comment to [appendComment] and returns that terminating character.
int tokenizeSingleLineComment(int next, int start) {
  bool asciiOnly = true;
  do {
    next = advance();
    if (next > 127) asciiOnly = false;
  } while (!identical($LF, next) &&
           !identical($CR, next) &&
           !identical($EOF, next));
  if (!asciiOnly) handleUnicode(start);
  appendComment(start, asciiOnly);
  return next;
}
| 735 | |
| 736 | |
/// Scans a `/* ... */` comment that began at scan offset [start]; nested
/// comments are tracked with a depth counter.
///
/// Returns the character after the closing `*/`. If `$EOF` is reached first,
/// [unterminated] is called and `$EOF` is returned.
int tokenizeMultiLineComment(int next, int start) {
  bool wholeCommentAscii = true; // No non-ASCII character in the comment.
  bool pendingAscii = true; // No non-ASCII since the last handleUnicode.
  int unicodeStart = start;
  int depth = 1;
  next = advance();
  while (true) {
    if (identical($EOF, next)) {
      if (!pendingAscii) handleUnicode(unicodeStart);
      unterminated('/*');
      return next;
    }

    if (identical($STAR, next)) {
      next = advance();
      if (!identical($SLASH, next)) continue;
      // Saw "*/": one nesting level is closed.
      depth--;
      if (depth == 0) {
        if (!pendingAscii) handleUnicode(unicodeStart);
        next = advance();
        appendComment(start, wholeCommentAscii);
        return next;
      }
      next = advance();
      continue;
    }

    if (identical($SLASH, next)) {
      next = advance();
      if (identical($STAR, next)) {
        // Saw "/*": a nested comment opens.
        next = advance();
        depth++;
      }
      continue;
    }

    if (identical(next, $LF)) {
      if (!pendingAscii) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(unicodeStart);
        pendingAscii = true;
        unicodeStart = scanOffset;
      }
      lineFeedInMultiline();
      next = advance();
      continue;
    }

    if (next > 127) {
      pendingAscii = false;
      wholeCommentAscii = false;
    }
    next = advance();
  }
}
| 786 | |
/// Handles a token that starts with `r`: a raw string when a quote follows
/// directly, a keyword or identifier otherwise.
int tokenizeRawStringKeywordOrIdentifier(int next) {
  // [next] is $r.
  final int following = peek();
  if (!identical(following, $DQ) && !identical(following, $SQ)) {
    return tokenizeKeywordOrIdentifier(next, true);
  }
  // The string token starts at the 'r', so remember that offset first.
  final int start = scanOffset;
  return tokenizeString(advance(), start, true);
}
| 797 | |
/// Scans a word that starts with a lower-case letter and appends either a
/// keyword token or, via [tokenizeIdentifier], an identifier token.
///
/// Lower-case letters are fed through the [KeywordState] machine. The word
/// is a keyword only if the machine ends in a state with a keyword and the
/// following character cannot continue an identifier.
int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
  KeywordState state = KeywordState.KEYWORD_STATE;
  final int start = scanOffset;
  while (state != null && $a <= next && next <= $z) {
    state = state.next(next);
    next = advance();
  }

  final bool continuesAsIdentifier = ($A <= next && next <= $Z) ||
      ($0 <= next && next <= $9) ||
      identical(next, $_) ||
      identical(next, $$);
  if (state == null || state.keyword == null || continuesAsIdentifier) {
    return tokenizeIdentifier(next, start, allowDollar);
  }
  appendKeywordToken(state.keyword);
  return next;
}
| 818 | |
/**
 * Scans the rest of an identifier that began at scan offset [start] and
 * appends an [IDENTIFIER_INFO] token for it.
 *
 * [allowDollar] can exclude '$', which is not allowed as part of a string
 * interpolation identifier. If no character was consumed since [start],
 * [next] is passed to [unexpected] instead.
 */
int tokenizeIdentifier(int next, int start, bool allowDollar) {
  while (($a <= next && next <= $z) ||
         ($A <= next && next <= $Z) ||
         ($0 <= next && next <= $9) ||
         identical(next, $_) ||
         (identical(next, $$) && allowDollar)) {
    next = advance();
  }
  // Identifier ends here.
  if (start == scanOffset) {
    return unexpected(next);
  }
  appendSubstringToken(IDENTIFIER_INFO, start, true);
  return next;
}
| 843 | |
/// Appends an [AT_INFO] token for `@` and returns the character after it.
int tokenizeAt(int next) {
  appendPrecedenceToken(AT_INFO);
  final int following = advance();
  return following;
}
| 848 | |
/// Scans a string literal whose opening quote is [next]; [start] is the scan
/// offset at which the string token begins.
///
/// Three quotes in a row start a multi-line string, two quotes form the
/// empty string, and anything else is a single-line string. Single-line
/// strings are scanned in raw mode when [raw] is true.
int tokenizeString(int next, int start, bool raw) {
  final int quote = next;
  next = advance();
  if (!identical(quote, next)) {
    return raw
        ? tokenizeSingleLineRawString(next, quote, start)
        : tokenizeSingleLineString(next, quote, start);
  }
  next = advance();
  if (identical(quote, next)) {
    // Multiline string.
    return tokenizeMultiLineString(quote, start, raw);
  }
  // Empty string.
  appendSubstringToken(STRING_INFO, start, true);
  return next;
}
| 869 | |
/**
 * Scans a non-raw single-line string.
 *
 * [next] is the first character after the quote.
 * [start] is the scanOffset of the quote.
 *
 * The token contains a substring of the source file, including the
 * string quotes, backslashes for escaping. For interpolated strings,
 * the parts before and after are separate tokens.
 *
 *   "a $b c"
 *
 * gives StringToken("a $), StringToken(b) and StringToken( c").
 *
 * A line break or `$EOF` before the closing quote is reported through
 * [unterminatedString].
 */
int tokenizeSingleLineString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  while (!identical(next, quoteChar)) {
    if (identical(next, $$)) {
      // Close the string part scanned so far and scan the interpolation.
      if (!asciiOnly) handleUnicode(start);
      next = tokenizeStringInterpolation(start, asciiOnly);
      start = scanOffset;
      asciiOnly = true;
      continue;
    }
    if (identical(next, $BACKSLASH)) {
      // Skip the backslash; the escaped character is examined below.
      next = advance();
    }
    // `next <= $CR` is a cheap pre-check before the three comparisons.
    if (next <= $CR &&
        (identical(next, $LF) ||
         identical(next, $CR) ||
         identical(next, $EOF))) {
      if (!asciiOnly) handleUnicode(start);
      return unterminatedString(quoteChar);
    }
    if (next > 127) asciiOnly = false;
    next = advance();
  }
  if (!asciiOnly) handleUnicode(start);
  // Advance past the quote character.
  next = advance();
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  return next;
}
| 910 | |
/// Handles a `$` inside a string: appends the string part scanned so far
/// (from [start]) and then scans either a `${...}` expression or a `$name`
/// identifier.
int tokenizeStringInterpolation(int start, bool asciiOnly) {
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  beginToken(); // $ starts here.
  final int next = advance();
  return identical(next, $OPEN_CURLY_BRACKET)
      ? tokenizeInterpolatedExpression(next)
      : tokenizeInterpolatedIdentifier(next);
}
| 921 | |
/// Scans the expression of a `${...}` interpolation; [next] is the `{`.
///
/// Tokens are produced through [bigSwitch] until `$EOF` or `$STX` is seen.
/// `$EOF` is returned as is; after `$STX` the following character is
/// returned.
int tokenizeInterpolatedExpression(int next) {
  appendBeginGroup(STRING_INTERPOLATION_INFO);
  beginToken(); // The expression starts here.
  next = advance(); // Move past the curly bracket.
  while (!(identical(next, $EOF) || identical(next, $STX))) {
    next = bigSwitch(next);
  }
  if (!identical(next, $EOF)) {
    next = advance(); // Move past the $STX.
    beginToken(); // The string interpolation suffix starts here.
  }
  return next;
}
| 934 | |
/// Scans the identifier of a `$name` interpolation; [next] is the character
/// after the `$`.
///
/// `$` itself is not accepted inside the identifier. If [next] cannot start
/// an identifier, [unterminated] is called and [next] is returned unchanged.
int tokenizeInterpolatedIdentifier(int next) {
  appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

  final bool startsLower = $a <= next && next <= $z;
  final bool startsUpperOrUnderscore =
      ($A <= next && next <= $Z) || identical(next, $_);
  if (!startsLower && !startsUpperOrUnderscore) {
    unterminated(r'$', shouldAdvance: false);
  } else {
    beginToken(); // The identifier starts here.
    next = startsLower
        ? tokenizeKeywordOrIdentifier(next, false)
        : tokenizeIdentifier(next, scanOffset, false);
  }
  beginToken(); // The string interpolation suffix starts here.
  return next;
}
| 950 | |
/// Scans a raw single-line string; [next] is the first character after the
/// opening quote and [start] the scan offset where the token begins.
///
/// Appends a [STRING_INFO] token when the closing [quoteChar] is found. A
/// line break or `$EOF` before it is reported through
/// [unterminatedRawString].
int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  // Consume ordinary content up to the first character that ends the scan.
  while (next != $EOF &&
         !identical(next, quoteChar) &&
         !identical(next, $LF) &&
         !identical(next, $CR)) {
    if (next > 127) asciiOnly = false;
    next = advance();
  }
  if (!asciiOnly) handleUnicode(start);
  if (next != $EOF && identical(next, quoteChar)) {
    next = advance();
    appendSubstringToken(STRING_INFO, start, asciiOnly);
    return next;
  }
  return unterminatedRawString(quoteChar);
}
| 970 | |
/**
 * Scans the remainder of a raw multi-line string whose opening quotes have
 * already been seen (the first [advance] moves past the last of the three).
 *
 * The string ends at the first run of three consecutive [quoteChar]
 * characters; a [STRING_INFO] substring token starting at [start] is then
 * appended and the character after the closing quotes is returned. If
 * [$EOF] is reached first, the result of [unterminatedRawMultiLineString]
 * is returned.
 *
 * Two ASCII flags are tracked: one for the whole string (passed to
 * [appendSubstringToken]) and one for the stretch since the last
 * [handleUnicode] call, which decides whether [handleUnicode] must be
 * called again at a line feed or at the end of the scan.
 */
int tokenizeMultiLineRawString(int quoteChar, int start) {
  bool stringIsAscii = true;
  bool stretchIsAscii = true;
  int stretchStart = start;
  // Advance past the (last) quote (of three).
  int next = advance();
  scan:
  while (!identical(next, $EOF)) {
    // Consume everything up to the next quote character.
    while (!identical(next, quoteChar)) {
      if (identical(next, $LF)) {
        if (!stretchIsAscii) {
          // Synchronize the string offset in the utf8 scanner.
          handleUnicode(stretchStart);
          stretchIsAscii = true;
          stretchStart = scanOffset;
        }
        lineFeedInMultiline();
      } else if (next > 127) {
        stretchIsAscii = false;
        stringIsAscii = false;
      }
      next = advance();
      if (identical(next, $EOF)) break scan;
    }

    // One quote has been seen; two more in a row close the string.
    next = advance();
    if (!identical(next, quoteChar)) continue;
    next = advance();
    if (!identical(next, quoteChar)) continue;

    if (!stretchIsAscii) handleUnicode(stretchStart);
    next = advance();
    appendSubstringToken(STRING_INFO, start, stringIsAscii);
    return next;
  }
  if (!stretchIsAscii) handleUnicode(stretchStart);
  return unterminatedRawMultiLineString(quoteChar);
}
| 1007 | |
/**
 * Scans the remainder of a multi-line string whose opening quotes have
 * already been seen (the first [advance] moves past the last of the three).
 *
 * When [raw] is true the work is delegated to [tokenizeMultiLineRawString].
 * Otherwise the loop handles, in this order of precedence:
 *
 * * `$`: the text so far is emitted and the interpolation is scanned by
 *   [tokenizeStringInterpolation]; a new string segment starts at the
 *   resulting [scanOffset] with both ASCII flags reset.
 * * [quoteChar]: three in a row end the string with a [STRING_INFO]
 *   substring token and return the following character.
 * * backslash: the following character is fetched and then treated like an
 *   ordinary character, so it is consumed without being interpreted as `$`
 *   or a quote.
 * * line feed and characters above 127, which drive the [handleUnicode]
 *   bookkeeping.
 *
 * Reaching [$EOF] returns the result of [unterminatedMultiLineString].
 */
int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
  if (raw) return tokenizeMultiLineRawString(quoteChar, start);

  int segmentStart = start;
  int stretchStart = start;
  bool segmentIsAscii = true;
  bool stretchIsAscii = true;
  // Advance past the (last) quote (of three).
  int next = advance();

  while (!identical(next, $EOF)) {
    if (identical(next, $$)) {
      if (!stretchIsAscii) handleUnicode(stretchStart);
      next = tokenizeStringInterpolation(segmentStart, segmentIsAscii);
      // A new string token is created for the rest.
      segmentStart = scanOffset;
      stretchStart = segmentStart;
      segmentIsAscii = true;
      stretchIsAscii = true;
    } else if (identical(next, quoteChar)) {
      // One quote has been seen; two more in a row close the string.
      next = advance();
      if (!identical(next, quoteChar)) continue;
      next = advance();
      if (!identical(next, quoteChar)) continue;

      if (!stretchIsAscii) handleUnicode(stretchStart);
      next = advance();
      appendSubstringToken(STRING_INFO, segmentStart, segmentIsAscii);
      return next;
    } else {
      if (identical(next, $BACKSLASH)) {
        next = advance();
        if (identical(next, $EOF)) break;
      }
      if (identical(next, $LF)) {
        if (!stretchIsAscii) {
          // Synchronize the string offset in the utf8 scanner.
          handleUnicode(stretchStart);
          stretchIsAscii = true;
          stretchStart = scanOffset;
        }
        lineFeedInMultiline();
      } else if (next > 127) {
        segmentIsAscii = false;
        stretchIsAscii = false;
      }
      next = advance();
    }
  }

  if (!stretchIsAscii) handleUnicode(stretchStart);
  return unterminatedMultiLineString(quoteChar);
}
| 1058 | |
/**
 * Appends a [BadInputToken] for [character] at [tokenStart] and returns the
 * result of [advanceAfterError] with advancing requested.
 */
int unexpected(int character) {
  var badInput = new BadInputToken(character, tokenStart);
  appendErrorToken(badInput);
  return advanceAfterError(true);
}
| 1063 | |
/**
 * Appends an [UnterminatedToken] built from [prefix], [tokenStart] and
 * [stringOffset], then returns the result of [advanceAfterError].
 *
 * [shouldAdvance] is forwarded to [advanceAfterError] and defaults to true.
 */
int unterminated(String prefix, {bool shouldAdvance: true}) {
  var errorToken = new UnterminatedToken(prefix, tokenStart, stringOffset);
  appendErrorToken(errorToken);
  return advanceAfterError(shouldAdvance);
}
| 1068 | |
/**
 * Reports an unterminated string; the prefix passed to [unterminated] is the
 * single character [quoteChar].
 */
int unterminatedString(int quoteChar) {
  final String opening = new String.fromCharCodes([quoteChar]);
  return unterminated(opening);
}
| 1072 | |
/**
 * Reports an unterminated raw string; the prefix passed to [unterminated] is
 * `r` followed by the single character [quoteChar].
 */
int unterminatedRawString(int quoteChar) {
  final String quote = new String.fromCharCodes([quoteChar]);
  return unterminated('r' + quote);
}
| 1076 | |
/**
 * Reports an unterminated multi-line string; the prefix passed to
 * [unterminated] is [quoteChar] repeated three times.
 */
int unterminatedMultiLineString(int quoteChar) {
  final List<int> tripleQuote = [quoteChar, quoteChar, quoteChar];
  return unterminated(new String.fromCharCodes(tripleQuote));
}
| 1081 | |
/**
 * Reports an unterminated raw multi-line string; the prefix passed to
 * [unterminated] is `r` followed by [quoteChar] repeated three times.
 */
int unterminatedRawMultiLineString(int quoteChar) {
  final String tripleQuote =
      new String.fromCharCodes([quoteChar, quoteChar, quoteChar]);
  return unterminated('r' + tripleQuote);
}
| 1086 | |
/**
 * Picks the character to continue with after an error token was appended.
 *
 * Returns [$EOF] when [atEndOfFile] reports true. Otherwise returns the
 * result of [advance] if [shouldAdvance] is set (which ensures progress),
 * or -1 if it is not.
 */
int advanceAfterError(bool shouldAdvance) {
  if (atEndOfFile()) return $EOF;
  return shouldAdvance ? advance() : -1;
}
| 1095 | |
/**
 * Records that [begin] has no matching end-group token.
 *
 * A synthetic closing token is created at the offset of [begin] and stored
 * as its `endGroup`; an [UnmatchedToken] is linked after the synthetic token
 * and appended to the stream as an error token.
 */
void unmatchedBeginGroup(BeginGroupToken begin) {
  // Unmatched BeginGroupTokens must be reported as errors. The diet parser,
  // however, assumes that groups are well-balanced and never looks at the
  // endGroup token, which is what lets it skip quickly over correct code.
  // Inserting one extra synthetic token into the stream lets it keep
  // ignoring endGroup tokens.
  //
  //   [begin] --next--> [tail]
  //   [begin] --endG--> [synthetic] --next--> [next] --next--> [tail]
  //
  // The diet parser can jump from [begin] via endGroup to [synthetic] and
  // ignore the [synthetic] token (assuming it's correct); the error is then
  // reported when the [next] token is parsed.
  //
  // For example, tokenize("{[1};") produces:
  //
  //   SymbolToken({) --endGroup-----+
  //      |                          |
  //     next                        |
  //      v                          |
  //   SymbolToken([) --endGroup--+  |
  //      |                       |  |
  //     next                     |  |
  //      v                       |  |
  //   StringToken(1)             |  |
  //      |                       v  |
  //     next          SymbolToken(]) | <- Synthetic token.
  //      |                       |  |
  //      |                     next |
  //      v                       |  |
  //   UnmatchedToken([)<---------+  |
  //      |                          |
  //     next                        |
  //      v                          |
  //   SymbolToken(})<---------------+
  //      |
  //     next
  //      v
  //   SymbolToken(;)
  //      |
  //     next
  //      v
  //     EOF
  final PrecedenceInfo closerInfo = closeBraceInfoFor(begin);
  final Token closer = new SymbolToken(closerInfo, begin.charOffset);
  final UnmatchedToken unmatched = new UnmatchedToken(begin);
  begin.endGroup = closer;
  closer.next = unmatched;
  appendErrorToken(unmatched);
}
| 1146 } | |
| 1147 | |
/**
 * Looks up the closing [PrecedenceInfo] that pairs with the opening token
 * [begin], keyed by `begin.value`.
 *
 * Both `{` and `${` map to [CLOSE_CURLY_BRACKET_INFO]. A value that is not
 * in the table yields whatever the map lookup returns for a missing key.
 */
PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
  final Map closerByOpener = const {
    '(': CLOSE_PAREN_INFO,
    '[': CLOSE_SQUARE_BRACKET_INFO,
    '{': CLOSE_CURLY_BRACKET_INFO,
    '<': GT_INFO,
    r'${': CLOSE_CURLY_BRACKET_INFO,
  };
  return closerByOpener[begin.value];
}
| OLD | NEW |