sdk/lib/_internal/compiler/implementation/scanner/scanner.dart - Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/scanner.dart

Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « sdk/lib/_internal/compiler/implementation/scanner/partial_parser.dart ('k') | sdk/lib/_internal/compiler/implementation/scanner/scanner_task.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of scanner;

6

/// Public entry point for turning a [SourceFile] into a stream of tokens.
abstract class Scanner {
  /// Creates the scanner implementation that matches the representation of
  /// [file]: a [Utf8BytesScanner] for a [Utf8BytesSourceFile], and a
  /// [StringScanner] for anything else.
  ///
  /// [includeComments] is forwarded unchanged to the chosen implementation.
  factory Scanner(SourceFile file, {bool includeComments: false}) {
    if (file is! Utf8BytesSourceFile) {
      return new StringScanner(file, includeComments: includeComments);
    }
    return new Utf8BytesScanner(file, includeComments: includeComments);
  }

  /// Scans the input and returns the resulting [Token].
  Token tokenize();
}

18

19 abstract class AbstractScanner implements Scanner {

20 // TODO(ahe): Move this class to implementation.

21

22 final bool includeComments;

23

24 /**

25 * The string offset for the next token that will be created.

26 *

27 * Note that in the [Utf8BytesScanner], [stringOffset] and [scanOffset] values

28 * are different. One string character can be encoded using multiple UTF-8

29 * bytes.

30 */

31 int tokenStart = -1;

32

33 /**

34 * A pointer to the token stream created by this scanner. The first token

35 * is a special token and not part of the source file. This is an

36 * implementation detail to avoid special cases in the scanner. This token

37 * is not exposed to clients of the scanner, which are expected to invoke

38 * [firstToken] to access the token stream.

39 */

40 final Token tokens = new SymbolToken(EOF_INFO, -1);

41

42 /**

43 * A pointer to the last scanned token.

44 */

45 Token tail;

46

47 /**

48 * The source file that is being scanned. This field can be [:null:].

49 * If the source file is available, the scanner assigns its [:lineStarts:] and

50 * [:length:] fields at the end of [tokenize].

51 */

52 final SourceFile file;

53

54 final List<int> lineStarts = <int>[0];

55

56 AbstractScanner(this.file, this.includeComments) {

57 this.tail = this.tokens;

58 }

59

60 /**

61 * Advances and returns the next character.

62 *

63 * If the next character is non-ASCII, then the returned value depends on the

64 * scanner implementation. The [Utf8BytesScanner] returns a UTF-8 byte, while

65 * the [StringScanner] returns a UTF-16 code unit.

66 *

67 * The scanner ensures that [advance] is not invoked after it returned [$EOF].

68 * This allows implementations to omit bound checks if the data structure ends

69 * with '0'.

70 */

71 int advance();

72

73 /**

74 * Returns the current unicode character.

75 *

76 * If the current character is ASCII, then it is returned unchanged.

77 *

78 * The [Utf8BytesScanner] decodes the next unicode code point starting at the

79 * current position. Note that every unicode character is returned as a single

80 * code point, that is, for '\u{1d11e}' it returns 119070, and the following

81 * [advance] returns the next character.

82 *

83 * The [StringScanner] returns the current character unchanged, which might

84 * be a surrogate character. In the case of '\u{1d11e}', it returns the first

85 * code unit 55348, and the following [advance] returns the second code unit

86 * 56606.

87 *

88 * Invoking [currentAsUnicode] multiple times is safe, i.e.,

89 * [:currentAsUnicode(next) == currentAsUnicode(currentAsUnicode(next)):].

90 */

91 int currentAsUnicode(int next);

92

93 /**

94 * Returns the character at the next position. Like in [advance], the

95 * [Utf8BytesScanner] returns a UTF-8 byte, while the [StringScanner] returns

96 * a UTF-16 code unit.

97 */

98 int peek();

99

100 /**

101 * Notifies the scanner that unicode characters were detected in either a

102 * comment or a string literal between [startScanOffset] and the current

103 * scan offset.

104 */

105 void handleUnicode(int startScanOffset);

106

107 /**

108 * Returns the current scan offset.

109 *

110 * In the [Utf8BytesScanner] this is the offset into the byte list, in the

111 * [StringScanner] the offset in the source string.

112 */

113 int get scanOffset;

114

115 /**

116 * Returns the current string offset.

117 *

118 * In the [StringScanner] this is identical to the [scanOffset]. In the

119 * [Utf8BytesScanner] it is computed based on encountered UTF-8 characters.

120 */

121 int get stringOffset;

122

123 /**

124 * Returns the first token scanned by this [Scanner].

125 */

126 Token firstToken();

127

128 /**

129 * Returns the last token scanned by this [Scanner].

130 */

131 Token previousToken();

132

/// Marks the current [stringOffset] as the start of the token that is about
/// to be scanned by storing it in [tokenStart].
void beginToken() {
  this.tokenStart = this.stringOffset;
}

139

140 /**

141 * Appends a substring from the scan offset [:start:] to the current

142 * [:scanOffset:] plus the [:extraOffset:]. For example, if the current

143 * scanOffset is 10, then [:appendSubstringToken(5, -1):] will append the

144 * substring [5,9).

145 *

146 * Note that [extraOffset] can only be used if the covered character(s) are

147 * known to be ASCII.

148 */

149 void appendSubstringToken(PrecedenceInfo info, int start,

150 bool asciiOnly, [int extraOffset]);

151

152 /** Documentation in subclass [ArrayBasedScanner]. */

153 void appendPrecedenceToken(PrecedenceInfo info);

154

155 /** Documentation in subclass [ArrayBasedScanner]. */

156 int select(int choice, PrecedenceInfo yes, PrecedenceInfo no);

157

158 /** Documentation in subclass [ArrayBasedScanner]. */

159 void appendKeywordToken(Keyword keyword);

160

161 /** Documentation in subclass [ArrayBasedScanner]. */

162 void appendEofToken();

163

164 /** Documentation in subclass [ArrayBasedScanner]. */

165 void appendWhiteSpace(int next);

166

167 /** Documentation in subclass [ArrayBasedScanner]. */

168 void lineFeedInMultiline();

169

170 /** Documentation in subclass [ArrayBasedScanner]. */

171 void appendBeginGroup(PrecedenceInfo info);

172

173 /** Documentation in subclass [ArrayBasedScanner]. */

174 int appendEndGroup(PrecedenceInfo info, int openKind);

175

176 /** Documentation in subclass [ArrayBasedScanner]. */

177 void appendGt(PrecedenceInfo info);

178

179 /** Documentation in subclass [ArrayBasedScanner]. */

180 void appendGtGt(PrecedenceInfo info);

181

182 /** Documentation in subclass [ArrayBasedScanner]. */

183 void appendComment(start, bool asciiOnly);

184

185 /// Append [token] to the token stream.

186 void appendErrorToken(ErrorToken token);

187

188 /** Documentation in subclass [ArrayBasedScanner]. */

189 void discardOpenLt();

190

191 /// Return true when at EOF.

192 bool atEndOfFile();

193

/// Scans the whole input and returns [firstToken].
///
/// Characters are pushed through [bigSwitch] until it yields [$EOF]. If the
/// scanner is then at the end of the input, an EOF token is appended;
/// otherwise the [$EOF] value occurred inside the input, is reported through
/// [unexpected], and scanning resumes.
///
/// When [file] is not null, its [:length:] and [:lineStarts:] are assigned
/// before returning.
Token tokenize() {
  while (!atEndOfFile()) {
    for (int next = advance();
        !identical(next, $EOF);
        next = bigSwitch(next)) {}

    if (!atEndOfFile()) {
      // NOTE(review): the character returned by [unexpected] is discarded and
      // the outer loop calls [advance] again - confirm this is intended.
      unexpected($EOF);
    } else {
      appendEofToken();
    }
  }

  if (file != null) {
    file.length = stringOffset;
    // [SourceFile.lineStarts] expects one extra line start past the end.
    lineStarts.add(stringOffset + 1);
    file.lineStarts = lineStarts;
  }

  return firstToken();
}

216

/// Scans one token (or a run of whitespace) that starts with [next].
///
/// Calls [beginToken], then dispatches on [next] to the matching
/// `tokenize*` / `append*` routine. Returns the first character that was not
/// consumed, or [$EOF] when [next] is [$EOF]. Characters that start no token
/// are reported through [unexpected].
int bigSwitch(int next) {
  beginToken();

  // Whitespace.
  if (identical(next, $SPACE) ||
      identical(next, $TAB) ||
      identical(next, $LF) ||
      identical(next, $CR)) {
    appendWhiteSpace(next);
    // Runs of spaces are common, so skip them quickly. For efficiency
    // [appendWhiteSpace] is not invoked for them, on the assumption that it
    // does nothing for space characters.
    do {
      next = advance();
    } while (identical(next, $SPACE));
    return next;
  }

  // Identifiers, keywords and raw strings.
  if ($a <= next && next <= $z) {
    return identical($r, next)
        ? tokenizeRawStringKeywordOrIdentifier(next)
        : tokenizeKeywordOrIdentifier(next, true);
  }
  if (($A <= next && next <= $Z) ||
      identical(next, $_) ||
      identical(next, $$)) {
    return tokenizeIdentifier(next, scanOffset, true);
  }

  // Operators that may span several characters.
  if (identical(next, $LT)) return tokenizeLessThan(next);
  if (identical(next, $GT)) return tokenizeGreaterThan(next);
  if (identical(next, $EQ)) return tokenizeEquals(next);
  if (identical(next, $BANG)) return tokenizeExclamation(next);
  if (identical(next, $PLUS)) return tokenizePlus(next);
  if (identical(next, $MINUS)) return tokenizeMinus(next);
  if (identical(next, $STAR)) return tokenizeMultiply(next);
  if (identical(next, $PERCENT)) return tokenizePercent(next);
  if (identical(next, $AMPERSAND)) return tokenizeAmpersand(next);
  if (identical(next, $BAR)) return tokenizeBar(next);
  if (identical(next, $CARET)) return tokenizeCaret(next);
  if (identical(next, $OPEN_SQUARE_BRACKET)) {
    return tokenizeOpenSquareBracket(next);
  }
  if (identical(next, $TILDE)) return tokenizeTilde(next);

  if (identical(next, $BACKSLASH)) {
    appendPrecedenceToken(BACKSLASH_INFO);
    return advance();
  }

  if (identical(next, $HASH)) return tokenizeTag(next);

  // Grouping and punctuation.
  if (identical(next, $OPEN_PAREN)) {
    appendBeginGroup(OPEN_PAREN_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_PAREN)) {
    return appendEndGroup(CLOSE_PAREN_INFO, OPEN_PAREN_TOKEN);
  }
  if (identical(next, $COMMA)) {
    appendPrecedenceToken(COMMA_INFO);
    return advance();
  }
  if (identical(next, $COLON)) {
    appendPrecedenceToken(COLON_INFO);
    return advance();
  }
  if (identical(next, $SEMICOLON)) {
    appendPrecedenceToken(SEMICOLON_INFO);
    // Type parameters and arguments cannot contain a semicolon.
    discardOpenLt();
    return advance();
  }
  if (identical(next, $QUESTION)) {
    appendPrecedenceToken(QUESTION_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_SQUARE_BRACKET)) {
    return appendEndGroup(
        CLOSE_SQUARE_BRACKET_INFO, OPEN_SQUARE_BRACKET_TOKEN);
  }
  if (identical(next, $BACKPING)) {
    appendPrecedenceToken(BACKPING_INFO);
    return advance();
  }
  if (identical(next, $OPEN_CURLY_BRACKET)) {
    appendBeginGroup(OPEN_CURLY_BRACKET_INFO);
    return advance();
  }
  if (identical(next, $CLOSE_CURLY_BRACKET)) {
    return appendEndGroup(CLOSE_CURLY_BRACKET_INFO, OPEN_CURLY_BRACKET_TOKEN);
  }

  if (identical(next, $SLASH)) return tokenizeSlashOrComment(next);
  if (identical(next, $AT)) return tokenizeAt(next);

  // String literals.
  if (identical(next, $DQ) || identical(next, $SQ)) {
    return tokenizeString(next, scanOffset, false);
  }

  // Dots and numeric literals.
  if (identical(next, $PERIOD)) return tokenizeDotsOrNumber(next);
  if (identical(next, $0)) return tokenizeHexOrNumber(next);
  if ($1 <= next && next <= $9) return tokenizeNumber(next);

  if (identical(next, $EOF)) return $EOF;
  if (next < 0x1f) return unexpected(next);

  next = currentAsUnicode(next);

  // Only non-ASCII characters are handled below.
  if (identical(next, $NBSP)) {
    appendWhiteSpace(next);
    return advance();
  }

  return unexpected(next);
}

402

/// Scans `#`, or skips a `#!...` line when it starts at scan offset 0.
///
/// For the `#!` form no token is appended: characters are consumed up to the
/// first [$LF], [$CR] or [$EOF], which is returned. Otherwise a `#` token is
/// appended and the following character is returned.
int tokenizeTag(int next) {
  // # or #!.*[\n\r]
  bool isHashBangLine = scanOffset == 0 && identical(peek(), $BANG);
  if (!isHashBangLine) {
    appendPrecedenceToken(HASH_INFO);
    return advance();
  }

  int start = scanOffset + 1;
  bool sawNonAscii = false;
  while (true) {
    next = advance();
    if (next > 127) sawNonAscii = true;
    if (identical(next, $LF) ||
        identical(next, $CR) ||
        identical(next, $EOF)) {
      break;
    }
  }
  if (sawNonAscii) handleUnicode(start);
  return next;
}

422

/// Scans `~`, `~/` or `~/=`; [next] is the `~` that was just read.
int tokenizeTilde(int next) {
  int following = advance();
  if (!identical(following, $SLASH)) {
    appendPrecedenceToken(TILDE_INFO);
    return following;
  }
  return select($EQ, TILDE_SLASH_EQ_INFO, TILDE_SLASH_INFO);
}

433

/// Scans `[`, `[]` or `[]=`.
///
/// `[]` and `[]=` are only produced as single tokens when the previously
/// scanned token is the `operator` keyword or a `#` symbol; in every other
/// case a `[` begin-group token is appended and the character after the `[`
/// is returned.
int tokenizeOpenSquareBracket(int next) {
  next = advance();
  if (!identical(next, $CLOSE_SQUARE_BRACKET)) {
    appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
    return next;
  }

  Token previous = previousToken();
  bool isIndexOperatorName =
      (previous is KeywordToken && previous.keyword.syntax == 'operator') ||
      (previous is SymbolToken && previous.info == HASH_INFO);
  if (isIndexOperatorName) {
    return select($EQ, INDEX_EQ_INFO, INDEX_INFO);
  }

  appendBeginGroup(OPEN_SQUARE_BRACKET_INFO);
  return next;
}

447

/// Scans `^` or `^=` by delegating to [select].
int tokenizeCaret(int next) => select($EQ, CARET_EQ_INFO, CARET_INFO);

452

/// Scans `|`, `||` or `|=`; [next] is the `|` that was just read.
///
/// Returns the first character after the appended token.
int tokenizeBar(int next) {
  int following = advance();
  if (identical(following, $BAR)) {
    appendPrecedenceToken(BAR_BAR_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(BAR_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(BAR_INFO);
  return following;
}

467

/// Scans `&`, `&&` or `&=`; [next] is the `&` that was just read.
///
/// Returns the first character after the appended token.
int tokenizeAmpersand(int next) {
  int following = advance();
  if (identical(following, $AMPERSAND)) {
    appendPrecedenceToken(AMPERSAND_AMPERSAND_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(AMPERSAND_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(AMPERSAND_INFO);
  return following;
}

482

/// Scans `%` or `%=` by delegating to [select].
int tokenizePercent(int next) => select($EQ, PERCENT_EQ_INFO, PERCENT_INFO);

487

/// Scans `*` or `*=` by delegating to [select].
int tokenizeMultiply(int next) => select($EQ, STAR_EQ_INFO, STAR_INFO);

492

/// Scans `-`, `--` or `-=`; [next] is the `-` that was just read.
///
/// Returns the first character after the appended token.
int tokenizeMinus(int next) {
  int following = advance();
  if (identical(following, $MINUS)) {
    appendPrecedenceToken(MINUS_MINUS_INFO);
    return advance();
  }
  if (identical(following, $EQ)) {
    appendPrecedenceToken(MINUS_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(MINUS_INFO);
  return following;
}

507

/// Scans `+`, `++` or `+=`; [next] is the `+` that was just read.
///
/// Returns the first character after the appended token.
int tokenizePlus(int next) {
  int following = advance();
  if (identical($PLUS, following)) {
    appendPrecedenceToken(PLUS_PLUS_INFO);
    return advance();
  }
  if (identical($EQ, following)) {
    appendPrecedenceToken(PLUS_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(PLUS_INFO);
  return following;
}

522

/// Scans `!` or `!=`.
///
/// `!==` is also recognised; it is kept for user-friendly error reporting.
int tokenizeExclamation(int next) {
  int following = advance();
  if (!identical(following, $EQ)) {
    appendPrecedenceToken(BANG_INFO);
    return following;
  }
  return select($EQ, BANG_EQ_EQ_INFO, BANG_EQ_INFO);
}

534

/// Scans `=`, `==` or `=>`.
///
/// `===` is also recognised; it is kept for user-friendly error reporting.
int tokenizeEquals(int next) {
  // No token starting with '=' can appear inside type parameters or type
  // arguments, so any pending '<' cannot be a group opener.
  discardOpenLt();

  int following = advance();
  if (identical(following, $EQ)) {
    return select($EQ, EQ_EQ_EQ_INFO, EQ_EQ_INFO);
  }
  if (identical(following, $GT)) {
    appendPrecedenceToken(FUNCTION_INFO);
    return advance();
  }
  appendPrecedenceToken(EQ_INFO);
  return following;
}

553

/// Scans `>`, `>=`, `>>` or `>>=`.
///
/// Plain `>` and `>>` go through [appendGt] and [appendGtGt] rather than
/// [appendPrecedenceToken]. Returns the first character after the token.
int tokenizeGreaterThan(int next) {
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_EQ_INFO);
    return advance();
  }
  if (!identical($GT, next)) {
    appendGt(GT_INFO);
    return next;
  }

  // Seen ">>" so far.
  next = advance();
  if (identical($EQ, next)) {
    appendPrecedenceToken(GT_GT_EQ_INFO);
    return advance();
  }
  appendGtGt(GT_GT_INFO);
  return next;
}

574

/// Scans `<`, `<=`, `<<` or `<<=`.
///
/// A plain `<` is appended as a begin-group token via [appendBeginGroup].
int tokenizeLessThan(int next) {
  int following = advance();
  if (identical($EQ, following)) {
    appendPrecedenceToken(LT_EQ_INFO);
    return advance();
  }
  if (identical($LT, following)) {
    return select($EQ, LT_LT_EQ_INFO, LT_LT_INFO);
  }
  appendBeginGroup(LT_INFO);
  return following;
}

588

/// Scans a decimal literal whose first digit is [next].
///
/// Consumes the leading run of digits. If an `e`/`E` follows, or a `.` that
/// is itself followed by a digit, scanning continues in
/// [tokenizeFractionPart]. Otherwise an integer token is appended and the
/// first character after the digits is returned.
int tokenizeNumber(int next) {
  int start = scanOffset;
  do {
    next = advance();
  } while ($0 <= next && next <= $9);

  if (identical(next, $e) || identical(next, $E)) {
    return tokenizeFractionPart(next, start);
  }
  if (identical(next, $PERIOD)) {
    // Only treat the '.' as a decimal point when a digit follows it.
    int afterPeriod = peek();
    if ($0 <= afterPeriod && afterPeriod <= $9) {
      return tokenizeFractionPart(advance(), start);
    }
  }
  appendSubstringToken(INT_INFO, start, true);
  return next;
}

610

/// Scans a literal that starts with `0`: hexadecimal when the next character
/// is `x` or `X`, decimal otherwise.
int tokenizeHexOrNumber(int next) {
  int marker = peek();
  bool isHex = identical(marker, $x) || identical(marker, $X);
  return isHex ? tokenizeHex(next) : tokenizeNumber(next);
}

618

/// Scans a hexadecimal literal; the caller has already seen `0` and peeked
/// at the `x`/`X`.
///
/// Appends a hexadecimal token covering the text from the scan offset on
/// entry. If no hex digit follows the `x`, an unterminated `0x` error token
/// is appended instead. In both cases the first non-hex character is
/// returned.
int tokenizeHex(int next) {
  int start = scanOffset;
  advance(); // Consume the $x or $X.

  bool hasDigits = false;
  next = advance();
  while (($0 <= next && next <= $9) ||
      ($A <= next && next <= $F) ||
      ($a <= next && next <= $f)) {
    hasDigits = true;
    next = advance();
  }

  if (hasDigits) {
    appendSubstringToken(HEXADECIMAL_INFO, start, true);
  } else {
    unterminated('0x', shouldAdvance: false);
  }
  return next;
}

640

/// Scans a token starting with `.`: a number such as `.5` when a digit
/// follows, `..` or `...` when another `.` follows, or a single `.`.
int tokenizeDotsOrNumber(int next) {
  int start = scanOffset;
  int following = advance();
  if ($0 <= following && following <= $9) {
    return tokenizeFractionPart(following, start);
  }
  if (identical($PERIOD, following)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return following;
}

653

/// Scans the fraction and/or exponent of a number that began at scan offset
/// [start]; [next] is the first character of that part.
///
/// Consumes a run of digits, then optionally `e`/`E`, an optional sign, and
/// the exponent digits. An exponent without digits appends an unterminated
/// `1e` error token. Otherwise a double token is appended. Returns the first
/// character that is not part of the number.
int tokenizeFractionPart(int next, int start) {
  bool hasDigit = false;

  // Fraction digits.
  while ($0 <= next && next <= $9) {
    hasDigit = true;
    next = advance();
  }

  // Optional exponent.
  if (identical($e, next) || identical($E, next)) {
    hasDigit = true;
    next = advance();
    if (identical(next, $PLUS) || identical(next, $MINUS)) {
      next = advance();
    }
    if (!($0 <= next && next <= $9)) {
      unterminated('1e', shouldAdvance: false);
      return next;
    }
    do {
      next = advance();
    } while ($0 <= next && next <= $9);
  }

  if (hasDigit) {
    appendSubstringToken(DOUBLE_INFO, start, true);
    return next;
  }

  // Nothing followed the period, so this is an integer followed by dots.
  // Reduce the offset, because the scanner already moved past the period.
  appendSubstringToken(INT_INFO, start, true, -1);

  // TODO(ahe): Wrong offset for the period. Cannot call beginToken because
  // the scanner already advanced past the period.
  if (identical($PERIOD, next)) {
    return select($PERIOD, PERIOD_PERIOD_PERIOD_INFO, PERIOD_PERIOD_INFO);
  }
  appendPrecedenceToken(PERIOD_INFO);
  return next;
}

703

/// Scans a token starting with `/`: a multi-line comment, a single-line
/// comment, `/=` or `/`.
int tokenizeSlashOrComment(int next) {
  int start = scanOffset;
  int following = advance();
  if (identical($STAR, following)) {
    return tokenizeMultiLineComment(following, start);
  }
  if (identical($SLASH, following)) {
    return tokenizeSingleLineComment(following, start);
  }
  if (identical($EQ, following)) {
    appendPrecedenceToken(SLASH_EQ_INFO);
    return advance();
  }
  appendPrecedenceToken(SLASH_INFO);
  return following;
}

719

/// Scans a `//` comment that began at scan offset [start].
///
/// Consumes characters up to the first [$LF], [$CR] or [$EOF], reports any
/// non-ASCII content through [handleUnicode], hands the comment to
/// [appendComment], and returns the terminating character.
int tokenizeSingleLineComment(int next, int start) {
  bool asciiOnly = true;
  do {
    next = advance();
    if (next > 127) asciiOnly = false;
  } while (!identical($LF, next) &&
      !identical($CR, next) &&
      !identical($EOF, next));

  if (!asciiOnly) handleUnicode(start);
  appendComment(start, asciiOnly);
  return next;
}

735

736

/// Scans a `/* ... */` comment that began at scan offset [start], honouring
/// nested `/* */` pairs.
///
/// On the closing `*/` of the outermost level the comment is handed to
/// [appendComment] and the following character is returned. On [$EOF] an
/// unterminated `/*` error token is appended and [$EOF] is returned.
int tokenizeMultiLineComment(int next, int start) {
  bool commentIsAscii = true; // No non-ASCII anywhere in the comment.
  bool pendingIsAscii = true; // No non-ASCII since the last handleUnicode.
  int unicodeStart = start;
  int depth = 1;
  next = advance();
  while (true) {
    if (identical($EOF, next)) {
      if (!pendingIsAscii) handleUnicode(unicodeStart);
      unterminated('/*');
      return next;
    }

    if (identical($STAR, next)) {
      next = advance();
      if (!identical($SLASH, next)) continue;
      --depth;
      if (depth != 0) {
        next = advance();
        continue;
      }
      if (!pendingIsAscii) handleUnicode(unicodeStart);
      next = advance();
      appendComment(start, commentIsAscii);
      return next;
    }

    if (identical($SLASH, next)) {
      next = advance();
      if (identical($STAR, next)) {
        next = advance();
        ++depth;
      }
      continue;
    }

    if (identical(next, $LF)) {
      if (!pendingIsAscii) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(unicodeStart);
        pendingIsAscii = true;
        unicodeStart = scanOffset;
      }
      lineFeedInMultiline();
    } else if (next > 127) {
      pendingIsAscii = false;
      commentIsAscii = false;
    }
    next = advance();
  }
}

786

/// Scans a token starting with `r`: a raw string when a quote follows
/// immediately, otherwise a keyword or identifier.
int tokenizeRawStringKeywordOrIdentifier(int next) {
  // [next] is $r.
  int following = peek();
  bool quoteFollows = identical(following, $DQ) || identical(following, $SQ);
  if (!quoteFollows) {
    return tokenizeKeywordOrIdentifier(next, true);
  }
  int start = scanOffset;
  return tokenizeString(advance(), start, true);
}

797

/// Scans a keyword or an identifier that starts with the lower-case letter
/// [next].
///
/// Lower-case letters are fed to the [KeywordState] machine. If they do not
/// spell a keyword, or the keyword is followed by another identifier
/// character, scanning continues in [tokenizeIdentifier] with [allowDollar]
/// passed through. Otherwise a keyword token is appended and the following
/// character is returned.
int tokenizeKeywordOrIdentifier(int next, bool allowDollar) {
  KeywordState state = KeywordState.KEYWORD_STATE;
  int start = scanOffset;
  for (; state != null && $a <= next && next <= $z; next = advance()) {
    state = state.next(next);
  }

  bool spellsKeyword = state != null && state.keyword != null;
  bool identifierContinues = ($A <= next && next <= $Z) ||
      ($0 <= next && next <= $9) ||
      identical(next, $_) ||
      identical(next, $$);
  if (!spellsKeyword || identifierContinues) {
    return tokenizeIdentifier(next, start, allowDollar);
  }

  appendKeywordToken(state.keyword);
  return next;
}

818

/// Scans the rest of an identifier that began at scan offset [start].
///
/// [allowDollar] can exclude '$', which is not allowed as part of a string
/// interpolation identifier.
///
/// If no character was consumed since [start], [next] is reported through
/// [unexpected]. Otherwise an identifier token is appended and the first
/// character after the identifier is returned.
int tokenizeIdentifier(int next, int start, bool allowDollar) {
  while (($a <= next && next <= $z) ||
      ($A <= next && next <= $Z) ||
      ($0 <= next && next <= $9) ||
      identical(next, $_) ||
      (identical(next, $$) && allowDollar)) {
    next = advance();
  }

  // Identifier ends here.
  if (start == scanOffset) {
    return unexpected(next);
  }
  appendSubstringToken(IDENTIFIER_INFO, start, true);
  return next;
}

843

/// Appends an `@` token and returns the character after it.
int tokenizeAt(int next) {
  appendPrecedenceToken(AT_INFO);
  int following = advance();
  return following;
}

848

/// Scans a string literal whose opening quote is [next].
///
/// [start] is the scan offset where the literal began and [raw] tells
/// whether it is a raw string. Three quotes in a row start a multi-line
/// string, two form the empty string, and anything else is handed to the
/// single-line scanners.
int tokenizeString(int next, int start, bool raw) {
  int quoteChar = next;
  int following = advance();
  if (!identical(quoteChar, following)) {
    return raw
        ? tokenizeSingleLineRawString(following, quoteChar, start)
        : tokenizeSingleLineString(following, quoteChar, start);
  }

  following = advance();
  if (identical(quoteChar, following)) {
    // Multiline string.
    return tokenizeMultiLineString(quoteChar, start, raw);
  }

  // Empty string.
  appendSubstringToken(STRING_INFO, start, true);
  return following;
}

869

/// Scans the body of a single-line, non-raw string.
///
/// [next] is the first character after the quote.
/// [start] is the scanOffset of the quote.
///
/// The token contains a substring of the source file, including the
/// string quotes, backslashes for escaping. For interpolated strings,
/// the parts before and after are separate tokens.
///
///     "a $b c"
///
/// gives StringToken("a $), StringToken(b) and StringToken( c").
///
/// A line break or [$EOF] before the closing quote is reported through
/// [unterminatedString].
int tokenizeSingleLineString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  while (!identical(next, quoteChar)) {
    if (identical(next, $$)) {
      // Close the current string part and scan the interpolation; a new
      // string part begins wherever the interpolation ended.
      if (!asciiOnly) handleUnicode(start);
      next = tokenizeStringInterpolation(start, asciiOnly);
      start = scanOffset;
      asciiOnly = true;
      continue;
    }
    if (identical(next, $BACKSLASH)) {
      // Skip the backslash; the escaped character is inspected below.
      next = advance();
    }

    bool endsLine = next <= $CR &&
        (identical(next, $LF) ||
            identical(next, $CR) ||
            identical(next, $EOF));
    if (endsLine) {
      if (!asciiOnly) handleUnicode(start);
      return unterminatedString(quoteChar);
    }

    if (next > 127) asciiOnly = false;
    next = advance();
  }

  if (!asciiOnly) handleUnicode(start);
  // Advance past the quote character.
  next = advance();
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  return next;
}

910

/// Scans a `$identifier` or `${expression}` interpolation inside a string.
///
/// First appends the string part scanned so far (from scan offset [start],
/// with [asciiOnly] describing its content), then scans the interpolation
/// itself and returns the character that follows it.
int tokenizeStringInterpolation(int start, bool asciiOnly) {
  appendSubstringToken(STRING_INFO, start, asciiOnly);
  beginToken(); // $ starts here.
  int afterDollar = advance();
  return identical(afterDollar, $OPEN_CURLY_BRACKET)
      ? tokenizeInterpolatedExpression(afterDollar)
      : tokenizeInterpolatedIdentifier(afterDollar);
}

921

/// Scans the expression of a `${...}` interpolation; [next] is the `{`.
///
/// Tokens are scanned with [bigSwitch] until it yields [$STX] or [$EOF].
/// On [$EOF] that value is returned at once; otherwise the character after
/// the [$STX] is returned and the next token start is recorded.
int tokenizeInterpolatedExpression(int next) {
  appendBeginGroup(STRING_INTERPOLATION_INFO);
  beginToken(); // The expression starts here.
  next = advance(); // Move past the curly bracket.
  while (!(identical(next, $EOF) || identical(next, $STX))) {
    next = bigSwitch(next);
  }
  if (!identical(next, $EOF)) {
    next = advance(); // Move past the $STX.
    beginToken(); // The string interpolation suffix starts here.
  }
  return next;
}

934

/// Scans the identifier of a `$identifier` interpolation; [next] is the
/// character right after the `$`.
///
/// '$' itself is not accepted inside the identifier. If [next] cannot start
/// an identifier, an unterminated `$` error token is appended and [next] is
/// returned unchanged.
int tokenizeInterpolatedIdentifier(int next) {
  appendPrecedenceToken(STRING_INTERPOLATION_IDENTIFIER_INFO);

  bool startsLowerCase = $a <= next && next <= $z;
  bool startsUpperCaseOrUnderscore =
      ($A <= next && next <= $Z) || identical(next, $_);
  if (startsLowerCase || startsUpperCaseOrUnderscore) {
    beginToken(); // The identifier starts here.
    next = startsLowerCase
        ? tokenizeKeywordOrIdentifier(next, false)
        : tokenizeIdentifier(next, scanOffset, false);
  } else {
    unterminated(r'$', shouldAdvance: false);
  }

  beginToken(); // The string interpolation suffix starts here.
  return next;
}

950

/// Scans the body of a single-line raw string.
///
/// [next] is the first character after the opening quote and [start] is the
/// scan offset where the literal began. A line break or [$EOF] before the
/// closing quote is reported through [unterminatedRawString].
int tokenizeSingleLineRawString(int next, int quoteChar, int start) {
  bool asciiOnly = true;
  for (; next != $EOF; next = advance()) {
    if (identical(next, quoteChar)) {
      if (!asciiOnly) handleUnicode(start);
      next = advance();
      appendSubstringToken(STRING_INFO, start, asciiOnly);
      return next;
    }
    // A line break ends the literal the same way running out of input does.
    if (identical(next, $LF) || identical(next, $CR)) break;
    if (next > 127) asciiOnly = false;
  }
  if (!asciiOnly) handleUnicode(start);
  return unterminatedRawString(quoteChar);
}

970

/// Scans the body of a raw multi-line string; the scanner is on the last of
/// the three opening quotes and [start] is the scan offset where the literal
/// began.
///
/// Ends at the first run of three [quoteChar]s. Reaching [$EOF] first is
/// reported through [unterminatedRawMultiLineString].
int tokenizeMultiLineRawString(int quoteChar, int start) {
  bool stringIsAscii = true; // No non-ASCII anywhere in the string.
  bool pendingIsAscii = true; // No non-ASCII since the last handleUnicode.
  int unicodeStart = start;
  int next = advance(); // Advance past the (last) quote (of three).
  while (!identical(next, $EOF)) {
    if (!identical(next, quoteChar)) {
      if (identical(next, $LF)) {
        if (!pendingIsAscii) {
          // Synchronize the string offset in the utf8 scanner.
          handleUnicode(unicodeStart);
          pendingIsAscii = true;
          unicodeStart = scanOffset;
        }
        lineFeedInMultiline();
      } else if (next > 127) {
        pendingIsAscii = false;
        stringIsAscii = false;
      }
      next = advance();
      continue;
    }

    // One quote seen; the string ends only if two more follow directly.
    next = advance();
    if (!identical(next, quoteChar)) continue;
    next = advance();
    if (!identical(next, quoteChar)) continue;

    if (!pendingIsAscii) handleUnicode(unicodeStart);
    next = advance();
    appendSubstringToken(STRING_INFO, start, stringIsAscii);
    return next;
  }
  if (!pendingIsAscii) handleUnicode(unicodeStart);
  return unterminatedRawMultiLineString(quoteChar);
}

1007

/// Scans the body of a multi-line string; the scanner is on the last of the
/// three opening quotes and [start] is the scan offset where the literal
/// began.
///
/// Raw strings are delegated to [tokenizeMultiLineRawString]. Otherwise
/// interpolations split the literal into separate string tokens, a
/// backslash skips the character after it, and the string ends at three
/// consecutive [quoteChar]s. Reaching [$EOF] first is reported through
/// [unterminatedMultiLineString].
int tokenizeMultiLineString(int quoteChar, int start, bool raw) {
  if (raw) return tokenizeMultiLineRawString(quoteChar, start);

  bool partIsAscii = true; // No non-ASCII in the current string token.
  bool pendingIsAscii = true; // No non-ASCII since the last handleUnicode.
  int unicodeStart = start;
  int next = advance(); // Advance past the (last) quote (of three).
  while (!identical(next, $EOF)) {
    if (identical(next, $$)) {
      if (!pendingIsAscii) handleUnicode(unicodeStart);
      next = tokenizeStringInterpolation(start, partIsAscii);
      // A new string token is created for the rest of the literal.
      start = scanOffset;
      unicodeStart = start;
      partIsAscii = true;
      pendingIsAscii = true;
      continue;
    }

    if (identical(next, quoteChar)) {
      next = advance();
      if (!identical(next, quoteChar)) continue;
      next = advance();
      if (!identical(next, quoteChar)) continue;
      if (!pendingIsAscii) handleUnicode(unicodeStart);
      next = advance();
      appendSubstringToken(STRING_INFO, start, partIsAscii);
      return next;
    }

    if (identical(next, $BACKSLASH)) {
      next = advance();
      if (identical(next, $EOF)) break;
    }

    if (identical(next, $LF)) {
      if (!pendingIsAscii) {
        // Synchronize the string offset in the utf8 scanner.
        handleUnicode(unicodeStart);
        pendingIsAscii = true;
        unicodeStart = scanOffset;
      }
      lineFeedInMultiline();
    } else if (next > 127) {
      partIsAscii = false;
      pendingIsAscii = false;
    }
    next = advance();
  }
  if (!pendingIsAscii) handleUnicode(unicodeStart);
  return unterminatedMultiLineString(quoteChar);
}

1058

/// Appends a [BadInputToken] for [character] at [tokenStart], then advances
/// past the error via [advanceAfterError] and returns its result.
int unexpected(int character) {
  var badInput = new BadInputToken(character, tokenStart);
  appendErrorToken(badInput);
  return advanceAfterError(true);
}

1063

/// Appends an [UnterminatedToken] for the construct opened by [prefix],
/// spanning [tokenStart] to the current [stringOffset].
///
/// Returns the result of [advanceAfterError], which only consumes another
/// character when [shouldAdvance] is true.
int unterminated(String prefix, {bool shouldAdvance: true}) {
  var error = new UnterminatedToken(prefix, tokenStart, stringOffset);
  appendErrorToken(error);
  return advanceAfterError(shouldAdvance);
}

1068

/// Reports an unterminated single-line string opened with [quoteChar].
int unterminatedString(int quoteChar) {
  String quote = new String.fromCharCode(quoteChar);
  return unterminated(quote);
}

1072

/// Reports an unterminated single-line raw string opened with `r` followed
/// by [quoteChar].
int unterminatedRawString(int quoteChar) {
  String quote = new String.fromCharCode(quoteChar);
  return unterminated('r$quote');
}

1076

/// Reports an unterminated multi-line string opened with three [quoteChar]s.
int unterminatedMultiLineString(int quoteChar) {
  String quote = new String.fromCharCode(quoteChar);
  return unterminated('$quote$quote$quote');
}

1081

/// Reports an unterminated raw multi-line string opened with `r` followed by
/// three [quoteChar]s.
int unterminatedRawMultiLineString(int quoteChar) {
  String quote = new String.fromCharCode(quoteChar);
  return unterminated('r$quote$quote$quote');
}

1086

/// Chooses the character to continue with after an error token was appended.
///
/// Returns [$EOF] at the end of the input. Otherwise returns the next
/// character when [shouldAdvance] is true, which guarantees progress, and
/// -1 when it is false.
int advanceAfterError(bool shouldAdvance) {
  if (atEndOfFile()) return $EOF;
  return shouldAdvance ? advance() : -1;
}

1095

/// Records that the group opened by [begin] was never closed.
///
/// Points [begin]'s [:endGroup:] at a synthetic closing token whose
/// [:next:] is an [UnmatchedToken], and appends that error token to the
/// stream.
void unmatchedBeginGroup(BeginGroupToken begin) {
  // Unmatched BeginGroupTokens must be reported as errors. The diet parser,
  // however, assumes groups are well-balanced and never looks at the
  // endGroup token, which lets it skip quickly over correct code. Inserting
  // an additional synthetic token into the stream lets it keep ignoring
  // endGroup tokens.
  //
  //  [begin] --next--> [tail]
  //  [begin] --endG--> [synthetic] --next--> [next] --next--> [tail]
  //
  // The diet parser can skip from [begin] via endGroup to [synthetic] and
  // ignore the [synthetic] token (assuming it's correct); the error is then
  // reported when parsing the [next] token.
  //
  // For example, tokenize("{[1};") produces:
  //
  //    SymbolToken({) --endGroup-----+
  //         |                        |
  //        next                      |
  //         v                        |
  //    SymbolToken([) --endGroup--+  |
  //         |                     |  |
  //        next                   |  |
  //         v                     |  |
  //    StringToken(1)             |  |
  //         |                     v  |
  //        next      SymbolToken(])  | <- Synthetic token.
  //         |                     |  |
  //         |                   next |
  //         v                     |  |
  //    UnmatchedToken([)<---------+  |
  //         |                        |
  //        next                      |
  //         v                        |
  //    SymbolToken(})<---------------+
  //         |
  //        next
  //         v
  //    SymbolToken(;)
  //         |
  //        next
  //         v
  //        EOF
  PrecedenceInfo closingInfo = closeBraceInfoFor(begin);
  Token syntheticClose = new SymbolToken(closingInfo, begin.charOffset);
  UnmatchedToken unmatched = new UnmatchedToken(begin);
  begin.endGroup = syntheticClose;
  syntheticClose.next = unmatched;
  appendErrorToken(unmatched);
}

1146 }

1147

/// Returns the [PrecedenceInfo] of the token that closes the group opened by
/// [begin], looked up by [begin]'s [:value:].
///
/// Yields null when the value is not one of the known openers.
PrecedenceInfo closeBraceInfoFor(BeginGroupToken begin) {
  const closerByOpener = const <String, PrecedenceInfo>{
    '(': CLOSE_PAREN_INFO,
    '[': CLOSE_SQUARE_BRACKET_INFO,
    '{': CLOSE_CURLY_BRACKET_INFO,
    '<': GT_INFO,
    r'${': CLOSE_CURLY_BRACKET_INFO,
  };
  return closerByOpener[begin.value];
}

OLD	NEW