src/scanner-base.cc - Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.*

Side by Side Diff: src/scanner-base.cc

Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.* (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: rename scanner-base.* to scanner.* Created 9 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are

4 // met:

5 //

6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.

12 // * Neither the name of Google Inc. nor the names of its

13 // contributors may be used to endorse or promote products derived

14 // from this software without specific prior written permission.

15 //

16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27

28 // Features shared by parsing and pre-parsing scanners.

29

30 #include "../include/v8stdint.h"

31 #include "scanner-base.h"

32 #include "char-predicates-inl.h"

33

34 namespace v8 {

35 namespace internal {

36

37 // ----------------------------------------------------------------------------

38 // Scanner

39

40 Scanner::Scanner(UnicodeCache* unicode_cache)

41 : unicode_cache_(unicode_cache) { }

42

43

44 uc32 Scanner::ScanHexNumber(int expected_length) {

45 ASSERT(expected_length <= 4); // prevent overflow

46

47 uc32 digits[4] = { 0, 0, 0, 0 };

48 uc32 x = 0;

49 for (int i = 0; i < expected_length; i++) {

50 digits[i] = c0_;

51 int d = HexValue(c0_);

52 if (d < 0) {

53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

54 // should be illegal, but other JS VMs just return the

55 // non-escaped version of the original character.

56

57 // Push back digits that we have advanced past.

58 for (int j = i-1; j >= 0; j--) {

59 PushBack(digits[j]);

60 }

61 return -1;

62 }

63 x = x * 16 + d;

64 Advance();

65 }

66

67 return x;

68 }

69

70

71

72 // ----------------------------------------------------------------------------

73 // JavaScriptScanner

74

75 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)

76 : Scanner(scanner_contants),

77 octal_pos_(Location::invalid()),

78 harmony_block_scoping_(false) { }

79

80

81 void JavaScriptScanner::Initialize(UC16CharacterStream* source) {

82 source_ = source;

83 // Need to capture identifiers in order to recognize "get" and "set"

84 // in object literals.

85 Init();

86 // Skip initial whitespace allowing HTML comment ends just like

87 // after a newline and scan first token.

88 has_line_terminator_before_next_ = true;

89 SkipWhiteSpace();

90 Scan();

91 }

92

93

94 // Ensure that tokens can be stored in a byte.

95 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);

96

97 // Table of one-character tokens, by character (0x00..0x7f only).

98 static const byte one_char_tokens[] = {

99 Token::ILLEGAL,

100 Token::ILLEGAL,

101 Token::ILLEGAL,

102 Token::ILLEGAL,

103 Token::ILLEGAL,

104 Token::ILLEGAL,

105 Token::ILLEGAL,

106 Token::ILLEGAL,

107 Token::ILLEGAL,

108 Token::ILLEGAL,

109 Token::ILLEGAL,

110 Token::ILLEGAL,

111 Token::ILLEGAL,

112 Token::ILLEGAL,

113 Token::ILLEGAL,

114 Token::ILLEGAL,

115 Token::ILLEGAL,

116 Token::ILLEGAL,

117 Token::ILLEGAL,

118 Token::ILLEGAL,

119 Token::ILLEGAL,

120 Token::ILLEGAL,

121 Token::ILLEGAL,

122 Token::ILLEGAL,

123 Token::ILLEGAL,

124 Token::ILLEGAL,

125 Token::ILLEGAL,

126 Token::ILLEGAL,

127 Token::ILLEGAL,

128 Token::ILLEGAL,

129 Token::ILLEGAL,

130 Token::ILLEGAL,

131 Token::ILLEGAL,

132 Token::ILLEGAL,

133 Token::ILLEGAL,

134 Token::ILLEGAL,

135 Token::ILLEGAL,

136 Token::ILLEGAL,

137 Token::ILLEGAL,

138 Token::ILLEGAL,

139 Token::LPAREN, // 0x28

140 Token::RPAREN, // 0x29

141 Token::ILLEGAL,

142 Token::ILLEGAL,

143 Token::COMMA, // 0x2c

144 Token::ILLEGAL,

145 Token::ILLEGAL,

146 Token::ILLEGAL,

147 Token::ILLEGAL,

148 Token::ILLEGAL,

149 Token::ILLEGAL,

150 Token::ILLEGAL,

151 Token::ILLEGAL,

152 Token::ILLEGAL,

153 Token::ILLEGAL,

154 Token::ILLEGAL,

155 Token::ILLEGAL,

156 Token::ILLEGAL,

157 Token::COLON, // 0x3a

158 Token::SEMICOLON, // 0x3b

159 Token::ILLEGAL,

160 Token::ILLEGAL,

161 Token::ILLEGAL,

162 Token::CONDITIONAL, // 0x3f

163 Token::ILLEGAL,

164 Token::ILLEGAL,

165 Token::ILLEGAL,

166 Token::ILLEGAL,

167 Token::ILLEGAL,

168 Token::ILLEGAL,

169 Token::ILLEGAL,

170 Token::ILLEGAL,

171 Token::ILLEGAL,

172 Token::ILLEGAL,

173 Token::ILLEGAL,

174 Token::ILLEGAL,

175 Token::ILLEGAL,

176 Token::ILLEGAL,

177 Token::ILLEGAL,

178 Token::ILLEGAL,

179 Token::ILLEGAL,

180 Token::ILLEGAL,

181 Token::ILLEGAL,

182 Token::ILLEGAL,

183 Token::ILLEGAL,

184 Token::ILLEGAL,

185 Token::ILLEGAL,

186 Token::ILLEGAL,

187 Token::ILLEGAL,

188 Token::ILLEGAL,

189 Token::ILLEGAL,

190 Token::LBRACK, // 0x5b

191 Token::ILLEGAL,

192 Token::RBRACK, // 0x5d

193 Token::ILLEGAL,

194 Token::ILLEGAL,

195 Token::ILLEGAL,

196 Token::ILLEGAL,

197 Token::ILLEGAL,

198 Token::ILLEGAL,

199 Token::ILLEGAL,

200 Token::ILLEGAL,

201 Token::ILLEGAL,

202 Token::ILLEGAL,

203 Token::ILLEGAL,

204 Token::ILLEGAL,

205 Token::ILLEGAL,

206 Token::ILLEGAL,

207 Token::ILLEGAL,

208 Token::ILLEGAL,

209 Token::ILLEGAL,

210 Token::ILLEGAL,

211 Token::ILLEGAL,

212 Token::ILLEGAL,

213 Token::ILLEGAL,

214 Token::ILLEGAL,

215 Token::ILLEGAL,

216 Token::ILLEGAL,

217 Token::ILLEGAL,

218 Token::ILLEGAL,

219 Token::ILLEGAL,

220 Token::ILLEGAL,

221 Token::ILLEGAL,

222 Token::LBRACE, // 0x7b

223 Token::ILLEGAL,

224 Token::RBRACE, // 0x7d

225 Token::BIT_NOT, // 0x7e

226 Token::ILLEGAL

227 };

228

229

230 Token::Value JavaScriptScanner::Next() {

231 current_ = next_;

232 has_line_terminator_before_next_ = false;

233 has_multiline_comment_before_next_ = false;

234 if (static_cast<unsigned>(c0_) <= 0x7f) {

235 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);

236 if (token != Token::ILLEGAL) {

237 int pos = source_pos();

238 next_.token = token;

239 next_.location.beg_pos = pos;

240 next_.location.end_pos = pos + 1;

241 Advance();

242 return current_.token;

243 }

244 }

245 Scan();

246 return current_.token;

247 }

248

249

250 static inline bool IsByteOrderMark(uc32 c) {

251 // The Unicode value U+FFFE is guaranteed never to be assigned as a

252 // Unicode character; this implies that in a Unicode context the

253 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

254 // character expressed in little-endian byte order (since it could

255 // not be a U+FFFE character expressed in big-endian byte

256 // order). Nevertheless, we check for it to be compatible with

257 // Spidermonkey.

258 return c == 0xFEFF \|\| c == 0xFFFE;

259 }

260

261

262 bool JavaScriptScanner::SkipWhiteSpace() {

263 int start_position = source_pos();

264

265 while (true) {

266 // We treat byte-order marks (BOMs) as whitespace for better

267 // compatibility with Spidermonkey and other JavaScript engines.

268 while (unicode_cache_->IsWhiteSpace(c0_) \|\| IsByteOrderMark(c0_)) {

269 // IsWhiteSpace() includes line terminators!

270 if (unicode_cache_->IsLineTerminator(c0_)) {

271 // Ignore line terminators, but remember them. This is necessary

272 // for automatic semicolon insertion.

273 has_line_terminator_before_next_ = true;

274 }

275 Advance();

276 }

277

278 // If there is an HTML comment end '-->' at the beginning of a

279 // line (with only whitespace in front of it), we treat the rest

280 // of the line as a comment. This is in line with the way

281 // SpiderMonkey handles it.

282 if (c0_ == '-' && has_line_terminator_before_next_) {

283 Advance();

284 if (c0_ == '-') {

285 Advance();

286 if (c0_ == '>') {

287 // Treat the rest of the line as a comment.

288 SkipSingleLineComment();

289 // Continue skipping white space after the comment.

290 continue;

291 }

292 PushBack('-'); // undo Advance()

293 }

294 PushBack('-'); // undo Advance()

295 }

296 // Return whether or not we skipped any characters.

297 return source_pos() != start_position;

298 }

299 }

300

301

302 Token::Value JavaScriptScanner::SkipSingleLineComment() {

303 Advance();

304

305 // The line terminator at the end of the line is not considered

306 // to be part of the single-line comment; it is recognized

307 // separately by the lexical grammar and becomes part of the

308 // stream of input elements for the syntactic grammar (see

309 // ECMA-262, section 7.4).

310 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {

311 Advance();

312 }

313

314 return Token::WHITESPACE;

315 }

316

317

318 Token::Value JavaScriptScanner::SkipMultiLineComment() {

319 ASSERT(c0_ == '*');

320 Advance();

321

322 while (c0_ >= 0) {

323 uc32 ch = c0_;

324 Advance();

325 if (unicode_cache_->IsLineTerminator(ch)) {

326 // Following ECMA-262, section 7.4, a comment containing

327 // a newline will make the comment count as a line-terminator.

328 has_multiline_comment_before_next_ = true;

329 }

330 // If we have reached the end of the multi-line comment, we

331 // consume the '/' and insert a whitespace. This way all

332 // multi-line comments are treated as whitespace.

333 if (ch == '*' && c0_ == '/') {

334 c0_ = ' ';

335 return Token::WHITESPACE;

336 }

337 }

338

339 // Unterminated multi-line comment.

340 return Token::ILLEGAL;

341 }

342

343

344 Token::Value JavaScriptScanner::ScanHtmlComment() {

345 // Check for <!-- comments.

346 ASSERT(c0_ == '!');

347 Advance();

348 if (c0_ == '-') {

349 Advance();

350 if (c0_ == '-') return SkipSingleLineComment();

351 PushBack('-'); // undo Advance()

352 }

353 PushBack('!'); // undo Advance()

354 ASSERT(c0_ == '!');

355 return Token::LT;

356 }

357

358

359 void JavaScriptScanner::Scan() {

360 next_.literal_chars = NULL;

361 Token::Value token;

362 do {

363 // Remember the position of the next token

364 next_.location.beg_pos = source_pos();

365

366 switch (c0_) {

367 case ' ':

368 case '\t':

369 Advance();

370 token = Token::WHITESPACE;

371 break;

372

373 case '\n':

374 Advance();

375 has_line_terminator_before_next_ = true;

376 token = Token::WHITESPACE;

377 break;

378

379 case '"': case '\'':

380 token = ScanString();

381 break;

382

383 case '<':

384 // < <= << <<= <!--

385 Advance();

386 if (c0_ == '=') {

387 token = Select(Token::LTE);

388 } else if (c0_ == '<') {

389 token = Select('=', Token::ASSIGN_SHL, Token::SHL);

390 } else if (c0_ == '!') {

391 token = ScanHtmlComment();

392 } else {

393 token = Token::LT;

394 }

395 break;

396

397 case '>':

398 // > >= >> >>= >>> >>>=

399 Advance();

400 if (c0_ == '=') {

401 token = Select(Token::GTE);

402 } else if (c0_ == '>') {

403 // >> >>= >>> >>>=

404 Advance();

405 if (c0_ == '=') {

406 token = Select(Token::ASSIGN_SAR);

407 } else if (c0_ == '>') {

408 token = Select('=', Token::ASSIGN_SHR, Token::SHR);

409 } else {

410 token = Token::SAR;

411 }

412 } else {

413 token = Token::GT;

414 }

415 break;

416

417 case '=':

418 // = == ===

419 Advance();

420 if (c0_ == '=') {

421 token = Select('=', Token::EQ_STRICT, Token::EQ);

422 } else {

423 token = Token::ASSIGN;

424 }

425 break;

426

427 case '!':

428 // ! != !==

429 Advance();

430 if (c0_ == '=') {

431 token = Select('=', Token::NE_STRICT, Token::NE);

432 } else {

433 token = Token::NOT;

434 }

435 break;

436

437 case '+':

438 // + ++ +=

439 Advance();

440 if (c0_ == '+') {

441 token = Select(Token::INC);

442 } else if (c0_ == '=') {

443 token = Select(Token::ASSIGN_ADD);

444 } else {

445 token = Token::ADD;

446 }

447 break;

448

449 case '-':

450 // - -- --> -=

451 Advance();

452 if (c0_ == '-') {

453 Advance();

454 if (c0_ == '>' && has_line_terminator_before_next_) {

455 // For compatibility with SpiderMonkey, we skip lines that

456 // start with an HTML comment end '-->'.

457 token = SkipSingleLineComment();

458 } else {

459 token = Token::DEC;

460 }

461 } else if (c0_ == '=') {

462 token = Select(Token::ASSIGN_SUB);

463 } else {

464 token = Token::SUB;

465 }

466 break;

467

468 case '*':

469 // * *=

470 token = Select('=', Token::ASSIGN_MUL, Token::MUL);

471 break;

472

473 case '%':

474 // % %=

475 token = Select('=', Token::ASSIGN_MOD, Token::MOD);

476 break;

477

478 case '/':

479 // / // /* /=

480 Advance();

481 if (c0_ == '/') {

482 token = SkipSingleLineComment();

483 } else if (c0_ == '*') {

484 token = SkipMultiLineComment();

485 } else if (c0_ == '=') {

486 token = Select(Token::ASSIGN_DIV);

487 } else {

488 token = Token::DIV;

489 }

490 break;

491

492 case '&':

493 // & && &=

494 Advance();

495 if (c0_ == '&') {

496 token = Select(Token::AND);

497 } else if (c0_ == '=') {

498 token = Select(Token::ASSIGN_BIT_AND);

499 } else {

500 token = Token::BIT_AND;

501 }

502 break;

503

504 case '\|':

505 // \| \|\| \|=

506 Advance();

507 if (c0_ == '\|') {

508 token = Select(Token::OR);

509 } else if (c0_ == '=') {

510 token = Select(Token::ASSIGN_BIT_OR);

511 } else {

512 token = Token::BIT_OR;

513 }

514 break;

515

516 case '^':

517 // ^ ^=

518 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);

519 break;

520

521 case '.':

522 // . Number

523 Advance();

524 if (IsDecimalDigit(c0_)) {

525 token = ScanNumber(true);

526 } else {

527 token = Token::PERIOD;

528 }

529 break;

530

531 case ':':

532 token = Select(Token::COLON);

533 break;

534

535 case ';':

536 token = Select(Token::SEMICOLON);

537 break;

538

539 case ',':

540 token = Select(Token::COMMA);

541 break;

542

543 case '(':

544 token = Select(Token::LPAREN);

545 break;

546

547 case ')':

548 token = Select(Token::RPAREN);

549 break;

550

551 case '[':

552 token = Select(Token::LBRACK);

553 break;

554

555 case ']':

556 token = Select(Token::RBRACK);

557 break;

558

559 case '{':

560 token = Select(Token::LBRACE);

561 break;

562

563 case '}':

564 token = Select(Token::RBRACE);

565 break;

566

567 case '?':

568 token = Select(Token::CONDITIONAL);

569 break;

570

571 case '~':

572 token = Select(Token::BIT_NOT);

573 break;

574

575 default:

576 if (unicode_cache_->IsIdentifierStart(c0_)) {

577 token = ScanIdentifierOrKeyword();

578 } else if (IsDecimalDigit(c0_)) {

579 token = ScanNumber(false);

580 } else if (SkipWhiteSpace()) {

581 token = Token::WHITESPACE;

582 } else if (c0_ < 0) {

583 token = Token::EOS;

584 } else {

585 token = Select(Token::ILLEGAL);

586 }

587 break;

588 }

589

590 // Continue scanning for tokens as long as we're just skipping

591 // whitespace.

592 } while (token == Token::WHITESPACE);

593

594 next_.location.end_pos = source_pos();

595 next_.token = token;

596 }

597

598

599 void JavaScriptScanner::SeekForward(int pos) {

600 // After this call, we will have the token at the given position as

601 // the "next" token. The "current" token will be invalid.

602 if (pos == next_.location.beg_pos) return;

603 int current_pos = source_pos();

604 ASSERT_EQ(next_.location.end_pos, current_pos);

605 // Positions inside the lookahead token aren't supported.

606 ASSERT(pos >= current_pos);

607 if (pos != current_pos) {

608 source_->SeekForward(pos - source_->pos());

609 Advance();

610 // This function is only called to seek to the location

611 // of the end of a function (at the "}" token). It doesn't matter

612 // whether there was a line terminator in the part we skip.

613 has_line_terminator_before_next_ = false;

614 has_multiline_comment_before_next_ = false;

615 }

616 Scan();

617 }

618

619

620 void JavaScriptScanner::ScanEscape() {

621 uc32 c = c0_;

622 Advance();

623

624 // Skip escaped newlines.

625 if (unicode_cache_->IsLineTerminator(c)) {

626 // Allow CR+LF newlines in multiline string literals.

627 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();

628 // Allow LF+CR newlines in multiline string literals.

629 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();

630 return;

631 }

632

633 switch (c) {

634 case '\'': // fall through

635 case '"' : // fall through

636 case '\\': break;

637 case 'b' : c = '\b'; break;

638 case 'f' : c = '\f'; break;

639 case 'n' : c = '\n'; break;

640 case 'r' : c = '\r'; break;

641 case 't' : c = '\t'; break;

642 case 'u' : {

643 c = ScanHexNumber(4);

644 if (c < 0) c = 'u';

645 break;

646 }

647 case 'v' : c = '\v'; break;

648 case 'x' : {

649 c = ScanHexNumber(2);

650 if (c < 0) c = 'x';

651 break;

652 }

653 case '0' : // fall through

654 case '1' : // fall through

655 case '2' : // fall through

656 case '3' : // fall through

657 case '4' : // fall through

658 case '5' : // fall through

659 case '6' : // fall through

660 case '7' : c = ScanOctalEscape(c, 2); break;

661 }

662

663 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

664 // should be illegal, but they are commonly handled

665 // as non-escaped characters by JS VMs.

666 AddLiteralChar(c);

667 }

668

669

670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

671 // ECMA-262. Other JS VMs support them.

672 uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {

673 uc32 x = c - '0';

674 int i = 0;

675 for (; i < length; i++) {

676 int d = c0_ - '0';

677 if (d < 0 \|\| d > 7) break;

678 int nx = x * 8 + d;

679 if (nx >= 256) break;

680 x = nx;

681 Advance();

682 }

683 // Anything except '\0' is an octal escape sequence, illegal in strict mode.

684 // Remember the position of octal escape sequences so that an error

685 // can be reported later (in strict mode).

686 // We don't report the error immediately, because the octal escape can

687 // occur before the "use strict" directive.

688 if (c != '0' \|\| i > 0) {

689 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);

690 }

691 return x;

692 }

693

694

695 Token::Value JavaScriptScanner::ScanString() {

696 uc32 quote = c0_;

697 Advance(); // consume quote

698

699 LiteralScope literal(this);

700 while (c0_ != quote && c0_ >= 0

701 && !unicode_cache_->IsLineTerminator(c0_)) {

702 uc32 c = c0_;

703 Advance();

704 if (c == '\\') {

705 if (c0_ < 0) return Token::ILLEGAL;

706 ScanEscape();

707 } else {

708 AddLiteralChar(c);

709 }

710 }

711 if (c0_ != quote) return Token::ILLEGAL;

712 literal.Complete();

713

714 Advance(); // consume quote

715 return Token::STRING;

716 }

717

718

719 void JavaScriptScanner::ScanDecimalDigits() {

720 while (IsDecimalDigit(c0_))

721 AddLiteralCharAdvance();

722 }

723

724

725 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {

726 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction

727

728 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

729

730 LiteralScope literal(this);

731 if (seen_period) {

732 // we have already seen a decimal point of the float

733 AddLiteralChar('.');

734 ScanDecimalDigits(); // we know we have at least one digit

735

736 } else {

737 // if the first character is '0' we must check for octals and hex

738 if (c0_ == '0') {

739 int start_pos = source_pos(); // For reporting octal positions.

740 AddLiteralCharAdvance();

741

742 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

743 if (c0_ == 'x' \|\| c0_ == 'X') {

744 // hex number

745 kind = HEX;

746 AddLiteralCharAdvance();

747 if (!IsHexDigit(c0_)) {

748 // we must have at least one hex digit after 'x'/'X'

749 return Token::ILLEGAL;

750 }

751 while (IsHexDigit(c0_)) {

752 AddLiteralCharAdvance();

753 }

754 } else if ('0' <= c0_ && c0_ <= '7') {

755 // (possible) octal number

756 kind = OCTAL;

757 while (true) {

758 if (c0_ == '8' \|\| c0_ == '9') {

759 kind = DECIMAL;

760 break;

761 }

762 if (c0_ < '0' \|\| '7' < c0_) {

763 // Octal literal finished.

764 octal_pos_ = Location(start_pos, source_pos());

765 break;

766 }

767 AddLiteralCharAdvance();

768 }

769 }

770 }

771

772 // Parse decimal digits and allow trailing fractional part.

773 if (kind == DECIMAL) {

774 ScanDecimalDigits(); // optional

775 if (c0_ == '.') {

776 AddLiteralCharAdvance();

777 ScanDecimalDigits(); // optional

778 }

779 }

780 }

781

782 // scan exponent, if any

783 if (c0_ == 'e' \|\| c0_ == 'E') {

784 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

785 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed

786 // scan exponent

787 AddLiteralCharAdvance();

788 if (c0_ == '+' \|\| c0_ == '-')

789 AddLiteralCharAdvance();

790 if (!IsDecimalDigit(c0_)) {

791 // we must have at least one decimal digit after 'e'/'E'

792 return Token::ILLEGAL;

793 }

794 ScanDecimalDigits();

795 }

796

797 // The source character immediately following a numeric literal must

798 // not be an identifier start or a decimal digit; see ECMA-262

799 // section 7.8.3, page 17 (note that we read only one decimal digit

800 // if the value is 0).

801 if (IsDecimalDigit(c0_) \|\| unicode_cache_->IsIdentifierStart(c0_))

802 return Token::ILLEGAL;

803

804 literal.Complete();

805

806 return Token::NUMBER;

807 }

808

809

810 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {

811 Advance();

812 if (c0_ != 'u') return -1;

813 Advance();

814 uc32 result = ScanHexNumber(4);

815 if (result < 0) PushBack('u');

816 return result;

817 }

818

819

820 // ----------------------------------------------------------------------------

821 // Keyword Matcher

822

// X-macro listing every keyword, grouped by first character.
// KEYWORD_GROUP(ch) is emitted once per first character and
// KEYWORD(text, token) once per keyword. The "let" entry refers to an
// identifier named harmony_block_scoping, which must be in scope wherever
// the list is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
  KEYWORD_GROUP('b') \
  KEYWORD("break", Token::BREAK) \
  KEYWORD_GROUP('c') \
  KEYWORD("case", Token::CASE) \
  KEYWORD("catch", Token::CATCH) \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("const", Token::CONST) \
  KEYWORD("continue", Token::CONTINUE) \
  KEYWORD_GROUP('d') \
  KEYWORD("debugger", Token::DEBUGGER) \
  KEYWORD("default", Token::DEFAULT) \
  KEYWORD("delete", Token::DELETE) \
  KEYWORD("do", Token::DO) \
  KEYWORD_GROUP('e') \
  KEYWORD("else", Token::ELSE) \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("export", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \
  KEYWORD_GROUP('f') \
  KEYWORD("false", Token::FALSE_LITERAL) \
  KEYWORD("finally", Token::FINALLY) \
  KEYWORD("for", Token::FOR) \
  KEYWORD("function", Token::FUNCTION) \
  KEYWORD_GROUP('i') \
  KEYWORD("if", Token::IF) \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("in", Token::IN) \
  KEYWORD("instanceof", Token::INSTANCEOF) \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('l') \
  KEYWORD("let", harmony_block_scoping \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n') \
  KEYWORD("new", Token::NEW) \
  KEYWORD("null", Token::NULL_LITERAL) \
  KEYWORD_GROUP('p') \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('r') \
  KEYWORD("return", Token::RETURN) \
  KEYWORD_GROUP('s') \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD) \
  KEYWORD("switch", Token::SWITCH) \
  KEYWORD_GROUP('t') \
  KEYWORD("this", Token::THIS) \
  KEYWORD("throw", Token::THROW) \
  KEYWORD("true", Token::TRUE_LITERAL) \
  KEYWORD("try", Token::TRY) \
  KEYWORD("typeof", Token::TYPEOF) \
  KEYWORD_GROUP('v') \
  KEYWORD("var", Token::VAR) \
  KEYWORD("void", Token::VOID) \
  KEYWORD_GROUP('w') \
  KEYWORD("while", Token::WHILE) \
  KEYWORD("with", Token::WITH) \
  KEYWORD_GROUP('y') \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)

885

886

887 static Token::Value KeywordOrIdentifierToken(const char* input,

888 int input_length,

889 bool harmony_block_scoping) {

890 ASSERT(input_length >= 1);

891 const int kMinLength = 2;

892 const int kMaxLength = 10;

893 if (input_length < kMinLength \|\| input_length > kMaxLength) {

894 return Token::IDENTIFIER;

895 }

896 switch (input[0]) {

897 default:

898 #define KEYWORD_GROUP_CASE(ch) \

899 break; \

900 case ch:

901 #define KEYWORD(keyword, token) \

902 { \

903 /* 'keyword' is a char array, so sizeof(keyword) is */ \

904 /* strlen(keyword) plus 1 for the NUL char. */ \

905 const int keyword_length = sizeof(keyword) - 1; \

906 STATIC_ASSERT(keyword_length >= kMinLength); \

907 STATIC_ASSERT(keyword_length <= kMaxLength); \

908 if (input_length == keyword_length && \

909 input[1] == keyword[1] && \

910 (keyword_length <= 2 \|\| input[2] == keyword[2]) && \

911 (keyword_length <= 3 \|\| input[3] == keyword[3]) && \

912 (keyword_length <= 4 \|\| input[4] == keyword[4]) && \

913 (keyword_length <= 5 \|\| input[5] == keyword[5]) && \

914 (keyword_length <= 6 \|\| input[6] == keyword[6]) && \

915 (keyword_length <= 7 \|\| input[7] == keyword[7]) && \

916 (keyword_length <= 8 \|\| input[8] == keyword[8]) && \

917 (keyword_length <= 9 \|\| input[9] == keyword[9])) { \

918 return token; \

919 } \

920 }

921 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)

922 }

923 return Token::IDENTIFIER;

924 }

925

926

927 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

928 ASSERT(unicode_cache_->IsIdentifierStart(c0_));

929 LiteralScope literal(this);

930 // Scan identifier start character.

931 if (c0_ == '\\') {

932 uc32 c = ScanIdentifierUnicodeEscape();

933 // Only allow legal identifier start characters.

934 if (c < 0 \|\|

935 c == '\\' \|\| // No recursive escapes.

936 !unicode_cache_->IsIdentifierStart(c)) {

937 return Token::ILLEGAL;

938 }

939 AddLiteralChar(c);

940 return ScanIdentifierSuffix(&literal);

941 }

942

943 uc32 first_char = c0_;

944 Advance();

945 AddLiteralChar(first_char);

946

947 // Scan the rest of the identifier characters.

948 while (unicode_cache_->IsIdentifierPart(c0_)) {

949 if (c0_ != '\\') {

950 uc32 next_char = c0_;

951 Advance();

952 AddLiteralChar(next_char);

953 continue;

954 }

955 // Fallthrough if no longer able to complete keyword.

956 return ScanIdentifierSuffix(&literal);

957 }

958

959 literal.Complete();

960

961 if (next_.literal_chars->is_ascii()) {

962 Vector<const char> chars = next_.literal_chars->ascii_literal();

963 return KeywordOrIdentifierToken(chars.start(),

964 chars.length(),

965 harmony_block_scoping_);

966 }

967

968 return Token::IDENTIFIER;

969 }

970

971

972 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

973 // Scan the rest of the identifier characters.

974 while (unicode_cache_->IsIdentifierPart(c0_)) {

975 if (c0_ == '\\') {

976 uc32 c = ScanIdentifierUnicodeEscape();

977 // Only allow legal identifier part characters.

978 if (c < 0 \|\|

979 c == '\\' \|\|

980 !unicode_cache_->IsIdentifierPart(c)) {

981 return Token::ILLEGAL;

982 }

983 AddLiteralChar(c);

984 } else {

985 AddLiteralChar(c0_);

986 Advance();

987 }

988 }

989 literal->Complete();

990

991 return Token::IDENTIFIER;

992 }

993

994

995 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

996 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

997 bool in_character_class = false;

998

999 // Previous token is either '/' or '/=', in the second case, the

1000 // pattern starts at =.

1001 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

1002 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

1003

1004 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

1005 // the scanner should pass uninterpreted bodies to the RegExp

1006 // constructor.

1007 LiteralScope literal(this);

1008 if (seen_equal) {

1009 AddLiteralChar('=');

1010 }

1011

1012 while (c0_ != '/' \|\| in_character_class) {

1013 if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;

1014 if (c0_ == '\\') { // Escape sequence.

1015 AddLiteralCharAdvance();

1016 if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;

1017 AddLiteralCharAdvance();

1018 // If the escape allows more characters, i.e., \x??, \u????, or \c?,

1019 // only "safe" characters are allowed (letters, digits, underscore),

1020 // otherwise the escape isn't valid and the invalid character has

1021 // its normal meaning. I.e., we can just continue scanning without

1022 // worrying whether the following characters are part of the escape

1023 // or not, since any '/', '\\' or '[' is guaranteed to not be part

1024 // of the escape sequence.

1025

1026 // TODO(896): At some point, parse RegExps more throughly to capture

1027 // octal esacpes in strict mode.

1028 } else { // Unescaped character.

1029 if (c0_ == '[') in_character_class = true;

1030 if (c0_ == ']') in_character_class = false;

1031 AddLiteralCharAdvance();

1032 }

1033 }

1034 Advance(); // consume '/'

1035

1036 literal.Complete();

1037

1038 return true;

1039 }

1040

1041

1042 bool JavaScriptScanner::ScanLiteralUnicodeEscape() {

1043 ASSERT(c0_ == '\\');

1044 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};

1045 Advance();

1046 int i = 1;

1047 if (c0_ == 'u') {

1048 i++;

1049 while (i < 6) {

1050 Advance();

1051 if (!IsHexDigit(c0_)) break;

1052 chars_read[i] = c0_;

1053 i++;

1054 }

1055 }

1056 if (i < 6) {

1057 // Incomplete escape. Undo all advances and return false.

1058 while (i > 0) {

1059 i--;

1060 PushBack(chars_read[i]);

1061 }

1062 return false;

1063 }

1064 // Complete escape. Add all chars to current literal buffer.

1065 for (int i = 0; i < 6; i++) {

1066 AddLiteralChar(chars_read[i]);

1067 }

1068 return true;

1069 }

1070

1071

1072 bool JavaScriptScanner::ScanRegExpFlags() {

1073 // Scan regular expression flags.

1074 LiteralScope literal(this);

1075 while (unicode_cache_->IsIdentifierPart(c0_)) {

1076 if (c0_ != '\\') {

1077 AddLiteralCharAdvance();

1078 } else {

1079 if (!ScanLiteralUnicodeEscape()) {

1080 break;

1081 }

1082 }

1083 }

1084 literal.Complete();

1085

1086 next_.location.end_pos = source_pos() - 1;

1087 return true;

1088 }

1089

1090 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner-base.h ('k') | src/scanner-character-streams.h » ('j') | no next file with comments »