src/prescanner.h - Issue 5188009: Merge preparser Scanner with main JavaScript scanner.

Side by Side Diff: src/prescanner.h

Issue 5188009: Merge preparser Scanner with main JavaScript scanner. (Closed)

Patch Set: Address review. Fix thinko in keyword matcher. Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are

4 // met:

5 //

6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.

12 // * Neither the name of Google Inc. nor the names of its

13 // contributors may be used to endorse or promote products derived

14 // from this software without specific prior written permission.

15 //

16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27

28 #ifndef V8_PRESCANNER_H_

29 #define V8_PRESCANNER_H_

30

31 #include "token.h"

32 #include "char-predicates-inl.h"

33 #include "utils.h"

34 #include "scanner-base.h"

35

36 namespace v8 {

37 namespace preparser {

38

39 namespace i = v8::internal;

40

41 typedef int uc32;

42

// Guards against runaway stack usage. The address of a short-lived object on
// the stack is sampled at construction to derive a limit, and sampled again
// on every query; an address below the limit counts as an overflow.
class PreScannerStackGuard {
 public:
  // Places the limit max_size bytes below the stack address observed here.
  explicit PreScannerStackGuard(int max_size) {
    StackPoint probe;
    limit_ = probe.at() - max_size;
  }

  // Returns true once the sampled stack address has dropped below the limit.
  bool has_overflowed() {
    StackPoint probe;
    return probe.at() < limit_;
  }

 private:
  // Empty helper whose own address serves as the stack sample.
  class StackPoint {
   public:
    char* at() { return reinterpret_cast<char*>(this); }
  };

  char* limit_;
};

57

58

59 // Scanner for preparsing.

60 // InputStream is a source of UC16 characters with limited push-back.

61 // LiteralsBuffer is a collector of (UTF-8) characters used to capture literals.

62 class Scanner {

63 public:

64 enum LiteralType {

65 kLiteralNumber,

66 kLiteralIdentifier,

67 kLiteralString,

68 kLiteralRegExp,

69 kLiteralRegExpFlags

70 };

71

72 class LiteralScope {

73 public:

74 explicit LiteralScope(Scanner* self, LiteralType type);

75 ~LiteralScope();

76 void Complete();

77

78 private:

79 Scanner* scanner_;

80 bool complete_;

81 };

82

83 Scanner();

84

85 void Initialize(i::UTF16Buffer* stream);

86

87 // Returns the next token.

88 i::Token::Value Next();

89

90 // Returns the current token again.

91 i::Token::Value current_token() { return current_.token; }

92

93 // One token look-ahead (past the token returned by Next()).

94 i::Token::Value peek() const { return next_.token; }

95

96 // Returns true if there was a line terminator before the peek'ed token.

97 bool has_line_terminator_before_next() const {

98 return has_line_terminator_before_next_;

99 }

100

101 struct Location {

102 Location(int b, int e) : beg_pos(b), end_pos(e) { }

103 Location() : beg_pos(0), end_pos(0) { }

104 int beg_pos;

105 int end_pos;

106 };

107

108 // Returns the location information for the current token

109 // (the token returned by Next()).

110 Location location() const { return current_.location; }

111 // Returns the location information for the look-ahead token

112 // (the token returned by peek()).

113 Location peek_location() const { return next_.location; }

114

115 // Returns the literal string, if any, for the current token (the

116 // token returned by Next()). The string is 0-terminated and in

117 // UTF-8 format; they may contain 0-characters. Literal strings are

118 // collected for identifiers, strings, and numbers.

119 // These functions only give the correct result if the literal

120 // was scanned between calls to StartLiteral() and TerminateLiteral().

121 const char* literal_string() const {

122 return current_.literal_chars;

123 }

124

125 int literal_length() const {

126 // Excluding terminal '\x00' added by TerminateLiteral().

127 return current_.literal_length - 1;

128 }

129

130 i::Vector<const char> literal() const {

131 return i::Vector<const char>(literal_string(), literal_length());

132 }

133

134 // Returns the literal string for the next token (the token that

135 // would be returned if Next() were called).

136 const char* next_literal_string() const {

137 return next_.literal_chars;

138 }

139

140 // Returns the length of the next token (that would be returned if

141 // Next() were called).

142 int next_literal_length() const {

143 // Excluding terminal '\x00' added by TerminateLiteral().

144 return next_.literal_length - 1;

145 }

146

147 i::Vector<const char> next_literal() const {

148 return i::Vector<const char>(next_literal_string(), next_literal_length());

149 }

150

151 // Scans the input as a regular expression pattern, previous

152 // character(s) must be /(=). Returns true if a pattern is scanned.

153 bool ScanRegExpPattern(bool seen_equal);

154 // Returns true if regexp flags are scanned (always since flags can

155 // be empty).

156 bool ScanRegExpFlags();

157

158 // Seek forward to the given position. This operation does not

159 // work in general, for instance when there are pushed back

160 // characters, but works for seeking forward until simple delimiter

161 // tokens, which is what it is used for.

162 void SeekForward(int pos);

163

164 bool stack_overflow() { return stack_overflow_; }

165

166 static const int kCharacterLookaheadBufferSize = 1;

167 static const int kNoEndPosition = 1;

168

169 private:

170 // The current and look-ahead token.

171 struct TokenDesc {

172 i::Token::Value token;

173 Location location;

174 const char* literal_chars;

175 int literal_length;

176 };

177

178 // Default stack limit is 128K pointers.

179 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.

180

181 void Init(unibrow::CharacterStream* stream);

182

183 // Literal buffer support

184 inline void StartLiteral(LiteralType type);

185 inline void AddLiteralChar(uc32 ch);

186 inline void AddLiteralCharAdvance();

187 inline void TerminateLiteral();

188 // Stops scanning of a literal, e.g., due to an encountered error.

189 inline void DropLiteral();

190

191 // Low-level scanning support.

192 void Advance() { c0_ = source_->Advance(); }

193 void PushBack(uc32 ch) {

194 source_->PushBack(ch);

195 c0_ = ch;

196 }

197

198 bool SkipWhiteSpace();

199

200 i::Token::Value SkipSingleLineComment();

201 i::Token::Value SkipMultiLineComment();

202

203 inline i::Token::Value Select(i::Token::Value tok);

204 inline i::Token::Value Select(uc32 next,

205 i::Token::Value then,

206 i::Token::Value else_);

207

208 // Scans a single JavaScript token.

209 void Scan();

210

211 void ScanDecimalDigits();

212 i::Token::Value ScanNumber(bool seen_period);

213 i::Token::Value ScanIdentifier();

214 uc32 ScanHexEscape(uc32 c, int length);

215 uc32 ScanOctalEscape(uc32 c, int length);

216 void ScanEscape();

217 i::Token::Value ScanString();

218

219 // Scans a possible HTML comment -- begins with '<!'.

220 i::Token::Value ScanHtmlComment();

221

222 // Return the current source position.

223 int source_pos() {

224 return source_->pos() - kCharacterLookaheadBufferSize;

225 }

226

227 // Decodes a unicode escape-sequence which is part of an identifier.

228 // If the escape sequence cannot be decoded the result is kBadRune.

229 uc32 ScanIdentifierUnicodeEscape();

230

231 PreScannerStackGuard stack_guard_;

232

233 TokenDesc current_; // desc for current token (as returned by Next())

234 TokenDesc next_; // desc for next token (one token look-ahead)

235 bool has_line_terminator_before_next_;

236

237 // Source.

238 i::UTF16Buffer* source_;

239

240 // Buffer to hold literal values (identifiers, strings, numerals, regexps and

241 // regexp flags) using '\x00'-terminated UTF-8 encoding.

242 // Handles allocation internally.

243 // Notice that the '\x00' termination is meaningless for strings and regexps

244 // which may contain the zero-character, but can be used as terminator for

245 // identifiers, numerals and regexp flags.Collector

246 i::LiteralCollector literal_buffer_;

247

248 bool stack_overflow_;

249

250 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

251 uc32 c0_;

252 };

253

254

255 // ----------------------------------------------------------------------------

256 // Scanner::LiteralScope

257

258 Scanner::LiteralScope::LiteralScope(

259 Scanner* self, LiteralType type)

260 : scanner_(self), complete_(false) {

261 self->StartLiteral(type);

262 }

263

264

265 Scanner::LiteralScope::~LiteralScope() {

266 if (!complete_) scanner_->DropLiteral();

267 }

268

269 void Scanner::LiteralScope::Complete() {

270 scanner_->TerminateLiteral();

271 complete_ = true;

272 }

273

274

275 // ----------------------------------------------------------------------------

276 // Scanner.

277 Scanner::Scanner()

278 : stack_guard_(kMaxStackSize),

279 has_line_terminator_before_next_(false),

280 source_(NULL),

281 stack_overflow_(false) {}

282

283

284 void Scanner::Initialize(i::UTF16Buffer* stream) {

285 source_ = stream;

286

287 // Initialize current_ to not refer to a literal.

288 current_.literal_length = 0;

289 // Reset literal buffer.

290 literal_buffer_.Reset();

291

292 // Set c0_ (one character ahead)

293 ASSERT(kCharacterLookaheadBufferSize == 1);

294 Advance();

295

296 // Skip initial whitespace allowing HTML comment ends just like

297 // after a newline and scan first token.

298 has_line_terminator_before_next_ = true;

299 SkipWhiteSpace();

300 Scan();

301 }

302

303

304 i::Token::Value Scanner::Next() {

305 // BUG 1215673: Find a thread safe way to set a stack limit in

306 // pre-parse mode. Otherwise, we cannot safely pre-parse from other

307 // threads.

308 current_ = next_;

309 // Check for stack-overflow before returning any tokens.

310 if (stack_guard_.has_overflowed()) {

311 stack_overflow_ = true;

312 next_.token = i::Token::ILLEGAL;

313 } else {

314 has_line_terminator_before_next_ = false;

315 Scan();

316 }

317 return current_.token;

318 }

319

320

321 void Scanner::StartLiteral(LiteralType type) {

322 // Only record string and literal identifiers when preparsing.

323 // Those are the ones that are recorded as symbols. Numbers and

324 // regexps are not recorded.

325 if (type == kLiteralString \|\| type == kLiteralIdentifier) {

326 literal_buffer_.StartLiteral();

327 }

328 }

329

330

331 void Scanner::AddLiteralChar(uc32 c) {

332 literal_buffer_.AddChar(c);

333 }

334

335

336 void Scanner::TerminateLiteral() {

337 i::Vector<const char> chars = literal_buffer_.EndLiteral();

338 next_.literal_chars = chars.start();

339 next_.literal_length = chars.length();

340 }

341

342

343 void Scanner::DropLiteral() {

344 literal_buffer_.DropLiteral();

345 }

346

347

348 void Scanner::AddLiteralCharAdvance() {

349 AddLiteralChar(c0_);

350 Advance();

351 }

352

353

354 static inline bool IsByteOrderMark(uc32 c) {

355 // The Unicode value U+FFFE is guaranteed never to be assigned as a

356 // Unicode character; this implies that in a Unicode context the

357 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

358 // character expressed in little-endian byte order (since it could

359 // not be a U+FFFE character expressed in big-endian byte

360 // order). Nevertheless, we check for it to be compatible with

361 // Spidermonkey.

362 return c == 0xFEFF \|\| c == 0xFFFE;

363 }

364

365

366 bool Scanner::SkipWhiteSpace() {

367 int start_position = source_pos();

368

369 while (true) {

370 // We treat byte-order marks (BOMs) as whitespace for better

371 // compatibility with Spidermonkey and other JavaScript engines.

372 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)

373 \|\| IsByteOrderMark(c0_)) {

374 // IsWhiteSpace() includes line terminators!

375 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {

376 // Ignore line terminators, but remember them. This is necessary

377 // for automatic semicolon insertion.

378 has_line_terminator_before_next_ = true;

379 }

380 Advance();

381 }

382

383 // If there is an HTML comment end '-->' at the beginning of a

384 // line (with only whitespace in front of it), we treat the rest

385 // of the line as a comment. This is in line with the way

386 // SpiderMonkey handles it.

387 if (c0_ == '-' && has_line_terminator_before_next_) {

388 Advance();

389 if (c0_ == '-') {

390 Advance();

391 if (c0_ == '>') {

392 // Treat the rest of the line as a comment.

393 SkipSingleLineComment();

394 // Continue skipping white space after the comment.

395 continue;

396 }

397 PushBack('-'); // undo Advance()

398 }

399 PushBack('-'); // undo Advance()

400 }

401 // Return whether or not we skipped any characters.

402 return source_pos() != start_position;

403 }

404 }

405

406

407 i::Token::Value Scanner::SkipSingleLineComment() {

408 Advance();

409

410 // The line terminator at the end of the line is not considered

411 // to be part of the single-line comment; it is recognized

412 // separately by the lexical grammar and becomes part of the

413 // stream of input elements for the syntactic grammar (see

414 // ECMA-262, section 7.4, page 12).

415 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

416 Advance();

417 }

418

419 return i::Token::WHITESPACE;

420 }

421

422

423 i::Token::Value Scanner::SkipMultiLineComment() {

424 ASSERT(c0_ == '*');

425 Advance();

426

427 while (c0_ >= 0) {

428 char ch = c0_;

429 Advance();

430 // If we have reached the end of the multi-line comment, we

431 // consume the '/' and insert a whitespace. This way all

432 // multi-line comments are treated as whitespace - even the ones

433 // containing line terminators. This contradicts ECMA-262, section

434 // 7.4, page 12, that says that multi-line comments containing

435 // line terminators should be treated as a line terminator, but it

436 // matches the behaviour of SpiderMonkey and KJS.

437 if (ch == '*' && c0_ == '/') {

438 c0_ = ' ';

439 return i::Token::WHITESPACE;

440 }

441 }

442

443 // Unterminated multi-line comment.

444 return i::Token::ILLEGAL;

445 }

446

447

448 i::Token::Value Scanner::ScanHtmlComment() {

449 // Check for <!-- comments.

450 ASSERT(c0_ == '!');

451 Advance();

452 if (c0_ == '-') {

453 Advance();

454 if (c0_ == '-') return SkipSingleLineComment();

455 PushBack('-'); // undo Advance()

456 }

457 PushBack('!'); // undo Advance()

458 ASSERT(c0_ == '!');

459 return i::Token::LT;

460 }

461

462

463 void Scanner::Scan() {

464 next_.literal_length = 0;

465 i::Token::Value token;

466 do {

467 // Remember the position of the next token

468 next_.location.beg_pos = source_pos();

469

470 switch (c0_) {

471 case ' ':

472 case '\t':

473 Advance();

474 token = i::Token::WHITESPACE;

475 break;

476

477 case '\n':

478 Advance();

479 has_line_terminator_before_next_ = true;

480 token = i::Token::WHITESPACE;

481 break;

482

483 case '"': case '\'':

484 token = ScanString();

485 break;

486

487 case '<':

488 // < <= << <<= <!--

489 Advance();

490 if (c0_ == '=') {

491 token = Select(i::Token::LTE);

492 } else if (c0_ == '<') {

493 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);

494 } else if (c0_ == '!') {

495 token = ScanHtmlComment();

496 } else {

497 token = i::Token::LT;

498 }

499 break;

500

501 case '>':

502 // > >= >> >>= >>> >>>=

503 Advance();

504 if (c0_ == '=') {

505 token = Select(i::Token::GTE);

506 } else if (c0_ == '>') {

507 // >> >>= >>> >>>=

508 Advance();

509 if (c0_ == '=') {

510 token = Select(i::Token::ASSIGN_SAR);

511 } else if (c0_ == '>') {

512 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);

513 } else {

514 token = i::Token::SAR;

515 }

516 } else {

517 token = i::Token::GT;

518 }

519 break;

520

521 case '=':

522 // = == ===

523 Advance();

524 if (c0_ == '=') {

525 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);

526 } else {

527 token = i::Token::ASSIGN;

528 }

529 break;

530

531 case '!':

532 // ! != !==

533 Advance();

534 if (c0_ == '=') {

535 token = Select('=', i::Token::NE_STRICT, i::Token::NE);

536 } else {

537 token = i::Token::NOT;

538 }

539 break;

540

541 case '+':

542 // + ++ +=

543 Advance();

544 if (c0_ == '+') {

545 token = Select(i::Token::INC);

546 } else if (c0_ == '=') {

547 token = Select(i::Token::ASSIGN_ADD);

548 } else {

549 token = i::Token::ADD;

550 }

551 break;

552

553 case '-':

554 // - -- --> -=

555 Advance();

556 if (c0_ == '-') {

557 Advance();

558 if (c0_ == '>' && has_line_terminator_before_next_) {

559 // For compatibility with SpiderMonkey, we skip lines that

560 // start with an HTML comment end '-->'.

561 token = SkipSingleLineComment();

562 } else {

563 token = i::Token::DEC;

564 }

565 } else if (c0_ == '=') {

566 token = Select(i::Token::ASSIGN_SUB);

567 } else {

568 token = i::Token::SUB;

569 }

570 break;

571

572 case '*':

573 // * *=

574 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);

575 break;

576

577 case '%':

578 // % %=

579 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);

580 break;

581

582 case '/':

583 // / // /* /=

584 Advance();

585 if (c0_ == '/') {

586 token = SkipSingleLineComment();

587 } else if (c0_ == '*') {

588 token = SkipMultiLineComment();

589 } else if (c0_ == '=') {

590 token = Select(i::Token::ASSIGN_DIV);

591 } else {

592 token = i::Token::DIV;

593 }

594 break;

595

596 case '&':

597 // & && &=

598 Advance();

599 if (c0_ == '&') {

600 token = Select(i::Token::AND);

601 } else if (c0_ == '=') {

602 token = Select(i::Token::ASSIGN_BIT_AND);

603 } else {

604 token = i::Token::BIT_AND;

605 }

606 break;

607

608 case '\|':

609 // \| \|\| \|=

610 Advance();

611 if (c0_ == '\|') {

612 token = Select(i::Token::OR);

613 } else if (c0_ == '=') {

614 token = Select(i::Token::ASSIGN_BIT_OR);

615 } else {

616 token = i::Token::BIT_OR;

617 }

618 break;

619

620 case '^':

621 // ^ ^=

622 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);

623 break;

624

625 case '.':

626 // . Number

627 Advance();

628 if (i::IsDecimalDigit(c0_)) {

629 token = ScanNumber(true);

630 } else {

631 token = i::Token::PERIOD;

632 }

633 break;

634

635 case ':':

636 token = Select(i::Token::COLON);

637 break;

638

639 case ';':

640 token = Select(i::Token::SEMICOLON);

641 break;

642

643 case ',':

644 token = Select(i::Token::COMMA);

645 break;

646

647 case '(':

648 token = Select(i::Token::LPAREN);

649 break;

650

651 case ')':

652 token = Select(i::Token::RPAREN);

653 break;

654

655 case '[':

656 token = Select(i::Token::LBRACK);

657 break;

658

659 case ']':

660 token = Select(i::Token::RBRACK);

661 break;

662

663 case '{':

664 token = Select(i::Token::LBRACE);

665 break;

666

667 case '}':

668 token = Select(i::Token::RBRACE);

669 break;

670

671 case '?':

672 token = Select(i::Token::CONDITIONAL);

673 break;

674

675 case '~':

676 token = Select(i::Token::BIT_NOT);

677 break;

678

679 default:

680 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {

681 token = ScanIdentifier();

682 } else if (i::IsDecimalDigit(c0_)) {

683 token = ScanNumber(false);

684 } else if (SkipWhiteSpace()) {

685 token = i::Token::WHITESPACE;

686 } else if (c0_ < 0) {

687 token = i::Token::EOS;

688 } else {

689 token = Select(i::Token::ILLEGAL);

690 }

691 break;

692 }

693

694 // Continue scanning for tokens as long as we're just skipping

695 // whitespace.

696 } while (token == i::Token::WHITESPACE);

697

698 next_.location.end_pos = source_pos();

699 next_.token = token;

700 }

701

702

703 void Scanner::SeekForward(int pos) {

704 source_->SeekForward(pos - 1);

705 Advance();

706 // This function is only called to seek to the location

707 // of the end of a function (at the "}" token). It doesn't matter

708 // whether there was a line terminator in the part we skip.

709 has_line_terminator_before_next_ = false;

710 Scan();

711 }

712

713

714 uc32 Scanner::ScanHexEscape(uc32 c, int length) {

715 ASSERT(length <= 4); // prevent overflow

716

717 uc32 digits[4];

718 uc32 x = 0;

719 for (int i = 0; i < length; i++) {

720 digits[i] = c0_;

721 int d = i::HexValue(c0_);

722 if (d < 0) {

723 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

724 // should be illegal, but other JS VMs just return the

725 // non-escaped version of the original character.

726

727 // Push back digits read, except the last one (in c0_).

728 for (int j = i-1; j >= 0; j--) {

729 PushBack(digits[j]);

730 }

731 // Notice: No handling of error - treat it as "\u"->"u".

732 return c;

733 }

734 x = x * 16 + d;

735 Advance();

736 }

737

738 return x;

739 }

740

741

742 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

743 // ECMA-262. Other JS VMs support them.

744 uc32 Scanner::ScanOctalEscape(

745 uc32 c, int length) {

746 uc32 x = c - '0';

747 for (int i = 0; i < length; i++) {

748 int d = c0_ - '0';

749 if (d < 0 \|\| d > 7) break;

750 int nx = x * 8 + d;

751 if (nx >= 256) break;

752 x = nx;

753 Advance();

754 }

755 return x;

756 }

757

758

759 void Scanner::ScanEscape() {

760 uc32 c = c0_;

761 Advance();

762

763 // Skip escaped newlines.

764 if (i::ScannerConstants::kIsLineTerminator.get(c)) {

765 // Allow CR+LF newlines in multiline string literals.

766 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();

767 // Allow LF+CR newlines in multiline string literals.

768 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();

769 return;

770 }

771

772 switch (c) {

773 case '\'': // fall through

774 case '"' : // fall through

775 case '\\': break;

776 case 'b' : c = '\b'; break;

777 case 'f' : c = '\f'; break;

778 case 'n' : c = '\n'; break;

779 case 'r' : c = '\r'; break;

780 case 't' : c = '\t'; break;

781 case 'u' : c = ScanHexEscape(c, 4); break;

782 case 'v' : c = '\v'; break;

783 case 'x' : c = ScanHexEscape(c, 2); break;

784 case '0' : // fall through

785 case '1' : // fall through

786 case '2' : // fall through

787 case '3' : // fall through

788 case '4' : // fall through

789 case '5' : // fall through

790 case '6' : // fall through

791 case '7' : c = ScanOctalEscape(c, 2); break;

792 }

793

794 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

795 // should be illegal, but they are commonly handled

796 // as non-escaped characters by JS VMs.

797 AddLiteralChar(c);

798 }

799

800

801 i::Token::Value Scanner::ScanString() {

802 uc32 quote = c0_;

803 Advance(); // consume quote

804

805 LiteralScope literal(this, kLiteralString);

806 while (c0_ != quote && c0_ >= 0

807 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

808 uc32 c = c0_;

809 Advance();

810 if (c == '\\') {

811 if (c0_ < 0) return i::Token::ILLEGAL;

812 ScanEscape();

813 } else {

814 AddLiteralChar(c);

815 }

816 }

817 if (c0_ != quote) return i::Token::ILLEGAL;

818 literal.Complete();

819

820 Advance(); // consume quote

821 return i::Token::STRING;

822 }

823

824

825 i::Token::Value Scanner::Select(

826 i::Token::Value tok) {

827 Advance();

828 return tok;

829 }

830

831

832 i::Token::Value Scanner::Select(

833 uc32 next,

834 i::Token::Value then,

835 i::Token::Value else_) {

836 Advance();

837 if (c0_ == next) {

838 Advance();

839 return then;

840 } else {

841 return else_;

842 }

843 }

844

845

846 // Returns true if any decimal digits were scanned, returns false otherwise.

847 void Scanner::ScanDecimalDigits() {

848 while (i::IsDecimalDigit(c0_))

849 AddLiteralCharAdvance();

850 }

851

852

853 i::Token::Value Scanner::ScanNumber(

854 bool seen_period) {

855 // c0_ is the first digit of the number or the fraction.

856 ASSERT(i::IsDecimalDigit(c0_));

857

858 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

859

860 LiteralScope literal(this, kLiteralNumber);

861 if (seen_period) {

862 // we have already seen a decimal point of the float

863 AddLiteralChar('.');

864 ScanDecimalDigits(); // we know we have at least one digit

865

866 } else {

867 // if the first character is '0' we must check for octals and hex

868 if (c0_ == '0') {

869 AddLiteralCharAdvance();

870

871 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

872 if (c0_ == 'x' \|\| c0_ == 'X') {

873 // hex number

874 kind = HEX;

875 AddLiteralCharAdvance();

876 if (!i::IsHexDigit(c0_)) {

877 // we must have at least one hex digit after 'x'/'X'

878 return i::Token::ILLEGAL;

879 }

880 while (i::IsHexDigit(c0_)) {

881 AddLiteralCharAdvance();

882 }

883 } else if ('0' <= c0_ && c0_ <= '7') {

884 // (possible) octal number

885 kind = OCTAL;

886 while (true) {

887 if (c0_ == '8' \|\| c0_ == '9') {

888 kind = DECIMAL;

889 break;

890 }

891 if (c0_ < '0' \|\| '7' < c0_) break;

892 AddLiteralCharAdvance();

893 }

894 }

895 }

896

897 // Parse decimal digits and allow trailing fractional part.

898 if (kind == DECIMAL) {

899 ScanDecimalDigits(); // optional

900 if (c0_ == '.') {

901 AddLiteralCharAdvance();

902 ScanDecimalDigits(); // optional

903 }

904 }

905 }

906

907 // scan exponent, if any

908 if (c0_ == 'e' \|\| c0_ == 'E') {

909 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

910 if (kind == OCTAL) return i::Token::ILLEGAL;

911 // scan exponent

912 AddLiteralCharAdvance();

913 if (c0_ == '+' \|\| c0_ == '-')

914 AddLiteralCharAdvance();

915 if (!i::IsDecimalDigit(c0_)) {

916 // we must have at least one decimal digit after 'e'/'E'

917 return i::Token::ILLEGAL;

918 }

919 ScanDecimalDigits();

920 }

921

922 // The source character immediately following a numeric literal must

923 // not be an identifier start or a decimal digit; see ECMA-262

924 // section 7.8.3, page 17 (note that we read only one decimal digit

925 // if the value is 0).

926 if (i::IsDecimalDigit(c0_)

927 \|\| i::ScannerConstants::kIsIdentifierStart.get(c0_))

928 return i::Token::ILLEGAL;

929

930 literal.Complete();

931

932 return i::Token::NUMBER;

933 }

934

935

936 uc32 Scanner::ScanIdentifierUnicodeEscape() {

937 Advance();

938 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

939 Advance();

940 uc32 c = ScanHexEscape('u', 4);

941 // We do not allow a unicode escape sequence to start another

942 // unicode escape sequence.

943 if (c == '\\') return unibrow::Utf8::kBadChar;

944 return c;

945 }

946

947

948 i::Token::Value Scanner::ScanIdentifier() {

949 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));

950

951 LiteralScope literal(this, kLiteralIdentifier);

952 i::KeywordMatcher keyword_match;

953

954 // Scan identifier start character.

955 if (c0_ == '\\') {

956 uc32 c = ScanIdentifierUnicodeEscape();

957 // Only allow legal identifier start characters.

958 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {

959 return i::Token::ILLEGAL;

960 }

961 AddLiteralChar(c);

962 keyword_match.Fail();

963 } else {

964 AddLiteralChar(c0_);

965 keyword_match.AddChar(c0_);

966 Advance();

967 }

968

969 // Scan the rest of the identifier characters.

970 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

971 if (c0_ == '\\') {

972 uc32 c = ScanIdentifierUnicodeEscape();

973 // Only allow legal identifier part characters.

974 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {

975 return i::Token::ILLEGAL;

976 }

977 AddLiteralChar(c);

978 keyword_match.Fail();

979 } else {

980 AddLiteralChar(c0_);

981 keyword_match.AddChar(c0_);

982 Advance();

983 }

984 }

985 literal.Complete();

986

987 return keyword_match.token();

988 }

989

990

991 bool Scanner::ScanRegExpPattern(bool seen_equal) {

992 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

993 bool in_character_class = false;

994

995 // Previous token is either '/' or '/=', in the second case, the

996 // pattern starts at =.

997 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

998 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

999

1000 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

1001 // the scanner should pass uninterpreted bodies to the RegExp

1002 // constructor.

1003 LiteralScope literal(this, kLiteralRegExp);

1004 if (seen_equal)

1005 AddLiteralChar('=');

1006

1007 while (c0_ != '/' \|\| in_character_class) {

1008 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

1009 return false;

1010 }

1011 if (c0_ == '\\') { // escaped character

1012 AddLiteralCharAdvance();

1013 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

1014 return false;

1015 }

1016 AddLiteralCharAdvance();

1017 } else { // unescaped character

1018 if (c0_ == '[') in_character_class = true;

1019 if (c0_ == ']') in_character_class = false;

1020 AddLiteralCharAdvance();

1021 }

1022 }

1023 Advance(); // consume '/'

1024

1025 literal.Complete();

1026

1027 return true;

1028 }

1029

1030 bool Scanner::ScanRegExpFlags() {

1031 // Scan regular expression flags.

1032 LiteralScope literal(this, kLiteralRegExpFlags);

1033 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

1034 if (c0_ == '\\') {

1035 uc32 c = ScanIdentifierUnicodeEscape();

1036 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

1037 // We allow any escaped character, unlike the restriction on

1038 // IdentifierPart when it is used to build an IdentifierName.

1039 AddLiteralChar(c);

1040 continue;

1041 }

1042 }

1043 AddLiteralCharAdvance();

1044 }

1045 literal.Complete();

1046

1047 next_.location.end_pos = source_pos() - 1;

1048 return true;

1049 }

1050

1051

1052 } } // namespace v8::preparser

1053

1054 #endif // V8_PRESCANNER_H_

OLD	NEW

« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »