src/scanner.cc - Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base.

Side by Side Diff: src/scanner.cc

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 18 matching lines...) Expand all Loading...
29	29

30 #include "ast.h"	30 #include "ast.h"

31 #include "handles.h"	31 #include "handles.h"

32 #include "scanner.h"	32 #include "scanner.h"

33 #include "unicode-inl.h"	33 #include "unicode-inl.h"

34	34

35 namespace v8 {	35 namespace v8 {

36 namespace internal {	36 namespace internal {

37	37

38 // ----------------------------------------------------------------------------	38 // ----------------------------------------------------------------------------

39 // UTF8Buffer

40

41 UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity), recording_(false) { }

42

43

44 UTF8Buffer::~UTF8Buffer() {}

45

46

47 void UTF8Buffer::AddCharSlow(uc32 c) {

48 ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);

49 int length = unibrow::Utf8::Length(c);

50 Vector<char> block = buffer_.AddBlock(length, '\0');

51 #ifdef DEBUG

52 int written_length = unibrow::Utf8::Encode(block.start(), c);

53 CHECK_EQ(length, written_length);

54 #else

55 unibrow::Utf8::Encode(block.start(), c);

56 #endif

57 }

58

59

60 // ----------------------------------------------------------------------------

61 // UTF16Buffer	39 // UTF16Buffer

62	40

63

64 UTF16Buffer::UTF16Buffer()

65 : pos_(0), end_(Scanner::kNoEndPosition) { }

66

67

68 // CharacterStreamUTF16Buffer	41 // CharacterStreamUTF16Buffer

69 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()	42 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()

70 : pushback_buffer_(0), last_(0), stream_(NULL) { }	43 : pushback_buffer_(0), last_(0), stream_(NULL) { }

71	44

72	45

73 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,	46 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,

74 unibrow::CharacterStream* input,	47 unibrow::CharacterStream* input,

75 int start_position,	48 int start_position,

76 int end_position) {	49 int end_position) {

77 stream_ = input;	50 stream_ = input;

78 if (start_position > 0) {	51 if (start_position > 0) {

79 SeekForward(start_position);	52 SeekForward(start_position);

80 }	53 }

81 end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt;	54 end_ = end_position != kNoEndPosition ? end_position : kMaxInt;

82 }	55 }

83	56

84	57

85 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {	58 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {

86 pushback_buffer()->Add(last_);	59 pushback_buffer()->Add(last_);

87 last_ = ch;	60 last_ = ch;

88 pos_--;	61 pos_--;

89 }	62 }

90	63

91	64

92 uc32 CharacterStreamUTF16Buffer::Advance() {	65 uc32 CharacterStreamUTF16Buffer::Advance() {

93 ASSERT(end_ != Scanner::kNoEndPosition);	66 ASSERT(end_ != kNoEndPosition);

94 ASSERT(end_ >= 0);	67 ASSERT(end_ >= 0);

95 // NOTE: It is of importance to Persian / Farsi resources that we do	68 // NOTE: It is of importance to Persian / Farsi resources that we do

96 // not strip format control characters in the scanner; see	69 // not strip format control characters in the scanner; see

97 //	70 //

98 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152	71 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152

99 //	72 //

100 // So, even though ECMA-262, section 7.1, page 11, dictates that we	73 // So, even though ECMA-262, section 7.1, page 11, dictates that we

101 // must remove Unicode format-control characters, we do not. This is	74 // must remove Unicode format-control characters, we do not. This is

102 // in line with how IE and SpiderMonkey handle it.	75 // in line with how IE and SpiderMonkey handle it.

103 if (!pushback_buffer()->is_empty()) {	76 if (!pushback_buffer()->is_empty()) {

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
136 if (!complete_) scanner_->DropLiteral();	109 if (!complete_) scanner_->DropLiteral();

137 }	110 }

138	111

139	112

140 void Scanner::LiteralScope::Complete() {	113 void Scanner::LiteralScope::Complete() {

141 scanner_->TerminateLiteral();	114 scanner_->TerminateLiteral();

142 complete_ = true;	115 complete_ = true;

143 }	116 }

144	117

145 // ----------------------------------------------------------------------------	118 // ----------------------------------------------------------------------------

146 // Scanner	119 // V8JavaScriptScanner

147	120

148 Scanner::Scanner()	121 void V8JavaScriptScanner::Initialize(Handle<String> source) {

149 : has_line_terminator_before_next_(false),	122 source_ = stream_initializer_.Init(source, NULL, 0, source->length());

150 is_parsing_json_(false),	123 Init();

151 source_(NULL),	124 // Skip initial whitespace allowing HTML comment ends just like

152 stack_overflow_(false) {}	125 // after a newline and scan first token.

153	126 has_line_terminator_before_next_ = true;

154	127 SkipWhiteSpace();

155 void Scanner::Initialize(Handle<String> source,	128 Scan();

156 ParserLanguage language) {

157 Init(source, NULL, 0, source->length(), language);

158 }	129 }

159	130

160	131

161 void Scanner::Initialize(Handle<String> source,	132 void V8JavaScriptScanner::Initialize(Handle<String> source,

162 unibrow::CharacterStream* stream,	133 unibrow::CharacterStream* stream) {

163 ParserLanguage language) {	134 source_ = stream_initializer_.Init(source, stream,

164 Init(source, stream, 0, kNoEndPosition, language);	135 0, UTF16Buffer::kNoEndPosition);

	136 Init();

	137 // Skip initial whitespace allowing HTML comment ends just like

	138 // after a newline and scan first token.

	139 has_line_terminator_before_next_ = true;

	140 SkipWhiteSpace();

	141 Scan();

165 }	142 }

166	143

167	144

168 void Scanner::Initialize(Handle<String> source,	145 void V8JavaScriptScanner::Initialize(Handle<String> source,

169 int start_position,	146 int start_position,

170 int end_position,	147 int end_position) {

171 ParserLanguage language) {	148 source_ = stream_initializer_.Init(source, NULL,

172 Init(source, NULL, start_position, end_position, language);	149 start_position, end_position);

	150 Init();

	151 // Skip initial whitespace allowing HTML comment ends just like

	152 // after a newline and scan first token.

	153 has_line_terminator_before_next_ = true;

	154 SkipWhiteSpace();

	155 Scan();

173 }	156 }

174	157

175	158

176 void Scanner::Init(Handle<String> source,	159 Token::Value V8JavaScriptScanner::NextCheckStack() {

177 unibrow::CharacterStream* stream,	160 // BUG 1215673: Find a thread safe way to set a stack limit in

178 int start_position,	161 // pre-parse mode. Otherwise, we cannot safely pre-parse from other

179 int end_position,	162 // threads.

180 ParserLanguage language) {	163 StackLimitCheck check;

	164 if (check.HasOverflowed()) {

	165 stack_overflow_ = true;

	166 current_ = next_;

	167 next_.token = Token::ILLEGAL;

	168 return current_.token;

	169 } else {

	170 return Next();

	171 }

	172 }

	173

	174

	175 UTF16Buffer* StreamInitializer::Init(Handle<String> source,

	176 unibrow::CharacterStream* stream,

	177 int start_position,

	178 int end_position) {

181 // Either initialize the scanner from a character stream or from a	179 // Either initialize the scanner from a character stream or from a

182 // string.	180 // string.

183 ASSERT(source.is_null() || stream == NULL);	181 ASSERT(source.is_null() || stream == NULL);

184	182

185 // Initialize the source buffer.	183 // Initialize the source buffer.

186 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {	184 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {

187 two_byte_string_buffer_.Initialize(	185 two_byte_string_buffer_.Initialize(

188 Handle<ExternalTwoByteString>::cast(source),	186 Handle<ExternalTwoByteString>::cast(source),

189 start_position,	187 start_position,

190 end_position);	188 end_position);

191 source_ = &two_byte_string_buffer_;	189 return &two_byte_string_buffer_;

192 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {	190 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {

193 ascii_string_buffer_.Initialize(	191 ascii_string_buffer_.Initialize(

194 Handle<ExternalAsciiString>::cast(source),	192 Handle<ExternalAsciiString>::cast(source),

195 start_position,	193 start_position,

196 end_position);	194 end_position);

197 source_ = &ascii_string_buffer_;	195 return &ascii_string_buffer_;

198 } else {	196 } else {

199 if (!source.is_null()) {	197 if (!source.is_null()) {

200 safe_string_input_buffer_.Reset(source.location());	198 safe_string_input_buffer_.Reset(source.location());

201 stream = &safe_string_input_buffer_;	199 stream = &safe_string_input_buffer_;

202 }	200 }

203 char_stream_buffer_.Initialize(source,	201 char_stream_buffer_.Initialize(source,

204 stream,	202 stream,

205 start_position,	203 start_position,

206 end_position);	204 end_position);

207 source_ = &char_stream_buffer_;	205 return &char_stream_buffer_;

208 }	206 }

	207 }

209	208

210 is_parsing_json_ = (language == JSON);	209 // ----------------------------------------------------------------------------

	210 // JsonScanner

211	211

212 // Set c0_ (one character ahead)	212 JsonScanner::JsonScanner() {}

213 ASSERT(kCharacterLookaheadBufferSize == 1);

214 Advance();

215 // Initialize current_ to not refer to a literal.

216 current_.literal_chars = Vector<const char>();

217 // Reset literal buffer.

218 literal_buffer_.Reset();

219	213

220 // Skip initial whitespace allowing HTML comment ends just like	214

221 // after a newline and scan first token.	215 void JsonScanner::Initialize(Handle<String> source) {

222 has_line_terminator_before_next_ = true;	216 source_ = stream_initializer_.Init(source, NULL, 0, source->length());

223 SkipWhiteSpace();	217 Init();

224 Scan();	218 // Skip initial whitespace.

	219 SkipJsonWhiteSpace();

	220 // Preload first token as look-ahead.

	221 ScanJson();

225 }	222 }

226	223

227	224

228 Token::Value Scanner::Next() {	225 Token::Value JsonScanner::Next() {

229 // BUG 1215673: Find a thread safe way to set a stack limit in	226 // BUG 1215673: Find a thread safe way to set a stack limit in

230 // pre-parse mode. Otherwise, we cannot safely pre-parse from other	227 // pre-parse mode. Otherwise, we cannot safely pre-parse from other

231 // threads.	228 // threads.

232 current_ = next_;	229 current_ = next_;

233 // Check for stack-overflow before returning any tokens.	230 // Check for stack-overflow before returning any tokens.

234 StackLimitCheck check;	231 StackLimitCheck check;

235 if (check.HasOverflowed()) {	232 if (check.HasOverflowed()) {

236 stack_overflow_ = true;	233 stack_overflow_ = true;

237 next_.token = Token::ILLEGAL;	234 next_.token = Token::ILLEGAL;

238 } else {	235 } else {

239 has_line_terminator_before_next_ = false;	236 ScanJson();

240 Scan();

241 }	237 }

242 return current_.token;	238 return current_.token;

243 }	239 }

244	240

245	241

246 void Scanner::StartLiteral() {	242 bool JsonScanner::SkipJsonWhiteSpace() {

247 literal_buffer_.StartLiteral();

248 }

249

250

251 void Scanner::AddLiteralChar(uc32 c) {

252 literal_buffer_.AddChar(c);

253 }

254

255

256 void Scanner::TerminateLiteral() {

257 next_.literal_chars = literal_buffer_.EndLiteral();

258 }

259

260

261 void Scanner::DropLiteral() {

262 literal_buffer_.DropLiteral();

263 }

264

265

266 void Scanner::AddLiteralCharAdvance() {

267 AddLiteralChar(c0_);

268 Advance();

269 }

270

271

272 static inline bool IsByteOrderMark(uc32 c) {

273 // The Unicode value U+FFFE is guaranteed never to be assigned as a

274 // Unicode character; this implies that in a Unicode context the

275 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

276 // character expressed in little-endian byte order (since it could

277 // not be a U+FFFE character expressed in big-endian byte

278 // order). Nevertheless, we check for it to be compatible with

279 // Spidermonkey.

280 return c == 0xFEFF || c == 0xFFFE;

281 }

282

283

284 bool Scanner::SkipJsonWhiteSpace() {

285 int start_position = source_pos();	243 int start_position = source_pos();

286 // JSON WhiteSpace is tab, carriage-return, newline and space.	244 // JSON WhiteSpace is tab, carriage-return, newline and space.

287 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {	245 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {

288 Advance();	246 Advance();

289 }	247 }

290 return source_pos() != start_position;	248 return source_pos() != start_position;

291 }	249 }

292	250

293	251

294 bool Scanner::SkipJavaScriptWhiteSpace() {	252 void JsonScanner::ScanJson() {

295 int start_position = source_pos();

296

297 while (true) {

298 // We treat byte-order marks (BOMs) as whitespace for better

299 // compatibility with Spidermonkey and other JavaScript engines.

300 while (ScannerConstants::kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {

301 // IsWhiteSpace() includes line terminators!

302 if (ScannerConstants::kIsLineTerminator.get(c0_)) {

303 // Ignore line terminators, but remember them. This is necessary

304 // for automatic semicolon insertion.

305 has_line_terminator_before_next_ = true;

306 }

307 Advance();

308 }

309

310 // If there is an HTML comment end '-->' at the beginning of a

311 // line (with only whitespace in front of it), we treat the rest

312 // of the line as a comment. This is in line with the way

313 // SpiderMonkey handles it.

314 if (c0_ == '-' && has_line_terminator_before_next_) {

315 Advance();

316 if (c0_ == '-') {

317 Advance();

318 if (c0_ == '>') {

319 // Treat the rest of the line as a comment.

320 SkipSingleLineComment();

321 // Continue skipping white space after the comment.

322 continue;

323 }

324 PushBack('-'); // undo Advance()

325 }

326 PushBack('-'); // undo Advance()

327 }

328 // Return whether or not we skipped any characters.

329 return source_pos() != start_position;

330 }

331 }

332

333

334 Token::Value Scanner::SkipSingleLineComment() {

335 Advance();

336

337 // The line terminator at the end of the line is not considered

338 // to be part of the single-line comment; it is recognized

339 // separately by the lexical grammar and becomes part of the

340 // stream of input elements for the syntactic grammar (see

341 // ECMA-262, section 7.4, page 12).

342 while (c0_ >= 0 && !ScannerConstants::kIsLineTerminator.get(c0_)) {

343 Advance();

344 }

345

346 return Token::WHITESPACE;

347 }

348

349

350 Token::Value Scanner::SkipMultiLineComment() {

351 ASSERT(c0_ == '*');

352 Advance();

353

354 while (c0_ >= 0) {

355 char ch = c0_;

356 Advance();

357 // If we have reached the end of the multi-line comment, we

358 // consume the '/' and insert a whitespace. This way all

359 // multi-line comments are treated as whitespace - even the ones

360 // containing line terminators. This contradicts ECMA-262, section

361 // 7.4, page 12, that says that multi-line comments containing

362 // line terminators should be treated as a line terminator, but it

363 // matches the behaviour of SpiderMonkey and KJS.

364 if (ch == '*' && c0_ == '/') {

365 c0_ = ' ';

366 return Token::WHITESPACE;

367 }

368 }

369

370 // Unterminated multi-line comment.

371 return Token::ILLEGAL;

372 }

373

374

375 Token::Value Scanner::ScanHtmlComment() {

376 // Check for <!-- comments.

377 ASSERT(c0_ == '!');

378 Advance();

379 if (c0_ == '-') {

380 Advance();

381 if (c0_ == '-') return SkipSingleLineComment();

382 PushBack('-'); // undo Advance()

383 }

384 PushBack('!'); // undo Advance()

385 ASSERT(c0_ == '!');

386 return Token::LT;

387 }

388

389

390

391 void Scanner::ScanJson() {

392 next_.literal_chars = Vector<const char>();	253 next_.literal_chars = Vector<const char>();

393 Token::Value token;	254 Token::Value token;

394 has_line_terminator_before_next_ = false;

395 do {	255 do {

396 // Remember the position of the next token	256 // Remember the position of the next token

397 next_.location.beg_pos = source_pos();	257 next_.location.beg_pos = source_pos();

398 switch (c0_) {	258 switch (c0_) {

399 case '\t':	259 case '\t':

400 case '\r':	260 case '\r':

401 case '\n':	261 case '\n':

402 case ' ':	262 case ' ':

403 Advance();	263 Advance();

404 token = Token::WHITESPACE;	264 token = Token::WHITESPACE;

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
461 token = Select(Token::ILLEGAL);	321 token = Select(Token::ILLEGAL);

462 }	322 }

463 }	323 }

464 } while (token == Token::WHITESPACE);	324 } while (token == Token::WHITESPACE);

465	325

466 next_.location.end_pos = source_pos();	326 next_.location.end_pos = source_pos();

467 next_.token = token;	327 next_.token = token;

468 }	328 }

469	329

470	330

471 Token::Value Scanner::ScanJsonString() {	331 Token::Value JsonScanner::ScanJsonString() {

472 ASSERT_EQ('"', c0_);	332 ASSERT_EQ('"', c0_);

473 Advance();	333 Advance();

474 LiteralScope literal(this);	334 LiteralScope literal(this);

475 while (c0_ != '"' && c0_ > 0) {	335 while (c0_ != '"' && c0_ > 0) {

476 // Check for control character (0x00-0x1f) or unterminated string (<0).	336 // Check for control character (0x00-0x1f) or unterminated string (<0).

477 if (c0_ < 0x20) return Token::ILLEGAL;	337 if (c0_ < 0x20) return Token::ILLEGAL;

478 if (c0_ != '\\') {	338 if (c0_ != '\\') {

479 AddLiteralCharAdvance();	339 AddLiteralCharAdvance();

480 } else {	340 } else {

481 Advance();	341 Advance();

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
521 }	381 }

522 if (c0_ != '"') {	382 if (c0_ != '"') {

523 return Token::ILLEGAL;	383 return Token::ILLEGAL;

524 }	384 }

525 literal.Complete();	385 literal.Complete();

526 Advance();	386 Advance();

527 return Token::STRING;	387 return Token::STRING;

528 }	388 }

529	389

530	390

531 Token::Value Scanner::ScanJsonNumber() {	391 Token::Value JsonScanner::ScanJsonNumber() {

532 LiteralScope literal(this);	392 LiteralScope literal(this);

533 if (c0_ == '-') AddLiteralCharAdvance();	393 if (c0_ == '-') AddLiteralCharAdvance();

534 if (c0_ == '0') {	394 if (c0_ == '0') {

535 AddLiteralCharAdvance();	395 AddLiteralCharAdvance();

536 // Prefix zero is only allowed if it's the only digit before	396 // Prefix zero is only allowed if it's the only digit before

537 // a decimal point or exponent.	397 // a decimal point or exponent.

538 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;	398 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;

539 } else {	399 } else {

540 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;	400 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;

541 do {	401 do {

(...skipping 13 matching lines...) Expand all Loading...
555 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;	415 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;

556 do {	416 do {

557 AddLiteralCharAdvance();	417 AddLiteralCharAdvance();

558 } while (c0_ >= '0' && c0_ <= '9');	418 } while (c0_ >= '0' && c0_ <= '9');

559 }	419 }

560 literal.Complete();	420 literal.Complete();

561 return Token::NUMBER;	421 return Token::NUMBER;

562 }	422 }

563	423

564	424

565 Token::Value Scanner::ScanJsonIdentifier(const char* text,	425 Token::Value JsonScanner::ScanJsonIdentifier(const char* text,

566 Token::Value token) {	426 Token::Value token) {

567 LiteralScope literal(this);	427 LiteralScope literal(this);

568 while (*text != '\0') {	428 while (*text != '\0') {

569 if (c0_ != *text) return Token::ILLEGAL;	429 if (c0_ != *text) return Token::ILLEGAL;

570 Advance();	430 Advance();

571 text++;	431 text++;

572 }	432 }

573 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;	433 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;

574 literal.Complete();	434 literal.Complete();

575 return token;	435 return token;

576 }	436 }

577	437

578	438

579 void Scanner::ScanJavaScript() {

580 next_.literal_chars = Vector<const char>();

581 Token::Value token;

582 do {

583 // Remember the position of the next token

584 next_.location.beg_pos = source_pos();

585

586 switch (c0_) {

587 case ' ':

588 case '\t':

589 Advance();

590 token = Token::WHITESPACE;

591 break;

592

593 case '\n':

594 Advance();

595 has_line_terminator_before_next_ = true;

596 token = Token::WHITESPACE;

597 break;

598

599 case '"': case '\'':

600 token = ScanString();

601 break;

602

603 case '<':

604 // < <= << <<= <!--

605 Advance();

606 if (c0_ == '=') {

607 token = Select(Token::LTE);

608 } else if (c0_ == '<') {

609 token = Select('=', Token::ASSIGN_SHL, Token::SHL);

610 } else if (c0_ == '!') {

611 token = ScanHtmlComment();

612 } else {

613 token = Token::LT;

614 }

615 break;

616

617 case '>':

618 // > >= >> >>= >>> >>>=

619 Advance();

620 if (c0_ == '=') {

621 token = Select(Token::GTE);

622 } else if (c0_ == '>') {

623 // >> >>= >>> >>>=

624 Advance();

625 if (c0_ == '=') {

626 token = Select(Token::ASSIGN_SAR);

627 } else if (c0_ == '>') {

628 token = Select('=', Token::ASSIGN_SHR, Token::SHR);

629 } else {

630 token = Token::SAR;

631 }

632 } else {

633 token = Token::GT;

634 }

635 break;

636

637 case '=':

638 // = == ===

639 Advance();

640 if (c0_ == '=') {

641 token = Select('=', Token::EQ_STRICT, Token::EQ);

642 } else {

643 token = Token::ASSIGN;

644 }

645 break;

646

647 case '!':

648 // ! != !==

649 Advance();

650 if (c0_ == '=') {

651 token = Select('=', Token::NE_STRICT, Token::NE);

652 } else {

653 token = Token::NOT;

654 }

655 break;

656

657 case '+':

658 // + ++ +=

659 Advance();

660 if (c0_ == '+') {

661 token = Select(Token::INC);

662 } else if (c0_ == '=') {

663 token = Select(Token::ASSIGN_ADD);

664 } else {

665 token = Token::ADD;

666 }

667 break;

668

669 case '-':

670 // - -- --> -=

671 Advance();

672 if (c0_ == '-') {

673 Advance();

674 if (c0_ == '>' && has_line_terminator_before_next_) {

675 // For compatibility with SpiderMonkey, we skip lines that

676 // start with an HTML comment end '-->'.

677 token = SkipSingleLineComment();

678 } else {

679 token = Token::DEC;

680 }

681 } else if (c0_ == '=') {

682 token = Select(Token::ASSIGN_SUB);

683 } else {

684 token = Token::SUB;

685 }

686 break;

687

688 case '*':

689 // * *=

690 token = Select('=', Token::ASSIGN_MUL, Token::MUL);

691 break;

692

693 case '%':

694 // % %=

695 token = Select('=', Token::ASSIGN_MOD, Token::MOD);

696 break;

697

698 case '/':

699 // / // /* /=

700 Advance();

701 if (c0_ == '/') {

702 token = SkipSingleLineComment();

703 } else if (c0_ == '*') {

704 token = SkipMultiLineComment();

705 } else if (c0_ == '=') {

706 token = Select(Token::ASSIGN_DIV);

707 } else {

708 token = Token::DIV;

709 }

710 break;

711

712 case '&':

713 // & && &=

714 Advance();

715 if (c0_ == '&') {

716 token = Select(Token::AND);

717 } else if (c0_ == '=') {

718 token = Select(Token::ASSIGN_BIT_AND);

719 } else {

720 token = Token::BIT_AND;

721 }

722 break;

723

724 case '|':

725 // | || |=

726 Advance();

727 if (c0_ == '|') {

728 token = Select(Token::OR);

729 } else if (c0_ == '=') {

730 token = Select(Token::ASSIGN_BIT_OR);

731 } else {

732 token = Token::BIT_OR;

733 }

734 break;

735

736 case '^':

737 // ^ ^=

738 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);

739 break;

740

741 case '.':

742 // . Number

743 Advance();

744 if (IsDecimalDigit(c0_)) {

745 token = ScanNumber(true);

746 } else {

747 token = Token::PERIOD;

748 }

749 break;

750

751 case ':':

752 token = Select(Token::COLON);

753 break;

754

755 case ';':

756 token = Select(Token::SEMICOLON);

757 break;

758

759 case ',':

760 token = Select(Token::COMMA);

761 break;

762

763 case '(':

764 token = Select(Token::LPAREN);

765 break;

766

767 case ')':

768 token = Select(Token::RPAREN);

769 break;

770

771 case '[':

772 token = Select(Token::LBRACK);

773 break;

774

775 case ']':

776 token = Select(Token::RBRACK);

777 break;

778

779 case '{':

780 token = Select(Token::LBRACE);

781 break;

782

783 case '}':

784 token = Select(Token::RBRACE);

785 break;

786

787 case '?':

788 token = Select(Token::CONDITIONAL);

789 break;

790

791 case '~':

792 token = Select(Token::BIT_NOT);

793 break;

794

795 default:

796 if (ScannerConstants::kIsIdentifierStart.get(c0_)) {

797 token = ScanIdentifier();

798 } else if (IsDecimalDigit(c0_)) {

799 token = ScanNumber(false);

800 } else if (SkipWhiteSpace()) {

801 token = Token::WHITESPACE;

802 } else if (c0_ < 0) {

803 token = Token::EOS;

804 } else {

805 token = Select(Token::ILLEGAL);

806 }

807 break;

808 }

809

810 // Continue scanning for tokens as long as we're just skipping

811 // whitespace.

812 } while (token == Token::WHITESPACE);

813

814 next_.location.end_pos = source_pos();

815 next_.token = token;

816 }

817

818

819 void Scanner::SeekForward(int pos) {

820 source_->SeekForward(pos - 1);

821 Advance();

822 // This function is only called to seek to the location

823 // of the end of a function (at the "}" token). It doesn't matter

824 // whether there was a line terminator in the part we skip.

825 has_line_terminator_before_next_ = false;

826 Scan();

827 }

828

829

830 uc32 Scanner::ScanHexEscape(uc32 c, int length) {

831 ASSERT(length <= 4); // prevent overflow

832

833 uc32 digits[4];

834 uc32 x = 0;

835 for (int i = 0; i < length; i++) {

836 digits[i] = c0_;

837 int d = HexValue(c0_);

838 if (d < 0) {

839 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

840 // should be illegal, but other JS VMs just return the

841 // non-escaped version of the original character.

842

843 // Push back digits read, except the last one (in c0_).

844 for (int j = i-1; j >= 0; j--) {

845 PushBack(digits[j]);

846 }

847 // Notice: No handling of error - treat it as "\u"->"u".

848 return c;

849 }

850 x = x * 16 + d;

851 Advance();

852 }

853

854 return x;

855 }

856

857

858 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

859 // ECMA-262. Other JS VMs support them.

860 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {

861 uc32 x = c - '0';

862 for (int i = 0; i < length; i++) {

863 int d = c0_ - '0';

864 if (d < 0 || d > 7) break;

865 int nx = x * 8 + d;

866 if (nx >= 256) break;

867 x = nx;

868 Advance();

869 }

870 return x;

871 }

872

873

874 void Scanner::ScanEscape() {

875 uc32 c = c0_;

876 Advance();

877

878 // Skip escaped newlines.

879 if (ScannerConstants::kIsLineTerminator.get(c)) {

880 // Allow CR+LF newlines in multiline string literals.

881 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();

882 // Allow LF+CR newlines in multiline string literals.

883 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();

884 return;

885 }

886

887 switch (c) {

888 case '\'': // fall through

889 case '"' : // fall through

890 case '\\': break;

891 case 'b' : c = '\b'; break;

892 case 'f' : c = '\f'; break;

893 case 'n' : c = '\n'; break;

894 case 'r' : c = '\r'; break;

895 case 't' : c = '\t'; break;

896 case 'u' : c = ScanHexEscape(c, 4); break;

897 case 'v' : c = '\v'; break;

898 case 'x' : c = ScanHexEscape(c, 2); break;

899 case '0' : // fall through

900 case '1' : // fall through

901 case '2' : // fall through

902 case '3' : // fall through

903 case '4' : // fall through

904 case '5' : // fall through

905 case '6' : // fall through

906 case '7' : c = ScanOctalEscape(c, 2); break;

907 }

908

909 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

910 // should be illegal, but they are commonly handled

911 // as non-escaped characters by JS VMs.

912 AddLiteralChar(c);

913 }

914

915

916 Token::Value Scanner::ScanString() {

917 uc32 quote = c0_;

918 Advance(); // consume quote

919

920 LiteralScope literal(this);

921 while (c0_ != quote && c0_ >= 0

922 && !ScannerConstants::kIsLineTerminator.get(c0_)) {

923 uc32 c = c0_;

924 Advance();

925 if (c == '\\') {

926 if (c0_ < 0) return Token::ILLEGAL;

927 ScanEscape();

928 } else {

929 AddLiteralChar(c);

930 }

931 }

932 if (c0_ != quote) return Token::ILLEGAL;

933 literal.Complete();

934

935 Advance(); // consume quote

936 return Token::STRING;

937 }

938

939

940 Token::Value Scanner::Select(Token::Value tok) {

941 Advance();

942 return tok;

943 }

944

945

946 Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {

947 Advance();

948 if (c0_ == next) {

949 Advance();

950 return then;

951 } else {

952 return else_;

953 }

954 }

955

956

957 // Scans consecutive decimal digits, adding each one to the literal buffer.

958 void Scanner::ScanDecimalDigits() {

959 while (IsDecimalDigit(c0_))

960 AddLiteralCharAdvance();

961 }

962

963

964 Token::Value Scanner::ScanNumber(bool seen_period) {

965 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction

966

967 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

968

969 LiteralScope literal(this);

970 if (seen_period) {

971 // we have already seen a decimal point of the float

972 AddLiteralChar('.');

973 ScanDecimalDigits(); // we know we have at least one digit

974

975 } else {

976 // if the first character is '0' we must check for octals and hex

977 if (c0_ == '0') {

978 AddLiteralCharAdvance();

979

980 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

981 if (c0_ == 'x' || c0_ == 'X') {

982 // hex number

983 kind = HEX;

984 AddLiteralCharAdvance();

985 if (!IsHexDigit(c0_)) {

986 // we must have at least one hex digit after 'x'/'X'

987 return Token::ILLEGAL;

988 }

989 while (IsHexDigit(c0_)) {

990 AddLiteralCharAdvance();

991 }

992 } else if ('0' <= c0_ && c0_ <= '7') {

993 // (possible) octal number

994 kind = OCTAL;

995 while (true) {

996 if (c0_ == '8' || c0_ == '9') {

997 kind = DECIMAL;

998 break;

999 }

1000 if (c0_ < '0' || '7' < c0_) break;

1001 AddLiteralCharAdvance();

1002 }

1003 }

1004 }

1005

1006 // Parse decimal digits and allow trailing fractional part.

1007 if (kind == DECIMAL) {

1008 ScanDecimalDigits(); // optional

1009 if (c0_ == '.') {

1010 AddLiteralCharAdvance();

1011 ScanDecimalDigits(); // optional

1012 }

1013 }

1014 }

1015

1016 // scan exponent, if any

1017 if (c0_ == 'e' || c0_ == 'E') {

1018 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

1019 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed

1020 // scan exponent

1021 AddLiteralCharAdvance();

1022 if (c0_ == '+' \|\| c0_ == '-')

1023 AddLiteralCharAdvance();

1024 if (!IsDecimalDigit(c0_)) {

1025 // we must have at least one decimal digit after 'e'/'E'

1026 return Token::ILLEGAL;

1027 }

1028 ScanDecimalDigits();

1029 }

1030

1031 // The source character immediately following a numeric literal must

1032 // not be an identifier start or a decimal digit; see ECMA-262

1033 // section 7.8.3, page 17 (note that we read only one decimal digit

1034 // if the value is 0).

1035 if (IsDecimalDigit(c0_) \|\| ScannerConstants::kIsIdentifierStart.get(c0_))

1036 return Token::ILLEGAL;

1037

1038 literal.Complete();

1039

1040 return Token::NUMBER;

1041 }

1042

1043

// Scans a \uXXXX escape inside an identifier and returns the decoded
// character, or unibrow::Utf8::kBadChar on failure.
//
// The callers in this file invoke it while c0_ is the backslash. The first
// Advance() steps past that character, the next character must be 'u', and
// the value is then read via ScanHexEscape('u', 4).
// kBadChar is returned when the character after the backslash is not 'u', or
// when the decoded character is itself a backslash.
// NOTE(review): whatever ScanHexEscape returns for malformed digits is passed
// through unchanged -- confirm its failure value against its definition.
1044 uc32 Scanner::ScanIdentifierUnicodeEscape() {

1045 Advance();

1046 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

1047 Advance();

1048 uc32 c = ScanHexEscape('u', 4);

1049 // We do not allow a unicode escape sequence to start another

1050 // unicode escape sequence.

1051 if (c == '\\') return unibrow::Utf8::kBadChar;

1052 return c;

1053 }

1054

1055

// Scans an identifier or keyword starting at c0_.
//
// Every accepted character is added to the current literal. Characters read
// directly from the source are also fed to a KeywordMatcher, whereas a
// character produced by a \uXXXX escape calls keyword_match.Fail() instead.
// Returns keyword_match.token() once the identifier ends, or Token::ILLEGAL
// if an escape decodes to a character that is not legal at that position
// (identifier start for the first character, identifier part afterwards).
// On the ILLEGAL paths literal.Complete() is not reached.
// NOTE(review): presumably Fail() makes the matcher report a plain
// identifier, so escaped spellings are never treated as keywords -- confirm
// against KeywordMatcher.
1056 Token::Value Scanner::ScanIdentifier() {

1057 ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_));

1058

1059 LiteralScope literal(this);

1060 KeywordMatcher keyword_match;

1061

1062 // Scan identifier start character.

1063 if (c0_ == '\\') {

1064 uc32 c = ScanIdentifierUnicodeEscape();

1065 // Only allow legal identifier start characters.

1066 if (!ScannerConstants::kIsIdentifierStart.get(c)) return Token::ILLEGAL;

1067 AddLiteralChar(c);

1068 keyword_match.Fail();

1069 } else {

1070 AddLiteralChar(c0_);

1071 keyword_match.AddChar(c0_);

1072 Advance();

1073 }

1074

1075 // Scan the rest of the identifier characters.

1076 while (ScannerConstants::kIsIdentifierPart.get(c0_)) {

1077 if (c0_ == '\\') {

1078 uc32 c = ScanIdentifierUnicodeEscape();

1079 // Only allow legal identifier part characters.

1080 if (!ScannerConstants::kIsIdentifierPart.get(c)) return Token::ILLEGAL;

1081 AddLiteralChar(c);

1082 keyword_match.Fail();

1083 } else {

1084 AddLiteralChar(c0_);

1085 keyword_match.AddChar(c0_);

1086 Advance();

1087 }

1088 }

1089 literal.Complete();

1090

1091 return keyword_match.token();

1092 }

1093

1094

1095	439

// Scans the body of a regular expression literal, up to and including the
// terminating '/', collecting the raw pattern text as the current literal.
//
// seen_equal: true when the token preceding the call was '/=' rather than
//   '/'; the '=' then belongs to the pattern and is added to the literal
//   first.
// Returns true once the closing '/' has been consumed. Returns false if a
// line terminator or a negative c0_ is met before the pattern is closed,
// including directly after a backslash.
// NOTE(review): a negative c0_ presumably marks end of input -- confirm.
// Side effect: next_.location is reset to start at the opening '/'.
1096 bool Scanner::ScanRegExpPattern(bool seen_equal) {

1097 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

// While true, an unescaped '/' does not terminate the pattern (we are
// between an unescaped '[' and the next unescaped ']').
1098 bool in_character_class = false;

1099

1100 // Previous token is either '/' or '/=', in the second case, the

1101 // pattern starts at =.

1102 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

1103 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

1104

1105 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

1106 // the scanner should pass uninterpreted bodies to the RegExp

1107 // constructor.

1108 LiteralScope literal(this);

1109 if (seen_equal)

1110 AddLiteralChar('=');

1111

1112 while (c0_ != '/' \|\| in_character_class) {

1113 if (ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) return false;

1114 if (c0_ == '\\') { // escaped character

// Keep both the backslash and the escaped character verbatim; the escaped
// character is not inspected, so an escaped '[', ']' or '/' has no effect on
// in_character_class or on loop termination.
1115 AddLiteralCharAdvance();

1116 if (ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) return false;

1117 AddLiteralCharAdvance();

1118 } else { // unescaped character

1119 if (c0_ == '[') in_character_class = true;

1120 if (c0_ == ']') in_character_class = false;

1121 AddLiteralCharAdvance();

1122 }

1123 }

1124 Advance(); // consume '/'

1125

1126 literal.Complete();

1127

1128 return true;

1129 }

1130

// Scans the flags that follow a regular expression body: every character for
// which kIsIdentifierPart holds is collected into a fresh literal.
//
// A backslash is first tried as a \uXXXX escape. If decoding succeeds (the
// result is not kBadChar) the decoded character is added, with no check that
// it is itself an identifier part. If decoding fails, control falls through
// to AddLiteralCharAdvance(), which operates on whatever c0_ is after the
// failed escape attempt.
// Always returns true; no validation of the flag characters happens here.
// Side effect: next_.location.end_pos is set to source_pos() - 1.
1131 bool Scanner::ScanRegExpFlags() {

1132 // Scan regular expression flags.

1133 LiteralScope literal(this);

1134 while (ScannerConstants::kIsIdentifierPart.get(c0_)) {

1135 if (c0_ == '\\') {

1136 uc32 c = ScanIdentifierUnicodeEscape();

1137 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

1138 // We allow any escaped character, unlike the restriction on

1139 // IdentifierPart when it is used to build an IdentifierName.

1140 AddLiteralChar(c);

1141 continue;

1142 }

1143 }

1144 AddLiteralCharAdvance();

1145 }

1146 literal.Complete();

1147

1148 next_.location.end_pos = source_pos() - 1;

1149 return true;

1150 }

1151

1152 } } // namespace v8::internal	440 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner.h ('k') | src/scanner-base.h » ('j') | no next file with comments »