src/prescanner.h - Issue 5063003: Add separate scanner only intended for preparsing.

Side by Side Diff: src/prescanner.h

Issue 5063003: Add separate scanner only intended for preparsing. (Closed)

Patch Set: Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2010 the V8 project authors. All rights reserved.

	2 // Redistribution and use in source and binary forms, with or without

	3 // modification, are permitted provided that the following conditions are

	4 // met:

	5 //

	6 // * Redistributions of source code must retain the above copyright

	7 // notice, this list of conditions and the following disclaimer.

	8 // * Redistributions in binary form must reproduce the above

	9 // copyright notice, this list of conditions and the following

	10 // disclaimer in the documentation and/or other materials provided

	11 // with the distribution.

	12 // * Neither the name of Google Inc. nor the names of its

	13 // contributors may be used to endorse or promote products derived

	14 // from this software without specific prior written permission.

	15 //

	16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	27

	28 #ifndef V8_PRESCANNER_H_

	29 #define V8_PRESCANNER_H_

	30

	31 #include "token.h"

	32 #include "char-predicates-inl.h"

	33 #include "utils.h"

	34 #include "scanner-base.h"

	35

	36 namespace v8 {

	37 namespace preparser {

	38

	39 namespace i = v8::internal;

	40

	typedef int uc32;

	// Converts one hexadecimal digit character to its numeric value.
	//
	// Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', and -1 for every other
	// input, including negative values. Callers test the result with "< 0" to
	// detect a character that is not a hex digit, so non-digits must never map
	// to a non-negative value.
	//
	// Declared inline because this definition lives in a header.
	inline int HexValue(uc32 c) {
	  const int digit = c - '0';
	  if (0 <= digit && digit <= 9) return digit;
	  // OR-ing in 0x20 folds 'A'-'F' onto 'a'-'f'.
	  const int letter = (c | 0x20) - 'a';
	  if (0 <= letter && letter <= 5) return letter + 10;
	  return -1;
	}

	54

	55

	// Reports when the stack position has moved more than a fixed number of
	// bytes below the position sampled at construction time.
	//
	// The position is sampled as the address of a temporary StackPoint object.
	// A sample that compares lower than the recorded limit counts as an
	// overflow, i.e. lower addresses are treated as deeper stack.
	class PreScannerStackGuard {
	 public:
	  // max_size: how many bytes below the construction-time sample the stack
	  // may reach before has_overflowed() starts returning true.
	  explicit PreScannerStackGuard(int max_size)
	      : limit_(StackPoint().at() - max_size) { }

	  // Returns true if a fresh sample lies below the limit. The guard itself
	  // is not modified, so the check is usable through const references.
	  bool has_overflowed() const {
	    return StackPoint().at() < limit_;
	  }

	 private:
	  // Helper whose only purpose is to expose its own address.
	  class StackPoint {
	   public:
	    char* at() { return reinterpret_cast<char*>(this); }
	  };

	  // Lowest address that is still considered within bounds.
	  char* limit_;
	};

	70

	71

	72 template <typename UTF16Buffer, typename UTF8Buffer>
	Søren Thygesen Gjesse 2010/11/17 10:37:46 Please explain these template parameters a bit. It is rather nasty to have these all over the place just to declare two member variables of these types. Also they are both named XXXBuffer, but one is for input and the other is for some output and their "interfaces" are quite different.
	Lasse Reichstein 2010/11/17 13:08:39 Agree. The long term plan is to have a stand-alone preparser that doesn't depend on the V8 runtime system. Having these types as templates is a step on the way, allowing this file to not depend on V8, but still be usable inside V8. I'll try to move the UTF8Buffer to a shared file in a later iteration, since there is only one implementation, and it only depends on unibrow and utils. I haven't yet decided whether to keep the UTF16Buffer as a template type or to have a purely virtual superclass for it in a V8-independent file. The names of the classes suck, but there is no reason why the template parameters have to have the same names. I'll rename them to be more distinct.
	73 class Scanner {

	74 public:

	75 enum LiteralType {

	76 kLiteralNumber,

	77 kLiteralIdentifier,

	78 kLiteralString,

	79 kLiteralRegExp,

	80 kLiteralRegExpFlags

	81 };

	82

	83 class LiteralScope {

	84 public:

	85 explicit LiteralScope(Scanner* self, LiteralType type);

	86 ~LiteralScope();

	87 void Complete();

	88

	89 private:

	90 Scanner* scanner_;

	91 bool complete_;

	92 };

	93

	94 Scanner();

	95

	96 void Initialize(UTF16Buffer* stream);

	97

	98 // Returns the next token.

	99 i::Token::Value Next();

	100

	101 // Returns the current token again.

	102 i::Token::Value current_token() { return current_.token; }

	103

	104 // One token look-ahead (past the token returned by Next()).

	105 i::Token::Value peek() const { return next_.token; }

	106

	107 // Returns true if there was a line terminator before the peek'ed token.

	108 bool has_line_terminator_before_next() const {

	109 return has_line_terminator_before_next_;

	110 }

	111

	112 struct Location {

	113 Location(int b, int e) : beg_pos(b), end_pos(e) { }

	114 Location() : beg_pos(0), end_pos(0) { }

	115 int beg_pos;

	116 int end_pos;

	117 };

	118

	119 // Returns the location information for the current token

	120 // (the token returned by Next()).

	121 Location location() const { return current_.location; }
	Søren Thygesen Gjesse 2010/11/17 10:37:46 Maybe add a separate comment for peek_location.
	Lasse Reichstein 2010/11/17 13:08:39 Done.
	122 Location peek_location() const { return next_.location; }

	123

	124 // Returns the literal string, if any, for the current token (the

	125 // token returned by Next()). The string is 0-terminated and in

	126 // UTF-8 format; they may contain 0-characters. Literal strings are

	127 // collected for identifiers, strings, and numbers.

	128 // These functions only give the correct result if the literal

	129 // was scanned between calls to StartLiteral() and TerminateLiteral().

	130 const char* literal_string() const {

	131 return current_.literal_chars;

	132 }

	133

	134 int literal_length() const {

	135 // Excluding terminal '\x00' added by TerminateLiteral().

	136 return current_.literal_length - 1;

	137 }

	138

	139 i::Vector<const char> literal() const {

	140 return i::Vector<const char>(literal_string(), literal_length());

	141 }

	142

	143 // Returns the literal string for the next token (the token that

	144 // would be returned if Next() were called).

	145 const char* next_literal_string() const {

	146 return next_.literal_chars;

	147 }

	148

	149

	150 // Returns the length of the next token (that would be returned if

	151 // Next() were called).

	152 int next_literal_length() const {

	153 // Excluding terminal '\x00' added by TerminateLiteral().

	154 return next_.literal_length - 1;

	155 }

	156

	157 i::Vector<const char> next_literal() const {

	158 return i::Vector<const char>(next_literal_string(), next_literal_length());

	159 }

	160

	161 // Scans the input as a regular expression pattern, previous

	162 // character(s) must be /(=). Returns true if a pattern is scanned.

	163 bool ScanRegExpPattern(bool seen_equal);

	164 // Returns true if regexp flags are scanned (always since flags can

	165 // be empty).

	166 bool ScanRegExpFlags();

	167

	168 // Seek forward to the given position. This operation does not

	169 // work in general, for instance when there are pushed back

	170 // characters, but works for seeking forward until simple delimiter

	171 // tokens, which is what it is used for.

	172 void SeekForward(int pos);

	173

	174 bool stack_overflow() { return stack_overflow_; }

	175

	176 static const int kCharacterLookaheadBufferSize = 1;

	177 static const int kNoEndPosition = 1;

	178

	179 private:

	180 // The current and look-ahead token.

	181 struct TokenDesc {

	182 i::Token::Value token;

	183 Location location;

	184 const char* literal_chars;

	185 int literal_length;

	186 };

	187

	188 // Default stack limit is 128K pointers.

	189 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.

	190

	191 void Init(unibrow::CharacterStream* stream);

	192

	193 // Literal buffer support

	194 inline void StartLiteral(LiteralType type);

	195 inline void AddChar(uc32 ch);

	196 inline void AddCharAdvance();

	197 inline void TerminateLiteral();

	198 // Stops scanning of a literal, e.g., due to an encountered error.

	199 inline void DropLiteral();

	200

	201 // Low-level scanning support.

	202 void Advance() { c0_ = source_->Advance(); }

	203 void PushBack(uc32 ch) {

	204 source_->PushBack(ch);

	205 c0_ = ch;

	206 }

	207

	208 bool SkipWhiteSpace();

	209

	210 i::Token::Value SkipSingleLineComment();

	211 i::Token::Value SkipMultiLineComment();

	212

	213 inline i::Token::Value Select(i::Token::Value tok);

	214 inline i::Token::Value Select(uc32 next,

	215 i::Token::Value then,

	216 i::Token::Value else_);

	217

	218 // Scans a single JavaScript token.

	219 void Scan();

	220

	221 void ScanDecimalDigits();

	222 i::Token::Value ScanNumber(bool seen_period);

	223 i::Token::Value ScanIdentifier();

	224 uc32 ScanHexEscape(uc32 c, int length);

	225 uc32 ScanOctalEscape(uc32 c, int length);

	226 void ScanEscape();

	227 i::Token::Value ScanString();

	228

	229 // Scans a possible HTML comment -- begins with '<!'.

	230 i::Token::Value ScanHtmlComment();

	231

	232 // Return the current source position.

	233 int source_pos() {

	234 return source_->pos() - kCharacterLookaheadBufferSize;

	235 }

	236

	237 // Decodes a unicode escape-sequence which is part of an identifier.

	238 // If the escape sequence cannot be decoded the result is unibrow::Utf8::kBadChar.

	239 uc32 ScanIdentifierUnicodeEscape();

	240

	241 PreScannerStackGuard stack_guard_;

	242

	243 TokenDesc current_; // desc for current token (as returned by Next())

	244 TokenDesc next_; // desc for next token (one token look-ahead)

	245 bool has_line_terminator_before_next_;

	246

	247 // Source.

	248 UTF16Buffer* source_;

	249

	250 // Buffer to hold literal values (identifiers, strings, numerals, regexps and

	251 // regexp flags) using '\x00'-terminated UTF-8 encoding.

	252 // Handles allocation internally.

	253 // Notice that the '\x00' termination is meaningless for strings and regexps

	254 // which may contain the zero-character, but can be used as terminator for

	255 // identifiers, numerals and regexp flags.

	256 UTF8Buffer literal_buffer_;

	257

	258 bool stack_overflow_;

	259

	260 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

	261 uc32 c0_;

	262 };

	263

	264

	265 // ----------------------------------------------------------------------------

	266 // Scanner::LiteralScope

	267

	268 template <typename UTF16Buffer, typename UTF8Buffer>

	269 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::LiteralScope(

	270 Scanner* self, LiteralType type)

	271 : scanner_(self), complete_(false) {

	272 self->StartLiteral(type);

	273 }

	274

	275

	276 template <typename UTF16Buffer, typename UTF8Buffer>

	277 Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::~LiteralScope() {

	278 if (!complete_) scanner_->DropLiteral();

	279 }

	280

	281 template <typename UTF16Buffer, typename UTF8Buffer>

	282 void Scanner<UTF16Buffer, UTF8Buffer>::LiteralScope::Complete() {

	283 scanner_->TerminateLiteral();

	284 complete_ = true;

	285 }

	286

	287

	288 // ----------------------------------------------------------------------------

	289 // Scanner

	290 template <typename UTF16Buffer, typename UTF8Buffer>

	291 Scanner<UTF16Buffer, UTF8Buffer>::Scanner()

	292 : stack_guard_(kMaxStackSize),

	293 has_line_terminator_before_next_(false),

	294 source_(NULL),

	295 stack_overflow_(false) {}

	296

	297

	298 template <typename UTF16Buffer, typename UTF8Buffer>

	299 void Scanner<UTF16Buffer, UTF8Buffer>::Initialize(UTF16Buffer* stream) {

	300 source_ = stream;

	301

	302 // Initialize current_ to not refer to a literal.

	303 current_.literal_length = 0;

	304 // Reset literal buffer.

	305 literal_buffer_.Reset();

	306

	307 // Set c0_ (one character ahead)

	308 ASSERT(kCharacterLookaheadBufferSize == 1);

	309 Advance();

	310

	311 // Skip initial whitespace allowing HTML comment ends just like

	312 // after a newline and scan first token.

	313 has_line_terminator_before_next_ = true;

	314 SkipWhiteSpace();

	315 Scan();

	316 }

	317

	318

	319 template <typename UTF16Buffer, typename UTF8Buffer>

	320 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Next() {

	321 // BUG 1215673: Find a thread safe way to set a stack limit in

	322 // pre-parse mode. Otherwise, we cannot safely pre-parse from other

	323 // threads.

	324 current_ = next_;

	325 // Check for stack-overflow before returning any tokens.

	326 if (stack_guard_.has_overflowed()) {

	327 stack_overflow_ = true;

	328 next_.token = i::Token::ILLEGAL;

	329 } else {

	330 has_line_terminator_before_next_ = false;

	331 Scan();

	332 }

	333 return current_.token;

	334 }

	335

	336

	337 template <typename UTF16Buffer, typename UTF8Buffer>

	338 void Scanner<UTF16Buffer, UTF8Buffer>::StartLiteral(LiteralType type) {

	339 // Only record string and literal identifiers when preparsing.

	340 // Those are the ones that are recorded as symbols. Numbers and

	341 // regexps are not recorded.

	342 if (type == kLiteralString \|\| type == kLiteralIdentifier) {

	343 literal_buffer_.StartLiteral();

	344 }

	345 }

	346

	347

	348 template <typename UTF16Buffer, typename UTF8Buffer>

	349 void Scanner<UTF16Buffer, UTF8Buffer>::AddChar(uc32 c) {
	Søren Thygesen Gjesse 2010/11/17 10:37:46 AddChar -> AddLiteralChar?
	Lasse Reichstein 2010/11/17 13:08:39 Done. Also in scanner.h/.cc.
	350 literal_buffer_.AddChar(c);

	351 }

	352

	353

	354 template <typename UTF16Buffer, typename UTF8Buffer>

	355 void Scanner<UTF16Buffer, UTF8Buffer>::TerminateLiteral() {

	356 i::Vector<const char> chars = literal_buffer_.EndLiteral();

	357 next_.literal_chars = chars.start();

	358 next_.literal_length = chars.length();

	359 }

	360

	361

	362 template <typename UTF16Buffer, typename UTF8Buffer>

	363 void Scanner<UTF16Buffer, UTF8Buffer>::DropLiteral() {

	364 literal_buffer_.DropLiteral();

	365 }

	366

	367

	368 template <typename UTF16Buffer, typename UTF8Buffer>

	369 void Scanner<UTF16Buffer, UTF8Buffer>::AddCharAdvance() {

	370 AddChar(c0_);

	371 Advance();

	372 }

	373

	374

	375 static inline bool IsByteOrderMark(uc32 c) {

	376 // The Unicode value U+FFFE is guaranteed never to be assigned as a

	377 // Unicode character; this implies that in a Unicode context the

	378 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

	379 // character expressed in little-endian byte order (since it could

	380 // not be a U+FFFE character expressed in big-endian byte

	381 // order). Nevertheless, we check for it to be compatible with

	382 // Spidermonkey.

	383 return c == 0xFEFF \|\| c == 0xFFFE;

	384 }

	385

	386

	387 template <typename UTF16Buffer, typename UTF8Buffer>

	388 bool Scanner<UTF16Buffer, UTF8Buffer>::SkipWhiteSpace() {

	389 int start_position = source_pos();

	390

	391 while (true) {

	392 // We treat byte-order marks (BOMs) as whitespace for better

	393 // compatibility with Spidermonkey and other JavaScript engines.

	394 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)

	395 \|\| IsByteOrderMark(c0_)) {

	396 // IsWhiteSpace() includes line terminators!

	397 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	398 // Ignore line terminators, but remember them. This is necessary

	399 // for automatic semicolon insertion.

	400 has_line_terminator_before_next_ = true;

	401 }

	402 Advance();

	403 }

	404

	405 // If there is an HTML comment end '-->' at the beginning of a

	406 // line (with only whitespace in front of it), we treat the rest

	407 // of the line as a comment. This is in line with the way

	408 // SpiderMonkey handles it.

	409 if (c0_ == '-' && has_line_terminator_before_next_) {

	410 Advance();

	411 if (c0_ == '-') {

	412 Advance();

	413 if (c0_ == '>') {

	414 // Treat the rest of the line as a comment.

	415 SkipSingleLineComment();

	416 // Continue skipping white space after the comment.

	417 continue;

	418 }

	419 PushBack('-'); // undo Advance()

	420 }

	421 PushBack('-'); // undo Advance()

	422 }

	423 // Return whether or not we skipped any characters.

	424 return source_pos() != start_position;

	425 }

	426 }

	427

	428

	429 template <typename UTF16Buffer, typename UTF8Buffer>

	430 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipSingleLineComment() {

	431 Advance();

	432

	433 // The line terminator at the end of the line is not considered

	434 // to be part of the single-line comment; it is recognized

	435 // separately by the lexical grammar and becomes part of the

	436 // stream of input elements for the syntactic grammar (see

	437 // ECMA-262, section 7.4, page 12).

	438 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	439 Advance();

	440 }

	441

	442 return i::Token::WHITESPACE;

	443 }

	444

	445

	446 template <typename UTF16Buffer, typename UTF8Buffer>

	447 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::SkipMultiLineComment() {

	448 ASSERT(c0_ == '*');

	449 Advance();

	450

	451 while (c0_ >= 0) {

	452 char ch = c0_;

	453 Advance();

	454 // If we have reached the end of the multi-line comment, we

	455 // consume the '/' and insert a whitespace. This way all

	456 // multi-line comments are treated as whitespace - even the ones

	457 // containing line terminators. This contradicts ECMA-262, section

	458 // 7.4, page 12, that says that multi-line comments containing

	459 // line terminators should be treated as a line terminator, but it

	460 // matches the behaviour of SpiderMonkey and KJS.

	461 if (ch == '*' && c0_ == '/') {

	462 c0_ = ' ';

	463 return i::Token::WHITESPACE;

	464 }

	465 }

	466

	467 // Unterminated multi-line comment.

	468 return i::Token::ILLEGAL;

	469 }

	470

	471

	472 template <typename UTF16Buffer, typename UTF8Buffer>

	473 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanHtmlComment() {

	474 // Check for <!-- comments.

	475 ASSERT(c0_ == '!');

	476 Advance();

	477 if (c0_ == '-') {

	478 Advance();

	479 if (c0_ == '-') return SkipSingleLineComment();

	480 PushBack('-'); // undo Advance()

	481 }

	482 PushBack('!'); // undo Advance()

	483 ASSERT(c0_ == '!');

	484 return i::Token::LT;

	485 }

	486

	487

	488 template <typename UTF16Buffer, typename UTF8Buffer>

	489 void Scanner<UTF16Buffer, UTF8Buffer>::Scan() {

	490 next_.literal_length = 0;

	491 i::Token::Value token;

	492 do {

	493 // Remember the position of the next token

	494 next_.location.beg_pos = source_pos();

	495

	496 switch (c0_) {

	497 case ' ':

	498 case '\t':

	499 Advance();

	500 token = i::Token::WHITESPACE;

	501 break;

	502

	503 case '\n':

	504 Advance();

	505 has_line_terminator_before_next_ = true;

	506 token = i::Token::WHITESPACE;

	507 break;

	508

	509 case '"': case '\'':

	510 token = ScanString();

	511 break;

	512

	513 case '<':

	514 // < <= << <<= <!--

	515 Advance();

	516 if (c0_ == '=') {

	517 token = Select(i::Token::LTE);

	518 } else if (c0_ == '<') {

	519 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);

	520 } else if (c0_ == '!') {

	521 token = ScanHtmlComment();

	522 } else {

	523 token = i::Token::LT;

	524 }

	525 break;

	526

	527 case '>':

	528 // > >= >> >>= >>> >>>=

	529 Advance();

	530 if (c0_ == '=') {

	531 token = Select(i::Token::GTE);

	532 } else if (c0_ == '>') {

	533 // >> >>= >>> >>>=

	534 Advance();

	535 if (c0_ == '=') {

	536 token = Select(i::Token::ASSIGN_SAR);

	537 } else if (c0_ == '>') {

	538 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);

	539 } else {

	540 token = i::Token::SAR;

	541 }

	542 } else {

	543 token = i::Token::GT;

	544 }

	545 break;

	546

	547 case '=':

	548 // = == ===

	549 Advance();

	550 if (c0_ == '=') {

	551 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);

	552 } else {

	553 token = i::Token::ASSIGN;

	554 }

	555 break;

	556

	557 case '!':

	558 // ! != !==

	559 Advance();

	560 if (c0_ == '=') {

	561 token = Select('=', i::Token::NE_STRICT, i::Token::NE);

	562 } else {

	563 token = i::Token::NOT;

	564 }

	565 break;

	566

	567 case '+':

	568 // + ++ +=

	569 Advance();

	570 if (c0_ == '+') {

	571 token = Select(i::Token::INC);

	572 } else if (c0_ == '=') {

	573 token = Select(i::Token::ASSIGN_ADD);

	574 } else {

	575 token = i::Token::ADD;

	576 }

	577 break;

	578

	579 case '-':

	580 // - -- --> -=

	581 Advance();

	582 if (c0_ == '-') {

	583 Advance();

	584 if (c0_ == '>' && has_line_terminator_before_next_) {

	585 // For compatibility with SpiderMonkey, we skip lines that

	586 // start with an HTML comment end '-->'.

	587 token = SkipSingleLineComment();

	588 } else {

	589 token = i::Token::DEC;

	590 }

	591 } else if (c0_ == '=') {

	592 token = Select(i::Token::ASSIGN_SUB);

	593 } else {

	594 token = i::Token::SUB;

	595 }

	596 break;

	597

	598 case '*':

	599 // * *=

	600 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);

	601 break;

	602

	603 case '%':

	604 // % %=

	605 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);

	606 break;

	607

	608 case '/':

	609 // / // /* /=

	610 Advance();

	611 if (c0_ == '/') {

	612 token = SkipSingleLineComment();

	613 } else if (c0_ == '*') {

	614 token = SkipMultiLineComment();

	615 } else if (c0_ == '=') {

	616 token = Select(i::Token::ASSIGN_DIV);

	617 } else {

	618 token = i::Token::DIV;

	619 }

	620 break;

	621

	622 case '&':

	623 // & && &=

	624 Advance();

	625 if (c0_ == '&') {

	626 token = Select(i::Token::AND);

	627 } else if (c0_ == '=') {

	628 token = Select(i::Token::ASSIGN_BIT_AND);

	629 } else {

	630 token = i::Token::BIT_AND;

	631 }

	632 break;

	633

	634 case '\|':

	635 // \| \|\| \|=

	636 Advance();

	637 if (c0_ == '\|') {

	638 token = Select(i::Token::OR);

	639 } else if (c0_ == '=') {

	640 token = Select(i::Token::ASSIGN_BIT_OR);

	641 } else {

	642 token = i::Token::BIT_OR;

	643 }

	644 break;

	645

	646 case '^':

	647 // ^ ^=

	648 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);

	649 break;

	650

	651 case '.':

	652 // . Number

	653 Advance();

	654 if (i::IsDecimalDigit(c0_)) {

	655 token = ScanNumber(true);

	656 } else {

	657 token = i::Token::PERIOD;

	658 }

	659 break;

	660

	661 case ':':

	662 token = Select(i::Token::COLON);

	663 break;

	664

	665 case ';':

	666 token = Select(i::Token::SEMICOLON);

	667 break;

	668

	669 case ',':

	670 token = Select(i::Token::COMMA);

	671 break;

	672

	673 case '(':

	674 token = Select(i::Token::LPAREN);

	675 break;

	676

	677 case ')':

	678 token = Select(i::Token::RPAREN);

	679 break;

	680

	681 case '[':

	682 token = Select(i::Token::LBRACK);

	683 break;

	684

	685 case ']':

	686 token = Select(i::Token::RBRACK);

	687 break;

	688

	689 case '{':

	690 token = Select(i::Token::LBRACE);

	691 break;

	692

	693 case '}':

	694 token = Select(i::Token::RBRACE);

	695 break;

	696

	697 case '?':

	698 token = Select(i::Token::CONDITIONAL);

	699 break;

	700

	701 case '~':

	702 token = Select(i::Token::BIT_NOT);

	703 break;

	704

	705 default:

	706 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {

	707 token = ScanIdentifier();

	708 } else if (i::IsDecimalDigit(c0_)) {

	709 token = ScanNumber(false);

	710 } else if (SkipWhiteSpace()) {

	711 token = i::Token::WHITESPACE;

	712 } else if (c0_ < 0) {

	713 token = i::Token::EOS;

	714 } else {

	715 token = Select(i::Token::ILLEGAL);

	716 }

	717 break;

	718 }

	719

	720 // Continue scanning for tokens as long as we're just skipping

	721 // whitespace.

	722 } while (token == i::Token::WHITESPACE);

	723

	724 next_.location.end_pos = source_pos();

	725 next_.token = token;

	726 }

	727

	728

	729 template <typename UTF16Buffer, typename UTF8Buffer>

	730 void Scanner<UTF16Buffer, UTF8Buffer>::SeekForward(int pos) {

	731 source_->SeekForward(pos - 1);

	732 Advance();

	733 // This function is only called to seek to the location

	734 // of the end of a function (at the "}" token). It doesn't matter

	735 // whether there was a line terminator in the part we skip.

	736 has_line_terminator_before_next_ = false;

	737 Scan();

	738 }

	739

	740

	741 template <typename UTF16Buffer, typename UTF8Buffer>

	742 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanHexEscape(uc32 c, int length) {

	743 ASSERT(length <= 4); // prevent overflow

	744

	745 uc32 digits[4];

	746 uc32 x = 0;

	747 for (int i = 0; i < length; i++) {

	748 digits[i] = c0_;

	749 int d = HexValue(c0_);

	750 if (d < 0) {

	751 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

	752 // should be illegal, but other JS VMs just return the

	753 // non-escaped version of the original character.

	754

	755 // Push back digits read, except the last one (in c0_).

	756 for (int j = i-1; j >= 0; j--) {

	757 PushBack(digits[j]);

	758 }

	759 // Notice: No handling of error - treat it as "\u"->"u".

	760 return c;

	761 }

	762 x = x * 16 + d;

	763 Advance();

	764 }

	765

	766 return x;

	767 }

	768

	769

	770 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

	771 // ECMA-262. Other JS VMs support them.

	772 template <typename UTF16Buffer, typename UTF8Buffer>

	773 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanOctalEscape(

	774 uc32 c, int length) {

	775 uc32 x = c - '0';

	776 for (int i = 0; i < length; i++) {

	777 int d = c0_ - '0';

	778 if (d < 0 \|\| d > 7) break;

	779 int nx = x * 8 + d;

	780 if (nx >= 256) break;

	781 x = nx;

	782 Advance();

	783 }

	784 return x;

	785 }

	786

	787

	788 template <typename UTF16Buffer, typename UTF8Buffer>

	789 void Scanner<UTF16Buffer, UTF8Buffer>::ScanEscape() {

	790 uc32 c = c0_;

	791 Advance();

	792

	793 // Skip escaped newlines.

	794 if (i::ScannerConstants::kIsLineTerminator.get(c)) {

	795 // Allow CR+LF newlines in multiline string literals.

	796 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();

	797 // Allow LF+CR newlines in multiline string literals.

	798 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();

	799 return;

	800 }

	801

	802 switch (c) {

	803 case '\'': // fall through

	804 case '"' : // fall through

	805 case '\\': break;

	806 case 'b' : c = '\b'; break;

	807 case 'f' : c = '\f'; break;

	808 case 'n' : c = '\n'; break;

	809 case 'r' : c = '\r'; break;

	810 case 't' : c = '\t'; break;

	811 case 'u' : c = ScanHexEscape(c, 4); break;

	812 case 'v' : c = '\v'; break;

	813 case 'x' : c = ScanHexEscape(c, 2); break;

	814 case '0' : // fall through

	815 case '1' : // fall through

	816 case '2' : // fall through

	817 case '3' : // fall through

	818 case '4' : // fall through

	819 case '5' : // fall through

	820 case '6' : // fall through

	821 case '7' : c = ScanOctalEscape(c, 2); break;

	822 }

	823

	824 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

	825 // should be illegal, but they are commonly handled

	826 // as non-escaped characters by JS VMs.

	827 AddChar(c);

	828 }

	829

	830

	831 template <typename UTF16Buffer, typename UTF8Buffer>

	832 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanString() {

	833 uc32 quote = c0_;

	834 Advance(); // consume quote

	835

	836 LiteralScope literal(this, kLiteralString);

	837 while (c0_ != quote && c0_ >= 0

	838 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	839 uc32 c = c0_;

	840 Advance();

	841 if (c == '\\') {

	842 if (c0_ < 0) return i::Token::ILLEGAL;

	843 ScanEscape();

	844 } else {

	845 AddChar(c);

	846 }

	847 }

	848 if (c0_ != quote) return i::Token::ILLEGAL;

	849 literal.Complete();

	850

	851 Advance(); // consume quote

	852 return i::Token::STRING;

	853 }

	854

	855

	856 template <typename UTF16Buffer, typename UTF8Buffer>

	857 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select(i::Token::Value tok) {

	858 Advance();

	859 return tok;

	860 }

	861

	862

	863 template <typename UTF16Buffer, typename UTF8Buffer>

	864 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::Select(

	865 uc32 next,

	866 i::Token::Value then,

	867 i::Token::Value else_) {

	868 Advance();

	869 if (c0_ == next) {

	870 Advance();

	871 return then;

	872 } else {

	873 return else_;

	874 }

	875 }

	876

	877

	878 // Returns true if any decimal digits were scanned, returns false otherwise.

	879 template <typename UTF16Buffer, typename UTF8Buffer>

	880 void Scanner<UTF16Buffer, UTF8Buffer>::ScanDecimalDigits() {

	881 while (i::IsDecimalDigit(c0_))

	882 AddCharAdvance();

	883 }

	884

	885

	886 template <typename UTF16Buffer, typename UTF8Buffer>

	887 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanNumber(bool seen_period) {

	888 // c0_ is the first digit of the number or the fraction.

	889 ASSERT(i::IsDecimalDigit(c0_));

	890

	891 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

	892

	893 LiteralScope literal(this, kLiteralNumber);

	894 if (seen_period) {

	895 // we have already seen a decimal point of the float

	896 AddChar('.');

	897 ScanDecimalDigits(); // we know we have at least one digit

	898

	899 } else {

	900 // if the first character is '0' we must check for octals and hex

	901 if (c0_ == '0') {

	902 AddCharAdvance();

	903

	904 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

	905 if (c0_ == 'x' \|\| c0_ == 'X') {

	906 // hex number

	907 kind = HEX;

	908 AddCharAdvance();

	909 if (!i::IsHexDigit(c0_)) {

	910 // we must have at least one hex digit after 'x'/'X'

	911 return i::Token::ILLEGAL;

	912 }

	913 while (i::IsHexDigit(c0_)) {

	914 AddCharAdvance();

	915 }

	916 } else if ('0' <= c0_ && c0_ <= '7') {

	917 // (possible) octal number

	918 kind = OCTAL;

	919 while (true) {

	920 if (c0_ == '8' \|\| c0_ == '9') {

	921 kind = DECIMAL;

	922 break;

	923 }

	924 if (c0_ < '0' \|\| '7' < c0_) break;

	925 AddCharAdvance();

	926 }

	927 }

	928 }

	929

	930 // Parse decimal digits and allow trailing fractional part.

	931 if (kind == DECIMAL) {

	932 ScanDecimalDigits(); // optional

	933 if (c0_ == '.') {

	934 AddCharAdvance();

	935 ScanDecimalDigits(); // optional

	936 }

	937 }

	938 }

	939

	940 // scan exponent, if any

	941 if (c0_ == 'e' \|\| c0_ == 'E') {

	942 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

	943 if (kind == OCTAL) return i::Token::ILLEGAL;

	944 // scan exponent

	945 AddCharAdvance();

	946 if (c0_ == '+' \|\| c0_ == '-')

	947 AddCharAdvance();

	948 if (!i::IsDecimalDigit(c0_)) {

	949 // we must have at least one decimal digit after 'e'/'E'

	950 return i::Token::ILLEGAL;

	951 }

	952 ScanDecimalDigits();

	953 }

	954

	955 // The source character immediately following a numeric literal must

	956 // not be an identifier start or a decimal digit; see ECMA-262

	957 // section 7.8.3, page 17 (note that we read only one decimal digit

	958 // if the value is 0).

	959 if (i::IsDecimalDigit(c0_)

	960 \|\| i::ScannerConstants::kIsIdentifierStart.get(c0_))

	961 return i::Token::ILLEGAL;

	962

	963 literal.Complete();

	964

	965 return i::Token::NUMBER;

	966 }

	967

	968

	969 template <typename UTF16Buffer, typename UTF8Buffer>

	970 uc32 Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifierUnicodeEscape() {

	971 Advance();

	972 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

	973 Advance();

	974 uc32 c = ScanHexEscape('u', 4);

	975 // We do not allow a unicode escape sequence to start another

	976 // unicode escape sequence.

	977 if (c == '\\') return unibrow::Utf8::kBadChar;

	978 return c;

	979 }

	980

	981

	982 template <typename UTF16Buffer, typename UTF8Buffer>

	983 i::Token::Value Scanner<UTF16Buffer, UTF8Buffer>::ScanIdentifier() {

	984 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));

	985

	986 LiteralScope literal(this, kLiteralIdentifier);

	987 i::KeywordMatcher keyword_match;

	988

	989 // Scan identifier start character.

	990 if (c0_ == '\\') {

	991 uc32 c = ScanIdentifierUnicodeEscape();

	992 // Only allow legal identifier start characters.

	993 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {

	994 return i::Token::ILLEGAL;

	995 }

	996 AddChar(c);

	997 keyword_match.Fail();

	998 } else {

	999 AddChar(c0_);

	1000 keyword_match.AddChar(c0_);

	1001 Advance();

	1002 }

	1003

	1004 // Scan the rest of the identifier characters.

	1005 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

	1006 if (c0_ == '\\') {

	1007 uc32 c = ScanIdentifierUnicodeEscape();

	1008 // Only allow legal identifier part characters.

	1009 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {

	1010 return i::Token::ILLEGAL;

	1011 }

	1012 AddChar(c);

	1013 keyword_match.Fail();

	1014 } else {

	1015 AddChar(c0_);

	1016 keyword_match.AddChar(c0_);

	1017 Advance();

	1018 }

	1019 }

	1020 literal.Complete();

	1021

	1022 return keyword_match.token();

	1023 }

	1024

	1025

	1026 template <typename UTF16Buffer, typename UTF8Buffer>

	1027 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpPattern(bool seen_equal) {

	1028 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

	1029 bool in_character_class = false;

	1030

	1031 // Previous token is either '/' or '/=', in the second case, the

	1032 // pattern starts at =.

	1033 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

	1034 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

	1035

	1036 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

	1037 // the scanner should pass uninterpreted bodies to the RegExp

	1038 // constructor.

	1039 LiteralScope literal(this, kLiteralRegExp);

	1040 if (seen_equal)

	1041 AddChar('=');

	1042

	1043 while (c0_ != '/' \|\| in_character_class) {

	1044 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

	1045 return false;

	1046 }

	1047 if (c0_ == '\\') { // escaped character

	1048 AddCharAdvance();

	1049 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

	1050 return false;

	1051 }

	1052 AddCharAdvance();

	1053 } else { // unescaped character

	1054 if (c0_ == '[') in_character_class = true;

	1055 if (c0_ == ']') in_character_class = false;

	1056 AddCharAdvance();

	1057 }

	1058 }

	1059 Advance(); // consume '/'

	1060

	1061 literal.Complete();

	1062

	1063 return true;

	1064 }

	1065

	1066 template <typename UTF16Buffer, typename UTF8Buffer>

	1067 bool Scanner<UTF16Buffer, UTF8Buffer>::ScanRegExpFlags() {

	1068 // Scan regular expression flags.

	1069 LiteralScope literal(this, kLiteralRegExpFlags);

	1070 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

	1071 if (c0_ == '\\') {

	1072 uc32 c = ScanIdentifierUnicodeEscape();

	1073 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

	1074 // We allow any escaped character, unlike the restriction on

	1075 // IdentifierPart when it is used to build an IdentifierName.

	1076 AddChar(c);

	1077 continue;

	1078 }

	1079 }

	1080 AddCharAdvance();

	1081 }

	1082 literal.Complete();

	1083

	1084 next_.location.end_pos = source_pos() - 1;

	1085 return true;

	1086 }

	1087

	1088

	1089 } } // namespace v8::preparser

	1090

	1091 #endif // V8_PRESCANNER_H_

OLD	NEW

« src/parser.cc ('K') | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »