src/prescanner.h - Issue 5063003: Add separate scanner only intended for preparsing.

Side by Side Diff: src/prescanner.h

Issue 5063003: Add separate scanner only intended for preparsing. (Closed)

Patch Set: Address review comments. Created 10 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2010 the V8 project authors. All rights reserved.

	2 // Redistribution and use in source and binary forms, with or without

	3 // modification, are permitted provided that the following conditions are

	4 // met:

	5 //

	6 // * Redistributions of source code must retain the above copyright

	7 // notice, this list of conditions and the following disclaimer.

	8 // * Redistributions in binary form must reproduce the above

	9 // copyright notice, this list of conditions and the following

	10 // disclaimer in the documentation and/or other materials provided

	11 // with the distribution.

	12 // * Neither the name of Google Inc. nor the names of its

	13 // contributors may be used to endorse or promote products derived

	14 // from this software without specific prior written permission.

	15 //

	16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	27

	28 #ifndef V8_PRESCANNER_H_

	29 #define V8_PRESCANNER_H_

	30

	31 #include "token.h"

	32 #include "char-predicates-inl.h"

	33 #include "utils.h"

	34 #include "scanner-base.h"

	35

	36 namespace v8 {

	37 namespace preparser {

	38

	39 namespace i = v8::internal;

	40

	typedef int uc32;

	// Returns the numeric value (0..15) of the hexadecimal digit |c|, or -1 when
	// |c| is not one of '0'-'9', 'a'-'f' or 'A'-'F'.
	//
	// Callers test the result with "< 0" to detect a non-hex character, so every
	// non-digit must map to a negative value.
	//
	// Declared inline because the definition lives in a header; without it every
	// translation unit including the header would emit its own external
	// definition.
	inline int HexValue(uc32 c) {
	  if ('0' <= c && c <= '9') return c - '0';
	  // Setting bit 0x20 folds the ASCII upper-case letters onto lower-case ones.
	  const uc32 lowered = c | 0x20;
	  if ('a' <= lowered && lowered <= 'f') return lowered - 'a' + 10;
	  return -1;
	}

	54

	55

	// Approximate stack-depth guard.
	//
	// On construction it records a limit address max_size bytes below an object
	// placed on the constructing frame. has_overflowed() then reports whether an
	// object placed on the calling frame lies below that limit.
	// NOTE(review): this presumes a stack that grows towards lower addresses.
	class PreScannerStackGuard {
	  // Empty helper; the address of an instance stands in for the current
	  // stack position.
	  class StackPoint {
	   public:
	    char* at() { return reinterpret_cast<char*>(this); }
	  };

	 public:
	  explicit PreScannerStackGuard(int max_size)
	      : limit_(StackPoint().at() - max_size) {
	  }

	  // True once the current stack position has dropped below the limit.
	  bool has_overflowed() {
	    char* const here = StackPoint().at();
	    return here < limit_;
	  }

	 private:
	  // Lowest stack address regarded as safe.
	  char* limit_;
	};

	70

	71

	72 // Scanner for preparsing.

	73 // InputStream is a source of UC16 characters with limited push-back.

	74 // LiteralsBuffer is a collector of (UTF-8) characters used to capture literals.

	75 template <typename InputStream, typename LiteralsBuffer>

	76 class Scanner {

	77 public:

	78 enum LiteralType {

	79 kLiteralNumber,

	80 kLiteralIdentifier,

	81 kLiteralString,

	82 kLiteralRegExp,

	83 kLiteralRegExpFlags

	84 };

	85

	86 class LiteralScope {

	87 public:

	88 explicit LiteralScope(Scanner* self, LiteralType type);

	89 ~LiteralScope();

	90 void Complete();

	91

	92 private:

	93 Scanner* scanner_;

	94 bool complete_;

	95 };

	96

	97 Scanner();

	98

	99 void Initialize(InputStream* stream);

	100

	101 // Returns the next token.

	102 i::Token::Value Next();

	103

	104 // Returns the current token again.

	105 i::Token::Value current_token() { return current_.token; }

	106

	107 // One token look-ahead (past the token returned by Next()).

	108 i::Token::Value peek() const { return next_.token; }

	109

	110 // Returns true if there was a line terminator before the peek'ed token.

	111 bool has_line_terminator_before_next() const {

	112 return has_line_terminator_before_next_;

	113 }

	114

	115 struct Location {

	116 Location(int b, int e) : beg_pos(b), end_pos(e) { }

	117 Location() : beg_pos(0), end_pos(0) { }

	118 int beg_pos;

	119 int end_pos;

	120 };

	121

	122 // Returns the location information for the current token

	123 // (the token returned by Next()).

	124 Location location() const { return current_.location; }

	125 // Returns the location information for the look-ahead token

	126 // (the token returned by peek()).

	127 Location peek_location() const { return next_.location; }

	128

	129 // Returns the literal string, if any, for the current token (the

	130 // token returned by Next()). The string is 0-terminated and in

	131 // UTF-8 format; they may contain 0-characters. Literal strings are

	132 // collected for identifiers, strings, and numbers.

	133 // These functions only give the correct result if the literal

	134 // was scanned between calls to StartLiteral() and TerminateLiteral().

	135 const char* literal_string() const {

	136 return current_.literal_chars;

	137 }

	138

	139 int literal_length() const {

	140 // Excluding terminal '\x00' added by TerminateLiteral().

	141 return current_.literal_length - 1;

	142 }

	143

	144 i::Vector<const char> literal() const {

	145 return i::Vector<const char>(literal_string(), literal_length());

	146 }

	147

	148 // Returns the literal string for the next token (the token that

	149 // would be returned if Next() were called).

	150 const char* next_literal_string() const {

	151 return next_.literal_chars;

	152 }

	153

	154

	155 // Returns the length of the next token (that would be returned if

	156 // Next() were called).

	157 int next_literal_length() const {

	158 // Excluding terminal '\x00' added by TerminateLiteral().

	159 return next_.literal_length - 1;

	160 }

	161

	162 i::Vector<const char> next_literal() const {

	163 return i::Vector<const char>(next_literal_string(), next_literal_length());

	164 }

	165

	166 // Scans the input as a regular expression pattern, previous

	167 // character(s) must be /(=). Returns true if a pattern is scanned.

	168 bool ScanRegExpPattern(bool seen_equal);

	169 // Returns true if regexp flags are scanned (always since flags can

	170 // be empty).

	171 bool ScanRegExpFlags();

	172

	173 // Seek forward to the given position. This operation does not

	174 // work in general, for instance when there are pushed back

	175 // characters, but works for seeking forward until simple delimiter

	176 // tokens, which is what it is used for.

	177 void SeekForward(int pos);

	178

	179 bool stack_overflow() { return stack_overflow_; }

	180

	181 static const int kCharacterLookaheadBufferSize = 1;

	182 static const int kNoEndPosition = 1;

	183

	184 private:

	185 // The current and look-ahead token.

	186 struct TokenDesc {

	187 i::Token::Value token;

	188 Location location;

	189 const char* literal_chars;

	190 int literal_length;

	191 };

	192

	193 // Default stack limit is 128K pointers.

	194 static const int kMaxStackSize = 128 * 1024 * sizeof(void*); // NOLINT.

	195

	196 void Init(unibrow::CharacterStream* stream);

	197

	198 // Literal buffer support

	199 inline void StartLiteral(LiteralType type);

	200 inline void AddLiteralChar(uc32 ch);

	201 inline void AddLiteralCharAdvance();

	202 inline void TerminateLiteral();

	203 // Stops scanning of a literal, e.g., due to an encountered error.

	204 inline void DropLiteral();

	205

	206 // Low-level scanning support.

	207 void Advance() { c0_ = source_->Advance(); }

	208 void PushBack(uc32 ch) {

	209 source_->PushBack(ch);

	210 c0_ = ch;

	211 }

	212

	213 bool SkipWhiteSpace();

	214

	215 i::Token::Value SkipSingleLineComment();

	216 i::Token::Value SkipMultiLineComment();

	217

	218 inline i::Token::Value Select(i::Token::Value tok);

	219 inline i::Token::Value Select(uc32 next,

	220 i::Token::Value then,

	221 i::Token::Value else_);

	222

	223 // Scans a single JavaScript token.

	224 void Scan();

	225

	226 void ScanDecimalDigits();

	227 i::Token::Value ScanNumber(bool seen_period);

	228 i::Token::Value ScanIdentifier();

	229 uc32 ScanHexEscape(uc32 c, int length);

	230 uc32 ScanOctalEscape(uc32 c, int length);

	231 void ScanEscape();

	232 i::Token::Value ScanString();

	233

	234 // Scans a possible HTML comment -- begins with '<!'.

	235 i::Token::Value ScanHtmlComment();

	236

	237 // Return the current source position.

	238 int source_pos() {

	239 return source_->pos() - kCharacterLookaheadBufferSize;

	240 }

	241

	242 // Decodes a unicode escape-sequence which is part of an identifier.

	243 // If the escape sequence cannot be decoded the result is kBadRune.

	244 uc32 ScanIdentifierUnicodeEscape();

	245

	246 PreScannerStackGuard stack_guard_;

	247

	248 TokenDesc current_; // desc for current token (as returned by Next())

	249 TokenDesc next_; // desc for next token (one token look-ahead)

	250 bool has_line_terminator_before_next_;

	251

	252 // Source.

	253 InputStream* source_;

	254

	255 // Buffer to hold literal values (identifiers, strings, numerals, regexps and

	256 // regexp flags) using '\x00'-terminated UTF-8 encoding.

	257 // Handles allocation internally.

	258 // Notice that the '\x00' termination is meaningless for strings and regexps

	259 // which may contain the zero-character, but can be used as terminator for

	260 // identifiers, numerals and regexp flags.

	261 LiteralsBuffer literal_buffer_;

	262

	263 bool stack_overflow_;

	264

	265 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

	266 uc32 c0_;

	267 };

	268

	269

	270 // ----------------------------------------------------------------------------

	271 // Scanner::LiteralScope

	272

	273 template <typename InputStream, typename LiteralsBuffer>

	274 Scanner<InputStream, LiteralsBuffer>::LiteralScope::LiteralScope(

	275 Scanner* self, LiteralType type)

	276 : scanner_(self), complete_(false) {

	277 self->StartLiteral(type);

	278 }

	279

	280

	281 template <typename InputStream, typename LiteralsBuffer>

	282 Scanner<InputStream, LiteralsBuffer>::LiteralScope::~LiteralScope() {

	283 if (!complete_) scanner_->DropLiteral();

	284 }

	285

	286 template <typename InputStream, typename LiteralsBuffer>

	287 void Scanner<InputStream, LiteralsBuffer>::LiteralScope::Complete() {

	288 scanner_->TerminateLiteral();

	289 complete_ = true;

	290 }

	291

	292

	293 // ----------------------------------------------------------------------------

	294 // Scanner.

	295 template <typename InputStream, typename LiteralsBuffer>

	296 Scanner<InputStream, LiteralsBuffer>::Scanner()

	297 : stack_guard_(kMaxStackSize),

	298 has_line_terminator_before_next_(false),

	299 source_(NULL),

	300 stack_overflow_(false) {}

	301

	302

	303 template <typename InputStream, typename LiteralsBuffer>

	304 void Scanner<InputStream, LiteralsBuffer>::Initialize(InputStream* stream) {

	305 source_ = stream;

	306

	307 // Initialize current_ to not refer to a literal.

	308 current_.literal_length = 0;

	309 // Reset literal buffer.

	310 literal_buffer_.Reset();

	311

	312 // Set c0_ (one character ahead)

	313 ASSERT(kCharacterLookaheadBufferSize == 1);

	314 Advance();

	315

	316 // Skip initial whitespace allowing HTML comment ends just like

	317 // after a newline and scan first token.

	318 has_line_terminator_before_next_ = true;

	319 SkipWhiteSpace();

	320 Scan();

	321 }

	322

	323

	324 template <typename InputStream, typename LiteralsBuffer>

	325 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Next() {

	326 // BUG 1215673: Find a thread safe way to set a stack limit in

	327 // pre-parse mode. Otherwise, we cannot safely pre-parse from other

	328 // threads.

	329 current_ = next_;

	330 // Check for stack-overflow before returning any tokens.

	331 if (stack_guard_.has_overflowed()) {

	332 stack_overflow_ = true;

	333 next_.token = i::Token::ILLEGAL;

	334 } else {

	335 has_line_terminator_before_next_ = false;

	336 Scan();

	337 }

	338 return current_.token;

	339 }

	340

	341

	342 template <typename InputStream, typename LiteralsBuffer>

	343 void Scanner<InputStream, LiteralsBuffer>::StartLiteral(LiteralType type) {

	344 // Only record string and literal identifiers when preparsing.

	345 // Those are the ones that are recorded as symbols. Numbers and

	346 // regexps are not recorded.

	347 if (type == kLiteralString \|\| type == kLiteralIdentifier) {

	348 literal_buffer_.StartLiteral();

	349 }

	350 }

	351

	352

	353 template <typename InputStream, typename LiteralsBuffer>

	354 void Scanner<InputStream, LiteralsBuffer>::AddLiteralChar(uc32 c) {

	355 literal_buffer_.AddChar(c);

	356 }

	357

	358

	359 template <typename InputStream, typename LiteralsBuffer>

	360 void Scanner<InputStream, LiteralsBuffer>::TerminateLiteral() {

	361 i::Vector<const char> chars = literal_buffer_.EndLiteral();

	362 next_.literal_chars = chars.start();

	363 next_.literal_length = chars.length();

	364 }

	365

	366

	367 template <typename InputStream, typename LiteralsBuffer>

	368 void Scanner<InputStream, LiteralsBuffer>::DropLiteral() {

	369 literal_buffer_.DropLiteral();

	370 }

	371

	372

	373 template <typename InputStream, typename LiteralsBuffer>

	374 void Scanner<InputStream, LiteralsBuffer>::AddLiteralCharAdvance() {

	375 AddLiteralChar(c0_);

	376 Advance();

	377 }

	378

	379

	380 static inline bool IsByteOrderMark(uc32 c) {

	381 // The Unicode value U+FFFE is guaranteed never to be assigned as a

	382 // Unicode character; this implies that in a Unicode context the

	383 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

	384 // character expressed in little-endian byte order (since it could

	385 // not be a U+FFFE character expressed in big-endian byte

	386 // order). Nevertheless, we check for it to be compatible with

	387 // Spidermonkey.

	388 return c == 0xFEFF \|\| c == 0xFFFE;

	389 }

	390

	391

	392 template <typename InputStream, typename LiteralsBuffer>

	393 bool Scanner<InputStream, LiteralsBuffer>::SkipWhiteSpace() {

	394 int start_position = source_pos();

	395

	396 while (true) {

	397 // We treat byte-order marks (BOMs) as whitespace for better

	398 // compatibility with Spidermonkey and other JavaScript engines.

	399 while (i::ScannerConstants::kIsWhiteSpace.get(c0_)

	400 \|\| IsByteOrderMark(c0_)) {

	401 // IsWhiteSpace() includes line terminators!

	402 if (i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	403 // Ignore line terminators, but remember them. This is necessary

	404 // for automatic semicolon insertion.

	405 has_line_terminator_before_next_ = true;

	406 }

	407 Advance();

	408 }

	409

	410 // If there is an HTML comment end '-->' at the beginning of a

	411 // line (with only whitespace in front of it), we treat the rest

	412 // of the line as a comment. This is in line with the way

	413 // SpiderMonkey handles it.

	414 if (c0_ == '-' && has_line_terminator_before_next_) {

	415 Advance();

	416 if (c0_ == '-') {

	417 Advance();

	418 if (c0_ == '>') {

	419 // Treat the rest of the line as a comment.

	420 SkipSingleLineComment();

	421 // Continue skipping white space after the comment.

	422 continue;

	423 }

	424 PushBack('-'); // undo Advance()

	425 }

	426 PushBack('-'); // undo Advance()

	427 }

	428 // Return whether or not we skipped any characters.

	429 return source_pos() != start_position;

	430 }

	431 }

	432

	433

	434 template <typename InputStream, typename LiteralsBuffer>

	435 i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipSingleLineComment() {

	436 Advance();

	437

	438 // The line terminator at the end of the line is not considered

	439 // to be part of the single-line comment; it is recognized

	440 // separately by the lexical grammar and becomes part of the

	441 // stream of input elements for the syntactic grammar (see

	442 // ECMA-262, section 7.4, page 12).

	443 while (c0_ >= 0 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	444 Advance();

	445 }

	446

	447 return i::Token::WHITESPACE;

	448 }

	449

	450

	451 template <typename InputStream, typename LiteralsBuffer>

	452 i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipMultiLineComment() {

	453 ASSERT(c0_ == '*');

	454 Advance();

	455

	456 while (c0_ >= 0) {

	457 char ch = c0_;

	458 Advance();

	459 // If we have reached the end of the multi-line comment, we

	460 // consume the '/' and insert a whitespace. This way all

	461 // multi-line comments are treated as whitespace - even the ones

	462 // containing line terminators. This contradicts ECMA-262, section

	463 // 7.4, page 12, that says that multi-line comments containing

	464 // line terminators should be treated as a line terminator, but it

	465 // matches the behaviour of SpiderMonkey and KJS.

	466 if (ch == '*' && c0_ == '/') {

	467 c0_ = ' ';

	468 return i::Token::WHITESPACE;

	469 }

	470 }

	471

	472 // Unterminated multi-line comment.

	473 return i::Token::ILLEGAL;

	474 }

	475

	476

	477 template <typename InputStream, typename LiteralsBuffer>

	478 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanHtmlComment() {

	479 // Check for <!-- comments.

	480 ASSERT(c0_ == '!');

	481 Advance();

	482 if (c0_ == '-') {

	483 Advance();

	484 if (c0_ == '-') return SkipSingleLineComment();

	485 PushBack('-'); // undo Advance()

	486 }

	487 PushBack('!'); // undo Advance()

	488 ASSERT(c0_ == '!');

	489 return i::Token::LT;

	490 }

	491

	492

	493 template <typename InputStream, typename LiteralsBuffer>

	494 void Scanner<InputStream, LiteralsBuffer>::Scan() {

	495 next_.literal_length = 0;

	496 i::Token::Value token;

	497 do {

	498 // Remember the position of the next token

	499 next_.location.beg_pos = source_pos();

	500

	501 switch (c0_) {

	502 case ' ':

	503 case '\t':

	504 Advance();

	505 token = i::Token::WHITESPACE;

	506 break;

	507

	508 case '\n':

	509 Advance();

	510 has_line_terminator_before_next_ = true;

	511 token = i::Token::WHITESPACE;

	512 break;

	513

	514 case '"': case '\'':

	515 token = ScanString();

	516 break;

	517

	518 case '<':

	519 // < <= << <<= <!--

	520 Advance();

	521 if (c0_ == '=') {

	522 token = Select(i::Token::LTE);

	523 } else if (c0_ == '<') {

	524 token = Select('=', i::Token::ASSIGN_SHL, i::Token::SHL);

	525 } else if (c0_ == '!') {

	526 token = ScanHtmlComment();

	527 } else {

	528 token = i::Token::LT;

	529 }

	530 break;

	531

	532 case '>':

	533 // > >= >> >>= >>> >>>=

	534 Advance();

	535 if (c0_ == '=') {

	536 token = Select(i::Token::GTE);

	537 } else if (c0_ == '>') {

	538 // >> >>= >>> >>>=

	539 Advance();

	540 if (c0_ == '=') {

	541 token = Select(i::Token::ASSIGN_SAR);

	542 } else if (c0_ == '>') {

	543 token = Select('=', i::Token::ASSIGN_SHR, i::Token::SHR);

	544 } else {

	545 token = i::Token::SAR;

	546 }

	547 } else {

	548 token = i::Token::GT;

	549 }

	550 break;

	551

	552 case '=':

	553 // = == ===

	554 Advance();

	555 if (c0_ == '=') {

	556 token = Select('=', i::Token::EQ_STRICT, i::Token::EQ);

	557 } else {

	558 token = i::Token::ASSIGN;

	559 }

	560 break;

	561

	562 case '!':

	563 // ! != !==

	564 Advance();

	565 if (c0_ == '=') {

	566 token = Select('=', i::Token::NE_STRICT, i::Token::NE);

	567 } else {

	568 token = i::Token::NOT;

	569 }

	570 break;

	571

	572 case '+':

	573 // + ++ +=

	574 Advance();

	575 if (c0_ == '+') {

	576 token = Select(i::Token::INC);

	577 } else if (c0_ == '=') {

	578 token = Select(i::Token::ASSIGN_ADD);

	579 } else {

	580 token = i::Token::ADD;

	581 }

	582 break;

	583

	584 case '-':

	585 // - -- --> -=

	586 Advance();

	587 if (c0_ == '-') {

	588 Advance();

	589 if (c0_ == '>' && has_line_terminator_before_next_) {

	590 // For compatibility with SpiderMonkey, we skip lines that

	591 // start with an HTML comment end '-->'.

	592 token = SkipSingleLineComment();

	593 } else {

	594 token = i::Token::DEC;

	595 }

	596 } else if (c0_ == '=') {

	597 token = Select(i::Token::ASSIGN_SUB);

	598 } else {

	599 token = i::Token::SUB;

	600 }

	601 break;

	602

	603 case '*':

	604 // * *=

	605 token = Select('=', i::Token::ASSIGN_MUL, i::Token::MUL);

	606 break;

	607

	608 case '%':

	609 // % %=

	610 token = Select('=', i::Token::ASSIGN_MOD, i::Token::MOD);

	611 break;

	612

	613 case '/':

	614 // / // /* /=

	615 Advance();

	616 if (c0_ == '/') {

	617 token = SkipSingleLineComment();

	618 } else if (c0_ == '*') {

	619 token = SkipMultiLineComment();

	620 } else if (c0_ == '=') {

	621 token = Select(i::Token::ASSIGN_DIV);

	622 } else {

	623 token = i::Token::DIV;

	624 }

	625 break;

	626

	627 case '&':

	628 // & && &=

	629 Advance();

	630 if (c0_ == '&') {

	631 token = Select(i::Token::AND);

	632 } else if (c0_ == '=') {

	633 token = Select(i::Token::ASSIGN_BIT_AND);

	634 } else {

	635 token = i::Token::BIT_AND;

	636 }

	637 break;

	638

	639 case '\|':

	640 // \| \|\| \|=

	641 Advance();

	642 if (c0_ == '\|') {

	643 token = Select(i::Token::OR);

	644 } else if (c0_ == '=') {

	645 token = Select(i::Token::ASSIGN_BIT_OR);

	646 } else {

	647 token = i::Token::BIT_OR;

	648 }

	649 break;

	650

	651 case '^':

	652 // ^ ^=

	653 token = Select('=', i::Token::ASSIGN_BIT_XOR, i::Token::BIT_XOR);

	654 break;

	655

	656 case '.':

	657 // . Number

	658 Advance();

	659 if (i::IsDecimalDigit(c0_)) {

	660 token = ScanNumber(true);

	661 } else {

	662 token = i::Token::PERIOD;

	663 }

	664 break;

	665

	666 case ':':

	667 token = Select(i::Token::COLON);

	668 break;

	669

	670 case ';':

	671 token = Select(i::Token::SEMICOLON);

	672 break;

	673

	674 case ',':

	675 token = Select(i::Token::COMMA);

	676 break;

	677

	678 case '(':

	679 token = Select(i::Token::LPAREN);

	680 break;

	681

	682 case ')':

	683 token = Select(i::Token::RPAREN);

	684 break;

	685

	686 case '[':

	687 token = Select(i::Token::LBRACK);

	688 break;

	689

	690 case ']':

	691 token = Select(i::Token::RBRACK);

	692 break;

	693

	694 case '{':

	695 token = Select(i::Token::LBRACE);

	696 break;

	697

	698 case '}':

	699 token = Select(i::Token::RBRACE);

	700 break;

	701

	702 case '?':

	703 token = Select(i::Token::CONDITIONAL);

	704 break;

	705

	706 case '~':

	707 token = Select(i::Token::BIT_NOT);

	708 break;

	709

	710 default:

	711 if (i::ScannerConstants::kIsIdentifierStart.get(c0_)) {

	712 token = ScanIdentifier();

	713 } else if (i::IsDecimalDigit(c0_)) {

	714 token = ScanNumber(false);

	715 } else if (SkipWhiteSpace()) {

	716 token = i::Token::WHITESPACE;

	717 } else if (c0_ < 0) {

	718 token = i::Token::EOS;

	719 } else {

	720 token = Select(i::Token::ILLEGAL);

	721 }

	722 break;

	723 }

	724

	725 // Continue scanning for tokens as long as we're just skipping

	726 // whitespace.

	727 } while (token == i::Token::WHITESPACE);

	728

	729 next_.location.end_pos = source_pos();

	730 next_.token = token;

	731 }

	732

	733

	734 template <typename InputStream, typename LiteralsBuffer>

	735 void Scanner<InputStream, LiteralsBuffer>::SeekForward(int pos) {

	736 source_->SeekForward(pos - 1);

	737 Advance();

	738 // This function is only called to seek to the location

	739 // of the end of a function (at the "}" token). It doesn't matter

	740 // whether there was a line terminator in the part we skip.

	741 has_line_terminator_before_next_ = false;

	742 Scan();

	743 }

	744

	745

	746 template <typename InputStream, typename LiteralsBuffer>

	747 uc32 Scanner<InputStream, LiteralsBuffer>::ScanHexEscape(uc32 c, int length) {

	748 ASSERT(length <= 4); // prevent overflow

	749

	750 uc32 digits[4];

	751 uc32 x = 0;

	752 for (int i = 0; i < length; i++) {

	753 digits[i] = c0_;

	754 int d = HexValue(c0_);

	755 if (d < 0) {

	756 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

	757 // should be illegal, but other JS VMs just return the

	758 // non-escaped version of the original character.

	759

	760 // Push back digits read, except the last one (in c0_).

	761 for (int j = i-1; j >= 0; j--) {

	762 PushBack(digits[j]);

	763 }

	764 // Notice: No handling of error - treat it as "\u"->"u".

	765 return c;

	766 }

	767 x = x * 16 + d;

	768 Advance();

	769 }

	770

	771 return x;

	772 }

	773

	774

	775 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

	776 // ECMA-262. Other JS VMs support them.

	777 template <typename InputStream, typename LiteralsBuffer>

	778 uc32 Scanner<InputStream, LiteralsBuffer>::ScanOctalEscape(

	779 uc32 c, int length) {

	780 uc32 x = c - '0';

	781 for (int i = 0; i < length; i++) {

	782 int d = c0_ - '0';

	783 if (d < 0 \|\| d > 7) break;

	784 int nx = x * 8 + d;

	785 if (nx >= 256) break;

	786 x = nx;

	787 Advance();

	788 }

	789 return x;

	790 }

	791

	792

	793 template <typename InputStream, typename LiteralsBuffer>

	794 void Scanner<InputStream, LiteralsBuffer>::ScanEscape() {

	795 uc32 c = c0_;

	796 Advance();

	797

	798 // Skip escaped newlines.

	799 if (i::ScannerConstants::kIsLineTerminator.get(c)) {

	800 // Allow CR+LF newlines in multiline string literals.

	801 if (i::IsCarriageReturn(c) && i::IsLineFeed(c0_)) Advance();

	802 // Allow LF+CR newlines in multiline string literals.

	803 if (i::IsLineFeed(c) && i::IsCarriageReturn(c0_)) Advance();

	804 return;

	805 }

	806

	807 switch (c) {

	808 case '\'': // fall through

	809 case '"' : // fall through

	810 case '\\': break;

	811 case 'b' : c = '\b'; break;

	812 case 'f' : c = '\f'; break;

	813 case 'n' : c = '\n'; break;

	814 case 'r' : c = '\r'; break;

	815 case 't' : c = '\t'; break;

	816 case 'u' : c = ScanHexEscape(c, 4); break;

	817 case 'v' : c = '\v'; break;

	818 case 'x' : c = ScanHexEscape(c, 2); break;

	819 case '0' : // fall through

	820 case '1' : // fall through

	821 case '2' : // fall through

	822 case '3' : // fall through

	823 case '4' : // fall through

	824 case '5' : // fall through

	825 case '6' : // fall through

	826 case '7' : c = ScanOctalEscape(c, 2); break;

	827 }

	828

	829 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

	830 // should be illegal, but they are commonly handled

	831 // as non-escaped characters by JS VMs.

	832 AddLiteralChar(c);

	833 }

	834

	835

	836 template <typename InputStream, typename LiteralsBuffer>

	837 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanString() {

	838 uc32 quote = c0_;

	839 Advance(); // consume quote

	840

	841 LiteralScope literal(this, kLiteralString);

	842 while (c0_ != quote && c0_ >= 0

	843 && !i::ScannerConstants::kIsLineTerminator.get(c0_)) {

	844 uc32 c = c0_;

	845 Advance();

	846 if (c == '\\') {

	847 if (c0_ < 0) return i::Token::ILLEGAL;

	848 ScanEscape();

	849 } else {

	850 AddLiteralChar(c);

	851 }

	852 }

	853 if (c0_ != quote) return i::Token::ILLEGAL;

	854 literal.Complete();

	855

	856 Advance(); // consume quote

	857 return i::Token::STRING;

	858 }

	859

	860

	861 template <typename InputStream, typename LiteralsBuffer>

	862 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(

	863 i::Token::Value tok) {

	864 Advance();

	865 return tok;

	866 }

	867

	868

	869 template <typename InputStream, typename LiteralsBuffer>

	870 i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(

	871 uc32 next,

	872 i::Token::Value then,

	873 i::Token::Value else_) {

	874 Advance();

	875 if (c0_ == next) {

	876 Advance();

	877 return then;

	878 } else {

	879 return else_;

	880 }

	881 }

	882

	883

	884 // Returns true if any decimal digits were scanned, returns false otherwise.

	885 template <typename InputStream, typename LiteralsBuffer>

	886 void Scanner<InputStream, LiteralsBuffer>::ScanDecimalDigits() {

	887 while (i::IsDecimalDigit(c0_))

	888 AddLiteralCharAdvance();

	889 }

	890

	891

	892 template <typename InputStream, typename LiteralsBuffer>

	893 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanNumber(

	894 bool seen_period) {

	895 // c0_ is the first digit of the number or the fraction.

	896 ASSERT(i::IsDecimalDigit(c0_));

	897

	898 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

	899

	900 LiteralScope literal(this, kLiteralNumber);

	901 if (seen_period) {

	902 // we have already seen a decimal point of the float

	903 AddLiteralChar('.');

	904 ScanDecimalDigits(); // we know we have at least one digit

	905

	906 } else {

	907 // if the first character is '0' we must check for octals and hex

	908 if (c0_ == '0') {

	909 AddLiteralCharAdvance();

	910

	911 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

	912 if (c0_ == 'x' \|\| c0_ == 'X') {

	913 // hex number

	914 kind = HEX;

	915 AddLiteralCharAdvance();

	916 if (!i::IsHexDigit(c0_)) {

	917 // we must have at least one hex digit after 'x'/'X'

	918 return i::Token::ILLEGAL;

	919 }

	920 while (i::IsHexDigit(c0_)) {

	921 AddLiteralCharAdvance();

	922 }

	923 } else if ('0' <= c0_ && c0_ <= '7') {

	924 // (possible) octal number

	925 kind = OCTAL;

	926 while (true) {

	927 if (c0_ == '8' \|\| c0_ == '9') {

	928 kind = DECIMAL;

	929 break;

	930 }

	931 if (c0_ < '0' \|\| '7' < c0_) break;

	932 AddLiteralCharAdvance();

	933 }

	934 }

	935 }

	936

	937 // Parse decimal digits and allow trailing fractional part.

	938 if (kind == DECIMAL) {

	939 ScanDecimalDigits(); // optional

	940 if (c0_ == '.') {

	941 AddLiteralCharAdvance();

	942 ScanDecimalDigits(); // optional

	943 }

	944 }

	945 }

	946

	947 // scan exponent, if any

	948 if (c0_ == 'e' \|\| c0_ == 'E') {

	949 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

	950 if (kind == OCTAL) return i::Token::ILLEGAL;

	951 // scan exponent

	952 AddLiteralCharAdvance();

	953 if (c0_ == '+' \|\| c0_ == '-')

	954 AddLiteralCharAdvance();

	955 if (!i::IsDecimalDigit(c0_)) {

	956 // we must have at least one decimal digit after 'e'/'E'

	957 return i::Token::ILLEGAL;

	958 }

	959 ScanDecimalDigits();

	960 }

	961

	962 // The source character immediately following a numeric literal must

	963 // not be an identifier start or a decimal digit; see ECMA-262

	964 // section 7.8.3, page 17 (note that we read only one decimal digit

	965 // if the value is 0).

	966 if (i::IsDecimalDigit(c0_)

	967 \|\| i::ScannerConstants::kIsIdentifierStart.get(c0_))

	968 return i::Token::ILLEGAL;

	969

	970 literal.Complete();

	971

	972 return i::Token::NUMBER;

	973 }

	974

	975

	976 template <typename InputStream, typename LiteralsBuffer>

	977 uc32 Scanner<InputStream, LiteralsBuffer>::ScanIdentifierUnicodeEscape() {

	978 Advance();

	979 if (c0_ != 'u') return unibrow::Utf8::kBadChar;

	980 Advance();

	981 uc32 c = ScanHexEscape('u', 4);

	982 // We do not allow a unicode escape sequence to start another

	983 // unicode escape sequence.

	984 if (c == '\\') return unibrow::Utf8::kBadChar;

	985 return c;

	986 }

	987

	988

	989 template <typename InputStream, typename LiteralsBuffer>

	990 i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanIdentifier() {

	991 ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));

	992

	993 LiteralScope literal(this, kLiteralIdentifier);

	994 i::KeywordMatcher keyword_match;

	995

	996 // Scan identifier start character.

	997 if (c0_ == '\\') {

	998 uc32 c = ScanIdentifierUnicodeEscape();

	999 // Only allow legal identifier start characters.

	1000 if (!i::ScannerConstants::kIsIdentifierStart.get(c)) {

	1001 return i::Token::ILLEGAL;

	1002 }

	1003 AddLiteralChar(c);

	1004 keyword_match.Fail();

	1005 } else {

	1006 AddLiteralChar(c0_);

	1007 keyword_match.AddChar(c0_);

	1008 Advance();

	1009 }

	1010

	1011 // Scan the rest of the identifier characters.

	1012 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

	1013 if (c0_ == '\\') {

	1014 uc32 c = ScanIdentifierUnicodeEscape();

	1015 // Only allow legal identifier part characters.

	1016 if (!i::ScannerConstants::kIsIdentifierPart.get(c)) {

	1017 return i::Token::ILLEGAL;

	1018 }

	1019 AddLiteralChar(c);

	1020 keyword_match.Fail();

	1021 } else {

	1022 AddLiteralChar(c0_);

	1023 keyword_match.AddChar(c0_);

	1024 Advance();

	1025 }

	1026 }

	1027 literal.Complete();

	1028

	1029 return keyword_match.token();

	1030 }

	1031

	1032

	1033 template <typename InputStream, typename LiteralsBuffer>

	1034 bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpPattern(bool seen_equal) {

	1035 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

	1036 bool in_character_class = false;

	1037

	1038 // Previous token is either '/' or '/=', in the second case, the

	1039 // pattern starts at =.

	1040 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

	1041 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

	1042

	1043 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

	1044 // the scanner should pass uninterpreted bodies to the RegExp

	1045 // constructor.

	1046 LiteralScope literal(this, kLiteralRegExp);

	1047 if (seen_equal)

	1048 AddLiteralChar('=');

	1049

	1050 while (c0_ != '/' \|\| in_character_class) {

	1051 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

	1052 return false;

	1053 }

	1054 if (c0_ == '\\') { // escaped character

	1055 AddLiteralCharAdvance();

	1056 if (i::ScannerConstants::kIsLineTerminator.get(c0_) \|\| c0_ < 0) {

	1057 return false;

	1058 }

	1059 AddLiteralCharAdvance();

	1060 } else { // unescaped character

	1061 if (c0_ == '[') in_character_class = true;

	1062 if (c0_ == ']') in_character_class = false;

	1063 AddLiteralCharAdvance();

	1064 }

	1065 }

	1066 Advance(); // consume '/'

	1067

	1068 literal.Complete();

	1069

	1070 return true;

	1071 }

	1072

	1073 template <typename InputStream, typename LiteralsBuffer>

	1074 bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpFlags() {

	1075 // Scan regular expression flags.

	1076 LiteralScope literal(this, kLiteralRegExpFlags);

	1077 while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {

	1078 if (c0_ == '\\') {

	1079 uc32 c = ScanIdentifierUnicodeEscape();

	1080 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {

	1081 // We allow any escaped character, unlike the restriction on

	1082 // IdentifierPart when it is used to build an IdentifierName.

	1083 AddLiteralChar(c);

	1084 continue;

	1085 }

	1086 }

	1087 AddLiteralCharAdvance();

	1088 }

	1089 literal.Complete();

	1090

	1091 next_.location.end_pos = source_pos() - 1;

	1092 return true;

	1093 }

	1094

	1095

	1096 } } // namespace v8::preparser

	1097

	1098 #endif // V8_PRESCANNER_H_

OLD	NEW

« no previous file with comments | « src/parser.cc ('k') | src/scanner.h » ('j') | no next file with comments »