src/scanner.cc - Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.*

Side by Side Diff: src/scanner.cc

Issue 7739020: Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.* (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: rename scanner-base.* to scanner.* Created 9 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.	11 // with the distribution.

12 // * Neither the name of Google Inc. nor the names of its	12 // * Neither the name of Google Inc. nor the names of its

13 // contributors may be used to endorse or promote products derived	13 // contributors may be used to endorse or promote products derived

14 // from this software without specific prior written permission.	14 // from this software without specific prior written permission.

15 //	15 //

16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS	16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT	17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR	18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT	19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27	27

28 #include "v8.h"	28 // Features shared by parsing and pre-parsing scanners.

29	29

30 #include "ast.h"

31 #include "handles.h"

32 #include "scanner.h"	30 #include "scanner.h"

33 #include "unicode-inl.h"	31

	32 #include "../include/v8stdint.h"

	33 #include "char-predicates-inl.h"

34	34

35 namespace v8 {	35 namespace v8 {

36 namespace internal {	36 namespace internal {

37	37

38 // ----------------------------------------------------------------------------	38 // ----------------------------------------------------------------------------

39 // BufferedUC16CharacterStreams	39 // Scanner

40	40

41 BufferedUC16CharacterStream::BufferedUC16CharacterStream()	41 Scanner::Scanner(UnicodeCache* unicode_cache)

42 : UC16CharacterStream(),	42 : unicode_cache_(unicode_cache) { }

43 pushback_limit_(NULL) {	43

44 // Initialize buffer as being empty. First read will fill the buffer.	44

45 buffer_cursor_ = buffer_;	45 uc32 Scanner::ScanHexNumber(int expected_length) {

46 buffer_end_ = buffer_;	46 ASSERT(expected_length <= 4); // prevent overflow

47 }	47

48	48 uc32 digits[4] = { 0, 0, 0, 0 };

49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }	49 uc32 x = 0;

50	50 for (int i = 0; i < expected_length; i++) {

51 void BufferedUC16CharacterStream::PushBack(uc32 character) {	51 digits[i] = c0_;

52 if (character == kEndOfInput) {	52 int d = HexValue(c0_);

53 pos_--;	53 if (d < 0) {

	54 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes

	55 // should be illegal, but other JS VMs just return the

	56 // non-escaped version of the original character.

	57

	58 // Push back digits that we have advanced past.

	59 for (int j = i-1; j >= 0; j--) {

	60 PushBack(digits[j]);

	61 }

	62 return -1;

	63 }

	64 x = x * 16 + d;

	65 Advance();

	66 }

	67

	68 return x;

	69 }

	70

	71

	72

	73 // ----------------------------------------------------------------------------

	74 // JavaScriptScanner

	75

	76 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)

	77 : Scanner(scanner_contants),

	78 octal_pos_(Location::invalid()),

	79 harmony_block_scoping_(false) { }

	80

	81

	82 void JavaScriptScanner::Initialize(UC16CharacterStream* source) {

	83 source_ = source;

	84 // Need to capture identifiers in order to recognize "get" and "set"

	85 // in object literals.

	86 Init();

	87 // Skip initial whitespace allowing HTML comment ends just like

	88 // after a newline and scan first token.

	89 has_line_terminator_before_next_ = true;

	90 SkipWhiteSpace();

	91 Scan();

	92 }

	93

	94

	95 // Ensure that tokens can be stored in a byte.

	96 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);

	97

	98 // Table of one-character tokens, by character (0x00..0x7f only).

	99 static const byte one_char_tokens[] = {

	100 Token::ILLEGAL,

	101 Token::ILLEGAL,

	102 Token::ILLEGAL,

	103 Token::ILLEGAL,

	104 Token::ILLEGAL,

	105 Token::ILLEGAL,

	106 Token::ILLEGAL,

	107 Token::ILLEGAL,

	108 Token::ILLEGAL,

	109 Token::ILLEGAL,

	110 Token::ILLEGAL,

	111 Token::ILLEGAL,

	112 Token::ILLEGAL,

	113 Token::ILLEGAL,

	114 Token::ILLEGAL,

	115 Token::ILLEGAL,

	116 Token::ILLEGAL,

	117 Token::ILLEGAL,

	118 Token::ILLEGAL,

	119 Token::ILLEGAL,

	120 Token::ILLEGAL,

	121 Token::ILLEGAL,

	122 Token::ILLEGAL,

	123 Token::ILLEGAL,

	124 Token::ILLEGAL,

	125 Token::ILLEGAL,

	126 Token::ILLEGAL,

	127 Token::ILLEGAL,

	128 Token::ILLEGAL,

	129 Token::ILLEGAL,

	130 Token::ILLEGAL,

	131 Token::ILLEGAL,

	132 Token::ILLEGAL,

	133 Token::ILLEGAL,

	134 Token::ILLEGAL,

	135 Token::ILLEGAL,

	136 Token::ILLEGAL,

	137 Token::ILLEGAL,

	138 Token::ILLEGAL,

	139 Token::ILLEGAL,

	140 Token::LPAREN, // 0x28

	141 Token::RPAREN, // 0x29

	142 Token::ILLEGAL,

	143 Token::ILLEGAL,

	144 Token::COMMA, // 0x2c

	145 Token::ILLEGAL,

	146 Token::ILLEGAL,

	147 Token::ILLEGAL,

	148 Token::ILLEGAL,

	149 Token::ILLEGAL,

	150 Token::ILLEGAL,

	151 Token::ILLEGAL,

	152 Token::ILLEGAL,

	153 Token::ILLEGAL,

	154 Token::ILLEGAL,

	155 Token::ILLEGAL,

	156 Token::ILLEGAL,

	157 Token::ILLEGAL,

	158 Token::COLON, // 0x3a

	159 Token::SEMICOLON, // 0x3b

	160 Token::ILLEGAL,

	161 Token::ILLEGAL,

	162 Token::ILLEGAL,

	163 Token::CONDITIONAL, // 0x3f

	164 Token::ILLEGAL,

	165 Token::ILLEGAL,

	166 Token::ILLEGAL,

	167 Token::ILLEGAL,

	168 Token::ILLEGAL,

	169 Token::ILLEGAL,

	170 Token::ILLEGAL,

	171 Token::ILLEGAL,

	172 Token::ILLEGAL,

	173 Token::ILLEGAL,

	174 Token::ILLEGAL,

	175 Token::ILLEGAL,

	176 Token::ILLEGAL,

	177 Token::ILLEGAL,

	178 Token::ILLEGAL,

	179 Token::ILLEGAL,

	180 Token::ILLEGAL,

	181 Token::ILLEGAL,

	182 Token::ILLEGAL,

	183 Token::ILLEGAL,

	184 Token::ILLEGAL,

	185 Token::ILLEGAL,

	186 Token::ILLEGAL,

	187 Token::ILLEGAL,

	188 Token::ILLEGAL,

	189 Token::ILLEGAL,

	190 Token::ILLEGAL,

	191 Token::LBRACK, // 0x5b

	192 Token::ILLEGAL,

	193 Token::RBRACK, // 0x5d

	194 Token::ILLEGAL,

	195 Token::ILLEGAL,

	196 Token::ILLEGAL,

	197 Token::ILLEGAL,

	198 Token::ILLEGAL,

	199 Token::ILLEGAL,

	200 Token::ILLEGAL,

	201 Token::ILLEGAL,

	202 Token::ILLEGAL,

	203 Token::ILLEGAL,

	204 Token::ILLEGAL,

	205 Token::ILLEGAL,

	206 Token::ILLEGAL,

	207 Token::ILLEGAL,

	208 Token::ILLEGAL,

	209 Token::ILLEGAL,

	210 Token::ILLEGAL,

	211 Token::ILLEGAL,

	212 Token::ILLEGAL,

	213 Token::ILLEGAL,

	214 Token::ILLEGAL,

	215 Token::ILLEGAL,

	216 Token::ILLEGAL,

	217 Token::ILLEGAL,

	218 Token::ILLEGAL,

	219 Token::ILLEGAL,

	220 Token::ILLEGAL,

	221 Token::ILLEGAL,

	222 Token::ILLEGAL,

	223 Token::LBRACE, // 0x7b

	224 Token::ILLEGAL,

	225 Token::RBRACE, // 0x7d

	226 Token::BIT_NOT, // 0x7e

	227 Token::ILLEGAL

	228 };

	229

	230

	231 Token::Value JavaScriptScanner::Next() {

	232 current_ = next_;

	233 has_line_terminator_before_next_ = false;

	234 has_multiline_comment_before_next_ = false;

	235 if (static_cast<unsigned>(c0_) <= 0x7f) {

	236 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);

	237 if (token != Token::ILLEGAL) {

	238 int pos = source_pos();

	239 next_.token = token;

	240 next_.location.beg_pos = pos;

	241 next_.location.end_pos = pos + 1;

	242 Advance();

	243 return current_.token;

	244 }

	245 }

	246 Scan();

	247 return current_.token;

	248 }

	249

	250

	251 static inline bool IsByteOrderMark(uc32 c) {

	252 // The Unicode value U+FFFE is guaranteed never to be assigned as a

	253 // Unicode character; this implies that in a Unicode context the

	254 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF

	255 // character expressed in little-endian byte order (since it could

	256 // not be a U+FFFE character expressed in big-endian byte

	257 // order). Nevertheless, we check for it to be compatible with

	258 // Spidermonkey.

	259 return c == 0xFEFF || c == 0xFFFE;

	260 }

	261

	262

	263 bool JavaScriptScanner::SkipWhiteSpace() {

	264 int start_position = source_pos();

	265

	266 while (true) {

	267 // We treat byte-order marks (BOMs) as whitespace for better

	268 // compatibility with Spidermonkey and other JavaScript engines.

	269 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {

	270 // IsWhiteSpace() includes line terminators!

	271 if (unicode_cache_->IsLineTerminator(c0_)) {

	272 // Ignore line terminators, but remember them. This is necessary

	273 // for automatic semicolon insertion.

	274 has_line_terminator_before_next_ = true;

	275 }

	276 Advance();

	277 }

	278

	279 // If there is an HTML comment end '-->' at the beginning of a

	280 // line (with only whitespace in front of it), we treat the rest

	281 // of the line as a comment. This is in line with the way

	282 // SpiderMonkey handles it.

	283 if (c0_ == '-' && has_line_terminator_before_next_) {

	284 Advance();

	285 if (c0_ == '-') {

	286 Advance();

	287 if (c0_ == '>') {

	288 // Treat the rest of the line as a comment.

	289 SkipSingleLineComment();

	290 // Continue skipping white space after the comment.

	291 continue;

	292 }

	293 PushBack('-'); // undo Advance()

	294 }

	295 PushBack('-'); // undo Advance()

	296 }

	297 // Return whether or not we skipped any characters.

	298 return source_pos() != start_position;

	299 }

	300 }

	301

	302

	303 Token::Value JavaScriptScanner::SkipSingleLineComment() {

	304 Advance();

	305

	306 // The line terminator at the end of the line is not considered

	307 // to be part of the single-line comment; it is recognized

	308 // separately by the lexical grammar and becomes part of the

	309 // stream of input elements for the syntactic grammar (see

	310 // ECMA-262, section 7.4).

	311 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {

	312 Advance();

	313 }

	314

	315 return Token::WHITESPACE;

	316 }

	317

	318

	319 Token::Value JavaScriptScanner::SkipMultiLineComment() {

	320 ASSERT(c0_ == '*');

	321 Advance();

	322

	323 while (c0_ >= 0) {

	324 uc32 ch = c0_;

	325 Advance();

	326 if (unicode_cache_->IsLineTerminator(ch)) {

	327 // Following ECMA-262, section 7.4, a comment containing

	328 // a newline will make the comment count as a line-terminator.

	329 has_multiline_comment_before_next_ = true;

	330 }

	331 // If we have reached the end of the multi-line comment, we

	332 // consume the '/' and insert a whitespace. This way all

	333 // multi-line comments are treated as whitespace.

	334 if (ch == '*' && c0_ == '/') {

	335 c0_ = ' ';

	336 return Token::WHITESPACE;

	337 }

	338 }

	339

	340 // Unterminated multi-line comment.

	341 return Token::ILLEGAL;

	342 }

	343

	344

	345 Token::Value JavaScriptScanner::ScanHtmlComment() {

	346 // Check for <!-- comments.

	347 ASSERT(c0_ == '!');

	348 Advance();

	349 if (c0_ == '-') {

	350 Advance();

	351 if (c0_ == '-') return SkipSingleLineComment();

	352 PushBack('-'); // undo Advance()

	353 }

	354 PushBack('!'); // undo Advance()

	355 ASSERT(c0_ == '!');

	356 return Token::LT;

	357 }

	358

	359

	360 void JavaScriptScanner::Scan() {

	361 next_.literal_chars = NULL;

	362 Token::Value token;

	363 do {

	364 // Remember the position of the next token

	365 next_.location.beg_pos = source_pos();

	366

	367 switch (c0_) {

	368 case ' ':

	369 case '\t':

	370 Advance();

	371 token = Token::WHITESPACE;

	372 break;

	373

	374 case '\n':

	375 Advance();

	376 has_line_terminator_before_next_ = true;

	377 token = Token::WHITESPACE;

	378 break;

	379

	380 case '"': case '\'':

	381 token = ScanString();

	382 break;

	383

	384 case '<':

	385 // < <= << <<= <!--

	386 Advance();

	387 if (c0_ == '=') {

	388 token = Select(Token::LTE);

	389 } else if (c0_ == '<') {

	390 token = Select('=', Token::ASSIGN_SHL, Token::SHL);

	391 } else if (c0_ == '!') {

	392 token = ScanHtmlComment();

	393 } else {

	394 token = Token::LT;

	395 }

	396 break;

	397

	398 case '>':

	399 // > >= >> >>= >>> >>>=

	400 Advance();

	401 if (c0_ == '=') {

	402 token = Select(Token::GTE);

	403 } else if (c0_ == '>') {

	404 // >> >>= >>> >>>=

	405 Advance();

	406 if (c0_ == '=') {

	407 token = Select(Token::ASSIGN_SAR);

	408 } else if (c0_ == '>') {

	409 token = Select('=', Token::ASSIGN_SHR, Token::SHR);

	410 } else {

	411 token = Token::SAR;

	412 }

	413 } else {

	414 token = Token::GT;

	415 }

	416 break;

	417

	418 case '=':

	419 // = == ===

	420 Advance();

	421 if (c0_ == '=') {

	422 token = Select('=', Token::EQ_STRICT, Token::EQ);

	423 } else {

	424 token = Token::ASSIGN;

	425 }

	426 break;

	427

	428 case '!':

	429 // ! != !==

	430 Advance();

	431 if (c0_ == '=') {

	432 token = Select('=', Token::NE_STRICT, Token::NE);

	433 } else {

	434 token = Token::NOT;

	435 }

	436 break;

	437

	438 case '+':

	439 // + ++ +=

	440 Advance();

	441 if (c0_ == '+') {

	442 token = Select(Token::INC);

	443 } else if (c0_ == '=') {

	444 token = Select(Token::ASSIGN_ADD);

	445 } else {

	446 token = Token::ADD;

	447 }

	448 break;

	449

	450 case '-':

	451 // - -- --> -=

	452 Advance();

	453 if (c0_ == '-') {

	454 Advance();

	455 if (c0_ == '>' && has_line_terminator_before_next_) {

	456 // For compatibility with SpiderMonkey, we skip lines that

	457 // start with an HTML comment end '-->'.

	458 token = SkipSingleLineComment();

	459 } else {

	460 token = Token::DEC;

	461 }

	462 } else if (c0_ == '=') {

	463 token = Select(Token::ASSIGN_SUB);

	464 } else {

	465 token = Token::SUB;

	466 }

	467 break;

	468

	469 case '*':

	470 // * *=

	471 token = Select('=', Token::ASSIGN_MUL, Token::MUL);

	472 break;

	473

	474 case '%':

	475 // % %=

	476 token = Select('=', Token::ASSIGN_MOD, Token::MOD);

	477 break;

	478

	479 case '/':

	480 // / // /* /=

	481 Advance();

	482 if (c0_ == '/') {

	483 token = SkipSingleLineComment();

	484 } else if (c0_ == '*') {

	485 token = SkipMultiLineComment();

	486 } else if (c0_ == '=') {

	487 token = Select(Token::ASSIGN_DIV);

	488 } else {

	489 token = Token::DIV;

	490 }

	491 break;

	492

	493 case '&':

	494 // & && &=

	495 Advance();

	496 if (c0_ == '&') {

	497 token = Select(Token::AND);

	498 } else if (c0_ == '=') {

	499 token = Select(Token::ASSIGN_BIT_AND);

	500 } else {

	501 token = Token::BIT_AND;

	502 }

	503 break;

	504

	505 case '|':

	506 // | || |=

	507 Advance();

	508 if (c0_ == '|') {

	509 token = Select(Token::OR);

	510 } else if (c0_ == '=') {

	511 token = Select(Token::ASSIGN_BIT_OR);

	512 } else {

	513 token = Token::BIT_OR;

	514 }

	515 break;

	516

	517 case '^':

	518 // ^ ^=

	519 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);

	520 break;

	521

	522 case '.':

	523 // . Number

	524 Advance();

	525 if (IsDecimalDigit(c0_)) {

	526 token = ScanNumber(true);

	527 } else {

	528 token = Token::PERIOD;

	529 }

	530 break;

	531

	532 case ':':

	533 token = Select(Token::COLON);

	534 break;

	535

	536 case ';':

	537 token = Select(Token::SEMICOLON);

	538 break;

	539

	540 case ',':

	541 token = Select(Token::COMMA);

	542 break;

	543

	544 case '(':

	545 token = Select(Token::LPAREN);

	546 break;

	547

	548 case ')':

	549 token = Select(Token::RPAREN);

	550 break;

	551

	552 case '[':

	553 token = Select(Token::LBRACK);

	554 break;

	555

	556 case ']':

	557 token = Select(Token::RBRACK);

	558 break;

	559

	560 case '{':

	561 token = Select(Token::LBRACE);

	562 break;

	563

	564 case '}':

	565 token = Select(Token::RBRACE);

	566 break;

	567

	568 case '?':

	569 token = Select(Token::CONDITIONAL);

	570 break;

	571

	572 case '~':

	573 token = Select(Token::BIT_NOT);

	574 break;

	575

	576 default:

	577 if (unicode_cache_->IsIdentifierStart(c0_)) {

	578 token = ScanIdentifierOrKeyword();

	579 } else if (IsDecimalDigit(c0_)) {

	580 token = ScanNumber(false);

	581 } else if (SkipWhiteSpace()) {

	582 token = Token::WHITESPACE;

	583 } else if (c0_ < 0) {

	584 token = Token::EOS;

	585 } else {

	586 token = Select(Token::ILLEGAL);

	587 }

	588 break;

	589 }

	590

	591 // Continue scanning for tokens as long as we're just skipping

	592 // whitespace.

	593 } while (token == Token::WHITESPACE);

	594

	595 next_.location.end_pos = source_pos();

	596 next_.token = token;

	597 }

	598

	599

	600 void JavaScriptScanner::SeekForward(int pos) {

	601 // After this call, we will have the token at the given position as

	602 // the "next" token. The "current" token will be invalid.

	603 if (pos == next_.location.beg_pos) return;

	604 int current_pos = source_pos();

	605 ASSERT_EQ(next_.location.end_pos, current_pos);

	606 // Positions inside the lookahead token aren't supported.

	607 ASSERT(pos >= current_pos);

	608 if (pos != current_pos) {

	609 source_->SeekForward(pos - source_->pos());

	610 Advance();

	611 // This function is only called to seek to the location

	612 // of the end of a function (at the "}" token). It doesn't matter

	613 // whether there was a line terminator in the part we skip.

	614 has_line_terminator_before_next_ = false;

	615 has_multiline_comment_before_next_ = false;

	616 }

	617 Scan();

	618 }

	619

	620

	621 void JavaScriptScanner::ScanEscape() {

	622 uc32 c = c0_;

	623 Advance();

	624

	625 // Skip escaped newlines.

	626 if (unicode_cache_->IsLineTerminator(c)) {

	627 // Allow CR+LF newlines in multiline string literals.

	628 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();

	629 // Allow LF+CR newlines in multiline string literals.

	630 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();

54 return;	631 return;

55 }	632 }

56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {	633

57 // buffer_ is writable, buffer_cursor_ is const pointer.	634 switch (c) {

58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);	635 case '\'': // fall through

59 pos_--;	636 case '"' : // fall through

60 return;	637 case '\\': break;

61 }	638 case 'b' : c = '\b'; break;

62 SlowPushBack(static_cast<uc16>(character));	639 case 'f' : c = '\f'; break;

63 }	640 case 'n' : c = '\n'; break;

64	641 case 'r' : c = '\r'; break;

65	642 case 't' : c = '\t'; break;

66 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {	643 case 'u' : {

67 // In pushback mode, the end of the buffer contains pushback,	644 c = ScanHexNumber(4);

68 // and the start of the buffer (from buffer start to pushback_limit_)	645 if (c < 0) c = 'u';

69 // contains valid data that comes just after the pushback.	646 break;

70 // We NULL the pushback_limit_ if pushing all the way back to the	647 }

71 // start of the buffer.	648 case 'v' : c = '\v'; break;

72	649 case 'x' : {

73 if (pushback_limit_ == NULL) {	650 c = ScanHexNumber(2);

74 // Enter pushback mode.	651 if (c < 0) c = 'x';

75 pushback_limit_ = buffer_end_;	652 break;

76 buffer_end_ = buffer_ + kBufferSize;	653 }

77 buffer_cursor_ = buffer_end_;	654 case '0' : // fall through

78 }	655 case '1' : // fall through

79 // Ensure that there is room for at least one pushback.	656 case '2' : // fall through

80 ASSERT(buffer_cursor_ > buffer_);	657 case '3' : // fall through

81 ASSERT(pos_ > 0);	658 case '4' : // fall through

82 buffer_[--buffer_cursor_ - buffer_] = character;	659 case '5' : // fall through

83 if (buffer_cursor_ == buffer_) {	660 case '6' : // fall through

84 pushback_limit_ = NULL;	661 case '7' : c = ScanOctalEscape(c, 2); break;

85 } else if (buffer_cursor_ < pushback_limit_) {	662 }

86 pushback_limit_ = buffer_cursor_;	663

87 }	664 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these

88 pos_--;	665 // should be illegal, but they are commonly handled

89 }	666 // as non-escaped characters by JS VMs.

90	667 AddLiteralChar(c);

91	668 }

92 bool BufferedUC16CharacterStream::ReadBlock() {	669

93 buffer_cursor_ = buffer_;	670

94 if (pushback_limit_ != NULL) {	671 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of

95 // Leave pushback mode.	672 // ECMA-262. Other JS VMs support them.

96 buffer_end_ = pushback_limit_;	673 uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {

97 pushback_limit_ = NULL;	674 uc32 x = c - '0';

98 // If there were any valid characters left at the	675 int i = 0;

99 // start of the buffer, use those.	676 for (; i < length; i++) {

100 if (buffer_cursor_ < buffer_end_) return true;	677 int d = c0_ - '0';

101 // Otherwise read a new block.	678 if (d < 0 || d > 7) break;

102 }	679 int nx = x * 8 + d;

103 unsigned length = FillBuffer(pos_, kBufferSize);	680 if (nx >= 256) break;

104 buffer_end_ = buffer_ + length;	681 x = nx;

105 return length > 0;	682 Advance();

106 }	683 }

107	684 // Anything except '\0' is an octal escape sequence, illegal in strict mode.

108	685 // Remember the position of octal escape sequences so that an error

109 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {	686 // can be reported later (in strict mode).

110 // Leave pushback mode (i.e., ignore that there might be valid data	687 // We don't report the error immediately, because the octal escape can

111 // in the buffer before the pushback_limit_ point).	688 // occur before the "use strict" directive.

112 pushback_limit_ = NULL;	689 if (c != '0' || i > 0) {

113 return BufferSeekForward(delta);	690 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);

114 }	691 }

	692 return x;

	693 }

	694

	695

	696 Token::Value JavaScriptScanner::ScanString() {

	697 uc32 quote = c0_;

	698 Advance(); // consume quote

	699

	700 LiteralScope literal(this);

	701 while (c0_ != quote && c0_ >= 0

	702 && !unicode_cache_->IsLineTerminator(c0_)) {

	703 uc32 c = c0_;

	704 Advance();

	705 if (c == '\\') {

	706 if (c0_ < 0) return Token::ILLEGAL;

	707 ScanEscape();

	708 } else {

	709 AddLiteralChar(c);

	710 }

	711 }

	712 if (c0_ != quote) return Token::ILLEGAL;

	713 literal.Complete();

	714

	715 Advance(); // consume quote

	716 return Token::STRING;

	717 }

	718

	719

	720 void JavaScriptScanner::ScanDecimalDigits() {

	721 while (IsDecimalDigit(c0_))

	722 AddLiteralCharAdvance();

	723 }

	724

	725

	726 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {

	727 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction

	728

	729 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

	730

	731 LiteralScope literal(this);

	732 if (seen_period) {

	733 // we have already seen a decimal point of the float

	734 AddLiteralChar('.');

	735 ScanDecimalDigits(); // we know we have at least one digit

	736

	737 } else {

	738 // if the first character is '0' we must check for octals and hex

	739 if (c0_ == '0') {

	740 int start_pos = source_pos(); // For reporting octal positions.

	741 AddLiteralCharAdvance();

	742

	743 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number

	744 if (c0_ == 'x' || c0_ == 'X') {

	745 // hex number

	746 kind = HEX;

	747 AddLiteralCharAdvance();

	748 if (!IsHexDigit(c0_)) {

	749 // we must have at least one hex digit after 'x'/'X'

	750 return Token::ILLEGAL;

	751 }

	752 while (IsHexDigit(c0_)) {

	753 AddLiteralCharAdvance();

	754 }

	755 } else if ('0' <= c0_ && c0_ <= '7') {

	756 // (possible) octal number

	757 kind = OCTAL;

	758 while (true) {

	759 if (c0_ == '8' || c0_ == '9') {

	760 kind = DECIMAL;

	761 break;

	762 }

	763 if (c0_ < '0' || '7' < c0_) {

	764 // Octal literal finished.

	765 octal_pos_ = Location(start_pos, source_pos());

	766 break;

	767 }

	768 AddLiteralCharAdvance();

	769 }

	770 }

	771 }

	772

	773 // Parse decimal digits and allow trailing fractional part.

	774 if (kind == DECIMAL) {

	775 ScanDecimalDigits(); // optional

	776 if (c0_ == '.') {

	777 AddLiteralCharAdvance();

	778 ScanDecimalDigits(); // optional

	779 }

	780 }

	781 }

	782

	783 // scan exponent, if any

	784 if (c0_ == 'e' || c0_ == 'E') {

	785 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number

	786 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed

	787 // scan exponent

	788 AddLiteralCharAdvance();

	789 if (c0_ == '+' || c0_ == '-')

	790 AddLiteralCharAdvance();

	791 if (!IsDecimalDigit(c0_)) {

	792 // we must have at least one decimal digit after 'e'/'E'

	793 return Token::ILLEGAL;

	794 }

	795 ScanDecimalDigits();

	796 }

	797

	798 // The source character immediately following a numeric literal must

	799 // not be an identifier start or a decimal digit; see ECMA-262

	800 // section 7.8.3, page 17 (note that we read only one decimal digit

	801 // if the value is 0).

	802 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))

	803 return Token::ILLEGAL;

	804

	805 literal.Complete();

	806

	807 return Token::NUMBER;

	808 }

	809

	810

	811 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {

	812 Advance();

	813 if (c0_ != 'u') return -1;

	814 Advance();

	815 uc32 result = ScanHexNumber(4);

	816 if (result < 0) PushBack('u');

	817 return result;

	818 }

	819

115	820

116 // ----------------------------------------------------------------------------	821 // ----------------------------------------------------------------------------

117 // GenericStringUC16CharacterStream	822 // Keyword Matcher

118	823

119	824 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \

120 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(	825 KEYWORD_GROUP('b') \

121 Handle<String> data,	826 KEYWORD("break", Token::BREAK) \

122 unsigned start_position,	827 KEYWORD_GROUP('c') \

123 unsigned end_position)	828 KEYWORD("case", Token::CASE) \

124 : string_(data),	829 KEYWORD("catch", Token::CATCH) \

125 length_(end_position) {	830 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \

126 ASSERT(end_position >= start_position);	831 KEYWORD("const", Token::CONST) \

127 buffer_cursor_ = buffer_;	832 KEYWORD("continue", Token::CONTINUE) \

128 buffer_end_ = buffer_;	833 KEYWORD_GROUP('d') \

129 pos_ = start_position;	834 KEYWORD("debugger", Token::DEBUGGER) \

130 }	835 KEYWORD("default", Token::DEFAULT) \

131	836 KEYWORD("delete", Token::DELETE) \

132	837 KEYWORD("do", Token::DO) \

133 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }	838 KEYWORD_GROUP('e') \

134	839 KEYWORD("else", Token::ELSE) \

135	840 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \

136 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {	841 KEYWORD("export", Token::FUTURE_RESERVED_WORD) \

137 unsigned old_pos = pos_;	842 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \

138 pos_ = Min(pos_ + delta, length_);	843 KEYWORD_GROUP('f') \

139 ReadBlock();	844 KEYWORD("false", Token::FALSE_LITERAL) \

140 return pos_ - old_pos;	845 KEYWORD("finally", Token::FINALLY) \

141 }	846 KEYWORD("for", Token::FOR) \

142	847 KEYWORD("function", Token::FUNCTION) \

143	848 KEYWORD_GROUP('i') \

144 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,	849 KEYWORD("if", Token::IF) \

145 unsigned length) {	850 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \

146 if (from_pos >= length_) return 0;	851 KEYWORD("import", Token::FUTURE_RESERVED_WORD) \

147 if (from_pos + length > length_) {	852 KEYWORD("in", Token::IN) \

148 length = length_ - from_pos;	853 KEYWORD("instanceof", Token::INSTANCEOF) \

149 }	854 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \

150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);	855 KEYWORD_GROUP('l') \

151 return length;	856 KEYWORD("let", harmony_block_scoping \

152 }	857 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \

153	858 KEYWORD_GROUP('n') \

154	859 KEYWORD("new", Token::NEW) \

155 // ----------------------------------------------------------------------------	860 KEYWORD("null", Token::NULL_LITERAL) \

156 // Utf8ToUC16CharacterStream	861 KEYWORD_GROUP('p') \

157 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,	862 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \

158 unsigned length)	863 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \

159 : BufferedUC16CharacterStream(),	864 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \

160 raw_data_(data),	865 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \

161 raw_data_length_(length),	866 KEYWORD_GROUP('r') \

162 raw_data_pos_(0),	867 KEYWORD("return", Token::RETURN) \

163 raw_character_position_(0) {	868 KEYWORD_GROUP('s') \

164 ReadBlock();	869 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \

165 }	870 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \

166	871 KEYWORD("switch", Token::SWITCH) \

167	872 KEYWORD_GROUP('t') \

168 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }	873 KEYWORD("this", Token::THIS) \

169	874 KEYWORD("throw", Token::THROW) \

170	875 KEYWORD("true", Token::TRUE_LITERAL) \

171 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {	876 KEYWORD("try", Token::TRY) \

172 unsigned old_pos = pos_;	877 KEYWORD("typeof", Token::TYPEOF) \

173 unsigned target_pos = pos_ + delta;	878 KEYWORD_GROUP('v') \

174 SetRawPosition(target_pos);	879 KEYWORD("var", Token::VAR) \

175 pos_ = raw_character_position_;	880 KEYWORD("void", Token::VOID) \

176 ReadBlock();	881 KEYWORD_GROUP('w') \

177 return pos_ - old_pos;	882 KEYWORD("while", Token::WHILE) \

178 }	883 KEYWORD("with", Token::WITH) \

179	884 KEYWORD_GROUP('y') \

180	885 KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)

181 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,	886

182 unsigned length) {	887

183 static const unibrow::uchar kMaxUC16Character = 0xffff;	888 static Token::Value KeywordOrIdentifierToken(const char* input,

184 SetRawPosition(char_position);	889 int input_length,

185 if (raw_character_position_ != char_position) {	890 bool harmony_block_scoping) {

186 // char_position was not a valid position in the stream (hit the end	891 ASSERT(input_length >= 1);

187 // while spooling to it).	892 const int kMinLength = 2;

188 return 0u;	893 const int kMaxLength = 10;

189 }	894 if (input_length < kMinLength \|\| input_length > kMaxLength) {

190 unsigned i = 0;	895 return Token::IDENTIFIER;

191 while (i < length) {	896 }

192 if (raw_data_pos_ == raw_data_length_) break;	897 switch (input[0]) {

193 unibrow::uchar c = raw_data_[raw_data_pos_];	898 default:

194 if (c <= unibrow::Utf8::kMaxOneByteChar) {	899 #define KEYWORD_GROUP_CASE(ch) \

195 raw_data_pos_++;	900 break; \

	901 case ch:

	902 #define KEYWORD(keyword, token) \

	903 { \

	904 /* 'keyword' is a char array, so sizeof(keyword) is */ \

	905 /* strlen(keyword) plus 1 for the NUL char. */ \

	906 const int keyword_length = sizeof(keyword) - 1; \

	907 STATIC_ASSERT(keyword_length >= kMinLength); \

	908 STATIC_ASSERT(keyword_length <= kMaxLength); \

	909 if (input_length == keyword_length && \

	910 input[1] == keyword[1] && \

	911 (keyword_length <= 2 \|\| input[2] == keyword[2]) && \

	912 (keyword_length <= 3 \|\| input[3] == keyword[3]) && \

	913 (keyword_length <= 4 \|\| input[4] == keyword[4]) && \

	914 (keyword_length <= 5 \|\| input[5] == keyword[5]) && \

	915 (keyword_length <= 6 \|\| input[6] == keyword[6]) && \

	916 (keyword_length <= 7 \|\| input[7] == keyword[7]) && \

	917 (keyword_length <= 8 \|\| input[8] == keyword[8]) && \

	918 (keyword_length <= 9 \|\| input[9] == keyword[9])) { \

	919 return token; \

	920 } \

	921 }

	922 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)

	923 }

	924 return Token::IDENTIFIER;

	925 }

	926

	927

	928 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {

	929 ASSERT(unicode_cache_->IsIdentifierStart(c0_));

	930 LiteralScope literal(this);

	931 // Scan identifier start character.

	932 if (c0_ == '\\') {

	933 uc32 c = ScanIdentifierUnicodeEscape();

	934 // Only allow legal identifier start characters.

	935 if (c < 0 \|\|

	936 c == '\\' \|\| // No recursive escapes.

	937 !unicode_cache_->IsIdentifierStart(c)) {

	938 return Token::ILLEGAL;

	939 }

	940 AddLiteralChar(c);

	941 return ScanIdentifierSuffix(&literal);

	942 }

	943

	944 uc32 first_char = c0_;

	945 Advance();

	946 AddLiteralChar(first_char);

	947

	948 // Scan the rest of the identifier characters.

	949 while (unicode_cache_->IsIdentifierPart(c0_)) {

	950 if (c0_ != '\\') {

	951 uc32 next_char = c0_;

	952 Advance();

	953 AddLiteralChar(next_char);

	954 continue;

	955 }

	956 // Fallthrough if no longer able to complete keyword.

	957 return ScanIdentifierSuffix(&literal);

	958 }

	959

	960 literal.Complete();

	961

	962 if (next_.literal_chars->is_ascii()) {

	963 Vector<const char> chars = next_.literal_chars->ascii_literal();

	964 return KeywordOrIdentifierToken(chars.start(),

	965 chars.length(),

	966 harmony_block_scoping_);

	967 }

	968

	969 return Token::IDENTIFIER;

	970 }

	971

	972

	973 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {

	974 // Scan the rest of the identifier characters.

	975 while (unicode_cache_->IsIdentifierPart(c0_)) {

	976 if (c0_ == '\\') {

	977 uc32 c = ScanIdentifierUnicodeEscape();

	978 // Only allow legal identifier part characters.

	979 if (c < 0 \|\|

	980 c == '\\' \|\|

	981 !unicode_cache_->IsIdentifierPart(c)) {

	982 return Token::ILLEGAL;

	983 }

	984 AddLiteralChar(c);

196 } else {	985 } else {

197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,	986 AddLiteralChar(c0_);

198 raw_data_length_ - raw_data_pos_,	987 Advance();

199 &raw_data_pos_);	988 }

200 // Don't allow characters outside of the BMP.	989 }

201 if (c > kMaxUC16Character) {	990 literal->Complete();

202 c = unibrow::Utf8::kBadChar;	991

	992 return Token::IDENTIFIER;

	993 }

	994

	995

	996 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {

	997 // Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags

	998 bool in_character_class = false;

	999

	1000 // Previous token is either '/' or '/=', in the second case, the

	1001 // pattern starts at =.

	1002 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);

	1003 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

	1004

	1005 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,

	1006 // the scanner should pass uninterpreted bodies to the RegExp

	1007 // constructor.

	1008 LiteralScope literal(this);

	1009 if (seen_equal) {

	1010 AddLiteralChar('=');

	1011 }

	1012

	1013 while (c0_ != '/' \|\| in_character_class) {

	1014 if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;

	1015 if (c0_ == '\\') { // Escape sequence.

	1016 AddLiteralCharAdvance();

	1017 if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;

	1018 AddLiteralCharAdvance();

	1019 // If the escape allows more characters, i.e., \x??, \u????, or \c?,

	1020 // only "safe" characters are allowed (letters, digits, underscore),

	1021 // otherwise the escape isn't valid and the invalid character has

	1022 // its normal meaning. I.e., we can just continue scanning without

	1023 // worrying whether the following characters are part of the escape

	1024 // or not, since any '/', '\\' or '[' is guaranteed to not be part

	1025 // of the escape sequence.

	1026

	1027 // TODO(896): At some point, parse RegExps more thoroughly to capture

	1028 // octal escapes in strict mode.

	1029 } else { // Unescaped character.

	1030 if (c0_ == '[') in_character_class = true;

	1031 if (c0_ == ']') in_character_class = false;

	1032 AddLiteralCharAdvance();

	1033 }

	1034 }

	1035 Advance(); // consume '/'

	1036

	1037 literal.Complete();

	1038

	1039 return true;

	1040 }

	1041

	1042

	1043 bool JavaScriptScanner::ScanLiteralUnicodeEscape() {

	1044 ASSERT(c0_ == '\\');

	1045 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};

	1046 Advance();

	1047 int i = 1;

	1048 if (c0_ == 'u') {

	1049 i++;

	1050 while (i < 6) {

	1051 Advance();

	1052 if (!IsHexDigit(c0_)) break;

	1053 chars_read[i] = c0_;

	1054 i++;

	1055 }

	1056 }

	1057 if (i < 6) {

	1058 // Incomplete escape. Undo all advances and return false.

	1059 while (i > 0) {

	1060 i--;

	1061 PushBack(chars_read[i]);

	1062 }

	1063 return false;

	1064 }

	1065 // Complete escape. Add all chars to current literal buffer.

	1066 for (int i = 0; i < 6; i++) {

	1067 AddLiteralChar(chars_read[i]);

	1068 }

	1069 return true;

	1070 }

	1071

	1072

	1073 bool JavaScriptScanner::ScanRegExpFlags() {

	1074 // Scan regular expression flags.

	1075 LiteralScope literal(this);

	1076 while (unicode_cache_->IsIdentifierPart(c0_)) {

	1077 if (c0_ != '\\') {

	1078 AddLiteralCharAdvance();

	1079 } else {

	1080 if (!ScanLiteralUnicodeEscape()) {

	1081 break;

203 }	1082 }

204 }	1083 }

205 buffer_[i++] = static_cast<uc16>(c);	1084 }

206 }	1085 literal.Complete();

207 raw_character_position_ = char_position + i;	1086

208 return i;	1087 next_.location.end_pos = source_pos() - 1;

209 }	1088 return true;

210

211

212 static const byte kUtf8MultiByteMask = 0xC0;

213 static const byte kUtf8MultiByteCharStart = 0xC0;

214 static const byte kUtf8MultiByteCharFollower = 0x80;

215

216

217 #ifdef DEBUG

218 static bool IsUtf8MultiCharacterStart(byte first_byte) {

219 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;

220 }

221 #endif

222

223

224 static bool IsUtf8MultiCharacterFollower(byte later_byte) {

225 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;

226 }

227

228

229 // Move the cursor back to point at the preceding UTF-8 character start

230 // in the buffer.

231 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {

232 byte character = buffer[--*cursor];

233 if (character > unibrow::Utf8::kMaxOneByteChar) {

234 ASSERT(IsUtf8MultiCharacterFollower(character));

235 // Last byte of a multi-byte character encoding. Step backwards until

236 // pointing to the first byte of the encoding, recognized by having the

237 // top two bits set.

238 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }

239 ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));

240 }

241 }

242

243

244 // Move the cursor forward to point at the next following UTF-8 character start

245 // in the buffer.

246 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {

247 byte character = buffer[(*cursor)++];

248 if (character > unibrow::Utf8::kMaxOneByteChar) {

249 // First character of a multi-byte character encoding.

250 // The number of most-significant one-bits determines the length of the

251 // encoding:

252 // 110..... - (0xCx, 0xDx) one additional byte (minimum).

253 // 1110.... - (0xEx) two additional bytes.

254 // 11110... - (0xFx) three additional bytes (maximum).

255 ASSERT(IsUtf8MultiCharacterStart(character));

256 // Additional bytes is:

257 // 1 if value in range 0xC0 .. 0xDF.

258 // 2 if value in range 0xE0 .. 0xEF.

259 // 3 if value in range 0xF0 .. 0xF7.

260 // Encode that in a single value.

261 unsigned additional_bytes =

262 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;

263 *cursor += additional_bytes;

264 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));

265 }

266 }

267

268

269 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {

270 if (raw_character_position_ > target_position) {

271 // Spool backwards in utf8 buffer.

272 do {

273 Utf8CharacterBack(raw_data_, &raw_data_pos_);

274 raw_character_position_--;

275 } while (raw_character_position_ > target_position);

276 return;

277 }

278 // Spool forwards in the utf8 buffer.

279 while (raw_character_position_ < target_position) {

280 if (raw_data_pos_ == raw_data_length_) return;

281 Utf8CharacterForward(raw_data_, &raw_data_pos_);

282 raw_character_position_++;

283 }

284 }

285

286

287 // ----------------------------------------------------------------------------

288 // ExternalTwoByteStringUC16CharacterStream

289

290 ExternalTwoByteStringUC16CharacterStream::

291 ~ExternalTwoByteStringUC16CharacterStream() { }

292

293

294 ExternalTwoByteStringUC16CharacterStream

295 ::ExternalTwoByteStringUC16CharacterStream(

296 Handle<ExternalTwoByteString> data,

297 int start_position,

298 int end_position)

299 : UC16CharacterStream(),

300 source_(data),

301 raw_data_(data->GetTwoByteData(start_position)) {

302 buffer_cursor_ = raw_data_,

303 buffer_end_ = raw_data_ + (end_position - start_position);

304 pos_ = start_position;

305 }

306

307

308 // ----------------------------------------------------------------------------

309 // Scanner::LiteralScope

310

311 Scanner::LiteralScope::LiteralScope(Scanner* self)

312 : scanner_(self), complete_(false) {

313 self->StartLiteral();

314 }

315

316

317 Scanner::LiteralScope::~LiteralScope() {

318 if (!complete_) scanner_->DropLiteral();

319 }

320

321

322 void Scanner::LiteralScope::Complete() {

323 scanner_->TerminateLiteral();

324 complete_ = true;

325 }	1089 }

326	1090

327 } } // namespace v8::internal	1091 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner.h ('k') | src/scanner-base.h » ('j') | no next file with comments »