| OLD | NEW |
| (Empty) |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | |
| 2 // Redistribution and use in source and binary forms, with or without | |
| 3 // modification, are permitted provided that the following conditions are | |
| 4 // met: | |
| 5 // | |
| 6 // * Redistributions of source code must retain the above copyright | |
| 7 // notice, this list of conditions and the following disclaimer. | |
| 8 // * Redistributions in binary form must reproduce the above | |
| 9 // copyright notice, this list of conditions and the following | |
| 10 // disclaimer in the documentation and/or other materials provided | |
| 11 // with the distribution. | |
| 12 // * Neither the name of Google Inc. nor the names of its | |
| 13 // contributors may be used to endorse or promote products derived | |
| 14 // from this software without specific prior written permission. | |
| 15 // | |
| 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 27 | |
| 28 // Features shared by parsing and pre-parsing scanners. | |
| 29 | |
| 30 #include "../include/v8stdint.h" | |
| 31 #include "scanner-base.h" | |
| 32 #include "char-predicates-inl.h" | |
| 33 | |
| 34 namespace v8 { | |
| 35 namespace internal { | |
| 36 | |
| 37 // ---------------------------------------------------------------------------- | |
| 38 // Scanner | |
| 39 | |
| 40 Scanner::Scanner(UnicodeCache* unicode_cache) | |
| 41 : unicode_cache_(unicode_cache) { } | |
| 42 | |
| 43 | |
| 44 uc32 Scanner::ScanHexNumber(int expected_length) { | |
| 45 ASSERT(expected_length <= 4); // prevent overflow | |
| 46 | |
| 47 uc32 digits[4] = { 0, 0, 0, 0 }; | |
| 48 uc32 x = 0; | |
| 49 for (int i = 0; i < expected_length; i++) { | |
| 50 digits[i] = c0_; | |
| 51 int d = HexValue(c0_); | |
| 52 if (d < 0) { | |
| 53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes | |
| 54 // should be illegal, but other JS VMs just return the | |
| 55 // non-escaped version of the original character. | |
| 56 | |
| 57 // Push back digits that we have advanced past. | |
| 58 for (int j = i-1; j >= 0; j--) { | |
| 59 PushBack(digits[j]); | |
| 60 } | |
| 61 return -1; | |
| 62 } | |
| 63 x = x * 16 + d; | |
| 64 Advance(); | |
| 65 } | |
| 66 | |
| 67 return x; | |
| 68 } | |
| 69 | |
| 70 | |
| 71 | |
| 72 // ---------------------------------------------------------------------------- | |
| 73 // JavaScriptScanner | |
| 74 | |
| 75 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants) | |
| 76 : Scanner(scanner_contants), | |
| 77 octal_pos_(Location::invalid()), | |
| 78 harmony_block_scoping_(false) { } | |
| 79 | |
| 80 | |
// Attaches the scanner to |source| and primes it so that a first token is
// already available as the lookahead ("next") token when this returns.
//
// The statement order matters: Init() runs after source_ is assigned,
// and the line-terminator flag is set before SkipWhiteSpace() so that a
// leading "-->" is handled exactly as it would be at the start of any
// other line.
void JavaScriptScanner::Initialize(UC16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  // NOTE(review): Init() is defined outside this file section; the remark
  // above is inherited and could not be verified from here.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
| 92 | |
| 93 | |
| 94 // Ensure that tokens can be stored in a byte. | |
| 95 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); | |
| 96 | |
| 97 // Table of one-character tokens, by character (0x00..0x7f only). | |
| 98 static const byte one_char_tokens[] = { | |
| 99 Token::ILLEGAL, | |
| 100 Token::ILLEGAL, | |
| 101 Token::ILLEGAL, | |
| 102 Token::ILLEGAL, | |
| 103 Token::ILLEGAL, | |
| 104 Token::ILLEGAL, | |
| 105 Token::ILLEGAL, | |
| 106 Token::ILLEGAL, | |
| 107 Token::ILLEGAL, | |
| 108 Token::ILLEGAL, | |
| 109 Token::ILLEGAL, | |
| 110 Token::ILLEGAL, | |
| 111 Token::ILLEGAL, | |
| 112 Token::ILLEGAL, | |
| 113 Token::ILLEGAL, | |
| 114 Token::ILLEGAL, | |
| 115 Token::ILLEGAL, | |
| 116 Token::ILLEGAL, | |
| 117 Token::ILLEGAL, | |
| 118 Token::ILLEGAL, | |
| 119 Token::ILLEGAL, | |
| 120 Token::ILLEGAL, | |
| 121 Token::ILLEGAL, | |
| 122 Token::ILLEGAL, | |
| 123 Token::ILLEGAL, | |
| 124 Token::ILLEGAL, | |
| 125 Token::ILLEGAL, | |
| 126 Token::ILLEGAL, | |
| 127 Token::ILLEGAL, | |
| 128 Token::ILLEGAL, | |
| 129 Token::ILLEGAL, | |
| 130 Token::ILLEGAL, | |
| 131 Token::ILLEGAL, | |
| 132 Token::ILLEGAL, | |
| 133 Token::ILLEGAL, | |
| 134 Token::ILLEGAL, | |
| 135 Token::ILLEGAL, | |
| 136 Token::ILLEGAL, | |
| 137 Token::ILLEGAL, | |
| 138 Token::ILLEGAL, | |
| 139 Token::LPAREN, // 0x28 | |
| 140 Token::RPAREN, // 0x29 | |
| 141 Token::ILLEGAL, | |
| 142 Token::ILLEGAL, | |
| 143 Token::COMMA, // 0x2c | |
| 144 Token::ILLEGAL, | |
| 145 Token::ILLEGAL, | |
| 146 Token::ILLEGAL, | |
| 147 Token::ILLEGAL, | |
| 148 Token::ILLEGAL, | |
| 149 Token::ILLEGAL, | |
| 150 Token::ILLEGAL, | |
| 151 Token::ILLEGAL, | |
| 152 Token::ILLEGAL, | |
| 153 Token::ILLEGAL, | |
| 154 Token::ILLEGAL, | |
| 155 Token::ILLEGAL, | |
| 156 Token::ILLEGAL, | |
| 157 Token::COLON, // 0x3a | |
| 158 Token::SEMICOLON, // 0x3b | |
| 159 Token::ILLEGAL, | |
| 160 Token::ILLEGAL, | |
| 161 Token::ILLEGAL, | |
| 162 Token::CONDITIONAL, // 0x3f | |
| 163 Token::ILLEGAL, | |
| 164 Token::ILLEGAL, | |
| 165 Token::ILLEGAL, | |
| 166 Token::ILLEGAL, | |
| 167 Token::ILLEGAL, | |
| 168 Token::ILLEGAL, | |
| 169 Token::ILLEGAL, | |
| 170 Token::ILLEGAL, | |
| 171 Token::ILLEGAL, | |
| 172 Token::ILLEGAL, | |
| 173 Token::ILLEGAL, | |
| 174 Token::ILLEGAL, | |
| 175 Token::ILLEGAL, | |
| 176 Token::ILLEGAL, | |
| 177 Token::ILLEGAL, | |
| 178 Token::ILLEGAL, | |
| 179 Token::ILLEGAL, | |
| 180 Token::ILLEGAL, | |
| 181 Token::ILLEGAL, | |
| 182 Token::ILLEGAL, | |
| 183 Token::ILLEGAL, | |
| 184 Token::ILLEGAL, | |
| 185 Token::ILLEGAL, | |
| 186 Token::ILLEGAL, | |
| 187 Token::ILLEGAL, | |
| 188 Token::ILLEGAL, | |
| 189 Token::ILLEGAL, | |
| 190 Token::LBRACK, // 0x5b | |
| 191 Token::ILLEGAL, | |
| 192 Token::RBRACK, // 0x5d | |
| 193 Token::ILLEGAL, | |
| 194 Token::ILLEGAL, | |
| 195 Token::ILLEGAL, | |
| 196 Token::ILLEGAL, | |
| 197 Token::ILLEGAL, | |
| 198 Token::ILLEGAL, | |
| 199 Token::ILLEGAL, | |
| 200 Token::ILLEGAL, | |
| 201 Token::ILLEGAL, | |
| 202 Token::ILLEGAL, | |
| 203 Token::ILLEGAL, | |
| 204 Token::ILLEGAL, | |
| 205 Token::ILLEGAL, | |
| 206 Token::ILLEGAL, | |
| 207 Token::ILLEGAL, | |
| 208 Token::ILLEGAL, | |
| 209 Token::ILLEGAL, | |
| 210 Token::ILLEGAL, | |
| 211 Token::ILLEGAL, | |
| 212 Token::ILLEGAL, | |
| 213 Token::ILLEGAL, | |
| 214 Token::ILLEGAL, | |
| 215 Token::ILLEGAL, | |
| 216 Token::ILLEGAL, | |
| 217 Token::ILLEGAL, | |
| 218 Token::ILLEGAL, | |
| 219 Token::ILLEGAL, | |
| 220 Token::ILLEGAL, | |
| 221 Token::ILLEGAL, | |
| 222 Token::LBRACE, // 0x7b | |
| 223 Token::ILLEGAL, | |
| 224 Token::RBRACE, // 0x7d | |
| 225 Token::BIT_NOT, // 0x7e | |
| 226 Token::ILLEGAL | |
| 227 }; | |
| 228 | |
| 229 | |
| 230 Token::Value JavaScriptScanner::Next() { | |
| 231 current_ = next_; | |
| 232 has_line_terminator_before_next_ = false; | |
| 233 has_multiline_comment_before_next_ = false; | |
| 234 if (static_cast<unsigned>(c0_) <= 0x7f) { | |
| 235 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); | |
| 236 if (token != Token::ILLEGAL) { | |
| 237 int pos = source_pos(); | |
| 238 next_.token = token; | |
| 239 next_.location.beg_pos = pos; | |
| 240 next_.location.end_pos = pos + 1; | |
| 241 Advance(); | |
| 242 return current_.token; | |
| 243 } | |
| 244 } | |
| 245 Scan(); | |
| 246 return current_.token; | |
| 247 } | |
| 248 | |
| 249 | |
| 250 static inline bool IsByteOrderMark(uc32 c) { | |
| 251 // The Unicode value U+FFFE is guaranteed never to be assigned as a | |
| 252 // Unicode character; this implies that in a Unicode context the | |
| 253 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF | |
| 254 // character expressed in little-endian byte order (since it could | |
| 255 // not be a U+FFFE character expressed in big-endian byte | |
| 256 // order). Nevertheless, we check for it to be compatible with | |
| 257 // Spidermonkey. | |
| 258 return c == 0xFEFF || c == 0xFFFE; | |
| 259 } | |
| 260 | |
| 261 | |
| 262 bool JavaScriptScanner::SkipWhiteSpace() { | |
| 263 int start_position = source_pos(); | |
| 264 | |
| 265 while (true) { | |
| 266 // We treat byte-order marks (BOMs) as whitespace for better | |
| 267 // compatibility with Spidermonkey and other JavaScript engines. | |
| 268 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { | |
| 269 // IsWhiteSpace() includes line terminators! | |
| 270 if (unicode_cache_->IsLineTerminator(c0_)) { | |
| 271 // Ignore line terminators, but remember them. This is necessary | |
| 272 // for automatic semicolon insertion. | |
| 273 has_line_terminator_before_next_ = true; | |
| 274 } | |
| 275 Advance(); | |
| 276 } | |
| 277 | |
| 278 // If there is an HTML comment end '-->' at the beginning of a | |
| 279 // line (with only whitespace in front of it), we treat the rest | |
| 280 // of the line as a comment. This is in line with the way | |
| 281 // SpiderMonkey handles it. | |
| 282 if (c0_ == '-' && has_line_terminator_before_next_) { | |
| 283 Advance(); | |
| 284 if (c0_ == '-') { | |
| 285 Advance(); | |
| 286 if (c0_ == '>') { | |
| 287 // Treat the rest of the line as a comment. | |
| 288 SkipSingleLineComment(); | |
| 289 // Continue skipping white space after the comment. | |
| 290 continue; | |
| 291 } | |
| 292 PushBack('-'); // undo Advance() | |
| 293 } | |
| 294 PushBack('-'); // undo Advance() | |
| 295 } | |
| 296 // Return whether or not we skipped any characters. | |
| 297 return source_pos() != start_position; | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 | |
| 302 Token::Value JavaScriptScanner::SkipSingleLineComment() { | |
| 303 Advance(); | |
| 304 | |
| 305 // The line terminator at the end of the line is not considered | |
| 306 // to be part of the single-line comment; it is recognized | |
| 307 // separately by the lexical grammar and becomes part of the | |
| 308 // stream of input elements for the syntactic grammar (see | |
| 309 // ECMA-262, section 7.4). | |
| 310 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { | |
| 311 Advance(); | |
| 312 } | |
| 313 | |
| 314 return Token::WHITESPACE; | |
| 315 } | |
| 316 | |
| 317 | |
| 318 Token::Value JavaScriptScanner::SkipMultiLineComment() { | |
| 319 ASSERT(c0_ == '*'); | |
| 320 Advance(); | |
| 321 | |
| 322 while (c0_ >= 0) { | |
| 323 uc32 ch = c0_; | |
| 324 Advance(); | |
| 325 if (unicode_cache_->IsLineTerminator(ch)) { | |
| 326 // Following ECMA-262, section 7.4, a comment containing | |
| 327 // a newline will make the comment count as a line-terminator. | |
| 328 has_multiline_comment_before_next_ = true; | |
| 329 } | |
| 330 // If we have reached the end of the multi-line comment, we | |
| 331 // consume the '/' and insert a whitespace. This way all | |
| 332 // multi-line comments are treated as whitespace. | |
| 333 if (ch == '*' && c0_ == '/') { | |
| 334 c0_ = ' '; | |
| 335 return Token::WHITESPACE; | |
| 336 } | |
| 337 } | |
| 338 | |
| 339 // Unterminated multi-line comment. | |
| 340 return Token::ILLEGAL; | |
| 341 } | |
| 342 | |
| 343 | |
| 344 Token::Value JavaScriptScanner::ScanHtmlComment() { | |
| 345 // Check for <!-- comments. | |
| 346 ASSERT(c0_ == '!'); | |
| 347 Advance(); | |
| 348 if (c0_ == '-') { | |
| 349 Advance(); | |
| 350 if (c0_ == '-') return SkipSingleLineComment(); | |
| 351 PushBack('-'); // undo Advance() | |
| 352 } | |
| 353 PushBack('!'); // undo Advance() | |
| 354 ASSERT(c0_ == '!'); | |
| 355 return Token::LT; | |
| 356 } | |
| 357 | |
| 358 | |
// Scans the next significant token from the current character and stores
// it in next_ (token value, begin position and end position).
//
// The function loops while the recognized item is Token::WHITESPACE, so
// blanks, newlines and comments never reach the caller. beg_pos is
// re-recorded on every iteration and therefore ends up at the start of
// the token that is finally returned. next_.literal_chars is reset to
// NULL on entry.
void JavaScriptScanner::Scan() {
  next_.literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        // Remembered for the parser (see SkipWhiteSpace(): needed for
        // automatic semicolon insertion).
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          // Yields Token::LT (with c0_ still on '!') when this is not
          // actually an HTML comment start.
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == ===
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          token = SkipSingleLineComment();
        } else if (c0_ == '*') {
          // Yields Token::ILLEGAL for an unterminated comment.
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          // The '.' is already consumed; ScanNumber(true) re-adds it to
          // the literal.
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      default:
        // The order of these checks matters: identifiers, then numbers,
        // then remaining whitespace, then end of input (c0_ < 0); any
        // other character is consumed as an illegal token.
        if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = Token::EOS;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
| 597 | |
| 598 | |
| 599 void JavaScriptScanner::SeekForward(int pos) { | |
| 600 // After this call, we will have the token at the given position as | |
| 601 // the "next" token. The "current" token will be invalid. | |
| 602 if (pos == next_.location.beg_pos) return; | |
| 603 int current_pos = source_pos(); | |
| 604 ASSERT_EQ(next_.location.end_pos, current_pos); | |
| 605 // Positions inside the lookahead token aren't supported. | |
| 606 ASSERT(pos >= current_pos); | |
| 607 if (pos != current_pos) { | |
| 608 source_->SeekForward(pos - source_->pos()); | |
| 609 Advance(); | |
| 610 // This function is only called to seek to the location | |
| 611 // of the end of a function (at the "}" token). It doesn't matter | |
| 612 // whether there was a line terminator in the part we skip. | |
| 613 has_line_terminator_before_next_ = false; | |
| 614 has_multiline_comment_before_next_ = false; | |
| 615 } | |
| 616 Scan(); | |
| 617 } | |
| 618 | |
| 619 | |
| 620 void JavaScriptScanner::ScanEscape() { | |
| 621 uc32 c = c0_; | |
| 622 Advance(); | |
| 623 | |
| 624 // Skip escaped newlines. | |
| 625 if (unicode_cache_->IsLineTerminator(c)) { | |
| 626 // Allow CR+LF newlines in multiline string literals. | |
| 627 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); | |
| 628 // Allow LF+CR newlines in multiline string literals. | |
| 629 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); | |
| 630 return; | |
| 631 } | |
| 632 | |
| 633 switch (c) { | |
| 634 case '\'': // fall through | |
| 635 case '"' : // fall through | |
| 636 case '\\': break; | |
| 637 case 'b' : c = '\b'; break; | |
| 638 case 'f' : c = '\f'; break; | |
| 639 case 'n' : c = '\n'; break; | |
| 640 case 'r' : c = '\r'; break; | |
| 641 case 't' : c = '\t'; break; | |
| 642 case 'u' : { | |
| 643 c = ScanHexNumber(4); | |
| 644 if (c < 0) c = 'u'; | |
| 645 break; | |
| 646 } | |
| 647 case 'v' : c = '\v'; break; | |
| 648 case 'x' : { | |
| 649 c = ScanHexNumber(2); | |
| 650 if (c < 0) c = 'x'; | |
| 651 break; | |
| 652 } | |
| 653 case '0' : // fall through | |
| 654 case '1' : // fall through | |
| 655 case '2' : // fall through | |
| 656 case '3' : // fall through | |
| 657 case '4' : // fall through | |
| 658 case '5' : // fall through | |
| 659 case '6' : // fall through | |
| 660 case '7' : c = ScanOctalEscape(c, 2); break; | |
| 661 } | |
| 662 | |
| 663 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these | |
| 664 // should be illegal, but they are commonly handled | |
| 665 // as non-escaped characters by JS VMs. | |
| 666 AddLiteralChar(c); | |
| 667 } | |
| 668 | |
| 669 | |
| 670 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of | |
| 671 // ECMA-262. Other JS VMs support them. | |
| 672 uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) { | |
| 673 uc32 x = c - '0'; | |
| 674 int i = 0; | |
| 675 for (; i < length; i++) { | |
| 676 int d = c0_ - '0'; | |
| 677 if (d < 0 || d > 7) break; | |
| 678 int nx = x * 8 + d; | |
| 679 if (nx >= 256) break; | |
| 680 x = nx; | |
| 681 Advance(); | |
| 682 } | |
| 683 // Anything except '\0' is an octal escape sequence, illegal in strict mode. | |
| 684 // Remember the position of octal escape sequences so that an error | |
| 685 // can be reported later (in strict mode). | |
| 686 // We don't report the error immediately, because the octal escape can | |
| 687 // occur before the "use strict" directive. | |
| 688 if (c != '0' || i > 0) { | |
| 689 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); | |
| 690 } | |
| 691 return x; | |
| 692 } | |
| 693 | |
| 694 | |
| 695 Token::Value JavaScriptScanner::ScanString() { | |
| 696 uc32 quote = c0_; | |
| 697 Advance(); // consume quote | |
| 698 | |
| 699 LiteralScope literal(this); | |
| 700 while (c0_ != quote && c0_ >= 0 | |
| 701 && !unicode_cache_->IsLineTerminator(c0_)) { | |
| 702 uc32 c = c0_; | |
| 703 Advance(); | |
| 704 if (c == '\\') { | |
| 705 if (c0_ < 0) return Token::ILLEGAL; | |
| 706 ScanEscape(); | |
| 707 } else { | |
| 708 AddLiteralChar(c); | |
| 709 } | |
| 710 } | |
| 711 if (c0_ != quote) return Token::ILLEGAL; | |
| 712 literal.Complete(); | |
| 713 | |
| 714 Advance(); // consume quote | |
| 715 return Token::STRING; | |
| 716 } | |
| 717 | |
| 718 | |
| 719 void JavaScriptScanner::ScanDecimalDigits() { | |
| 720 while (IsDecimalDigit(c0_)) | |
| 721 AddLiteralCharAdvance(); | |
| 722 } | |
| 723 | |
| 724 | |
// Scans a numeric literal and collects its text in the current literal.
//
// |seen_period| is true when the caller has already consumed a leading
// '.', in which case c0_ is the first fraction digit; otherwise c0_ is
// the first digit of the number.
//
// Returns Token::NUMBER on success and Token::ILLEGAL for malformed
// input: "0x" without a hex digit, an exponent on an octal literal, an
// exponent without digits, or a literal immediately followed by a
// decimal digit or an identifier-start character.
//
// Side effect: when a legacy octal literal (leading '0' followed only by
// digits 0-7) is recognized, its source range is stored in octal_pos_.
Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;

  LiteralScope literal(this);
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      int start_pos = source_pos();  // For reporting octal positions.
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = OCTAL;
        while (true) {
          // An '8' or '9' anywhere in the digit run means the literal is
          // decimal after all (e.g. "078"); the remaining digits are
          // picked up by the DECIMAL branch below.
          if (c0_ == '8' || c0_ == '9') {
            kind = DECIMAL;
            break;
          }
          if (c0_ < '0' || '7' < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
    return Token::ILLEGAL;

  literal.Complete();

  return Token::NUMBER;
}
| 808 | |
| 809 | |
| 810 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { | |
| 811 Advance(); | |
| 812 if (c0_ != 'u') return -1; | |
| 813 Advance(); | |
| 814 uc32 result = ScanHexNumber(4); | |
| 815 if (result < 0) PushBack('u'); | |
| 816 return result; | |
| 817 } | |
| 818 | |
| 819 | |
| 820 // ---------------------------------------------------------------------------- | |
| 821 // Keyword Matcher | |
| 822 | |
// X-macro listing every reserved word together with the token it maps to.
//
// The caller supplies two macros:
//   KEYWORD_GROUP(ch)       - invoked once before each run of keywords
//                             that share the first character |ch|.
//   KEYWORD(keyword, token) - invoked once per keyword.
// Keywords are grouped by first character, and the groups appear in
// ascending character order.
//
// The "let" entry refers to an identifier named harmony_block_scoping,
// so that name must be in scope wherever KEYWORDS is expanded. The
// longest keywords listed ("instanceof", "implements") have 10
// characters and the shortest ("do", "if", "in") have 2; the expansion
// in KeywordOrIdentifierToken statically asserts these bounds.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
  KEYWORD_GROUP('b')                                                \
  KEYWORD("break", Token::BREAK)                                    \
  KEYWORD_GROUP('c')                                                \
  KEYWORD("case", Token::CASE)                                      \
  KEYWORD("catch", Token::CATCH)                                    \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("const", Token::CONST)                                    \
  KEYWORD("continue", Token::CONTINUE)                              \
  KEYWORD_GROUP('d')                                                \
  KEYWORD("debugger", Token::DEBUGGER)                              \
  KEYWORD("default", Token::DEFAULT)                                \
  KEYWORD("delete", Token::DELETE)                                  \
  KEYWORD("do", Token::DO)                                          \
  KEYWORD_GROUP('e')                                                \
  KEYWORD("else", Token::ELSE)                                      \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  KEYWORD("export", Token::FUTURE_RESERVED_WORD)                    \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  KEYWORD_GROUP('f')                                                \
  KEYWORD("false", Token::FALSE_LITERAL)                            \
  KEYWORD("finally", Token::FINALLY)                                \
  KEYWORD("for", Token::FOR)                                        \
  KEYWORD("function", Token::FUNCTION)                              \
  KEYWORD_GROUP('i')                                                \
  KEYWORD("if", Token::IF)                                          \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  KEYWORD("import", Token::FUTURE_RESERVED_WORD)                    \
  KEYWORD("in", Token::IN)                                          \
  KEYWORD("instanceof", Token::INSTANCEOF)                          \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD_GROUP('l')                                                \
  KEYWORD("let", harmony_block_scoping                              \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n')                                                \
  KEYWORD("new", Token::NEW)                                        \
  KEYWORD("null", Token::NULL_LITERAL)                              \
  KEYWORD_GROUP('p')                                                \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD_GROUP('r')                                                \
  KEYWORD("return", Token::RETURN)                                  \
  KEYWORD_GROUP('s')                                                \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("switch", Token::SWITCH)                                  \
  KEYWORD_GROUP('t')                                                \
  KEYWORD("this", Token::THIS)                                      \
  KEYWORD("throw", Token::THROW)                                    \
  KEYWORD("true", Token::TRUE_LITERAL)                              \
  KEYWORD("try", Token::TRY)                                        \
  KEYWORD("typeof", Token::TYPEOF)                                  \
  KEYWORD_GROUP('v')                                                \
  KEYWORD("var", Token::VAR)                                        \
  KEYWORD("void", Token::VOID)                                      \
  KEYWORD_GROUP('w')                                                \
  KEYWORD("while", Token::WHILE)                                    \
  KEYWORD("with", Token::WITH)                                      \
  KEYWORD_GROUP('y')                                                \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
| 885 | |
| 886 | |
// Classifies an identifier spelled with one-byte characters.
//
// |input| / |input_length| give the identifier text; no terminating NUL
// is read, since every index used is below |input_length|.
// |harmony_block_scoping| selects the token produced for "let" (see the
// KEYWORDS list).
//
// Returns the keyword's token when |input| matches an entry of KEYWORDS
// exactly, and Token::IDENTIFIER otherwise.
//
// The switch body is generated from KEYWORDS: each KEYWORD_GROUP becomes
// "break; case ch:", which closes the preceding group and opens the next,
// and each KEYWORD becomes a length check followed by an unrolled
// character comparison starting at index 1 (index 0 is already matched by
// the case label). A failed comparison falls through to the next keyword
// in the same group.
static Token::Value KeywordOrIdentifierToken(const char* input,
                                             int input_length,
                                             bool harmony_block_scoping) {
  ASSERT(input_length >= 1);
  const int kMinLength = 2;
  const int kMaxLength = 10;
  // Cheap rejection: no keyword is shorter or longer than these bounds
  // (enforced per keyword by the STATIC_ASSERTs below).
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                               \
    {                                                         \
      /* 'keyword' is a char array, so sizeof(keyword) is */  \
      /* strlen(keyword) plus 1 for the NUL char. */          \
      const int keyword_length = sizeof(keyword) - 1;         \
      STATIC_ASSERT(keyword_length >= kMinLength);            \
      STATIC_ASSERT(keyword_length <= kMaxLength);            \
      if (input_length == keyword_length &&                   \
          input[1] == keyword[1] &&                           \
          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
          (keyword_length <= 9 || input[9] == keyword[9])) {  \
        return token;                                         \
      }                                                       \
    }
    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
  // Either no keyword starts with input[0], or none in that group matched.
  return Token::IDENTIFIER;
}
| 925 | |
| 926 | |
| 927 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { | |
| 928 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); | |
| 929 LiteralScope literal(this); | |
| 930 // Scan identifier start character. | |
| 931 if (c0_ == '\\') { | |
| 932 uc32 c = ScanIdentifierUnicodeEscape(); | |
| 933 // Only allow legal identifier start characters. | |
| 934 if (c < 0 || | |
| 935 c == '\\' || // No recursive escapes. | |
| 936 !unicode_cache_->IsIdentifierStart(c)) { | |
| 937 return Token::ILLEGAL; | |
| 938 } | |
| 939 AddLiteralChar(c); | |
| 940 return ScanIdentifierSuffix(&literal); | |
| 941 } | |
| 942 | |
| 943 uc32 first_char = c0_; | |
| 944 Advance(); | |
| 945 AddLiteralChar(first_char); | |
| 946 | |
| 947 // Scan the rest of the identifier characters. | |
| 948 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
| 949 if (c0_ != '\\') { | |
| 950 uc32 next_char = c0_; | |
| 951 Advance(); | |
| 952 AddLiteralChar(next_char); | |
| 953 continue; | |
| 954 } | |
| 955 // Fallthrough if no longer able to complete keyword. | |
| 956 return ScanIdentifierSuffix(&literal); | |
| 957 } | |
| 958 | |
| 959 literal.Complete(); | |
| 960 | |
| 961 if (next_.literal_chars->is_ascii()) { | |
| 962 Vector<const char> chars = next_.literal_chars->ascii_literal(); | |
| 963 return KeywordOrIdentifierToken(chars.start(), | |
| 964 chars.length(), | |
| 965 harmony_block_scoping_); | |
| 966 } | |
| 967 | |
| 968 return Token::IDENTIFIER; | |
| 969 } | |
| 970 | |
| 971 | |
| 972 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { | |
| 973 // Scan the rest of the identifier characters. | |
| 974 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
| 975 if (c0_ == '\\') { | |
| 976 uc32 c = ScanIdentifierUnicodeEscape(); | |
| 977 // Only allow legal identifier part characters. | |
| 978 if (c < 0 || | |
| 979 c == '\\' || | |
| 980 !unicode_cache_->IsIdentifierPart(c)) { | |
| 981 return Token::ILLEGAL; | |
| 982 } | |
| 983 AddLiteralChar(c); | |
| 984 } else { | |
| 985 AddLiteralChar(c0_); | |
| 986 Advance(); | |
| 987 } | |
| 988 } | |
| 989 literal->Complete(); | |
| 990 | |
| 991 return Token::IDENTIFIER; | |
| 992 } | |
| 993 | |
| 994 | |
| 995 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { | |
| 996 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags | |
| 997 bool in_character_class = false; | |
| 998 | |
| 999 // Previous token is either '/' or '/=', in the second case, the | |
| 1000 // pattern starts at =. | |
| 1001 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); | |
| 1002 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); | |
| 1003 | |
| 1004 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, | |
| 1005 // the scanner should pass uninterpreted bodies to the RegExp | |
| 1006 // constructor. | |
| 1007 LiteralScope literal(this); | |
| 1008 if (seen_equal) { | |
| 1009 AddLiteralChar('='); | |
| 1010 } | |
| 1011 | |
| 1012 while (c0_ != '/' || in_character_class) { | |
| 1013 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; | |
| 1014 if (c0_ == '\\') { // Escape sequence. | |
| 1015 AddLiteralCharAdvance(); | |
| 1016 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; | |
| 1017 AddLiteralCharAdvance(); | |
| 1018 // If the escape allows more characters, i.e., \x??, \u????, or \c?, | |
| 1019 // only "safe" characters are allowed (letters, digits, underscore), | |
| 1020 // otherwise the escape isn't valid and the invalid character has | |
| 1021 // its normal meaning. I.e., we can just continue scanning without | |
| 1022 // worrying whether the following characters are part of the escape | |
| 1023 // or not, since any '/', '\\' or '[' is guaranteed to not be part | |
| 1024 // of the escape sequence. | |
| 1025 | |
| 1026 // TODO(896): At some point, parse RegExps more throughly to capture | |
| 1027 // octal esacpes in strict mode. | |
| 1028 } else { // Unescaped character. | |
| 1029 if (c0_ == '[') in_character_class = true; | |
| 1030 if (c0_ == ']') in_character_class = false; | |
| 1031 AddLiteralCharAdvance(); | |
| 1032 } | |
| 1033 } | |
| 1034 Advance(); // consume '/' | |
| 1035 | |
| 1036 literal.Complete(); | |
| 1037 | |
| 1038 return true; | |
| 1039 } | |
| 1040 | |
| 1041 | |
| 1042 bool JavaScriptScanner::ScanLiteralUnicodeEscape() { | |
| 1043 ASSERT(c0_ == '\\'); | |
| 1044 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; | |
| 1045 Advance(); | |
| 1046 int i = 1; | |
| 1047 if (c0_ == 'u') { | |
| 1048 i++; | |
| 1049 while (i < 6) { | |
| 1050 Advance(); | |
| 1051 if (!IsHexDigit(c0_)) break; | |
| 1052 chars_read[i] = c0_; | |
| 1053 i++; | |
| 1054 } | |
| 1055 } | |
| 1056 if (i < 6) { | |
| 1057 // Incomplete escape. Undo all advances and return false. | |
| 1058 while (i > 0) { | |
| 1059 i--; | |
| 1060 PushBack(chars_read[i]); | |
| 1061 } | |
| 1062 return false; | |
| 1063 } | |
| 1064 // Complete escape. Add all chars to current literal buffer. | |
| 1065 for (int i = 0; i < 6; i++) { | |
| 1066 AddLiteralChar(chars_read[i]); | |
| 1067 } | |
| 1068 return true; | |
| 1069 } | |
| 1070 | |
| 1071 | |
| 1072 bool JavaScriptScanner::ScanRegExpFlags() { | |
| 1073 // Scan regular expression flags. | |
| 1074 LiteralScope literal(this); | |
| 1075 while (unicode_cache_->IsIdentifierPart(c0_)) { | |
| 1076 if (c0_ != '\\') { | |
| 1077 AddLiteralCharAdvance(); | |
| 1078 } else { | |
| 1079 if (!ScanLiteralUnicodeEscape()) { | |
| 1080 break; | |
| 1081 } | |
| 1082 } | |
| 1083 } | |
| 1084 literal.Complete(); | |
| 1085 | |
| 1086 next_.location.end_pos = source_pos() - 1; | |
| 1087 return true; | |
| 1088 } | |
| 1089 | |
| 1090 } } // namespace v8::internal | |
| OLD | NEW |