OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
11 // with the distribution. | 11 // with the distribution. |
12 // * Neither the name of Google Inc. nor the names of its | 12 // * Neither the name of Google Inc. nor the names of its |
13 // contributors may be used to endorse or promote products derived | 13 // contributors may be used to endorse or promote products derived |
14 // from this software without specific prior written permission. | 14 // from this software without specific prior written permission. |
15 // | 15 // |
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | 27 |
28 #include "v8.h" | 28 // Features shared by parsing and pre-parsing scanners. |
29 | 29 |
30 #include "ast.h" | |
31 #include "handles.h" | |
32 #include "scanner.h" | 30 #include "scanner.h" |
33 #include "unicode-inl.h" | 31 |
| 32 #include "../include/v8stdint.h" |
| 33 #include "char-predicates-inl.h" |
34 | 34 |
35 namespace v8 { | 35 namespace v8 { |
36 namespace internal { | 36 namespace internal { |
37 | 37 |
38 // ---------------------------------------------------------------------------- | 38 // ---------------------------------------------------------------------------- |
39 // BufferedUC16CharacterStreams | 39 // Scanner |
40 | 40 |
41 BufferedUC16CharacterStream::BufferedUC16CharacterStream() | 41 Scanner::Scanner(UnicodeCache* unicode_cache) |
42 : UC16CharacterStream(), | 42 : unicode_cache_(unicode_cache) { } |
43 pushback_limit_(NULL) { | 43 |
44 // Initialize buffer as being empty. First read will fill the buffer. | 44 |
45 buffer_cursor_ = buffer_; | 45 uc32 Scanner::ScanHexNumber(int expected_length) { |
46 buffer_end_ = buffer_; | 46 ASSERT(expected_length <= 4); // prevent overflow |
47 } | 47 |
48 | 48 uc32 digits[4] = { 0, 0, 0, 0 }; |
49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { } | 49 uc32 x = 0; |
50 | 50 for (int i = 0; i < expected_length; i++) { |
51 void BufferedUC16CharacterStream::PushBack(uc32 character) { | 51 digits[i] = c0_; |
52 if (character == kEndOfInput) { | 52 int d = HexValue(c0_); |
53 pos_--; | 53 if (d < 0) { |
| 54 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes |
| 55 // should be illegal, but other JS VMs just return the |
| 56 // non-escaped version of the original character. |
| 57 |
| 58 // Push back digits that we have advanced past. |
| 59 for (int j = i-1; j >= 0; j--) { |
| 60 PushBack(digits[j]); |
| 61 } |
| 62 return -1; |
| 63 } |
| 64 x = x * 16 + d; |
| 65 Advance(); |
| 66 } |
| 67 |
| 68 return x; |
| 69 } |
| 70 |
| 71 |
| 72 |
| 73 // ---------------------------------------------------------------------------- |
| 74 // JavaScriptScanner |
| 75 |
| 76 JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants) |
| 77 : Scanner(scanner_contants), |
| 78 octal_pos_(Location::invalid()), |
| 79 harmony_block_scoping_(false) { } |
| 80 |
| 81 |
| 82 void JavaScriptScanner::Initialize(UC16CharacterStream* source) { |
| 83 source_ = source; |
| 84 // Need to capture identifiers in order to recognize "get" and "set" |
| 85 // in object literals. |
| 86 Init(); |
| 87 // Skip initial whitespace allowing HTML comment ends just like |
| 88 // after a newline and scan first token. |
| 89 has_line_terminator_before_next_ = true; |
| 90 SkipWhiteSpace(); |
| 91 Scan(); |
| 92 } |
| 93 |
| 94 |
| 95 // Ensure that tokens can be stored in a byte. |
| 96 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); |
| 97 |
| 98 // Table of one-character tokens, by character (0x00..0x7f only). |
| 99 static const byte one_char_tokens[] = { |
| 100 Token::ILLEGAL, |
| 101 Token::ILLEGAL, |
| 102 Token::ILLEGAL, |
| 103 Token::ILLEGAL, |
| 104 Token::ILLEGAL, |
| 105 Token::ILLEGAL, |
| 106 Token::ILLEGAL, |
| 107 Token::ILLEGAL, |
| 108 Token::ILLEGAL, |
| 109 Token::ILLEGAL, |
| 110 Token::ILLEGAL, |
| 111 Token::ILLEGAL, |
| 112 Token::ILLEGAL, |
| 113 Token::ILLEGAL, |
| 114 Token::ILLEGAL, |
| 115 Token::ILLEGAL, |
| 116 Token::ILLEGAL, |
| 117 Token::ILLEGAL, |
| 118 Token::ILLEGAL, |
| 119 Token::ILLEGAL, |
| 120 Token::ILLEGAL, |
| 121 Token::ILLEGAL, |
| 122 Token::ILLEGAL, |
| 123 Token::ILLEGAL, |
| 124 Token::ILLEGAL, |
| 125 Token::ILLEGAL, |
| 126 Token::ILLEGAL, |
| 127 Token::ILLEGAL, |
| 128 Token::ILLEGAL, |
| 129 Token::ILLEGAL, |
| 130 Token::ILLEGAL, |
| 131 Token::ILLEGAL, |
| 132 Token::ILLEGAL, |
| 133 Token::ILLEGAL, |
| 134 Token::ILLEGAL, |
| 135 Token::ILLEGAL, |
| 136 Token::ILLEGAL, |
| 137 Token::ILLEGAL, |
| 138 Token::ILLEGAL, |
| 139 Token::ILLEGAL, |
| 140 Token::LPAREN, // 0x28 |
| 141 Token::RPAREN, // 0x29 |
| 142 Token::ILLEGAL, |
| 143 Token::ILLEGAL, |
| 144 Token::COMMA, // 0x2c |
| 145 Token::ILLEGAL, |
| 146 Token::ILLEGAL, |
| 147 Token::ILLEGAL, |
| 148 Token::ILLEGAL, |
| 149 Token::ILLEGAL, |
| 150 Token::ILLEGAL, |
| 151 Token::ILLEGAL, |
| 152 Token::ILLEGAL, |
| 153 Token::ILLEGAL, |
| 154 Token::ILLEGAL, |
| 155 Token::ILLEGAL, |
| 156 Token::ILLEGAL, |
| 157 Token::ILLEGAL, |
| 158 Token::COLON, // 0x3a |
| 159 Token::SEMICOLON, // 0x3b |
| 160 Token::ILLEGAL, |
| 161 Token::ILLEGAL, |
| 162 Token::ILLEGAL, |
| 163 Token::CONDITIONAL, // 0x3f |
| 164 Token::ILLEGAL, |
| 165 Token::ILLEGAL, |
| 166 Token::ILLEGAL, |
| 167 Token::ILLEGAL, |
| 168 Token::ILLEGAL, |
| 169 Token::ILLEGAL, |
| 170 Token::ILLEGAL, |
| 171 Token::ILLEGAL, |
| 172 Token::ILLEGAL, |
| 173 Token::ILLEGAL, |
| 174 Token::ILLEGAL, |
| 175 Token::ILLEGAL, |
| 176 Token::ILLEGAL, |
| 177 Token::ILLEGAL, |
| 178 Token::ILLEGAL, |
| 179 Token::ILLEGAL, |
| 180 Token::ILLEGAL, |
| 181 Token::ILLEGAL, |
| 182 Token::ILLEGAL, |
| 183 Token::ILLEGAL, |
| 184 Token::ILLEGAL, |
| 185 Token::ILLEGAL, |
| 186 Token::ILLEGAL, |
| 187 Token::ILLEGAL, |
| 188 Token::ILLEGAL, |
| 189 Token::ILLEGAL, |
| 190 Token::ILLEGAL, |
| 191 Token::LBRACK, // 0x5b |
| 192 Token::ILLEGAL, |
| 193 Token::RBRACK, // 0x5d |
| 194 Token::ILLEGAL, |
| 195 Token::ILLEGAL, |
| 196 Token::ILLEGAL, |
| 197 Token::ILLEGAL, |
| 198 Token::ILLEGAL, |
| 199 Token::ILLEGAL, |
| 200 Token::ILLEGAL, |
| 201 Token::ILLEGAL, |
| 202 Token::ILLEGAL, |
| 203 Token::ILLEGAL, |
| 204 Token::ILLEGAL, |
| 205 Token::ILLEGAL, |
| 206 Token::ILLEGAL, |
| 207 Token::ILLEGAL, |
| 208 Token::ILLEGAL, |
| 209 Token::ILLEGAL, |
| 210 Token::ILLEGAL, |
| 211 Token::ILLEGAL, |
| 212 Token::ILLEGAL, |
| 213 Token::ILLEGAL, |
| 214 Token::ILLEGAL, |
| 215 Token::ILLEGAL, |
| 216 Token::ILLEGAL, |
| 217 Token::ILLEGAL, |
| 218 Token::ILLEGAL, |
| 219 Token::ILLEGAL, |
| 220 Token::ILLEGAL, |
| 221 Token::ILLEGAL, |
| 222 Token::ILLEGAL, |
| 223 Token::LBRACE, // 0x7b |
| 224 Token::ILLEGAL, |
| 225 Token::RBRACE, // 0x7d |
| 226 Token::BIT_NOT, // 0x7e |
| 227 Token::ILLEGAL |
| 228 }; |
| 229 |
| 230 |
| 231 Token::Value JavaScriptScanner::Next() { |
| 232 current_ = next_; |
| 233 has_line_terminator_before_next_ = false; |
| 234 has_multiline_comment_before_next_ = false; |
| 235 if (static_cast<unsigned>(c0_) <= 0x7f) { |
| 236 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); |
| 237 if (token != Token::ILLEGAL) { |
| 238 int pos = source_pos(); |
| 239 next_.token = token; |
| 240 next_.location.beg_pos = pos; |
| 241 next_.location.end_pos = pos + 1; |
| 242 Advance(); |
| 243 return current_.token; |
| 244 } |
| 245 } |
| 246 Scan(); |
| 247 return current_.token; |
| 248 } |
| 249 |
| 250 |
| 251 static inline bool IsByteOrderMark(uc32 c) { |
| 252 // The Unicode value U+FFFE is guaranteed never to be assigned as a |
| 253 // Unicode character; this implies that in a Unicode context the |
| 254 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF |
| 255 // character expressed in little-endian byte order (since it could |
| 256 // not be a U+FFFE character expressed in big-endian byte |
| 257 // order). Nevertheless, we check for it to be compatible with |
| 258 // Spidermonkey. |
| 259 return c == 0xFEFF || c == 0xFFFE; |
| 260 } |
| 261 |
| 262 |
| 263 bool JavaScriptScanner::SkipWhiteSpace() { |
| 264 int start_position = source_pos(); |
| 265 |
| 266 while (true) { |
| 267 // We treat byte-order marks (BOMs) as whitespace for better |
| 268 // compatibility with Spidermonkey and other JavaScript engines. |
| 269 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { |
| 270 // IsWhiteSpace() includes line terminators! |
| 271 if (unicode_cache_->IsLineTerminator(c0_)) { |
| 272 // Ignore line terminators, but remember them. This is necessary |
| 273 // for automatic semicolon insertion. |
| 274 has_line_terminator_before_next_ = true; |
| 275 } |
| 276 Advance(); |
| 277 } |
| 278 |
| 279 // If there is an HTML comment end '-->' at the beginning of a |
| 280 // line (with only whitespace in front of it), we treat the rest |
| 281 // of the line as a comment. This is in line with the way |
| 282 // SpiderMonkey handles it. |
| 283 if (c0_ == '-' && has_line_terminator_before_next_) { |
| 284 Advance(); |
| 285 if (c0_ == '-') { |
| 286 Advance(); |
| 287 if (c0_ == '>') { |
| 288 // Treat the rest of the line as a comment. |
| 289 SkipSingleLineComment(); |
| 290 // Continue skipping white space after the comment. |
| 291 continue; |
| 292 } |
| 293 PushBack('-'); // undo Advance() |
| 294 } |
| 295 PushBack('-'); // undo Advance() |
| 296 } |
| 297 // Return whether or not we skipped any characters. |
| 298 return source_pos() != start_position; |
| 299 } |
| 300 } |
| 301 |
| 302 |
| 303 Token::Value JavaScriptScanner::SkipSingleLineComment() { |
| 304 Advance(); |
| 305 |
| 306 // The line terminator at the end of the line is not considered |
| 307 // to be part of the single-line comment; it is recognized |
| 308 // separately by the lexical grammar and becomes part of the |
| 309 // stream of input elements for the syntactic grammar (see |
| 310 // ECMA-262, section 7.4). |
| 311 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { |
| 312 Advance(); |
| 313 } |
| 314 |
| 315 return Token::WHITESPACE; |
| 316 } |
| 317 |
| 318 |
| 319 Token::Value JavaScriptScanner::SkipMultiLineComment() { |
| 320 ASSERT(c0_ == '*'); |
| 321 Advance(); |
| 322 |
| 323 while (c0_ >= 0) { |
| 324 uc32 ch = c0_; |
| 325 Advance(); |
| 326 if (unicode_cache_->IsLineTerminator(ch)) { |
| 327 // Following ECMA-262, section 7.4, a comment containing |
| 328 // a newline will make the comment count as a line-terminator. |
| 329 has_multiline_comment_before_next_ = true; |
| 330 } |
| 331 // If we have reached the end of the multi-line comment, we |
| 332 // consume the '/' and insert a whitespace. This way all |
| 333 // multi-line comments are treated as whitespace. |
| 334 if (ch == '*' && c0_ == '/') { |
| 335 c0_ = ' '; |
| 336 return Token::WHITESPACE; |
| 337 } |
| 338 } |
| 339 |
| 340 // Unterminated multi-line comment. |
| 341 return Token::ILLEGAL; |
| 342 } |
| 343 |
| 344 |
| 345 Token::Value JavaScriptScanner::ScanHtmlComment() { |
| 346 // Check for <!-- comments. |
| 347 ASSERT(c0_ == '!'); |
| 348 Advance(); |
| 349 if (c0_ == '-') { |
| 350 Advance(); |
| 351 if (c0_ == '-') return SkipSingleLineComment(); |
| 352 PushBack('-'); // undo Advance() |
| 353 } |
| 354 PushBack('!'); // undo Advance() |
| 355 ASSERT(c0_ == '!'); |
| 356 return Token::LT; |
| 357 } |
| 358 |
| 359 |
| 360 void JavaScriptScanner::Scan() { |
| 361 next_.literal_chars = NULL; |
| 362 Token::Value token; |
| 363 do { |
| 364 // Remember the position of the next token |
| 365 next_.location.beg_pos = source_pos(); |
| 366 |
| 367 switch (c0_) { |
| 368 case ' ': |
| 369 case '\t': |
| 370 Advance(); |
| 371 token = Token::WHITESPACE; |
| 372 break; |
| 373 |
| 374 case '\n': |
| 375 Advance(); |
| 376 has_line_terminator_before_next_ = true; |
| 377 token = Token::WHITESPACE; |
| 378 break; |
| 379 |
| 380 case '"': case '\'': |
| 381 token = ScanString(); |
| 382 break; |
| 383 |
| 384 case '<': |
| 385 // < <= << <<= <!-- |
| 386 Advance(); |
| 387 if (c0_ == '=') { |
| 388 token = Select(Token::LTE); |
| 389 } else if (c0_ == '<') { |
| 390 token = Select('=', Token::ASSIGN_SHL, Token::SHL); |
| 391 } else if (c0_ == '!') { |
| 392 token = ScanHtmlComment(); |
| 393 } else { |
| 394 token = Token::LT; |
| 395 } |
| 396 break; |
| 397 |
| 398 case '>': |
| 399 // > >= >> >>= >>> >>>= |
| 400 Advance(); |
| 401 if (c0_ == '=') { |
| 402 token = Select(Token::GTE); |
| 403 } else if (c0_ == '>') { |
| 404 // >> >>= >>> >>>= |
| 405 Advance(); |
| 406 if (c0_ == '=') { |
| 407 token = Select(Token::ASSIGN_SAR); |
| 408 } else if (c0_ == '>') { |
| 409 token = Select('=', Token::ASSIGN_SHR, Token::SHR); |
| 410 } else { |
| 411 token = Token::SAR; |
| 412 } |
| 413 } else { |
| 414 token = Token::GT; |
| 415 } |
| 416 break; |
| 417 |
| 418 case '=': |
| 419 // = == === |
| 420 Advance(); |
| 421 if (c0_ == '=') { |
| 422 token = Select('=', Token::EQ_STRICT, Token::EQ); |
| 423 } else { |
| 424 token = Token::ASSIGN; |
| 425 } |
| 426 break; |
| 427 |
| 428 case '!': |
| 429 // ! != !== |
| 430 Advance(); |
| 431 if (c0_ == '=') { |
| 432 token = Select('=', Token::NE_STRICT, Token::NE); |
| 433 } else { |
| 434 token = Token::NOT; |
| 435 } |
| 436 break; |
| 437 |
| 438 case '+': |
| 439 // + ++ += |
| 440 Advance(); |
| 441 if (c0_ == '+') { |
| 442 token = Select(Token::INC); |
| 443 } else if (c0_ == '=') { |
| 444 token = Select(Token::ASSIGN_ADD); |
| 445 } else { |
| 446 token = Token::ADD; |
| 447 } |
| 448 break; |
| 449 |
| 450 case '-': |
| 451 // - -- --> -= |
| 452 Advance(); |
| 453 if (c0_ == '-') { |
| 454 Advance(); |
| 455 if (c0_ == '>' && has_line_terminator_before_next_) { |
| 456 // For compatibility with SpiderMonkey, we skip lines that |
| 457 // start with an HTML comment end '-->'. |
| 458 token = SkipSingleLineComment(); |
| 459 } else { |
| 460 token = Token::DEC; |
| 461 } |
| 462 } else if (c0_ == '=') { |
| 463 token = Select(Token::ASSIGN_SUB); |
| 464 } else { |
| 465 token = Token::SUB; |
| 466 } |
| 467 break; |
| 468 |
| 469 case '*': |
| 470 // * *= |
| 471 token = Select('=', Token::ASSIGN_MUL, Token::MUL); |
| 472 break; |
| 473 |
| 474 case '%': |
| 475 // % %= |
| 476 token = Select('=', Token::ASSIGN_MOD, Token::MOD); |
| 477 break; |
| 478 |
| 479 case '/': |
| 480 // / // /* /= |
| 481 Advance(); |
| 482 if (c0_ == '/') { |
| 483 token = SkipSingleLineComment(); |
| 484 } else if (c0_ == '*') { |
| 485 token = SkipMultiLineComment(); |
| 486 } else if (c0_ == '=') { |
| 487 token = Select(Token::ASSIGN_DIV); |
| 488 } else { |
| 489 token = Token::DIV; |
| 490 } |
| 491 break; |
| 492 |
| 493 case '&': |
| 494 // & && &= |
| 495 Advance(); |
| 496 if (c0_ == '&') { |
| 497 token = Select(Token::AND); |
| 498 } else if (c0_ == '=') { |
| 499 token = Select(Token::ASSIGN_BIT_AND); |
| 500 } else { |
| 501 token = Token::BIT_AND; |
| 502 } |
| 503 break; |
| 504 |
| 505 case '|': |
| 506 // | || |= |
| 507 Advance(); |
| 508 if (c0_ == '|') { |
| 509 token = Select(Token::OR); |
| 510 } else if (c0_ == '=') { |
| 511 token = Select(Token::ASSIGN_BIT_OR); |
| 512 } else { |
| 513 token = Token::BIT_OR; |
| 514 } |
| 515 break; |
| 516 |
| 517 case '^': |
| 518 // ^ ^= |
| 519 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); |
| 520 break; |
| 521 |
| 522 case '.': |
| 523 // . Number |
| 524 Advance(); |
| 525 if (IsDecimalDigit(c0_)) { |
| 526 token = ScanNumber(true); |
| 527 } else { |
| 528 token = Token::PERIOD; |
| 529 } |
| 530 break; |
| 531 |
| 532 case ':': |
| 533 token = Select(Token::COLON); |
| 534 break; |
| 535 |
| 536 case ';': |
| 537 token = Select(Token::SEMICOLON); |
| 538 break; |
| 539 |
| 540 case ',': |
| 541 token = Select(Token::COMMA); |
| 542 break; |
| 543 |
| 544 case '(': |
| 545 token = Select(Token::LPAREN); |
| 546 break; |
| 547 |
| 548 case ')': |
| 549 token = Select(Token::RPAREN); |
| 550 break; |
| 551 |
| 552 case '[': |
| 553 token = Select(Token::LBRACK); |
| 554 break; |
| 555 |
| 556 case ']': |
| 557 token = Select(Token::RBRACK); |
| 558 break; |
| 559 |
| 560 case '{': |
| 561 token = Select(Token::LBRACE); |
| 562 break; |
| 563 |
| 564 case '}': |
| 565 token = Select(Token::RBRACE); |
| 566 break; |
| 567 |
| 568 case '?': |
| 569 token = Select(Token::CONDITIONAL); |
| 570 break; |
| 571 |
| 572 case '~': |
| 573 token = Select(Token::BIT_NOT); |
| 574 break; |
| 575 |
| 576 default: |
| 577 if (unicode_cache_->IsIdentifierStart(c0_)) { |
| 578 token = ScanIdentifierOrKeyword(); |
| 579 } else if (IsDecimalDigit(c0_)) { |
| 580 token = ScanNumber(false); |
| 581 } else if (SkipWhiteSpace()) { |
| 582 token = Token::WHITESPACE; |
| 583 } else if (c0_ < 0) { |
| 584 token = Token::EOS; |
| 585 } else { |
| 586 token = Select(Token::ILLEGAL); |
| 587 } |
| 588 break; |
| 589 } |
| 590 |
| 591 // Continue scanning for tokens as long as we're just skipping |
| 592 // whitespace. |
| 593 } while (token == Token::WHITESPACE); |
| 594 |
| 595 next_.location.end_pos = source_pos(); |
| 596 next_.token = token; |
| 597 } |
| 598 |
| 599 |
| 600 void JavaScriptScanner::SeekForward(int pos) { |
| 601 // After this call, we will have the token at the given position as |
| 602 // the "next" token. The "current" token will be invalid. |
| 603 if (pos == next_.location.beg_pos) return; |
| 604 int current_pos = source_pos(); |
| 605 ASSERT_EQ(next_.location.end_pos, current_pos); |
| 606 // Positions inside the lookahead token aren't supported. |
| 607 ASSERT(pos >= current_pos); |
| 608 if (pos != current_pos) { |
| 609 source_->SeekForward(pos - source_->pos()); |
| 610 Advance(); |
| 611 // This function is only called to seek to the location |
| 612 // of the end of a function (at the "}" token). It doesn't matter |
| 613 // whether there was a line terminator in the part we skip. |
| 614 has_line_terminator_before_next_ = false; |
| 615 has_multiline_comment_before_next_ = false; |
| 616 } |
| 617 Scan(); |
| 618 } |
| 619 |
| 620 |
| 621 void JavaScriptScanner::ScanEscape() { |
| 622 uc32 c = c0_; |
| 623 Advance(); |
| 624 |
| 625 // Skip escaped newlines. |
| 626 if (unicode_cache_->IsLineTerminator(c)) { |
| 627 // Allow CR+LF newlines in multiline string literals. |
| 628 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); |
| 629 // Allow LF+CR newlines in multiline string literals. |
| 630 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); |
54 return; | 631 return; |
55 } | 632 } |
56 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { | 633 |
57 // buffer_ is writable, buffer_cursor_ is const pointer. | 634 switch (c) { |
58 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); | 635 case '\'': // fall through |
59 pos_--; | 636 case '"' : // fall through |
60 return; | 637 case '\\': break; |
61 } | 638 case 'b' : c = '\b'; break; |
62 SlowPushBack(static_cast<uc16>(character)); | 639 case 'f' : c = '\f'; break; |
63 } | 640 case 'n' : c = '\n'; break; |
64 | 641 case 'r' : c = '\r'; break; |
65 | 642 case 't' : c = '\t'; break; |
66 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) { | 643 case 'u' : { |
67 // In pushback mode, the end of the buffer contains pushback, | 644 c = ScanHexNumber(4); |
68 // and the start of the buffer (from buffer start to pushback_limit_) | 645 if (c < 0) c = 'u'; |
69 // contains valid data that comes just after the pushback. | 646 break; |
70 // We NULL the pushback_limit_ if pushing all the way back to the | 647 } |
71 // start of the buffer. | 648 case 'v' : c = '\v'; break; |
72 | 649 case 'x' : { |
73 if (pushback_limit_ == NULL) { | 650 c = ScanHexNumber(2); |
74 // Enter pushback mode. | 651 if (c < 0) c = 'x'; |
75 pushback_limit_ = buffer_end_; | 652 break; |
76 buffer_end_ = buffer_ + kBufferSize; | 653 } |
77 buffer_cursor_ = buffer_end_; | 654 case '0' : // fall through |
78 } | 655 case '1' : // fall through |
79 // Ensure that there is room for at least one pushback. | 656 case '2' : // fall through |
80 ASSERT(buffer_cursor_ > buffer_); | 657 case '3' : // fall through |
81 ASSERT(pos_ > 0); | 658 case '4' : // fall through |
82 buffer_[--buffer_cursor_ - buffer_] = character; | 659 case '5' : // fall through |
83 if (buffer_cursor_ == buffer_) { | 660 case '6' : // fall through |
84 pushback_limit_ = NULL; | 661 case '7' : c = ScanOctalEscape(c, 2); break; |
85 } else if (buffer_cursor_ < pushback_limit_) { | 662 } |
86 pushback_limit_ = buffer_cursor_; | 663 |
87 } | 664 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these |
88 pos_--; | 665 // should be illegal, but they are commonly handled |
89 } | 666 // as non-escaped characters by JS VMs. |
90 | 667 AddLiteralChar(c); |
91 | 668 } |
92 bool BufferedUC16CharacterStream::ReadBlock() { | 669 |
93 buffer_cursor_ = buffer_; | 670 |
94 if (pushback_limit_ != NULL) { | 671 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of |
95 // Leave pushback mode. | 672 // ECMA-262. Other JS VMs support them. |
96 buffer_end_ = pushback_limit_; | 673 uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) { |
97 pushback_limit_ = NULL; | 674 uc32 x = c - '0'; |
98 // If there were any valid characters left at the | 675 int i = 0; |
99 // start of the buffer, use those. | 676 for (; i < length; i++) { |
100 if (buffer_cursor_ < buffer_end_) return true; | 677 int d = c0_ - '0'; |
101 // Otherwise read a new block. | 678 if (d < 0 || d > 7) break; |
102 } | 679 int nx = x * 8 + d; |
103 unsigned length = FillBuffer(pos_, kBufferSize); | 680 if (nx >= 256) break; |
104 buffer_end_ = buffer_ + length; | 681 x = nx; |
105 return length > 0; | 682 Advance(); |
106 } | 683 } |
107 | 684 // Anything except '\0' is an octal escape sequence, illegal in strict mode. |
108 | 685 // Remember the position of octal escape sequences so that an error |
109 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) { | 686 // can be reported later (in strict mode). |
110 // Leave pushback mode (i.e., ignore that there might be valid data | 687 // We don't report the error immediately, because the octal escape can |
111 // in the buffer before the pushback_limit_ point). | 688 // occur before the "use strict" directive. |
112 pushback_limit_ = NULL; | 689 if (c != '0' || i > 0) { |
113 return BufferSeekForward(delta); | 690 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); |
114 } | 691 } |
| 692 return x; |
| 693 } |
| 694 |
| 695 |
| 696 Token::Value JavaScriptScanner::ScanString() { |
| 697 uc32 quote = c0_; |
| 698 Advance(); // consume quote |
| 699 |
| 700 LiteralScope literal(this); |
| 701 while (c0_ != quote && c0_ >= 0 |
| 702 && !unicode_cache_->IsLineTerminator(c0_)) { |
| 703 uc32 c = c0_; |
| 704 Advance(); |
| 705 if (c == '\\') { |
| 706 if (c0_ < 0) return Token::ILLEGAL; |
| 707 ScanEscape(); |
| 708 } else { |
| 709 AddLiteralChar(c); |
| 710 } |
| 711 } |
| 712 if (c0_ != quote) return Token::ILLEGAL; |
| 713 literal.Complete(); |
| 714 |
| 715 Advance(); // consume quote |
| 716 return Token::STRING; |
| 717 } |
| 718 |
| 719 |
| 720 void JavaScriptScanner::ScanDecimalDigits() { |
| 721 while (IsDecimalDigit(c0_)) |
| 722 AddLiteralCharAdvance(); |
| 723 } |
| 724 |
| 725 |
| 726 Token::Value JavaScriptScanner::ScanNumber(bool seen_period) { |
| 727 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction |
| 728 |
| 729 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; |
| 730 |
| 731 LiteralScope literal(this); |
| 732 if (seen_period) { |
| 733 // we have already seen a decimal point of the float |
| 734 AddLiteralChar('.'); |
| 735 ScanDecimalDigits(); // we know we have at least one digit |
| 736 |
| 737 } else { |
| 738 // if the first character is '0' we must check for octals and hex |
| 739 if (c0_ == '0') { |
| 740 int start_pos = source_pos(); // For reporting octal positions. |
| 741 AddLiteralCharAdvance(); |
| 742 |
| 743 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number |
| 744 if (c0_ == 'x' || c0_ == 'X') { |
| 745 // hex number |
| 746 kind = HEX; |
| 747 AddLiteralCharAdvance(); |
| 748 if (!IsHexDigit(c0_)) { |
| 749 // we must have at least one hex digit after 'x'/'X' |
| 750 return Token::ILLEGAL; |
| 751 } |
| 752 while (IsHexDigit(c0_)) { |
| 753 AddLiteralCharAdvance(); |
| 754 } |
| 755 } else if ('0' <= c0_ && c0_ <= '7') { |
| 756 // (possible) octal number |
| 757 kind = OCTAL; |
| 758 while (true) { |
| 759 if (c0_ == '8' || c0_ == '9') { |
| 760 kind = DECIMAL; |
| 761 break; |
| 762 } |
| 763 if (c0_ < '0' || '7' < c0_) { |
| 764 // Octal literal finished. |
| 765 octal_pos_ = Location(start_pos, source_pos()); |
| 766 break; |
| 767 } |
| 768 AddLiteralCharAdvance(); |
| 769 } |
| 770 } |
| 771 } |
| 772 |
| 773 // Parse decimal digits and allow trailing fractional part. |
| 774 if (kind == DECIMAL) { |
| 775 ScanDecimalDigits(); // optional |
| 776 if (c0_ == '.') { |
| 777 AddLiteralCharAdvance(); |
| 778 ScanDecimalDigits(); // optional |
| 779 } |
| 780 } |
| 781 } |
| 782 |
| 783 // scan exponent, if any |
| 784 if (c0_ == 'e' || c0_ == 'E') { |
| 785 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number |
| 786 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed |
| 787 // scan exponent |
| 788 AddLiteralCharAdvance(); |
| 789 if (c0_ == '+' || c0_ == '-') |
| 790 AddLiteralCharAdvance(); |
| 791 if (!IsDecimalDigit(c0_)) { |
| 792 // we must have at least one decimal digit after 'e'/'E' |
| 793 return Token::ILLEGAL; |
| 794 } |
| 795 ScanDecimalDigits(); |
| 796 } |
| 797 |
| 798 // The source character immediately following a numeric literal must |
| 799 // not be an identifier start or a decimal digit; see ECMA-262 |
| 800 // section 7.8.3, page 17 (note that we read only one decimal digit |
| 801 // if the value is 0). |
| 802 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) |
| 803 return Token::ILLEGAL; |
| 804 |
| 805 literal.Complete(); |
| 806 |
| 807 return Token::NUMBER; |
| 808 } |
| 809 |
| 810 |
| 811 uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { |
| 812 Advance(); |
| 813 if (c0_ != 'u') return -1; |
| 814 Advance(); |
| 815 uc32 result = ScanHexNumber(4); |
| 816 if (result < 0) PushBack('u'); |
| 817 return result; |
| 818 } |
| 819 |
115 | 820 |
116 // ---------------------------------------------------------------------------- | 821 // ---------------------------------------------------------------------------- |
117 // GenericStringUC16CharacterStream | 822 // Keyword Matcher |
118 | 823 |
119 | 824 #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ |
120 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream( | 825 KEYWORD_GROUP('b') \ |
121 Handle<String> data, | 826 KEYWORD("break", Token::BREAK) \ |
122 unsigned start_position, | 827 KEYWORD_GROUP('c') \ |
123 unsigned end_position) | 828 KEYWORD("case", Token::CASE) \ |
124 : string_(data), | 829 KEYWORD("catch", Token::CATCH) \ |
125 length_(end_position) { | 830 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ |
126 ASSERT(end_position >= start_position); | 831 KEYWORD("const", Token::CONST) \ |
127 buffer_cursor_ = buffer_; | 832 KEYWORD("continue", Token::CONTINUE) \ |
128 buffer_end_ = buffer_; | 833 KEYWORD_GROUP('d') \ |
129 pos_ = start_position; | 834 KEYWORD("debugger", Token::DEBUGGER) \ |
130 } | 835 KEYWORD("default", Token::DEFAULT) \ |
131 | 836 KEYWORD("delete", Token::DELETE) \ |
132 | 837 KEYWORD("do", Token::DO) \ |
133 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { } | 838 KEYWORD_GROUP('e') \ |
134 | 839 KEYWORD("else", Token::ELSE) \ |
135 | 840 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ |
136 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) { | 841 KEYWORD("export", Token::FUTURE_RESERVED_WORD) \ |
137 unsigned old_pos = pos_; | 842 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ |
138 pos_ = Min(pos_ + delta, length_); | 843 KEYWORD_GROUP('f') \ |
139 ReadBlock(); | 844 KEYWORD("false", Token::FALSE_LITERAL) \ |
140 return pos_ - old_pos; | 845 KEYWORD("finally", Token::FINALLY) \ |
141 } | 846 KEYWORD("for", Token::FOR) \ |
142 | 847 KEYWORD("function", Token::FUNCTION) \ |
143 | 848 KEYWORD_GROUP('i') \ |
144 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos, | 849 KEYWORD("if", Token::IF) \ |
145 unsigned length) { | 850 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ |
146 if (from_pos >= length_) return 0; | 851 KEYWORD("import", Token::FUTURE_RESERVED_WORD) \ |
147 if (from_pos + length > length_) { | 852 KEYWORD("in", Token::IN) \ |
148 length = length_ - from_pos; | 853 KEYWORD("instanceof", Token::INSTANCEOF) \ |
149 } | 854 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ |
150 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); | 855 KEYWORD_GROUP('l') \ |
151 return length; | 856 KEYWORD("let", harmony_block_scoping \ |
152 } | 857 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ |
153 | 858 KEYWORD_GROUP('n') \ |
154 | 859 KEYWORD("new", Token::NEW) \ |
155 // ---------------------------------------------------------------------------- | 860 KEYWORD("null", Token::NULL_LITERAL) \ |
156 // Utf8ToUC16CharacterStream | 861 KEYWORD_GROUP('p') \ |
157 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data, | 862 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ |
158 unsigned length) | 863 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ |
159 : BufferedUC16CharacterStream(), | 864 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ |
160 raw_data_(data), | 865 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ |
161 raw_data_length_(length), | 866 KEYWORD_GROUP('r') \ |
162 raw_data_pos_(0), | 867 KEYWORD("return", Token::RETURN) \ |
163 raw_character_position_(0) { | 868 KEYWORD_GROUP('s') \ |
164 ReadBlock(); | 869 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ |
165 } | 870 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ |
166 | 871 KEYWORD("switch", Token::SWITCH) \ |
167 | 872 KEYWORD_GROUP('t') \ |
168 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { } | 873 KEYWORD("this", Token::THIS) \ |
169 | 874 KEYWORD("throw", Token::THROW) \ |
170 | 875 KEYWORD("true", Token::TRUE_LITERAL) \ |
171 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) { | 876 KEYWORD("try", Token::TRY) \ |
172 unsigned old_pos = pos_; | 877 KEYWORD("typeof", Token::TYPEOF) \ |
173 unsigned target_pos = pos_ + delta; | 878 KEYWORD_GROUP('v') \ |
174 SetRawPosition(target_pos); | 879 KEYWORD("var", Token::VAR) \ |
175 pos_ = raw_character_position_; | 880 KEYWORD("void", Token::VOID) \ |
176 ReadBlock(); | 881 KEYWORD_GROUP('w') \ |
177 return pos_ - old_pos; | 882 KEYWORD("while", Token::WHILE) \ |
178 } | 883 KEYWORD("with", Token::WITH) \ |
179 | 884 KEYWORD_GROUP('y') \ |
180 | 885 KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD) |
181 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position, | 886 |
182 unsigned length) { | 887 |
183 static const unibrow::uchar kMaxUC16Character = 0xffff; | 888 static Token::Value KeywordOrIdentifierToken(const char* input, |
184 SetRawPosition(char_position); | 889 int input_length, |
185 if (raw_character_position_ != char_position) { | 890 bool harmony_block_scoping) { |
186 // char_position was not a valid position in the stream (hit the end | 891 ASSERT(input_length >= 1); |
187 // while spooling to it). | 892 const int kMinLength = 2; |
188 return 0u; | 893 const int kMaxLength = 10; |
189 } | 894 if (input_length < kMinLength || input_length > kMaxLength) { |
190 unsigned i = 0; | 895 return Token::IDENTIFIER; |
191 while (i < length) { | 896 } |
192 if (raw_data_pos_ == raw_data_length_) break; | 897 switch (input[0]) { |
193 unibrow::uchar c = raw_data_[raw_data_pos_]; | 898 default: |
194 if (c <= unibrow::Utf8::kMaxOneByteChar) { | 899 #define KEYWORD_GROUP_CASE(ch) \ |
195 raw_data_pos_++; | 900 break; \ |
| 901 case ch: |
| 902 #define KEYWORD(keyword, token) \ |
| 903 { \ |
| 904 /* 'keyword' is a char array, so sizeof(keyword) is */ \ |
| 905 /* strlen(keyword) plus 1 for the NUL char. */ \ |
| 906 const int keyword_length = sizeof(keyword) - 1; \ |
| 907 STATIC_ASSERT(keyword_length >= kMinLength); \ |
| 908 STATIC_ASSERT(keyword_length <= kMaxLength); \ |
| 909 if (input_length == keyword_length && \ |
| 910 input[1] == keyword[1] && \ |
| 911 (keyword_length <= 2 || input[2] == keyword[2]) && \ |
| 912 (keyword_length <= 3 || input[3] == keyword[3]) && \ |
| 913 (keyword_length <= 4 || input[4] == keyword[4]) && \ |
| 914 (keyword_length <= 5 || input[5] == keyword[5]) && \ |
| 915 (keyword_length <= 6 || input[6] == keyword[6]) && \ |
| 916 (keyword_length <= 7 || input[7] == keyword[7]) && \ |
| 917 (keyword_length <= 8 || input[8] == keyword[8]) && \ |
| 918 (keyword_length <= 9 || input[9] == keyword[9])) { \ |
| 919 return token; \ |
| 920 } \ |
| 921 } |
| 922 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) |
| 923 } |
| 924 return Token::IDENTIFIER; |
| 925 } |
| 926 |
| 927 |
| 928 Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { |
| 929 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); |
| 930 LiteralScope literal(this); |
| 931 // Scan identifier start character. |
| 932 if (c0_ == '\\') { |
| 933 uc32 c = ScanIdentifierUnicodeEscape(); |
| 934 // Only allow legal identifier start characters. |
| 935 if (c < 0 || |
| 936 c == '\\' || // No recursive escapes. |
| 937 !unicode_cache_->IsIdentifierStart(c)) { |
| 938 return Token::ILLEGAL; |
| 939 } |
| 940 AddLiteralChar(c); |
| 941 return ScanIdentifierSuffix(&literal); |
| 942 } |
| 943 |
| 944 uc32 first_char = c0_; |
| 945 Advance(); |
| 946 AddLiteralChar(first_char); |
| 947 |
| 948 // Scan the rest of the identifier characters. |
| 949 while (unicode_cache_->IsIdentifierPart(c0_)) { |
| 950 if (c0_ != '\\') { |
| 951 uc32 next_char = c0_; |
| 952 Advance(); |
| 953 AddLiteralChar(next_char); |
| 954 continue; |
| 955 } |
| 956 // Fallthrough if no longer able to complete keyword. |
| 957 return ScanIdentifierSuffix(&literal); |
| 958 } |
| 959 |
| 960 literal.Complete(); |
| 961 |
| 962 if (next_.literal_chars->is_ascii()) { |
| 963 Vector<const char> chars = next_.literal_chars->ascii_literal(); |
| 964 return KeywordOrIdentifierToken(chars.start(), |
| 965 chars.length(), |
| 966 harmony_block_scoping_); |
| 967 } |
| 968 |
| 969 return Token::IDENTIFIER; |
| 970 } |
| 971 |
| 972 |
| 973 Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { |
| 974 // Scan the rest of the identifier characters. |
| 975 while (unicode_cache_->IsIdentifierPart(c0_)) { |
| 976 if (c0_ == '\\') { |
| 977 uc32 c = ScanIdentifierUnicodeEscape(); |
| 978 // Only allow legal identifier part characters. |
| 979 if (c < 0 || |
| 980 c == '\\' || |
| 981 !unicode_cache_->IsIdentifierPart(c)) { |
| 982 return Token::ILLEGAL; |
| 983 } |
| 984 AddLiteralChar(c); |
196 } else { | 985 } else { |
197 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, | 986 AddLiteralChar(c0_); |
198 raw_data_length_ - raw_data_pos_, | 987 Advance(); |
199 &raw_data_pos_); | 988 } |
200 // Don't allow characters outside of the BMP. | 989 } |
201 if (c > kMaxUC16Character) { | 990 literal->Complete(); |
202 c = unibrow::Utf8::kBadChar; | 991 |
| 992 return Token::IDENTIFIER; |
| 993 } |
| 994 |
| 995 |
| 996 bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { |
| 997 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags |
| 998 bool in_character_class = false; |
| 999 |
| 1000 // Previous token is either '/' or '/=', in the second case, the |
| 1001 // pattern starts at =. |
| 1002 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); |
| 1003 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); |
| 1004 |
| 1005 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, |
| 1006 // the scanner should pass uninterpreted bodies to the RegExp |
| 1007 // constructor. |
| 1008 LiteralScope literal(this); |
| 1009 if (seen_equal) { |
| 1010 AddLiteralChar('='); |
| 1011 } |
| 1012 |
| 1013 while (c0_ != '/' || in_character_class) { |
| 1014 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; |
| 1015 if (c0_ == '\\') { // Escape sequence. |
| 1016 AddLiteralCharAdvance(); |
| 1017 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; |
| 1018 AddLiteralCharAdvance(); |
| 1019 // If the escape allows more characters, i.e., \x??, \u????, or \c?, |
| 1020 // only "safe" characters are allowed (letters, digits, underscore), |
| 1021 // otherwise the escape isn't valid and the invalid character has |
| 1022 // its normal meaning. I.e., we can just continue scanning without |
| 1023 // worrying whether the following characters are part of the escape |
| 1024 // or not, since any '/', '\\' or '[' is guaranteed to not be part |
| 1025 // of the escape sequence. |
| 1026 |
| 1027 // TODO(896): At some point, parse RegExps more throughly to capture |
| 1028 // octal esacpes in strict mode. |
| 1029 } else { // Unescaped character. |
| 1030 if (c0_ == '[') in_character_class = true; |
| 1031 if (c0_ == ']') in_character_class = false; |
| 1032 AddLiteralCharAdvance(); |
| 1033 } |
| 1034 } |
| 1035 Advance(); // consume '/' |
| 1036 |
| 1037 literal.Complete(); |
| 1038 |
| 1039 return true; |
| 1040 } |
| 1041 |
| 1042 |
| 1043 bool JavaScriptScanner::ScanLiteralUnicodeEscape() { |
| 1044 ASSERT(c0_ == '\\'); |
| 1045 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; |
| 1046 Advance(); |
| 1047 int i = 1; |
| 1048 if (c0_ == 'u') { |
| 1049 i++; |
| 1050 while (i < 6) { |
| 1051 Advance(); |
| 1052 if (!IsHexDigit(c0_)) break; |
| 1053 chars_read[i] = c0_; |
| 1054 i++; |
| 1055 } |
| 1056 } |
| 1057 if (i < 6) { |
| 1058 // Incomplete escape. Undo all advances and return false. |
| 1059 while (i > 0) { |
| 1060 i--; |
| 1061 PushBack(chars_read[i]); |
| 1062 } |
| 1063 return false; |
| 1064 } |
| 1065 // Complete escape. Add all chars to current literal buffer. |
| 1066 for (int i = 0; i < 6; i++) { |
| 1067 AddLiteralChar(chars_read[i]); |
| 1068 } |
| 1069 return true; |
| 1070 } |
| 1071 |
| 1072 |
| 1073 bool JavaScriptScanner::ScanRegExpFlags() { |
| 1074 // Scan regular expression flags. |
| 1075 LiteralScope literal(this); |
| 1076 while (unicode_cache_->IsIdentifierPart(c0_)) { |
| 1077 if (c0_ != '\\') { |
| 1078 AddLiteralCharAdvance(); |
| 1079 } else { |
| 1080 if (!ScanLiteralUnicodeEscape()) { |
| 1081 break; |
203 } | 1082 } |
204 } | 1083 } |
205 buffer_[i++] = static_cast<uc16>(c); | 1084 } |
206 } | 1085 literal.Complete(); |
207 raw_character_position_ = char_position + i; | 1086 |
208 return i; | 1087 next_.location.end_pos = source_pos() - 1; |
209 } | 1088 return true; |
210 | |
211 | |
212 static const byte kUtf8MultiByteMask = 0xC0; | |
213 static const byte kUtf8MultiByteCharStart = 0xC0; | |
214 static const byte kUtf8MultiByteCharFollower = 0x80; | |
215 | |
216 | |
217 #ifdef DEBUG | |
218 static bool IsUtf8MultiCharacterStart(byte first_byte) { | |
219 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | |
220 } | |
221 #endif | |
222 | |
223 | |
224 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | |
225 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | |
226 } | |
227 | |
228 | |
229 // Move the cursor back to point at the preceding UTF-8 character start | |
230 // in the buffer. | |
231 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { | |
232 byte character = buffer[--*cursor]; | |
233 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
234 ASSERT(IsUtf8MultiCharacterFollower(character)); | |
235 // Last byte of a multi-byte character encoding. Step backwards until | |
236 // pointing to the first byte of the encoding, recognized by having the | |
237 // top two bits set. | |
238 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | |
239 ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor])); | |
240 } | |
241 } | |
242 | |
243 | |
244 // Move the cursor forward to point at the next following UTF-8 character start | |
245 // in the buffer. | |
246 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) { | |
247 byte character = buffer[(*cursor)++]; | |
248 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
249 // First character of a multi-byte character encoding. | |
250 // The number of most-significant one-bits determines the length of the | |
251 // encoding: | |
252 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | |
253 // 1110.... - (0xEx) two additional bytes. | |
254 // 11110... - (0xFx) three additional bytes (maximum). | |
255 ASSERT(IsUtf8MultiCharacterStart(character)); | |
256 // Additional bytes is: | |
257 // 1 if value in range 0xC0 .. 0xDF. | |
258 // 2 if value in range 0xE0 .. 0xEF. | |
259 // 3 if value in range 0xF0 .. 0xF7. | |
260 // Encode that in a single value. | |
261 unsigned additional_bytes = | |
262 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | |
263 *cursor += additional_bytes; | |
264 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | |
265 } | |
266 } | |
267 | |
268 | |
269 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) { | |
270 if (raw_character_position_ > target_position) { | |
271 // Spool backwards in utf8 buffer. | |
272 do { | |
273 Utf8CharacterBack(raw_data_, &raw_data_pos_); | |
274 raw_character_position_--; | |
275 } while (raw_character_position_ > target_position); | |
276 return; | |
277 } | |
278 // Spool forwards in the utf8 buffer. | |
279 while (raw_character_position_ < target_position) { | |
280 if (raw_data_pos_ == raw_data_length_) return; | |
281 Utf8CharacterForward(raw_data_, &raw_data_pos_); | |
282 raw_character_position_++; | |
283 } | |
284 } | |
285 | |
286 | |
287 // ---------------------------------------------------------------------------- | |
288 // ExternalTwoByteStringUC16CharacterStream | |
289 | |
290 ExternalTwoByteStringUC16CharacterStream:: | |
291 ~ExternalTwoByteStringUC16CharacterStream() { } | |
292 | |
293 | |
294 ExternalTwoByteStringUC16CharacterStream | |
295 ::ExternalTwoByteStringUC16CharacterStream( | |
296 Handle<ExternalTwoByteString> data, | |
297 int start_position, | |
298 int end_position) | |
299 : UC16CharacterStream(), | |
300 source_(data), | |
301 raw_data_(data->GetTwoByteData(start_position)) { | |
302 buffer_cursor_ = raw_data_, | |
303 buffer_end_ = raw_data_ + (end_position - start_position); | |
304 pos_ = start_position; | |
305 } | |
306 | |
307 | |
308 // ---------------------------------------------------------------------------- | |
309 // Scanner::LiteralScope | |
310 | |
311 Scanner::LiteralScope::LiteralScope(Scanner* self) | |
312 : scanner_(self), complete_(false) { | |
313 self->StartLiteral(); | |
314 } | |
315 | |
316 | |
317 Scanner::LiteralScope::~LiteralScope() { | |
318 if (!complete_) scanner_->DropLiteral(); | |
319 } | |
320 | |
321 | |
322 void Scanner::LiteralScope::Complete() { | |
323 scanner_->TerminateLiteral(); | |
324 complete_ = true; | |
325 } | 1089 } |
326 | 1090 |
327 } } // namespace v8::internal | 1091 } } // namespace v8::internal |
OLD | NEW |