OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
253 | 253 |
254 bool is_ascii_; | 254 bool is_ascii_; |
255 int position_; | 255 int position_; |
256 Vector<byte> backing_store_; | 256 Vector<byte> backing_store_; |
257 | 257 |
258 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); | 258 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); |
259 }; | 259 }; |
260 | 260 |
261 | 261 |
262 // ---------------------------------------------------------------------------- | 262 // ---------------------------------------------------------------------------- |
263 // Scanner base-class. | 263 // JavaScript Scanner. |
264 | 264 |
265 // Generic functionality used by both JSON and JavaScript scanners. | |
266 class Scanner { | 265 class Scanner { |
267 public: | 266 public: |
268 // -1 is outside of the range of any real source code. | 267 // Scoped helper for literal recording. Automatically drops the literal |
269 static const int kNoOctalLocation = -1; | 268 // if aborting the scanning before it's complete. |
270 | |
271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | |
272 | |
273 class LiteralScope { | 269 class LiteralScope { |
274 public: | 270 public: |
275 explicit LiteralScope(Scanner* self); | 271 explicit LiteralScope(Scanner* self) |
276 ~LiteralScope(); | 272 : scanner_(self), complete_(false) { |
277 void Complete(); | 273 scanner_->StartLiteral(); |
| 274 } |
| 275 ~LiteralScope() { |
| 276 if (!complete_) scanner_->DropLiteral(); |
| 277 } |
| 278 void Complete() { |
| 279 scanner_->TerminateLiteral(); |
| 280 complete_ = true; |
| 281 } |
278 | 282 |
279 private: | 283 private: |
280 Scanner* scanner_; | 284 Scanner* scanner_; |
281 bool complete_; | 285 bool complete_; |
282 }; | 286 }; |
283 | 287 |
284 explicit Scanner(UnicodeCache* scanner_contants); | 288 // Representation of an interval of source positions. |
285 | |
286 // Returns the current token again. | |
287 Token::Value current_token() { return current_.token; } | |
288 | |
289 // One token look-ahead (past the token returned by Next()). | |
290 Token::Value peek() const { return next_.token; } | |
291 | |
292 struct Location { | 289 struct Location { |
293 Location(int b, int e) : beg_pos(b), end_pos(e) { } | 290 Location(int b, int e) : beg_pos(b), end_pos(e) { } |
294 Location() : beg_pos(0), end_pos(0) { } | 291 Location() : beg_pos(0), end_pos(0) { } |
295 | 292 |
296 bool IsValid() const { | 293 bool IsValid() const { |
297 return beg_pos >= 0 && end_pos >= beg_pos; | 294 return beg_pos >= 0 && end_pos >= beg_pos; |
298 } | 295 } |
299 | 296 |
300 static Location invalid() { return Location(-1, -1); } | 297 static Location invalid() { return Location(-1, -1); } |
301 | 298 |
302 int beg_pos; | 299 int beg_pos; |
303 int end_pos; | 300 int end_pos; |
304 }; | 301 }; |
305 | 302 |
| 303 // -1 is outside of the range of any real source code. |
| 304 static const int kNoOctalLocation = -1; |
| 305 |
| 306 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 307 |
| 308 explicit Scanner(UnicodeCache* scanner_contants); |
| 309 |
| 310 void Initialize(UC16CharacterStream* source); |
| 311 |
| 312 // Returns the next token and advances input. |
| 313 Token::Value Next(); |
| 314 // Returns the current token again. |
| 315 Token::Value current_token() { return current_.token; } |
306 // Returns the location information for the current token | 316 // Returns the location information for the current token |
307 // (the token returned by Next()). | 317 // (the token last returned by Next()). |
308 Location location() const { return current_.location; } | 318 Location location() const { return current_.location; } |
309 Location peek_location() const { return next_.location; } | |
310 | |
311 // Returns the literal string, if any, for the current token (the | 319 // Returns the literal string, if any, for the current token (the |
312 // token returned by Next()). The string is 0-terminated and in | 320 // token last returned by Next()). The string is 0-terminated. |
313 // UTF-8 format; they may contain 0-characters. Literal strings are | 321 // Literal strings are collected for identifiers, strings, and |
314 // collected for identifiers, strings, and numbers. | 322 // numbers. |
315 // These functions only give the correct result if the literal | 323 // These functions only give the correct result if the literal |
316 // was scanned between calls to StartLiteral() and TerminateLiteral(). | 324 // was scanned between calls to StartLiteral() and TerminateLiteral(). |
317 bool is_literal_ascii() { | |
318 ASSERT_NOT_NULL(current_.literal_chars); | |
319 return current_.literal_chars->is_ascii(); | |
320 } | |
321 Vector<const char> literal_ascii_string() { | 325 Vector<const char> literal_ascii_string() { |
322 ASSERT_NOT_NULL(current_.literal_chars); | 326 ASSERT_NOT_NULL(current_.literal_chars); |
323 return current_.literal_chars->ascii_literal(); | 327 return current_.literal_chars->ascii_literal(); |
324 } | 328 } |
325 Vector<const uc16> literal_uc16_string() { | 329 Vector<const uc16> literal_uc16_string() { |
326 ASSERT_NOT_NULL(current_.literal_chars); | 330 ASSERT_NOT_NULL(current_.literal_chars); |
327 return current_.literal_chars->uc16_literal(); | 331 return current_.literal_chars->uc16_literal(); |
328 } | 332 } |
| 333 bool is_literal_ascii() { |
| 334 ASSERT_NOT_NULL(current_.literal_chars); |
| 335 return current_.literal_chars->is_ascii(); |
| 336 } |
329 int literal_length() const { | 337 int literal_length() const { |
330 ASSERT_NOT_NULL(current_.literal_chars); | 338 ASSERT_NOT_NULL(current_.literal_chars); |
331 return current_.literal_chars->length(); | 339 return current_.literal_chars->length(); |
332 } | 340 } |
333 | 341 |
334 bool literal_contains_escapes() const { | 342 bool literal_contains_escapes() const { |
335 Location location = current_.location; | 343 Location location = current_.location; |
336 int source_length = (location.end_pos - location.beg_pos); | 344 int source_length = (location.end_pos - location.beg_pos); |
337 if (current_.token == Token::STRING) { | 345 if (current_.token == Token::STRING) { |
338 // Subtract delimiters. | 346 // Subtract delimiters. |
339 source_length -= 2; | 347 source_length -= 2; |
340 } | 348 } |
341 return current_.literal_chars->length() != source_length; | 349 return current_.literal_chars->length() != source_length; |
342 } | 350 } |
343 | 351 |
| 352 // Similar functions for the upcoming token. |
| 353 |
| 354 // One token look-ahead (past the token returned by Next()). |
| 355 Token::Value peek() const { return next_.token; } |
| 356 |
| 357 Location peek_location() const { return next_.location; } |
| 358 |
344 // Returns the literal string for the next token (the token that | 359 // Returns the literal string for the next token (the token that |
345 // would be returned if Next() were called). | 360 // would be returned if Next() were called). |
346 bool is_next_literal_ascii() { | |
347 ASSERT_NOT_NULL(next_.literal_chars); | |
348 return next_.literal_chars->is_ascii(); | |
349 } | |
350 Vector<const char> next_literal_ascii_string() { | 361 Vector<const char> next_literal_ascii_string() { |
351 ASSERT_NOT_NULL(next_.literal_chars); | 362 ASSERT_NOT_NULL(next_.literal_chars); |
352 return next_.literal_chars->ascii_literal(); | 363 return next_.literal_chars->ascii_literal(); |
353 } | 364 } |
354 Vector<const uc16> next_literal_uc16_string() { | 365 Vector<const uc16> next_literal_uc16_string() { |
355 ASSERT_NOT_NULL(next_.literal_chars); | 366 ASSERT_NOT_NULL(next_.literal_chars); |
356 return next_.literal_chars->uc16_literal(); | 367 return next_.literal_chars->uc16_literal(); |
357 } | 368 } |
| 369 bool is_next_literal_ascii() { |
| 370 ASSERT_NOT_NULL(next_.literal_chars); |
| 371 return next_.literal_chars->is_ascii(); |
| 372 } |
358 int next_literal_length() const { | 373 int next_literal_length() const { |
359 ASSERT_NOT_NULL(next_.literal_chars); | 374 ASSERT_NOT_NULL(next_.literal_chars); |
360 return next_.literal_chars->length(); | 375 return next_.literal_chars->length(); |
361 } | 376 } |
362 | 377 |
363 UnicodeCache* unicode_cache() { return unicode_cache_; } | 378 UnicodeCache* unicode_cache() { return unicode_cache_; } |
364 | 379 |
365 static const int kCharacterLookaheadBufferSize = 1; | 380 static const int kCharacterLookaheadBufferSize = 1; |
366 | 381 |
367 protected: | 382 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. |
| 383 uc32 ScanOctalEscape(uc32 c, int length); |
| 384 |
| 385 // Returns the location of the last seen octal literal. |
| 386 Location octal_position() const { return octal_pos_; } |
| 387 void clear_octal_position() { octal_pos_ = Location::invalid(); } |
| 388 |
| 389 // Seek forward to the given position. This operation does not |
| 390 // work in general, for instance when there are pushed back |
| 391 // characters, but works for seeking forward until simple delimiter |
| 392 // tokens, which is what it is used for. |
| 393 void SeekForward(int pos); |
| 394 |
| 395 bool HarmonyScoping() const { |
| 396 return harmony_scoping_; |
| 397 } |
| 398 void SetHarmonyScoping(bool block_scoping) { |
| 399 harmony_scoping_ = block_scoping; |
| 400 } |
| 401 |
| 402 |
| 403 // Returns true if there was a line terminator before the peek'ed token, |
| 404 // possibly inside a multi-line comment. |
| 405 bool HasAnyLineTerminatorBeforeNext() const { |
| 406 return has_line_terminator_before_next_ || |
| 407 has_multiline_comment_before_next_; |
| 408 } |
| 409 |
| 410 // Scans the input as a regular expression pattern, previous |
| 411 // character(s) must be /(=). Returns true if a pattern is scanned. |
| 412 bool ScanRegExpPattern(bool seen_equal); |
| 413 // Returns true if regexp flags are scanned (always since flags can |
| 414 // be empty). |
| 415 bool ScanRegExpFlags(); |
| 416 |
| 417 // Tells whether the buffer contains an identifier (no escapes). |
| 418 // Used for checking if a property name is an identifier. |
| 419 static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 420 |
| 421 private: |
368 // The current and look-ahead token. | 422 // The current and look-ahead token. |
369 struct TokenDesc { | 423 struct TokenDesc { |
370 Token::Value token; | 424 Token::Value token; |
371 Location location; | 425 Location location; |
372 LiteralBuffer* literal_chars; | 426 LiteralBuffer* literal_chars; |
373 }; | 427 }; |
374 | 428 |
375 // Call this after setting source_ to the input. | 429 // Call this after setting source_ to the input. |
376 void Init() { | 430 void Init() { |
377 // Set c0_ (one character ahead) | 431 // Set c0_ (one character ahead) |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
427 if (c0_ == next) { | 481 if (c0_ == next) { |
428 Advance(); | 482 Advance(); |
429 return then; | 483 return then; |
430 } else { | 484 } else { |
431 return else_; | 485 return else_; |
432 } | 486 } |
433 } | 487 } |
434 | 488 |
435 uc32 ScanHexNumber(int expected_length); | 489 uc32 ScanHexNumber(int expected_length); |
436 | 490 |
437 // Return the current source position. | |
438 int source_pos() { | |
439 return source_->pos() - kCharacterLookaheadBufferSize; | |
440 } | |
441 | |
442 UnicodeCache* unicode_cache_; | |
443 | |
444 // Buffers collecting literal strings, numbers, etc. | |
445 LiteralBuffer literal_buffer1_; | |
446 LiteralBuffer literal_buffer2_; | |
447 | |
448 TokenDesc current_; // desc for current token (as returned by Next()) | |
449 TokenDesc next_; // desc for next token (one token look-ahead) | |
450 | |
451 // Input stream. Must be initialized to an UC16CharacterStream. | |
452 UC16CharacterStream* source_; | |
453 | |
454 // One Unicode character look-ahead; c0_ < 0 at the end of the input. | |
455 uc32 c0_; | |
456 }; | |
457 | |
458 // ---------------------------------------------------------------------------- | |
459 // JavaScriptScanner - base logic for JavaScript scanning. | |
460 | |
461 class JavaScriptScanner : public Scanner { | |
462 public: | |
463 // A LiteralScope that disables recording of some types of JavaScript | |
464 // literals. If the scanner is configured to not record the specific | |
465 // type of literal, the scope will not call StartLiteral. | |
466 class LiteralScope { | |
467 public: | |
468 explicit LiteralScope(JavaScriptScanner* self) | |
469 : scanner_(self), complete_(false) { | |
470 scanner_->StartLiteral(); | |
471 } | |
472 ~LiteralScope() { | |
473 if (!complete_) scanner_->DropLiteral(); | |
474 } | |
475 void Complete() { | |
476 scanner_->TerminateLiteral(); | |
477 complete_ = true; | |
478 } | |
479 | |
480 private: | |
481 JavaScriptScanner* scanner_; | |
482 bool complete_; | |
483 }; | |
484 | |
485 explicit JavaScriptScanner(UnicodeCache* scanner_contants); | |
486 | |
487 void Initialize(UC16CharacterStream* source); | |
488 | |
489 // Returns the next token. | |
490 Token::Value Next(); | |
491 | |
492 // Returns true if there was a line terminator before the peek'ed token, | |
493 // possibly inside a multi-line comment. | |
494 bool HasAnyLineTerminatorBeforeNext() const { | |
495 return has_line_terminator_before_next_ || | |
496 has_multiline_comment_before_next_; | |
497 } | |
498 | |
499 // Scans the input as a regular expression pattern, previous | |
500 // character(s) must be /(=). Returns true if a pattern is scanned. | |
501 bool ScanRegExpPattern(bool seen_equal); | |
502 // Returns true if regexp flags are scanned (always since flags can | |
503 // be empty). | |
504 bool ScanRegExpFlags(); | |
505 | |
506 // Tells whether the buffer contains an identifier (no escapes). | |
507 // Used for checking if a property name is an identifier. | |
508 static bool IsIdentifier(unibrow::CharacterStream* buffer); | |
509 | |
510 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. | |
511 uc32 ScanOctalEscape(uc32 c, int length); | |
512 | |
513 // Returns the location of the last seen octal literal | |
514 Location octal_position() const { return octal_pos_; } | |
515 void clear_octal_position() { octal_pos_ = Location::invalid(); } | |
516 | |
517 // Seek forward to the given position. This operation does not | |
518 // work in general, for instance when there are pushed back | |
519 // characters, but works for seeking forward until simple delimiter | |
520 // tokens, which is what it is used for. | |
521 void SeekForward(int pos); | |
522 | |
523 bool HarmonyScoping() const { | |
524 return harmony_scoping_; | |
525 } | |
526 void SetHarmonyScoping(bool block_scoping) { | |
527 harmony_scoping_ = block_scoping; | |
528 } | |
529 | |
530 | |
531 protected: | |
532 bool SkipWhiteSpace(); | |
533 Token::Value SkipSingleLineComment(); | |
534 Token::Value SkipMultiLineComment(); | |
535 | |
536 // Scans a single JavaScript token. | 491 // Scans a single JavaScript token. |
537 void Scan(); | 492 void Scan(); |
538 | 493 |
| 494 bool SkipWhiteSpace(); |
| 495 Token::Value SkipSingleLineComment(); |
| 496 Token::Value SkipMultiLineComment(); |
| 497 // Scans a possible HTML comment -- begins with '<!'. |
| 498 Token::Value ScanHtmlComment(); |
| 499 |
539 void ScanDecimalDigits(); | 500 void ScanDecimalDigits(); |
540 Token::Value ScanNumber(bool seen_period); | 501 Token::Value ScanNumber(bool seen_period); |
541 Token::Value ScanIdentifierOrKeyword(); | 502 Token::Value ScanIdentifierOrKeyword(); |
542 Token::Value ScanIdentifierSuffix(LiteralScope* literal); | 503 Token::Value ScanIdentifierSuffix(LiteralScope* literal); |
543 | 504 |
544 void ScanEscape(); | 505 void ScanEscape(); |
545 Token::Value ScanString(); | 506 Token::Value ScanString(); |
546 | 507 |
547 // Scans a possible HTML comment -- begins with '<!'. | |
548 Token::Value ScanHtmlComment(); | |
549 | |
550 // Decodes a unicode escape-sequence which is part of an identifier. | 508 // Decodes a unicode escape-sequence which is part of an identifier. |
551 // If the escape sequence cannot be decoded the result is kBadChar. | 509 // If the escape sequence cannot be decoded the result is kBadChar. |
552 uc32 ScanIdentifierUnicodeEscape(); | 510 uc32 ScanIdentifierUnicodeEscape(); |
553 // Recognizes a unicode escape-sequence and adds its characters, | 511 // Recognizes a unicode escape-sequence and adds its characters, |
554 // uninterpreted, to the current literal. Used for parsing RegExp | 512 // uninterpreted, to the current literal. Used for parsing RegExp |
555 // flags. | 513 // flags. |
556 bool ScanLiteralUnicodeEscape(); | 514 bool ScanLiteralUnicodeEscape(); |
557 | 515 |
| 516 // Return the current source position. |
| 517 int source_pos() { |
| 518 return source_->pos() - kCharacterLookaheadBufferSize; |
| 519 } |
| 520 |
| 521 UnicodeCache* unicode_cache_; |
| 522 |
| 523 // Buffers collecting literal strings, numbers, etc. |
| 524 LiteralBuffer literal_buffer1_; |
| 525 LiteralBuffer literal_buffer2_; |
| 526 |
| 527 TokenDesc current_; // desc for current token (as returned by Next()) |
| 528 TokenDesc next_; // desc for next token (one token look-ahead) |
| 529 |
| 530 // Input stream. Must be initialized to an UC16CharacterStream. |
| 531 UC16CharacterStream* source_; |
| 532 |
| 533 |
558 // Start position of the octal literal last scanned. | 534 // Start position of the octal literal last scanned. |
559 Location octal_pos_; | 535 Location octal_pos_; |
560 | 536 |
| 537 // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 538 uc32 c0_; |
| 539 |
561 // Whether there is a line terminator whitespace character after | 540 // Whether there is a line terminator whitespace character after |
562 // the current token, and before the next. Does not count newlines | 541 // the current token, and before the next. Does not count newlines |
563 // inside multiline comments. | 542 // inside multiline comments. |
564 bool has_line_terminator_before_next_; | 543 bool has_line_terminator_before_next_; |
565 // Whether there is a multi-line comment that contains a | 544 // Whether there is a multi-line comment that contains a |
566 // line-terminator after the current token, and before the next. | 545 // line-terminator after the current token, and before the next. |
567 bool has_multiline_comment_before_next_; | 546 bool has_multiline_comment_before_next_; |
568 // Whether we scan 'let' as a keyword for harmony block scoped | 547 // Whether we scan 'let' as a keyword for harmony block scoped |
569 // let bindings. | 548 // let bindings. |
570 bool harmony_scoping_; | 549 bool harmony_scoping_; |
571 }; | 550 }; |
572 | 551 |
573 } } // namespace v8::internal | 552 } } // namespace v8::internal |
574 | 553 |
575 #endif // V8_SCANNER_H_ | 554 #endif // V8_SCANNER_H_ |
OLD | NEW |