Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(190)

Side by Side Diff: src/scanner.h

Issue 8384003: Merged Scanner and JavaScriptScanner. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Created 9 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/preparser-api.cc ('k') | src/scanner.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after
253 253
254 bool is_ascii_; 254 bool is_ascii_;
255 int position_; 255 int position_;
256 Vector<byte> backing_store_; 256 Vector<byte> backing_store_;
257 257
258 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 258 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
259 }; 259 };
260 260
261 261
262 // ---------------------------------------------------------------------------- 262 // ----------------------------------------------------------------------------
263 // Scanner base-class. 263 // JavaScript Scanner.
264 264
265 // Generic functionality used by both JSON and JavaScript scanners.
266 class Scanner { 265 class Scanner {
267 public: 266 public:
268 // -1 is outside of the range of any real source code. 267 // Scoped helper for literal recording. Automatically drops the literal
269 static const int kNoOctalLocation = -1; 268 // if aborting the scanning before it's complete.
270
271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
272
273 class LiteralScope { 269 class LiteralScope {
274 public: 270 public:
275 explicit LiteralScope(Scanner* self); 271 explicit LiteralScope(Scanner* self)
276 ~LiteralScope(); 272 : scanner_(self), complete_(false) {
277 void Complete(); 273 scanner_->StartLiteral();
274 }
275 ~LiteralScope() {
276 if (!complete_) scanner_->DropLiteral();
277 }
278 void Complete() {
279 scanner_->TerminateLiteral();
280 complete_ = true;
281 }
278 282
279 private: 283 private:
280 Scanner* scanner_; 284 Scanner* scanner_;
281 bool complete_; 285 bool complete_;
282 }; 286 };
283 287
284 explicit Scanner(UnicodeCache* scanner_contants); 288 // Representation of an interval of source positions.
285
286 // Returns the current token again.
287 Token::Value current_token() { return current_.token; }
288
289 // One token look-ahead (past the token returned by Next()).
290 Token::Value peek() const { return next_.token; }
291
292 struct Location { 289 struct Location {
293 Location(int b, int e) : beg_pos(b), end_pos(e) { } 290 Location(int b, int e) : beg_pos(b), end_pos(e) { }
294 Location() : beg_pos(0), end_pos(0) { } 291 Location() : beg_pos(0), end_pos(0) { }
295 292
296 bool IsValid() const { 293 bool IsValid() const {
297 return beg_pos >= 0 && end_pos >= beg_pos; 294 return beg_pos >= 0 && end_pos >= beg_pos;
298 } 295 }
299 296
300 static Location invalid() { return Location(-1, -1); } 297 static Location invalid() { return Location(-1, -1); }
301 298
302 int beg_pos; 299 int beg_pos;
303 int end_pos; 300 int end_pos;
304 }; 301 };
305 302
303 // -1 is outside of the range of any real source code.
304 static const int kNoOctalLocation = -1;
305
306 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
307
308 explicit Scanner(UnicodeCache* scanner_contants);
309
310 void Initialize(UC16CharacterStream* source);
311
312 // Returns the next token and advances input.
313 Token::Value Next();
314 // Returns the current token again.
315 Token::Value current_token() { return current_.token; }
306 // Returns the location information for the current token 316 // Returns the location information for the current token
307 // (the token returned by Next()). 317 // (the token last returned by Next()).
308 Location location() const { return current_.location; } 318 Location location() const { return current_.location; }
309 Location peek_location() const { return next_.location; }
310
311 // Returns the literal string, if any, for the current token (the 319 // Returns the literal string, if any, for the current token (the
312 // token returned by Next()). The string is 0-terminated and in 320 // token last returned by Next()). The string is 0-terminated.
313 // UTF-8 format; they may contain 0-characters. Literal strings are 321 // Literal strings are collected for identifiers, strings, and
314 // collected for identifiers, strings, and numbers. 322 // numbers.
315 // These functions only give the correct result if the literal 323 // These functions only give the correct result if the literal
316 // was scanned between calls to StartLiteral() and TerminateLiteral(). 324 // was scanned between calls to StartLiteral() and TerminateLiteral().
317 bool is_literal_ascii() {
318 ASSERT_NOT_NULL(current_.literal_chars);
319 return current_.literal_chars->is_ascii();
320 }
321 Vector<const char> literal_ascii_string() { 325 Vector<const char> literal_ascii_string() {
322 ASSERT_NOT_NULL(current_.literal_chars); 326 ASSERT_NOT_NULL(current_.literal_chars);
323 return current_.literal_chars->ascii_literal(); 327 return current_.literal_chars->ascii_literal();
324 } 328 }
325 Vector<const uc16> literal_uc16_string() { 329 Vector<const uc16> literal_uc16_string() {
326 ASSERT_NOT_NULL(current_.literal_chars); 330 ASSERT_NOT_NULL(current_.literal_chars);
327 return current_.literal_chars->uc16_literal(); 331 return current_.literal_chars->uc16_literal();
328 } 332 }
333 bool is_literal_ascii() {
334 ASSERT_NOT_NULL(current_.literal_chars);
335 return current_.literal_chars->is_ascii();
336 }
329 int literal_length() const { 337 int literal_length() const {
330 ASSERT_NOT_NULL(current_.literal_chars); 338 ASSERT_NOT_NULL(current_.literal_chars);
331 return current_.literal_chars->length(); 339 return current_.literal_chars->length();
332 } 340 }
333 341
334 bool literal_contains_escapes() const { 342 bool literal_contains_escapes() const {
335 Location location = current_.location; 343 Location location = current_.location;
336 int source_length = (location.end_pos - location.beg_pos); 344 int source_length = (location.end_pos - location.beg_pos);
337 if (current_.token == Token::STRING) { 345 if (current_.token == Token::STRING) {
338 // Subtract delimiters. 346 // Subtract delimiters.
339 source_length -= 2; 347 source_length -= 2;
340 } 348 }
341 return current_.literal_chars->length() != source_length; 349 return current_.literal_chars->length() != source_length;
342 } 350 }
343 351
352 // Similar functions for the upcoming token.
353
354 // One token look-ahead (past the token returned by Next()).
355 Token::Value peek() const { return next_.token; }
356
357 Location peek_location() const { return next_.location; }
358
344 // Returns the literal string for the next token (the token that 359 // Returns the literal string for the next token (the token that
345 // would be returned if Next() were called). 360 // would be returned if Next() were called).
346 bool is_next_literal_ascii() {
347 ASSERT_NOT_NULL(next_.literal_chars);
348 return next_.literal_chars->is_ascii();
349 }
350 Vector<const char> next_literal_ascii_string() { 361 Vector<const char> next_literal_ascii_string() {
351 ASSERT_NOT_NULL(next_.literal_chars); 362 ASSERT_NOT_NULL(next_.literal_chars);
352 return next_.literal_chars->ascii_literal(); 363 return next_.literal_chars->ascii_literal();
353 } 364 }
354 Vector<const uc16> next_literal_uc16_string() { 365 Vector<const uc16> next_literal_uc16_string() {
355 ASSERT_NOT_NULL(next_.literal_chars); 366 ASSERT_NOT_NULL(next_.literal_chars);
356 return next_.literal_chars->uc16_literal(); 367 return next_.literal_chars->uc16_literal();
357 } 368 }
369 bool is_next_literal_ascii() {
370 ASSERT_NOT_NULL(next_.literal_chars);
371 return next_.literal_chars->is_ascii();
372 }
358 int next_literal_length() const { 373 int next_literal_length() const {
359 ASSERT_NOT_NULL(next_.literal_chars); 374 ASSERT_NOT_NULL(next_.literal_chars);
360 return next_.literal_chars->length(); 375 return next_.literal_chars->length();
361 } 376 }
362 377
363 UnicodeCache* unicode_cache() { return unicode_cache_; } 378 UnicodeCache* unicode_cache() { return unicode_cache_; }
364 379
365 static const int kCharacterLookaheadBufferSize = 1; 380 static const int kCharacterLookaheadBufferSize = 1;
366 381
367 protected: 382 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
383 uc32 ScanOctalEscape(uc32 c, int length);
384
385 // Returns the location of the last seen octal literal.
386 Location octal_position() const { return octal_pos_; }
387 void clear_octal_position() { octal_pos_ = Location::invalid(); }
388
389 // Seek forward to the given position. This operation does not
390 // work in general, for instance when there are pushed back
391 // characters, but works for seeking forward until simple delimiter
392 // tokens, which is what it is used for.
393 void SeekForward(int pos);
394
395 bool HarmonyScoping() const {
396 return harmony_scoping_;
397 }
398 void SetHarmonyScoping(bool block_scoping) {
399 harmony_scoping_ = block_scoping;
400 }
401
402
403 // Returns true if there was a line terminator before the peek'ed token,
404 // possibly inside a multi-line comment.
405 bool HasAnyLineTerminatorBeforeNext() const {
406 return has_line_terminator_before_next_ ||
407 has_multiline_comment_before_next_;
408 }
409
410 // Scans the input as a regular expression pattern, previous
411 // character(s) must be /(=). Returns true if a pattern is scanned.
412 bool ScanRegExpPattern(bool seen_equal);
413 // Returns true if regexp flags are scanned (always since flags can
414 // be empty).
415 bool ScanRegExpFlags();
416
417 // Tells whether the buffer contains an identifier (no escapes).
418 // Used for checking if a property name is an identifier.
419 static bool IsIdentifier(unibrow::CharacterStream* buffer);
420
421 private:
368 // The current and look-ahead token. 422 // The current and look-ahead token.
369 struct TokenDesc { 423 struct TokenDesc {
370 Token::Value token; 424 Token::Value token;
371 Location location; 425 Location location;
372 LiteralBuffer* literal_chars; 426 LiteralBuffer* literal_chars;
373 }; 427 };
374 428
375 // Call this after setting source_ to the input. 429 // Call this after setting source_ to the input.
376 void Init() { 430 void Init() {
377 // Set c0_ (one character ahead) 431 // Set c0_ (one character ahead)
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
427 if (c0_ == next) { 481 if (c0_ == next) {
428 Advance(); 482 Advance();
429 return then; 483 return then;
430 } else { 484 } else {
431 return else_; 485 return else_;
432 } 486 }
433 } 487 }
434 488
435 uc32 ScanHexNumber(int expected_length); 489 uc32 ScanHexNumber(int expected_length);
436 490
437 // Return the current source position.
438 int source_pos() {
439 return source_->pos() - kCharacterLookaheadBufferSize;
440 }
441
442 UnicodeCache* unicode_cache_;
443
444 // Buffers collecting literal strings, numbers, etc.
445 LiteralBuffer literal_buffer1_;
446 LiteralBuffer literal_buffer2_;
447
448 TokenDesc current_; // desc for current token (as returned by Next())
449 TokenDesc next_; // desc for next token (one token look-ahead)
450
451 // Input stream. Must be initialized to a UC16CharacterStream.
452 UC16CharacterStream* source_;
453
454 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
455 uc32 c0_;
456 };
457
458 // ----------------------------------------------------------------------------
459 // JavaScriptScanner - base logic for JavaScript scanning.
460
461 class JavaScriptScanner : public Scanner {
462 public:
463 // A LiteralScope that disables recording of some types of JavaScript
464 // literals. If the scanner is configured to not record the specific
465 // type of literal, the scope will not call StartLiteral.
466 class LiteralScope {
467 public:
468 explicit LiteralScope(JavaScriptScanner* self)
469 : scanner_(self), complete_(false) {
470 scanner_->StartLiteral();
471 }
472 ~LiteralScope() {
473 if (!complete_) scanner_->DropLiteral();
474 }
475 void Complete() {
476 scanner_->TerminateLiteral();
477 complete_ = true;
478 }
479
480 private:
481 JavaScriptScanner* scanner_;
482 bool complete_;
483 };
484
485 explicit JavaScriptScanner(UnicodeCache* scanner_contants);
486
487 void Initialize(UC16CharacterStream* source);
488
489 // Returns the next token.
490 Token::Value Next();
491
492 // Returns true if there was a line terminator before the peek'ed token,
493 // possibly inside a multi-line comment.
494 bool HasAnyLineTerminatorBeforeNext() const {
495 return has_line_terminator_before_next_ ||
496 has_multiline_comment_before_next_;
497 }
498
499 // Scans the input as a regular expression pattern, previous
500 // character(s) must be /(=). Returns true if a pattern is scanned.
501 bool ScanRegExpPattern(bool seen_equal);
502 // Returns true if regexp flags are scanned (always since flags can
503 // be empty).
504 bool ScanRegExpFlags();
505
506 // Tells whether the buffer contains an identifier (no escapes).
507 // Used for checking if a property name is an identifier.
508 static bool IsIdentifier(unibrow::CharacterStream* buffer);
509
510 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
511 uc32 ScanOctalEscape(uc32 c, int length);
512
513 // Returns the location of the last seen octal literal
514 Location octal_position() const { return octal_pos_; }
515 void clear_octal_position() { octal_pos_ = Location::invalid(); }
516
517 // Seek forward to the given position. This operation does not
518 // work in general, for instance when there are pushed back
519 // characters, but works for seeking forward until simple delimiter
520 // tokens, which is what it is used for.
521 void SeekForward(int pos);
522
523 bool HarmonyScoping() const {
524 return harmony_scoping_;
525 }
526 void SetHarmonyScoping(bool block_scoping) {
527 harmony_scoping_ = block_scoping;
528 }
529
530
531 protected:
532 bool SkipWhiteSpace();
533 Token::Value SkipSingleLineComment();
534 Token::Value SkipMultiLineComment();
535
536 // Scans a single JavaScript token. 491 // Scans a single JavaScript token.
537 void Scan(); 492 void Scan();
538 493
494 bool SkipWhiteSpace();
495 Token::Value SkipSingleLineComment();
496 Token::Value SkipMultiLineComment();
497 // Scans a possible HTML comment -- begins with '<!'.
498 Token::Value ScanHtmlComment();
499
539 void ScanDecimalDigits(); 500 void ScanDecimalDigits();
540 Token::Value ScanNumber(bool seen_period); 501 Token::Value ScanNumber(bool seen_period);
541 Token::Value ScanIdentifierOrKeyword(); 502 Token::Value ScanIdentifierOrKeyword();
542 Token::Value ScanIdentifierSuffix(LiteralScope* literal); 503 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
543 504
544 void ScanEscape(); 505 void ScanEscape();
545 Token::Value ScanString(); 506 Token::Value ScanString();
546 507
547 // Scans a possible HTML comment -- begins with '<!'.
548 Token::Value ScanHtmlComment();
549
550 // Decodes a unicode escape-sequence which is part of an identifier. 508 // Decodes a unicode escape-sequence which is part of an identifier.
551 // If the escape sequence cannot be decoded the result is kBadChar. 509 // If the escape sequence cannot be decoded the result is kBadChar.
552 uc32 ScanIdentifierUnicodeEscape(); 510 uc32 ScanIdentifierUnicodeEscape();
553 // Recognizes a unicode escape-sequence and adds its characters, 511 // Recognizes a unicode escape-sequence and adds its characters,
554 // uninterpreted, to the current literal. Used for parsing RegExp 512 // uninterpreted, to the current literal. Used for parsing RegExp
555 // flags. 513 // flags.
556 bool ScanLiteralUnicodeEscape(); 514 bool ScanLiteralUnicodeEscape();
557 515
516 // Return the current source position.
517 int source_pos() {
518 return source_->pos() - kCharacterLookaheadBufferSize;
519 }
520
521 UnicodeCache* unicode_cache_;
522
523 // Buffers collecting literal strings, numbers, etc.
524 LiteralBuffer literal_buffer1_;
525 LiteralBuffer literal_buffer2_;
526
527 TokenDesc current_; // desc for current token (as returned by Next())
528 TokenDesc next_; // desc for next token (one token look-ahead)
529
530 // Input stream. Must be initialized to a UC16CharacterStream.
531 UC16CharacterStream* source_;
532
533
558 // Start position of the octal literal last scanned. 534 // Start position of the octal literal last scanned.
559 Location octal_pos_; 535 Location octal_pos_;
560 536
537 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
538 uc32 c0_;
539
561 // Whether there is a line terminator whitespace character after 540 // Whether there is a line terminator whitespace character after
562 // the current token, and before the next. Does not count newlines 541 // the current token, and before the next. Does not count newlines
563 // inside multiline comments. 542 // inside multiline comments.
564 bool has_line_terminator_before_next_; 543 bool has_line_terminator_before_next_;
565 // Whether there is a multi-line comment that contains a 544 // Whether there is a multi-line comment that contains a
566 // line-terminator after the current token, and before the next. 545 // line-terminator after the current token, and before the next.
567 bool has_multiline_comment_before_next_; 546 bool has_multiline_comment_before_next_;
568 // Whether we scan 'let' as a keyword for harmony block scoped 547 // Whether we scan 'let' as a keyword for harmony block scoped
569 // let bindings. 548 // let bindings.
570 bool harmony_scoping_; 549 bool harmony_scoping_;
571 }; 550 };
572 551
573 } } // namespace v8::internal 552 } } // namespace v8::internal
574 553
575 #endif // V8_SCANNER_H_ 554 #endif // V8_SCANNER_H_
OLDNEW
« no previous file with comments | « src/preparser-api.cc ('k') | src/scanner.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698