| OLD | NEW | 
|    1 // Copyright 2010 the V8 project authors. All rights reserved. |    1 // Copyright 2010 the V8 project authors. All rights reserved. | 
|    2 // Redistribution and use in source and binary forms, with or without |    2 // Redistribution and use in source and binary forms, with or without | 
|    3 // modification, are permitted provided that the following conditions are |    3 // modification, are permitted provided that the following conditions are | 
|    4 // met: |    4 // met: | 
|    5 // |    5 // | 
|    6 //     * Redistributions of source code must retain the above copyright |    6 //     * Redistributions of source code must retain the above copyright | 
|    7 //       notice, this list of conditions and the following disclaimer. |    7 //       notice, this list of conditions and the following disclaimer. | 
|    8 //     * Redistributions in binary form must reproduce the above |    8 //     * Redistributions in binary form must reproduce the above | 
|    9 //       copyright notice, this list of conditions and the following |    9 //       copyright notice, this list of conditions and the following | 
|   10 //       disclaimer in the documentation and/or other materials provided |   10 //       disclaimer in the documentation and/or other materials provided | 
| (...skipping 19 matching lines...) Expand all  Loading... | 
|   30 #ifndef V8_SCANNER_BASE_H_ |   30 #ifndef V8_SCANNER_BASE_H_ | 
|   31 #define V8_SCANNER_BASE_H_ |   31 #define V8_SCANNER_BASE_H_ | 
|   32  |   32  | 
|   33 #include "globals.h" |   33 #include "globals.h" | 
|   34 #include "checks.h" |   34 #include "checks.h" | 
|   35 #include "allocation.h" |   35 #include "allocation.h" | 
|   36 #include "token.h" |   36 #include "token.h" | 
|   37 #include "unicode-inl.h" |   37 #include "unicode-inl.h" | 
|   38 #include "char-predicates.h" |   38 #include "char-predicates.h" | 
|   39 #include "utils.h" |   39 #include "utils.h" | 
 |   40 #include "list-inl.h" | 
|   40  |   41  | 
|   41 namespace v8 { |   42 namespace v8 { | 
|   42 namespace internal { |   43 namespace internal { | 
|   43  |   44  | 
// Returns the value (0 .. 15) of a hexadecimal character c.
// If c is not a legal hexadecimal character (including an end-of-input
// marker < 0), returns a value < 0.
inline int HexValue(uc32 c) {
  // Decimal digits: rebasing on '0' maps '0'..'9' to 0..9. The unsigned
  // comparison rejects everything below '0' in the same test, because
  // negative values wrap around to very large unsigned ones.
  c -= '0';
  if (static_cast<unsigned>(c) <= 9) return c;
  // Letters: after the subtraction 'A'..'F' are 0x11..0x16 and 'a'..'f' are
  // 0x31..0x36. OR-ing in 0x20 folds the upper-case range onto the
  // lower-case one, and rebasing on 'a' maps both to 0..5.
  c = (c | 0x20) - ('a' - '0');
  // Only six letters are hexadecimal digits, so the largest valid offset is
  // 5. Accepting 6 would turn 'g'/'G' into the out-of-range value 16.
  if (static_cast<unsigned>(c) <= 5) return c + 10;
  return -1;
}
 |   54  | 
 |   55 // ---------------------------------------------------------------------------- | 
 |   56 // UTF16Buffer - scanner input source with pushback. | 
 |   57  | 
|   45 class UTF16Buffer { |   58 class UTF16Buffer { | 
|   46  public: |   59  public: | 
|   47   UTF16Buffer(); |   60   UTF16Buffer(); | 
|   48   virtual ~UTF16Buffer() {} |   61   virtual ~UTF16Buffer() {} | 
|   49  |   62  | 
|   50   virtual void PushBack(uc32 ch) = 0; |   63   virtual void PushBack(uc32 ch) = 0; | 
|   51   // Returns a value < 0 when the buffer end is reached. |   64   // Returns a value < 0 when the buffer end is reached. | 
|   52   virtual uc32 Advance() = 0; |   65   virtual uc32 Advance() = 0; | 
|   53   virtual void SeekForward(int pos) = 0; |   66   virtual void SeekForward(int pos) = 0; | 
|   54  |   67  | 
|   55   int pos() const { return pos_; } |   68   int pos() const { return pos_; } | 
|   56  |   69  | 
 |   70   static const int kNoEndPosition = 1; | 
 |   71  | 
|   57  protected: |   72  protected: | 
 |   73   // Initial value of end_ before the input stream is initialized. | 
 |   74  | 
|   58   int pos_;  // Current position in the buffer. |   75   int pos_;  // Current position in the buffer. | 
|   59   int end_;  // Position where scanning should stop (EOF). |   76   int end_;  // Position where scanning should stop (EOF). | 
|   60 }; |   77 }; | 
|   61  |   78  | 
|   62  |   79  | 
|   63 class ScannerConstants : AllStatic { |   80 class ScannerConstants : AllStatic { | 
|   64  public: |   81  public: | 
|   65   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |   82   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 
|   66  |   83  | 
|   67   static StaticResource<Utf8Decoder>* utf8_decoder() { |   84   static StaticResource<Utf8Decoder>* utf8_decoder() { | 
|   68     return &utf8_decoder_; |   85     return &utf8_decoder_; | 
|   69   } |   86   } | 
|   70  |   87  | 
|   71   static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; |   88   static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; | 
|   72   static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |   89   static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; | 
|   73   static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |   90   static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; | 
|   74   static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |   91   static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; | 
|   75  |   92  | 
|   76   static bool IsIdentifier(unibrow::CharacterStream* buffer); |   93   static bool IsIdentifier(unibrow::CharacterStream* buffer); | 
|   77  |   94  | 
|   78  private: |   95  private: | 
|   79   static StaticResource<Utf8Decoder> utf8_decoder_; |   96   static StaticResource<Utf8Decoder> utf8_decoder_; | 
|   80 }; |   97 }; | 
|   81  |   98  | 
 |   99 // ---------------------------------------------------------------------------- | 
 |  100 // LiteralCollector -  Collector of chars of literals. | 
 |  101  | 
 |  102 class LiteralCollector { | 
 |  103  public: | 
 |  104   LiteralCollector(); | 
 |  105   ~LiteralCollector(); | 
 |  106  | 
 |  107   inline void AddChar(uc32 c) { | 
 |  108     if (recording_) { | 
 |  109       if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { | 
 |  110         buffer_.Add(static_cast<char>(c)); | 
 |  111       } else { | 
 |  112         AddCharSlow(c); | 
 |  113       } | 
 |  114     } | 
 |  115   } | 
 |  116  | 
 |  117   void StartLiteral() { | 
 |  118     buffer_.StartSequence(); | 
 |  119     recording_ = true; | 
 |  120   } | 
 |  121  | 
 |  122   Vector<const char> EndLiteral() { | 
 |  123     if (recording_) { | 
 |  124       recording_ = false; | 
 |  125       buffer_.Add(kEndMarker); | 
 |  126       Vector<char> sequence = buffer_.EndSequence(); | 
 |  127       return Vector<const char>(sequence.start(), sequence.length()); | 
 |  128     } | 
 |  129     return Vector<const char>(); | 
 |  130   } | 
 |  131  | 
 |  132   void DropLiteral() { | 
 |  133     if (recording_) { | 
 |  134       recording_ = false; | 
 |  135       buffer_.DropSequence(); | 
 |  136     } | 
 |  137   } | 
 |  138  | 
 |  139   void Reset() { | 
 |  140     buffer_.Reset(); | 
 |  141   } | 
 |  142  | 
 |  143   // The end marker added after a parsed literal. | 
 |  144   // Using zero allows the usage of strlen and similar functions on | 
 |  145   // identifiers and numbers (but not strings, since they may contain zero | 
 |  146   // bytes). | 
 |  147   static const char kEndMarker = '\x00'; | 
 |  148  private: | 
 |  149   static const int kInitialCapacity = 256; | 
 |  150   SequenceCollector<char, 4> buffer_; | 
 |  151   bool recording_; | 
 |  152   void AddCharSlow(uc32 c); | 
 |  153 }; | 
 |  154  | 
 |  155 // ---------------------------------------------------------------------------- | 
 |  156 // Scanner base-class. | 
 |  157  | 
 |  158 // Generic functionality used by both JSON and JavaScript scanners. | 
 |  159 class Scanner { | 
 |  160  public: | 
 |  161   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; | 
 |  162  | 
 |  163   class LiteralScope { | 
 |  164    public: | 
 |  165     explicit LiteralScope(Scanner* self); | 
 |  166     ~LiteralScope(); | 
 |  167     void Complete(); | 
 |  168  | 
 |  169    private: | 
 |  170     Scanner* scanner_; | 
 |  171     bool complete_; | 
 |  172   }; | 
 |  173  | 
 |  174   Scanner(); | 
 |  175  | 
 |  176   // Returns the current token again. | 
 |  177   Token::Value current_token() { return current_.token; } | 
 |  178  | 
 |  179   // One token look-ahead (past the token returned by Next()). | 
 |  180   Token::Value peek() const { return next_.token; } | 
 |  181  | 
 |  182   struct Location { | 
 |  183     Location(int b, int e) : beg_pos(b), end_pos(e) { } | 
 |  184     Location() : beg_pos(0), end_pos(0) { } | 
 |  185     int beg_pos; | 
 |  186     int end_pos; | 
 |  187   }; | 
 |  188  | 
 |  189   // Returns the location information for the current token | 
 |  190   // (the token returned by Next()). | 
 |  191   Location location() const { return current_.location; } | 
 |  192   Location peek_location() const { return next_.location; } | 
 |  193  | 
 |  194   // Returns the literal string, if any, for the current token (the | 
 |  195   // token returned by Next()). The string is 0-terminated and in | 
 |  196   // UTF-8 format; they may contain 0-characters. Literal strings are | 
 |  197   // collected for identifiers, strings, and numbers. | 
 |  198   // These functions only give the correct result if the literal | 
 |  199   // was scanned between calls to StartLiteral() and TerminateLiteral(). | 
 |  200   const char* literal_string() const { | 
 |  201     return current_.literal_chars.start(); | 
 |  202   } | 
 |  203  | 
 |  204   int literal_length() const { | 
 |  205     // Excluding terminal '\x00' added by TerminateLiteral(). | 
 |  206     return current_.literal_chars.length() - 1; | 
 |  207   } | 
 |  208  | 
 |  209   Vector<const char> literal() const { | 
 |  210     return Vector<const char>(literal_string(), literal_length()); | 
 |  211   } | 
 |  212  | 
 |  213   // Returns the literal string for the next token (the token that | 
 |  214   // would be returned if Next() were called). | 
 |  215   const char* next_literal_string() const { | 
 |  216     return next_.literal_chars.start(); | 
 |  217   } | 
 |  218  | 
 |  219  | 
 |  220   // Returns the length of the next token (that would be returned if | 
 |  221   // Next() were called). | 
 |  222   int next_literal_length() const { | 
 |  223     // Excluding terminal '\x00' added by TerminateLiteral(). | 
 |  224     return next_.literal_chars.length() - 1; | 
 |  225   } | 
 |  226  | 
 |  227   Vector<const char> next_literal() const { | 
 |  228     return Vector<const char>(next_literal_string(), next_literal_length()); | 
 |  229   } | 
 |  230  | 
 |  231   bool stack_overflow() { return stack_overflow_; } | 
 |  232  | 
 |  233   static const int kCharacterLookaheadBufferSize = 1; | 
 |  234  | 
 |  235  protected: | 
 |  236   // The current and look-ahead token. | 
 |  237   struct TokenDesc { | 
 |  238     Token::Value token; | 
 |  239     Location location; | 
 |  240     Vector<const char> literal_chars; | 
 |  241   }; | 
 |  242  | 
 |  243   // Call this after setting source_ to the input. | 
 |  244   void Init() { | 
 |  245     // Set c0_ (one character ahead) | 
 |  246     ASSERT(kCharacterLookaheadBufferSize == 1); | 
 |  247     Advance(); | 
 |  248     // Initialize current_ to not refer to a literal. | 
 |  249     current_.literal_chars = Vector<const char>(); | 
 |  250     // Reset literal buffer. | 
 |  251     literal_buffer_.Reset(); | 
 |  252   } | 
 |  253  | 
 |  254   // Literal buffer support | 
 |  255   inline void StartLiteral() { | 
 |  256     literal_buffer_.StartLiteral(); | 
 |  257   } | 
 |  258  | 
 |  259   inline void AddLiteralChar(uc32 c) { | 
 |  260     literal_buffer_.AddChar(c); | 
 |  261   } | 
 |  262  | 
 |  263   // Complete scanning of a literal. | 
 |  264   inline void TerminateLiteral() { | 
 |  265     next_.literal_chars = literal_buffer_.EndLiteral(); | 
 |  266   } | 
 |  267  | 
 |  268   // Stops scanning of a literal and drop the collected characters, | 
 |  269   // e.g., due to an encountered error. | 
 |  270   inline void DropLiteral() { | 
 |  271     literal_buffer_.DropLiteral(); | 
 |  272   } | 
 |  273  | 
 |  274   inline void AddLiteralCharAdvance() { | 
 |  275     AddLiteralChar(c0_); | 
 |  276     Advance(); | 
 |  277   } | 
 |  278  | 
 |  279   // Low-level scanning support. | 
 |  280   void Advance() { c0_ = source_->Advance(); } | 
 |  281   void PushBack(uc32 ch) { | 
 |  282     source_->PushBack(ch); | 
 |  283     c0_ = ch; | 
 |  284   } | 
 |  285  | 
 |  286   inline Token::Value Select(Token::Value tok) { | 
 |  287     Advance(); | 
 |  288     return tok; | 
 |  289   } | 
 |  290  | 
 |  291   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { | 
 |  292     Advance(); | 
 |  293     if (c0_ == next) { | 
 |  294       Advance(); | 
 |  295       return then; | 
 |  296     } else { | 
 |  297       return else_; | 
 |  298     } | 
 |  299   } | 
 |  300  | 
 |  301   uc32 ScanHexEscape(uc32 c, int length); | 
 |  302   uc32 ScanOctalEscape(uc32 c, int length); | 
 |  303  | 
 |  304   // Return the current source position. | 
 |  305   int source_pos() { | 
 |  306     return source_->pos() - kCharacterLookaheadBufferSize; | 
 |  307   } | 
 |  308  | 
 |  309   TokenDesc current_;  // desc for current token (as returned by Next()) | 
 |  310   TokenDesc next_;     // desc for next token (one token look-ahead) | 
 |  311  | 
 |  312   // Input stream. Must be initialized to an UTF16Buffer. | 
 |  313   UTF16Buffer* source_; | 
 |  314  | 
 |  315   // Buffer to hold literal values (identifiers, strings, numbers) | 
 |  316   // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. | 
 |  317   LiteralCollector literal_buffer_; | 
 |  318  | 
 |  319   bool stack_overflow_; | 
 |  320  | 
 |  321   // One Unicode character look-ahead; c0_ < 0 at the end of the input. | 
 |  322   uc32 c0_; | 
 |  323 }; | 
 |  324  | 
 |  325 // ---------------------------------------------------------------------------- | 
 |  326 // JavaScriptScanner - base logic for JavaScript scanning. | 
 |  327  | 
 |  328 class JavaScriptScanner : public Scanner { | 
 |  329  public: | 
 |  330   JavaScriptScanner(); | 
 |  331  | 
 |  332   // Returns the next token. | 
 |  333   Token::Value Next(); | 
 |  334  | 
 |  335   // Returns true if there was a line terminator before the peek'ed token. | 
 |  336   bool has_line_terminator_before_next() const { | 
 |  337     return has_line_terminator_before_next_; | 
 |  338   } | 
 |  339  | 
 |  340   // Scans the input as a regular expression pattern, previous | 
 |  341   // character(s) must be /(=). Returns true if a pattern is scanned. | 
 |  342   bool ScanRegExpPattern(bool seen_equal); | 
 |  343   // Returns true if regexp flags are scanned (always since flags can | 
 |  344   // be empty). | 
 |  345   bool ScanRegExpFlags(); | 
 |  346  | 
 |  347   // Tells whether the buffer contains an identifier (no escapes). | 
 |  348   // Used for checking if a property name is an identifier. | 
 |  349   static bool IsIdentifier(unibrow::CharacterStream* buffer); | 
 |  350  | 
 |  351   // Seek forward to the given position.  This operation does not | 
 |  352   // work in general, for instance when there are pushed back | 
 |  353   // characters, but works for seeking forward until simple delimiter | 
 |  354   // tokens, which is what it is used for. | 
 |  355   void SeekForward(int pos); | 
 |  356  | 
 |  357  protected: | 
 |  358   bool SkipWhiteSpace(); | 
 |  359   Token::Value SkipSingleLineComment(); | 
 |  360   Token::Value SkipMultiLineComment(); | 
 |  361  | 
 |  362   // Scans a single JavaScript token. | 
 |  363   void Scan(); | 
 |  364  | 
 |  365   void ScanDecimalDigits(); | 
 |  366   Token::Value ScanNumber(bool seen_period); | 
 |  367   Token::Value ScanIdentifier(); | 
 |  368  | 
 |  369   void ScanEscape(); | 
 |  370   Token::Value ScanString(); | 
 |  371  | 
 |  372   // Scans a possible HTML comment -- begins with '<!'. | 
 |  373   Token::Value ScanHtmlComment(); | 
 |  374  | 
 |  375   // Decodes a unicode escape-sequence which is part of an identifier. | 
 |  376   // If the escape sequence cannot be decoded the result is kBadChar. | 
 |  377   uc32 ScanIdentifierUnicodeEscape(); | 
 |  378  | 
 |  379   bool has_line_terminator_before_next_; | 
 |  380 }; | 
 |  381  | 
 |  382  | 
 |  383 // ---------------------------------------------------------------------------- | 
 |  384 // Keyword matching state machine. | 
|   82  |  385  | 
|   83 class KeywordMatcher { |  386 class KeywordMatcher { | 
|   84 //  Incrementally recognize keywords. |  387 //  Incrementally recognize keywords. | 
|   85 // |  388 // | 
|   86 //  Recognized keywords: |  389 //  Recognized keywords: | 
|   87 //      break case catch const* continue debugger* default delete do else |  390 //      break case catch const* continue debugger* default delete do else | 
|   88 //      finally false for function if in instanceof native* new null |  391 //      finally false for function if in instanceof native* new null | 
|   89 //      return switch this throw true try typeof var void while with |  392 //      return switch this throw true try typeof var void while with | 
|   90 // |  393 // | 
|   91 //  *: Actually "future reserved keywords". These are the only ones we |  394 //  *: Actually "future reserved keywords". These are the only ones we | 
| (...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
|  197   // keyword with the current prefix). |  500   // keyword with the current prefix). | 
|  198   const char* keyword_; |  501   const char* keyword_; | 
|  199   int counter_; |  502   int counter_; | 
|  200   Token::Value keyword_token_; |  503   Token::Value keyword_token_; | 
|  201 }; |  504 }; | 
|  202  |  505  | 
|  203  |  506  | 
|  204 } }  // namespace v8::internal |  507 } }  // namespace v8::internal | 
|  205  |  508  | 
|  206 #endif  // V8_SCANNER_BASE_H_ |  509 #endif  // V8_SCANNER_BASE_H_ | 
| OLD | NEW |