Index: src/scanner-base.h |
diff --git a/src/scanner-base.h b/src/scanner-base.h |
index 50f30305c4aa3e150bedb3965668c56678d8af1b..4952f1cd650cea9f9447c01a75c95ad7e2f9aa5c 100644 |
--- a/src/scanner-base.h |
+++ b/src/scanner-base.h |
@@ -37,11 +37,24 @@ |
#include "unicode-inl.h" |
#include "char-predicates.h" |
#include "utils.h" |
+#include "list-inl.h" |
namespace v8 { |
namespace internal { |
-// Interface through which the scanner reads characters from the input source. |
+// Returns the value (0 .. 15) of a hexadecimal character c. |
+// If c is not a legal hexadecimal character, returns a value < 0. |
+// Accepts '0'..'9', 'a'..'f' and 'A'..'F'; everything else yields -1. |
+inline int HexValue(uc32 c) { |
+  // Decimal digits: after rebasing on '0' they are 0..9. The unsigned |
+  // comparison also rejects characters below '0' in the same test. |
+  c -= '0'; |
+  if (static_cast<unsigned>(c) <= 9) return c; |
+  // Letters: relative to '0', 'A'..'F' are 0x11..0x16 and 'a'..'f' are |
+  // 0x31..0x36. Setting bit 0x20 folds upper case onto lower case, and the |
+  // subtraction rebases 'a' to 0. |
+  c = (c | 0x20) - ('a' - '0'); |
+  // Exactly six letters are hex digits, i.e. offsets 0..5 ('a'..'f'). |
+  if (static_cast<unsigned>(c) <= 5) return c + 10; |
+  return -1; |
+} |
+ |
+// ---------------------------------------------------------------------------- |
+// UTF16Buffer - scanner input source with pushback. |
+ |
class UTF16Buffer { |
public: |
UTF16Buffer(); |
@@ -54,7 +67,11 @@ class UTF16Buffer { |
int pos() const { return pos_; } |
+ static const int kNoEndPosition = 1; |
+ |
protected: |
+ // Refers to kNoEndPosition (now declared in the public section above): |
+ // the initial value of end_ before the input stream is initialized. |
+ |
int pos_; // Current position in the buffer. |
int end_; // Position where scanning should stop (EOF). |
}; |
@@ -79,6 +96,292 @@ class ScannerConstants : AllStatic { |
static StaticResource<Utf8Decoder> utf8_decoder_; |
}; |
+// ---------------------------------------------------------------------------- |
+// LiteralCollector - Collector of chars of literals. |
+ |
+class LiteralCollector { |
+ public: |
+  LiteralCollector(); |
+  ~LiteralCollector(); |
+ |
+  // Appends c to the literal currently being recorded. Does nothing while |
+  // no literal is being recorded. Characters that fit in one UTF-8 byte are |
+  // stored directly; all others are delegated to AddCharSlow(). |
+  inline void AddChar(uc32 c) { |
+    if (!recording_) return; |
+    if (static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar) { |
+      AddCharSlow(c); |
+      return; |
+    } |
+    buffer_.Add(static_cast<char>(c)); |
+  } |
+ |
+  // Opens a new sequence in the buffer and switches recording on. |
+  void StartLiteral() { |
+    buffer_.StartSequence(); |
+    recording_ = true; |
+  } |
+ |
+  // Finishes the literal being recorded: appends kEndMarker, closes the |
+  // sequence and returns its characters (end marker included). Returns an |
+  // empty vector when no literal was being recorded. |
+  Vector<const char> EndLiteral() { |
+    if (!recording_) return Vector<const char>(); |
+    recording_ = false; |
+    buffer_.Add(kEndMarker); |
+    Vector<char> chars = buffer_.EndSequence(); |
+    return Vector<const char>(chars.start(), chars.length()); |
+  } |
+ |
+  // Abandons the literal being recorded, if any, discarding its characters. |
+  void DropLiteral() { |
+    if (!recording_) return; |
+    recording_ = false; |
+    buffer_.DropSequence(); |
+  } |
+ |
+  // Resets the underlying character buffer. |
+  void Reset() { |
+    buffer_.Reset(); |
+  } |
+ |
+  // Marker appended after every completed literal. Because it is zero, |
+  // strlen and similar functions work on identifiers and numbers (but not |
+  // on strings, which may themselves contain zero bytes). |
+  static const char kEndMarker = '\x00'; |
+ private: |
+  static const int kInitialCapacity = 256; |
+  SequenceCollector<char, 4> buffer_;  // Storage for the literal characters. |
+  bool recording_;  // True between StartLiteral() and End/DropLiteral(). |
+  // Out-of-line path of AddChar() for characters above kMaxOneByteChar. |
+  void AddCharSlow(uc32 c); |
+}; |
+ |
+// ---------------------------------------------------------------------------- |
+// Scanner base-class. |
+ |
+// Generic functionality used by both JSON and JavaScript scanners. |
+class Scanner { |
+ public: |
+  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
+ |
+  // Scope helper bound to a Scanner instance. Only declared here; presumably |
+  // it brackets the scanning of one literal (confirm against the definition). |
+  class LiteralScope { |
+   public: |
+    explicit LiteralScope(Scanner* self); |
+    ~LiteralScope(); |
+    void Complete(); |
+ |
+   private: |
+    Scanner* scanner_; |
+    bool complete_; |
+  }; |
+ |
+  Scanner(); |
+ |
+  // Returns the current token again, without advancing. |
+  Token::Value current_token() { return current_.token; } |
+ |
+  // Returns the look-ahead token (the one after the token returned by |
+  // Next()). |
+  Token::Value peek() const { return next_.token; } |
+ |
+  // Begin/end position pair describing where a token is located. |
+  struct Location { |
+    Location(int b, int e) : beg_pos(b), end_pos(e) { } |
+    Location() : beg_pos(0), end_pos(0) { } |
+    int beg_pos; |
+    int end_pos; |
+  }; |
+ |
+  // Location of the current token (the token returned by Next()) and of |
+  // the look-ahead token, respectively. |
+  Location location() const { return current_.location; } |
+  Location peek_location() const { return next_.location; } |
+ |
+  // Accessors for the literal string, if any, of the current token (the |
+  // token returned by Next()). The string is 0-terminated and UTF-8 |
+  // encoded; it may contain 0-characters. Literal strings are collected for |
+  // identifiers, strings, and numbers. |
+  // The results are only correct if the literal was scanned between calls |
+  // to StartLiteral() and TerminateLiteral(). |
+  const char* literal_string() const { |
+    return current_.literal_chars.start(); |
+  } |
+ |
+  int literal_length() const { |
+    // The terminal '\x00' added by TerminateLiteral() is not counted. |
+    const int stored_length = current_.literal_chars.length(); |
+    return stored_length - 1; |
+  } |
+ |
+  Vector<const char> literal() const { |
+    const char* chars = literal_string(); |
+    const int length = literal_length(); |
+    return Vector<const char>(chars, length); |
+  } |
+ |
+  // Same accessors for the next token (the token that would be returned if |
+  // Next() were called). |
+  const char* next_literal_string() const { |
+    return next_.literal_chars.start(); |
+  } |
+ |
+ |
+  // Length of the next token's literal, again without the terminal '\x00' |
+  // added by TerminateLiteral(). |
+  int next_literal_length() const { |
+    const int stored_length = next_.literal_chars.length(); |
+    return stored_length - 1; |
+  } |
+ |
+  Vector<const char> next_literal() const { |
+    const char* chars = next_literal_string(); |
+    const int length = next_literal_length(); |
+    return Vector<const char>(chars, length); |
+  } |
+ |
+  bool stack_overflow() { return stack_overflow_; } |
+ |
+  // Number of characters of look-ahead the scanner keeps (c0_). |
+  static const int kCharacterLookaheadBufferSize = 1; |
+ |
+ protected: |
+  // Descriptor used for both the current and the look-ahead token. |
+  struct TokenDesc { |
+    Token::Value token; |
+    Location location; |
+    Vector<const char> literal_chars; |
+  }; |
+ |
+  // Must be called after source_ has been set to the input. |
+  void Init() { |
+    // Prime c0_, the single character of look-ahead. |
+    ASSERT(kCharacterLookaheadBufferSize == 1); |
+    Advance(); |
+    // The current token starts out without a literal. |
+    current_.literal_chars = Vector<const char>(); |
+    // Start from an empty literal buffer. |
+    literal_buffer_.Reset(); |
+  } |
+ |
+  // Literal buffer support: thin wrappers around literal_buffer_. |
+  inline void StartLiteral() { |
+    literal_buffer_.StartLiteral(); |
+  } |
+ |
+  inline void AddLiteralChar(uc32 c) { |
+    literal_buffer_.AddChar(c); |
+  } |
+ |
+  // Completes the literal being scanned and attaches it to the look-ahead |
+  // token. |
+  inline void TerminateLiteral() { |
+    next_.literal_chars = literal_buffer_.EndLiteral(); |
+  } |
+ |
+  // Stops scanning a literal and discards the characters collected so far, |
+  // e.g., because an error was encountered. |
+  inline void DropLiteral() { |
+    literal_buffer_.DropLiteral(); |
+  } |
+ |
+  // Records the look-ahead character in the literal, then moves past it. |
+  inline void AddLiteralCharAdvance() { |
+    AddLiteralChar(c0_); |
+    Advance(); |
+  } |
+ |
+  // Low-level scanning support. |
+  // Advance() loads the next character from the source into c0_. |
+  void Advance() { c0_ = source_->Advance(); } |
+  // PushBack() hands ch back to the source and makes it the look-ahead. |
+  void PushBack(uc32 ch) { |
+    source_->PushBack(ch); |
+    c0_ = ch; |
+  } |
+ |
+  // Consumes one character and yields tok. |
+  inline Token::Value Select(Token::Value tok) { |
+    Advance(); |
+    return tok; |
+  } |
+ |
+  // Consumes one character. If the character after it equals next, that |
+  // one is consumed as well and then is returned; otherwise else_. |
+  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { |
+    Advance(); |
+    if (c0_ != next) return else_; |
+    Advance(); |
+    return then; |
+  } |
+ |
+  uc32 ScanHexEscape(uc32 c, int length); |
+  uc32 ScanOctalEscape(uc32 c, int length); |
+ |
+  // Current source position: the buffer position corrected for the |
+  // look-ahead character that has already been read. |
+  int source_pos() { |
+    const int buffer_pos = source_->pos(); |
+    return buffer_pos - kCharacterLookaheadBufferSize; |
+  } |
+ |
+  TokenDesc current_;  // Descriptor of the current token (as returned by Next()). |
+  TokenDesc next_;  // Descriptor of the next token (one token look-ahead). |
+ |
+  // Input stream. Must be initialized to an UTF16Buffer. |
+  UTF16Buffer* source_; |
+ |
+  // Holds literal values (identifiers, strings, numbers) in |
+  // '\x00'-terminated UTF-8 encoding. Handles allocation internally. |
+  LiteralCollector literal_buffer_; |
+ |
+  bool stack_overflow_; |
+ |
+  // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
+  uc32 c0_; |
+}; |
+ |
+// ---------------------------------------------------------------------------- |
+// JavaScriptScanner - base logic for JavaScript scanning. |
+ |
+// Scanner subclass that declares the JavaScript-specific scanning interface. |
+// Apart from the has_line_terminator_before_next() accessor, the member |
+// functions are only declared here; their definitions are not part of this |
+// section of the file. |
+class JavaScriptScanner : public Scanner { |
+ public: |
+ JavaScriptScanner(); |
+ |
+ // Returns the next token. |
+ Token::Value Next(); |
+ |
+ // Returns true if there was a line terminator before the peek'ed token. |
+ // Simply reports the has_line_terminator_before_next_ flag. |
+ bool has_line_terminator_before_next() const { |
+ return has_line_terminator_before_next_; |
+ } |
+ |
+ // Scans the input as a regular expression pattern, previous |
+ // character(s) must be /(=). Returns true if a pattern is scanned. |
+ // NOTE(review): seen_equal presumably tells that the '=' of a leading "/=" |
+ // was already consumed -- confirm against the definition. |
+ bool ScanRegExpPattern(bool seen_equal); |
+ // Returns true if regexp flags are scanned (always since flags can |
+ // be empty). |
+ bool ScanRegExpFlags(); |
+ |
+ // Tells whether the buffer contains an identifier (no escapes). |
+ // Used for checking if a property name is an identifier. |
+ static bool IsIdentifier(unibrow::CharacterStream* buffer); |
+ |
+ // Seek forward to the given position. This operation does not |
+ // work in general, for instance when there are pushed back |
+ // characters, but works for seeking forward until simple delimiter |
+ // tokens, which is what it is used for. |
+ void SeekForward(int pos); |
+ |
+ protected: |
+ // Whitespace and comment skipping helpers. The meaning of their return |
+ // values is determined by the definitions, which are not visible here. |
+ bool SkipWhiteSpace(); |
+ Token::Value SkipSingleLineComment(); |
+ Token::Value SkipMultiLineComment(); |
+ |
+ // Scans a single JavaScript token. |
+ void Scan(); |
+ |
+ // Helpers for scanning numbers and identifiers. |
+ // NOTE(review): seen_period presumably indicates that a leading '.' was |
+ // already consumed -- confirm against the definition. |
+ void ScanDecimalDigits(); |
+ Token::Value ScanNumber(bool seen_period); |
+ Token::Value ScanIdentifier(); |
+ |
+ // Helpers for scanning string literals and the escapes inside them. |
+ void ScanEscape(); |
+ Token::Value ScanString(); |
+ |
+ // Scans a possible HTML comment -- begins with '<!'. |
+ Token::Value ScanHtmlComment(); |
+ |
+ // Decodes a unicode escape-sequence which is part of an identifier. |
+ // If the escape sequence cannot be decoded the result is kBadChar. |
+ uc32 ScanIdentifierUnicodeEscape(); |
+ |
+ // Backing flag for has_line_terminator_before_next(). |
+ bool has_line_terminator_before_next_; |
+}; |
+ |
+ |
+// ---------------------------------------------------------------------------- |
+// Keyword matching state machine. |
class KeywordMatcher { |
// Incrementally recognize keywords. |