Index: src/scanner.h |
diff --git a/src/scanner.h b/src/scanner.h |
index df5cd729491de7c3aa61649935d28867cf4e941c..daf39219839117ff596f1b9bd46054e8f667e599 100644 |
--- a/src/scanner.h |
+++ b/src/scanner.h |
@@ -35,65 +35,6 @@ |
namespace v8 { |
namespace internal { |
- |
-class UTF8Buffer { |
- public: |
- UTF8Buffer(); |
- ~UTF8Buffer(); |
- |
- inline void AddChar(uc32 c) { |
- if (recording_) { |
- if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { |
- buffer_.Add(static_cast<char>(c)); |
- } else { |
- AddCharSlow(c); |
- } |
- } |
- } |
- |
- void StartLiteral() { |
- buffer_.StartSequence(); |
- recording_ = true; |
- } |
- |
- Vector<const char> EndLiteral() { |
- if (recording_) { |
- recording_ = false; |
- buffer_.Add(kEndMarker); |
- Vector<char> sequence = buffer_.EndSequence(); |
- return Vector<const char>(sequence.start(), sequence.length()); |
- } |
- return Vector<const char>(); |
- } |
- |
- void DropLiteral() { |
- if (recording_) { |
- recording_ = false; |
- buffer_.DropSequence(); |
- } |
- } |
- |
- void Reset() { |
- buffer_.Reset(); |
- } |
- |
- // The end marker added after a parsed literal. |
- // Using zero allows the usage of strlen and similar functions on |
- // identifiers and numbers (but not strings, since they may contain zero |
- // bytes). |
- // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside |
- // an utf-8 string. This requires changes in all places that uses |
- // str-functions on the literals, but allows a single pointer to represent |
- // the literal, even if it contains embedded zeros. |
- static const char kEndMarker = '\x00'; |
- private: |
- static const int kInitialCapacity = 256; |
- SequenceCollector<char, 4> buffer_; |
- bool recording_; |
- void AddCharSlow(uc32 c); |
-}; |
- |
- |
// UTF16 buffer to read characters from a character stream. |
class CharacterStreamUTF16Buffer: public UTF16Buffer { |
public: |
@@ -134,175 +75,63 @@ class ExternalStringUTF16Buffer: public UTF16Buffer { |
}; |
-enum ParserLanguage { JAVASCRIPT, JSON }; |
- |
- |
-class Scanner { |
+// Initializes a UTF16Buffer as input stream, using one of a number |
+// of strategies depending on the available character sources. |
+class StreamInitializer { |
public: |
- typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
+ UTF16Buffer* Init(Handle<String> source, |
+ unibrow::CharacterStream* stream, |
+ int start_position, |
+ int end_position); |
+ private: |
+ // Different UTF16 buffers used to pull characters from. Based on input one of |
+ // these will be initialized as the actual data source. |
+ CharacterStreamUTF16Buffer char_stream_buffer_; |
+ ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
+ two_byte_string_buffer_; |
+ ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
- class LiteralScope { |
- public: |
- explicit LiteralScope(Scanner* self); |
- ~LiteralScope(); |
- void Complete(); |
+ // Used to convert the source string into a character stream when a stream |
+ // is not passed to the scanner. |
+ SafeStringInputBuffer safe_string_input_buffer_; |
+}; |
+ |
+// ---------------------------------------------------------------------------- |
+// V8JavaScriptScanner |
+// JavaScript scanner getting its input from either a V8 String or a unicode |
+// CharacterStream. |
- private: |
- Scanner* scanner_; |
- bool complete_; |
- }; |
+class V8JavaScriptScanner : public JavaScriptScanner { |
+ public: |
+ V8JavaScriptScanner() {} |
- Scanner(); |
+ Token::Value NextCheckStack(); |
// Initialize the Scanner to scan source. |
+ void Initialize(Handle<String> source); |
void Initialize(Handle<String> source, |
- ParserLanguage language); |
- void Initialize(Handle<String> source, |
- unibrow::CharacterStream* stream, |
- ParserLanguage language); |
+ unibrow::CharacterStream* stream); |
void Initialize(Handle<String> source, |
- int start_position, int end_position, |
- ParserLanguage language); |
+ int start_position, int end_position); |
- // Returns the next token. |
- Token::Value Next(); |
- |
- // Returns the current token again. |
- Token::Value current_token() { return current_.token; } |
- |
- // One token look-ahead (past the token returned by Next()). |
- Token::Value peek() const { return next_.token; } |
- |
- // Returns true if there was a line terminator before the peek'ed token. |
- bool has_line_terminator_before_next() const { |
- return has_line_terminator_before_next_; |
- } |
- |
- struct Location { |
- Location(int b, int e) : beg_pos(b), end_pos(e) { } |
- Location() : beg_pos(0), end_pos(0) { } |
- int beg_pos; |
- int end_pos; |
- }; |
- |
- // Returns the location information for the current token |
- // (the token returned by Next()). |
- Location location() const { return current_.location; } |
- Location peek_location() const { return next_.location; } |
- |
- // Returns the literal string, if any, for the current token (the |
- // token returned by Next()). The string is 0-terminated and in |
- // UTF-8 format; they may contain 0-characters. Literal strings are |
- // collected for identifiers, strings, and numbers. |
- // These functions only give the correct result if the literal |
- // was scanned between calls to StartLiteral() and TerminateLiteral(). |
- const char* literal_string() const { |
- return current_.literal_chars.start(); |
- } |
- |
- int literal_length() const { |
- // Excluding terminal '\x00' added by TerminateLiteral(). |
- return current_.literal_chars.length() - 1; |
- } |
- |
- Vector<const char> literal() const { |
- return Vector<const char>(literal_string(), literal_length()); |
- } |
- |
- // Returns the literal string for the next token (the token that |
- // would be returned if Next() were called). |
- const char* next_literal_string() const { |
- return next_.literal_chars.start(); |
- } |
- |
- |
- // Returns the length of the next token (that would be returned if |
- // Next() were called). |
- int next_literal_length() const { |
- // Excluding terminal '\x00' added by TerminateLiteral(). |
- return next_.literal_chars.length() - 1; |
- } |
- |
- Vector<const char> next_literal() const { |
- return Vector<const char>(next_literal_string(), next_literal_length()); |
- } |
- |
- // Scans the input as a regular expression pattern, previous |
- // character(s) must be /(=). Returns true if a pattern is scanned. |
- bool ScanRegExpPattern(bool seen_equal); |
- // Returns true if regexp flags are scanned (always since flags can |
- // be empty). |
- bool ScanRegExpFlags(); |
- |
- // Seek forward to the given position. This operation does not |
- // work in general, for instance when there are pushed back |
- // characters, but works for seeking forward until simple delimiter |
- // tokens, which is what it is used for. |
- void SeekForward(int pos); |
- |
- bool stack_overflow() { return stack_overflow_; } |
+ protected: |
+ StreamInitializer stream_initializer_; |
+}; |
- // Tells whether the buffer contains an identifier (no escapes). |
- // Used for checking if a property name is an identifier. |
- static bool IsIdentifier(unibrow::CharacterStream* buffer); |
- static const int kCharacterLookaheadBufferSize = 1; |
- static const int kNoEndPosition = 1; |
+class JsonScanner : public Scanner { |
+ public: |
+ JsonScanner(); |
- private: |
- // The current and look-ahead token. |
- struct TokenDesc { |
- Token::Value token; |
- Location location; |
- Vector<const char> literal_chars; |
- }; |
- |
- void Init(Handle<String> source, |
- unibrow::CharacterStream* stream, |
- int start_position, int end_position, |
- ParserLanguage language); |
- |
- // Literal buffer support |
- inline void StartLiteral(); |
- inline void AddLiteralChar(uc32 ch); |
- inline void AddLiteralCharAdvance(); |
- inline void TerminateLiteral(); |
- // Stops scanning of a literal, e.g., due to an encountered error. |
- inline void DropLiteral(); |
- |
- // Low-level scanning support. |
- void Advance() { c0_ = source_->Advance(); } |
- void PushBack(uc32 ch) { |
- source_->PushBack(ch); |
- c0_ = ch; |
- } |
+ // Initialize the Scanner to scan source. |
+ void Initialize(Handle<String> source); |
- bool SkipWhiteSpace() { |
- if (is_parsing_json_) { |
- return SkipJsonWhiteSpace(); |
- } else { |
- return SkipJavaScriptWhiteSpace(); |
- } |
- } |
+ // Returns the next token. |
+ Token::Value Next(); |
- bool SkipJavaScriptWhiteSpace(); |
+ protected: |
+ // Skip past JSON whitespace (only space, tab, newline and carriage-return). |
bool SkipJsonWhiteSpace(); |
- Token::Value SkipSingleLineComment(); |
- Token::Value SkipMultiLineComment(); |
- |
- inline Token::Value Select(Token::Value tok); |
- inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); |
- |
- inline void Scan() { |
- if (is_parsing_json_) { |
- ScanJson(); |
- } else { |
- ScanJavaScript(); |
- } |
- } |
- |
- // Scans a single JavaScript token. |
- void ScanJavaScript(); |
// Scan a single JSON token. The JSON lexical grammar is specified in the |
// ECMAScript 5 standard, section 15.12.1.1. |
@@ -331,53 +160,7 @@ class Scanner { |
// JSONNullLiteral). |
Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
- void ScanDecimalDigits(); |
- Token::Value ScanNumber(bool seen_period); |
- Token::Value ScanIdentifier(); |
- uc32 ScanHexEscape(uc32 c, int length); |
- uc32 ScanOctalEscape(uc32 c, int length); |
- void ScanEscape(); |
- Token::Value ScanString(); |
- |
- // Scans a possible HTML comment -- begins with '<!'. |
- Token::Value ScanHtmlComment(); |
- |
- // Return the current source position. |
- int source_pos() { |
- return source_->pos() - kCharacterLookaheadBufferSize; |
- } |
- |
- // Decodes a unicode escape-sequence which is part of an identifier. |
- // If the escape sequence cannot be decoded the result is kBadRune. |
- uc32 ScanIdentifierUnicodeEscape(); |
- |
- TokenDesc current_; // desc for current token (as returned by Next()) |
- TokenDesc next_; // desc for next token (one token look-ahead) |
- bool has_line_terminator_before_next_; |
- bool is_parsing_json_; |
- |
- // Different UTF16 buffers used to pull characters from. Based on input one of |
- // these will be initialized as the actual data source. |
- CharacterStreamUTF16Buffer char_stream_buffer_; |
- ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
- two_byte_string_buffer_; |
- ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
- |
- // Source. Will point to one of the buffers declared above. |
- UTF16Buffer* source_; |
- |
- // Used to convert the source string into a character stream when a stream |
- // is not passed to the scanner. |
- SafeStringInputBuffer safe_string_input_buffer_; |
- |
- // Buffer to hold literal values (identifiers, strings, numbers) |
- // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. |
- UTF8Buffer literal_buffer_; |
- |
- bool stack_overflow_; |
- |
- // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
- uc32 c0_; |
+ StreamInitializer stream_initializer_; |
}; |
@@ -400,7 +183,7 @@ void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( |
SeekForward(start_position); |
} |
end_ = |
- end_position != Scanner::kNoEndPosition ? end_position : data->length(); |
+ end_position != kNoEndPosition ? end_position : data->length(); |
} |