Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(588)

Unified Diff: src/scanner.h

Issue 5136002: Extract scanner base/JS/JSON and move base and JS to scanner-base. (Closed)
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/prescanner.h ('k') | src/scanner.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/scanner.h
diff --git a/src/scanner.h b/src/scanner.h
index df5cd729491de7c3aa61649935d28867cf4e941c..daf39219839117ff596f1b9bd46054e8f667e599 100644
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -35,65 +35,6 @@
namespace v8 {
namespace internal {
-
-class UTF8Buffer {
- public:
- UTF8Buffer();
- ~UTF8Buffer();
-
- inline void AddChar(uc32 c) {
- if (recording_) {
- if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
- buffer_.Add(static_cast<char>(c));
- } else {
- AddCharSlow(c);
- }
- }
- }
-
- void StartLiteral() {
- buffer_.StartSequence();
- recording_ = true;
- }
-
- Vector<const char> EndLiteral() {
- if (recording_) {
- recording_ = false;
- buffer_.Add(kEndMarker);
- Vector<char> sequence = buffer_.EndSequence();
- return Vector<const char>(sequence.start(), sequence.length());
- }
- return Vector<const char>();
- }
-
- void DropLiteral() {
- if (recording_) {
- recording_ = false;
- buffer_.DropSequence();
- }
- }
-
- void Reset() {
- buffer_.Reset();
- }
-
- // The end marker added after a parsed literal.
- // Using zero allows the usage of strlen and similar functions on
- // identifiers and numbers (but not strings, since they may contain zero
- // bytes).
- // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
- // a UTF-8 string. This requires changes in all places that use
- // str-functions on the literals, but allows a single pointer to represent
- // the literal, even if it contains embedded zeros.
- static const char kEndMarker = '\x00';
- private:
- static const int kInitialCapacity = 256;
- SequenceCollector<char, 4> buffer_;
- bool recording_;
- void AddCharSlow(uc32 c);
-};
-
-
// UTF16 buffer to read characters from a character stream.
class CharacterStreamUTF16Buffer: public UTF16Buffer {
public:
@@ -134,175 +75,63 @@ class ExternalStringUTF16Buffer: public UTF16Buffer {
};
-enum ParserLanguage { JAVASCRIPT, JSON };
-
-
-class Scanner {
+// Initializes a UTF16Buffer as input stream, using one of a number
+// of strategies depending on the available character sources.
+class StreamInitializer {
public:
- typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+ UTF16Buffer* Init(Handle<String> source,
+ unibrow::CharacterStream* stream,
+ int start_position,
+ int end_position);
+ private:
+ // Different UTF16 buffers used to pull characters from. Based on input one of
+ // these will be initialized as the actual data source.
+ CharacterStreamUTF16Buffer char_stream_buffer_;
+ ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
+ two_byte_string_buffer_;
+ ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
- class LiteralScope {
- public:
- explicit LiteralScope(Scanner* self);
- ~LiteralScope();
- void Complete();
+ // Used to convert the source string into a character stream when a stream
+ // is not passed to the scanner.
+ SafeStringInputBuffer safe_string_input_buffer_;
+};
+
+// ----------------------------------------------------------------------------
+// V8JavaScriptScanner
+// JavaScript scanner getting its input from either a V8 String or a unicode
+// CharacterStream.
- private:
- Scanner* scanner_;
- bool complete_;
- };
+class V8JavaScriptScanner : public JavaScriptScanner {
+ public:
+ V8JavaScriptScanner() {}
- Scanner();
+ Token::Value NextCheckStack();
// Initialize the Scanner to scan source.
+ void Initialize(Handle<String> source);
void Initialize(Handle<String> source,
- ParserLanguage language);
- void Initialize(Handle<String> source,
- unibrow::CharacterStream* stream,
- ParserLanguage language);
+ unibrow::CharacterStream* stream);
void Initialize(Handle<String> source,
- int start_position, int end_position,
- ParserLanguage language);
+ int start_position, int end_position);
- // Returns the next token.
- Token::Value Next();
-
- // Returns the current token again.
- Token::Value current_token() { return current_.token; }
-
- // One token look-ahead (past the token returned by Next()).
- Token::Value peek() const { return next_.token; }
-
- // Returns true if there was a line terminator before the peek'ed token.
- bool has_line_terminator_before_next() const {
- return has_line_terminator_before_next_;
- }
-
- struct Location {
- Location(int b, int e) : beg_pos(b), end_pos(e) { }
- Location() : beg_pos(0), end_pos(0) { }
- int beg_pos;
- int end_pos;
- };
-
- // Returns the location information for the current token
- // (the token returned by Next()).
- Location location() const { return current_.location; }
- Location peek_location() const { return next_.location; }
-
- // Returns the literal string, if any, for the current token (the
- // token returned by Next()). The string is 0-terminated and in
- // UTF-8 format; they may contain 0-characters. Literal strings are
- // collected for identifiers, strings, and numbers.
- // These functions only give the correct result if the literal
- // was scanned between calls to StartLiteral() and TerminateLiteral().
- const char* literal_string() const {
- return current_.literal_chars.start();
- }
-
- int literal_length() const {
- // Excluding terminal '\x00' added by TerminateLiteral().
- return current_.literal_chars.length() - 1;
- }
-
- Vector<const char> literal() const {
- return Vector<const char>(literal_string(), literal_length());
- }
-
- // Returns the literal string for the next token (the token that
- // would be returned if Next() were called).
- const char* next_literal_string() const {
- return next_.literal_chars.start();
- }
-
-
- // Returns the length of the next token (that would be returned if
- // Next() were called).
- int next_literal_length() const {
- // Excluding terminal '\x00' added by TerminateLiteral().
- return next_.literal_chars.length() - 1;
- }
-
- Vector<const char> next_literal() const {
- return Vector<const char>(next_literal_string(), next_literal_length());
- }
-
- // Scans the input as a regular expression pattern, previous
- // character(s) must be /(=). Returns true if a pattern is scanned.
- bool ScanRegExpPattern(bool seen_equal);
- // Returns true if regexp flags are scanned (always since flags can
- // be empty).
- bool ScanRegExpFlags();
-
- // Seek forward to the given position. This operation does not
- // work in general, for instance when there are pushed back
- // characters, but works for seeking forward until simple delimiter
- // tokens, which is what it is used for.
- void SeekForward(int pos);
-
- bool stack_overflow() { return stack_overflow_; }
+ protected:
+ StreamInitializer stream_initializer_;
+};
- // Tells whether the buffer contains an identifier (no escapes).
- // Used for checking if a property name is an identifier.
- static bool IsIdentifier(unibrow::CharacterStream* buffer);
- static const int kCharacterLookaheadBufferSize = 1;
- static const int kNoEndPosition = 1;
+class JsonScanner : public Scanner {
+ public:
+ JsonScanner();
- private:
- // The current and look-ahead token.
- struct TokenDesc {
- Token::Value token;
- Location location;
- Vector<const char> literal_chars;
- };
-
- void Init(Handle<String> source,
- unibrow::CharacterStream* stream,
- int start_position, int end_position,
- ParserLanguage language);
-
- // Literal buffer support
- inline void StartLiteral();
- inline void AddLiteralChar(uc32 ch);
- inline void AddLiteralCharAdvance();
- inline void TerminateLiteral();
- // Stops scanning of a literal, e.g., due to an encountered error.
- inline void DropLiteral();
-
- // Low-level scanning support.
- void Advance() { c0_ = source_->Advance(); }
- void PushBack(uc32 ch) {
- source_->PushBack(ch);
- c0_ = ch;
- }
+ // Initialize the Scanner to scan source.
+ void Initialize(Handle<String> source);
- bool SkipWhiteSpace() {
- if (is_parsing_json_) {
- return SkipJsonWhiteSpace();
- } else {
- return SkipJavaScriptWhiteSpace();
- }
- }
+ // Returns the next token.
+ Token::Value Next();
- bool SkipJavaScriptWhiteSpace();
+ protected:
+ // Skip past JSON whitespace (only space, tab, newline and carriage-return).
bool SkipJsonWhiteSpace();
- Token::Value SkipSingleLineComment();
- Token::Value SkipMultiLineComment();
-
- inline Token::Value Select(Token::Value tok);
- inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
-
- inline void Scan() {
- if (is_parsing_json_) {
- ScanJson();
- } else {
- ScanJavaScript();
- }
- }
-
- // Scans a single JavaScript token.
- void ScanJavaScript();
// Scan a single JSON token. The JSON lexical grammar is specified in the
// ECMAScript 5 standard, section 15.12.1.1.
@@ -331,53 +160,7 @@ class Scanner {
// JSONNullLiteral).
Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
- void ScanDecimalDigits();
- Token::Value ScanNumber(bool seen_period);
- Token::Value ScanIdentifier();
- uc32 ScanHexEscape(uc32 c, int length);
- uc32 ScanOctalEscape(uc32 c, int length);
- void ScanEscape();
- Token::Value ScanString();
-
- // Scans a possible HTML comment -- begins with '<!'.
- Token::Value ScanHtmlComment();
-
- // Return the current source position.
- int source_pos() {
- return source_->pos() - kCharacterLookaheadBufferSize;
- }
-
- // Decodes a unicode escape-sequence which is part of an identifier.
- // If the escape sequence cannot be decoded the result is kBadRune.
- uc32 ScanIdentifierUnicodeEscape();
-
- TokenDesc current_; // desc for current token (as returned by Next())
- TokenDesc next_; // desc for next token (one token look-ahead)
- bool has_line_terminator_before_next_;
- bool is_parsing_json_;
-
- // Different UTF16 buffers used to pull characters from. Based on input one of
- // these will be initialized as the actual data source.
- CharacterStreamUTF16Buffer char_stream_buffer_;
- ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
- two_byte_string_buffer_;
- ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
-
- // Source. Will point to one of the buffers declared above.
- UTF16Buffer* source_;
-
- // Used to convert the source string into a character stream when a stream
- // is not passed to the scanner.
- SafeStringInputBuffer safe_string_input_buffer_;
-
- // Buffer to hold literal values (identifiers, strings, numbers)
- // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
- UTF8Buffer literal_buffer_;
-
- bool stack_overflow_;
-
- // One Unicode character look-ahead; c0_ < 0 at the end of the input.
- uc32 c0_;
+ StreamInitializer stream_initializer_;
};
@@ -400,7 +183,7 @@ void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
SeekForward(start_position);
}
end_ =
- end_position != Scanner::kNoEndPosition ? end_position : data->length();
+ end_position != kNoEndPosition ? end_position : data->length();
}
« no previous file with comments | « src/prescanner.h ('k') | src/scanner.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698