src/scanner-base.h - Issue 6075005: Change scanner buffers to not use utf-8.

Unified Diff: src/scanner-base.h

Issue 6075005: Change scanner buffers to not use utf-8. (Closed)

Patch Set: Fixed linto. Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/scanner-base.h

diff --git a/src/scanner-base.h b/src/scanner-base.h

index c50b8f3ef62dee77f0daba35aa61225de62a9d04..e773586d044ef047e5a0a88dba2f0754e3a6a412 100644

--- a/src/scanner-base.h

+++ b/src/scanner-base.h

@@ -141,61 +141,103 @@ class ScannerConstants : AllStatic {

};

// ----------------------------------------------------------------------------

-// LiteralCollector - Collector of chars of literals.

+// LiteralBuffer - Collector of chars of literals.

-class LiteralCollector {

+class LiteralBuffer {

public:

- LiteralCollector();

- ~LiteralCollector();

- inline void AddChar(uc32 c) {

- if (recording_) {

- if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {

- buffer_.Add(static_cast<char>(c));

- } else {

- AddCharSlow(c);

+ LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }

+ ~LiteralBuffer() {

+ if (backing_store_.length() > 0) {

+ backing_store_.Dispose();

+ }

+ inline void AddChar(uc16 character) {

+ if (position_ >= backing_store_.length()) ExpandBuffer();

+ if (is_ascii_) {

+ if (character < kMaxAsciiCharCodeU) {

+ backing_store_[position_] = static_cast<byte>(character);

+ position_ += kASCIISize;

+ return;

}

+ ConvertToUC16();

}

+ *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;

+ position_ += kUC16Size;

}

- void StartLiteral() {

- buffer_.StartSequence();

- recording_ = true;

+ bool is_ascii() { return is_ascii_; }

+ Vector<const uc16> uc16_literal() {

+ ASSERT(!is_ascii_);

+ ASSERT((position_ & 0x1) == 0);

+ return Vector<const uc16>(

+ reinterpret_cast<const uc16*>(backing_store_.start()),

+ position_ >> 1);

}

- Vector<const char> EndLiteral() {

- if (recording_) {

- recording_ = false;

- buffer_.Add(kEndMarker);

- Vector<char> sequence = buffer_.EndSequence();

- return Vector<const char>(sequence.start(), sequence.length());

- }

- return Vector<const char>();

+ Vector<const char> ascii_literal() {

+ ASSERT(is_ascii_);

+ return Vector<const char>(

+ reinterpret_cast<const char*>(backing_store_.start()),

+ position_);

}

- void DropLiteral() {

- if (recording_) {

- recording_ = false;

- buffer_.DropSequence();

- }

+ int length() {

+ return is_ascii_ ? position_ : (position_ >> 1);

}

void Reset() {

- buffer_.Reset();

+ position_ = 0;

+ is_ascii_ = true;

}

- // The end marker added after a parsed literal.

- // Using zero allows the usage of strlen and similar functions on

- // identifiers and numbers (but not strings, since they may contain zero

- // bytes).

- static const char kEndMarker = '\x00';

private:

- static const int kInitialCapacity = 256;

- SequenceCollector<char, 4> buffer_;

- bool recording_;

- void AddCharSlow(uc32 c);

+ static const int kInitialCapacity = 16;

+ static const int kGrowthFactory = 4;

+ static const int kMinConversionSlack = 256;

+ static const int kMaxGrowth = 1 * MB;

+ inline int NewCapacity(int min_capacity) {

+ int capacity = Max(min_capacity, backing_store_.length());

+ int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);

+ return new_capacity;

+ }

+ void ExpandBuffer() {

+ Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));

+ memcpy(new_store.start(), backing_store_.start(), position_);

+ backing_store_.Dispose();

+ backing_store_ = new_store;

+ }

+ void ConvertToUC16() {

+ ASSERT(is_ascii_);

+ Vector<byte> new_store;

+ int new_content_size = position_ * kUC16Size;

+ if (new_content_size > backing_store_.length()) {

+ new_store = Vector<byte>::New(NewCapacity(new_content_size));

+ } else {

+ new_store = backing_store_;

+ }

+ char* src = reinterpret_cast<char*>(backing_store_.start());

+ uc16* dst = reinterpret_cast<uc16*>(new_store.start());

+ for (int i = position_ - 1; i >= 0; i--) {

+ dst[i] = src[i];

+ }

+ if (new_store.start() != backing_store_.start()) {

+ backing_store_.Dispose();

+ backing_store_ = new_store;

+ }

+ position_ = new_content_size;

+ is_ascii_ = false;

+ }

+ bool is_ascii_;

+ int position_;

+ Vector<byte> backing_store_;

};

// ----------------------------------------------------------------------------

// Scanner base-class.

@@ -241,35 +283,40 @@ class Scanner {

// collected for identifiers, strings, and numbers.

// These functions only give the correct result if the literal

// was scanned between calls to StartLiteral() and TerminateLiteral().

- const char* literal_string() const {

- return current_.literal_chars.start();

+ bool is_literal_ascii() {

+ ASSERT_NOT_NULL(current_.literal_chars);

+ return current_.literal_chars->is_ascii();

}

- int literal_length() const {

- // Excluding terminal '\x00' added by TerminateLiteral().

- return current_.literal_chars.length() - 1;

+ Vector<const char> literal_ascii_string() {

+ ASSERT_NOT_NULL(current_.literal_chars);

+ return current_.literal_chars->ascii_literal();

}

- Vector<const char> literal() const {

- return Vector<const char>(literal_string(), literal_length());

+ Vector<const uc16> literal_uc16_string() {

+ ASSERT_NOT_NULL(current_.literal_chars);

+ return current_.literal_chars->uc16_literal();

+ }

+ int literal_length() const {

+ ASSERT_NOT_NULL(current_.literal_chars);

+ return current_.literal_chars->length();

}

// Returns the literal string for the next token (the token that

// would be returned if Next() were called).

- const char* next_literal_string() const {

- return next_.literal_chars.start();

+ bool is_next_literal_ascii() {

+ ASSERT_NOT_NULL(next_.literal_chars);

+ return next_.literal_chars->is_ascii();

}

- // Returns the length of the next token (that would be returned if

- // Next() were called).

- int next_literal_length() const {

- // Excluding terminal '\x00' added by TerminateLiteral().

- return next_.literal_chars.length() - 1;

+ Vector<const char> next_literal_ascii_string() {

+ ASSERT_NOT_NULL(next_.literal_chars);

+ return next_.literal_chars->ascii_literal();

}

- Vector<const char> next_literal() const {

- return Vector<const char>(next_literal_string(), next_literal_length());

+ Vector<const uc16> next_literal_uc16_string() {

+ ASSERT_NOT_NULL(next_.literal_chars);

+ return next_.literal_chars->uc16_literal();

+ }

+ int next_literal_length() const {

+ ASSERT_NOT_NULL(next_.literal_chars);

+ return next_.literal_chars->length();

}

static const int kCharacterLookaheadBufferSize = 1;

@@ -279,7 +326,7 @@ class Scanner {

struct TokenDesc {

Token::Value token;

Location location;

- Vector<const char> literal_chars;

+ LiteralBuffer* literal_chars;

};

// Call this after setting source_ to the input.

@@ -288,29 +335,31 @@ class Scanner {

ASSERT(kCharacterLookaheadBufferSize == 1);

Advance();

// Initialize current_ to not refer to a literal.

- current_.literal_chars = Vector<const char>();

- // Reset literal buffer.

- literal_buffer_.Reset();

+ current_.literal_chars = NULL;

}

// Literal buffer support

inline void StartLiteral() {

- literal_buffer_.StartLiteral();

+ LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?

+ &literal_buffer2_ : &literal_buffer1_;

+ free_buffer->Reset();

+ next_.literal_chars = free_buffer;

}

inline void AddLiteralChar(uc32 c) {

- literal_buffer_.AddChar(c);

+ ASSERT_NOT_NULL(next_.literal_chars);

+ next_.literal_chars->AddChar(c);

}

// Complete scanning of a literal.

inline void TerminateLiteral() {

- next_.literal_chars = literal_buffer_.EndLiteral();

+ // Does nothing in the current implementation.

}

// Stops scanning of a literal and drop the collected characters,

// e.g., due to an encountered error.

inline void DropLiteral() {

- literal_buffer_.DropLiteral();

+ next_.literal_chars = NULL;

}

inline void AddLiteralCharAdvance() {

@@ -348,15 +397,16 @@ class Scanner {

return source_->pos() - kCharacterLookaheadBufferSize;

}

+ // Buffers collecting literal strings, numbers, etc.

+ LiteralBuffer literal_buffer1_;

+ LiteralBuffer literal_buffer2_;

TokenDesc current_; // desc for current token (as returned by Next())

TokenDesc next_; // desc for next token (one token look-ahead)

// Input stream. Must be initialized to an UC16CharacterStream.

UC16CharacterStream* source_;

- // Buffer to hold literal values (identifiers, strings, numbers)

- // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.

- LiteralCollector literal_buffer_;

// One Unicode character look-ahead; c0_ < 0 at the end of the input.

uc32 c0_;

@@ -367,28 +417,14 @@ class Scanner {

class JavaScriptScanner : public Scanner {

public:

- // Bit vector representing set of types of literals.

- enum LiteralType {

- kNoLiterals = 0,

- kLiteralNumber = 1,

- kLiteralIdentifier = 2,

- kLiteralString = 4,

- kLiteralRegExp = 8,

- kLiteralRegExpFlags = 16,

- kAllLiterals = 31

- };

// A LiteralScope that disables recording of some types of JavaScript

// literals. If the scanner is configured to not record the specific

// type of literal, the scope will not call StartLiteral.

class LiteralScope {

public:

- LiteralScope(JavaScriptScanner* self, LiteralType type)

+ explicit LiteralScope(JavaScriptScanner* self)

: scanner_(self), complete_(false) {

- if (scanner_->RecordsLiteral(type)) {

- scanner_->StartLiteral();

- }

+ scanner_->StartLiteral();

}

~LiteralScope() {

if (!complete_) scanner_->DropLiteral();

@@ -430,11 +466,6 @@ class JavaScriptScanner : public Scanner {

// tokens, which is what it is used for.

void SeekForward(int pos);

- // Whether this scanner records the given literal type or not.

- bool RecordsLiteral(LiteralType type) {

- return (literal_flags_ & type) != 0;

- }

protected:

bool SkipWhiteSpace();

Token::Value SkipSingleLineComment();

@@ -458,7 +489,6 @@ class JavaScriptScanner : public Scanner {

// If the escape sequence cannot be decoded the result is kBadChar.

uc32 ScanIdentifierUnicodeEscape();

- int literal_flags_;

bool has_line_terminator_before_next_;

};

« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »