src/scanner.cc - Issue 5545006: Optimized scanner to avoid virtual calls for every character read.

Unified Diff: src/scanner.cc

Issue 5545006: Optimized scanner to avoid virtual calls for every character read. (Closed)

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/scanner.cc

diff --git a/src/scanner.cc b/src/scanner.cc

index d22ebc741fd482d19bcc7b098faf4616ff59dda9..f1dda9097d4b3c34cde8c3f1c20482ef3d037ca2 100755

--- a/src/scanner.cc

+++ b/src/scanner.cc

@@ -36,65 +36,241 @@ namespace v8 {

namespace internal {

// ----------------------------------------------------------------------------

-// UTF16Buffer

-// CharacterStreamUTF16Buffer

-CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()

- : pushback_buffer_(0), last_(0), stream_(NULL) { }

+// BufferedUC16CharacterStreams

+BufferedUC16CharacterStream::BufferedUC16CharacterStream()

+ : UC16CharacterStream(),

+ pushback_limit_(NULL) {

+ // Initialize buffer as being empty. First read will fill the buffer.

+ buffer_cursor_ = buffer_;

+ buffer_end_ = buffer_;

+BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }

-void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,

- unibrow::CharacterStream* input,

- int start_position,

- int end_position) {

- stream_ = input;

- if (start_position > 0) {

- SeekForward(start_position);

+void BufferedUC16CharacterStream::PushBack(uc16 character) {

+ if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {

+ // buffer_ is writable, buffer_cursor_ is const pointer.

+ buffer_[--buffer_cursor_ - buffer_] = character;

+ pos_--;

+ return;

}

- end_ = end_position != kNoEndPosition ? end_position : kMaxInt;

+ SlowPushBack(character);

}

-void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {

- pushback_buffer()->Add(last_);

- last_ = ch;

+void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {

+ // In pushback mode, the end of the buffer contains pushback,

+ // and the start of the buffer (from buffer start to pushback_limit_)

+ // contains valid data that comes just after the pushback.

+ // We NULL the pushback_limit_ if pushing all the way back to the

+ // start of the buffer.

+ if (pushback_limit_ == NULL) {

+ // Enter pushback mode.

+ pushback_limit_ = buffer_end_;

+ buffer_end_ = buffer_ + kBufferSize;

+ buffer_cursor_ = buffer_end_;

+ }

+ ASSERT(pushback_limit_ > buffer_);

+ ASSERT(pos_ > 0);

+ buffer_[--buffer_cursor_ - buffer_] = character;

+ if (buffer_cursor_ == buffer_) {

+ pushback_limit_ = NULL;

+ } else if (buffer_cursor_ < pushback_limit_) {

+ pushback_limit_ = buffer_cursor_;

+ }

pos_--;

}

-uc32 CharacterStreamUTF16Buffer::Advance() {

- ASSERT(end_ != kNoEndPosition);

- ASSERT(end_ >= 0);

- // NOTE: It is of importance to Persian / Farsi resources that we do

- // *not* strip format control characters in the scanner; see

- //

- // https://bugzilla.mozilla.org/show_bug.cgi?id=274152

- //

- // So, even though ECMA-262, section 7.1, page 11, dictates that we

- // must remove Unicode format-control characters, we do not. This is

- // in line with how IE and SpiderMonkey handles it.

- if (!pushback_buffer()->is_empty()) {

- pos_++;

- return last_ = pushback_buffer()->RemoveLast();

- } else if (stream_->has_more() && pos_ < end_) {

- pos_++;

- uc32 next = stream_->GetNext();

- return last_ = next;

- } else {

- // Note: currently the following increment is necessary to avoid a

- // test-parser problem!

- pos_++;

- return last_ = static_cast<uc32>(-1);

+bool BufferedUC16CharacterStream::ReadBlock() {

+ if (pushback_limit_ != NULL) {

+ buffer_cursor_ = buffer_;

+ buffer_end_ = pushback_limit_;

+ pushback_limit_ = NULL;

+ ASSERT(buffer_cursor_ != buffer_end_);

+ return true;

+ }

+ unsigned length = FillBuffer(pos_, kBufferSize);

+ buffer_cursor_ = buffer_;

+ buffer_end_ = buffer_ + length;

+ return length > 0;

+unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {

+ // Leave pushback mode (i.e., ignore that there might be valid data

+ // in the buffer before the pushback_limit_ point).

+ pushback_limit_ = NULL;

+ return BufferSeekForward(delta);

+// ----------------------------------------------------------------------------

+// GenericStringUC16CharacterStream

+GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(

+ Handle<String> data,

+ unsigned start_position,

+ unsigned end_position)

+ : string_(data),

+ length_(end_position) {

+ ASSERT(end_position >= start_position);

+ buffer_cursor_ = buffer_;

+ buffer_end_ = buffer_;

+ pos_ = start_position;

Erik Corry 2010/12/07 12:27:30 blanky blanky here and a few more places.

+GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }

+unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {

+ unsigned old_pos = pos_;

+ pos_ = Min(pos_ + delta, length_);

+ ReadBlock();

+ return pos_ - old_pos;

+unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,

+ unsigned length) {

+ if (from_pos >= length_) return 0;

+ if (from_pos + length > length_) {

+ length = length_ - from_pos;

+ }

+ String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);

+ return length;

+// ----------------------------------------------------------------------------

+// Utf8ToUC16CharacterStream

+Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,

+ unsigned length)

+ : BufferedUC16CharacterStream(),

+ raw_data_(data),

+ raw_data_length_(length),

+ raw_data_pos_(0),

+ raw_character_position_(0) {

+ ReadBlock();

+Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }

+unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {

+ unsigned old_pos = pos_;

+ unsigned target_pos = pos_ + delta;

+ SetRawPosition(target_pos);

+ pos_ = raw_character_position_;

+ ReadBlock();

+ return pos_ - old_pos;

+unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,

+ unsigned length) {

+ static const unibrow::uchar kMaxUC16Character = 0xffff;

+ SetRawPosition(char_position);

+ if (raw_character_position_ != char_position) {

+ // char_position was not a valid position in the stream (hit the end

+ // while spooling to it).

+ return 0u;

+ }

+ unsigned i = 0;

+ while (i < length) {

+ if (raw_data_pos_ == raw_data_length_) break;

+ unibrow::uchar c = raw_data_[raw_data_pos_];

+ if (c <= unibrow::Utf8::kMaxOneByteChar) {

+ raw_data_pos_++;

+ } else {

+ c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,

+ raw_data_length_ - raw_data_pos_,

+ &raw_data_pos_);

+ // Don't allow characters outside of the BMP.

+ if (c > kMaxUC16Character) {

+ c = unibrow::Utf8::kBadChar;

+ }

+ buffer_[i++] = static_cast<uc16>(c);

+ }

+ raw_character_position_ = char_position + i;

+ return i;

+// Move the cursor back to point at the preceding utf-8 character start

Erik Corry 2010/12/07 12:27:30 utf -> UTF here and in other places.

Lasse Reichstein 2010/12/07 14:05:54 Fixed. I hope.

+// in the buffer.

+static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {

+ byte character = buffer[--*cursor];

+ if ((character & 0x80u) != 0) {

Erik Corry 2010/12/07 12:27:30 if (character > Utf8::kMaxOneByteChar) is nicer be

Lasse Reichstein 2010/12/07 14:05:54 Done.

+ ASSERT((character & 0xC0) == 0x80);

Erik Corry 2010/12/07 12:27:30 (character & Utf8::kMultiByteEncodingMask) == Utf8

Lasse Reichstein 2010/12/07 14:05:54 Added functions, but not to unibrow::Utf8. I'll co

+ // Last byte of a multi-byte character encoding. Step backwards until

+ // pointing to the first byte of the encoding, recognized by having the

+ // top two bits set.

+ while (buffer[--*cursor] < 0xC0u) { }

Erik Corry 2010/12/07 12:27:30 kMultiByteEncodingFirstChar

Lasse Reichstein 2010/12/07 14:05:54 IsUtf8MultiCharacterFollower(buffer[--*cursor])

}

-void CharacterStreamUTF16Buffer::SeekForward(int pos) {

- pos_ = pos;

- ASSERT(pushback_buffer()->is_empty());

- stream_->Seek(pos);

+// Move the cursor forward to point at the next following utf-8 character start

+// in the buffer.

+static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {

+ byte character = buffer[(*cursor)++];

+ if ((character & 0x80u) != 0) {

+ // First character of a multi-byte character encoding.

+ // The number of most-significant one-bits determines the length of the

+ // encoding:

+ // 110..... - (0xCx, 0xDx) one additional byte (minimum).

+ // 1110.... - (0xEx) two additional bytes.

+ // 11110... - (0xFx) three additional bytes (maximum).

+ ASSERT((character & 0xC0) == 0xC0);

Erik Corry 2010/12/07 12:27:30 Named constants.

Lasse Reichstein 2010/12/07 14:05:54 Done.

+ // Additional bytes is:

+ // 1 if value in range 0xC0 .. 0xDF.

+ // 2 if value in range 0xE0 .. 0xEF.

+ // 3 if value in range 0xF0 .. 0xF7.

+ // Encode that in a single value

Erik Corry 2010/12/07 12:27:30 Missing full stop.

Lasse Reichstein 2010/12/07 14:05:54 Done.

+ unsigned additional_bytes =

+ ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;

Erik Corry 2010/12/07 12:27:30 Some places we check whether the unicode character

Lasse Reichstein 2010/12/07 14:05:54 It works with four-byte encodings as well. "additi

+ *cursor += additional_bytes;

+ }

+void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {

+ if (raw_character_position_ > target_position) {

+ // Spool backwards in utf8 buffer.

+ do {

+ Utf8CharacterBack(raw_data_, &raw_data_pos_);

+ raw_character_position_--;

+ } while (raw_character_position_ > target_position);

+ return;

+ }

+ // Spool forwards in the utf8 buffer.

+ while (raw_character_position_ < target_position) {

+ if (raw_data_pos_ == raw_data_length_) return;

+ Utf8CharacterForward(raw_data_, &raw_data_pos_);

+ raw_character_position_++;

+ }

}

+// ----------------------------------------------------------------------------

+// ExternalTwoByteStringUC16CharacterStream

+ExternalTwoByteStringUC16CharacterStream::

+ ~ExternalTwoByteStringUC16CharacterStream() { }

+ExternalTwoByteStringUC16CharacterStream

+ ::ExternalTwoByteStringUC16CharacterStream(

+ Handle<ExternalTwoByteString> data,

+ int start_position,

+ int end_position)

+ : UC16CharacterStream(),

+ source_(data),

+ raw_data_(data->GetTwoByteData(start_position)) {

+ buffer_cursor_ = raw_data_,

+ buffer_end_ = raw_data_ + (end_position - start_position);

+ pos_ = start_position;

// ----------------------------------------------------------------------------

// Scanner::LiteralScope

@@ -118,43 +294,15 @@ void Scanner::LiteralScope::Complete() {

// ----------------------------------------------------------------------------

// V8JavaScriptScanner

-void V8JavaScriptScanner::Initialize(Handle<String> source,

- int literal_flags) {

- source_ = stream_initializer_.Init(source, NULL, 0, source->length());

- // Need to capture identifiers in order to recognize "get" and "set"

- // in object literals.

- literal_flags_ = literal_flags | kLiteralIdentifier;

- Init();

- // Skip initial whitespace allowing HTML comment ends just like

- // after a newline and scan first token.

- has_line_terminator_before_next_ = true;

- SkipWhiteSpace();

- Scan();

-void V8JavaScriptScanner::Initialize(Handle<String> source,

- unibrow::CharacterStream* stream,

- int literal_flags) {

- source_ = stream_initializer_.Init(source, stream,

- 0, UTF16Buffer::kNoEndPosition);

- literal_flags_ = literal_flags | kLiteralIdentifier;

- Init();

- // Skip initial whitespace allowing HTML comment ends just like

- // after a newline and scan first token.

- has_line_terminator_before_next_ = true;

- SkipWhiteSpace();

- Scan();

+V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }

-void V8JavaScriptScanner::Initialize(Handle<String> source,

- int start_position,

- int end_position,

+void V8JavaScriptScanner::Initialize(UC16CharacterStream* source,

int literal_flags) {

- source_ = stream_initializer_.Init(source, NULL,

- start_position, end_position);

+ source_ = source;

literal_flags_ = literal_flags | kLiteralIdentifier;

+ // Need to capture identifiers in order to recognize "get" and "set"

+ // in object literals.

Init();

// Skip initial whitespace allowing HTML comment ends just like

// after a newline and scan first token.

@@ -164,48 +312,14 @@ void V8JavaScriptScanner::Initialize(Handle<String> source,

}

-UTF16Buffer* StreamInitializer::Init(Handle<String> source,

- unibrow::CharacterStream* stream,

- int start_position,

- int end_position) {

- // Either initialize the scanner from a character stream or from a

- // string.

- ASSERT(source.is_null() || stream == NULL);

- // Initialize the source buffer.

- if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {

- two_byte_string_buffer_.Initialize(

- Handle<ExternalTwoByteString>::cast(source),

- start_position,

- end_position);

- return &two_byte_string_buffer_;

- } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {

- ascii_string_buffer_.Initialize(

- Handle<ExternalAsciiString>::cast(source),

- start_position,

- end_position);

- return &ascii_string_buffer_;

- } else {

- if (!source.is_null()) {

- safe_string_input_buffer_.Reset(source.location());

- stream = &safe_string_input_buffer_;

- }

- char_stream_buffer_.Initialize(source,

- stream,

- start_position,

- end_position);

- return &char_stream_buffer_;

- }

// ----------------------------------------------------------------------------

// JsonScanner

-JsonScanner::JsonScanner() {}

+JsonScanner::JsonScanner() : Scanner() { }

-void JsonScanner::Initialize(Handle<String> source) {

- source_ = stream_initializer_.Init(source, NULL, 0, source->length());

+void JsonScanner::Initialize(UC16CharacterStream* source) {

+ source_ = source;

Init();

// Skip initial whitespace.

SkipJsonWhiteSpace();

« src/scanner.h ('K') | « src/scanner.h ('k') | src/scanner-base.h » ('j') | src/scanner-base.h » ('J')