src/scanner-character-streams.cc - Issue 708823002: Streaming API: detect UTF-8 BOM.

Unified Diff: src/scanner-character-streams.cc

Issue 708823002: Streaming API: detect UTF-8 BOM. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: . Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/scanner-character-streams.cc

diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc

index 50c3955c1bdc888200f593cad0f45ec69900ae18..0dec5daed4aa321a458c69041fe82def9fa941f1 100644

--- a/src/scanner-character-streams.cc

+++ b/src/scanner-character-streams.cc

@@ -232,14 +232,12 @@ unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {

static const byte kUtf8MultiByteMask = 0xC0;

static const byte kUtf8MultiByteCharFollower = 0x80;

+static const byte kUtf8MultiByteCharStart = 0xC0;

-#ifdef DEBUG

-static const byte kUtf8MultiByteCharStart = 0xC0;

// Tells whether |first_byte| is the lead byte of a multi-byte UTF-8 sequence.
// The byte is masked with kUtf8MultiByteMask and the result is compared with
// kUtf8MultiByteCharStart, so only bytes whose two most significant bits are
// both set (0b11XXXXXX) are accepted.
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  const int high_bits = first_byte & kUtf8MultiByteMask;
  return high_bits == kUtf8MultiByteCharStart;
}

-#endif

static bool IsUtf8MultiCharacterFollower(byte later_byte) {

@@ -341,6 +339,14 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {

// A caveat: a data chunk might end with bytes from an incomplete UTF-8

// character (the rest of the bytes will be in the next chunk).

if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {

+ if (first_chunk_) {

+ // Get rid of the byte order mark (if any).

+ if (current_data_length_ >= 3 && current_data_[0] == 0xef &&

+ current_data_[1] == 0xbb && current_data_[2] == 0xbf) {

+ current_data_offset_ = 3;

+ }

HandleUtf8SplitCharacters(&data_in_buffer);

if (!data_ends && current_data_offset_ == current_data_length_) {

// The data stream didn't end, but we used all the data in the

@@ -360,6 +366,8 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {

DCHECK(utf8_split_char_buffer_length_ == 0);

return data_in_buffer;

}

+ first_chunk_ = false;

}

// Fill the buffer from current_data_.

@@ -396,11 +404,11 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(

unibrow::uchar c;

if (utf8_split_char_buffer_length_ > 0) {

// Move the bytes which are part of the split character (which started in

- // the previous chunk) into utf8_split_char_buffer_. Note that the

- // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.

- while (current_data_offset_ < current_data_length_ &&

- utf8_split_char_buffer_length_ < 4 &&

- (c = current_data_[current_data_offset_]) >> 6 == 2) {

+ // the previous chunk) into utf8_split_char_buffer_.

+ while (

+ current_data_offset_ < current_data_length_ &&

+ utf8_split_char_buffer_length_ < 4 &&

+ IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {

utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;

++utf8_split_char_buffer_length_;

++current_data_offset_;

@@ -426,15 +434,16 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(

// bytes long, but if the data is invalid, we can have character values bigger

// than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.

while (current_data_length_ > current_data_offset_ &&

- (c = current_data_[current_data_length_ - 1]) >

- unibrow::Utf8::kMaxOneByteChar &&

- utf8_split_char_buffer_length_ < 4) {

+ utf8_split_char_buffer_length_ < 4 &&

+ (IsUtf8MultiCharacterFollower(

+ c = current_data_[current_data_length_ - 1]) ||

+ IsUtf8MultiCharacterStart(c))) {

--current_data_length_;

++utf8_split_char_buffer_length_;

- if (c >= (3 << 6)) {

- // 3 << 6 = 0b11000000; this is the first byte of the multi-byte

- // character. No need to copy the previous characters into the conversion

- // buffer (even if they're multi-byte).

+ if (IsUtf8MultiCharacterStart(c)) {

+ // This is the first byte of the multi-byte character. No need to copy the

+ // previous characters into the conversion buffer (even if they're

+ // multi-byte).

break;

}

« no previous file with comments | « src/scanner-character-streams.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »