| Index: src/scanner-character-streams.cc
|
| diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc
|
| index 50c3955c1bdc888200f593cad0f45ec69900ae18..0dec5daed4aa321a458c69041fe82def9fa941f1 100644
|
| --- a/src/scanner-character-streams.cc
|
| +++ b/src/scanner-character-streams.cc
|
| @@ -232,14 +232,12 @@ unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
|
|
|
| static const byte kUtf8MultiByteMask = 0xC0;  // Selects the two most significant bits of a byte.
|
| static const byte kUtf8MultiByteCharFollower = 0x80;  // Masked value of a continuation byte (0b10XXXXXX).
|
| +static const byte kUtf8MultiByteCharStart = 0xC0;  // Masked value of the first byte of a multi-byte character (0b11XXXXXX).
|
|
|
|
|
| -#ifdef DEBUG
|
| -static const byte kUtf8MultiByteCharStart = 0xC0;
|
| static bool IsUtf8MultiCharacterStart(byte first_byte) {  // True when first_byte has both of its top two bits set (0b11XXXXXX).
|
| return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;  // Mask and expected value are both 0xC0.
|
| }
|
| -#endif
|
|
|
|
|
| static bool IsUtf8MultiCharacterFollower(byte later_byte) {
|
| @@ -341,6 +339,14 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
|
| // A caveat: a data chunk might end with bytes from an incomplete UTF-8
|
| // character (the rest of the bytes will be in the next chunk).
|
| if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
|
| + if (first_chunk_) {
|
| + // Get rid of the byte order mark (if any).
|
| + if (current_data_length_ >= 3 && current_data_[0] == 0xef &&
|
| + current_data_[1] == 0xbb && current_data_[2] == 0xbf) {
|
| + current_data_offset_ = 3;
|
| + }
|
| + }
|
| +
|
| HandleUtf8SplitCharacters(&data_in_buffer);
|
| if (!data_ends && current_data_offset_ == current_data_length_) {
|
| // The data stream didn't end, but we used all the data in the
|
| @@ -360,6 +366,8 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
|
| DCHECK(utf8_split_char_buffer_length_ == 0);
|
| return data_in_buffer;
|
| }
|
| +
|
| + first_chunk_ = false;
|
| }
|
|
|
| // Fill the buffer from current_data_.
|
| @@ -396,11 +404,11 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
|
| unibrow::uchar c;
|
| if (utf8_split_char_buffer_length_ > 0) {
|
| // Move the bytes which are part of the split character (which started in
|
| - // the previous chunk) into utf8_split_char_buffer_. Note that the
|
| - // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
|
| - while (current_data_offset_ < current_data_length_ &&
|
| - utf8_split_char_buffer_length_ < 4 &&
|
| - (c = current_data_[current_data_offset_]) >> 6 == 2) {
|
| + // the previous chunk) into utf8_split_char_buffer_.
|
| + while (
|
| + current_data_offset_ < current_data_length_ &&
|
| + utf8_split_char_buffer_length_ < 4 &&
|
| + IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {
|
| utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
|
| ++utf8_split_char_buffer_length_;
|
| ++current_data_offset_;
|
| @@ -426,15 +434,16 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
|
| // bytes long, but if the data is invalid, we can have character values bigger
|
| // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
|
| while (current_data_length_ > current_data_offset_ &&
|
| - (c = current_data_[current_data_length_ - 1]) >
|
| - unibrow::Utf8::kMaxOneByteChar &&
|
| - utf8_split_char_buffer_length_ < 4) {
|
| + utf8_split_char_buffer_length_ < 4 &&
|
| + (IsUtf8MultiCharacterFollower(
|
| + c = current_data_[current_data_length_ - 1]) ||
|
| + IsUtf8MultiCharacterStart(c))) {
|
| --current_data_length_;
|
| ++utf8_split_char_buffer_length_;
|
| - if (c >= (3 << 6)) {
|
| - // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
|
| - // character. No need to copy the previous characters into the conversion
|
| - // buffer (even if they're multi-byte).
|
| + if (IsUtf8MultiCharacterStart(c)) {
|
| + // This is the first byte of the multi-byte character. No need to copy the
|
| + // previous characters into the conversion buffer (even if they're
|
| + // multi-byte).
|
| break;
|
| }
|
| }
|
|
|