Index: src/scanner-character-streams.cc |
diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc |
index 31b4ee47c4835077998ee189c226411d70f5e0a3..d06f479f94bef5e4d6507d0018406955bdc80360 100644 |
--- a/src/scanner-character-streams.cc |
+++ b/src/scanner-character-streams.cc |
@@ -411,13 +411,17 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters( |
// Move bytes which are part of an incomplete character from the end of the |
// current chunk to utf8_split_char_buffer_. They will be converted when the |
- // next data chunk arrives. |
+ // next data chunk arrives. Note that a valid UTF-8 character is at most 4 |
+ // bytes long, but invalid data can contain more than 4 consecutive bytes |
+ // greater than unibrow::Utf8::kMaxOneByteChar, so we move at most 4 bytes. |
while (current_data_length_ > current_data_offset_ && |
(c = current_data_[current_data_length_ - 1]) > |
- unibrow::Utf8::kMaxOneByteChar) { |
+ unibrow::Utf8::kMaxOneByteChar && |
+ utf8_split_char_buffer_length_ < 4) { |
--current_data_length_; |
++utf8_split_char_buffer_length_; |
} |
+ CHECK(utf8_split_char_buffer_length_ <= 4); |
for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { |
utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
} |