Index: src/scanner-character-streams.cc |
diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc |
index 732b2b43f6469ee8ce9dcb9e0373678cbf62cdce..50c3955c1bdc888200f593cad0f45ec69900ae18 100644 |
--- a/src/scanner-character-streams.cc |
+++ b/src/scanner-character-streams.cc |
@@ -18,6 +18,10 @@ namespace { |
unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, |
unsigned* src_pos, unsigned src_length, |
ScriptCompiler::StreamedSource::Encoding encoding) { |
+ // It's possible that this will be called with length 0, but don't assume that |
+ // the functions this calls handle it gracefully. |
+ if (length == 0) return 0; |
+ |
if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
dest, length, src, src_pos, src_length); |
@@ -381,15 +385,22 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) { |
void ExternalStreamingStream::HandleUtf8SplitCharacters( |
unsigned* data_in_buffer) { |
+ // Note the following property of UTF-8 which makes this function possible: |
+ // Given any byte, we can always read its local environment (in both |
+ // directions) to find out the (possibly multi-byte) character it belongs |
+ // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a |
+ // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or |
+ // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. |
+ |
// First check if we have leftover data from the last chunk. |
unibrow::uchar c; |
if (utf8_split_char_buffer_length_ > 0) { |
// Move the bytes which are part of the split character (which started in |
- // the previous chunk) into utf8_split_char_buffer_. |
+ // the previous chunk) into utf8_split_char_buffer_. Note that the |
+ // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. |
while (current_data_offset_ < current_data_length_ && |
utf8_split_char_buffer_length_ < 4 && |
- (c = current_data_[current_data_offset_]) > |
- unibrow::Utf8::kMaxOneByteChar) { |
+ (c = current_data_[current_data_offset_]) >> 6 == 2) { |
utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
++utf8_split_char_buffer_length_; |
++current_data_offset_; |