Index: src/parsing/scanner-character-streams.cc |
diff --git a/src/parsing/scanner-character-streams.cc b/src/parsing/scanner-character-streams.cc |
index 19c93fa3c885162c41b28deef3ac81a5f4ba9e8b..7cdef87c9c5cf429c59317c2f6176acc8e2aca6e 100644 |
--- a/src/parsing/scanner-character-streams.cc |
+++ b/src/parsing/scanner-character-streams.cc |
@@ -16,6 +16,33 @@ namespace internal { |
namespace { |
+size_t CopyUtf8CharsToUtf16Chars(uint16_t* dest, size_t length, const byte* src, |
+                                 size_t* src_pos, size_t src_length) { |
+  static const unibrow::uchar kMaxUtf16Character = |
+      unibrow::Utf16::kMaxNonSurrogateCharCode; |
+  size_t i = 0; |
+  // Decodes src from *src_pos (advancing it) into dest; returns the number of |
+  // UTF-16 units written. Stops once fewer than two slots remain, because the |
+  // next character may need a surrogate pair. Cannot wrap when length == 0. |
+  while (i + 1 < length) { |
+    if (*src_pos == src_length) break; |
+    unibrow::uchar c = src[*src_pos]; |
+    if (c <= unibrow::Utf8::kMaxOneByteChar) { |
+      *src_pos = *src_pos + 1; |
+    } else { |
+      c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
+                                        src_pos); |
+    } |
+    if (c > kMaxUtf16Character) { |
+      dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
+      dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
+    } else { |
+      dest[i++] = static_cast<uc16>(c); |
+    } |
+  } |
+  return i; |
+} |
+ |
size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
size_t* src_pos, size_t src_length, |
ScriptCompiler::StreamedSource::Encoding encoding) { |
@@ -24,8 +51,7 @@ size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, |
if (length == 0) return 0; |
if (encoding == ScriptCompiler::StreamedSource::UTF8) { |
- return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( |
- dest, length, src, src_pos, src_length); |
+ return CopyUtf8CharsToUtf16Chars(dest, length, src, src_pos, src_length); |
} |
size_t to_fill = length; |
@@ -175,163 +201,7 @@ size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { |
// ---------------------------------------------------------------------------- |
-// Utf8ToUtf16CharacterStream |
-Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, |
- size_t length) |
- : BufferedUtf16CharacterStream(), |
- raw_data_(data), |
- raw_data_length_(length), |
- raw_data_pos_(0), |
- raw_character_position_(0) { |
- ReadBlock(); |
-} |
- |
- |
-Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } |
- |
- |
-size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, |
- const byte* src, size_t* src_pos, |
- size_t src_length) { |
- static const unibrow::uchar kMaxUtf16Character = |
- unibrow::Utf16::kMaxNonSurrogateCharCode; |
- size_t i = 0; |
- // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer |
- // one character early (in the normal case), because we need to have at least |
- // two free spaces in the buffer to be sure that the next character will fit. |
- while (i < length - 1) { |
- if (*src_pos == src_length) break; |
- unibrow::uchar c = src[*src_pos]; |
- if (c <= unibrow::Utf8::kMaxOneByteChar) { |
- *src_pos = *src_pos + 1; |
- } else { |
- c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, |
- src_pos); |
- } |
- if (c > kMaxUtf16Character) { |
- dest[i++] = unibrow::Utf16::LeadSurrogate(c); |
- dest[i++] = unibrow::Utf16::TrailSurrogate(c); |
- } else { |
- dest[i++] = static_cast<uc16>(c); |
- } |
- } |
- return i; |
-} |
- |
- |
-size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { |
- size_t old_pos = pos_; |
- size_t target_pos = pos_ + delta; |
- SetRawPosition(target_pos); |
- pos_ = raw_character_position_; |
- ReadBlock(); |
- return pos_ - old_pos; |
-} |
- |
- |
-size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { |
- SetRawPosition(char_position); |
- if (raw_character_position_ != char_position) { |
- // char_position was not a valid position in the stream (hit the end |
- // while spooling to it). |
- return 0u; |
- } |
- size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
- raw_data_length_); |
- raw_character_position_ = char_position + i; |
- return i; |
-} |
- |
- |
-static const byte kUtf8MultiByteMask = 0xC0; |
-static const byte kUtf8MultiByteCharFollower = 0x80; |
- |
- |
-#ifdef DEBUG |
-static const byte kUtf8MultiByteCharStart = 0xC0; |
-static bool IsUtf8MultiCharacterStart(byte first_byte) { |
- return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; |
-} |
-#endif |
- |
- |
-static bool IsUtf8MultiCharacterFollower(byte later_byte) { |
- return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; |
-} |
- |
- |
-// Move the cursor back to point at the preceding UTF-8 character start |
-// in the buffer. |
-static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { |
- byte character = buffer[--*cursor]; |
- if (character > unibrow::Utf8::kMaxOneByteChar) { |
- DCHECK(IsUtf8MultiCharacterFollower(character)); |
- // Last byte of a multi-byte character encoding. Step backwards until |
- // pointing to the first byte of the encoding, recognized by having the |
- // top two bits set. |
- while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } |
- DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); |
- } |
-} |
- |
- |
-// Move the cursor forward to point at the next following UTF-8 character start |
-// in the buffer. |
-static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { |
- byte character = buffer[(*cursor)++]; |
- if (character > unibrow::Utf8::kMaxOneByteChar) { |
- // First character of a multi-byte character encoding. |
- // The number of most-significant one-bits determines the length of the |
- // encoding: |
- // 110..... - (0xCx, 0xDx) one additional byte (minimum). |
- // 1110.... - (0xEx) two additional bytes. |
- // 11110... - (0xFx) three additional bytes (maximum). |
- DCHECK(IsUtf8MultiCharacterStart(character)); |
- // Additional bytes is: |
- // 1 if value in range 0xC0 .. 0xDF. |
- // 2 if value in range 0xE0 .. 0xEF. |
- // 3 if value in range 0xF0 .. 0xF7. |
- // Encode that in a single value. |
- size_t additional_bytes = |
- ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; |
- *cursor += additional_bytes; |
- DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); |
- } |
-} |
- |
- |
-// This can't set a raw position between two surrogate pairs, since there |
-// is no position in the UTF8 stream that corresponds to that. This assumes |
-// that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If |
-// it is illegally coded as two 3 byte sequences then there is no problem here. |
-void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { |
- if (raw_character_position_ > target_position) { |
- // Spool backwards in utf8 buffer. |
- do { |
- size_t old_pos = raw_data_pos_; |
- Utf8CharacterBack(raw_data_, &raw_data_pos_); |
- raw_character_position_--; |
- DCHECK(old_pos - raw_data_pos_ <= 4); |
- // Step back over both code units for surrogate pairs. |
- if (old_pos - raw_data_pos_ == 4) raw_character_position_--; |
- } while (raw_character_position_ > target_position); |
- // No surrogate pair splitting. |
- DCHECK(raw_character_position_ == target_position); |
- return; |
- } |
- // Spool forwards in the utf8 buffer. |
- while (raw_character_position_ < target_position) { |
- if (raw_data_pos_ == raw_data_length_) return; |
- size_t old_pos = raw_data_pos_; |
- Utf8CharacterForward(raw_data_, &raw_data_pos_); |
- raw_character_position_++; |
- DCHECK(raw_data_pos_ - old_pos <= 4); |
- if (raw_data_pos_ - old_pos == 4) raw_character_position_++; |
- } |
- // No surrogate pair splitting. |
- DCHECK(raw_character_position_ == target_position); |
-} |
- |
+// ExternalStreamingStream |
size_t ExternalStreamingStream::FillBuffer(size_t position) { |
// Ignore "position" which is the position in the decoded data. Instead, |
@@ -599,6 +469,16 @@ ExternalOneByteStringUtf16CharacterStream:: |
pos_ = start_position; |
} |
+ExternalOneByteStringUtf16CharacterStream:: /* Builds a stream over a raw char buffer of `length` bytes. */ |
+ ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length) |
+ : raw_data_(reinterpret_cast<const uint8_t*>(data)), /* pointer is stored, bytes are not copied */ |
+ length_(length), |
+ bookmark_(kNoBookmark) {} /* starts with no bookmark recorded */ |
+ |
+ExternalOneByteStringUtf16CharacterStream:: /* Overload for C strings; delegates to the (data, length) constructor. */ |
+ ExternalOneByteStringUtf16CharacterStream(const char* data) |
+ : ExternalOneByteStringUtf16CharacterStream(data, strlen(data)) {} /* length comes from strlen, so data must be NUL-terminated */ |
+ |
bool ExternalOneByteStringUtf16CharacterStream::SetBookmark() { |
bookmark_ = pos_; |
return true; |