| OLD | NEW |
| (Empty) |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "src/scanner-character-streams.h" | |
| 6 | |
| 7 #include "include/v8.h" | |
| 8 #include "src/globals.h" | |
| 9 #include "src/handles.h" | |
| 10 #include "src/list-inl.h" // TODO(mstarzinger): Temporary cycle breaker! | |
| 11 #include "src/objects.h" | |
| 12 #include "src/unicode-inl.h" | |
| 13 | |
| 14 namespace v8 { | |
| 15 namespace internal { | |
| 16 | |
| 17 namespace { | |
| 18 | |
| 19 size_t CopyCharsHelper(uint16_t* dest, size_t length, const uint8_t* src, | |
| 20 size_t* src_pos, size_t src_length, | |
| 21 ScriptCompiler::StreamedSource::Encoding encoding) { | |
| 22 // It's possible that this will be called with length 0, but don't assume that | |
| 23 // the functions this calls handle it gracefully. | |
| 24 if (length == 0) return 0; | |
| 25 | |
| 26 if (encoding == ScriptCompiler::StreamedSource::UTF8) { | |
| 27 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( | |
| 28 dest, length, src, src_pos, src_length); | |
| 29 } | |
| 30 | |
| 31 size_t to_fill = length; | |
| 32 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; | |
| 33 | |
| 34 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { | |
| 35 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); | |
| 36 } else { | |
| 37 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); | |
| 38 v8::internal::CopyChars<uint16_t, uint16_t>( | |
| 39 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); | |
| 40 } | |
| 41 *src_pos += to_fill; | |
| 42 return to_fill; | |
| 43 } | |
| 44 | |
| 45 } // namespace | |
| 46 | |
| 47 | |
| 48 // ---------------------------------------------------------------------------- | |
| 49 // BufferedUtf16CharacterStreams | |
| 50 | |
| 51 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() | |
| 52 : Utf16CharacterStream(), | |
| 53 pushback_limit_(NULL) { | |
| 54 // Initialize buffer as being empty. First read will fill the buffer. | |
| 55 buffer_cursor_ = buffer_; | |
| 56 buffer_end_ = buffer_; | |
| 57 } | |
| 58 | |
| 59 | |
| 60 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } | |
| 61 | |
| 62 void BufferedUtf16CharacterStream::PushBack(uc32 character) { | |
| 63 if (character == kEndOfInput) { | |
| 64 pos_--; | |
| 65 return; | |
| 66 } | |
| 67 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { | |
| 68 // buffer_ is writable, buffer_cursor_ is const pointer. | |
| 69 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); | |
| 70 pos_--; | |
| 71 return; | |
| 72 } | |
| 73 SlowPushBack(static_cast<uc16>(character)); | |
| 74 } | |
| 75 | |
| 76 | |
| 77 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { | |
| 78 // In pushback mode, the end of the buffer contains pushback, | |
| 79 // and the start of the buffer (from buffer start to pushback_limit_) | |
| 80 // contains valid data that comes just after the pushback. | |
| 81 // We NULL the pushback_limit_ if pushing all the way back to the | |
| 82 // start of the buffer. | |
| 83 | |
| 84 if (pushback_limit_ == NULL) { | |
| 85 // Enter pushback mode. | |
| 86 pushback_limit_ = buffer_end_; | |
| 87 buffer_end_ = buffer_ + kBufferSize; | |
| 88 buffer_cursor_ = buffer_end_; | |
| 89 } | |
| 90 // Ensure that there is room for at least one pushback. | |
| 91 DCHECK(buffer_cursor_ > buffer_); | |
| 92 DCHECK(pos_ > 0); | |
| 93 buffer_[--buffer_cursor_ - buffer_] = character; | |
| 94 if (buffer_cursor_ == buffer_) { | |
| 95 pushback_limit_ = NULL; | |
| 96 } else if (buffer_cursor_ < pushback_limit_) { | |
| 97 pushback_limit_ = buffer_cursor_; | |
| 98 } | |
| 99 pos_--; | |
| 100 } | |
| 101 | |
| 102 | |
| 103 bool BufferedUtf16CharacterStream::ReadBlock() { | |
| 104 buffer_cursor_ = buffer_; | |
| 105 if (pushback_limit_ != NULL) { | |
| 106 // Leave pushback mode. | |
| 107 buffer_end_ = pushback_limit_; | |
| 108 pushback_limit_ = NULL; | |
| 109 // If there were any valid characters left at the | |
| 110 // start of the buffer, use those. | |
| 111 if (buffer_cursor_ < buffer_end_) return true; | |
| 112 // Otherwise read a new block. | |
| 113 } | |
| 114 size_t length = FillBuffer(pos_); | |
| 115 buffer_end_ = buffer_ + length; | |
| 116 return length > 0; | |
| 117 } | |
| 118 | |
| 119 | |
| 120 size_t BufferedUtf16CharacterStream::SlowSeekForward(size_t delta) { | |
| 121 // Leave pushback mode (i.e., ignore that there might be valid data | |
| 122 // in the buffer before the pushback_limit_ point). | |
| 123 pushback_limit_ = NULL; | |
| 124 return BufferSeekForward(delta); | |
| 125 } | |
| 126 | |
| 127 | |
| 128 // ---------------------------------------------------------------------------- | |
| 129 // GenericStringUtf16CharacterStream | |
| 130 | |
| 131 | |
| 132 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( | |
| 133 Handle<String> data, size_t start_position, size_t end_position) | |
| 134 : string_(data), length_(end_position), bookmark_(kNoBookmark) { | |
| 135 DCHECK(end_position >= start_position); | |
| 136 pos_ = start_position; | |
| 137 } | |
| 138 | |
| 139 | |
| 140 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } | |
| 141 | |
| 142 | |
| 143 bool GenericStringUtf16CharacterStream::SetBookmark() { | |
| 144 bookmark_ = pos_; | |
| 145 return true; | |
| 146 } | |
| 147 | |
| 148 | |
| 149 void GenericStringUtf16CharacterStream::ResetToBookmark() { | |
| 150 DCHECK(bookmark_ != kNoBookmark); | |
| 151 pos_ = bookmark_; | |
| 152 buffer_cursor_ = buffer_; | |
| 153 buffer_end_ = buffer_ + FillBuffer(pos_); | |
| 154 } | |
| 155 | |
| 156 | |
| 157 size_t GenericStringUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
| 158 size_t old_pos = pos_; | |
| 159 pos_ = Min(pos_ + delta, length_); | |
| 160 ReadBlock(); | |
| 161 return pos_ - old_pos; | |
| 162 } | |
| 163 | |
| 164 | |
| 165 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { | |
| 166 if (from_pos >= length_) return 0; | |
| 167 size_t length = kBufferSize; | |
| 168 if (from_pos + length > length_) { | |
| 169 length = length_ - from_pos; | |
| 170 } | |
| 171 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), | |
| 172 static_cast<int>(from_pos + length)); | |
| 173 return length; | |
| 174 } | |
| 175 | |
| 176 | |
| 177 // ---------------------------------------------------------------------------- | |
| 178 // Utf8ToUtf16CharacterStream | |
| 179 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, | |
| 180 size_t length) | |
| 181 : BufferedUtf16CharacterStream(), | |
| 182 raw_data_(data), | |
| 183 raw_data_length_(length), | |
| 184 raw_data_pos_(0), | |
| 185 raw_character_position_(0) { | |
| 186 ReadBlock(); | |
| 187 } | |
| 188 | |
| 189 | |
| 190 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } | |
| 191 | |
| 192 | |
| 193 size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length, | |
| 194 const byte* src, size_t* src_pos, | |
| 195 size_t src_length) { | |
| 196 static const unibrow::uchar kMaxUtf16Character = 0xffff; | |
| 197 size_t i = 0; | |
| 198 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer | |
| 199 // one character early (in the normal case), because we need to have at least | |
| 200 // two free spaces in the buffer to be sure that the next character will fit. | |
| 201 while (i < length - 1) { | |
| 202 if (*src_pos == src_length) break; | |
| 203 unibrow::uchar c = src[*src_pos]; | |
| 204 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
| 205 *src_pos = *src_pos + 1; | |
| 206 } else { | |
| 207 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, | |
| 208 src_pos); | |
| 209 } | |
| 210 if (c > kMaxUtf16Character) { | |
| 211 dest[i++] = unibrow::Utf16::LeadSurrogate(c); | |
| 212 dest[i++] = unibrow::Utf16::TrailSurrogate(c); | |
| 213 } else { | |
| 214 dest[i++] = static_cast<uc16>(c); | |
| 215 } | |
| 216 } | |
| 217 return i; | |
| 218 } | |
| 219 | |
| 220 | |
| 221 size_t Utf8ToUtf16CharacterStream::BufferSeekForward(size_t delta) { | |
| 222 size_t old_pos = pos_; | |
| 223 size_t target_pos = pos_ + delta; | |
| 224 SetRawPosition(target_pos); | |
| 225 pos_ = raw_character_position_; | |
| 226 ReadBlock(); | |
| 227 return pos_ - old_pos; | |
| 228 } | |
| 229 | |
| 230 | |
| 231 size_t Utf8ToUtf16CharacterStream::FillBuffer(size_t char_position) { | |
| 232 SetRawPosition(char_position); | |
| 233 if (raw_character_position_ != char_position) { | |
| 234 // char_position was not a valid position in the stream (hit the end | |
| 235 // while spooling to it). | |
| 236 return 0u; | |
| 237 } | |
| 238 size_t i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | |
| 239 raw_data_length_); | |
| 240 raw_character_position_ = char_position + i; | |
| 241 return i; | |
| 242 } | |
| 243 | |
| 244 | |
| 245 static const byte kUtf8MultiByteMask = 0xC0; | |
| 246 static const byte kUtf8MultiByteCharFollower = 0x80; | |
| 247 | |
| 248 | |
| 249 #ifdef DEBUG | |
| 250 static const byte kUtf8MultiByteCharStart = 0xC0; | |
| 251 static bool IsUtf8MultiCharacterStart(byte first_byte) { | |
| 252 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | |
| 253 } | |
| 254 #endif | |
| 255 | |
| 256 | |
| 257 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | |
| 258 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | |
| 259 } | |
| 260 | |
| 261 | |
| 262 // Move the cursor back to point at the preceding UTF-8 character start | |
| 263 // in the buffer. | |
| 264 static inline void Utf8CharacterBack(const byte* buffer, size_t* cursor) { | |
| 265 byte character = buffer[--*cursor]; | |
| 266 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
| 267 DCHECK(IsUtf8MultiCharacterFollower(character)); | |
| 268 // Last byte of a multi-byte character encoding. Step backwards until | |
| 269 // pointing to the first byte of the encoding, recognized by having the | |
| 270 // top two bits set. | |
| 271 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } | |
| 272 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); | |
| 273 } | |
| 274 } | |
| 275 | |
| 276 | |
| 277 // Move the cursor forward to point at the next following UTF-8 character start | |
| 278 // in the buffer. | |
| 279 static inline void Utf8CharacterForward(const byte* buffer, size_t* cursor) { | |
| 280 byte character = buffer[(*cursor)++]; | |
| 281 if (character > unibrow::Utf8::kMaxOneByteChar) { | |
| 282 // First character of a multi-byte character encoding. | |
| 283 // The number of most-significant one-bits determines the length of the | |
| 284 // encoding: | |
| 285 // 110..... - (0xCx, 0xDx) one additional byte (minimum). | |
| 286 // 1110.... - (0xEx) two additional bytes. | |
| 287 // 11110... - (0xFx) three additional bytes (maximum). | |
| 288 DCHECK(IsUtf8MultiCharacterStart(character)); | |
| 289 // Additional bytes is: | |
| 290 // 1 if value in range 0xC0 .. 0xDF. | |
| 291 // 2 if value in range 0xE0 .. 0xEF. | |
| 292 // 3 if value in range 0xF0 .. 0xF7. | |
| 293 // Encode that in a single value. | |
| 294 size_t additional_bytes = | |
| 295 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; | |
| 296 *cursor += additional_bytes; | |
| 297 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 | |
| 302 // This can't set a raw position between two surrogate pairs, since there | |
| 303 // is no position in the UTF8 stream that corresponds to that. This assumes | |
| 304 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If | |
| 305 // it is illegally coded as two 3 byte sequences then there is no problem here. | |
| 306 void Utf8ToUtf16CharacterStream::SetRawPosition(size_t target_position) { | |
| 307 if (raw_character_position_ > target_position) { | |
| 308 // Spool backwards in utf8 buffer. | |
| 309 do { | |
| 310 size_t old_pos = raw_data_pos_; | |
| 311 Utf8CharacterBack(raw_data_, &raw_data_pos_); | |
| 312 raw_character_position_--; | |
| 313 DCHECK(old_pos - raw_data_pos_ <= 4); | |
| 314 // Step back over both code units for surrogate pairs. | |
| 315 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; | |
| 316 } while (raw_character_position_ > target_position); | |
| 317 // No surrogate pair splitting. | |
| 318 DCHECK(raw_character_position_ == target_position); | |
| 319 return; | |
| 320 } | |
| 321 // Spool forwards in the utf8 buffer. | |
| 322 while (raw_character_position_ < target_position) { | |
| 323 if (raw_data_pos_ == raw_data_length_) return; | |
| 324 size_t old_pos = raw_data_pos_; | |
| 325 Utf8CharacterForward(raw_data_, &raw_data_pos_); | |
| 326 raw_character_position_++; | |
| 327 DCHECK(raw_data_pos_ - old_pos <= 4); | |
| 328 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; | |
| 329 } | |
| 330 // No surrogate pair splitting. | |
| 331 DCHECK(raw_character_position_ == target_position); | |
| 332 } | |
| 333 | |
| 334 | |
| 335 size_t ExternalStreamingStream::FillBuffer(size_t position) { | |
| 336 // Ignore "position" which is the position in the decoded data. Instead, | |
| 337 // ExternalStreamingStream keeps track of the position in the raw data. | |
| 338 size_t data_in_buffer = 0; | |
| 339 // Note that the UTF-8 decoder might not be able to fill the buffer | |
| 340 // completely; it will typically leave the last character empty (see | |
| 341 // Utf8ToUtf16CharacterStream::CopyChars). | |
| 342 while (data_in_buffer < kBufferSize - 1) { | |
| 343 if (current_data_ == NULL) { | |
| 344 // GetSomeData will wait until the embedder has enough data. Here's an | |
| 345 // interface between the API which uses size_t (which is the correct type | |
| 346 // here) and the internal parts which use size_t. | |
| 347 current_data_length_ = source_stream_->GetMoreData(¤t_data_); | |
| 348 current_data_offset_ = 0; | |
| 349 bool data_ends = current_data_length_ == 0; | |
| 350 bookmark_data_is_from_current_data_ = false; | |
| 351 | |
| 352 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | |
| 353 // character (the rest of the bytes will be in the next chunk). | |
| 354 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | |
| 355 HandleUtf8SplitCharacters(&data_in_buffer); | |
| 356 if (!data_ends && current_data_offset_ == current_data_length_) { | |
| 357 // The data stream didn't end, but we used all the data in the | |
| 358 // chunk. This will only happen when the chunk was really small. We | |
| 359 // don't handle the case where a UTF-8 character is split over several | |
| 360 // chunks; in that case V8 won't crash, but it will be a parse error. | |
| 361 FlushCurrent(); | |
| 362 continue; // Request a new chunk. | |
| 363 } | |
| 364 } | |
| 365 | |
| 366 // Did the data stream end? | |
| 367 if (data_ends) { | |
| 368 DCHECK(utf8_split_char_buffer_length_ == 0); | |
| 369 return data_in_buffer; | |
| 370 } | |
| 371 } | |
| 372 | |
| 373 // Fill the buffer from current_data_. | |
| 374 size_t new_offset = 0; | |
| 375 size_t new_chars_in_buffer = | |
| 376 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | |
| 377 current_data_ + current_data_offset_, &new_offset, | |
| 378 current_data_length_ - current_data_offset_, encoding_); | |
| 379 data_in_buffer += new_chars_in_buffer; | |
| 380 current_data_offset_ += new_offset; | |
| 381 DCHECK(data_in_buffer <= kBufferSize); | |
| 382 | |
| 383 // Did we use all the data in the data chunk? | |
| 384 if (current_data_offset_ == current_data_length_) { | |
| 385 FlushCurrent(); | |
| 386 } | |
| 387 } | |
| 388 return data_in_buffer; | |
| 389 } | |
| 390 | |
| 391 | |
| 392 bool ExternalStreamingStream::SetBookmark() { | |
| 393 // Bookmarking for this stream is a bit more complex than expected, since | |
| 394 // the stream state is distributed over several places: | |
| 395 // - pos_ (inherited from Utf16CharacterStream) | |
| 396 // - buffer_cursor_ and buffer_end_ (also from Utf16CharacterStream) | |
| 397 // - buffer_ (from BufferedUtf16CharacterStream) | |
| 398 // - current_data_ (+ .._offset_ and .._length) (this class) | |
| 399 // - utf8_split_char_buffer_* (a partial utf8 symbol at the block boundary) | |
| 400 // | |
| 401 // The underlying source_stream_ instance likely could re-construct this | |
| 402 // local data for us, but with the given interfaces we have no way of | |
| 403 // accomplishing this. Thus, we'll have to save all data locally. | |
| 404 // | |
| 405 // What gets saved where: | |
| 406 // - pos_ => bookmark_ | |
| 407 // - buffer_[buffer_cursor_ .. buffer_end_] => bookmark_buffer_ | |
| 408 // - current_data_[.._offset_ .. .._length_] => bookmark_data_ | |
| 409 // - utf8_split_char_buffer_* => bookmark_utf8_split... | |
| 410 // | |
| 411 // To make sure we don't unnecessarily copy data, we also maintain | |
| 412 // whether bookmark_data_ contains a copy of the current current_data_ | |
| 413 // block. This is done with: | |
| 414 // - bookmark_data_is_from_current_data_ | |
| 415 // - bookmark_data_offset_: offset into bookmark_data_ | |
| 416 // | |
| 417 // Note that bookmark_data_is_from_current_data_ must be maintained | |
| 418 // whenever current_data_ is updated. | |
| 419 | |
| 420 bookmark_ = pos_; | |
| 421 | |
| 422 size_t buffer_length = buffer_end_ - buffer_cursor_; | |
| 423 bookmark_buffer_.Dispose(); | |
| 424 bookmark_buffer_ = Vector<uint16_t>::New(static_cast<int>(buffer_length)); | |
| 425 CopyCharsUnsigned(bookmark_buffer_.start(), buffer_cursor_, buffer_length); | |
| 426 | |
| 427 size_t data_length = current_data_length_ - current_data_offset_; | |
| 428 size_t bookmark_data_length = static_cast<size_t>(bookmark_data_.length()); | |
| 429 if (bookmark_data_is_from_current_data_ && | |
| 430 data_length < bookmark_data_length) { | |
| 431 // Fast case: bookmark_data_ was previously copied from the current | |
| 432 // data block, and we have enough data for this bookmark. | |
| 433 bookmark_data_offset_ = bookmark_data_length - data_length; | |
| 434 } else { | |
| 435 // Slow case: We need to copy current_data_. | |
| 436 bookmark_data_.Dispose(); | |
| 437 bookmark_data_ = Vector<uint8_t>::New(static_cast<int>(data_length)); | |
| 438 CopyBytes(bookmark_data_.start(), current_data_ + current_data_offset_, | |
| 439 data_length); | |
| 440 bookmark_data_is_from_current_data_ = true; | |
| 441 bookmark_data_offset_ = 0; | |
| 442 } | |
| 443 | |
| 444 bookmark_utf8_split_char_buffer_length_ = utf8_split_char_buffer_length_; | |
| 445 for (size_t i = 0; i < utf8_split_char_buffer_length_; i++) { | |
| 446 bookmark_utf8_split_char_buffer_[i] = utf8_split_char_buffer_[i]; | |
| 447 } | |
| 448 | |
| 449 return source_stream_->SetBookmark(); | |
| 450 } | |
| 451 | |
| 452 | |
| 453 void ExternalStreamingStream::ResetToBookmark() { | |
| 454 source_stream_->ResetToBookmark(); | |
| 455 FlushCurrent(); | |
| 456 | |
| 457 pos_ = bookmark_; | |
| 458 | |
| 459 // bookmark_data_* => current_data_* | |
| 460 // (current_data_ assumes ownership of its memory.) | |
| 461 current_data_offset_ = 0; | |
| 462 current_data_length_ = bookmark_data_.length() - bookmark_data_offset_; | |
| 463 uint8_t* data = new uint8_t[current_data_length_]; | |
| 464 CopyCharsUnsigned(data, bookmark_data_.begin() + bookmark_data_offset_, | |
| 465 current_data_length_); | |
| 466 delete[] current_data_; | |
| 467 current_data_ = data; | |
| 468 bookmark_data_is_from_current_data_ = true; | |
| 469 | |
| 470 // bookmark_buffer_ needs to be copied to buffer_. | |
| 471 CopyCharsUnsigned(buffer_, bookmark_buffer_.begin(), | |
| 472 bookmark_buffer_.length()); | |
| 473 buffer_cursor_ = buffer_; | |
| 474 buffer_end_ = buffer_ + bookmark_buffer_.length(); | |
| 475 | |
| 476 // utf8 split char buffer | |
| 477 utf8_split_char_buffer_length_ = bookmark_utf8_split_char_buffer_length_; | |
| 478 for (size_t i = 0; i < bookmark_utf8_split_char_buffer_length_; i++) { | |
| 479 utf8_split_char_buffer_[i] = bookmark_utf8_split_char_buffer_[i]; | |
| 480 } | |
| 481 } | |
| 482 | |
| 483 | |
| 484 void ExternalStreamingStream::FlushCurrent() { | |
| 485 delete[] current_data_; | |
| 486 current_data_ = NULL; | |
| 487 current_data_length_ = 0; | |
| 488 current_data_offset_ = 0; | |
| 489 bookmark_data_is_from_current_data_ = false; | |
| 490 } | |
| 491 | |
| 492 | |
| 493 void ExternalStreamingStream::HandleUtf8SplitCharacters( | |
| 494 size_t* data_in_buffer) { | |
| 495 // Note the following property of UTF-8 which makes this function possible: | |
| 496 // Given any byte, we can always read its local environment (in both | |
| 497 // directions) to find out the (possibly multi-byte) character it belongs | |
| 498 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | |
| 499 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | |
| 500 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | |
| 501 | |
| 502 // First check if we have leftover data from the last chunk. | |
| 503 unibrow::uchar c; | |
| 504 if (utf8_split_char_buffer_length_ > 0) { | |
| 505 // Move the bytes which are part of the split character (which started in | |
| 506 // the previous chunk) into utf8_split_char_buffer_. Note that the | |
| 507 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | |
| 508 while (current_data_offset_ < current_data_length_ && | |
| 509 utf8_split_char_buffer_length_ < 4 && | |
| 510 (c = current_data_[current_data_offset_]) >> 6 == 2) { | |
| 511 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | |
| 512 ++utf8_split_char_buffer_length_; | |
| 513 ++current_data_offset_; | |
| 514 } | |
| 515 | |
| 516 // Convert the data in utf8_split_char_buffer_. | |
| 517 size_t new_offset = 0; | |
| 518 size_t new_chars_in_buffer = | |
| 519 CopyCharsHelper(buffer_ + *data_in_buffer, | |
| 520 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | |
| 521 &new_offset, utf8_split_char_buffer_length_, encoding_); | |
| 522 *data_in_buffer += new_chars_in_buffer; | |
| 523 // Make sure we used all the data. | |
| 524 DCHECK(new_offset == utf8_split_char_buffer_length_); | |
| 525 DCHECK(*data_in_buffer <= kBufferSize); | |
| 526 | |
| 527 utf8_split_char_buffer_length_ = 0; | |
| 528 } | |
| 529 | |
| 530 // Move bytes which are part of an incomplete character from the end of the | |
| 531 // current chunk to utf8_split_char_buffer_. They will be converted when the | |
| 532 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 | |
| 533 // bytes long, but if the data is invalid, we can have character values bigger | |
| 534 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. | |
| 535 while (current_data_length_ > current_data_offset_ && | |
| 536 (c = current_data_[current_data_length_ - 1]) > | |
| 537 unibrow::Utf8::kMaxOneByteChar && | |
| 538 utf8_split_char_buffer_length_ < 4) { | |
| 539 --current_data_length_; | |
| 540 ++utf8_split_char_buffer_length_; | |
| 541 if (c >= (3 << 6)) { | |
| 542 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | |
| 543 // character. No need to copy the previous characters into the conversion | |
| 544 // buffer (even if they're multi-byte). | |
| 545 break; | |
| 546 } | |
| 547 } | |
| 548 CHECK(utf8_split_char_buffer_length_ <= 4); | |
| 549 for (size_t i = 0; i < utf8_split_char_buffer_length_; ++i) { | |
| 550 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | |
| 551 } | |
| 552 } | |
| 553 | |
| 554 | |
| 555 // ---------------------------------------------------------------------------- | |
| 556 // ExternalTwoByteStringUtf16CharacterStream | |
| 557 | |
| 558 ExternalTwoByteStringUtf16CharacterStream:: | |
| 559 ~ExternalTwoByteStringUtf16CharacterStream() { } | |
| 560 | |
| 561 | |
| 562 ExternalTwoByteStringUtf16CharacterStream:: | |
| 563 ExternalTwoByteStringUtf16CharacterStream( | |
| 564 Handle<ExternalTwoByteString> data, int start_position, | |
| 565 int end_position) | |
| 566 : Utf16CharacterStream(), | |
| 567 source_(data), | |
| 568 raw_data_(data->GetTwoByteData(start_position)), | |
| 569 bookmark_(kNoBookmark) { | |
| 570 buffer_cursor_ = raw_data_, | |
| 571 buffer_end_ = raw_data_ + (end_position - start_position); | |
| 572 pos_ = start_position; | |
| 573 } | |
| 574 | |
| 575 | |
| 576 bool ExternalTwoByteStringUtf16CharacterStream::SetBookmark() { | |
| 577 bookmark_ = pos_; | |
| 578 return true; | |
| 579 } | |
| 580 | |
| 581 | |
| 582 void ExternalTwoByteStringUtf16CharacterStream::ResetToBookmark() { | |
| 583 DCHECK(bookmark_ != kNoBookmark); | |
| 584 pos_ = bookmark_; | |
| 585 buffer_cursor_ = raw_data_ + bookmark_; | |
| 586 } | |
| 587 } // namespace internal | |
| 588 } // namespace v8 | |
| OLD | NEW |