| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/v8.h" | 5 #include "src/v8.h" |
| 6 | 6 |
| 7 #include "src/scanner-character-streams.h" | 7 #include "src/scanner-character-streams.h" |
| 8 | 8 |
| 9 #include "include/v8.h" | 9 #include "include/v8.h" |
| 10 #include "src/handles.h" | 10 #include "src/handles.h" |
| (...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 225 } | 225 } |
| 226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, | 226 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, |
| 227 raw_data_length_); | 227 raw_data_length_); |
| 228 raw_character_position_ = char_position + i; | 228 raw_character_position_ = char_position + i; |
| 229 return i; | 229 return i; |
| 230 } | 230 } |
| 231 | 231 |
| 232 | 232 |
| 233 static const byte kUtf8MultiByteMask = 0xC0; | 233 static const byte kUtf8MultiByteMask = 0xC0; |
| 234 static const byte kUtf8MultiByteCharFollower = 0x80; | 234 static const byte kUtf8MultiByteCharFollower = 0x80; |
| 235 static const byte kUtf8MultiByteCharStart = 0xC0; |
| 235 | 236 |
| 236 | 237 |
| 237 #ifdef DEBUG | |
| 238 static const byte kUtf8MultiByteCharStart = 0xC0; | |
| 239 static bool IsUtf8MultiCharacterStart(byte first_byte) { | 238 static bool IsUtf8MultiCharacterStart(byte first_byte) { |
| 240 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; | 239 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; |
| 241 } | 240 } |
| 242 #endif | |
| 243 | 241 |
| 244 | 242 |
| 245 static bool IsUtf8MultiCharacterFollower(byte later_byte) { | 243 static bool IsUtf8MultiCharacterFollower(byte later_byte) { |
| 246 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; | 244 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; |
| 247 } | 245 } |
| 248 | 246 |
| 249 | 247 |
| 250 // Move the cursor back to point at the preceding UTF-8 character start | 248 // Move the cursor back to point at the preceding UTF-8 character start |
| 251 // in the buffer. | 249 // in the buffer. |
| 252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { | 250 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { |
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 334 // here) and the internal parts which use unsigned. TODO(marja): make the | 332 // here) and the internal parts which use unsigned. TODO(marja): make the |
| 335 // internal parts use size_t too. | 333 // internal parts use size_t too. |
| 336 current_data_length_ = | 334 current_data_length_ = |
| 337 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_)); | 335 static_cast<unsigned>(source_stream_->GetMoreData(&current_data_)); |
| 338 current_data_offset_ = 0; | 336 current_data_offset_ = 0; |
| 339 bool data_ends = current_data_length_ == 0; | 337 bool data_ends = current_data_length_ == 0; |
| 340 | 338 |
| 341 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 | 339 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 |
| 342 // character (the rest of the bytes will be in the next chunk). | 340 // character (the rest of the bytes will be in the next chunk). |
| 343 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { | 341 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { |
| 342 if (first_chunk_) { |
| 343 // Get rid of the byte order mark (if any). |
| 344 if (current_data_length_ >= 3 && current_data_[0] == 0xef && |
| 345 current_data_[1] == 0xbb && current_data_[2] == 0xbf) { |
| 346 current_data_offset_ = 3; |
| 347 } |
| 348 } |
| 349 |
| 344 HandleUtf8SplitCharacters(&data_in_buffer); | 350 HandleUtf8SplitCharacters(&data_in_buffer); |
| 345 if (!data_ends && current_data_offset_ == current_data_length_) { | 351 if (!data_ends && current_data_offset_ == current_data_length_) { |
| 346 // The data stream didn't end, but we used all the data in the | 352 // The data stream didn't end, but we used all the data in the |
| 347 // chunk. This will only happen when the chunk was really small. We | 353 // chunk. This will only happen when the chunk was really small. We |
| 348 // don't handle the case where a UTF-8 character is split over several | 354 // don't handle the case where a UTF-8 character is split over several |
| 349 // chunks; in that case V8 won't crash, but it will be a parse error. | 355 // chunks; in that case V8 won't crash, but it will be a parse error. |
| 350 delete[] current_data_; | 356 delete[] current_data_; |
| 351 current_data_ = NULL; | 357 current_data_ = NULL; |
| 352 current_data_length_ = 0; | 358 current_data_length_ = 0; |
| 353 current_data_offset_ = 0; | 359 current_data_offset_ = 0; |
| 354 continue; // Request a new chunk. | 360 continue; // Request a new chunk. |
| 355 } | 361 } |
| 356 } | 362 } |
| 357 | 363 |
| 358 // Did the data stream end? | 364 // Did the data stream end? |
| 359 if (data_ends) { | 365 if (data_ends) { |
| 360 DCHECK(utf8_split_char_buffer_length_ == 0); | 366 DCHECK(utf8_split_char_buffer_length_ == 0); |
| 361 return data_in_buffer; | 367 return data_in_buffer; |
| 362 } | 368 } |
| 369 |
| 370 first_chunk_ = false; |
| 363 } | 371 } |
| 364 | 372 |
| 365 // Fill the buffer from current_data_. | 373 // Fill the buffer from current_data_. |
| 366 unsigned new_offset = 0; | 374 unsigned new_offset = 0; |
| 367 unsigned new_chars_in_buffer = | 375 unsigned new_chars_in_buffer = |
| 368 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, | 376 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, |
| 369 current_data_ + current_data_offset_, &new_offset, | 377 current_data_ + current_data_offset_, &new_offset, |
| 370 current_data_length_ - current_data_offset_, encoding_); | 378 current_data_length_ - current_data_offset_, encoding_); |
| 371 data_in_buffer += new_chars_in_buffer; | 379 data_in_buffer += new_chars_in_buffer; |
| 372 current_data_offset_ += new_offset; | 380 current_data_offset_ += new_offset; |
| (...skipping 16 matching lines...) Expand all Loading... |
| 389 // Given any byte, we can always read its local environment (in both | 397 // Given any byte, we can always read its local environment (in both |
| 390 // directions) to find out the (possibly multi-byte) character it belongs | 398 // directions) to find out the (possibly multi-byte) character it belongs |
| 391 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a | 399 // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a |
| 392 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or | 400 // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or |
| 393 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. | 401 // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. |
| 394 | 402 |
| 395 // First check if we have leftover data from the last chunk. | 403 // First check if we have leftover data from the last chunk. |
| 396 unibrow::uchar c; | 404 unibrow::uchar c; |
| 397 if (utf8_split_char_buffer_length_ > 0) { | 405 if (utf8_split_char_buffer_length_ > 0) { |
| 398 // Move the bytes which are part of the split character (which started in | 406 // Move the bytes which are part of the split character (which started in |
| 399 // the previous chunk) into utf8_split_char_buffer_. Note that the | 407 // the previous chunk) into utf8_split_char_buffer_. |
| 400 // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. | 408 while ( |
| 401 while (current_data_offset_ < current_data_length_ && | 409 current_data_offset_ < current_data_length_ && |
| 402 utf8_split_char_buffer_length_ < 4 && | 410 utf8_split_char_buffer_length_ < 4 && |
| 403 (c = current_data_[current_data_offset_]) >> 6 == 2) { | 411 IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) { |
| 404 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; | 412 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; |
| 405 ++utf8_split_char_buffer_length_; | 413 ++utf8_split_char_buffer_length_; |
| 406 ++current_data_offset_; | 414 ++current_data_offset_; |
| 407 } | 415 } |
| 408 | 416 |
| 409 // Convert the data in utf8_split_char_buffer_. | 417 // Convert the data in utf8_split_char_buffer_. |
| 410 unsigned new_offset = 0; | 418 unsigned new_offset = 0; |
| 411 unsigned new_chars_in_buffer = | 419 unsigned new_chars_in_buffer = |
| 412 CopyCharsHelper(buffer_ + *data_in_buffer, | 420 CopyCharsHelper(buffer_ + *data_in_buffer, |
| 413 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, | 421 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, |
| 414 &new_offset, utf8_split_char_buffer_length_, encoding_); | 422 &new_offset, utf8_split_char_buffer_length_, encoding_); |
| 415 *data_in_buffer += new_chars_in_buffer; | 423 *data_in_buffer += new_chars_in_buffer; |
| 416 // Make sure we used all the data. | 424 // Make sure we used all the data. |
| 417 DCHECK(new_offset == utf8_split_char_buffer_length_); | 425 DCHECK(new_offset == utf8_split_char_buffer_length_); |
| 418 DCHECK(*data_in_buffer <= kBufferSize); | 426 DCHECK(*data_in_buffer <= kBufferSize); |
| 419 | 427 |
| 420 utf8_split_char_buffer_length_ = 0; | 428 utf8_split_char_buffer_length_ = 0; |
| 421 } | 429 } |
| 422 | 430 |
| 423 // Move bytes which are part of an incomplete character from the end of the | 431 // Move bytes which are part of an incomplete character from the end of the |
| 424 // current chunk to utf8_split_char_buffer_. They will be converted when the | 432 // current chunk to utf8_split_char_buffer_. They will be converted when the |
| 425 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 | 433 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 |
| 426 // bytes long, but if the data is invalid, we can have character values bigger | 434 // bytes long, but if the data is invalid, we can have character values bigger |
| 427 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. | 435 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. |
| 428 while (current_data_length_ > current_data_offset_ && | 436 while (current_data_length_ > current_data_offset_ && |
| 429 (c = current_data_[current_data_length_ - 1]) > | 437 utf8_split_char_buffer_length_ < 4 && |
| 430 unibrow::Utf8::kMaxOneByteChar && | 438 (IsUtf8MultiCharacterFollower( |
| 431 utf8_split_char_buffer_length_ < 4) { | 439 c = current_data_[current_data_length_ - 1]) || |
| 440 IsUtf8MultiCharacterStart(c))) { |
| 432 --current_data_length_; | 441 --current_data_length_; |
| 433 ++utf8_split_char_buffer_length_; | 442 ++utf8_split_char_buffer_length_; |
| 434 if (c >= (3 << 6)) { | 443 if (IsUtf8MultiCharacterStart(c)) { |
| 435 // 3 << 6 = 0b11000000; this is the first byte of the multi-byte | 444 // This is the first byte of the multi-byte character. No need to copy the |
| 436 // character. No need to copy the previous characters into the conversion | 445 // previous characters into the conversion buffer (even if they're |
| 437 // buffer (even if they're multi-byte). | 446 // multi-byte). |
| 438 break; | 447 break; |
| 439 } | 448 } |
| 440 } | 449 } |
| 441 CHECK(utf8_split_char_buffer_length_ <= 4); | 450 CHECK(utf8_split_char_buffer_length_ <= 4); |
| 442 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { | 451 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { |
| 443 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; | 452 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; |
| 444 } | 453 } |
| 445 } | 454 } |
| 446 | 455 |
| 447 | 456 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 459 int end_position) | 468 int end_position) |
| 460 : Utf16CharacterStream(), | 469 : Utf16CharacterStream(), |
| 461 source_(data), | 470 source_(data), |
| 462 raw_data_(data->GetTwoByteData(start_position)) { | 471 raw_data_(data->GetTwoByteData(start_position)) { |
| 463 buffer_cursor_ = raw_data_, | 472 buffer_cursor_ = raw_data_, |
| 464 buffer_end_ = raw_data_ + (end_position - start_position); | 473 buffer_end_ = raw_data_ + (end_position - start_position); |
| 465 pos_ = start_position; | 474 pos_ = start_position; |
| 466 } | 475 } |
| 467 | 476 |
| 468 } } // namespace v8::internal | 477 } } // namespace v8::internal |
| OLD | NEW |