 Chromium Code Reviews
 Chromium Code Reviews Issue 121173009:
  String:WriteUtf8: Add REPLACE_INVALID_UTF8 option  (Closed) 
  Base URL: git://github.com/v8/v8.git@master
    
  
    Issue 121173009:
  String:WriteUtf8: Add REPLACE_INVALID_UTF8 option  (Closed) 
  Base URL: git://github.com/v8/v8.git@master

| OLD | NEW | 
|---|---|
| 1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright | 
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. | 
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above | 
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following | 
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided | 
| (...skipping 4486 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 4497 int String::Utf8Length() const { | 4497 int String::Utf8Length() const { | 
| 4498 i::Handle<i::String> str = Utils::OpenHandle(this); | 4498 i::Handle<i::String> str = Utils::OpenHandle(this); | 
| 4499 i::Isolate* isolate = str->GetIsolate(); | 4499 i::Isolate* isolate = str->GetIsolate(); | 
| 4500 return v8::Utf8Length(*str, isolate); | 4500 return v8::Utf8Length(*str, isolate); | 
| 4501 } | 4501 } | 
| 4502 | 4502 | 
| 4503 | 4503 | 
| 4504 class Utf8WriterVisitor { | 4504 class Utf8WriterVisitor { | 
| 4505 public: | 4505 public: | 
| 4506 Utf8WriterVisitor( | 4506 Utf8WriterVisitor( | 
| 4507 char* buffer, int capacity, bool skip_capacity_check) | 4507 char* buffer, | 
| 4508 int capacity, | |
| 4509 bool skip_capacity_check, | |
| 4510 bool replace_invalid_utf8) | |
| 4508 : early_termination_(false), | 4511 : early_termination_(false), | 
| 4509 last_character_(unibrow::Utf16::kNoPreviousCharacter), | 4512 last_character_(unibrow::Utf16::kNoPreviousCharacter), | 
| 4510 buffer_(buffer), | 4513 buffer_(buffer), | 
| 4511 start_(buffer), | 4514 start_(buffer), | 
| 4512 capacity_(capacity), | 4515 capacity_(capacity), | 
| 4513 skip_capacity_check_(capacity == -1 || skip_capacity_check), | 4516 skip_capacity_check_(capacity == -1 || skip_capacity_check), | 
| 4517 replace_invalid_utf8_(replace_invalid_utf8), | |
| 4514 utf16_chars_read_(0) { | 4518 utf16_chars_read_(0) { | 
| 4515 } | 4519 } | 
| 4516 | 4520 | 
| 4517 static int WriteEndCharacter(uint16_t character, | 4521 static int WriteEndCharacter(uint16_t character, | 
| 4518 int last_character, | 4522 int last_character, | 
| 4519 int remaining, | 4523 int remaining, | 
| 4520 char* const buffer) { | 4524 char* const buffer, | 
| 4525 bool replace_invalid_utf8) { | |
| 4521 using namespace unibrow; | 4526 using namespace unibrow; | 
| 4522 ASSERT(remaining > 0); | 4527 ASSERT(remaining > 0); | 
| 4523 // We can't use a local buffer here because Encode needs to modify | 4528 // We can't use a local buffer here because Encode needs to modify | 
| 4524 // previous characters in the stream. We know, however, that | 4529 // previous characters in the stream. We know, however, that | 
| 4525 // exactly one character will be advanced. | 4530 // exactly one character will be advanced. | 
| 4526 if (Utf16::IsTrailSurrogate(character) && | 4531 if (Utf16::IsSurrogatePair(last_character, character)) { | 
| 4527 Utf16::IsLeadSurrogate(last_character)) { | 4532 int written = Utf8::Encode(buffer, | 
| 4528 int written = Utf8::Encode(buffer, character, last_character); | 4533 character, | 
| 4534 last_character, | |
| 4535 replace_invalid_utf8); | |
| 4529 ASSERT(written == 1); | 4536 ASSERT(written == 1); | 
| 4530 return written; | 4537 return written; | 
| 4531 } | 4538 } | 
| 4532 // Use a scratch buffer to check the required characters. | 4539 // Use a scratch buffer to check the required characters. | 
| 4533 char temp_buffer[Utf8::kMaxEncodedSize]; | 4540 char temp_buffer[Utf8::kMaxEncodedSize]; | 
| 4534 // Can't encode using last_character as gcc has array bounds issues. | 4541 // Can't encode using last_character as gcc has array bounds issues. | 
| 4535 int written = Utf8::Encode(temp_buffer, | 4542 int written = Utf8::Encode(temp_buffer, | 
| 4536 character, | 4543 character, | 
| 4537 Utf16::kNoPreviousCharacter); | 4544 Utf16::kNoPreviousCharacter, | 
| 4545 replace_invalid_utf8); | |
| 4538 // Won't fit. | 4546 // Won't fit. | 
| 4539 if (written > remaining) return 0; | 4547 if (written > remaining) return 0; | 
| 4540 // Copy over the character from temp_buffer. | 4548 // Copy over the character from temp_buffer. | 
| 4541 for (int j = 0; j < written; j++) { | 4549 for (int j = 0; j < written; j++) { | 
| 4542 buffer[j] = temp_buffer[j]; | 4550 buffer[j] = temp_buffer[j]; | 
| 4543 } | 4551 } | 
| 4544 return written; | 4552 return written; | 
| 4545 } | 4553 } | 
| 4546 | 4554 | 
| 4555 // Visit writes out a group of code units (chars) of a v8::String to the | |
| 4556 // internal buffer_. This is done in two phases. The first phase calculates a | |
| 4557 // pessimistic estimate (writable_length) on how many code units can be safely | |
| 4558 // written without exceeding the buffer capacity and without writing the last | |
| 4559 // code unit (it could be a lead surrogate). The estimated number of code | |
| 4560 // units is then written out in one go, and the reported byte usage is used | |
| 4561 // to correct the estimate. This is repeated until the estimate becomes <= 0 | |
| 4562 // or all code units have been written out. The second phase writes out code | |
| 4563 // units until the buffer capacity is reached, would be exceeded by the next | |
| 4564 // unit, or all units have been written out. | |
| 4565 // TODO(felixge) This function is rather complex and could benefit from | |
| 
dcarney
2014/01/17 09:10:12
drop the todo
 
haimuiba
2014/01/20 08:10:27
Done.
 | |
| 4566 // better variable naming and/or splitting up. | |
| 4547 template<typename Char> | 4567 template<typename Char> | 
| 4548 void Visit(const Char* chars, const int length) { | 4568 void Visit(const Char* chars, const int length) { | 
| 4549 using namespace unibrow; | 4569 using namespace unibrow; | 
| 4550 ASSERT(!early_termination_); | 4570 ASSERT(!early_termination_); | 
| 4551 if (length == 0) return; | 4571 if (length == 0) return; | 
| 4552 // Copy state to stack. | 4572 // Copy state to stack. | 
| 4553 char* buffer = buffer_; | 4573 char* buffer = buffer_; | 
| 4554 int last_character = | 4574 int last_character = | 
| 4555 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_; | 4575 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_; | 
| 4556 int i = 0; | 4576 int i = 0; | 
| (...skipping 17 matching lines...) Expand all Loading... | |
| 4574 // Write the characters to the stream. | 4594 // Write the characters to the stream. | 
| 4575 if (sizeof(Char) == 1) { | 4595 if (sizeof(Char) == 1) { | 
| 4576 for (; i < fast_length; i++) { | 4596 for (; i < fast_length; i++) { | 
| 4577 buffer += | 4597 buffer += | 
| 4578 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++)); | 4598 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++)); | 
| 4579 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); | 4599 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); | 
| 4580 } | 4600 } | 
| 4581 } else { | 4601 } else { | 
| 4582 for (; i < fast_length; i++) { | 4602 for (; i < fast_length; i++) { | 
| 4583 uint16_t character = *chars++; | 4603 uint16_t character = *chars++; | 
| 4584 buffer += Utf8::Encode(buffer, character, last_character); | 4604 buffer += Utf8::Encode(buffer, | 
| 4605 character, | |
| 4606 last_character, | |
| 4607 replace_invalid_utf8_); | |
| 4585 last_character = character; | 4608 last_character = character; | 
| 4586 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); | 4609 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); | 
| 4587 } | 4610 } | 
| 4588 } | 4611 } | 
| 4589 // Array is fully written. Exit. | 4612 // Array is fully written. Exit. | 
| 4590 if (fast_length == length) { | 4613 if (fast_length == length) { | 
| 4591 // Write state back out to object. | 4614 // Write state back out to object. | 
| 4592 last_character_ = last_character; | 4615 last_character_ = last_character; | 
| 4593 buffer_ = buffer; | 4616 buffer_ = buffer; | 
| 4594 utf16_chars_read_ += length; | 4617 utf16_chars_read_ += length; | 
| 4595 return; | 4618 return; | 
| 4596 } | 4619 } | 
| 4597 } | 4620 } | 
| 4598 ASSERT(!skip_capacity_check_); | 4621 ASSERT(!skip_capacity_check_); | 
| 4599 // Slow loop. Must check capacity on each iteration. | 4622 // Slow loop. Must check capacity on each iteration. | 
| 4600 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); | 4623 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); | 
| 4601 ASSERT(remaining_capacity >= 0); | 4624 ASSERT(remaining_capacity >= 0); | 
| 4602 for (; i < length && remaining_capacity > 0; i++) { | 4625 for (; i < length && remaining_capacity > 0; i++) { | 
| 4603 uint16_t character = *chars++; | 4626 uint16_t character = *chars++; | 
| 4627 if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) { | |
| 
dcarney
2014/01/17 09:10:12
this line is in the correct place, but it's only t
 
haimuiba
2014/01/20 08:10:27
Done.
 | |
| 4628 early_termination_ = true; | |
| 4629 break; | |
| 4630 } | |
| 4631 | |
| 
dcarney
2014/01/17 09:10:12
no space
 
haimuiba
2014/01/20 08:10:27
Done.
 | |
| 4604 int written = WriteEndCharacter(character, | 4632 int written = WriteEndCharacter(character, | 
| 4605 last_character, | 4633 last_character, | 
| 4606 remaining_capacity, | 4634 remaining_capacity, | 
| 4607 buffer); | 4635 buffer, | 
| 4636 replace_invalid_utf8_); | |
| 4608 if (written == 0) { | 4637 if (written == 0) { | 
| 4609 early_termination_ = true; | 4638 early_termination_ = true; | 
| 4610 break; | 4639 break; | 
| 4611 } | 4640 } | 
| 4612 buffer += written; | 4641 buffer += written; | 
| 4613 remaining_capacity -= written; | 4642 remaining_capacity -= written; | 
| 4614 last_character = character; | 4643 last_character = character; | 
| 4615 } | 4644 } | 
| 4616 // Write state back out to object. | 4645 // Write state back out to object. | 
| 4617 last_character_ = last_character; | 4646 last_character_ = last_character; | 
| (...skipping 27 matching lines...) Expand all Loading... | |
| 4645 return static_cast<int>(buffer_ - start_); | 4674 return static_cast<int>(buffer_ - start_); | 
| 4646 } | 4675 } | 
| 4647 | 4676 | 
| 4648 private: | 4677 private: | 
| 4649 bool early_termination_; | 4678 bool early_termination_; | 
| 4650 int last_character_; | 4679 int last_character_; | 
| 4651 char* buffer_; | 4680 char* buffer_; | 
| 4652 char* const start_; | 4681 char* const start_; | 
| 4653 int capacity_; | 4682 int capacity_; | 
| 4654 bool const skip_capacity_check_; | 4683 bool const skip_capacity_check_; | 
| 4684 bool const replace_invalid_utf8_; | |
| 4655 int utf16_chars_read_; | 4685 int utf16_chars_read_; | 
| 4656 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); | 4686 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); | 
| 4657 }; | 4687 }; | 
| 4658 | 4688 | 
| 4659 | 4689 | 
| 4660 static bool RecursivelySerializeToUtf8(i::String* current, | 4690 static bool RecursivelySerializeToUtf8(i::String* current, | 
| 4661 Utf8WriterVisitor* writer, | 4691 Utf8WriterVisitor* writer, | 
| 4662 int recursion_budget) { | 4692 int recursion_budget) { | 
| 4663 while (!writer->IsDone()) { | 4693 while (!writer->IsDone()) { | 
| 4664 i::ConsString* cons_string = i::String::VisitFlat(writer, current); | 4694 i::ConsString* cons_string = i::String::VisitFlat(writer, current); | 
| (...skipping 18 matching lines...) Expand all Loading... | |
| 4683 int options) const { | 4713 int options) const { | 
| 4684 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); | 4714 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); | 
| 4685 LOG_API(isolate, "String::WriteUtf8"); | 4715 LOG_API(isolate, "String::WriteUtf8"); | 
| 4686 ENTER_V8(isolate); | 4716 ENTER_V8(isolate); | 
| 4687 i::Handle<i::String> str = Utils::OpenHandle(this); | 4717 i::Handle<i::String> str = Utils::OpenHandle(this); | 
| 4688 if (options & HINT_MANY_WRITES_EXPECTED) { | 4718 if (options & HINT_MANY_WRITES_EXPECTED) { | 
| 4689 FlattenString(str); // Flatten the string for efficiency. | 4719 FlattenString(str); // Flatten the string for efficiency. | 
| 4690 } | 4720 } | 
| 4691 const int string_length = str->length(); | 4721 const int string_length = str->length(); | 
| 4692 bool write_null = !(options & NO_NULL_TERMINATION); | 4722 bool write_null = !(options & NO_NULL_TERMINATION); | 
| 4723 bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8); | |
| 4724 int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize; | |
| 4693 // First check if we can just write the string without checking capacity. | 4725 // First check if we can just write the string without checking capacity. | 
| 4694 if (capacity == -1 || capacity / 3 >= string_length) { | 4726 if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) { | 
| 4695 Utf8WriterVisitor writer(buffer, capacity, true); | 4727 Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8); | 
| 4696 const int kMaxRecursion = 100; | 4728 const int kMaxRecursion = 100; | 
| 4697 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); | 4729 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); | 
| 4698 if (success) return writer.CompleteWrite(write_null, nchars_ref); | 4730 if (success) return writer.CompleteWrite(write_null, nchars_ref); | 
| 4699 } else if (capacity >= string_length) { | 4731 } else if (capacity >= string_length) { | 
| 4700 // First check that the buffer is large enough. | 4732 // First check that the buffer is large enough. | 
| 4701 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); | 4733 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); | 
| 4702 if (utf8_bytes <= capacity) { | 4734 if (utf8_bytes <= capacity) { | 
| 4703 // ASCII fast path. | 4735 // ASCII fast path. | 
| 4704 if (utf8_bytes == string_length) { | 4736 if (utf8_bytes == string_length) { | 
| 4705 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); | 4737 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); | 
| 4706 if (nchars_ref != NULL) *nchars_ref = string_length; | 4738 if (nchars_ref != NULL) *nchars_ref = string_length; | 
| 4707 if (write_null && (utf8_bytes+1 <= capacity)) { | 4739 if (write_null && (utf8_bytes+1 <= capacity)) { | 
| 4708 return string_length + 1; | 4740 return string_length + 1; | 
| 4709 } | 4741 } | 
| 4710 return string_length; | 4742 return string_length; | 
| 4711 } | 4743 } | 
| 4712 if (write_null && (utf8_bytes+1 > capacity)) { | 4744 if (write_null && (utf8_bytes+1 > capacity)) { | 
| 4713 options |= NO_NULL_TERMINATION; | 4745 options |= NO_NULL_TERMINATION; | 
| 4714 } | 4746 } | 
| 4715 // Recurse once without a capacity limit. | 4747 // Recurse once without a capacity limit. | 
| 4716 // This will get into the first branch above. | 4748 // This will get into the first branch above. | 
| 4717 // TODO(dcarney) Check max left rec. in Utf8Length and fall through. | 4749 // TODO(dcarney) Check max left rec. in Utf8Length and fall through. | 
| 4718 return WriteUtf8(buffer, -1, nchars_ref, options); | 4750 return WriteUtf8(buffer, -1, nchars_ref, options); | 
| 4719 } | 4751 } | 
| 4720 } | 4752 } | 
| 4721 // Recursive slow path can potentially be unreasonably slow. Flatten. | 4753 // Recursive slow path can potentially be unreasonably slow. Flatten. | 
| 4722 str = FlattenGetString(str); | 4754 str = FlattenGetString(str); | 
| 4723 Utf8WriterVisitor writer(buffer, capacity, false); | 4755 Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8); | 
| 4724 i::String::VisitFlat(&writer, *str); | 4756 i::String::VisitFlat(&writer, *str); | 
| 4725 return writer.CompleteWrite(write_null, nchars_ref); | 4757 return writer.CompleteWrite(write_null, nchars_ref); | 
| 4726 } | 4758 } | 
| 4727 | 4759 | 
| 4728 | 4760 | 
| 4729 template<typename CharType> | 4761 template<typename CharType> | 
| 4730 static inline int WriteHelper(const String* string, | 4762 static inline int WriteHelper(const String* string, | 
| 4731 CharType* buffer, | 4763 CharType* buffer, | 
| 4732 int start, | 4764 int start, | 
| 4733 int length, | 4765 int length, | 
| (...skipping 2781 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 7515 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate()); | 7547 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate()); | 
| 7516 Address callback_address = | 7548 Address callback_address = | 
| 7517 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback)); | 7549 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback)); | 
| 7518 VMState<EXTERNAL> state(isolate); | 7550 VMState<EXTERNAL> state(isolate); | 
| 7519 ExternalCallbackScope call_scope(isolate, callback_address); | 7551 ExternalCallbackScope call_scope(isolate, callback_address); | 
| 7520 callback(info); | 7552 callback(info); | 
| 7521 } | 7553 } | 
| 7522 | 7554 | 
| 7523 | 7555 | 
| 7524 } } // namespace v8::internal | 7556 } } // namespace v8::internal | 
| OLD | NEW |