Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(311)

Side by Side Diff: src/api.cc

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: Fix mistake in test case, finish patch Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | src/unicode-inl.h » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 4486 matching lines...) Expand 10 before | Expand all | Expand 10 after
4497 int String::Utf8Length() const { 4497 int String::Utf8Length() const {
4498 i::Handle<i::String> str = Utils::OpenHandle(this); 4498 i::Handle<i::String> str = Utils::OpenHandle(this);
4499 i::Isolate* isolate = str->GetIsolate(); 4499 i::Isolate* isolate = str->GetIsolate();
4500 return v8::Utf8Length(*str, isolate); 4500 return v8::Utf8Length(*str, isolate);
4501 } 4501 }
4502 4502
4503 4503
4504 class Utf8WriterVisitor { 4504 class Utf8WriterVisitor {
4505 public: 4505 public:
4506 Utf8WriterVisitor( 4506 Utf8WriterVisitor(
4507 char* buffer, int capacity, bool skip_capacity_check) 4507 char* buffer,
4508 int capacity,
4509 bool skip_capacity_check,
4510 bool replace_invalid_utf8)
4508 : early_termination_(false), 4511 : early_termination_(false),
4509 last_character_(unibrow::Utf16::kNoPreviousCharacter), 4512 last_character_(unibrow::Utf16::kNoPreviousCharacter),
4510 buffer_(buffer), 4513 buffer_(buffer),
4511 start_(buffer), 4514 start_(buffer),
4512 capacity_(capacity), 4515 capacity_(capacity),
4513 skip_capacity_check_(capacity == -1 || skip_capacity_check), 4516 skip_capacity_check_(capacity == -1 || skip_capacity_check),
4517 replace_invalid_utf8_(replace_invalid_utf8),
4514 utf16_chars_read_(0) { 4518 utf16_chars_read_(0) {
4515 } 4519 }
4516 4520
4517 static int WriteEndCharacter(uint16_t character, 4521 static int WriteEndCharacter(uint16_t character,
4518 int last_character, 4522 int last_character,
4519 int remaining, 4523 int remaining,
4520 char* const buffer) { 4524 char* const buffer,
4525 bool replace_invalid_utf8) {
4521 using namespace unibrow; 4526 using namespace unibrow;
4522 ASSERT(remaining > 0); 4527 ASSERT(remaining > 0);
4523 // We can't use a local buffer here because Encode needs to modify 4528 // We can't use a local buffer here because Encode needs to modify
4524 // previous characters in the stream. We know, however, that 4529 // previous characters in the stream. We know, however, that
4525 // exactly one character will be advanced. 4530 // exactly one character will be advanced.
4526 if (Utf16::IsTrailSurrogate(character) && 4531 if (Utf16::IsSurrogatePair(last_character, character)) {
4527 Utf16::IsLeadSurrogate(last_character)) { 4532 int written = Utf8::Encode(buffer,
4528 int written = Utf8::Encode(buffer, character, last_character); 4533 character,
4534 last_character,
4535 replace_invalid_utf8);
4529 ASSERT(written == 1); 4536 ASSERT(written == 1);
4530 return written; 4537 return written;
4531 } 4538 }
4532 // Use a scratch buffer to check the required characters. 4539 // Use a scratch buffer to check the required characters.
4533 char temp_buffer[Utf8::kMaxEncodedSize]; 4540 char temp_buffer[Utf8::kMaxEncodedSize];
4534 // Can't encode using last_character as gcc has array bounds issues. 4541 // Can't encode using last_character as gcc has array bounds issues.
4535 int written = Utf8::Encode(temp_buffer, 4542 int written = Utf8::Encode(temp_buffer,
4536 character, 4543 character,
4537 Utf16::kNoPreviousCharacter); 4544 Utf16::kNoPreviousCharacter,
4545 replace_invalid_utf8);
4538 // Won't fit. 4546 // Won't fit.
4539 if (written > remaining) return 0; 4547 if (written > remaining) return 0;
4540 // Copy over the character from temp_buffer. 4548 // Copy over the character from temp_buffer.
4541 for (int j = 0; j < written; j++) { 4549 for (int j = 0; j < written; j++) {
4542 buffer[j] = temp_buffer[j]; 4550 buffer[j] = temp_buffer[j];
4543 } 4551 }
4544 return written; 4552 return written;
4545 } 4553 }
4546 4554
4555 // Visit writes out a group of code units (chars) of a v8::String to the
4556 // internal buffer_. This is done in two phases. The first phase calculates a
4557 // pessimistic estimate (writable_length) on how many code units can be safely
4558 // written without exceeding the buffer capacity and without writing the last
4559 // code unit (it could be a lead surrogate). The estimated number of code
4560 // units is then written out in one go, and the reported byte usage is used
4561 // to correct the estimate. This is repeated until the estimate becomes <= 0
4562 // or all code units have been written out. The second phase writes out code
4563 // units until the buffer capacity is reached, would be exceeded by the next
4564 // unit, or all units have been written out.
4565 // TODO(felixge) This function is rather complex and could benefit from
dcarney 2014/01/17 09:10:12 drop the todo
haimuiba 2014/01/20 08:10:27 Done.
4566 // better variable naming and/or splitting up.
4547 template<typename Char> 4567 template<typename Char>
4548 void Visit(const Char* chars, const int length) { 4568 void Visit(const Char* chars, const int length) {
4549 using namespace unibrow; 4569 using namespace unibrow;
4550 ASSERT(!early_termination_); 4570 ASSERT(!early_termination_);
4551 if (length == 0) return; 4571 if (length == 0) return;
4552 // Copy state to stack. 4572 // Copy state to stack.
4553 char* buffer = buffer_; 4573 char* buffer = buffer_;
4554 int last_character = 4574 int last_character =
4555 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_; 4575 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
4556 int i = 0; 4576 int i = 0;
(...skipping 17 matching lines...) Expand all
4574 // Write the characters to the stream. 4594 // Write the characters to the stream.
4575 if (sizeof(Char) == 1) { 4595 if (sizeof(Char) == 1) {
4576 for (; i < fast_length; i++) { 4596 for (; i < fast_length; i++) {
4577 buffer += 4597 buffer +=
4578 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++)); 4598 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));
4579 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4599 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4580 } 4600 }
4581 } else { 4601 } else {
4582 for (; i < fast_length; i++) { 4602 for (; i < fast_length; i++) {
4583 uint16_t character = *chars++; 4603 uint16_t character = *chars++;
4584 buffer += Utf8::Encode(buffer, character, last_character); 4604 buffer += Utf8::Encode(buffer,
4605 character,
4606 last_character,
4607 replace_invalid_utf8_);
4585 last_character = character; 4608 last_character = character;
4586 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); 4609 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4587 } 4610 }
4588 } 4611 }
4589 // Array is fully written. Exit. 4612 // Array is fully written. Exit.
4590 if (fast_length == length) { 4613 if (fast_length == length) {
4591 // Write state back out to object. 4614 // Write state back out to object.
4592 last_character_ = last_character; 4615 last_character_ = last_character;
4593 buffer_ = buffer; 4616 buffer_ = buffer;
4594 utf16_chars_read_ += length; 4617 utf16_chars_read_ += length;
4595 return; 4618 return;
4596 } 4619 }
4597 } 4620 }
4598 ASSERT(!skip_capacity_check_); 4621 ASSERT(!skip_capacity_check_);
4599 // Slow loop. Must check capacity on each iteration. 4622 // Slow loop. Must check capacity on each iteration.
4600 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); 4623 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
4601 ASSERT(remaining_capacity >= 0); 4624 ASSERT(remaining_capacity >= 0);
4602 for (; i < length && remaining_capacity > 0; i++) { 4625 for (; i < length && remaining_capacity > 0; i++) {
4603 uint16_t character = *chars++; 4626 uint16_t character = *chars++;
4627 if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
dcarney 2014/01/17 09:10:12 this line is in the correct place, but it's only t
haimuiba 2014/01/20 08:10:27 Done.
4628 early_termination_ = true;
4629 break;
4630 }
4631
dcarney 2014/01/17 09:10:12 no space
haimuiba 2014/01/20 08:10:27 Done.
4604 int written = WriteEndCharacter(character, 4632 int written = WriteEndCharacter(character,
4605 last_character, 4633 last_character,
4606 remaining_capacity, 4634 remaining_capacity,
4607 buffer); 4635 buffer,
4636 replace_invalid_utf8_);
4608 if (written == 0) { 4637 if (written == 0) {
4609 early_termination_ = true; 4638 early_termination_ = true;
4610 break; 4639 break;
4611 } 4640 }
4612 buffer += written; 4641 buffer += written;
4613 remaining_capacity -= written; 4642 remaining_capacity -= written;
4614 last_character = character; 4643 last_character = character;
4615 } 4644 }
4616 // Write state back out to object. 4645 // Write state back out to object.
4617 last_character_ = last_character; 4646 last_character_ = last_character;
(...skipping 27 matching lines...) Expand all
4645 return static_cast<int>(buffer_ - start_); 4674 return static_cast<int>(buffer_ - start_);
4646 } 4675 }
4647 4676
4648 private: 4677 private:
4649 bool early_termination_; 4678 bool early_termination_;
4650 int last_character_; 4679 int last_character_;
4651 char* buffer_; 4680 char* buffer_;
4652 char* const start_; 4681 char* const start_;
4653 int capacity_; 4682 int capacity_;
4654 bool const skip_capacity_check_; 4683 bool const skip_capacity_check_;
4684 bool const replace_invalid_utf8_;
4655 int utf16_chars_read_; 4685 int utf16_chars_read_;
4656 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); 4686 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
4657 }; 4687 };
4658 4688
4659 4689
4660 static bool RecursivelySerializeToUtf8(i::String* current, 4690 static bool RecursivelySerializeToUtf8(i::String* current,
4661 Utf8WriterVisitor* writer, 4691 Utf8WriterVisitor* writer,
4662 int recursion_budget) { 4692 int recursion_budget) {
4663 while (!writer->IsDone()) { 4693 while (!writer->IsDone()) {
4664 i::ConsString* cons_string = i::String::VisitFlat(writer, current); 4694 i::ConsString* cons_string = i::String::VisitFlat(writer, current);
(...skipping 18 matching lines...) Expand all
4683 int options) const { 4713 int options) const {
4684 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); 4714 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();
4685 LOG_API(isolate, "String::WriteUtf8"); 4715 LOG_API(isolate, "String::WriteUtf8");
4686 ENTER_V8(isolate); 4716 ENTER_V8(isolate);
4687 i::Handle<i::String> str = Utils::OpenHandle(this); 4717 i::Handle<i::String> str = Utils::OpenHandle(this);
4688 if (options & HINT_MANY_WRITES_EXPECTED) { 4718 if (options & HINT_MANY_WRITES_EXPECTED) {
4689 FlattenString(str); // Flatten the string for efficiency. 4719 FlattenString(str); // Flatten the string for efficiency.
4690 } 4720 }
4691 const int string_length = str->length(); 4721 const int string_length = str->length();
4692 bool write_null = !(options & NO_NULL_TERMINATION); 4722 bool write_null = !(options & NO_NULL_TERMINATION);
4723 bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
4724 int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
4693 // First check if we can just write the string without checking capacity. 4725 // First check if we can just write the string without checking capacity.
4694 if (capacity == -1 || capacity / 3 >= string_length) { 4726 if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
4695 Utf8WriterVisitor writer(buffer, capacity, true); 4727 Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
4696 const int kMaxRecursion = 100; 4728 const int kMaxRecursion = 100;
4697 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); 4729 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
4698 if (success) return writer.CompleteWrite(write_null, nchars_ref); 4730 if (success) return writer.CompleteWrite(write_null, nchars_ref);
4699 } else if (capacity >= string_length) { 4731 } else if (capacity >= string_length) {
4700 // First check that the buffer is large enough. 4732 // First check that the buffer is large enough.
4701 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); 4733 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
4702 if (utf8_bytes <= capacity) { 4734 if (utf8_bytes <= capacity) {
4703 // ASCII fast path. 4735 // ASCII fast path.
4704 if (utf8_bytes == string_length) { 4736 if (utf8_bytes == string_length) {
4705 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); 4737 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
4706 if (nchars_ref != NULL) *nchars_ref = string_length; 4738 if (nchars_ref != NULL) *nchars_ref = string_length;
4707 if (write_null && (utf8_bytes+1 <= capacity)) { 4739 if (write_null && (utf8_bytes+1 <= capacity)) {
4708 return string_length + 1; 4740 return string_length + 1;
4709 } 4741 }
4710 return string_length; 4742 return string_length;
4711 } 4743 }
4712 if (write_null && (utf8_bytes+1 > capacity)) { 4744 if (write_null && (utf8_bytes+1 > capacity)) {
4713 options |= NO_NULL_TERMINATION; 4745 options |= NO_NULL_TERMINATION;
4714 } 4746 }
4715 // Recurse once without a capacity limit. 4747 // Recurse once without a capacity limit.
4716 // This will get into the first branch above. 4748 // This will get into the first branch above.
4717 // TODO(dcarney) Check max left rec. in Utf8Length and fall through. 4749 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
4718 return WriteUtf8(buffer, -1, nchars_ref, options); 4750 return WriteUtf8(buffer, -1, nchars_ref, options);
4719 } 4751 }
4720 } 4752 }
4721 // Recursive slow path can potentially be unreasonably slow. Flatten. 4753 // Recursive slow path can potentially be unreasonably slow. Flatten.
4722 str = FlattenGetString(str); 4754 str = FlattenGetString(str);
4723 Utf8WriterVisitor writer(buffer, capacity, false); 4755 Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
4724 i::String::VisitFlat(&writer, *str); 4756 i::String::VisitFlat(&writer, *str);
4725 return writer.CompleteWrite(write_null, nchars_ref); 4757 return writer.CompleteWrite(write_null, nchars_ref);
4726 } 4758 }
4727 4759
4728 4760
4729 template<typename CharType> 4761 template<typename CharType>
4730 static inline int WriteHelper(const String* string, 4762 static inline int WriteHelper(const String* string,
4731 CharType* buffer, 4763 CharType* buffer,
4732 int start, 4764 int start,
4733 int length, 4765 int length,
(...skipping 2781 matching lines...) Expand 10 before | Expand all | Expand 10 after
7515 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate()); 7547 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());
7516 Address callback_address = 7548 Address callback_address =
7517 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback)); 7549 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));
7518 VMState<EXTERNAL> state(isolate); 7550 VMState<EXTERNAL> state(isolate);
7519 ExternalCallbackScope call_scope(isolate, callback_address); 7551 ExternalCallbackScope call_scope(isolate, callback_address);
7520 callback(info); 7552 callback(info);
7521 } 7553 }
7522 7554
7523 7555
7524 } } // namespace v8::internal 7556 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | src/unicode-inl.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698