src/api.cc - Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

Side by Side Diff: src/api.cc

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: Fix mistake in test case, finish patch Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 4486 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4497 int String::Utf8Length() const {	4497 int String::Utf8Length() const {

4498 i::Handle<i::String> str = Utils::OpenHandle(this);	4498 i::Handle<i::String> str = Utils::OpenHandle(this);

4499 i::Isolate* isolate = str->GetIsolate();	4499 i::Isolate* isolate = str->GetIsolate();

4500 return v8::Utf8Length(*str, isolate);	4500 return v8::Utf8Length(*str, isolate);

4501 }	4501 }

4502	4502

4503	4503

4504 class Utf8WriterVisitor {	4504 class Utf8WriterVisitor {

4505 public:	4505 public:

4506 Utf8WriterVisitor(	4506 Utf8WriterVisitor(

4507 char* buffer, int capacity, bool skip_capacity_check)	4507 char* buffer,

	4508 int capacity,

	4509 bool skip_capacity_check,

	4510 bool replace_invalid_utf8)

4508 : early_termination_(false),	4511 : early_termination_(false),

4509 last_character_(unibrow::Utf16::kNoPreviousCharacter),	4512 last_character_(unibrow::Utf16::kNoPreviousCharacter),

4510 buffer_(buffer),	4513 buffer_(buffer),

4511 start_(buffer),	4514 start_(buffer),

4512 capacity_(capacity),	4515 capacity_(capacity),

4513 skip_capacity_check_(capacity == -1 \|\| skip_capacity_check),	4516 skip_capacity_check_(capacity == -1 \|\| skip_capacity_check),

	4517 replace_invalid_utf8_(replace_invalid_utf8),

4514 utf16_chars_read_(0) {	4518 utf16_chars_read_(0) {

4515 }	4519 }

4516	4520

4517 static int WriteEndCharacter(uint16_t character,	4521 static int WriteEndCharacter(uint16_t character,

4518 int last_character,	4522 int last_character,

4519 int remaining,	4523 int remaining,

4520 char* const buffer) {	4524 char* const buffer,

	4525 bool replace_invalid_utf8) {

4521 using namespace unibrow;	4526 using namespace unibrow;

4522 ASSERT(remaining > 0);	4527 ASSERT(remaining > 0);

4523 // We can't use a local buffer here because Encode needs to modify	4528 // We can't use a local buffer here because Encode needs to modify

4524 // previous characters in the stream. We know, however, that	4529 // previous characters in the stream. We know, however, that

4525 // exactly one character will be advanced.	4530 // exactly one character will be advanced.

4526 if (Utf16::IsTrailSurrogate(character) &&	4531 if (Utf16::IsSurrogatePair(last_character, character)) {

4527 Utf16::IsLeadSurrogate(last_character)) {	4532 int written = Utf8::Encode(buffer,

4528 int written = Utf8::Encode(buffer, character, last_character);	4533 character,

	4534 last_character,

	4535 replace_invalid_utf8);

4529 ASSERT(written == 1);	4536 ASSERT(written == 1);

4530 return written;	4537 return written;

4531 }	4538 }

4532 // Use a scratch buffer to check the required characters.	4539 // Use a scratch buffer to check the required characters.

4533 char temp_buffer[Utf8::kMaxEncodedSize];	4540 char temp_buffer[Utf8::kMaxEncodedSize];

4534 // Can't encode using last_character as gcc has array bounds issues.	4541 // Can't encode using last_character as gcc has array bounds issues.

4535 int written = Utf8::Encode(temp_buffer,	4542 int written = Utf8::Encode(temp_buffer,

4536 character,	4543 character,

4537 Utf16::kNoPreviousCharacter);	4544 Utf16::kNoPreviousCharacter,

	4545 replace_invalid_utf8);

4538 // Won't fit.	4546 // Won't fit.

4539 if (written > remaining) return 0;	4547 if (written > remaining) return 0;

4540 // Copy over the character from temp_buffer.	4548 // Copy over the character from temp_buffer.

4541 for (int j = 0; j < written; j++) {	4549 for (int j = 0; j < written; j++) {

4542 buffer[j] = temp_buffer[j];	4550 buffer[j] = temp_buffer[j];

4543 }	4551 }

4544 return written;	4552 return written;

4545 }	4553 }

4546	4554

	4555 // Visit writes out a group of code units (chars) of a v8::String to the

	4556 // internal buffer_. This is done in two phases. The first phase calculates a

	4557 // pessimistic estimate (writable_length) on how many code units can be safely

	4558 // written without exceeding the buffer capacity and without writing the last

	4559 // code unit (it could be a lead surrogate). The estimated number of code

	4560 // units is then written out in one go, and the reported byte usage is used

	4561 // to correct the estimate. This is repeated until the estimate becomes <= 0

	4562 // or all code units have been written out. The second phase writes out code

	4563 // units until the buffer capacity is reached, would be exceeded by the next

	4564 // unit, or all units have been written out.

	4565 // TODO(felixge) This function is rather complex and could benefit from
	dcarney 2014/01/17 09:10:12: drop the todo. haimuiba 2014/01/20 08:10:27: Done.
	4566 // better variable naming and/or splitting up.

4547 template<typename Char>	4567 template<typename Char>

4548 void Visit(const Char* chars, const int length) {	4568 void Visit(const Char* chars, const int length) {

4549 using namespace unibrow;	4569 using namespace unibrow;

4550 ASSERT(!early_termination_);	4570 ASSERT(!early_termination_);

4551 if (length == 0) return;	4571 if (length == 0) return;

4552 // Copy state to stack.	4572 // Copy state to stack.

4553 char* buffer = buffer_;	4573 char* buffer = buffer_;

4554 int last_character =	4574 int last_character =

4555 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;	4575 sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;

4556 int i = 0;	4576 int i = 0;

(...skipping 17 matching lines...) Expand all Loading...
4574 // Write the characters to the stream.	4594 // Write the characters to the stream.

4575 if (sizeof(Char) == 1) {	4595 if (sizeof(Char) == 1) {

4576 for (; i < fast_length; i++) {	4596 for (; i < fast_length; i++) {

4577 buffer +=	4597 buffer +=

4578 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));	4598 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));

4579 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);	4599 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);

4580 }	4600 }

4581 } else {	4601 } else {

4582 for (; i < fast_length; i++) {	4602 for (; i < fast_length; i++) {

4583 uint16_t character = *chars++;	4603 uint16_t character = *chars++;

4584 buffer += Utf8::Encode(buffer, character, last_character);	4604 buffer += Utf8::Encode(buffer,

	4605 character,

	4606 last_character,

	4607 replace_invalid_utf8_);

4585 last_character = character;	4608 last_character = character;

4586 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);	4609 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);

4587 }	4610 }

4588 }	4611 }

4589 // Array is fully written. Exit.	4612 // Array is fully written. Exit.

4590 if (fast_length == length) {	4613 if (fast_length == length) {

4591 // Write state back out to object.	4614 // Write state back out to object.

4592 last_character_ = last_character;	4615 last_character_ = last_character;

4593 buffer_ = buffer;	4616 buffer_ = buffer;

4594 utf16_chars_read_ += length;	4617 utf16_chars_read_ += length;

4595 return;	4618 return;

4596 }	4619 }

4597 }	4620 }

4598 ASSERT(!skip_capacity_check_);	4621 ASSERT(!skip_capacity_check_);

4599 // Slow loop. Must check capacity on each iteration.	4622 // Slow loop. Must check capacity on each iteration.

4600 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);	4623 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);

4601 ASSERT(remaining_capacity >= 0);	4624 ASSERT(remaining_capacity >= 0);

4602 for (; i < length && remaining_capacity > 0; i++) {	4625 for (; i < length && remaining_capacity > 0; i++) {

4603 uint16_t character = *chars++;	4626 uint16_t character = *chars++;

	4627 if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
	dcarney 2014/01/17 09:10:12: this line is in the correct place, but it's only true because we can have at most 3 characters remaining here; please add a one line comment. haimuiba 2014/01/20 08:10:27: Done.
	4628 early_termination_ = true;

	4629 break;

	4630 }

	4631
	dcarney 2014/01/17 09:10:12: no space. haimuiba 2014/01/20 08:10:27: Done.
4604 int written = WriteEndCharacter(character,	4632 int written = WriteEndCharacter(character,

4605 last_character,	4633 last_character,

4606 remaining_capacity,	4634 remaining_capacity,

4607 buffer);	4635 buffer,

	4636 replace_invalid_utf8_);

4608 if (written == 0) {	4637 if (written == 0) {

4609 early_termination_ = true;	4638 early_termination_ = true;

4610 break;	4639 break;

4611 }	4640 }

4612 buffer += written;	4641 buffer += written;

4613 remaining_capacity -= written;	4642 remaining_capacity -= written;

4614 last_character = character;	4643 last_character = character;

4615 }	4644 }

4616 // Write state back out to object.	4645 // Write state back out to object.

4617 last_character_ = last_character;	4646 last_character_ = last_character;

(...skipping 27 matching lines...) Expand all Loading...
4645 return static_cast<int>(buffer_ - start_);	4674 return static_cast<int>(buffer_ - start_);

4646 }	4675 }

4647	4676

4648 private:	4677 private:

4649 bool early_termination_;	4678 bool early_termination_;

4650 int last_character_;	4679 int last_character_;

4651 char* buffer_;	4680 char* buffer_;

4652 char* const start_;	4681 char* const start_;

4653 int capacity_;	4682 int capacity_;

4654 bool const skip_capacity_check_;	4683 bool const skip_capacity_check_;

	4684 bool const replace_invalid_utf8_;

4655 int utf16_chars_read_;	4685 int utf16_chars_read_;

4656 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);	4686 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

4657 };	4687 };

4658	4688

4659	4689

4660 static bool RecursivelySerializeToUtf8(i::String* current,	4690 static bool RecursivelySerializeToUtf8(i::String* current,

4661 Utf8WriterVisitor* writer,	4691 Utf8WriterVisitor* writer,

4662 int recursion_budget) {	4692 int recursion_budget) {

4663 while (!writer->IsDone()) {	4693 while (!writer->IsDone()) {

4664 i::ConsString* cons_string = i::String::VisitFlat(writer, current);	4694 i::ConsString* cons_string = i::String::VisitFlat(writer, current);

(...skipping 18 matching lines...) Expand all Loading...
4683 int options) const {	4713 int options) const {

4684 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();	4714 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();

4685 LOG_API(isolate, "String::WriteUtf8");	4715 LOG_API(isolate, "String::WriteUtf8");

4686 ENTER_V8(isolate);	4716 ENTER_V8(isolate);

4687 i::Handle<i::String> str = Utils::OpenHandle(this);	4717 i::Handle<i::String> str = Utils::OpenHandle(this);

4688 if (options & HINT_MANY_WRITES_EXPECTED) {	4718 if (options & HINT_MANY_WRITES_EXPECTED) {

4689 FlattenString(str); // Flatten the string for efficiency.	4719 FlattenString(str); // Flatten the string for efficiency.

4690 }	4720 }

4691 const int string_length = str->length();	4721 const int string_length = str->length();

4692 bool write_null = !(options & NO_NULL_TERMINATION);	4722 bool write_null = !(options & NO_NULL_TERMINATION);

	4723 bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);

	4724 int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;

4693 // First check if we can just write the string without checking capacity.	4725 // First check if we can just write the string without checking capacity.

4694 if (capacity == -1 \|\| capacity / 3 >= string_length) {	4726 if (capacity == -1 \|\| capacity / max16BitCodeUnitSize >= string_length) {

4695 Utf8WriterVisitor writer(buffer, capacity, true);	4727 Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);

4696 const int kMaxRecursion = 100;	4728 const int kMaxRecursion = 100;

4697 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);	4729 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);

4698 if (success) return writer.CompleteWrite(write_null, nchars_ref);	4730 if (success) return writer.CompleteWrite(write_null, nchars_ref);

4699 } else if (capacity >= string_length) {	4731 } else if (capacity >= string_length) {

4700 // First check that the buffer is large enough.	4732 // First check that the buffer is large enough.

4701 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());	4733 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());

4702 if (utf8_bytes <= capacity) {	4734 if (utf8_bytes <= capacity) {

4703 // ASCII fast path.	4735 // ASCII fast path.

4704 if (utf8_bytes == string_length) {	4736 if (utf8_bytes == string_length) {

4705 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);	4737 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);

4706 if (nchars_ref != NULL) *nchars_ref = string_length;	4738 if (nchars_ref != NULL) *nchars_ref = string_length;

4707 if (write_null && (utf8_bytes+1 <= capacity)) {	4739 if (write_null && (utf8_bytes+1 <= capacity)) {

4708 return string_length + 1;	4740 return string_length + 1;

4709 }	4741 }

4710 return string_length;	4742 return string_length;

4711 }	4743 }

4712 if (write_null && (utf8_bytes+1 > capacity)) {	4744 if (write_null && (utf8_bytes+1 > capacity)) {

4713 options \|= NO_NULL_TERMINATION;	4745 options \|= NO_NULL_TERMINATION;

4714 }	4746 }

4715 // Recurse once without a capacity limit.	4747 // Recurse once without a capacity limit.

4716 // This will get into the first branch above.	4748 // This will get into the first branch above.

4717 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.	4749 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.

4718 return WriteUtf8(buffer, -1, nchars_ref, options);	4750 return WriteUtf8(buffer, -1, nchars_ref, options);

4719 }	4751 }

4720 }	4752 }

4721 // Recursive slow path can potentially be unreasonably slow. Flatten.	4753 // Recursive slow path can potentially be unreasonably slow. Flatten.

4722 str = FlattenGetString(str);	4754 str = FlattenGetString(str);

4723 Utf8WriterVisitor writer(buffer, capacity, false);	4755 Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);

4724 i::String::VisitFlat(&writer, *str);	4756 i::String::VisitFlat(&writer, *str);

4725 return writer.CompleteWrite(write_null, nchars_ref);	4757 return writer.CompleteWrite(write_null, nchars_ref);

4726 }	4758 }

4727	4759

4728	4760

4729 template<typename CharType>	4761 template<typename CharType>

4730 static inline int WriteHelper(const String* string,	4762 static inline int WriteHelper(const String* string,

4731 CharType* buffer,	4763 CharType* buffer,

4732 int start,	4764 int start,

4733 int length,	4765 int length,

(...skipping 2781 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7515 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());	7547 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());

7516 Address callback_address =	7548 Address callback_address =

7517 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));	7549 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));

7518 VMState<EXTERNAL> state(isolate);	7550 VMState<EXTERNAL> state(isolate);

7519 ExternalCallbackScope call_scope(isolate, callback_address);	7551 ExternalCallbackScope call_scope(isolate, callback_address);

7520 callback(info);	7552 callback(info);

7521 }	7553 }

7522	7554

7523	7555

7524 } } // namespace v8::internal	7556 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | src/unicode-inl.h » ('J')