src/api.cc - Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

Side by Side Diff: src/api.cc

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 4486 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4497 int String::Utf8Length() const {	4497 int String::Utf8Length() const {

4498 i::Handle<i::String> str = Utils::OpenHandle(this);	4498 i::Handle<i::String> str = Utils::OpenHandle(this);

4499 i::Isolate* isolate = str->GetIsolate();	4499 i::Isolate* isolate = str->GetIsolate();

4500 return v8::Utf8Length(*str, isolate);	4500 return v8::Utf8Length(*str, isolate);

4501 }	4501 }

4502	4502

4503	4503

4504 class Utf8WriterVisitor {	4504 class Utf8WriterVisitor {

4505 public:	4505 public:

4506 Utf8WriterVisitor(	4506 Utf8WriterVisitor(

4507 char* buffer, int capacity, bool skip_capacity_check)	4507 char* buffer,

	4508 int capacity,

	4509 bool skip_capacity_check,

	4510 bool allow_invalid_utf8)

4508 : early_termination_(false),	4511 : early_termination_(false),

4509 last_character_(unibrow::Utf16::kNoPreviousCharacter),	4512 last_character_(unibrow::Utf16::kNoPreviousCharacter),

4510 buffer_(buffer),	4513 buffer_(buffer),

4511 start_(buffer),	4514 start_(buffer),

4512 capacity_(capacity),	4515 capacity_(capacity),

4513 skip_capacity_check_(capacity == -1 \|\| skip_capacity_check),	4516 skip_capacity_check_(capacity == -1 \|\| skip_capacity_check),

	4517 allow_invalid_utf8_(allow_invalid_utf8),

4514 utf16_chars_read_(0) {	4518 utf16_chars_read_(0) {

4515 }	4519 }

4516	4520

4517 static int WriteEndCharacter(uint16_t character,	4521 // WritePair writes the current UTF-16 code unit to the given buffer. The

4518 int last_character,	4522 // function will go back inside the buffer to combine surrogate pairs.

	4523 static int WritePair(uint16_t current,
	dcarney 2014/01/07 10:12:16 WritePair is a bad name here, since it may or may not be a pair.
	4524 int previous,

	4525 char* buffer,

	4526 bool allow_invalid_utf8) {

	4527 using namespace unibrow;

	4528 int code_point = current;

	4529 int written = 0;

	4530 if (Utf16::IsSurrogatePair(previous, current)) {

	4531 code_point = Utf16::CombineSurrogatePair(previous, current);

	4532 buffer -= Utf8::kSizeOfUnmatchedSurrogate;

	4533 written -= Utf8::kSizeOfUnmatchedSurrogate;

	4534 }

	4535 return written + Utf8::Encode(buffer, code_point, allow_invalid_utf8);

	4536 }

	4537

	4538 static int WriteEndCharacter(uint16_t current,

	4539 int previous,

4519 int remaining,	4540 int remaining,

4520 char* const buffer) {	4541 char* const buffer,

	4542 bool allow_invalid_utf8) {

4521 using namespace unibrow;	4543 using namespace unibrow;

4522 ASSERT(remaining > 0);	4544 ASSERT(remaining > 0);

4523 // We can't use a local buffer here because Encode needs to modify	4545 // We can't use a local buffer here because WritePair needs to modify

4524 // previous characters in the stream. We know, however, that	4546 // previous characters in the stream. We know, however, that exactly one

4525 // exactly one character will be advanced.	4547 // character will be advanced.

4526 if (Utf16::IsTrailSurrogate(character) &&	4548 if (Utf16::IsSurrogatePair(previous, current)) {

4527 Utf16::IsLeadSurrogate(last_character)) {	4549 int written = WritePair(current, previous, buffer, allow_invalid_utf8);

4528 int written = Utf8::Encode(buffer, character, last_character);

4529 ASSERT(written == 1);	4550 ASSERT(written == 1);

4530 return written;	4551 return written;

4531 }	4552 }

4532 // Use a scratch buffer to check the required characters.	4553 // Use a scratch buffer to check the required characters.

4533 char temp_buffer[Utf8::kMaxEncodedSize];	4554 char temp_buffer[Utf8::kMaxEncodedSize];

4534 // Can't encode using last_character as gcc has array bounds issues.	4555 // Can't encode using last_character as gcc has array bounds issues.

4535 int written = Utf8::Encode(temp_buffer,	4556 int written = WritePair(current,
	dcarney 2014/01/07 10:12:16 this is not a surrogate pair, could use Encode directly
4536 character,	4557 Utf16::kNoPreviousCharacter,

4537 Utf16::kNoPreviousCharacter);	4558 temp_buffer,

	4559 allow_invalid_utf8);

4538 // Won't fit.	4560 // Won't fit.

4539 if (written > remaining) return 0;	4561 if (written > remaining) return 0;

4540 // Copy over the character from temp_buffer.	4562 // Copy over the character from temp_buffer.

4541 for (int j = 0; j < written; j++) {	4563 for (int j = 0; j < written; j++) {

4542 buffer[j] = temp_buffer[j];	4564 buffer[j] = temp_buffer[j];

4543 }	4565 }

4544 return written;	4566 return written;

4545 }	4567 }

4546	4568

4547 template<typename Char>	4569 template<typename Char>

(...skipping 26 matching lines...) Expand all Loading...
4574 // Write the characters to the stream.	4596 // Write the characters to the stream.

4575 if (sizeof(Char) == 1) {	4597 if (sizeof(Char) == 1) {

4576 for (; i < fast_length; i++) {	4598 for (; i < fast_length; i++) {

4577 buffer +=	4599 buffer +=

4578 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));	4600 Utf8::EncodeOneByte(buffer, static_cast<uint8_t>(*chars++));

4579 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);	4601 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);

4580 }	4602 }

4581 } else {	4603 } else {

4582 for (; i < fast_length; i++) {	4604 for (; i < fast_length; i++) {

4583 uint16_t character = *chars++;	4605 uint16_t character = *chars++;

4584 buffer += Utf8::Encode(buffer, character, last_character);	4606 buffer += WritePair(character,

	4607 last_character,

	4608 buffer,

	4609 allow_invalid_utf8_);

4585 last_character = character;	4610 last_character = character;

4586 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);	4611 ASSERT(capacity_ == -1 \|\| (buffer - start_) <= capacity_);

4587 }	4612 }

4588 }	4613 }

4589 // Array is fully written. Exit.	4614 // Array is fully written. Exit.

4590 if (fast_length == length) {	4615 if (fast_length == length) {

4591 // Write state back out to object.	4616 // Write state back out to object.

4592 last_character_ = last_character;	4617 last_character_ = last_character;

4593 buffer_ = buffer;	4618 buffer_ = buffer;

4594 utf16_chars_read_ += length;	4619 utf16_chars_read_ += length;

4595 return;	4620 return;

4596 }	4621 }

4597 }	4622 }

4598 ASSERT(!skip_capacity_check_);	4623 ASSERT(!skip_capacity_check_);

4599 // Slow loop. Must check capacity on each iteration.	4624 // Slow loop. Must check capacity on each iteration.

4600 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);	4625 int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);

4601 ASSERT(remaining_capacity >= 0);	4626 ASSERT(remaining_capacity >= 0);

4602 for (; i < length && remaining_capacity > 0; i++) {	4627 for (; i < length && remaining_capacity > 0; i++) {

4603 uint16_t character = *chars++;	4628 uint16_t character = *chars++;

4604 int written = WriteEndCharacter(character,	4629 int written = WriteEndCharacter(character,

4605 last_character,	4630 last_character,

4606 remaining_capacity,	4631 remaining_capacity,

4607 buffer);	4632 buffer,

	4633 allow_invalid_utf8_);

4608 if (written == 0) {	4634 if (written == 0) {

4609 early_termination_ = true;	4635 early_termination_ = true;

4610 break;	4636 break;

4611 }	4637 }

4612 buffer += written;	4638 buffer += written;

4613 remaining_capacity -= written;	4639 remaining_capacity -= written;

4614 last_character = character;	4640 last_character = character;

4615 }	4641 }

4616 // Write state back out to object.	4642 // Write state back out to object.

4617 last_character_ = last_character;	4643 last_character_ = last_character;

(...skipping 27 matching lines...) Expand all Loading...
4645 return static_cast<int>(buffer_ - start_);	4671 return static_cast<int>(buffer_ - start_);

4646 }	4672 }

4647	4673

4648 private:	4674 private:

4649 bool early_termination_;	4675 bool early_termination_;

4650 int last_character_;	4676 int last_character_;

4651 char* buffer_;	4677 char* buffer_;

4652 char* const start_;	4678 char* const start_;

4653 int capacity_;	4679 int capacity_;

4654 bool const skip_capacity_check_;	4680 bool const skip_capacity_check_;

	4681 bool const allow_invalid_utf8_;

4655 int utf16_chars_read_;	4682 int utf16_chars_read_;

4656 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);	4683 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

4657 };	4684 };

4658	4685

4659	4686

4660 static bool RecursivelySerializeToUtf8(i::String* current,	4687 static bool RecursivelySerializeToUtf8(i::String* current,

4661 Utf8WriterVisitor* writer,	4688 Utf8WriterVisitor* writer,

4662 int recursion_budget) {	4689 int recursion_budget) {

4663 while (!writer->IsDone()) {	4690 while (!writer->IsDone()) {

4664 i::ConsString* cons_string = i::String::VisitFlat(writer, current);	4691 i::ConsString* cons_string = i::String::VisitFlat(writer, current);

(...skipping 18 matching lines...) Expand all Loading...
4683 int options) const {	4710 int options) const {

4684 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();	4711 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();

4685 LOG_API(isolate, "String::WriteUtf8");	4712 LOG_API(isolate, "String::WriteUtf8");

4686 ENTER_V8(isolate);	4713 ENTER_V8(isolate);

4687 i::Handle<i::String> str = Utils::OpenHandle(this);	4714 i::Handle<i::String> str = Utils::OpenHandle(this);

4688 if (options & HINT_MANY_WRITES_EXPECTED) {	4715 if (options & HINT_MANY_WRITES_EXPECTED) {

4689 FlattenString(str); // Flatten the string for efficiency.	4716 FlattenString(str); // Flatten the string for efficiency.

4690 }	4717 }

4691 const int string_length = str->length();	4718 const int string_length = str->length();

4692 bool write_null = !(options & NO_NULL_TERMINATION);	4719 bool write_null = !(options & NO_NULL_TERMINATION);

	4720 bool allow_invalid_utf8 = !(options & DISALLOW_INVALID_UTF8);

4693 // First check if we can just write the string without checking capacity.	4721 // First check if we can just write the string without checking capacity.

	4722 // @TODO Replace magic number 3 with something more descriptive. E.g.

	4723 // Utf8::kMaxTwoByteSize (as in the maximum size an unsigned 2 byte code unit

	4724 // value will take up when encoded to UTF-8)? When I first read this code I

	4725 // thought there might be an overflow bug here since UTF-8 may take up to 4

	4726 // bytes per code point. Then I realized that a surrogate pair has a

	4727 // str.length of 2, making the code correct.

4694 if (capacity == -1 \|\| capacity / 3 >= string_length) {	4728 if (capacity == -1 \|\| capacity / 3 >= string_length) {

4695 Utf8WriterVisitor writer(buffer, capacity, true);	4729 Utf8WriterVisitor writer(buffer, capacity, true, allow_invalid_utf8);

4696 const int kMaxRecursion = 100;	4730 const int kMaxRecursion = 100;

4697 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);	4731 bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);

4698 if (success) return writer.CompleteWrite(write_null, nchars_ref);	4732 if (success) return writer.CompleteWrite(write_null, nchars_ref);

4699 } else if (capacity >= string_length) {	4733 } else if (capacity >= string_length) {

4700 // First check that the buffer is large enough.	4734 // First check that the buffer is large enough.

4701 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());	4735 int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());

4702 if (utf8_bytes <= capacity) {	4736 if (utf8_bytes <= capacity) {

4703 // ASCII fast path.	4737 // ASCII fast path.

4704 if (utf8_bytes == string_length) {	4738 if (utf8_bytes == string_length) {

4705 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);	4739 WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);

4706 if (nchars_ref != NULL) *nchars_ref = string_length;	4740 if (nchars_ref != NULL) *nchars_ref = string_length;

4707 if (write_null && (utf8_bytes+1 <= capacity)) {	4741 if (write_null && (utf8_bytes+1 <= capacity)) {

4708 return string_length + 1;	4742 return string_length + 1;

4709 }	4743 }

4710 return string_length;	4744 return string_length;

4711 }	4745 }

4712 if (write_null && (utf8_bytes+1 > capacity)) {	4746 if (write_null && (utf8_bytes+1 > capacity)) {

4713 options \|= NO_NULL_TERMINATION;	4747 options \|= NO_NULL_TERMINATION;

4714 }	4748 }

4715 // Recurse once without a capacity limit.	4749 // Recurse once without a capacity limit.

4716 // This will get into the first branch above.	4750 // This will get into the first branch above.

4717 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.	4751 // TODO(dcarney) Check max left rec. in Utf8Length and fall through.

4718 return WriteUtf8(buffer, -1, nchars_ref, options);	4752 return WriteUtf8(buffer, -1, nchars_ref, options);

4719 }	4753 }

4720 }	4754 }

4721 // Recursive slow path can potentially be unreasonably slow. Flatten.	4755 // Recursive slow path can potentially be unreasonably slow. Flatten.

4722 str = FlattenGetString(str);	4756 str = FlattenGetString(str);

4723 Utf8WriterVisitor writer(buffer, capacity, false);	4757 Utf8WriterVisitor writer(buffer, capacity, false, allow_invalid_utf8);

4724 i::String::VisitFlat(&writer, *str);	4758 i::String::VisitFlat(&writer, *str);

4725 return writer.CompleteWrite(write_null, nchars_ref);	4759 return writer.CompleteWrite(write_null, nchars_ref);

4726 }	4760 }

4727	4761

4728	4762

4729 template<typename CharType>	4763 template<typename CharType>

4730 static inline int WriteHelper(const String* string,	4764 static inline int WriteHelper(const String* string,

4731 CharType* buffer,	4765 CharType* buffer,

4732 int start,	4766 int start,

4733 int length,	4767 int length,

(...skipping 2781 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7515 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());	7549 Isolate* isolate = reinterpret_cast<Isolate*>(info.GetIsolate());

7516 Address callback_address =	7550 Address callback_address =

7517 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));	7551 reinterpret_cast<Address>(reinterpret_cast<intptr_t>(callback));

7518 VMState<EXTERNAL> state(isolate);	7552 VMState<EXTERNAL> state(isolate);

7519 ExternalCallbackScope call_scope(isolate, callback_address);	7553 ExternalCallbackScope call_scope(isolate, callback_address);

7520 callback(info);	7554 callback(info);

7521 }	7555 }

7522	7556

7523	7557

7524 } } // namespace v8::internal	7558 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « include/v8.h ('k') | src/debug-agent.cc » ('j') | src/unicode-inl.h » ('J')