| Index: src/api.cc
|
| diff --git a/src/api.cc b/src/api.cc
|
| index 50879a13766eb660d58d530299706801812550f7..6b5c1a921c2d7904d44241ab4b99ee929c2f159b 100644
|
| --- a/src/api.cc
|
| +++ b/src/api.cc
|
| @@ -4454,28 +4454,35 @@ int String::Utf8Length() const {
|
| class Utf8WriterVisitor {
|
| public:
|
| Utf8WriterVisitor(
|
| - char* buffer, int capacity, bool skip_capacity_check)
|
| - : early_termination_(false),
|
| - last_character_(unibrow::Utf16::kNoPreviousCharacter),
|
| - buffer_(buffer),
|
| - start_(buffer),
|
| - capacity_(capacity),
|
| - skip_capacity_check_(capacity == -1 || skip_capacity_check),
|
| - utf16_chars_read_(0) {
|
| + char* buffer,
|
| + int capacity,
|
| + bool skip_capacity_check,
|
| + bool replace_invalid_utf8)
|
| + : early_termination_(false),
|
| + last_character_(unibrow::Utf16::kNoPreviousCharacter),
|
| + buffer_(buffer),
|
| + start_(buffer),
|
| + capacity_(capacity),
|
| + skip_capacity_check_(capacity == -1 || skip_capacity_check),
|
| + replace_invalid_utf8_(replace_invalid_utf8),
|
| + utf16_chars_read_(0) {
|
| }
|
|
|
| static int WriteEndCharacter(uint16_t character,
|
| int last_character,
|
| int remaining,
|
| - char* const buffer) {
|
| + char* const buffer,
|
| + bool replace_invalid_utf8) {
|
| using namespace unibrow;
|
| ASSERT(remaining > 0);
|
| // We can't use a local buffer here because Encode needs to modify
|
| // previous characters in the stream. We know, however, that
|
| // exactly one character will be advanced.
|
| - if (Utf16::IsTrailSurrogate(character) &&
|
| - Utf16::IsLeadSurrogate(last_character)) {
|
| - int written = Utf8::Encode(buffer, character, last_character);
|
| + if (Utf16::IsSurrogatePair(last_character, character)) {
|
| + int written = Utf8::Encode(buffer,
|
| + character,
|
| + last_character,
|
| + replace_invalid_utf8);
|
| ASSERT(written == 1);
|
| return written;
|
| }
|
| @@ -4484,7 +4491,8 @@ class Utf8WriterVisitor {
|
| // Can't encode using last_character as gcc has array bounds issues.
|
| int written = Utf8::Encode(temp_buffer,
|
| character,
|
| - Utf16::kNoPreviousCharacter);
|
| + Utf16::kNoPreviousCharacter,
|
| + replace_invalid_utf8);
|
| // Won't fit.
|
| if (written > remaining) return 0;
|
| // Copy over the character from temp_buffer.
|
| @@ -4494,6 +4502,16 @@ class Utf8WriterVisitor {
|
| return written;
|
| }
|
|
|
| + // Visit writes out a group of code units (chars) of a v8::String to the
|
| + // internal buffer_. This is done in two phases. The first phase calculates a
|
| + // pessimistic estimate (writable_length) on how many code units can be safely
|
| + // written without exceeding the buffer capacity and without writing the last
|
| + // code unit (it could be a lead surrogate). The estimated number of code
|
| + // units is then written out in one go, and the reported byte usage is used
|
| + // to correct the estimate. This is repeated until the estimate becomes <= 0
|
| + // or all code units have been written out. The second phase writes out code
|
| + // units until the buffer capacity is reached, would be exceeded by the next
|
| + // unit, or all units have been written out.
|
| template<typename Char>
|
| void Visit(const Char* chars, const int length) {
|
| using namespace unibrow;
|
| @@ -4531,7 +4549,10 @@ class Utf8WriterVisitor {
|
| } else {
|
| for (; i < fast_length; i++) {
|
| uint16_t character = *chars++;
|
| - buffer += Utf8::Encode(buffer, character, last_character);
|
| + buffer += Utf8::Encode(buffer,
|
| + character,
|
| + last_character,
|
| + replace_invalid_utf8_);
|
| last_character = character;
|
| ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
|
| }
|
| @@ -4551,10 +4572,17 @@ class Utf8WriterVisitor {
|
| ASSERT(remaining_capacity >= 0);
|
| for (; i < length && remaining_capacity > 0; i++) {
|
| uint16_t character = *chars++;
|
| + // remaining_capacity is <= 3 bytes at this point, so we do not write out
|
| + // an unmatched lead surrogate.
|
| + if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
|
| + early_termination_ = true;
|
| + break;
|
| + }
|
| int written = WriteEndCharacter(character,
|
| last_character,
|
| remaining_capacity,
|
| - buffer);
|
| + buffer,
|
| + replace_invalid_utf8_);
|
| if (written == 0) {
|
| early_termination_ = true;
|
| break;
|
| @@ -4602,6 +4630,7 @@ class Utf8WriterVisitor {
|
| char* const start_;
|
| int capacity_;
|
| bool const skip_capacity_check_;
|
| + bool const replace_invalid_utf8_;
|
| int utf16_chars_read_;
|
| DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
|
| };
|
| @@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer,
|
| }
|
| const int string_length = str->length();
|
| bool write_null = !(options & NO_NULL_TERMINATION);
|
| + bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
|
| + int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
|
| // First check if we can just write the string without checking capacity.
|
| - if (capacity == -1 || capacity / 3 >= string_length) {
|
| - Utf8WriterVisitor writer(buffer, capacity, true);
|
| + if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
|
| + Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
|
| const int kMaxRecursion = 100;
|
| bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
|
| if (success) return writer.CompleteWrite(write_null, nchars_ref);
|
| @@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer,
|
| }
|
| // Recursive slow path can potentially be unreasonable slow. Flatten.
|
| str = FlattenGetString(str);
|
| - Utf8WriterVisitor writer(buffer, capacity, false);
|
| + Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
|
| i::String::VisitFlat(&writer, *str);
|
| return writer.CompleteWrite(write_null, nchars_ref);
|
| }
|
|
|