src/api.cc - Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

Unified Diff: src/api.cc

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: Rebase Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/api.cc

diff --git a/src/api.cc b/src/api.cc

index 50879a13766eb660d58d530299706801812550f7..6b5c1a921c2d7904d44241ab4b99ee929c2f159b 100644

--- a/src/api.cc

+++ b/src/api.cc

@@ -4454,28 +4454,35 @@ int String::Utf8Length() const {

class Utf8WriterVisitor {

public:

Utf8WriterVisitor(

- char* buffer, int capacity, bool skip_capacity_check)

- : early_termination_(false),

- last_character_(unibrow::Utf16::kNoPreviousCharacter),

- buffer_(buffer),

- start_(buffer),

- capacity_(capacity),

- skip_capacity_check_(capacity == -1 || skip_capacity_check),

- utf16_chars_read_(0) {

+ char* buffer,

+ int capacity,

+ bool skip_capacity_check,

+ bool replace_invalid_utf8)

+ : early_termination_(false),

+ last_character_(unibrow::Utf16::kNoPreviousCharacter),

+ buffer_(buffer),

+ start_(buffer),

+ capacity_(capacity),

+ skip_capacity_check_(capacity == -1 || skip_capacity_check),

+ replace_invalid_utf8_(replace_invalid_utf8),

+ utf16_chars_read_(0) {

}

static int WriteEndCharacter(uint16_t character,

int last_character,

int remaining,

- char* const buffer) {

+ char* const buffer,

+ bool replace_invalid_utf8) {

using namespace unibrow;

ASSERT(remaining > 0);

// We can't use a local buffer here because Encode needs to modify

// previous characters in the stream. We know, however, that

// exactly one character will be advanced.

- if (Utf16::IsTrailSurrogate(character) &&

- Utf16::IsLeadSurrogate(last_character)) {

- int written = Utf8::Encode(buffer, character, last_character);

+ if (Utf16::IsSurrogatePair(last_character, character)) {

+ int written = Utf8::Encode(buffer,

+ character,

+ last_character,

+ replace_invalid_utf8);

ASSERT(written == 1);

return written;

}

@@ -4484,7 +4491,8 @@ class Utf8WriterVisitor {

// Can't encode using last_character as gcc has array bounds issues.

int written = Utf8::Encode(temp_buffer,

character,

- Utf16::kNoPreviousCharacter);

+ Utf16::kNoPreviousCharacter,

+ replace_invalid_utf8);

// Won't fit.

if (written > remaining) return 0;

// Copy over the character from temp_buffer.

@@ -4494,6 +4502,16 @@ class Utf8WriterVisitor {

return written;

}

+ // Visit writes out a group of code units (chars) of a v8::String to the

+ // internal buffer_. This is done in two phases. The first phase calculates a

+ // pessimistic estimate (writable_length) on how many code units can be safely

+ // written without exceeding the buffer capacity and without writing the last

+ // code unit (it could be a lead surrogate). The estimated number of code

+ // units is then written out in one go, and the reported byte usage is used

+ // to correct the estimate. This is repeated until the estimate becomes <= 0

+ // or all code units have been written out. The second phase writes out code

+ // units until the buffer capacity is reached, would be exceeded by the next

+ // unit, or all units have been written out.

template<typename Char>

void Visit(const Char* chars, const int length) {

using namespace unibrow;

@@ -4531,7 +4549,10 @@ class Utf8WriterVisitor {

} else {

for (; i < fast_length; i++) {

uint16_t character = *chars++;

- buffer += Utf8::Encode(buffer, character, last_character);

+ buffer += Utf8::Encode(buffer,

+ character,

+ last_character,

+ replace_invalid_utf8_);

last_character = character;

ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

}

@@ -4551,10 +4572,17 @@ class Utf8WriterVisitor {

ASSERT(remaining_capacity >= 0);

for (; i < length && remaining_capacity > 0; i++) {

uint16_t character = *chars++;

+ // remaining_capacity is <= 3 bytes at this point, so we do not write out

+ // an unmatched lead surrogate.

+ if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {

+ early_termination_ = true;

+ break;

+ }

int written = WriteEndCharacter(character,

last_character,

remaining_capacity,

- buffer);

+ buffer,

+ replace_invalid_utf8_);

if (written == 0) {

early_termination_ = true;

break;

@@ -4602,6 +4630,7 @@ class Utf8WriterVisitor {

char* const start_;

int capacity_;

bool const skip_capacity_check_;

+ bool const replace_invalid_utf8_;

int utf16_chars_read_;

DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

};

@@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer,

}

const int string_length = str->length();

bool write_null = !(options & NO_NULL_TERMINATION);

+ bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);

+ int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;

// First check if we can just write the string without checking capacity.

- if (capacity == -1 || capacity / 3 >= string_length) {

- Utf8WriterVisitor writer(buffer, capacity, true);

+ if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {

+ Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);

const int kMaxRecursion = 100;

bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);

if (success) return writer.CompleteWrite(write_null, nchars_ref);

@@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer,

}

// Recursive slow path can potentially be unreasonably slow. Flatten.

str = FlattenGetString(str);

- Utf8WriterVisitor writer(buffer, capacity, false);

+ Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);

i::String::VisitFlat(&writer, *str);

return writer.CompleteWrite(write_null, nchars_ref);

}

« no previous file with comments | « include/v8.h ('k') | src/unicode.h » ('j') | no next file with comments »