src/api.cc - Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

Unified Diff: src/api.cc

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/api.cc

diff --git a/src/api.cc b/src/api.cc

index 9a68f639efec56559c0aa7ffe76a58ada5776fc4..59d0cb409724856d3b3127868d51714d353d5c97 100644

--- a/src/api.cc

+++ b/src/api.cc

@@ -4504,37 +4504,59 @@ int String::Utf8Length() const {

class Utf8WriterVisitor {

public:

Utf8WriterVisitor(

- char* buffer, int capacity, bool skip_capacity_check)

+ char* buffer,

+ int capacity,

+ bool skip_capacity_check,

+ bool allow_invalid_utf8)

: early_termination_(false),

last_character_(unibrow::Utf16::kNoPreviousCharacter),

buffer_(buffer),

start_(buffer),

capacity_(capacity),

skip_capacity_check_(capacity == -1 || skip_capacity_check),

+ allow_invalid_utf8_(allow_invalid_utf8),

utf16_chars_read_(0) {

}

- static int WriteEndCharacter(uint16_t character,

- int last_character,

+ // WritePair writes the current UTF-16 code unit to the given buffer. The

+ // function will go back inside the buffer to combine surrogate pairs.

+ static int WritePair(uint16_t current,

dcarney 2014/01/07 10:12:16 WritePair is a bad name here, since it may or not

+ int previous,

+ char* buffer,

+ bool allow_invalid_utf8) {

+ using namespace unibrow;

+ int code_point = current;

+ int written = 0;

+ if (Utf16::IsSurrogatePair(previous, current)) {

+ code_point = Utf16::CombineSurrogatePair(previous, current);

+ buffer -= Utf8::kSizeOfUnmatchedSurrogate;

+ written -= Utf8::kSizeOfUnmatchedSurrogate;

+ }

+ return written + Utf8::Encode(buffer, code_point, allow_invalid_utf8);

+ }

+ static int WriteEndCharacter(uint16_t current,

+ int previous,

int remaining,

- char* const buffer) {

+ char* const buffer,

+ bool allow_invalid_utf8) {

using namespace unibrow;

ASSERT(remaining > 0);

- // We can't use a local buffer here because Encode needs to modify

- // previous characters in the stream. We know, however, that

- // exactly one character will be advanced.

- if (Utf16::IsTrailSurrogate(character) &&

- Utf16::IsLeadSurrogate(last_character)) {

- int written = Utf8::Encode(buffer, character, last_character);

+ // We can't use a local buffer here because WritePair needs to modify

+ // previous characters in the stream. We know, however, that exactly one

+ // character will be advanced.

+ if (Utf16::IsSurrogatePair(previous, current)) {

+ int written = WritePair(current, previous, buffer, allow_invalid_utf8);

ASSERT(written == 1);

return written;

}

// Use a scratch buffer to check the required characters.

char temp_buffer[Utf8::kMaxEncodedSize];

// Can't encode using last_character as gcc has array bounds issues.

- int written = Utf8::Encode(temp_buffer,

- character,

- Utf16::kNoPreviousCharacter);

+ int written = WritePair(current,

dcarney 2014/01/07 10:12:16 this is not a surrogate pair, could use Encode dir

+ Utf16::kNoPreviousCharacter,

+ temp_buffer,

+ allow_invalid_utf8);

// Won't fit.

if (written > remaining) return 0;

// Copy over the character from temp_buffer.

@@ -4581,7 +4603,10 @@ class Utf8WriterVisitor {

} else {

for (; i < fast_length; i++) {

uint16_t character = *chars++;

- buffer += Utf8::Encode(buffer, character, last_character);

+ buffer += WritePair(character,

+ last_character,

+ buffer,

+ allow_invalid_utf8_);

last_character = character;

ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

}

@@ -4604,7 +4629,8 @@ class Utf8WriterVisitor {

int written = WriteEndCharacter(character,

last_character,

remaining_capacity,

- buffer);

+ buffer,

+ allow_invalid_utf8_);

if (written == 0) {

early_termination_ = true;

break;

@@ -4652,6 +4678,7 @@ class Utf8WriterVisitor {

char* const start_;

int capacity_;

bool const skip_capacity_check_;

+ bool const allow_invalid_utf8_;

int utf16_chars_read_;

DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

};

@@ -4690,9 +4717,16 @@ int String::WriteUtf8(char* buffer,

}

const int string_length = str->length();

bool write_null = !(options & NO_NULL_TERMINATION);

+ bool allow_invalid_utf8 = !(options & DISALLOW_INVALID_UTF8);

// First check if we can just write the string without checking capacity.

+ // @TODO Replace magic number 3 with something more descriptive. E.g.

+ // Utf8::kMaxTwoByteSize (as in the maximum size an unsigned 2 byte code unit

+ // value will take up when encoded to UTF-8)? When I first read this code I

+ // thought there might be an overflow bug here since UTF-8 may take up to 4

+ // bytes per code unit. Then I realized that a surrogate pair has a

+ // str.length of 2, making the code correct.

if (capacity == -1 || capacity / 3 >= string_length) {

- Utf8WriterVisitor writer(buffer, capacity, true);

+ Utf8WriterVisitor writer(buffer, capacity, true, allow_invalid_utf8);

const int kMaxRecursion = 100;

bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);

if (success) return writer.CompleteWrite(write_null, nchars_ref);

@@ -4720,7 +4754,7 @@ int String::WriteUtf8(char* buffer,

}

// Recursive slow path can potentially be unreasonably slow. Flatten.

str = FlattenGetString(str);

- Utf8WriterVisitor writer(buffer, capacity, false);

+ Utf8WriterVisitor writer(buffer, capacity, false, allow_invalid_utf8);

i::String::VisitFlat(&writer, *str);

return writer.CompleteWrite(write_null, nchars_ref);

}

« no previous file with comments | « include/v8.h ('k') | src/debug-agent.cc » ('j') | src/unicode-inl.h » ('J')