Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(30)

Unified Diff: src/api.cc

Issue 121173009: String::WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master
Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « include/v8.h ('k') | src/debug-agent.cc » ('j') | src/unicode-inl.h » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/api.cc
diff --git a/src/api.cc b/src/api.cc
index 9a68f639efec56559c0aa7ffe76a58ada5776fc4..59d0cb409724856d3b3127868d51714d353d5c97 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -4504,37 +4504,59 @@ int String::Utf8Length() const {
class Utf8WriterVisitor {
public:
Utf8WriterVisitor(
- char* buffer, int capacity, bool skip_capacity_check)
+ char* buffer,
+ int capacity,
+ bool skip_capacity_check,
+ bool allow_invalid_utf8)
: early_termination_(false),
last_character_(unibrow::Utf16::kNoPreviousCharacter),
buffer_(buffer),
start_(buffer),
capacity_(capacity),
skip_capacity_check_(capacity == -1 || skip_capacity_check),
+ allow_invalid_utf8_(allow_invalid_utf8),
utf16_chars_read_(0) {
}
- static int WriteEndCharacter(uint16_t character,
- int last_character,
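+ // WritePair encodes the current UTF-16 code unit as UTF-8 into the given
+ // buffer. If |previous| and |current| form a surrogate pair, it steps back
+ // over the bytes already written for the unmatched lead surrogate and
+ // overwrites them with the combined code point. The return value is the net
+ // change in the number of bytes written.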
+ static int WritePair(uint16_t current,
dcarney 2014/01/07 10:12:16 WritePair is a bad name here, since it may or not
+ int previous,
+ char* buffer,
+ bool allow_invalid_utf8) {
+ using namespace unibrow;
+ int code_point = current;
+ int written = 0;
+ if (Utf16::IsSurrogatePair(previous, current)) {
+ code_point = Utf16::CombineSurrogatePair(previous, current);
+ buffer -= Utf8::kSizeOfUnmatchedSurrogate;
+ written -= Utf8::kSizeOfUnmatchedSurrogate;
+ }
+ return written + Utf8::Encode(buffer, code_point, allow_invalid_utf8);
+ }
+
+ static int WriteEndCharacter(uint16_t current,
+ int previous,
int remaining,
- char* const buffer) {
+ char* const buffer,
+ bool allow_invalid_utf8) {
using namespace unibrow;
ASSERT(remaining > 0);
- // We can't use a local buffer here because Encode needs to modify
- // previous characters in the stream. We know, however, that
- // exactly one character will be advanced.
- if (Utf16::IsTrailSurrogate(character) &&
- Utf16::IsLeadSurrogate(last_character)) {
- int written = Utf8::Encode(buffer, character, last_character);
+ // We can't use a local buffer here because WritePair needs to modify
+ // previous characters in the stream. We know, however, that exactly one
+ // character will be advanced.
+ if (Utf16::IsSurrogatePair(previous, current)) {
+ int written = WritePair(current, previous, buffer, allow_invalid_utf8);
ASSERT(written == 1);
return written;
}
// Use a scratch buffer to check the required characters.
char temp_buffer[Utf8::kMaxEncodedSize];
// Can't encode using last_character as gcc has array bounds issues.
- int written = Utf8::Encode(temp_buffer,
- character,
- Utf16::kNoPreviousCharacter);
+ int written = WritePair(current,
dcarney 2014/01/07 10:12:16 this is not a surrogate pair, could use Encode dir
+ Utf16::kNoPreviousCharacter,
+ temp_buffer,
+ allow_invalid_utf8);
// Won't fit.
if (written > remaining) return 0;
// Copy over the character from temp_buffer.
@@ -4581,7 +4603,10 @@ class Utf8WriterVisitor {
} else {
for (; i < fast_length; i++) {
uint16_t character = *chars++;
- buffer += Utf8::Encode(buffer, character, last_character);
+ buffer += WritePair(character,
+ last_character,
+ buffer,
+ allow_invalid_utf8_);
last_character = character;
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
}
@@ -4604,7 +4629,8 @@ class Utf8WriterVisitor {
int written = WriteEndCharacter(character,
last_character,
remaining_capacity,
- buffer);
+ buffer,
+ allow_invalid_utf8_);
if (written == 0) {
early_termination_ = true;
break;
@@ -4652,6 +4678,7 @@ class Utf8WriterVisitor {
char* const start_;
int capacity_;
bool const skip_capacity_check_;
+ bool const allow_invalid_utf8_;
int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
};
@@ -4690,9 +4717,16 @@ int String::WriteUtf8(char* buffer,
}
const int string_length = str->length();
bool write_null = !(options & NO_NULL_TERMINATION);
+ bool allow_invalid_utf8 = !(options & DISALLOW_INVALID_UTF8);
// First check if we can just write the string without checking capacity.
+ // @TODO Replace magic number 3 with something more descriptive, e.g.
+ // Utf8::kMaxTwoByteSize (the maximum size an unsigned 2-byte code unit
+ // value takes up when encoded to UTF-8). When I first read this code I
+ // thought there might be an overflow bug here, since UTF-8 may take up to 4
+ // bytes per code point. Then I realized that a surrogate pair has a
+ // str.length of 2, making the code correct.
if (capacity == -1 || capacity / 3 >= string_length) {
- Utf8WriterVisitor writer(buffer, capacity, true);
+ Utf8WriterVisitor writer(buffer, capacity, true, allow_invalid_utf8);
const int kMaxRecursion = 100;
bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
if (success) return writer.CompleteWrite(write_null, nchars_ref);
@@ -4720,7 +4754,7 @@ int String::WriteUtf8(char* buffer,
}
// Recursive slow path can potentially be unreasonable slow. Flatten.
str = FlattenGetString(str);
- Utf8WriterVisitor writer(buffer, capacity, false);
+ Utf8WriterVisitor writer(buffer, capacity, false, allow_invalid_utf8);
i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref);
}
« no previous file with comments | « include/v8.h ('k') | src/debug-agent.cc » ('j') | src/unicode-inl.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698