Index: src/api.cc |
diff --git a/src/api.cc b/src/api.cc |
index 9a68f639efec56559c0aa7ffe76a58ada5776fc4..59d0cb409724856d3b3127868d51714d353d5c97 100644 |
--- a/src/api.cc |
+++ b/src/api.cc |
@@ -4504,37 +4504,59 @@ int String::Utf8Length() const { |
class Utf8WriterVisitor { |
public: |
Utf8WriterVisitor( |
- char* buffer, int capacity, bool skip_capacity_check) |
+ char* buffer, |
+ int capacity, |
+ bool skip_capacity_check, |
+ bool allow_invalid_utf8) |
: early_termination_(false), |
last_character_(unibrow::Utf16::kNoPreviousCharacter), |
buffer_(buffer), |
start_(buffer), |
capacity_(capacity), |
skip_capacity_check_(capacity == -1 || skip_capacity_check), |
+ allow_invalid_utf8_(allow_invalid_utf8), |
utf16_chars_read_(0) { |
} |
- static int WriteEndCharacter(uint16_t character, |
- int last_character, |
+ // WritePair writes the current UTF-16 code unit to the given buffer. The |
+ // function will go back inside the buffer to combine surrogate pairs. |
+ static int WritePair(uint16_t current, |
dcarney
2014/01/07 10:12:16
WritePair is a bad name here, since it may or may not
|
+ int previous, |
+ char* buffer, |
+ bool allow_invalid_utf8) { |
+ using namespace unibrow; |
+ int code_point = current; |
+ int written = 0; |
+ if (Utf16::IsSurrogatePair(previous, current)) { |
+ code_point = Utf16::CombineSurrogatePair(previous, current); |
+ buffer -= Utf8::kSizeOfUnmatchedSurrogate; |
+ written -= Utf8::kSizeOfUnmatchedSurrogate; |
+ } |
+ return written + Utf8::Encode(buffer, code_point, allow_invalid_utf8); |
+ } |
+ |
+ static int WriteEndCharacter(uint16_t current, |
+ int previous, |
int remaining, |
- char* const buffer) { |
+ char* const buffer, |
+ bool allow_invalid_utf8) { |
using namespace unibrow; |
ASSERT(remaining > 0); |
- // We can't use a local buffer here because Encode needs to modify |
- // previous characters in the stream. We know, however, that |
- // exactly one character will be advanced. |
- if (Utf16::IsTrailSurrogate(character) && |
- Utf16::IsLeadSurrogate(last_character)) { |
- int written = Utf8::Encode(buffer, character, last_character); |
+ // We can't use a local buffer here because WritePair needs to modify |
+ // previous characters in the stream. We know, however, that exactly one |
+ // character will be advanced. |
+ if (Utf16::IsSurrogatePair(previous, current)) { |
+ int written = WritePair(current, previous, buffer, allow_invalid_utf8); |
ASSERT(written == 1); |
return written; |
} |
// Use a scratch buffer to check the required characters. |
char temp_buffer[Utf8::kMaxEncodedSize]; |
// Can't encode using last_character as gcc has array bounds issues. |
- int written = Utf8::Encode(temp_buffer, |
- character, |
- Utf16::kNoPreviousCharacter); |
+ int written = WritePair(current, |
dcarney
2014/01/07 10:12:16
this is not a surrogate pair, could use Encode dir
|
+ Utf16::kNoPreviousCharacter, |
+ temp_buffer, |
+ allow_invalid_utf8); |
// Won't fit. |
if (written > remaining) return 0; |
// Copy over the character from temp_buffer. |
@@ -4581,7 +4603,10 @@ class Utf8WriterVisitor { |
} else { |
for (; i < fast_length; i++) { |
uint16_t character = *chars++; |
- buffer += Utf8::Encode(buffer, character, last_character); |
+ buffer += WritePair(character, |
+ last_character, |
+ buffer, |
+ allow_invalid_utf8_); |
last_character = character; |
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); |
} |
@@ -4604,7 +4629,8 @@ class Utf8WriterVisitor { |
int written = WriteEndCharacter(character, |
last_character, |
remaining_capacity, |
- buffer); |
+ buffer, |
+ allow_invalid_utf8_); |
if (written == 0) { |
early_termination_ = true; |
break; |
@@ -4652,6 +4678,7 @@ class Utf8WriterVisitor { |
char* const start_; |
int capacity_; |
bool const skip_capacity_check_; |
+ bool const allow_invalid_utf8_; |
int utf16_chars_read_; |
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); |
}; |
@@ -4690,9 +4717,16 @@ int String::WriteUtf8(char* buffer, |
} |
const int string_length = str->length(); |
bool write_null = !(options & NO_NULL_TERMINATION); |
+ bool allow_invalid_utf8 = !(options & DISALLOW_INVALID_UTF8); |
// First check if we can just write the string without checking capacity. |
+ // @TODO Replace magic number 3 with something more descriptive. E.g. |
+ // Utf8::kMaxTwoByteSize (as in the maximum size an unsigned 2 byte code unit |
+ // value will take up when encoded to UTF-8)? When I first read this code I |
+ // thought there might be an overflow bug here since UTF-8 may take up to 4 |
+ // bytes per code point. Then I realized that a surrogate pair has a |
+ // str.length of 2, making the code correct. |
if (capacity == -1 || capacity / 3 >= string_length) { |
- Utf8WriterVisitor writer(buffer, capacity, true); |
+ Utf8WriterVisitor writer(buffer, capacity, true, allow_invalid_utf8); |
const int kMaxRecursion = 100; |
bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); |
if (success) return writer.CompleteWrite(write_null, nchars_ref); |
@@ -4720,7 +4754,7 @@ int String::WriteUtf8(char* buffer, |
} |
// Recursive slow path can potentially be unreasonable slow. Flatten. |
str = FlattenGetString(str); |
- Utf8WriterVisitor writer(buffer, capacity, false); |
+ Utf8WriterVisitor writer(buffer, capacity, false, allow_invalid_utf8); |
i::String::VisitFlat(&writer, *str); |
return writer.CompleteWrite(write_null, nchars_ref); |
} |