Index: src/api.cc |
diff --git a/src/api.cc b/src/api.cc |
index e5135a2d297d78a4dc6c52a53b9a1cf6c7dc1ffa..e58e6c894b4ffe6e44710d7d9556c1ee13519bc8 100644 |
--- a/src/api.cc |
+++ b/src/api.cc |
@@ -3831,68 +3831,200 @@ bool String::IsOneByte() const { |
} |
-class Utf8LengthVisitor { |
+class Utf8LengthHelper : public i::AllStatic { |
public: |
- explicit Utf8LengthVisitor() |
- : utf8_length_(0), |
- last_character_(unibrow::Utf16::kNoPreviousCharacter) {} |
+ enum State { |
+ kEndsWithLeadingSurrogate = 1 << 0, |
+ kStartsWithTrailingSurrogate = 1 << 1, |
+ kLeftmostEdgeIsCalculated = 1 << 2, |
+ kRightmostEdgeIsCalculated = 1 << 3, |
+ kLeftmostEdgeIsSurrogate = 1 << 4, |
+ kRightmostEdgeIsSurrogate = 1 << 5 |
+ }; |
+ |
+ static const uint8_t kInitialState = 0; |
+ |
+ static inline bool EndsWithSurrogate(uint8_t state) { |
+ return state & kEndsWithLeadingSurrogate; |
+ } |
+ |
+ static inline bool StartsWithSurrogate(uint8_t state) { |
+ return state & kStartsWithTrailingSurrogate; |
+ } |
+ |
+ class Visitor { |
+ public: |
+ explicit Visitor() |
+ : utf8_length_(0), |
+ state_(kInitialState) {} |
+ |
+ template<typename Char> |
+ inline void Visit(const Char* chars, int length) { |
+ int utf8_length = 0; |
+ int last_character = unibrow::Utf16::kNoPreviousCharacter; |
Erik Corry
2013/03/07 09:38:37
I wonder if it's worth modifying this to take adva
|
+ for (int i = 0; i < length; i++) { |
+ uint16_t c = chars[i]; |
+ utf8_length += unibrow::Utf8::Length(c, last_character); |
+ if (sizeof(Char) > 1) { |
+ last_character = c; |
+ } |
+ } |
+ utf8_length_ = utf8_length; |
+ } |
+ |
+ void VisitOneByteString(const uint8_t* chars, int length) { |
+ Visit(chars, length); |
+ state_ = kInitialState; |
+ } |
- inline int GetLength() { |
- return utf8_length_; |
+ void VisitTwoByteString(const uint16_t* chars, int length) { |
+ Visit(chars, length); |
+ uint8_t state = 0; |
+ if (unibrow::Utf16::IsTrailSurrogate(chars[0])) { |
+ state |= kStartsWithTrailingSurrogate; |
+ } |
+ if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) { |
+ state |= kEndsWithLeadingSurrogate; |
+ } |
+ state_ = state; |
+ } |
+ |
+ static i::ConsString* VisitFlat(i::String* string, |
+ int* length, |
+ uint8_t* state) { |
+ Visitor visitor; |
+ i::ConsString* cons_string = i::String::VisitFlat(&visitor, string); |
+ *length = visitor.utf8_length_; |
+ *state = visitor.state_; |
+ return cons_string; |
+ } |
+ |
+ private: |
+ int utf8_length_; |
+ uint8_t state_; |
+ DISALLOW_COPY_AND_ASSIGN(Visitor); |
+ }; |
+ |
+ static inline void MergeLeafLeft(int* length, |
+ uint8_t* state, |
+ uint8_t leaf_state) { |
+ bool edge_surrogate = StartsWithSurrogate(leaf_state); |
+ if (!(*state & kLeftmostEdgeIsCalculated)) { |
+ ASSERT(!(*state & kLeftmostEdgeIsSurrogate)); |
+ *state |= kLeftmostEdgeIsCalculated |
+ | (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0); |
+ } else if (EndsWithSurrogate(*state) && edge_surrogate) { |
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; |
+ } |
+ if (EndsWithSurrogate(leaf_state)) { |
+ *state |= kEndsWithLeadingSurrogate; |
+ } else { |
+ *state &= ~kEndsWithLeadingSurrogate; |
+ } |
} |
- template<typename Char> |
- inline void Visit(const Char* chars, unsigned length) { |
- ASSERT(length > 0); |
- // TODO(dcarney) Add back ascii fast path. |
- int utf8_length = 0; |
- int last_character = last_character_; |
- for (unsigned i = 0; i < length; i++) { |
- uint16_t c = chars[i]; |
- utf8_length += unibrow::Utf8::Length(c, last_character); |
- last_character = c; |
+ static inline void MergeLeafRight(int* length, |
+ uint8_t* state, |
+ uint8_t leaf_state) { |
+ bool edge_surrogate = EndsWithSurrogate(leaf_state); |
+ if (!(*state & kRightmostEdgeIsCalculated)) { |
+ ASSERT(!(*state & kRightmostEdgeIsSurrogate)); |
+ *state |= (kRightmostEdgeIsCalculated |
+ | (edge_surrogate ? kRightmostEdgeIsSurrogate : 0)); |
+ } else if (edge_surrogate && StartsWithSurrogate(*state)) { |
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; |
+ } |
+ if (StartsWithSurrogate(leaf_state)) { |
+ *state |= kStartsWithTrailingSurrogate; |
+ } else { |
+ *state &= ~kStartsWithTrailingSurrogate; |
} |
- last_character_ = last_character; |
- utf8_length_ += utf8_length; |
} |
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) { |
- Visit(chars, length); |
+ static inline void MergeTerminal(int* length, |
+ uint8_t state, |
+ uint8_t* state_out) { |
+ ASSERT((state & kLeftmostEdgeIsCalculated) && |
+ (state & kRightmostEdgeIsCalculated)); |
+ if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) { |
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; |
+ } |
+ *state_out = kInitialState | |
+ (state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) | |
+ (state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0); |
} |
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { |
- Visit(chars, length); |
+ static int Calculate(i::ConsString* current, uint8_t* state_out) { |
+ using namespace internal; |
+ int total_length = 0; |
+ uint8_t state = kInitialState; |
+ while (true) { |
+ i::String* left = current->first(); |
+ i::String* right = current->second(); |
+ uint8_t right_leaf_state; |
+ uint8_t left_leaf_state; |
+ int leaf_length; |
+ ConsString* left_as_cons = |
+ Visitor::VisitFlat(left, &leaf_length, &left_leaf_state); |
+ if (left_as_cons == NULL) { |
+ total_length += leaf_length; |
+ MergeLeafLeft(&total_length, &state, left_leaf_state); |
+ } |
+ ConsString* right_as_cons = |
+ Visitor::VisitFlat(right, &leaf_length, &right_leaf_state); |
+ if (right_as_cons == NULL) { |
+ total_length += leaf_length; |
+ MergeLeafRight(&total_length, &state, right_leaf_state); |
+ // Terminal node. |
+ if (left_as_cons == NULL) { |
+ MergeTerminal(&total_length, state, state_out); |
+ return total_length; |
+ } |
+ } else if (left_as_cons != NULL) { |
+ // Both strings are ConsStrings. |
+ // Recurse on smallest. |
+ if (left->length() < right->length()) { |
+ total_length += Calculate(left_as_cons, &left_leaf_state); |
+ MergeLeafLeft(&total_length, &state, left_leaf_state); |
+ current = right_as_cons; |
+ continue; |
+ } else { |
+ total_length += Calculate(right_as_cons, &right_leaf_state); |
+ MergeLeafRight(&total_length, &state, right_leaf_state); |
+ current = left_as_cons; |
+ continue; |
+ } |
+ } |
+ // 1 leaf node. Do in place descent. |
+ if (left_as_cons != NULL) { |
+ current = left_as_cons; |
+ } else { |
+ ASSERT(right_as_cons != NULL); |
+ current = right_as_cons; |
+ } |
+ } |
+ UNREACHABLE(); |
+ return 0; |
+ } |
+ |
+ static inline int Calculate(i::ConsString* current) { |
+ uint8_t state = kInitialState; |
+ return Calculate(current, &state); |
} |
private: |
- int utf8_length_; |
- int last_character_; |
- DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor); |
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper); |
}; |
static int Utf8Length(i::String* str, i::Isolate* isolate) { |
- unsigned length = static_cast<unsigned>(str->length()); |
+ int length = str->length(); |
if (length == 0) return 0; |
- int32_t type = str->map()->instance_type(); |
- Utf8LengthVisitor visitor; |
- // Non ConsString branch. |
- if ((type & i::kStringRepresentationMask) != i::kConsStringTag) { |
- i::ConsStringNullOp null_op; |
- i::String::Visit(str, 0, visitor, null_op, type, length); |
- return visitor.GetLength(); |
- } |
- i::ConsStringIteratorOp* op = isolate->write_iterator(); |
- unsigned offset = 0; |
- i::String* leaf = op->Operate(str, &offset, &type, &length); |
- ASSERT(leaf != NULL); |
- while (leaf != NULL) { |
- i::ConsStringNullOp null_op; |
- ASSERT(offset == 0); |
- i::String::Visit(leaf, 0, visitor, null_op, type, length); |
- leaf = op->ContinueOperation(&type, &length); |
- } |
- return visitor.GetLength(); |
+ uint8_t state; |
+ i::ConsString* cons_string = |
+ Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state); |
+ if (cons_string == NULL) return length; |
+ return Utf8LengthHelper::Calculate(cons_string); |
} |
@@ -3906,12 +4038,14 @@ int String::Utf8Length() const { |
class Utf8WriterVisitor { |
public: |
- Utf8WriterVisitor(char* buffer, int capacity) |
+ Utf8WriterVisitor( |
+ char* buffer, int capacity, bool skip_capacity_check) |
: early_termination_(false), |
last_character_(unibrow::Utf16::kNoPreviousCharacter), |
buffer_(buffer), |
start_(buffer), |
capacity_(capacity), |
+ skip_capacity_check_(capacity == -1 || skip_capacity_check), |
utf16_chars_read_(0) { |
} |
@@ -3935,7 +4069,7 @@ class Utf8WriterVisitor { |
// Can't encode using last_character as gcc has array bounds issues. |
int written = Utf8::Encode(temp_buffer, |
character, |
- unibrow::Utf16::kNoPreviousCharacter); |
+ Utf16::kNoPreviousCharacter); |
// Won't fit. |
if (written > remaining) return 0; |
// Copy over the character from temp_buffer. |
@@ -3948,45 +4082,55 @@ class Utf8WriterVisitor { |
template<typename Char> |
void Visit(const Char* chars, const int length) { |
using namespace unibrow; |
- // TODO(dcarney): Add back ascii fast path. |
ASSERT(!early_termination_); |
- ASSERT(length > 0); |
+ if (length == 0) return; |
// Copy state to stack. |
char* buffer = buffer_; |
- int last_character = last_character_; |
+ int last_character = |
+ sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_; |
int i = 0; |
// Do a fast loop where there is no exit capacity check. |
while (true) { |
int fast_length; |
- if (capacity_ == -1) { |
+ if (skip_capacity_check_) { |
fast_length = length; |
} else { |
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); |
// Need enough space to write everything but one character. |
STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3); |
- int writable_length = (remaining_capacity - 3)/3; |
+ int max_size_per_char = sizeof(Char) == 1 ? 2 : 3; |
+ int writable_length = |
+ (remaining_capacity - max_size_per_char)/max_size_per_char; |
// Need to drop into slow loop. |
if (writable_length <= 0) break; |
fast_length = i + writable_length; |
if (fast_length > length) fast_length = length; |
} |
// Write the characters to the stream. |
- for (; i < fast_length; i++) { |
- uint16_t character = *chars++; |
- buffer += Utf8::Encode(buffer, character, last_character); |
- last_character = character; |
- ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); |
+ if (sizeof(Char) == 1) { |
+ for (; i < fast_length; i++) { |
+ buffer += |
+ Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter); |
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); |
+ } |
+ } else { |
+ for (; i < fast_length; i++) { |
+ uint16_t character = *chars++; |
+ buffer += Utf8::Encode(buffer, character, last_character); |
+ last_character = character; |
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); |
+ } |
} |
// Array is fully written. Exit. |
if (fast_length == length) { |
// Write state back out to object. |
last_character_ = last_character; |
buffer_ = buffer; |
- utf16_chars_read_ += i; |
+ utf16_chars_read_ += length; |
return; |
} |
} |
- ASSERT(capacity_ != -1); |
+ ASSERT(!skip_capacity_check_); |
// Slow loop. Must check capacity on each iteration. |
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_); |
ASSERT(remaining_capacity >= 0); |
@@ -4014,15 +4158,15 @@ class Utf8WriterVisitor { |
return early_termination_; |
} |
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) { |
- Visit(chars, static_cast<int>(length)); |
+ inline void VisitOneByteString(const uint8_t* chars, int length) { |
+ Visit(chars, length); |
} |
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { |
- Visit(chars, static_cast<int>(length)); |
+ inline void VisitTwoByteString(const uint16_t* chars, int length) { |
+ Visit(chars, length); |
} |
- inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) { |
+ int CompleteWrite(bool write_null, int* utf16_chars_read_out) { |
// Write out number of utf16 characters written to the stream. |
if (utf16_chars_read_out != NULL) { |
*utf16_chars_read_out = utf16_chars_read_; |
@@ -4042,11 +4186,32 @@ class Utf8WriterVisitor { |
char* buffer_; |
char* const start_; |
int capacity_; |
+ bool const skip_capacity_check_; |
int utf16_chars_read_; |
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); |
}; |
+static bool RecursivelySerializeToUtf8(i::String* current, |
+ Utf8WriterVisitor* writer, |
+ int recursion_budget) { |
+ while (!writer->IsDone()) { |
+ i::ConsString* cons_string = i::String::VisitFlat(writer, current); |
+ if (cons_string == NULL) return true; // Leaf node. |
+ if (recursion_budget <= 0) return false; |
+ // Must write the left branch first. |
+ i::String* first = cons_string->first(); |
+ bool success = RecursivelySerializeToUtf8(first, |
+ writer, |
+ recursion_budget - 1); |
+ if (!success) return false; |
+ // Inline tail recurse for right branch. |
+ current = cons_string->second(); |
+ } |
+ return true; |
+} |
+ |
+ |
int String::WriteUtf8(char* buffer, |
int capacity, |
int* nchars_ref, |
@@ -4059,23 +4224,41 @@ int String::WriteUtf8(char* buffer, |
if (options & HINT_MANY_WRITES_EXPECTED) { |
FlattenString(str); // Flatten the string for efficiency. |
} |
- Utf8WriterVisitor writer(buffer, capacity); |
- i::ConsStringIteratorOp* op = isolate->write_iterator(); |
- op->Reset(); |
- int32_t type = str->map()->instance_type(); |
- unsigned str_length = static_cast<unsigned>(str->length()); |
- if (str_length != 0) { |
- i::String::Visit(*str, 0, writer, *op, type, str_length); |
- while (!writer.IsDone()) { |
- unsigned length_out; |
- i::String* next = op->ContinueOperation(&type, &length_out); |
- if (next == NULL) break; |
- // TODO(dcarney): need an asserting null op. |
- i::ConsStringNullOp null_op; |
- i::String::Visit(next, 0, writer, null_op, type, length_out); |
+ const int string_length = str->length(); |
+ bool write_null = !(options & NO_NULL_TERMINATION); |
+ // First check if we can just write the string without checking capacity. |
+ if (capacity == -1 || capacity / 3 >= string_length) { |
+ Utf8WriterVisitor writer(buffer, capacity, true); |
+ const int kMaxRecursion = 100; |
+ bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); |
+ if (success) return writer.CompleteWrite(write_null, nchars_ref); |
+ } else if (capacity >= string_length) { |
+ // First check that the buffer is large enough. |
+ int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate()); |
+ if (utf8_bytes <= capacity) { |
+ // ASCII fast path. |
+ if (utf8_bytes == string_length) { |
+ WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options); |
+ if (nchars_ref != NULL) *nchars_ref = string_length; |
+ if (write_null && (utf8_bytes+1 <= capacity)) { |
+ return string_length + 1; |
+ } |
+ return string_length; |
+ } |
+ if (write_null && (utf8_bytes+1 > capacity)) { |
+ options |= NO_NULL_TERMINATION; |
+ } |
+ // Recurse once without a capacity limit. |
+ // This will get into the first branch above. |
+ // TODO(dcarney) Check max left rec. in Utf8Length and fall through. |
+ return WriteUtf8(buffer, -1, nchars_ref, options); |
} |
} |
- return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref); |
+ // Recursive slow path can potentially be unreasonable slow. Flatten. |
+ str = FlattenGetString(str); |
+ Utf8WriterVisitor writer(buffer, capacity, false); |
+ i::String::VisitFlat(&writer, *str); |
+ return writer.CompleteWrite(write_null, nchars_ref); |
} |