Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(534)

Unified Diff: src/api.cc

Issue 12390057: Added back some utf8 optimizations (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: addressed comments, rebased Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/objects.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/api.cc
diff --git a/src/api.cc b/src/api.cc
index e5135a2d297d78a4dc6c52a53b9a1cf6c7dc1ffa..e58e6c894b4ffe6e44710d7d9556c1ee13519bc8 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -3831,68 +3831,200 @@ bool String::IsOneByte() const {
}
-class Utf8LengthVisitor {
+class Utf8LengthHelper : public i::AllStatic {
public:
- explicit Utf8LengthVisitor()
- : utf8_length_(0),
- last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
+ enum State {
+ kEndsWithLeadingSurrogate = 1 << 0,
+ kStartsWithTrailingSurrogate = 1 << 1,
+ kLeftmostEdgeIsCalculated = 1 << 2,
+ kRightmostEdgeIsCalculated = 1 << 3,
+ kLeftmostEdgeIsSurrogate = 1 << 4,
+ kRightmostEdgeIsSurrogate = 1 << 5
+ };
+
+ static const uint8_t kInitialState = 0;
+
+ static inline bool EndsWithSurrogate(uint8_t state) {
+ return state & kEndsWithLeadingSurrogate;
+ }
+
+ static inline bool StartsWithSurrogate(uint8_t state) {
+ return state & kStartsWithTrailingSurrogate;
+ }
+
+ class Visitor {
+ public:
+ explicit Visitor()
+ : utf8_length_(0),
+ state_(kInitialState) {}
+
+ template<typename Char>
+ inline void Visit(const Char* chars, int length) {
+ int utf8_length = 0;
+ int last_character = unibrow::Utf16::kNoPreviousCharacter;
Erik Corry 2013/03/07 09:38:37 I wonder if it's worth modifying this to take adva
+ for (int i = 0; i < length; i++) {
+ uint16_t c = chars[i];
+ utf8_length += unibrow::Utf8::Length(c, last_character);
+ if (sizeof(Char) > 1) {
+ last_character = c;
+ }
+ }
+ utf8_length_ = utf8_length;
+ }
+
+ void VisitOneByteString(const uint8_t* chars, int length) {
+ Visit(chars, length);
+ state_ = kInitialState;
+ }
- inline int GetLength() {
- return utf8_length_;
+ void VisitTwoByteString(const uint16_t* chars, int length) {
+ Visit(chars, length);
+ uint8_t state = 0;
+ if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
+ state |= kStartsWithTrailingSurrogate;
+ }
+ if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
+ state |= kEndsWithLeadingSurrogate;
+ }
+ state_ = state;
+ }
+
+ static i::ConsString* VisitFlat(i::String* string,
+ int* length,
+ uint8_t* state) {
+ Visitor visitor;
+ i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
+ *length = visitor.utf8_length_;
+ *state = visitor.state_;
+ return cons_string;
+ }
+
+ private:
+ int utf8_length_;
+ uint8_t state_;
+ DISALLOW_COPY_AND_ASSIGN(Visitor);
+ };
+
+ static inline void MergeLeafLeft(int* length,
+ uint8_t* state,
+ uint8_t leaf_state) {
+ bool edge_surrogate = StartsWithSurrogate(leaf_state);
+ if (!(*state & kLeftmostEdgeIsCalculated)) {
+ ASSERT(!(*state & kLeftmostEdgeIsSurrogate));
+ *state |= kLeftmostEdgeIsCalculated
+ | (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
+ } else if (EndsWithSurrogate(*state) && edge_surrogate) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ if (EndsWithSurrogate(leaf_state)) {
+ *state |= kEndsWithLeadingSurrogate;
+ } else {
+ *state &= ~kEndsWithLeadingSurrogate;
+ }
}
- template<typename Char>
- inline void Visit(const Char* chars, unsigned length) {
- ASSERT(length > 0);
- // TODO(dcarney) Add back ascii fast path.
- int utf8_length = 0;
- int last_character = last_character_;
- for (unsigned i = 0; i < length; i++) {
- uint16_t c = chars[i];
- utf8_length += unibrow::Utf8::Length(c, last_character);
- last_character = c;
+ static inline void MergeLeafRight(int* length,
+ uint8_t* state,
+ uint8_t leaf_state) {
+ bool edge_surrogate = EndsWithSurrogate(leaf_state);
+ if (!(*state & kRightmostEdgeIsCalculated)) {
+ ASSERT(!(*state & kRightmostEdgeIsSurrogate));
+ *state |= (kRightmostEdgeIsCalculated
+ | (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
+ } else if (edge_surrogate && StartsWithSurrogate(*state)) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ if (StartsWithSurrogate(leaf_state)) {
+ *state |= kStartsWithTrailingSurrogate;
+ } else {
+ *state &= ~kStartsWithTrailingSurrogate;
}
- last_character_ = last_character;
- utf8_length_ += utf8_length;
}
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
- Visit(chars, length);
+ static inline void MergeTerminal(int* length,
+ uint8_t state,
+ uint8_t* state_out) {
+ ASSERT((state & kLeftmostEdgeIsCalculated) &&
+ (state & kRightmostEdgeIsCalculated));
+ if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ *state_out = kInitialState |
+ (state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |
+ (state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);
}
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
- Visit(chars, length);
+ static int Calculate(i::ConsString* current, uint8_t* state_out) {
+ using namespace internal;
+ int total_length = 0;
+ uint8_t state = kInitialState;
+ while (true) {
+ i::String* left = current->first();
+ i::String* right = current->second();
+ uint8_t right_leaf_state;
+ uint8_t left_leaf_state;
+ int leaf_length;
+ ConsString* left_as_cons =
+ Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
+ if (left_as_cons == NULL) {
+ total_length += leaf_length;
+ MergeLeafLeft(&total_length, &state, left_leaf_state);
+ }
+ ConsString* right_as_cons =
+ Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
+ if (right_as_cons == NULL) {
+ total_length += leaf_length;
+ MergeLeafRight(&total_length, &state, right_leaf_state);
+ // Terminal node.
+ if (left_as_cons == NULL) {
+ MergeTerminal(&total_length, state, state_out);
+ return total_length;
+ }
+ } else if (left_as_cons != NULL) {
+ // Both strings are ConsStrings.
+ // Recurse on smallest.
+ if (left->length() < right->length()) {
+ total_length += Calculate(left_as_cons, &left_leaf_state);
+ MergeLeafLeft(&total_length, &state, left_leaf_state);
+ current = right_as_cons;
+ continue;
+ } else {
+ total_length += Calculate(right_as_cons, &right_leaf_state);
+ MergeLeafRight(&total_length, &state, right_leaf_state);
+ current = left_as_cons;
+ continue;
+ }
+ }
+ // 1 leaf node. Do in place descent.
+ if (left_as_cons != NULL) {
+ current = left_as_cons;
+ } else {
+ ASSERT(right_as_cons != NULL);
+ current = right_as_cons;
+ }
+ }
+ UNREACHABLE();
+ return 0;
+ }
+
+ static inline int Calculate(i::ConsString* current) {
+ uint8_t state = kInitialState;
+ return Calculate(current, &state);
}
private:
- int utf8_length_;
- int last_character_;
- DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
};
static int Utf8Length(i::String* str, i::Isolate* isolate) {
- unsigned length = static_cast<unsigned>(str->length());
+ int length = str->length();
if (length == 0) return 0;
- int32_t type = str->map()->instance_type();
- Utf8LengthVisitor visitor;
- // Non ConsString branch.
- if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
- i::ConsStringNullOp null_op;
- i::String::Visit(str, 0, visitor, null_op, type, length);
- return visitor.GetLength();
- }
- i::ConsStringIteratorOp* op = isolate->write_iterator();
- unsigned offset = 0;
- i::String* leaf = op->Operate(str, &offset, &type, &length);
- ASSERT(leaf != NULL);
- while (leaf != NULL) {
- i::ConsStringNullOp null_op;
- ASSERT(offset == 0);
- i::String::Visit(leaf, 0, visitor, null_op, type, length);
- leaf = op->ContinueOperation(&type, &length);
- }
- return visitor.GetLength();
+ uint8_t state;
+ i::ConsString* cons_string =
+ Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
+ if (cons_string == NULL) return length;
+ return Utf8LengthHelper::Calculate(cons_string);
}
@@ -3906,12 +4038,14 @@ int String::Utf8Length() const {
class Utf8WriterVisitor {
public:
- Utf8WriterVisitor(char* buffer, int capacity)
+ Utf8WriterVisitor(
+ char* buffer, int capacity, bool skip_capacity_check)
: early_termination_(false),
last_character_(unibrow::Utf16::kNoPreviousCharacter),
buffer_(buffer),
start_(buffer),
capacity_(capacity),
+ skip_capacity_check_(capacity == -1 || skip_capacity_check),
utf16_chars_read_(0) {
}
@@ -3935,7 +4069,7 @@ class Utf8WriterVisitor {
// Can't encode using last_character as gcc has array bounds issues.
int written = Utf8::Encode(temp_buffer,
character,
- unibrow::Utf16::kNoPreviousCharacter);
+ Utf16::kNoPreviousCharacter);
// Won't fit.
if (written > remaining) return 0;
// Copy over the character from temp_buffer.
@@ -3948,45 +4082,55 @@ class Utf8WriterVisitor {
template<typename Char>
void Visit(const Char* chars, const int length) {
using namespace unibrow;
- // TODO(dcarney): Add back ascii fast path.
ASSERT(!early_termination_);
- ASSERT(length > 0);
+ if (length == 0) return;
// Copy state to stack.
char* buffer = buffer_;
- int last_character = last_character_;
+ int last_character =
+ sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
int i = 0;
// Do a fast loop where there is no exit capacity check.
while (true) {
int fast_length;
- if (capacity_ == -1) {
+ if (skip_capacity_check_) {
fast_length = length;
} else {
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
// Need enough space to write everything but one character.
STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
- int writable_length = (remaining_capacity - 3)/3;
+ int max_size_per_char = sizeof(Char) == 1 ? 2 : 3;
+ int writable_length =
+ (remaining_capacity - max_size_per_char)/max_size_per_char;
// Need to drop into slow loop.
if (writable_length <= 0) break;
fast_length = i + writable_length;
if (fast_length > length) fast_length = length;
}
// Write the characters to the stream.
- for (; i < fast_length; i++) {
- uint16_t character = *chars++;
- buffer += Utf8::Encode(buffer, character, last_character);
- last_character = character;
- ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ if (sizeof(Char) == 1) {
+ for (; i < fast_length; i++) {
+ buffer +=
+ Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter);
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ }
+ } else {
+ for (; i < fast_length; i++) {
+ uint16_t character = *chars++;
+ buffer += Utf8::Encode(buffer, character, last_character);
+ last_character = character;
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ }
}
// Array is fully written. Exit.
if (fast_length == length) {
// Write state back out to object.
last_character_ = last_character;
buffer_ = buffer;
- utf16_chars_read_ += i;
+ utf16_chars_read_ += length;
return;
}
}
- ASSERT(capacity_ != -1);
+ ASSERT(!skip_capacity_check_);
// Slow loop. Must check capacity on each iteration.
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
ASSERT(remaining_capacity >= 0);
@@ -4014,15 +4158,15 @@ class Utf8WriterVisitor {
return early_termination_;
}
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
- Visit(chars, static_cast<int>(length));
+ inline void VisitOneByteString(const uint8_t* chars, int length) {
+ Visit(chars, length);
}
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
- Visit(chars, static_cast<int>(length));
+ inline void VisitTwoByteString(const uint16_t* chars, int length) {
+ Visit(chars, length);
}
- inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
+ int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
// Write out number of utf16 characters written to the stream.
if (utf16_chars_read_out != NULL) {
*utf16_chars_read_out = utf16_chars_read_;
@@ -4042,11 +4186,32 @@ class Utf8WriterVisitor {
char* buffer_;
char* const start_;
int capacity_;
+ bool const skip_capacity_check_;
int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
};
+static bool RecursivelySerializeToUtf8(i::String* current,
+ Utf8WriterVisitor* writer,
+ int recursion_budget) {
+ while (!writer->IsDone()) {
+ i::ConsString* cons_string = i::String::VisitFlat(writer, current);
+ if (cons_string == NULL) return true; // Leaf node.
+ if (recursion_budget <= 0) return false;
+ // Must write the left branch first.
+ i::String* first = cons_string->first();
+ bool success = RecursivelySerializeToUtf8(first,
+ writer,
+ recursion_budget - 1);
+ if (!success) return false;
+ // Inline tail recurse for right branch.
+ current = cons_string->second();
+ }
+ return true;
+}
+
+
int String::WriteUtf8(char* buffer,
int capacity,
int* nchars_ref,
@@ -4059,23 +4224,41 @@ int String::WriteUtf8(char* buffer,
if (options & HINT_MANY_WRITES_EXPECTED) {
FlattenString(str); // Flatten the string for efficiency.
}
- Utf8WriterVisitor writer(buffer, capacity);
- i::ConsStringIteratorOp* op = isolate->write_iterator();
- op->Reset();
- int32_t type = str->map()->instance_type();
- unsigned str_length = static_cast<unsigned>(str->length());
- if (str_length != 0) {
- i::String::Visit(*str, 0, writer, *op, type, str_length);
- while (!writer.IsDone()) {
- unsigned length_out;
- i::String* next = op->ContinueOperation(&type, &length_out);
- if (next == NULL) break;
- // TODO(dcarney): need an asserting null op.
- i::ConsStringNullOp null_op;
- i::String::Visit(next, 0, writer, null_op, type, length_out);
+ const int string_length = str->length();
+ bool write_null = !(options & NO_NULL_TERMINATION);
+ // First check if we can just write the string without checking capacity.
+ if (capacity == -1 || capacity / 3 >= string_length) {
+ Utf8WriterVisitor writer(buffer, capacity, true);
+ const int kMaxRecursion = 100;
+ bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
+ if (success) return writer.CompleteWrite(write_null, nchars_ref);
+ } else if (capacity >= string_length) {
+ // First check that the buffer is large enough.
+ int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
+ if (utf8_bytes <= capacity) {
+ // ASCII fast path.
+ if (utf8_bytes == string_length) {
+ WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);
+ if (nchars_ref != NULL) *nchars_ref = string_length;
+ if (write_null && (utf8_bytes+1 <= capacity)) {
+ return string_length + 1;
+ }
+ return string_length;
+ }
+ if (write_null && (utf8_bytes+1 > capacity)) {
+ options |= NO_NULL_TERMINATION;
+ }
+ // Recurse once without a capacity limit.
+ // This will get into the first branch above.
+ // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
+ return WriteUtf8(buffer, -1, nchars_ref, options);
}
}
- return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
+ // Recursive slow path can potentially be unreasonable slow. Flatten.
+ str = FlattenGetString(str);
+ Utf8WriterVisitor writer(buffer, capacity, false);
+ i::String::VisitFlat(&writer, *str);
+ return writer.CompleteWrite(write_null, nchars_ref);
}
« no previous file with comments | « no previous file | src/objects.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698