src/api.cc - Issue 12390057: Added back some utf8 optimizations

Unified Diff: src/api.cc

Issue 12390057: Added back some utf8 optimizations (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: addressed comments, rebased Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/api.cc

diff --git a/src/api.cc b/src/api.cc

index e5135a2d297d78a4dc6c52a53b9a1cf6c7dc1ffa..e58e6c894b4ffe6e44710d7d9556c1ee13519bc8 100644

--- a/src/api.cc

+++ b/src/api.cc

@@ -3831,68 +3831,200 @@ bool String::IsOneByte() const {

}

-class Utf8LengthVisitor {

+class Utf8LengthHelper : public i::AllStatic {

public:

- explicit Utf8LengthVisitor()

- : utf8_length_(0),

- last_character_(unibrow::Utf16::kNoPreviousCharacter) {}

+ enum State {

+ kEndsWithLeadingSurrogate = 1 << 0,

+ kStartsWithTrailingSurrogate = 1 << 1,

+ kLeftmostEdgeIsCalculated = 1 << 2,

+ kRightmostEdgeIsCalculated = 1 << 3,

+ kLeftmostEdgeIsSurrogate = 1 << 4,

+ kRightmostEdgeIsSurrogate = 1 << 5

+ };

+ static const uint8_t kInitialState = 0;

+ static inline bool EndsWithSurrogate(uint8_t state) {

+ return state & kEndsWithLeadingSurrogate;

+ }

+ static inline bool StartsWithSurrogate(uint8_t state) {

+ return state & kStartsWithTrailingSurrogate;

+ }

+ class Visitor {

+ public:

+ explicit Visitor()

+ : utf8_length_(0),

+ state_(kInitialState) {}

+ template<typename Char>

+ inline void Visit(const Char* chars, int length) {

+ int utf8_length = 0;

+ int last_character = unibrow::Utf16::kNoPreviousCharacter;

Erik Corry 2013/03/07 09:38:37 I wonder if it's worth modifying this to take adva

+ for (int i = 0; i < length; i++) {

+ uint16_t c = chars[i];

+ utf8_length += unibrow::Utf8::Length(c, last_character);

+ if (sizeof(Char) > 1) {

+ last_character = c;

+ }

+ utf8_length_ = utf8_length;

+ }

+ void VisitOneByteString(const uint8_t* chars, int length) {

+ Visit(chars, length);

+ state_ = kInitialState;

+ }

- inline int GetLength() {

- return utf8_length_;

+ void VisitTwoByteString(const uint16_t* chars, int length) {

+ Visit(chars, length);

+ uint8_t state = 0;

+ if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {

+ state |= kStartsWithTrailingSurrogate;

+ }

+ if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {

+ state |= kEndsWithLeadingSurrogate;

+ }

+ state_ = state;

+ }

+ static i::ConsString* VisitFlat(i::String* string,

+ int* length,

+ uint8_t* state) {

+ Visitor visitor;

+ i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);

+ *length = visitor.utf8_length_;

+ *state = visitor.state_;

+ return cons_string;

+ }

+ private:

+ int utf8_length_;

+ uint8_t state_;

+ DISALLOW_COPY_AND_ASSIGN(Visitor);

+ };

+ static inline void MergeLeafLeft(int* length,

+ uint8_t* state,

+ uint8_t leaf_state) {

+ bool edge_surrogate = StartsWithSurrogate(leaf_state);

+ if (!(*state & kLeftmostEdgeIsCalculated)) {

+ ASSERT(!(*state & kLeftmostEdgeIsSurrogate));

+ *state |= kLeftmostEdgeIsCalculated

+ | (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);

+ } else if (EndsWithSurrogate(*state) && edge_surrogate) {

+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

+ }

+ if (EndsWithSurrogate(leaf_state)) {

+ *state |= kEndsWithLeadingSurrogate;

+ } else {

+ *state &= ~kEndsWithLeadingSurrogate;

+ }

}

- template<typename Char>

- inline void Visit(const Char* chars, unsigned length) {

- ASSERT(length > 0);

- // TODO(dcarney) Add back ascii fast path.

- int utf8_length = 0;

- int last_character = last_character_;

- for (unsigned i = 0; i < length; i++) {

- uint16_t c = chars[i];

- utf8_length += unibrow::Utf8::Length(c, last_character);

- last_character = c;

+ static inline void MergeLeafRight(int* length,

+ uint8_t* state,

+ uint8_t leaf_state) {

+ bool edge_surrogate = EndsWithSurrogate(leaf_state);

+ if (!(*state & kRightmostEdgeIsCalculated)) {

+ ASSERT(!(*state & kRightmostEdgeIsSurrogate));

+ *state |= (kRightmostEdgeIsCalculated

+ | (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));

+ } else if (edge_surrogate && StartsWithSurrogate(*state)) {

+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

+ }

+ if (StartsWithSurrogate(leaf_state)) {

+ *state |= kStartsWithTrailingSurrogate;

+ } else {

+ *state &= ~kStartsWithTrailingSurrogate;

}

- last_character_ = last_character;

- utf8_length_ += utf8_length;

}

- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {

- Visit(chars, length);

+ static inline void MergeTerminal(int* length,

+ uint8_t state,

+ uint8_t* state_out) {

+ ASSERT((state & kLeftmostEdgeIsCalculated) &&

+ (state & kRightmostEdgeIsCalculated));

+ if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {

+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

+ }

+ *state_out = kInitialState |

+ (state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |

+ (state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);

}

- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {

- Visit(chars, length);

+ static int Calculate(i::ConsString* current, uint8_t* state_out) {

+ using namespace internal;

+ int total_length = 0;

+ uint8_t state = kInitialState;

+ while (true) {

+ i::String* left = current->first();

+ i::String* right = current->second();

+ uint8_t right_leaf_state;

+ uint8_t left_leaf_state;

+ int leaf_length;

+ ConsString* left_as_cons =

+ Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);

+ if (left_as_cons == NULL) {

+ total_length += leaf_length;

+ MergeLeafLeft(&total_length, &state, left_leaf_state);

+ }

+ ConsString* right_as_cons =

+ Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);

+ if (right_as_cons == NULL) {

+ total_length += leaf_length;

+ MergeLeafRight(&total_length, &state, right_leaf_state);

+ // Terminal node.

+ if (left_as_cons == NULL) {

+ MergeTerminal(&total_length, state, state_out);

+ return total_length;

+ }

+ } else if (left_as_cons != NULL) {

+ // Both strings are ConsStrings.

+ // Recurse on smallest.

+ if (left->length() < right->length()) {

+ total_length += Calculate(left_as_cons, &left_leaf_state);

+ MergeLeafLeft(&total_length, &state, left_leaf_state);

+ current = right_as_cons;

+ continue;

+ } else {

+ total_length += Calculate(right_as_cons, &right_leaf_state);

+ MergeLeafRight(&total_length, &state, right_leaf_state);

+ current = left_as_cons;

+ continue;

+ }

+ // 1 leaf node. Do in place descent.

+ if (left_as_cons != NULL) {

+ current = left_as_cons;

+ } else {

+ ASSERT(right_as_cons != NULL);

+ current = right_as_cons;

+ }

+ UNREACHABLE();

+ return 0;

+ }

+ static inline int Calculate(i::ConsString* current) {

+ uint8_t state = kInitialState;

+ return Calculate(current, &state);

}

private:

- int utf8_length_;

- int last_character_;

- DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);

+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);

};

static int Utf8Length(i::String* str, i::Isolate* isolate) {

- unsigned length = static_cast<unsigned>(str->length());

+ int length = str->length();

if (length == 0) return 0;

- int32_t type = str->map()->instance_type();

- Utf8LengthVisitor visitor;

- // Non ConsString branch.

- if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {

- i::ConsStringNullOp null_op;

- i::String::Visit(str, 0, visitor, null_op, type, length);

- return visitor.GetLength();

- }

- i::ConsStringIteratorOp* op = isolate->write_iterator();

- unsigned offset = 0;

- i::String* leaf = op->Operate(str, &offset, &type, &length);

- ASSERT(leaf != NULL);

- while (leaf != NULL) {

- i::ConsStringNullOp null_op;

- ASSERT(offset == 0);

- i::String::Visit(leaf, 0, visitor, null_op, type, length);

- leaf = op->ContinueOperation(&type, &length);

- }

- return visitor.GetLength();

+ uint8_t state;

+ i::ConsString* cons_string =

+ Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);

+ if (cons_string == NULL) return length;

+ return Utf8LengthHelper::Calculate(cons_string);

}

@@ -3906,12 +4038,14 @@ int String::Utf8Length() const {

class Utf8WriterVisitor {

public:

- Utf8WriterVisitor(char* buffer, int capacity)

+ Utf8WriterVisitor(

+ char* buffer, int capacity, bool skip_capacity_check)

: early_termination_(false),

last_character_(unibrow::Utf16::kNoPreviousCharacter),

buffer_(buffer),

start_(buffer),

capacity_(capacity),

+ skip_capacity_check_(capacity == -1 || skip_capacity_check),

utf16_chars_read_(0) {

}

@@ -3935,7 +4069,7 @@ class Utf8WriterVisitor {

// Can't encode using last_character as gcc has array bounds issues.

int written = Utf8::Encode(temp_buffer,

character,

- unibrow::Utf16::kNoPreviousCharacter);

+ Utf16::kNoPreviousCharacter);

// Won't fit.

if (written > remaining) return 0;

// Copy over the character from temp_buffer.

@@ -3948,45 +4082,55 @@ class Utf8WriterVisitor {

template<typename Char>

void Visit(const Char* chars, const int length) {

using namespace unibrow;

- // TODO(dcarney): Add back ascii fast path.

ASSERT(!early_termination_);

- ASSERT(length > 0);

+ if (length == 0) return;

// Copy state to stack.

char* buffer = buffer_;

- int last_character = last_character_;

+ int last_character =

+ sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;

int i = 0;

// Do a fast loop where there is no exit capacity check.

while (true) {

int fast_length;

- if (capacity_ == -1) {

+ if (skip_capacity_check_) {

fast_length = length;

} else {

int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);

// Need enough space to write everything but one character.

STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);

- int writable_length = (remaining_capacity - 3)/3;

+ int max_size_per_char = sizeof(Char) == 1 ? 2 : 3;

+ int writable_length =

+ (remaining_capacity - max_size_per_char)/max_size_per_char;

// Need to drop into slow loop.

if (writable_length <= 0) break;

fast_length = i + writable_length;

if (fast_length > length) fast_length = length;

}

// Write the characters to the stream.

- for (; i < fast_length; i++) {

- uint16_t character = *chars++;

- buffer += Utf8::Encode(buffer, character, last_character);

- last_character = character;

- ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

+ if (sizeof(Char) == 1) {

+ for (; i < fast_length; i++) {

+ buffer +=

+ Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter);

+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

+ }

+ } else {

+ for (; i < fast_length; i++) {

+ uint16_t character = *chars++;

+ buffer += Utf8::Encode(buffer, character, last_character);

+ last_character = character;

+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

+ }

}

// Array is fully written. Exit.

if (fast_length == length) {

// Write state back out to object.

last_character_ = last_character;

buffer_ = buffer;

- utf16_chars_read_ += i;

+ utf16_chars_read_ += length;

return;

}

- ASSERT(capacity_ != -1);

+ ASSERT(!skip_capacity_check_);

// Slow loop. Must check capacity on each iteration.

int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);

ASSERT(remaining_capacity >= 0);

@@ -4014,15 +4158,15 @@ class Utf8WriterVisitor {

return early_termination_;

}

- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {

- Visit(chars, static_cast<int>(length));

+ inline void VisitOneByteString(const uint8_t* chars, int length) {

+ Visit(chars, length);

}

- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {

- Visit(chars, static_cast<int>(length));

+ inline void VisitTwoByteString(const uint16_t* chars, int length) {

+ Visit(chars, length);

}

- inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {

+ int CompleteWrite(bool write_null, int* utf16_chars_read_out) {

// Write out number of utf16 characters written to the stream.

if (utf16_chars_read_out != NULL) {

*utf16_chars_read_out = utf16_chars_read_;

@@ -4042,11 +4186,32 @@ class Utf8WriterVisitor {

char* buffer_;

char* const start_;

int capacity_;

+ bool const skip_capacity_check_;

int utf16_chars_read_;

DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

};

+static bool RecursivelySerializeToUtf8(i::String* current,

+ Utf8WriterVisitor* writer,

+ int recursion_budget) {

+ while (!writer->IsDone()) {

+ i::ConsString* cons_string = i::String::VisitFlat(writer, current);

+ if (cons_string == NULL) return true; // Leaf node.

+ if (recursion_budget <= 0) return false;

+ // Must write the left branch first.

+ i::String* first = cons_string->first();

+ bool success = RecursivelySerializeToUtf8(first,

+ writer,

+ recursion_budget - 1);

+ if (!success) return false;

+ // Inline tail recurse for right branch.

+ current = cons_string->second();

+ }

+ return true;

int String::WriteUtf8(char* buffer,

int capacity,

int* nchars_ref,

@@ -4059,23 +4224,41 @@ int String::WriteUtf8(char* buffer,

if (options & HINT_MANY_WRITES_EXPECTED) {

FlattenString(str); // Flatten the string for efficiency.

}

- Utf8WriterVisitor writer(buffer, capacity);

- i::ConsStringIteratorOp* op = isolate->write_iterator();

- op->Reset();

- int32_t type = str->map()->instance_type();

- unsigned str_length = static_cast<unsigned>(str->length());

- if (str_length != 0) {

- i::String::Visit(*str, 0, writer, *op, type, str_length);

- while (!writer.IsDone()) {

- unsigned length_out;

- i::String* next = op->ContinueOperation(&type, &length_out);

- if (next == NULL) break;

- // TODO(dcarney): need an asserting null op.

- i::ConsStringNullOp null_op;

- i::String::Visit(next, 0, writer, null_op, type, length_out);

+ const int string_length = str->length();

+ bool write_null = !(options & NO_NULL_TERMINATION);

+ // First check if we can just write the string without checking capacity.

+ if (capacity == -1 || capacity / 3 >= string_length) {

+ Utf8WriterVisitor writer(buffer, capacity, true);

+ const int kMaxRecursion = 100;

+ bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);

+ if (success) return writer.CompleteWrite(write_null, nchars_ref);

+ } else if (capacity >= string_length) {

+ // First check that the buffer is large enough.

+ int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());

+ if (utf8_bytes <= capacity) {

+ // ASCII fast path.

+ if (utf8_bytes == string_length) {

+ WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity, options);

+ if (nchars_ref != NULL) *nchars_ref = string_length;

+ if (write_null && (utf8_bytes+1 <= capacity)) {

+ return string_length + 1;

+ }

+ return string_length;

+ }

+ if (write_null && (utf8_bytes+1 > capacity)) {

+ options |= NO_NULL_TERMINATION;

+ }

+ // Recurse once without a capacity limit.

+ // This will get into the first branch above.

+ // TODO(dcarney) Check max left rec. in Utf8Length and fall through.

+ return WriteUtf8(buffer, -1, nchars_ref, options);

}

- return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);

+ // Recursive slow path can potentially be unreasonable slow. Flatten.

+ str = FlattenGetString(str);

+ Utf8WriterVisitor writer(buffer, capacity, false);

+ i::String::VisitFlat(&writer, *str);

+ return writer.CompleteWrite(write_null, nchars_ref);

}

« no previous file with comments | « no previous file | src/objects.h » ('j') | no next file with comments »