Chromium Code Reviews| Index: src/objects.cc |
| =================================================================== |
| --- src/objects.cc (revision 10944) |
| +++ src/objects.cc (working copy) |
| @@ -6043,9 +6043,11 @@ |
| buffer->Reset(offset, this); |
| int character_position = offset; |
| int utf8_bytes = 0; |
| + int last = unibrow::Utf8::kNoPreviousCharacter; |
| while (buffer->has_more() && character_position++ < offset + length) { |
| uint16_t character = buffer->GetNext(); |
| - utf8_bytes += unibrow::Utf8::Length(character); |
| + utf8_bytes += unibrow::Utf8::Length(character, last); |
| + last = character; |
| } |
| if (length_return) { |
| @@ -6059,13 +6061,15 @@ |
| buffer->Seek(offset); |
| character_position = offset; |
| int utf8_byte_position = 0; |
| + last = unibrow::Utf8::kNoPreviousCharacter; |
| while (buffer->has_more() && character_position++ < offset + length) { |
| uint16_t character = buffer->GetNext(); |
| if (allow_nulls == DISALLOW_NULLS && character == 0) { |
| character = ' '; |
| } |
| utf8_byte_position += |
| - unibrow::Utf8::Encode(result + utf8_byte_position, character); |
| + unibrow::Utf8::Encode(result + utf8_byte_position, character, last); |
| + last = character; |
| } |
| result[utf8_byte_position] = 0; |
| return SmartArrayPointer<char>(result); |
| @@ -6381,41 +6385,88 @@ |
| // This method determines the type of string involved and then gets the UTF8 |
| // length of the string. It doesn't flatten the string and has log(n) recursion |
| -// for a string of length n. |
| -int String::Utf8Length(String* input, int from, int to) { |
| +// for a string of length n. If the failure flag gets set, then we have to |
| +// flatten the string and retry. Failures are caused by surrogate pairs in deep |
|
rossberg
2012/03/07 13:32:47
Not sure I understand the failure mode. Why does t[… comment preview truncated]
Erik Corry
2012/03/11 19:29:22
The old version of this function was constructed a[… comment preview truncated]
|
| +// cons strings. |
| +int String::Utf8Length(String* input, |
| + int from, |
| + int to, |
| + bool followed_by_surrogate, |
| + int max_recursion, |
| + bool* failure, |
| + bool* starts_with_surrogate) { |
| if (from == to) return 0; |
| int total = 0; |
| + bool dummy; |
| while (true) { |
| - if (input->IsAsciiRepresentation()) return total + to - from; |
| + if (input->IsAsciiRepresentation()) { |
| + *starts_with_surrogate = false; |
| + return total + to - from; |
| + } |
| switch (StringShape(input).representation_tag()) { |
| case kConsStringTag: { |
| ConsString* str = ConsString::cast(input); |
| String* first = str->first(); |
| String* second = str->second(); |
| int first_length = first->length(); |
| - if (first_length - from < to - first_length) { |
| + if (first_length - from > to - first_length) { |
| + if (first_length < to) { |
| + // Right hand side is shorter. |
| + bool right_starts_with_surrogate = false; |
| + total += Utf8Length(second, |
| + 0, |
| + to - first_length, |
| + followed_by_surrogate, |
| + max_recursion - 1, |
| + failure, |
| + &right_starts_with_surrogate); |
| + if (*failure) return 0; |
| + followed_by_surrogate = right_starts_with_surrogate; |
| + input = first; |
| + to = first_length; |
| + } else { |
| + // We only need the left hand side. |
| + input = first; |
| + } |
| + } else { |
| if (first_length > from) { |
| // Left hand side is shorter. |
| - total += Utf8Length(first, from, first_length); |
| - input = second; |
| - from = 0; |
| - to -= first_length; |
| + if (first->IsAsciiRepresentation()) { |
| + total += first_length - from; |
| + *starts_with_surrogate = false; |
| + starts_with_surrogate = &dummy; |
| + input = second; |
| + from = 0; |
| + to -= first_length; |
| + } else if (second->IsAsciiRepresentation()) { |
| + followed_by_surrogate = false; |
| + total += to - first_length; |
| + input = first; |
| + to = first_length; |
| + } else if (max_recursion > 0) { |
|
rossberg
2012/03/07 13:32:47
Why is this the only recursive path actually check[… comment preview truncated]
Erik Corry
2012/03/11 19:29:22
See above.
|
| + bool right_starts_with_surrogate = false; |
| + // Recursing on the long one. This may fail. |
| + total += Utf8Length(second, |
| + 0, |
| + to - first_length, |
| + followed_by_surrogate, |
| + max_recursion - 1, |
| + failure, |
| + &right_starts_with_surrogate); |
| + if (*failure) return 0; |
| + input = first; |
| + to = first_length; |
| + followed_by_surrogate = right_starts_with_surrogate; |
| + } else { |
| + *failure = true; |
| + return 0; |
| + } |
| } else { |
| // We only need the right hand side. |
| input = second; |
| - from -= first_length; |
| + from = 0; |
| to -= first_length; |
| } |
| - } else { |
| - if (first_length <= to) { |
| - // Right hand side is shorter. |
| - total += Utf8Length(second, 0, to - first_length); |
| - input = first; |
| - to = first_length; |
| - } else { |
| - // We only need the left hand side. |
| - input = first; |
| - } |
| } |
| continue; |
| } |
| @@ -6423,9 +6474,21 @@ |
| case kSeqStringTag: { |
| Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); |
| const uc16* p = vector.start(); |
| + int previous = unibrow::Utf8::kNoPreviousCharacter; |
| for (int i = from; i < to; i++) { |
| - total += unibrow::Utf8::Length(p[i]); |
| + uc16 c = p[i]; |
| + total += unibrow::Utf8::Length(c, previous); |
| + previous = c; |
| } |
| + if (to - from > 0) { |
| + if (unibrow::Utf16::IsLeadSurrogate(previous) && |
| + followed_by_surrogate) { |
| + total -= 2; |
|
rossberg
2012/03/07 13:32:47
I wouldn't mind a comment here, why -2? Is that kS[… comment preview truncated]
Erik Corry
2012/03/11 19:29:22
No, it's kBytesSavedByCombiningSurrogates. Fixed.
|
| + } |
| + if (unibrow::Utf16::IsTrailSurrogate(p[from])) { |
| + *starts_with_surrogate = true; |
| + } |
| + } |
| return total; |
| } |
| case kSlicedStringTag: { |
| @@ -6839,8 +6902,10 @@ |
| // General slow case check. We know that the ia and ib iterators |
| // have the same length. |
| while (ia->has_more()) { |
| - uc32 ca = ia->GetNext(); |
| - uc32 cb = ib->GetNext(); |
| + uint32_t ca = ia->GetNext(); |
| + uint32_t cb = ib->GetNext(); |
| + ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode); |
| + ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode); |
| if (ca != cb) |
| return false; |
| } |
| @@ -7023,8 +7088,14 @@ |
| decoder->Reset(str.start(), str.length()); |
| int i; |
| for (i = 0; i < slen && decoder->has_more(); i++) { |
| - uc32 r = decoder->GetNext(); |
| - if (Get(i) != r) return false; |
| + uint32_t r = decoder->GetNext(); |
| + if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| + if (i > slen - 1) return false; |
| + if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false; |
| + if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false; |
| + } else { |
| + if (Get(i) != r) return false; |
| + } |
| } |
| return i == slen && !decoder->has_more(); |
| } |
| @@ -7154,6 +7225,22 @@ |
| } |
| +void StringHasher::AddSurrogatePair(uc32 c) { |
| + uint16_t lead = unibrow::Utf16::LeadSurrogate(c); |
| + AddCharacter(lead); |
| + uint16_t trail = unibrow::Utf16::TrailSurrogate(c); |
| + AddCharacter(trail); |
| +} |
| + |
| + |
| +void StringHasher::AddSurrogatePairNoIndex(uc32 c) { |
| + uint16_t lead = unibrow::Utf16::LeadSurrogate(c); |
| + AddCharacterNoIndex(lead); |
| + uint16_t trail = unibrow::Utf16::TrailSurrogate(c); |
| + AddCharacterNoIndex(trail); |
| +} |
| + |
| + |
| uint32_t StringHasher::GetHashField() { |
| ASSERT(is_valid()); |
| if (length_ <= String::kMaxHashCalcLength) { |
| @@ -10746,7 +10833,7 @@ |
| if (hash_field_ != 0) return hash_field_ >> String::kHashShift; |
| unibrow::Utf8InputBuffer<> buffer(string_.start(), |
| static_cast<unsigned>(string_.length())); |
| - chars_ = buffer.Length(); |
| + chars_ = buffer.Utf16Length(); |
| hash_field_ = String::ComputeHashField(&buffer, chars_, seed_); |
| uint32_t result = hash_field_ >> String::kHashShift; |
| ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. |