Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(660)

Unified Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/objects.cc
===================================================================
--- src/objects.cc (revision 10944)
+++ src/objects.cc (working copy)
@@ -6043,9 +6043,11 @@
buffer->Reset(offset, this);
int character_position = offset;
int utf8_bytes = 0;
+ int last = unibrow::Utf8::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
- utf8_bytes += unibrow::Utf8::Length(character);
+ utf8_bytes += unibrow::Utf8::Length(character, last);
+ last = character;
}
if (length_return) {
@@ -6059,13 +6061,15 @@
buffer->Seek(offset);
character_position = offset;
int utf8_byte_position = 0;
+ last = unibrow::Utf8::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
if (allow_nulls == DISALLOW_NULLS && character == 0) {
character = ' ';
}
utf8_byte_position +=
- unibrow::Utf8::Encode(result + utf8_byte_position, character);
+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
+ last = character;
}
result[utf8_byte_position] = 0;
return SmartArrayPointer<char>(result);
@@ -6381,41 +6385,88 @@
// This method determines the type of string involved and then gets the UTF8
// length of the string. It doesn't flatten the string and has log(n) recursion
-// for a string of length n.
-int String::Utf8Length(String* input, int from, int to) {
+// for a string of length n. If the failure flag gets set, then we have to
+// flatten the string and retry. Failures are caused by surrogate pairs in deep
rossberg 2012/03/07 13:32:47 Not sure I understand the failure mode. Why does t
Erik Corry 2012/03/11 19:29:22 The old version of this function was constructed a
+// cons strings.
+int String::Utf8Length(String* input,
+ int from,
+ int to,
+ bool followed_by_surrogate,
+ int max_recursion,
+ bool* failure,
+ bool* starts_with_surrogate) {
if (from == to) return 0;
int total = 0;
+ bool dummy;
while (true) {
- if (input->IsAsciiRepresentation()) return total + to - from;
+ if (input->IsAsciiRepresentation()) {
+ *starts_with_surrogate = false;
+ return total + to - from;
+ }
switch (StringShape(input).representation_tag()) {
case kConsStringTag: {
ConsString* str = ConsString::cast(input);
String* first = str->first();
String* second = str->second();
int first_length = first->length();
- if (first_length - from < to - first_length) {
+ if (first_length - from > to - first_length) {
+ if (first_length < to) {
+ // Right hand side is shorter.
+ bool right_starts_with_surrogate = false;
+ total += Utf8Length(second,
+ 0,
+ to - first_length,
+ followed_by_surrogate,
+ max_recursion - 1,
+ failure,
+ &right_starts_with_surrogate);
+ if (*failure) return 0;
+ followed_by_surrogate = right_starts_with_surrogate;
+ input = first;
+ to = first_length;
+ } else {
+ // We only need the left hand side.
+ input = first;
+ }
+ } else {
if (first_length > from) {
// Left hand side is shorter.
- total += Utf8Length(first, from, first_length);
- input = second;
- from = 0;
- to -= first_length;
+ if (first->IsAsciiRepresentation()) {
+ total += first_length - from;
+ *starts_with_surrogate = false;
+ starts_with_surrogate = &dummy;
+ input = second;
+ from = 0;
+ to -= first_length;
+ } else if (second->IsAsciiRepresentation()) {
+ followed_by_surrogate = false;
+ total += to - first_length;
+ input = first;
+ to = first_length;
+ } else if (max_recursion > 0) {
rossberg 2012/03/07 13:32:47 Why is this the only recursive path actually check
Erik Corry 2012/03/11 19:29:22 See above.
+ bool right_starts_with_surrogate = false;
+ // Recursing on the long one. This may fail.
+ total += Utf8Length(second,
+ 0,
+ to - first_length,
+ followed_by_surrogate,
+ max_recursion - 1,
+ failure,
+ &right_starts_with_surrogate);
+ if (*failure) return 0;
+ input = first;
+ to = first_length;
+ followed_by_surrogate = right_starts_with_surrogate;
+ } else {
+ *failure = true;
+ return 0;
+ }
} else {
// We only need the right hand side.
input = second;
- from -= first_length;
+ from = 0;
to -= first_length;
}
- } else {
- if (first_length <= to) {
- // Right hand side is shorter.
- total += Utf8Length(second, 0, to - first_length);
- input = first;
- to = first_length;
- } else {
- // We only need the left hand side.
- input = first;
- }
}
continue;
}
@@ -6423,9 +6474,21 @@
case kSeqStringTag: {
Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
const uc16* p = vector.start();
+ int previous = unibrow::Utf8::kNoPreviousCharacter;
for (int i = from; i < to; i++) {
- total += unibrow::Utf8::Length(p[i]);
+ uc16 c = p[i];
+ total += unibrow::Utf8::Length(c, previous);
+ previous = c;
}
+ if (to - from > 0) {
+ if (unibrow::Utf16::IsLeadSurrogate(previous) &&
+ followed_by_surrogate) {
+ total -= 2;
rossberg 2012/03/07 13:32:47 I wouldn't mind a comment here, why -2? Is that kS
Erik Corry 2012/03/11 19:29:22 No, it's kBytesSavedByCombiningSurrogates. Fixed.
+ }
+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
+ *starts_with_surrogate = true;
+ }
+ }
return total;
}
case kSlicedStringTag: {
@@ -6839,8 +6902,10 @@
// General slow case check. We know that the ia and ib iterators
// have the same length.
while (ia->has_more()) {
- uc32 ca = ia->GetNext();
- uc32 cb = ib->GetNext();
+ uint32_t ca = ia->GetNext();
+ uint32_t cb = ib->GetNext();
+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
if (ca != cb)
return false;
}
@@ -7023,8 +7088,14 @@
decoder->Reset(str.start(), str.length());
int i;
for (i = 0; i < slen && decoder->has_more(); i++) {
- uc32 r = decoder->GetNext();
- if (Get(i) != r) return false;
+ uint32_t r = decoder->GetNext();
+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ if (i > slen - 1) return false;
+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
+ } else {
+ if (Get(i) != r) return false;
+ }
}
return i == slen && !decoder->has_more();
}
@@ -7154,6 +7225,22 @@
}
+void StringHasher::AddSurrogatePair(uc32 c) {
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+ AddCharacter(lead);
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+ AddCharacter(trail);
+}
+
+
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+ AddCharacterNoIndex(lead);
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+ AddCharacterNoIndex(trail);
+}
+
+
uint32_t StringHasher::GetHashField() {
ASSERT(is_valid());
if (length_ <= String::kMaxHashCalcLength) {
@@ -10746,7 +10833,7 @@
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
unibrow::Utf8InputBuffer<> buffer(string_.start(),
static_cast<unsigned>(string_.length()));
- chars_ = buffer.Length();
+ chars_ = buffer.Utf16Length();
hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

Powered by Google App Engine
This is Rietveld 408576698