src/objects.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/objects.cc

===================================================================

--- src/objects.cc (revision 10944)

+++ src/objects.cc (working copy)

@@ -6043,9 +6043,11 @@

buffer->Reset(offset, this);

int character_position = offset;

int utf8_bytes = 0;

+ int last = unibrow::Utf8::kNoPreviousCharacter;

while (buffer->has_more() && character_position++ < offset + length) {

uint16_t character = buffer->GetNext();

- utf8_bytes += unibrow::Utf8::Length(character);

+ utf8_bytes += unibrow::Utf8::Length(character, last);

+ last = character;

}

if (length_return) {

@@ -6059,13 +6061,15 @@

buffer->Seek(offset);

character_position = offset;

int utf8_byte_position = 0;

+ last = unibrow::Utf8::kNoPreviousCharacter;

while (buffer->has_more() && character_position++ < offset + length) {

uint16_t character = buffer->GetNext();

if (allow_nulls == DISALLOW_NULLS && character == 0) {

character = ' ';

}

utf8_byte_position +=

- unibrow::Utf8::Encode(result + utf8_byte_position, character);

+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last);

+ last = character;

}

result[utf8_byte_position] = 0;

return SmartArrayPointer<char>(result);

@@ -6381,41 +6385,88 @@

// This method determines the type of string involved and then gets the UTF8

// length of the string. It doesn't flatten the string and has log(n) recursion

-// for a string of length n.

-int String::Utf8Length(String* input, int from, int to) {

+// for a string of length n. If the failure flag gets set, then we have to

+// flatten the string and retry. Failures are caused by surrogate pairs in deep

rossberg 2012/03/07 13:32:47 Not sure I understand the failure mode. Why does t

Erik Corry 2012/03/11 19:29:22 The old version of this function was constructed a

+// cons strings.

+int String::Utf8Length(String* input,

+ int from,

+ int to,

+ bool followed_by_surrogate,

+ int max_recursion,

+ bool* failure,

+ bool* starts_with_surrogate) {

if (from == to) return 0;

int total = 0;

+ bool dummy;

while (true) {

- if (input->IsAsciiRepresentation()) return total + to - from;

+ if (input->IsAsciiRepresentation()) {

+ *starts_with_surrogate = false;

+ return total + to - from;

+ }

switch (StringShape(input).representation_tag()) {

case kConsStringTag: {

ConsString* str = ConsString::cast(input);

String* first = str->first();

String* second = str->second();

int first_length = first->length();

- if (first_length - from < to - first_length) {

+ if (first_length - from > to - first_length) {

+ if (first_length < to) {

+ // Right hand side is shorter.

+ bool right_starts_with_surrogate = false;

+ total += Utf8Length(second,

+ 0,

+ to - first_length,

+ followed_by_surrogate,

+ max_recursion - 1,

+ failure,

+ &right_starts_with_surrogate);

+ if (*failure) return 0;

+ followed_by_surrogate = right_starts_with_surrogate;

+ input = first;

+ to = first_length;

+ } else {

+ // We only need the left hand side.

+ input = first;

+ }

+ } else {

if (first_length > from) {

// Left hand side is shorter.

- total += Utf8Length(first, from, first_length);

- input = second;

- from = 0;

- to -= first_length;

+ if (first->IsAsciiRepresentation()) {

+ total += first_length - from;

+ *starts_with_surrogate = false;

+ starts_with_surrogate = &dummy;

+ input = second;

+ from = 0;

+ to -= first_length;

+ } else if (second->IsAsciiRepresentation()) {

+ followed_by_surrogate = false;

+ total += to - first_length;

+ input = first;

+ to = first_length;

+ } else if (max_recursion > 0) {

rossberg 2012/03/07 13:32:47 Why is this the only recursive path actually check

Erik Corry 2012/03/11 19:29:22 See above.

+ bool right_starts_with_surrogate = false;

+ // Recursing on the long one. This may fail.

+ total += Utf8Length(second,

+ 0,

+ to - first_length,

+ followed_by_surrogate,

+ max_recursion - 1,

+ failure,

+ &right_starts_with_surrogate);

+ if (*failure) return 0;

+ input = first;

+ to = first_length;

+ followed_by_surrogate = right_starts_with_surrogate;

+ } else {

+ *failure = true;

+ return 0;

+ }

} else {

// We only need the right hand side.

input = second;

- from -= first_length;

+ from = 0;

to -= first_length;

}

- } else {

- if (first_length <= to) {

- // Right hand side is shorter.

- total += Utf8Length(second, 0, to - first_length);

- input = first;

- to = first_length;

- } else {

- // We only need the left hand side.

- input = first;

- }

}

continue;

}

@@ -6423,9 +6474,21 @@

case kSeqStringTag: {

Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

const uc16* p = vector.start();

+ int previous = unibrow::Utf8::kNoPreviousCharacter;

for (int i = from; i < to; i++) {

- total += unibrow::Utf8::Length(p[i]);

+ uc16 c = p[i];

+ total += unibrow::Utf8::Length(c, previous);

+ previous = c;

}

+ if (to - from > 0) {

+ if (unibrow::Utf16::IsLeadSurrogate(previous) &&

+ followed_by_surrogate) {

+ total -= 2;

rossberg 2012/03/07 13:32:47 I wouldn't mind a comment here, why -2? Is that kS

Erik Corry 2012/03/11 19:29:22 No, it's kBytesSavedByCombiningSurrogates. Fixed.

+ }

+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) {

+ *starts_with_surrogate = true;

+ }

return total;

}

case kSlicedStringTag: {

@@ -6839,8 +6902,10 @@

// General slow case check. We know that the ia and ib iterators

// have the same length.

while (ia->has_more()) {

- uc32 ca = ia->GetNext();

- uc32 cb = ib->GetNext();

+ uint32_t ca = ia->GetNext();

+ uint32_t cb = ib->GetNext();

+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);

+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);

if (ca != cb)

return false;

}

@@ -7023,8 +7088,14 @@

decoder->Reset(str.start(), str.length());

int i;

for (i = 0; i < slen && decoder->has_more(); i++) {

- uc32 r = decoder->GetNext();

- if (Get(i) != r) return false;

+ uint32_t r = decoder->GetNext();

+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {

+ if (i > slen - 1) return false;

+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;

+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;

+ } else {

+ if (Get(i) != r) return false;

+ }

}

return i == slen && !decoder->has_more();

}

@@ -7154,6 +7225,22 @@

}

+void StringHasher::AddSurrogatePair(uc32 c) {

+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

+ AddCharacter(lead);

+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

+ AddCharacter(trail);

+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {

+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

+ AddCharacterNoIndex(lead);

+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

+ AddCharacterNoIndex(trail);

uint32_t StringHasher::GetHashField() {

ASSERT(is_valid());

if (length_ <= String::kMaxHashCalcLength) {

@@ -10746,7 +10833,7 @@

if (hash_field_ != 0) return hash_field_ >> String::kHashShift;

unibrow::Utf8InputBuffer<> buffer(string_.start(),

static_cast<unsigned>(string_.length()));

- chars_ = buffer.Length();

+ chars_ = buffer.Utf16Length();

hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);

uint32_t result = hash_field_ >> String::kHashShift;

ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

« src/objects.h ('K') | « src/objects.h ('k') | src/objects-inl.h » ('j') | src/scanner-character-streams.h » ('J')