src/objects.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Side by Side Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 6025 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6036	6036

6037 // Negative length means the to the end of the string.	6037 // Negative length means the to the end of the string.

6038 if (length < 0) length = kMaxInt - offset;	6038 if (length < 0) length = kMaxInt - offset;

6039	6039

6040 // Compute the size of the UTF-8 string. Start at the specified offset.	6040 // Compute the size of the UTF-8 string. Start at the specified offset.

6041 Access<StringInputBuffer> buffer(	6041 Access<StringInputBuffer> buffer(

6042 heap->isolate()->objects_string_input_buffer());	6042 heap->isolate()->objects_string_input_buffer());

6043 buffer->Reset(offset, this);	6043 buffer->Reset(offset, this);

6044 int character_position = offset;	6044 int character_position = offset;

6045 int utf8_bytes = 0;	6045 int utf8_bytes = 0;

	6046 int last = unibrow::Utf8::kNoPreviousCharacter;

6046 while (buffer->has_more() && character_position++ < offset + length) {	6047 while (buffer->has_more() && character_position++ < offset + length) {

6047 uint16_t character = buffer->GetNext();	6048 uint16_t character = buffer->GetNext();

6048 utf8_bytes += unibrow::Utf8::Length(character);	6049 utf8_bytes += unibrow::Utf8::Length(character, last);

	6050 last = character;

6049 }	6051 }

6050	6052

6051 if (length_return) {	6053 if (length_return) {

6052 *length_return = utf8_bytes;	6054 *length_return = utf8_bytes;

6053 }	6055 }

6054	6056

6055 char* result = NewArray<char>(utf8_bytes + 1);	6057 char* result = NewArray<char>(utf8_bytes + 1);

6056	6058

6057 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset.	6059 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset.

6058 buffer->Rewind();	6060 buffer->Rewind();

6059 buffer->Seek(offset);	6061 buffer->Seek(offset);

6060 character_position = offset;	6062 character_position = offset;

6061 int utf8_byte_position = 0;	6063 int utf8_byte_position = 0;

	6064 last = unibrow::Utf8::kNoPreviousCharacter;

6062 while (buffer->has_more() && character_position++ < offset + length) {	6065 while (buffer->has_more() && character_position++ < offset + length) {

6063 uint16_t character = buffer->GetNext();	6066 uint16_t character = buffer->GetNext();

6064 if (allow_nulls == DISALLOW_NULLS && character == 0) {	6067 if (allow_nulls == DISALLOW_NULLS && character == 0) {

6065 character = ' ';	6068 character = ' ';

6066 }	6069 }

6067 utf8_byte_position +=	6070 utf8_byte_position +=

6068 unibrow::Utf8::Encode(result + utf8_byte_position, character);	6071 unibrow::Utf8::Encode(result + utf8_byte_position, character, last);

	6072 last = character;

6069 }	6073 }

6070 result[utf8_byte_position] = 0;	6074 result[utf8_byte_position] = 0;

6071 return SmartArrayPointer<char>(result);	6075 return SmartArrayPointer<char>(result);

6072 }	6076 }

6073	6077

6074	6078

6075 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,	6079 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,

6076 RobustnessFlag robust_flag,	6080 RobustnessFlag robust_flag,

6077 int* length_return) {	6081 int* length_return) {

6078 return ToCString(allow_nulls, robust_flag, 0, -1, length_return);	6082 return ToCString(allow_nulls, robust_flag, 0, -1, length_return);

(...skipping 295 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6374 break;	6378 break;

6375 }	6379 }

6376	6380

6377 UNREACHABLE();	6381 UNREACHABLE();

6378 return 0;	6382 return 0;

6379 }	6383 }

6380	6384

6381	6385

6382 // This method determines the type of string involved and then gets the UTF8	6386 // This method determines the type of string involved and then gets the UTF8

6383 // length of the string. It doesn't flatten the string and has log(n) recursion	6387 // length of the string. It doesn't flatten the string and has log(n) recursion

6384 // for a string of length n.	6388 // for a string of length n. If the failure flag gets set, then we have to

6385 int String::Utf8Length(String* input, int from, int to) {	6389 // flatten the string and retry. Failures are caused by surrogate pairs in deep
	rossberg 2012/03/07 13:32:47
	Not sure I understand the failure mode. Why does the function need to fail on these, and not on other recursions?
	Erik Corry 2012/03/11 19:29:22
	On 2012/03/07 13:32:47, rossberg wrote:
	> Not sure I understand the failure mode. Why does the function need to fail on
	> these, and not on other recursions?
	The old version of this function was constructed as follows: To get the UTF-8 length of a cons string, recurse on both the car and the cdr and add the lengths. But since we don't want arbitrary recursion that will overflow the stack, we manually transform the algorithm:
	1) Introduce an accumulator for the length.
	2) Recurse first on the short side (the one with <= half of the characters).
	3) The recursion on the other side is now a tail call, which we can manually eliminate, turning it into an iteration.
	This way, the recursion depth cannot exceed log2 of the length and all is good. WriteToFlat uses the same idea.
	But now with UTF-16 we have a new case to consider. The car may end in a lead surrogate and the cdr may start with a trail surrogate. Together they make a surrogate pair, which is coded as 4 bytes of UTF-8, instead of two 3-byte encodings (one for each surrogate). This is easy to code as recursion, but if we always recurse on the short one first, I wasn't able to see how to make it into a tail call and transform the second one into iteration. If we always recurse on one (I chose the cdr) then it is still possible to do the transformation.
	So the new version always recurses on the cdr and iterates on the car. This works for most strings, but we can recurse too deeply, in which case we have to bail out, flatten the string and start over. Since these strings are often build-then-use-once-then-discard strings, that is a waste.
	Currently WriteUTF flattens them, but there are ways to avoid that in another CL (e.g. writing them backwards normally works because cons strings are almost always unbalanced in the same direction).
	6390 // cons strings.

	6391 int String::Utf8Length(String* input,

	6392 int from,

	6393 int to,

	6394 bool followed_by_surrogate,

	6395 int max_recursion,

	6396 bool* failure,

	6397 bool* starts_with_surrogate) {

6386 if (from == to) return 0;	6398 if (from == to) return 0;

6387 int total = 0;	6399 int total = 0;

	6400 bool dummy;

6388 while (true) {	6401 while (true) {

6389 if (input->IsAsciiRepresentation()) return total + to - from;	6402 if (input->IsAsciiRepresentation()) {

	6403 *starts_with_surrogate = false;

	6404 return total + to - from;

	6405 }

6390 switch (StringShape(input).representation_tag()) {	6406 switch (StringShape(input).representation_tag()) {

6391 case kConsStringTag: {	6407 case kConsStringTag: {

6392 ConsString* str = ConsString::cast(input);	6408 ConsString* str = ConsString::cast(input);

6393 String* first = str->first();	6409 String* first = str->first();

6394 String* second = str->second();	6410 String* second = str->second();

6395 int first_length = first->length();	6411 int first_length = first->length();

6396 if (first_length - from < to - first_length) {	6412 if (first_length - from > to - first_length) {

6397 if (first_length > from) {	6413 if (first_length < to) {

6398 // Left hand side is shorter.

6399 total += Utf8Length(first, from, first_length);

6400 input = second;

6401 from = 0;

6402 to -= first_length;

6403 } else {

6404 // We only need the right hand side.

6405 input = second;

6406 from -= first_length;

6407 to -= first_length;

6408 }

6409 } else {

6410 if (first_length <= to) {

6411 // Right hand side is shorter.	6414 // Right hand side is shorter.

6412 total += Utf8Length(second, 0, to - first_length);	6415 bool right_starts_with_surrogate = false;

	6416 total += Utf8Length(second,

	6417 0,

	6418 to - first_length,

	6419 followed_by_surrogate,

	6420 max_recursion - 1,

	6421 failure,

	6422 &right_starts_with_surrogate);

	6423 if (*failure) return 0;

	6424 followed_by_surrogate = right_starts_with_surrogate;

6413 input = first;	6425 input = first;

6414 to = first_length;	6426 to = first_length;

6415 } else {	6427 } else {

6416 // We only need the left hand side.	6428 // We only need the left hand side.

6417 input = first;	6429 input = first;

6418 }	6430 }

	6431 } else {

	6432 if (first_length > from) {

	6433 // Left hand side is shorter.

	6434 if (first->IsAsciiRepresentation()) {

	6435 total += first_length - from;

	6436 *starts_with_surrogate = false;

	6437 starts_with_surrogate = &dummy;

	6438 input = second;

	6439 from = 0;

	6440 to -= first_length;

	6441 } else if (second->IsAsciiRepresentation()) {

	6442 followed_by_surrogate = false;

	6443 total += to - first_length;

	6444 input = first;

	6445 to = first_length;

	6446 } else if (max_recursion > 0) {
	rossberg 2012/03/07 13:32:47
	Why is this the only recursive path actually checking max_recursion?
	Erik Corry 2012/03/11 19:29:22
	On 2012/03/07 13:32:47, rossberg wrote:
	> Why is this the only recursive path actually checking max_recursion?
	See above.
	6447 bool right_starts_with_surrogate = false;

	6448 // Recursing on the long one. This may fail.

	6449 total += Utf8Length(second,

	6450 0,

	6451 to - first_length,

	6452 followed_by_surrogate,

	6453 max_recursion - 1,

	6454 failure,

	6455 &right_starts_with_surrogate);

	6456 if (*failure) return 0;

	6457 input = first;

	6458 to = first_length;

	6459 followed_by_surrogate = right_starts_with_surrogate;

	6460 } else {

	6461 *failure = true;

	6462 return 0;

	6463 }

	6464 } else {

	6465 // We only need the right hand side.

	6466 input = second;

	6467 from = 0;

	6468 to -= first_length;

	6469 }

6419 }	6470 }

6420 continue;	6471 continue;

6421 }	6472 }

6422 case kExternalStringTag:	6473 case kExternalStringTag:

6423 case kSeqStringTag: {	6474 case kSeqStringTag: {

6424 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();	6475 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

6425 const uc16* p = vector.start();	6476 const uc16* p = vector.start();

	6477 int previous = unibrow::Utf8::kNoPreviousCharacter;

6426 for (int i = from; i < to; i++) {	6478 for (int i = from; i < to; i++) {

6427 total += unibrow::Utf8::Length(p[i]);	6479 uc16 c = p[i];

	6480 total += unibrow::Utf8::Length(c, previous);

	6481 previous = c;

	6482 }

	6483 if (to - from > 0) {

	6484 if (unibrow::Utf16::IsLeadSurrogate(previous) &&

	6485 followed_by_surrogate) {

	6486 total -= 2;
	rossberg 2012/03/07 13:32:47
	I wouldn't mind a comment here, why -2? Is that kSizeOfUnmatchedSurrogate-1?
	Erik Corry 2012/03/11 19:29:22
	On 2012/03/07 13:32:47, rossberg wrote:
	> I wouldn't mind a comment here, why -2? Is that kSizeOfUnmatchedSurrogate-1?
	No, it's kBytesSavedByCombiningSurrogates. Fixed.
	6487 }

	6488 if (unibrow::Utf16::IsTrailSurrogate(p[from])) {

	6489 *starts_with_surrogate = true;

	6490 }

6428 }	6491 }

6429 return total;	6492 return total;

6430 }	6493 }

6431 case kSlicedStringTag: {	6494 case kSlicedStringTag: {

6432 SlicedString* str = SlicedString::cast(input);	6495 SlicedString* str = SlicedString::cast(input);

6433 int offset = str->offset();	6496 int offset = str->offset();

6434 input = str->parent();	6497 input = str->parent();

6435 from += offset;	6498 from += offset;

6436 to += offset;	6499 to += offset;

6437 continue;	6500 continue;

(...skipping 394 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6832 }	6895 }

6833 }	6896 }

6834 }	6897 }

6835	6898

6836	6899

6837 template <typename IteratorA, typename IteratorB>	6900 template <typename IteratorA, typename IteratorB>

6838 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {	6901 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {

6839 // General slow case check. We know that the ia and ib iterators	6902 // General slow case check. We know that the ia and ib iterators

6840 // have the same length.	6903 // have the same length.

6841 while (ia->has_more()) {	6904 while (ia->has_more()) {

6842 uc32 ca = ia->GetNext();	6905 uint32_t ca = ia->GetNext();

6843 uc32 cb = ib->GetNext();	6906 uint32_t cb = ib->GetNext();

	6907 ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);

	6908 ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);

6844 if (ca != cb)	6909 if (ca != cb)

6845 return false;	6910 return false;

6846 }	6911 }

6847 return true;	6912 return true;

6848 }	6913 }

6849	6914

6850	6915

6851 // Compares the contents of two strings by reading and comparing	6916 // Compares the contents of two strings by reading and comparing

6852 // int-sized blocks of characters.	6917 // int-sized blocks of characters.

6853 template <typename Char>	6918 template <typename Char>

(...skipping 162 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7016	7081

7017	7082

7018 bool String::IsEqualTo(Vector<const char> str) {	7083 bool String::IsEqualTo(Vector<const char> str) {

7019 Isolate* isolate = GetIsolate();	7084 Isolate* isolate = GetIsolate();

7020 int slen = length();	7085 int slen = length();

7021 Access<UnicodeCache::Utf8Decoder>	7086 Access<UnicodeCache::Utf8Decoder>

7022 decoder(isolate->unicode_cache()->utf8_decoder());	7087 decoder(isolate->unicode_cache()->utf8_decoder());

7023 decoder->Reset(str.start(), str.length());	7088 decoder->Reset(str.start(), str.length());

7024 int i;	7089 int i;

7025 for (i = 0; i < slen && decoder->has_more(); i++) {	7090 for (i = 0; i < slen && decoder->has_more(); i++) {

7026 uc32 r = decoder->GetNext();	7091 uint32_t r = decoder->GetNext();

7027 if (Get(i) != r) return false;	7092 if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {

	7093 if (i > slen - 1) return false;

	7094 if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;

	7095 if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;

	7096 } else {

	7097 if (Get(i) != r) return false;

	7098 }

7028 }	7099 }

7029 return i == slen && !decoder->has_more();	7100 return i == slen && !decoder->has_more();

7030 }	7101 }

7031	7102

7032	7103

7033 bool String::IsAsciiEqualTo(Vector<const char> str) {	7104 bool String::IsAsciiEqualTo(Vector<const char> str) {

7034 int slen = length();	7105 int slen = length();

7035 if (str.length() != slen) return false;	7106 if (str.length() != slen) return false;

7036 FlatContent content = GetFlatContent();	7107 FlatContent content = GetFlatContent();

7037 if (content.IsAscii()) {	7108 if (content.IsAscii()) {

(...skipping 109 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7147 value <<= String::kHashShift;	7218 value <<= String::kHashShift;

7148 value \|= length << String::kArrayIndexHashLengthShift;	7219 value \|= length << String::kArrayIndexHashLengthShift;

7149	7220

7150 ASSERT((value & String::kIsNotArrayIndexMask) == 0);	7221 ASSERT((value & String::kIsNotArrayIndexMask) == 0);

7151 ASSERT((length > String::kMaxCachedArrayIndexLength) \|\|	7222 ASSERT((length > String::kMaxCachedArrayIndexLength) \|\|

7152 (value & String::kContainsCachedArrayIndexMask) == 0);	7223 (value & String::kContainsCachedArrayIndexMask) == 0);

7153 return value;	7224 return value;

7154 }	7225 }

7155	7226

7156	7227

	7228 void StringHasher::AddSurrogatePair(uc32 c) {

	7229 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

	7230 AddCharacter(lead);

	7231 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

	7232 AddCharacter(trail);

	7233 }

	7234

	7235

	7236 void StringHasher::AddSurrogatePairNoIndex(uc32 c) {

	7237 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

	7238 AddCharacterNoIndex(lead);

	7239 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

	7240 AddCharacterNoIndex(trail);

	7241 }

	7242

	7243

7157 uint32_t StringHasher::GetHashField() {	7244 uint32_t StringHasher::GetHashField() {

7158 ASSERT(is_valid());	7245 ASSERT(is_valid());

7159 if (length_ <= String::kMaxHashCalcLength) {	7246 if (length_ <= String::kMaxHashCalcLength) {

7160 if (is_array_index()) {	7247 if (is_array_index()) {

7161 return MakeArrayIndexHash(array_index(), length_);	7248 return MakeArrayIndexHash(array_index(), length_);

7162 }	7249 }

7163 return (GetHash() << String::kHashShift) \| String::kIsNotArrayIndexMask;	7250 return (GetHash() << String::kHashShift) \| String::kIsNotArrayIndexMask;

7164 } else {	7251 } else {

7165 return (length_ << String::kHashShift) \| String::kIsNotArrayIndexMask;	7252 return (length_ << String::kHashShift) \| String::kIsNotArrayIndexMask;

7166 }	7253 }

(...skipping 3572 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
10739 : string_(string), hash_field_(0), seed_(seed) { }	10826 : string_(string), hash_field_(0), seed_(seed) { }

10740	10827

10741 bool IsMatch(Object* string) {	10828 bool IsMatch(Object* string) {

10742 return String::cast(string)->IsEqualTo(string_);	10829 return String::cast(string)->IsEqualTo(string_);

10743 }	10830 }

10744	10831

10745 uint32_t Hash() {	10832 uint32_t Hash() {

10746 if (hash_field_ != 0) return hash_field_ >> String::kHashShift;	10833 if (hash_field_ != 0) return hash_field_ >> String::kHashShift;

10747 unibrow::Utf8InputBuffer<> buffer(string_.start(),	10834 unibrow::Utf8InputBuffer<> buffer(string_.start(),

10748 static_cast<unsigned>(string_.length()));	10835 static_cast<unsigned>(string_.length()));

10749 chars_ = buffer.Length();	10836 chars_ = buffer.Utf16Length();

10750 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);	10837 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);

10751 uint32_t result = hash_field_ >> String::kHashShift;	10838 uint32_t result = hash_field_ >> String::kHashShift;

10752 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.	10839 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

10753 return result;	10840 return result;

10754 }	10841 }

10755	10842

10756 uint32_t HashForObject(Object* other) {	10843 uint32_t HashForObject(Object* other) {

10757 return String::cast(other)->Hash();	10844 return String::cast(other)->Hash();

10758 }	10845 }

10759	10846

(...skipping 2197 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
12957 if (break_point_objects()->IsUndefined()) return 0;	13044 if (break_point_objects()->IsUndefined()) return 0;

12958 // Single break point.	13045 // Single break point.

12959 if (!break_point_objects()->IsFixedArray()) return 1;	13046 if (!break_point_objects()->IsFixedArray()) return 1;

12960 // Multiple break points.	13047 // Multiple break points.

12961 return FixedArray::cast(break_point_objects())->length();	13048 return FixedArray::cast(break_point_objects())->length();

12962 }	13049 }

12963 #endif // ENABLE_DEBUGGER_SUPPORT	13050 #endif // ENABLE_DEBUGGER_SUPPORT

12964	13051

12965	13052

12966 } } // namespace v8::internal	13053 } } // namespace v8::internal

OLD	NEW

« src/objects.h ('K') | « src/objects.h ('k') | src/objects-inl.h » ('j') | src/scanner-character-streams.h » ('J')