Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(247)

Side by Side Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 6025 matching lines...) Expand 10 before | Expand all | Expand 10 after
6036 6036
6037 // Negative length means to the end of the string. 6037 // Negative length means to the end of the string.
6038 if (length < 0) length = kMaxInt - offset; 6038 if (length < 0) length = kMaxInt - offset;
6039 6039
6040 // Compute the size of the UTF-8 string. Start at the specified offset. 6040 // Compute the size of the UTF-8 string. Start at the specified offset.
6041 Access<StringInputBuffer> buffer( 6041 Access<StringInputBuffer> buffer(
6042 heap->isolate()->objects_string_input_buffer()); 6042 heap->isolate()->objects_string_input_buffer());
6043 buffer->Reset(offset, this); 6043 buffer->Reset(offset, this);
6044 int character_position = offset; 6044 int character_position = offset;
6045 int utf8_bytes = 0; 6045 int utf8_bytes = 0;
6046 int last = unibrow::Utf8::kNoPreviousCharacter;
6046 while (buffer->has_more() && character_position++ < offset + length) { 6047 while (buffer->has_more() && character_position++ < offset + length) {
6047 uint16_t character = buffer->GetNext(); 6048 uint16_t character = buffer->GetNext();
6048 utf8_bytes += unibrow::Utf8::Length(character); 6049 utf8_bytes += unibrow::Utf8::Length(character, last);
6050 last = character;
6049 } 6051 }
6050 6052
6051 if (length_return) { 6053 if (length_return) {
6052 *length_return = utf8_bytes; 6054 *length_return = utf8_bytes;
6053 } 6055 }
6054 6056
6055 char* result = NewArray<char>(utf8_bytes + 1); 6057 char* result = NewArray<char>(utf8_bytes + 1);
6056 6058
6057 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset. 6059 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset.
6058 buffer->Rewind(); 6060 buffer->Rewind();
6059 buffer->Seek(offset); 6061 buffer->Seek(offset);
6060 character_position = offset; 6062 character_position = offset;
6061 int utf8_byte_position = 0; 6063 int utf8_byte_position = 0;
6064 last = unibrow::Utf8::kNoPreviousCharacter;
6062 while (buffer->has_more() && character_position++ < offset + length) { 6065 while (buffer->has_more() && character_position++ < offset + length) {
6063 uint16_t character = buffer->GetNext(); 6066 uint16_t character = buffer->GetNext();
6064 if (allow_nulls == DISALLOW_NULLS && character == 0) { 6067 if (allow_nulls == DISALLOW_NULLS && character == 0) {
6065 character = ' '; 6068 character = ' ';
6066 } 6069 }
6067 utf8_byte_position += 6070 utf8_byte_position +=
6068 unibrow::Utf8::Encode(result + utf8_byte_position, character); 6071 unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
6072 last = character;
6069 } 6073 }
6070 result[utf8_byte_position] = 0; 6074 result[utf8_byte_position] = 0;
6071 return SmartArrayPointer<char>(result); 6075 return SmartArrayPointer<char>(result);
6072 } 6076 }
6073 6077
6074 6078
6075 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls, 6079 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
6076 RobustnessFlag robust_flag, 6080 RobustnessFlag robust_flag,
6077 int* length_return) { 6081 int* length_return) {
6078 return ToCString(allow_nulls, robust_flag, 0, -1, length_return); 6082 return ToCString(allow_nulls, robust_flag, 0, -1, length_return);
(...skipping 295 matching lines...) Expand 10 before | Expand all | Expand 10 after
6374 break; 6378 break;
6375 } 6379 }
6376 6380
6377 UNREACHABLE(); 6381 UNREACHABLE();
6378 return 0; 6382 return 0;
6379 } 6383 }
6380 6384
6381 6385
6382 // This method determines the type of string involved and then gets the UTF8 6386 // This method determines the type of string involved and then gets the UTF8
6383 // length of the string. It doesn't flatten the string and has log(n) recursion 6387 // length of the string. It doesn't flatten the string and has log(n) recursion
6384 // for a string of length n. 6388 // for a string of length n. If the failure flag gets set, then we have to
6385 int String::Utf8Length(String* input, int from, int to) { 6389 // flatten the string and retry. Failures are caused by surrogate pairs in deep
rossberg 2012/03/07 13:32:47 Not sure I understand the failure mode. Why does t
Erik Corry 2012/03/11 19:29:22 The old version of this function was constructed a
6390 // cons strings.
6391 int String::Utf8Length(String* input,
6392 int from,
6393 int to,
6394 bool followed_by_surrogate,
6395 int max_recursion,
6396 bool* failure,
6397 bool* starts_with_surrogate) {
6386 if (from == to) return 0; 6398 if (from == to) return 0;
6387 int total = 0; 6399 int total = 0;
6400 bool dummy;
6388 while (true) { 6401 while (true) {
6389 if (input->IsAsciiRepresentation()) return total + to - from; 6402 if (input->IsAsciiRepresentation()) {
6403 *starts_with_surrogate = false;
6404 return total + to - from;
6405 }
6390 switch (StringShape(input).representation_tag()) { 6406 switch (StringShape(input).representation_tag()) {
6391 case kConsStringTag: { 6407 case kConsStringTag: {
6392 ConsString* str = ConsString::cast(input); 6408 ConsString* str = ConsString::cast(input);
6393 String* first = str->first(); 6409 String* first = str->first();
6394 String* second = str->second(); 6410 String* second = str->second();
6395 int first_length = first->length(); 6411 int first_length = first->length();
6396 if (first_length - from < to - first_length) { 6412 if (first_length - from > to - first_length) {
6397 if (first_length > from) { 6413 if (first_length < to) {
6398 // Left hand side is shorter.
6399 total += Utf8Length(first, from, first_length);
6400 input = second;
6401 from = 0;
6402 to -= first_length;
6403 } else {
6404 // We only need the right hand side.
6405 input = second;
6406 from -= first_length;
6407 to -= first_length;
6408 }
6409 } else {
6410 if (first_length <= to) {
6411 // Right hand side is shorter. 6414 // Right hand side is shorter.
6412 total += Utf8Length(second, 0, to - first_length); 6415 bool right_starts_with_surrogate = false;
6416 total += Utf8Length(second,
6417 0,
6418 to - first_length,
6419 followed_by_surrogate,
6420 max_recursion - 1,
6421 failure,
6422 &right_starts_with_surrogate);
6423 if (*failure) return 0;
6424 followed_by_surrogate = right_starts_with_surrogate;
6413 input = first; 6425 input = first;
6414 to = first_length; 6426 to = first_length;
6415 } else { 6427 } else {
6416 // We only need the left hand side. 6428 // We only need the left hand side.
6417 input = first; 6429 input = first;
6418 } 6430 }
6431 } else {
6432 if (first_length > from) {
6433 // Left hand side is shorter.
6434 if (first->IsAsciiRepresentation()) {
6435 total += first_length - from;
6436 *starts_with_surrogate = false;
6437 starts_with_surrogate = &dummy;
6438 input = second;
6439 from = 0;
6440 to -= first_length;
6441 } else if (second->IsAsciiRepresentation()) {
6442 followed_by_surrogate = false;
6443 total += to - first_length;
6444 input = first;
6445 to = first_length;
6446 } else if (max_recursion > 0) {
rossberg 2012/03/07 13:32:47 Why is this the only recursive path actually check
Erik Corry 2012/03/11 19:29:22 See above.
6447 bool right_starts_with_surrogate = false;
6448 // Recursing on the long one. This may fail.
6449 total += Utf8Length(second,
6450 0,
6451 to - first_length,
6452 followed_by_surrogate,
6453 max_recursion - 1,
6454 failure,
6455 &right_starts_with_surrogate);
6456 if (*failure) return 0;
6457 input = first;
6458 to = first_length;
6459 followed_by_surrogate = right_starts_with_surrogate;
6460 } else {
6461 *failure = true;
6462 return 0;
6463 }
6464 } else {
6465 // We only need the right hand side.
6466 input = second;
6467 from = 0;
6468 to -= first_length;
6469 }
6419 } 6470 }
6420 continue; 6471 continue;
6421 } 6472 }
6422 case kExternalStringTag: 6473 case kExternalStringTag:
6423 case kSeqStringTag: { 6474 case kSeqStringTag: {
6424 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); 6475 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
6425 const uc16* p = vector.start(); 6476 const uc16* p = vector.start();
6477 int previous = unibrow::Utf8::kNoPreviousCharacter;
6426 for (int i = from; i < to; i++) { 6478 for (int i = from; i < to; i++) {
6427 total += unibrow::Utf8::Length(p[i]); 6479 uc16 c = p[i];
6480 total += unibrow::Utf8::Length(c, previous);
6481 previous = c;
6482 }
6483 if (to - from > 0) {
6484 if (unibrow::Utf16::IsLeadSurrogate(previous) &&
6485 followed_by_surrogate) {
6486 total -= 2;
rossberg 2012/03/07 13:32:47 I wouldn't mind a comment here, why -2? Is that kS
Erik Corry 2012/03/11 19:29:22 No, it's kBytesSavedByCombiningSurrogates. Fixed.
6487 }
6488 if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
6489 *starts_with_surrogate = true;
6490 }
6428 } 6491 }
6429 return total; 6492 return total;
6430 } 6493 }
6431 case kSlicedStringTag: { 6494 case kSlicedStringTag: {
6432 SlicedString* str = SlicedString::cast(input); 6495 SlicedString* str = SlicedString::cast(input);
6433 int offset = str->offset(); 6496 int offset = str->offset();
6434 input = str->parent(); 6497 input = str->parent();
6435 from += offset; 6498 from += offset;
6436 to += offset; 6499 to += offset;
6437 continue; 6500 continue;
(...skipping 394 matching lines...) Expand 10 before | Expand all | Expand 10 after
6832 } 6895 }
6833 } 6896 }
6834 } 6897 }
6835 6898
6836 6899
6837 template <typename IteratorA, typename IteratorB> 6900 template <typename IteratorA, typename IteratorB>
6838 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) { 6901 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {
6839 // General slow case check. We know that the ia and ib iterators 6902 // General slow case check. We know that the ia and ib iterators
6840 // have the same length. 6903 // have the same length.
6841 while (ia->has_more()) { 6904 while (ia->has_more()) {
6842 uc32 ca = ia->GetNext(); 6905 uint32_t ca = ia->GetNext();
6843 uc32 cb = ib->GetNext(); 6906 uint32_t cb = ib->GetNext();
6907 ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
6908 ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
6844 if (ca != cb) 6909 if (ca != cb)
6845 return false; 6910 return false;
6846 } 6911 }
6847 return true; 6912 return true;
6848 } 6913 }
6849 6914
6850 6915
6851 // Compares the contents of two strings by reading and comparing 6916 // Compares the contents of two strings by reading and comparing
6852 // int-sized blocks of characters. 6917 // int-sized blocks of characters.
6853 template <typename Char> 6918 template <typename Char>
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after
7016 7081
7017 7082
7018 bool String::IsEqualTo(Vector<const char> str) { 7083 bool String::IsEqualTo(Vector<const char> str) {
7019 Isolate* isolate = GetIsolate(); 7084 Isolate* isolate = GetIsolate();
7020 int slen = length(); 7085 int slen = length();
7021 Access<UnicodeCache::Utf8Decoder> 7086 Access<UnicodeCache::Utf8Decoder>
7022 decoder(isolate->unicode_cache()->utf8_decoder()); 7087 decoder(isolate->unicode_cache()->utf8_decoder());
7023 decoder->Reset(str.start(), str.length()); 7088 decoder->Reset(str.start(), str.length());
7024 int i; 7089 int i;
7025 for (i = 0; i < slen && decoder->has_more(); i++) { 7090 for (i = 0; i < slen && decoder->has_more(); i++) {
7026 uc32 r = decoder->GetNext(); 7091 uint32_t r = decoder->GetNext();
7027 if (Get(i) != r) return false; 7092 if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
7093 if (i > slen - 1) return false;
7094 if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
7095 if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
7096 } else {
7097 if (Get(i) != r) return false;
7098 }
7028 } 7099 }
7029 return i == slen && !decoder->has_more(); 7100 return i == slen && !decoder->has_more();
7030 } 7101 }
7031 7102
7032 7103
7033 bool String::IsAsciiEqualTo(Vector<const char> str) { 7104 bool String::IsAsciiEqualTo(Vector<const char> str) {
7034 int slen = length(); 7105 int slen = length();
7035 if (str.length() != slen) return false; 7106 if (str.length() != slen) return false;
7036 FlatContent content = GetFlatContent(); 7107 FlatContent content = GetFlatContent();
7037 if (content.IsAscii()) { 7108 if (content.IsAscii()) {
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after
7147 value <<= String::kHashShift; 7218 value <<= String::kHashShift;
7148 value |= length << String::kArrayIndexHashLengthShift; 7219 value |= length << String::kArrayIndexHashLengthShift;
7149 7220
7150 ASSERT((value & String::kIsNotArrayIndexMask) == 0); 7221 ASSERT((value & String::kIsNotArrayIndexMask) == 0);
7151 ASSERT((length > String::kMaxCachedArrayIndexLength) || 7222 ASSERT((length > String::kMaxCachedArrayIndexLength) ||
7152 (value & String::kContainsCachedArrayIndexMask) == 0); 7223 (value & String::kContainsCachedArrayIndexMask) == 0);
7153 return value; 7224 return value;
7154 } 7225 }
7155 7226
7156 7227
7228 void StringHasher::AddSurrogatePair(uc32 c) {
7229 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
7230 AddCharacter(lead);
7231 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
7232 AddCharacter(trail);
7233 }
7234
7235
7236 void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
7237 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
7238 AddCharacterNoIndex(lead);
7239 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
7240 AddCharacterNoIndex(trail);
7241 }
7242
7243
7157 uint32_t StringHasher::GetHashField() { 7244 uint32_t StringHasher::GetHashField() {
7158 ASSERT(is_valid()); 7245 ASSERT(is_valid());
7159 if (length_ <= String::kMaxHashCalcLength) { 7246 if (length_ <= String::kMaxHashCalcLength) {
7160 if (is_array_index()) { 7247 if (is_array_index()) {
7161 return MakeArrayIndexHash(array_index(), length_); 7248 return MakeArrayIndexHash(array_index(), length_);
7162 } 7249 }
7163 return (GetHash() << String::kHashShift) | String::kIsNotArrayIndexMask; 7250 return (GetHash() << String::kHashShift) | String::kIsNotArrayIndexMask;
7164 } else { 7251 } else {
7165 return (length_ << String::kHashShift) | String::kIsNotArrayIndexMask; 7252 return (length_ << String::kHashShift) | String::kIsNotArrayIndexMask;
7166 } 7253 }
(...skipping 3572 matching lines...) Expand 10 before | Expand all | Expand 10 after
10739 : string_(string), hash_field_(0), seed_(seed) { } 10826 : string_(string), hash_field_(0), seed_(seed) { }
10740 10827
10741 bool IsMatch(Object* string) { 10828 bool IsMatch(Object* string) {
10742 return String::cast(string)->IsEqualTo(string_); 10829 return String::cast(string)->IsEqualTo(string_);
10743 } 10830 }
10744 10831
10745 uint32_t Hash() { 10832 uint32_t Hash() {
10746 if (hash_field_ != 0) return hash_field_ >> String::kHashShift; 10833 if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
10747 unibrow::Utf8InputBuffer<> buffer(string_.start(), 10834 unibrow::Utf8InputBuffer<> buffer(string_.start(),
10748 static_cast<unsigned>(string_.length())); 10835 static_cast<unsigned>(string_.length()));
10749 chars_ = buffer.Length(); 10836 chars_ = buffer.Utf16Length();
10750 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_); 10837 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
10751 uint32_t result = hash_field_ >> String::kHashShift; 10838 uint32_t result = hash_field_ >> String::kHashShift;
10752 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. 10839 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
10753 return result; 10840 return result;
10754 } 10841 }
10755 10842
10756 uint32_t HashForObject(Object* other) { 10843 uint32_t HashForObject(Object* other) {
10757 return String::cast(other)->Hash(); 10844 return String::cast(other)->Hash();
10758 } 10845 }
10759 10846
(...skipping 2197 matching lines...) Expand 10 before | Expand all | Expand 10 after
12957 if (break_point_objects()->IsUndefined()) return 0; 13044 if (break_point_objects()->IsUndefined()) return 0;
12958 // Single break point. 13045 // Single break point.
12959 if (!break_point_objects()->IsFixedArray()) return 1; 13046 if (!break_point_objects()->IsFixedArray()) return 1;
12960 // Multiple break points. 13047 // Multiple break points.
12961 return FixedArray::cast(break_point_objects())->length(); 13048 return FixedArray::cast(break_point_objects())->length();
12962 } 13049 }
12963 #endif // ENABLE_DEBUGGER_SUPPORT 13050 #endif // ENABLE_DEBUGGER_SUPPORT
12964 13051
12965 13052
12966 } } // namespace v8::internal 13053 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698