Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
| 6 | 6 |
| 7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
| 8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
| 9 #include "vm/object.h" | 9 #include "vm/object.h" |
| 10 | 10 |
| (...skipping 94 matching lines...) | |
| 105 } else if (ch <= kMaxTwoByteChar) { | 105 } else if (ch <= kMaxTwoByteChar) { |
| 106 return 2; | 106 return 2; |
| 107 } else if (ch <= kMaxThreeByteChar) { | 107 } else if (ch <= kMaxThreeByteChar) { |
| 108 return 3; | 108 return 3; |
| 109 } | 109 } |
| 110 ASSERT(ch <= kMaxFourByteChar); | 110 ASSERT(ch <= kMaxFourByteChar); |
| 111 return 4; | 111 return 4; |
| 112 } | 112 } |
| 113 | 113 |
| 114 | 114 |
| 115 // A constant mask that can be 'and'ed with a word of data to determine if it | |
| 116 // is all ASCII (with no Latin1 characters). | |
| 117 #if defined(ARCH_IS_64_BIT) | |
| 118 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080); | |
| 119 #else | |
| 120 static const uintptr_t kAsciiWordMask = 0x80808080u; | |
| 121 #endif | |
| 122 | |
| 123 | |
| 115 intptr_t Utf8::Length(const String& str) { | 124 intptr_t Utf8::Length(const String& str) { |
| 125 NoSafepointScope no_safepoint; | |
| 126 if (str.IsOneByteString() || str.IsExternalOneByteString()) { | |
| 127 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 | |
| 128 // encodings and all >= 0x80 have two-byte encodings. To get the length, | |
| 129 // start with the number of code points and add the number of high bits in | |
| 130 // the bytes. | |
| 131 uintptr_t char_length = str.Length(); | |
| 132 uintptr_t length = char_length; | |
| 133 const uintptr_t* data; | |
| 134 if (str.IsOneByteString()) { | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I think NoSafepointScope should go here, right bef
erikcorry
2017/09/25 15:21:52
Done.
| |
| 135 data = | |
| 136 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(str, 0)); | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
maybe it's better to have a helper here
OneByteSt
erikcorry
2017/09/25 15:21:52
Done.
| |
| 137 } else { | |
| 138 data = reinterpret_cast<const uintptr_t*>( | |
| 139 ExternalOneByteString::CharAddr(str, 0)); | |
| 140 } | |
| 141 uintptr_t i; | |
| 142 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) { | |
| 143 uintptr_t chunk = *data++; | |
| 144 chunk &= kAsciiWordMask; | |
| 145 #if defined(ARCH_IS_64_BIT) | |
| 146 chunk += chunk >> 32; | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
any reason for this shift-add to be before compari
erikcorry
2017/09/25 15:21:51
Done.
| |
| 147 #endif | |
| 148 if (chunk != 0) { | |
| 149 // Shuffle the bits until we have a count of bits in the low nibble. | |
| 150 chunk += chunk >> 16; | |
| 151 chunk += chunk >> 8; | |
| 152 length += (chunk >> 7) & 0xf; | |
| 153 } | |
| 154 } | |
| 155 // Take care of the tail of the string, the last length % wordsize chars. | |
| 156 for (; i < char_length; i++) { | |
| 157 if (str.CharAt(i) > kMaxOneByteChar) length++; | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
maybe *OneByteString::CharAddr(str, i) here to avo
erikcorry
2017/09/25 15:21:51
I think that would not work with external strings.
| |
| 158 } | |
| 159 return length; | |
| 160 } | |
| 161 | |
| 162 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8 | |
| 163 // encodings. | |
| 116 intptr_t length = 0; | 164 intptr_t length = 0; |
| 117 String::CodePointIterator it(str); | 165 String::CodePointIterator it(str); |
| 118 while (it.Next()) { | 166 while (it.Next()) { |
| 119 int32_t ch = it.Current(); | 167 int32_t ch = it.Current(); |
| 120 length += Utf8::Length(ch); | 168 length += Utf8::Length(ch); |
| 121 } | 169 } |
| 122 return length; | 170 return length; |
| 123 } | 171 } |
| 124 | 172 |
| 125 | 173 |
| (...skipping 17 matching lines...) | |
| 143 ASSERT(ch <= kMaxFourByteChar); | 191 ASSERT(ch <= kMaxFourByteChar); |
| 144 dst[0] = 0xF0 | (ch >> 18); | 192 dst[0] = 0xF0 | (ch >> 18); |
| 145 dst[1] = 0x80 | ((ch >> 12) & kMask); | 193 dst[1] = 0x80 | ((ch >> 12) & kMask); |
| 146 dst[2] = 0x80 | ((ch >> 6) & kMask); | 194 dst[2] = 0x80 | ((ch >> 6) & kMask); |
| 147 dst[3] = 0x80 | (ch & kMask); | 195 dst[3] = 0x80 | (ch & kMask); |
| 148 return 4; | 196 return 4; |
| 149 } | 197 } |
| 150 | 198 |
| 151 | 199 |
| 152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 200 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
| 201 NoSafepointScope scope; | |
| 202 uintptr_t array_len = len; | |
| 153 intptr_t pos = 0; | 203 intptr_t pos = 0; |
| 154 String::CodePointIterator it(src); | 204 ASSERT(static_cast<intptr_t>(array_len) >= Length(src)); |
| 155 while (it.Next()) { | 205 if (src.IsOneByteString() || src.IsExternalOneByteString()) { |
| 156 int32_t ch = it.Current(); | 206 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 |
| 157 intptr_t num_bytes = Utf8::Length(ch); | 207 // encodings and all >= 0x80 have two-byte encodings. |
| 158 if (pos + num_bytes > len) { | 208 const uintptr_t* data; |
| 159 break; | 209 if (src.IsOneByteString()) { |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I think NoSafepointScope should go here, right bef
erikcorry
2017/09/25 15:21:52
Done.
| |
| 210 data = | |
| 211 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(src, 0)); | |
| 212 } else { | |
| 213 data = reinterpret_cast<const uintptr_t*>( | |
| 214 ExternalOneByteString::CharAddr(src, 0)); | |
| 160 } | 215 } |
| 161 Utf8::Encode(ch, &dst[pos]); | 216 uintptr_t char_length = Length(src); |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
why is this called char_length? should not this be
erikcorry
2017/07/03 19:40:56
No, it's the character length. Byte length can be
erikcorry
2017/07/03 20:12:15
Ah, I'm calling the wrong Length() method. The go
| |
| 162 pos += num_bytes; | 217 uintptr_t pos = 0; |
| 218 ASSERT(kMaxOneByteChar + 1 == 0x80); | |
| 219 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) { | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:04
I don't get how this works. char_length can be up
erikcorry
2017/07/03 19:40:56
No, see above.
erikcorry
2017/09/25 15:21:51
Now fixed, good catch.
| |
| 220 // Read the input one word at a time and just write it verbatim if it is | |
| 221 // plain ASCII, as determined by the mask. | |
| 222 if (i + sizeof(uintptr_t) <= char_length && | |
| 223 (*data & kAsciiWordMask) == 0 && | |
| 224 pos + sizeof(uintptr_t) <= array_len) { | |
| 225 *reinterpret_cast<uintptr_t*>(dst + pos) = *data; | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I wonder if this should use StoreUnaligned()? Also
erikcorry
2017/09/25 15:21:52
Done.
| |
| 226 // This can be an unaligned write. | |
| 227 pos += sizeof(uintptr_t); | |
| 228 } else { | |
| 229 const uint8_t* p = reinterpret_cast<const uint8_t*>(data); | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
this needs a comment that else branch processes up
erikcorry
2017/09/25 15:21:52
Done.
| |
| 230 const uint8_t* limit = p + sizeof(uintptr_t); | |
| 231 for (uintptr_t j = i; j < char_length && p < limit; p++, j++) { | |
|
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
can't this loop have a single condition? e.g. p <
erikcorry
2017/09/25 15:21:51
Done.
| |
| 232 uint8_t c = *p; | |
| 233 // These calls to Length and Encode get inlined and the cases for 3 | |
| 234 // and 4 byte sequences are removed. | |
| 235 intptr_t bytes = Length(c); | |
| 236 if (pos + bytes > array_len) { | |
| 237 return pos; | |
| 238 } | |
| 239 Encode(c, reinterpret_cast<char*>(dst) + pos); | |
| 240 pos += bytes; | |
| 241 } | |
| 242 } | |
| 243 data++; | |
| 244 } | |
| 245 } else { | |
| 246 String::CodePointIterator it(src); | |
| 247 while (it.Next()) { | |
| 248 int32_t ch = it.Current(); | |
| 249 intptr_t num_bytes = Utf8::Length(ch); | |
| 250 if (pos + num_bytes > len) { | |
| 251 break; | |
| 252 } | |
| 253 Utf8::Encode(ch, &dst[pos]); | |
| 254 pos += num_bytes; | |
| 255 } | |
| 163 } | 256 } |
| 164 return pos; | 257 return pos; |
| 165 } | 258 } |
| 166 | 259 |
| 167 | 260 |
| 168 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 261 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| 169 intptr_t array_len, | 262 intptr_t array_len, |
| 170 int32_t* dst) { | 263 int32_t* dst) { |
| 171 uint32_t ch = utf8_array[0] & 0xFF; | 264 uint32_t ch = utf8_array[0] & 0xFF; |
| 172 intptr_t i = 1; | 265 intptr_t i = 1; |
| (...skipping 105 matching lines...) | |
| 278 | 371 |
| 279 | 372 |
| 280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 373 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
| 281 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 374 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
| 282 ASSERT(dst != NULL); | 375 ASSERT(dst != NULL); |
| 283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 376 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
| 284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 377 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| 285 } | 378 } |
| 286 | 379 |
| 287 } // namespace dart | 380 } // namespace dart |
| OLD | NEW |