OLD | NEW |
---|---|
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 94 matching lines...) | |
105 } else if (ch <= kMaxTwoByteChar) { | 105 } else if (ch <= kMaxTwoByteChar) { |
106 return 2; | 106 return 2; |
107 } else if (ch <= kMaxThreeByteChar) { | 107 } else if (ch <= kMaxThreeByteChar) { |
108 return 3; | 108 return 3; |
109 } | 109 } |
110 ASSERT(ch <= kMaxFourByteChar); | 110 ASSERT(ch <= kMaxFourByteChar); |
111 return 4; | 111 return 4; |
112 } | 112 } |
113 | 113 |
114 | 114 |
115 // A constant mask that can be 'and'ed with a word of data to determine if it | |
116 // is all ASCII (with no Latin1 characters). | |
117 #if defined(ARCH_IS_64_BIT) | |
118 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080); | |
119 #else | |
120 static const uintptr_t kAsciiWordMask = 0x80808080u; | |
121 #endif | |
122 | |
123 | |
115 intptr_t Utf8::Length(const String& str) { | 124 intptr_t Utf8::Length(const String& str) { |
125 NoSafepointScope no_safepoint; | |
126 if (str.IsOneByteString() || str.IsExternalOneByteString()) { | |
127 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 | |
128 // encodings and all >= 0x80 have two-byte encodings. To get the length, | |
129 // start with the number of code points and add the number of high bits in | |
130 // the bytes. | |
131 uintptr_t char_length = str.Length(); | |
132 uintptr_t length = char_length; | |
133 const uintptr_t* data; | |
134 if (str.IsOneByteString()) { | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I think NoSafepointScope should go here, right bef…
erikcorry
2017/09/25 15:21:52
Done.
| |
135 data = | |
136 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(str, 0)); | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
maybe it's better to have a helper here
OneByteSt…
erikcorry
2017/09/25 15:21:52
Done.
| |
137 } else { | |
138 data = reinterpret_cast<const uintptr_t*>( | |
139 ExternalOneByteString::CharAddr(str, 0)); | |
140 } | |
141 uintptr_t i; | |
142 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) { | |
143 uintptr_t chunk = *data++; | |
144 chunk &= kAsciiWordMask; | |
145 #if defined(ARCH_IS_64_BIT) | |
146 chunk += chunk >> 32; | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
any reason for this shift-add to be before compari…
erikcorry
2017/09/25 15:21:51
Done.
| |
147 #endif | |
148 if (chunk != 0) { | |
149 // Shuffle the bits until we have a count of bits in the low nibble. | |
150 chunk += chunk >> 16; | |
151 chunk += chunk >> 8; | |
152 length += (chunk >> 7) & 0xf; | |
153 } | |
154 } | |
155 // Take care of the tail of the string, the last length % wordsize chars. | |
156 for (; i < char_length; i++) { | |
157 if (str.CharAt(i) > kMaxOneByteChar) length++; | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
maybe *OneByteString::CharAddr(str, i) here to avo…
erikcorry
2017/09/25 15:21:51
I think that would not work with external strings.
| |
158 } | |
159 return length; | |
160 } | |
161 | |
162 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8 | |
163 // encodings. | |
116 intptr_t length = 0; | 164 intptr_t length = 0; |
117 String::CodePointIterator it(str); | 165 String::CodePointIterator it(str); |
118 while (it.Next()) { | 166 while (it.Next()) { |
119 int32_t ch = it.Current(); | 167 int32_t ch = it.Current(); |
120 length += Utf8::Length(ch); | 168 length += Utf8::Length(ch); |
121 } | 169 } |
122 return length; | 170 return length; |
123 } | 171 } |
124 | 172 |
125 | 173 |
(...skipping 17 matching lines...) | |
143 ASSERT(ch <= kMaxFourByteChar); | 191 ASSERT(ch <= kMaxFourByteChar); |
144 dst[0] = 0xF0 | (ch >> 18); | 192 dst[0] = 0xF0 | (ch >> 18); |
145 dst[1] = 0x80 | ((ch >> 12) & kMask); | 193 dst[1] = 0x80 | ((ch >> 12) & kMask); |
146 dst[2] = 0x80 | ((ch >> 6) & kMask); | 194 dst[2] = 0x80 | ((ch >> 6) & kMask); |
147 dst[3] = 0x80 | (ch & kMask); | 195 dst[3] = 0x80 | (ch & kMask); |
148 return 4; | 196 return 4; |
149 } | 197 } |
150 | 198 |
151 | 199 |
152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 200 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
201 NoSafepointScope scope; | |
202 uintptr_t array_len = len; | |
153 intptr_t pos = 0; | 203 intptr_t pos = 0; |
154 String::CodePointIterator it(src); | 204 ASSERT(static_cast<intptr_t>(array_len) >= Length(src)); |
155 while (it.Next()) { | 205 if (src.IsOneByteString() || src.IsExternalOneByteString()) { |
156 int32_t ch = it.Current(); | 206 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 |
157 intptr_t num_bytes = Utf8::Length(ch); | 207 // encodings and all >= 0x80 have two-byte encodings. |
158 if (pos + num_bytes > len) { | 208 const uintptr_t* data; |
159 break; | 209 if (src.IsOneByteString()) { |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I think NoSafepointScope should go here, right bef…
erikcorry
2017/09/25 15:21:52
Done.
| |
210 data = | |
211 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(src, 0)); | |
212 } else { | |
213 data = reinterpret_cast<const uintptr_t*>( | |
214 ExternalOneByteString::CharAddr(src, 0)); | |
160 } | 215 } |
161 Utf8::Encode(ch, &dst[pos]); | 216 uintptr_t char_length = Length(src); |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
why is this called char_length? should not this be…
erikcorry
2017/07/03 19:40:56
No, it's the character length. Byte length can be…
erikcorry
2017/07/03 20:12:15
Ah, I'm calling the wrong Length() method. The go…
| |
162 pos += num_bytes; | 217 uintptr_t pos = 0; |
218 ASSERT(kMaxOneByteChar + 1 == 0x80); | |
219 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) { | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:04
I don't get how this works. char_length can be up…
erikcorry
2017/07/03 19:40:56
No, see above.
erikcorry
2017/09/25 15:21:51
Now fixed, good catch.
| |
220 // Read the input one word at a time and just write it verbatim if it is | |
221 // plain ASCII, as determined by the mask. | |
222 if (i + sizeof(uintptr_t) <= char_length && | |
223 (*data & kAsciiWordMask) == 0 && | |
224 pos + sizeof(uintptr_t) <= array_len) { | |
225 *reinterpret_cast<uintptr_t*>(dst + pos) = *data; | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
I wonder if this should use StoreUnaligned()? Also…
erikcorry
2017/09/25 15:21:52
Done.
| |
226 // This can be an unaligned write. | |
227 pos += sizeof(uintptr_t); | |
228 } else { | |
229 const uint8_t* p = reinterpret_cast<const uint8_t*>(data); | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
this needs a comment that else branch processes up…
erikcorry
2017/09/25 15:21:52
Done.
| |
230 const uint8_t* limit = p + sizeof(uintptr_t); | |
231 for (uintptr_t j = i; j < char_length && p < limit; p++, j++) { | |
Vyacheslav Egorov (Google)
2017/07/03 15:49:05
can't this loop have a single condition? e.g. p <…
erikcorry
2017/09/25 15:21:51
Done.
| |
232 uint8_t c = *p; | |
233 // These calls to Length and Encode get inlined and the cases for 3 | |
234 // and 4 byte sequences are removed. | |
235 intptr_t bytes = Length(c); | |
236 if (pos + bytes > array_len) { | |
237 return pos; | |
238 } | |
239 Encode(c, reinterpret_cast<char*>(dst) + pos); | |
240 pos += bytes; | |
241 } | |
242 } | |
243 data++; | |
244 } | |
245 } else { | |
246 String::CodePointIterator it(src); | |
247 while (it.Next()) { | |
248 int32_t ch = it.Current(); | |
249 intptr_t num_bytes = Utf8::Length(ch); | |
250 if (pos + num_bytes > len) { | |
251 break; | |
252 } | |
253 Utf8::Encode(ch, &dst[pos]); | |
254 pos += num_bytes; | |
255 } | |
163 } | 256 } |
164 return pos; | 257 return pos; |
165 } | 258 } |
166 | 259 |
167 | 260 |
168 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 261 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
169 intptr_t array_len, | 262 intptr_t array_len, |
170 int32_t* dst) { | 263 int32_t* dst) { |
171 uint32_t ch = utf8_array[0] & 0xFF; | 264 uint32_t ch = utf8_array[0] & 0xFF; |
172 intptr_t i = 1; | 265 intptr_t i = 1; |
(...skipping 105 matching lines...) | |
278 | 371 |
279 | 372 |
280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 373 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
281 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 374 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
282 ASSERT(dst != NULL); | 375 ASSERT(dst != NULL); |
283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 376 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 377 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
285 } | 378 } |
286 | 379 |
287 } // namespace dart | 380 } // namespace dart |
OLD | NEW |