Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(412)

Side by Side Diff: runtime/vm/unicode.cc

Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.
Patch Set: Fix asserts Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « runtime/vm/object.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 } else if (ch <= kMaxTwoByteChar) { 105 } else if (ch <= kMaxTwoByteChar) {
106 return 2; 106 return 2;
107 } else if (ch <= kMaxThreeByteChar) { 107 } else if (ch <= kMaxThreeByteChar) {
108 return 3; 108 return 3;
109 } 109 }
110 ASSERT(ch <= kMaxFourByteChar); 110 ASSERT(ch <= kMaxFourByteChar);
111 return 4; 111 return 4;
112 } 112 }
113 113
114 114
115 // A constant mask that can be 'and'ed with a word of data to determine if it
116 // is all ASCII (with no Latin1 characters).
117 #if defined(ARCH_IS_64_BIT)
118 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);
119 #else
120 static const uintptr_t kAsciiWordMask = 0x80808080u;
121 #endif
122
123
115 intptr_t Utf8::Length(const String& str) { 124 intptr_t Utf8::Length(const String& str) {
125 NoSafepointScope no_safepoint;
126 if (str.IsOneByteString() || str.IsExternalOneByteString()) {
127 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
128 // encodings and all >= 0x80 have two-byte encodings. To get the length,
129 // start with the number of code points and add the number of high bits in
130 // the bytes.
131 uintptr_t char_length = str.Length();
132 uintptr_t length = char_length;
133 const uintptr_t* data;
134 if (str.IsOneByteString()) {
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I think NoSafepointScope should go here, right before...
erikcorry 2017/09/25 15:21:52 Done.
135 data =
136 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(str, 0));
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 maybe it's better to have a helper here OneByteSt...
erikcorry 2017/09/25 15:21:52 Done.
137 } else {
138 data = reinterpret_cast<const uintptr_t*>(
139 ExternalOneByteString::CharAddr(str, 0));
140 }
141 uintptr_t i;
142 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) {
143 uintptr_t chunk = *data++;
144 chunk &= kAsciiWordMask;
145 #if defined(ARCH_IS_64_BIT)
146 chunk += chunk >> 32;
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 any reason for this shift-add to be before compari...
erikcorry 2017/09/25 15:21:51 Done.
147 #endif
148 if (chunk != 0) {
149 // Shuffle the bits until we have a count of bits in the low nibble.
150 chunk += chunk >> 16;
151 chunk += chunk >> 8;
152 length += (chunk >> 7) & 0xf;
153 }
154 }
155 // Take care of the tail of the string, the last length % wordsize chars.
156 for (; i < char_length; i++) {
157 if (str.CharAt(i) > kMaxOneByteChar) length++;
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 maybe *OneByteString::CharAddr(str, i) here to avoid...
erikcorry 2017/09/25 15:21:51 I think that would not work with external strings.
158 }
159 return length;
160 }
161
162 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8
163 // encodings.
116 intptr_t length = 0; 164 intptr_t length = 0;
117 String::CodePointIterator it(str); 165 String::CodePointIterator it(str);
118 while (it.Next()) { 166 while (it.Next()) {
119 int32_t ch = it.Current(); 167 int32_t ch = it.Current();
120 length += Utf8::Length(ch); 168 length += Utf8::Length(ch);
121 } 169 }
122 return length; 170 return length;
123 } 171 }
124 172
125 173
(...skipping 17 matching lines...) Expand all
143 ASSERT(ch <= kMaxFourByteChar); 191 ASSERT(ch <= kMaxFourByteChar);
144 dst[0] = 0xF0 | (ch >> 18); 192 dst[0] = 0xF0 | (ch >> 18);
145 dst[1] = 0x80 | ((ch >> 12) & kMask); 193 dst[1] = 0x80 | ((ch >> 12) & kMask);
146 dst[2] = 0x80 | ((ch >> 6) & kMask); 194 dst[2] = 0x80 | ((ch >> 6) & kMask);
147 dst[3] = 0x80 | (ch & kMask); 195 dst[3] = 0x80 | (ch & kMask);
148 return 4; 196 return 4;
149 } 197 }
150 198
151 199
152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { 200 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
201 NoSafepointScope scope;
202 uintptr_t array_len = len;
153 intptr_t pos = 0; 203 intptr_t pos = 0;
154 String::CodePointIterator it(src); 204 ASSERT(static_cast<intptr_t>(array_len) >= Length(src));
155 while (it.Next()) { 205 if (src.IsOneByteString() || src.IsExternalOneByteString()) {
156 int32_t ch = it.Current(); 206 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
157 intptr_t num_bytes = Utf8::Length(ch); 207 // encodings and all >= 0x80 have two-byte encodings.
158 if (pos + num_bytes > len) { 208 const uintptr_t* data;
159 break; 209 if (src.IsOneByteString()) {
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I think NoSafepointScope should go here, right before...
erikcorry 2017/09/25 15:21:52 Done.
210 data =
211 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(src, 0));
212 } else {
213 data = reinterpret_cast<const uintptr_t*>(
214 ExternalOneByteString::CharAddr(src, 0));
160 } 215 }
161 Utf8::Encode(ch, &dst[pos]); 216 uintptr_t char_length = Length(src);
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 why is this called char_length? should not this be...
erikcorry 2017/07/03 19:40:56 No, it's the character length. Byte length can be...
erikcorry 2017/07/03 20:12:15 Ah, I'm calling the wrong Length() method. The go...
162 pos += num_bytes; 217 uintptr_t pos = 0;
218 ASSERT(kMaxOneByteChar + 1 == 0x80);
219 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
Vyacheslav Egorov (Google) 2017/07/03 15:49:04 I don't get how this works. char_length can be up...
erikcorry 2017/07/03 19:40:56 No, see above.
erikcorry 2017/09/25 15:21:51 Now fixed, good catch.
220 // Read the input one word at a time and just write it verbatim if it is
221 // plain ASCII, as determined by the mask.
222 if (i + sizeof(uintptr_t) <= char_length &&
223 (*data & kAsciiWordMask) == 0 &&
224 pos + sizeof(uintptr_t) <= array_len) {
225 *reinterpret_cast<uintptr_t*>(dst + pos) = *data;
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I wonder if this should use StoreUnaligned()? Also...
erikcorry 2017/09/25 15:21:52 Done.
226 // This can be an unaligned write.
227 pos += sizeof(uintptr_t);
228 } else {
229 const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 this needs a comment that else branch processes up...
erikcorry 2017/09/25 15:21:52 Done.
230 const uint8_t* limit = p + sizeof(uintptr_t);
231 for (uintptr_t j = i; j < char_length && p < limit; p++, j++) {
Vyacheslav Egorov (Google) 2017/07/03 15:49:05 can't this loop have a single condition? e.g. p <...
erikcorry 2017/09/25 15:21:51 Done.
232 uint8_t c = *p;
233 // These calls to Length and Encode get inlined and the cases for 3
234 // and 4 byte sequences are removed.
235 intptr_t bytes = Length(c);
236 if (pos + bytes > array_len) {
237 return pos;
238 }
239 Encode(c, reinterpret_cast<char*>(dst) + pos);
240 pos += bytes;
241 }
242 }
243 data++;
244 }
245 } else {
246 String::CodePointIterator it(src);
247 while (it.Next()) {
248 int32_t ch = it.Current();
249 intptr_t num_bytes = Utf8::Length(ch);
250 if (pos + num_bytes > len) {
251 break;
252 }
253 Utf8::Encode(ch, &dst[pos]);
254 pos += num_bytes;
255 }
163 } 256 }
164 return pos; 257 return pos;
165 } 258 }
166 259
167 260
168 intptr_t Utf8::Decode(const uint8_t* utf8_array, 261 intptr_t Utf8::Decode(const uint8_t* utf8_array,
169 intptr_t array_len, 262 intptr_t array_len,
170 int32_t* dst) { 263 int32_t* dst) {
171 uint32_t ch = utf8_array[0] & 0xFF; 264 uint32_t ch = utf8_array[0] & 0xFF;
172 intptr_t i = 1; 265 intptr_t i = 1;
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after
278 371
279 372
280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { 373 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
281 ASSERT(codepoint > Utf16::kMaxCodeUnit); 374 ASSERT(codepoint > Utf16::kMaxCodeUnit);
282 ASSERT(dst != NULL); 375 ASSERT(dst != NULL);
283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); 376 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 377 dst[1] = (0xDC00 + (codepoint & 0x3FF));
285 } 378 }
286 379
287 } // namespace dart 380 } // namespace dart
OLD | NEW
« no previous file with comments | « runtime/vm/object.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698