runtime/vm/unicode.cc - Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.

Side by Side Diff: runtime/vm/unicode.cc

Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.

Patch Set: Fix asserts Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 } else if (ch <= kMaxTwoByteChar) {	105 } else if (ch <= kMaxTwoByteChar) {

106 return 2;	106 return 2;

107 } else if (ch <= kMaxThreeByteChar) {	107 } else if (ch <= kMaxThreeByteChar) {

108 return 3;	108 return 3;

109 }	109 }

110 ASSERT(ch <= kMaxFourByteChar);	110 ASSERT(ch <= kMaxFourByteChar);

111 return 4;	111 return 4;

112 }	112 }

113	113

114	114

	115 // A constant mask that can be 'and'ed with a word of data to determine if it

	116 // is all ASCII (with no Latin1 characters).

	117 #if defined(ARCH_IS_64_BIT)

	118 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);

	119 #else

	120 static const uintptr_t kAsciiWordMask = 0x80808080u;

	121 #endif

	122

	123

115 intptr_t Utf8::Length(const String& str) {	124 intptr_t Utf8::Length(const String& str) {

	125 NoSafepointScope no_safepoint;

	126 if (str.IsOneByteString() || str.IsExternalOneByteString()) {

	127 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8

	128 // encodings and all >= 0x80 have two-byte encodings. To get the length,

	129 // start with the number of code points and add the number of high bits in

	130 // the bytes.

	131 uintptr_t char_length = str.Length();

	132 uintptr_t length = char_length;

	133 const uintptr_t* data;

	134 if (str.IsOneByteString()) {
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I think NoSafepointScope should go here, right bef I think NoSafepointScope should go here, right before the if() erikcorry 2017/09/25 15:21:52 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > I think NoSafepointScope should go here, right before the if() Done.
	135 data =

	136 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(str, 0));
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 maybe it's better to have a helper here OneByteSt maybe it's better to have a helper here OneByteString::DataStart(str) instead of passing 0? erikcorry 2017/09/25 15:21:52 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > maybe it's better to have a helper here > > OneByteString::DataStart(str) > > instead of passing 0? Done.
	137 } else {

	138 data = reinterpret_cast<const uintptr_t*>(

	139 ExternalOneByteString::CharAddr(str, 0));

	140 }

	141 uintptr_t i;

	142 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) {

	143 uintptr_t chunk = *data++;

	144 chunk &= kAsciiWordMask;

	145 #if defined(ARCH_IS_64_BIT)

	146 chunk += chunk >> 32;
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 any reason for this shift-add to be before compari any reason for this shift-add to be before comparison with 0 unlike all others? erikcorry 2017/09/25 15:21:51 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > any reason for this shift-add to be before comparison with 0 unlike all others? Done.
	147 #endif

	148 if (chunk != 0) {

	149 // Shuffle the bits until we have a count of bits in the low nibble.

	150 chunk += chunk >> 16;

	151 chunk += chunk >> 8;

	152 length += (chunk >> 7) & 0xf;

	153 }

	154 }

	155 // Take care of the tail of the string, the last length % wordsize chars.

	156 for (; i < char_length; i++) {

	157 if (str.CharAt(i) > kMaxOneByteChar) length++;
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 maybe *OneByteString::CharAddr(str, i) here to avo maybe *OneByteString::CharAddr(str, i) here to avoid virtual dispatch? erikcorry 2017/09/25 15:21:51 I think that would not work with external strings. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > maybe *OneByteString::CharAddr(str, i) here to avoid virtual dispatch? I think that would not work with external strings.
	158 }

	159 return length;

	160 }

	161

	162 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8

	163 // encodings.

116 intptr_t length = 0;	164 intptr_t length = 0;

117 String::CodePointIterator it(str);	165 String::CodePointIterator it(str);

118 while (it.Next()) {	166 while (it.Next()) {

119 int32_t ch = it.Current();	167 int32_t ch = it.Current();

120 length += Utf8::Length(ch);	168 length += Utf8::Length(ch);

121 }	169 }

122 return length;	170 return length;

123 }	171 }

124	172

125	173

(...skipping 17 matching lines...) Expand all Loading...
143 ASSERT(ch <= kMaxFourByteChar);	191 ASSERT(ch <= kMaxFourByteChar);

144 dst[0] = 0xF0 | (ch >> 18);	192 dst[0] = 0xF0 | (ch >> 18);

145 dst[1] = 0x80 | ((ch >> 12) & kMask);	193 dst[1] = 0x80 | ((ch >> 12) & kMask);

146 dst[2] = 0x80 | ((ch >> 6) & kMask);	194 dst[2] = 0x80 | ((ch >> 6) & kMask);

147 dst[3] = 0x80 | (ch & kMask);	195 dst[3] = 0x80 | (ch & kMask);

148 return 4;	196 return 4;

149 }	197 }

150	198

151	199

152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {	200 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {

	201 NoSafepointScope scope;

	202 uintptr_t array_len = len;

153 intptr_t pos = 0;	203 intptr_t pos = 0;

154 String::CodePointIterator it(src);	204 ASSERT(static_cast<intptr_t>(array_len) >= Length(src));

155 while (it.Next()) {	205 if (src.IsOneByteString() || src.IsExternalOneByteString()) {

156 int32_t ch = it.Current();	206 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8

157 intptr_t num_bytes = Utf8::Length(ch);	207 // encodings and all >= 0x80 have two-byte encodings.

158 if (pos + num_bytes > len) {	208 const uintptr_t* data;

159 break;	209 if (src.IsOneByteString()) {
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I think NoSafepointScope should go here, right bef I think NoSafepointScope should go here, right before the if() erikcorry 2017/09/25 15:21:52 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > I think NoSafepointScope should go here, right before the if() Done.
	210 data =

	211 reinterpret_cast<const uintptr_t*>(OneByteString::CharAddr(src, 0));

	212 } else {

	213 data = reinterpret_cast<const uintptr_t*>(

	214 ExternalOneByteString::CharAddr(src, 0));

160 }	215 }

161 Utf8::Encode(ch, &dst[pos]);	216 uintptr_t char_length = Length(src);
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 why is this called char_length? should not this be why is this called char_length? should not this be byte_length? erikcorry 2017/07/03 19:40:56 No, it's the character length. Byte length can be Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > why is this called char_length? should not this be byte_length? No, it's the character length. Byte length can be more. The passed buffer length (argument 3) should be enough for the byte length, but we are defensive if it is too small for the output. In that case we stop writing before the end of the string. erikcorry 2017/07/03 20:12:15 Ah, I'm calling the wrong Length() method. The go Show quoted text On 2017/07/03 19:40:56, erikcorry wrote: > On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > > why is this called char_length? should not this be byte_length? > > No, it's the character length. Byte length can be more. The passed buffer > length (argument 3) should be enough for the byte length, but we are defensive > if it is too small for the output. In that case we stop writing before the end > of the string. Ah, I'm calling the wrong Length() method. The good news is it will get faster when I fix this. Also: Need better tests.
162 pos += num_bytes;	217 uintptr_t pos = 0;

	218 ASSERT(kMaxOneByteChar + 1 == 0x80);

	219 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
	Vyacheslav Egorov (Google) 2017/07/03 15:49:04 I don't get how this works. char_length can be up I don't get how this works. char_length can be up to 2x bigger than actual length of the input data[] (str.Length()). I think this loop can easily go out of bounds and start reading garbage. erikcorry 2017/07/03 19:40:56 No, see above. Show quoted text On 2017/07/03 15:49:04, Vyacheslav Egorov (Google) wrote: > I don't get how this works. char_length can be up to 2x bigger than actual > length of the input data[] (str.Length()). I think this loop can easily go out > of bounds and start reading garbage. No, see above. erikcorry 2017/09/25 15:21:51 Now fixed, good catch. Show quoted text On 2017/07/03 19:40:56, erikcorry wrote: > On 2017/07/03 15:49:04, Vyacheslav Egorov (Google) wrote: > > I don't get how this works. char_length can be up to 2x bigger than actual > > length of the input data[] (str.Length()). I think this loop can easily go out > > of bounds and start reading garbage. > > No, see above. Now fixed, good catch.
	220 // Read the input one word at a time and just write it verbatim if it is

	221 // plain ASCII, as determined by the mask.

	222 if (i + sizeof(uintptr_t) <= char_length &&

	223 (*data & kAsciiWordMask) == 0 &&

	224 pos + sizeof(uintptr_t) <= array_len) {

	225 *reinterpret_cast<uintptr_t*>(dst + pos) = *data;
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 I wonder if this should use StoreUnaligned()? Also I wonder if this should use StoreUnaligned()? Also the comment might need some moving. (The comment is unaligned) erikcorry 2017/09/25 15:21:52 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > I wonder if this should use StoreUnaligned()? Also the comment might need some > moving. > > (The comment is unaligned) Done.
	226 // This can be an unaligned write.

	227 pos += sizeof(uintptr_t);

	228 } else {

	229 const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 this needs a comment that else branch processes up this needs a comment that else branch processes up to 1 word at a time. erikcorry 2017/09/25 15:21:52 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > this needs a comment that else branch processes up to 1 word at a time. Done.
	230 const uint8_t* limit = p + sizeof(uintptr_t);

	231 for (uintptr_t j = i; j < char_length && p < limit; p++, j++) {
	Vyacheslav Egorov (Google) 2017/07/03 15:49:05 can't this loop have a single condition? e.g. p < can't this loop have a single condition? e.g. p < limit, where limit is computed to be p + min(sizeof(uintptr_t), char_length - i)? erikcorry 2017/09/25 15:21:51 Done. Show quoted text On 2017/07/03 15:49:05, Vyacheslav Egorov (Google) wrote: > can't this loop have a single condition? e.g. p < limit, where limit is computed > to be p + min(sizeof(uintptr_t), char_length - i)? Done.
	232 uint8_t c = *p;

	233 // These calls to Length and Encode get inlined and the cases for 3

	234 // and 4 byte sequences are removed.

	235 intptr_t bytes = Length(c);

	236 if (pos + bytes > array_len) {

	237 return pos;

	238 }

	239 Encode(c, reinterpret_cast<char*>(dst) + pos);

	240 pos += bytes;

	241 }

	242 }

	243 data++;

	244 }

	245 } else {

	246 String::CodePointIterator it(src);

	247 while (it.Next()) {

	248 int32_t ch = it.Current();

	249 intptr_t num_bytes = Utf8::Length(ch);

	250 if (pos + num_bytes > len) {

	251 break;

	252 }

	253 Utf8::Encode(ch, &dst[pos]);

	254 pos += num_bytes;

	255 }

163 }	256 }

164 return pos;	257 return pos;

165 }	258 }

166	259

167	260

168 intptr_t Utf8::Decode(const uint8_t* utf8_array,	261 intptr_t Utf8::Decode(const uint8_t* utf8_array,

169 intptr_t array_len,	262 intptr_t array_len,

170 int32_t* dst) {	263 int32_t* dst) {

171 uint32_t ch = utf8_array[0] & 0xFF;	264 uint32_t ch = utf8_array[0] & 0xFF;

172 intptr_t i = 1;	265 intptr_t i = 1;

(...skipping 105 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
278	371

279	372

280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {	373 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {

281 ASSERT(codepoint > Utf16::kMaxCodeUnit);	374 ASSERT(codepoint > Utf16::kMaxCodeUnit);

282 ASSERT(dst != NULL);	375 ASSERT(dst != NULL);

283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));	376 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));

284 dst[1] = (0xDC00 + (codepoint & 0x3FF));	377 dst[1] = (0xDC00 + (codepoint & 0x3FF));

285 }	378 }

286	379

287 } // namespace dart	380 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/object.cc ('k') | no next file » | no next file with comments »