src/unicode.cc - Issue 2314663002: Rework scanner-character-streams.

Side by Side Diff: src/unicode.cc

Issue 2314663002: Rework scanner-character-streams. (Closed)

Patch Set: Some fixes, and marching down the very long road to make all compilers happy. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff |

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode.h"	7 #include "src/unicode.h"

8 #include "src/unicode-inl.h"	8 #include "src/unicode-inl.h"

9 #include <stdio.h>	9 #include <stdio.h>

10 #include <stdlib.h>	10 #include <stdlib.h>

(...skipping 172 matching lines...) Loading...
183 default:	183 default:

184 return 0;	184 return 0;

185 }	185 }

186 return -1;	186 return -1;

187 }	187 }

188 } else {	188 } else {

189 return 0;	189 return 0;

190 }	190 }

191 }	191 }

192	192

193	193 static inline uint8_t NonASCIISequenceLength(byte first) {

194 static inline size_t NonASCIISequenceLength(byte first) {

195 // clang-format off	194 // clang-format off

196 static const uint8_t lengths[256] = {	195 static const uint8_t lengths[256] = {

197 // The first 128 entries correspond to ASCII characters.	196 // The first 128 entries correspond to ASCII characters.

198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

(...skipping 93 matching lines...) Loading...
298 }	297 }

299 if (!IsContinuationCharacter(str[3])) {	298 if (!IsContinuationCharacter(str[3])) {

300 *cursor += 1;	299 *cursor += 1;

301 return kBadChar;	300 return kBadChar;

302 }	301 }

303 *cursor += 4;	302 *cursor += 4;

304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -	303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

305 0x03C82080;	304 0x03C82080;

306 }	305 }

307	306

	307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer& buffer) {

	308 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)

	309 if (V8_LIKELY(next <= kMaxOneByteChar && buffer == 0)) {

	310 return static_cast<uchar>(next);

	311 }

	312

	313 // All other cases:
	marja 2016/09/07 09:17:57: Why does it make sense to do NonAsciiSequenceLength(next) if buffer is not 0, that is, next is a continuation character? I'd reorder this code like this:
	    fast path (which you have now)
	    if (buffer == 0) {
	      this is the start of a multi-byte character
	      NonAsciiSequenceLength(next)
	      etc
	    } else {
	      expect a continuation character, update the buffer based on that (otherwise kBadChar)
	    }
	vogelheim 2016/09/07 12:32:56 (in reply to marja's comment of 2016/09/07 09:17:57): Done. Your version makes much more sense.
	314 uint32_t kind = NonASCIISequenceLength(next);

	315 switch (kind) {

	316 case 0:

	317 if (IsContinuationCharacter(next)) {

	318 // How many bytes (excluding this one) do we still expect?

	319 uint8_t count = (buffer >> 24) - 1;

	320 // Update the value.

	321 uint32_t value = ((buffer & 0xffffff) << 6) | (next & 0x3F);

	322 if (count) {

	323 buffer = count << 24 | value;

	324 return kIncomplete;

	325 } else {

	326 buffer = 0;

	327 return value;

	328 }

	329 } else {

	330 // Not a continuation character? Bad char.

	331 buffer = 0;

	332 return kBadChar;

	333 }

	334 case 2:

	335 case 3:

	336 case 4:

	337 if (buffer == 0) {

	338 // Start of 2..4 byte character, and no buffer.

	339

	340 // The mask for the lower bits depends on the kind, and is

	341 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that

	342 // with one shift.

	343 uint8_t mask = 0x7f >> kind;

	344

	345 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value

	346 // in the bottom three.

	347 buffer = (kind - 1) << 24 | (next & mask);

	348 return kIncomplete;

	349 } else {

	350 // Start of new character, but we still have a buffer? Bad char.

	351 buffer = 0;

	352 return kBadChar;

	353 }

	354 default:

	355 UNREACHABLE();

	356 return kBadChar;

	357 }

	358 }

	359

308 bool Utf8::Validate(const byte* bytes, size_t length) {	360 bool Utf8::Validate(const byte* bytes, size_t length) {

309 size_t cursor = 0;	361 size_t cursor = 0;

310	362

311 // Performance optimization: Skip over single-byte values first.	363 // Performance optimization: Skip over single-byte values first.

312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {	364 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {

313 ++cursor;	365 ++cursor;

314 }	366 }

315	367

316 while (cursor < length) {	368 while (cursor < length) {

317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);	369 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);

(...skipping 3129 matching lines...) Loading...
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3499 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3448 +	3500 +

3449 kCanonicalizationRangeMultiStrings1Size *	3501 kCanonicalizationRangeMultiStrings1Size *

3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3502 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3451 +	3503 +

3452 kCanonicalizationRangeMultiStrings7Size *	3504 kCanonicalizationRangeMultiStrings7Size *

3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3505 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3454 }	3506 }

3455	3507

3456 } // namespace unibrow	3508 } // namespace unibrow

OLD	NEW

« src/parsing/scanner-character-streams.cc ('K') | « src/unicode.h ('k') | test/cctest/cctest.gyp » ('j') | no next file with comments »