src/unicode.cc - Issue 2314663002: Rework scanner-character-streams.

Side by Side Diff: src/unicode.cc

Issue 2314663002: Rework scanner-character-streams. (Closed)

Patch Set: Niko's feedback and fix compile even harder Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode.h"	7 #include "src/unicode.h"

8 #include "src/unicode-inl.h"	8 #include "src/unicode-inl.h"

9 #include <stdio.h>	9 #include <stdio.h>

10 #include <stdlib.h>	10 #include <stdlib.h>

(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
183 default:	183 default:

184 return 0;	184 return 0;

185 }	185 }

186 return -1;	186 return -1;

187 }	187 }

188 } else {	188 } else {

189 return 0;	189 return 0;

190 }	190 }

191 }	191 }

192	192

193	193 static inline uint8_t NonASCIISequenceLength(byte first) {

194 static inline size_t NonASCIISequenceLength(byte first) {

195 // clang-format off	194 // clang-format off

196 static const uint8_t lengths[256] = {	195 static const uint8_t lengths[256] = {

197 // The first 128 entries correspond to ASCII characters.	196 // The first 128 entries correspond to ASCII characters.

198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
298 }	297 }

299 if (!IsContinuationCharacter(str[3])) {	298 if (!IsContinuationCharacter(str[3])) {

300 *cursor += 1;	299 *cursor += 1;

301 return kBadChar;	300 return kBadChar;

302 }	301 }

303 *cursor += 4;	302 *cursor += 4;

304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -	303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

305 0x03C82080;	304 0x03C82080;

306 }	305 }

307	306

	307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {

	308   DCHECK_NOT_NULL(buffer);

	309   // Consumes one byte; returns a decoded value, kIncomplete, or kBadChar. Decoder state is kept in *buffer.

	310   // The common case: 1-byte Utf8 (and no incomplete char in the buffer)

	311   if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {

	312     return static_cast<uchar>(next);

	313   }

	314

	315   if (*buffer == 0) {

	316     // We're at the start of a new character.

	317     uint32_t kind = NonASCIISequenceLength(next);

	318     if (kind >= 2 && kind <= 4) {

	319       // Start of 2..4 byte character, and no buffer.

	320

	321       // The mask for the lower bits depends on the kind, and is

	322       // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that

	323       // with one shift.

	324       uint8_t mask = 0x7f >> kind;

	325

	326       // Store the kind - 1 (i.e., remaining bytes) in the top byte, value

	327       // in the bottom three.

	328       *buffer = (kind - 1) << 24 | (next & mask);

	329       return kIncomplete;

	330     } else {

	331       // No buffer, and not the start of a 1-byte char (handled at the

	332       // beginning), and not the start of a 2..4 byte char? Bad char.

	333       *buffer = 0;

	334       return kBadChar;

	335     }

	336   } else {

	337     // We're inside of a character, as described by buffer.

	338     if (IsContinuationCharacter(next)) {

	339       // How many bytes (excluding this one) do we still expect?

	340       uint8_t count = (*buffer >> 24) - 1;

	341       // Shift the accumulated bits up by 6 and append this byte's low 6 bits.

	342       uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);

	343       if (count) {

	344         *buffer = count << 24 | value;

	345         return kIncomplete;

	346       } else {

	347         *buffer = 0;

	348         return value;

	349       }

	350     } else {

	351       // Within a character, but not a continuation character? Bad char.

	352       *buffer = 0;  // NOTE(review): `next` is discarded here as well, even if it could start a new character - confirm callers expect that.

	353       return kBadChar;

	354     }

	355   }

	356 }

	357

308 bool Utf8::Validate(const byte* bytes, size_t length) {	358 bool Utf8::Validate(const byte* bytes, size_t length) {

309 size_t cursor = 0;	359 size_t cursor = 0;

310	360

311 // Performance optimization: Skip over single-byte values first.	361 // Performance optimization: Skip over single-byte values first.

312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {	362 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {

313 ++cursor;	363 ++cursor;

314 }	364 }

315	365

316 while (cursor < length) {	366 while (cursor < length) {

317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);	367 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);

(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3497 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3448 +	3498 +

3449 kCanonicalizationRangeMultiStrings1Size *	3499 kCanonicalizationRangeMultiStrings1Size *

3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3500 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3451 +	3501 +

3452 kCanonicalizationRangeMultiStrings7Size *	3502 kCanonicalizationRangeMultiStrings7Size *

3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3503 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3454 }	3504 }

3455	3505

3456 } // namespace unibrow	3506 } // namespace unibrow

OLD	NEW

« src/parsing/scanner-character-streams.cc ('K') | « src/unicode.h ('k') | test/cctest/cctest.gyp » ('j') | test/cctest/parsing/test-scanner-streams.cc » ('J')