Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(393)

Side by Side Diff: src/unicode.cc

Issue 2314663002: Rework scanner-character-streams. (Closed)
Patch Set: Marja's feedback, round 1. Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode.h" 7 #include "src/unicode.h"
8 #include "src/unicode-inl.h" 8 #include "src/unicode-inl.h"
9 #include <stdio.h> 9 #include <stdio.h>
10 #include <stdlib.h> 10 #include <stdlib.h>
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
183 default: 183 default:
184 return 0; 184 return 0;
185 } 185 }
186 return -1; 186 return -1;
187 } 187 }
188 } else { 188 } else {
189 return 0; 189 return 0;
190 } 190 }
191 } 191 }
192 192
193 193 static inline uint8_t NonASCIISequenceLength(byte first) {
194 static inline size_t NonASCIISequenceLength(byte first) {
195 // clang-format off 194 // clang-format off
196 static const uint8_t lengths[256] = { 195 static const uint8_t lengths[256] = {
197 // The first 128 entries correspond to ASCII characters. 196 // The first 128 entries correspond to ASCII characters.
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
298 } 297 }
299 if (!IsContinuationCharacter(str[3])) { 298 if (!IsContinuationCharacter(str[3])) {
300 *cursor += 1; 299 *cursor += 1;
301 return kBadChar; 300 return kBadChar;
302 } 301 }
303 *cursor += 4; 302 *cursor += 4;
304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
305 0x03C82080; 304 0x03C82080;
306 } 305 }
307 306
// Feeds a single byte |next| into an incremental UTF-8 decode whose state
// lives in |buffer| between calls.
//
// State encoding (as written/read below): |buffer| == 0 means "no character
// in progress"; otherwise the top byte holds the number of continuation
// bytes still expected and the low 24 bits hold the value bits gathered so
// far.
//
// Returns:
//  - |next| itself when it is <= kMaxOneByteChar and nothing is pending;
//  - kIncomplete after a lead byte of a 2..4 byte sequence, or after a
//    continuation byte that still leaves bytes outstanding;
//  - the assembled value once the last expected continuation byte arrives
//    (|buffer| is reset to 0);
//  - kBadChar, with |buffer| reset to 0, for any other byte.
//
// NOTE(review): the assembled value is returned without any range check in
// this function, so overlong or surrogate encodings are only rejected if
// NonASCIISequenceLength() already refuses their lead byte -- TODO confirm
// against the full table.
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer& buffer) {
marja 2016/09/08 09:46:23 Much better now!
308 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)
309 if (V8_LIKELY(next <= kMaxOneByteChar && buffer == 0)) {
310 return static_cast<uchar>(next);
311 }
312
313 if (buffer == 0) {
314 // We're at the start of a new character.
315 uint32_t kind = NonASCIISequenceLength(next);
316 if (kind >= 2 && kind <= 4) {
317 // Start of 2..4 byte character, and no buffer.
318
319 // The mask for the lower bits depends on the kind, and is
320 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that
321 // with one shift.
322 uint8_t mask = 0x7f >> kind;
323
324 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
325 // in the bottom three.
326 buffer = (kind - 1) << 24 | (next & mask);
327 return kIncomplete;
328 } else {
329 // No buffer, and not the start of a 1-byte char (handled at the
330 // beginning), and not the start of a 2..4 byte char? Bad char.
331 buffer = 0;
332 return kBadChar;
333 }
334 } else {
335 // We're inside of a character, as described by buffer.
336 if (IsContinuationCharacter(next)) {
337 // How many bytes (excluding this one) do we still expect?
338 uint8_t count = (buffer >> 24) - 1;
339 // Update the value.
340 uint32_t value = ((buffer & 0xffffff) << 6) | (next & 0x3F);
341 if (count) {
342 buffer = count << 24 | value;
343 return kIncomplete;
344 } else {
345 buffer = 0;
346 return value;
347 }
348 } else {
349 // Within a character, but not a continuation character? Bad char.
// NOTE(review): |next| is consumed together with the error here; it is not
// re-examined as a possible one-byte char or new lead byte, so a valid byte
// that follows a truncated sequence is reported only as kBadChar. Confirm
// that callers expect this.
350 buffer = 0;
351 return kBadChar;
352 }
353 }
354 }
355
308 bool Utf8::Validate(const byte* bytes, size_t length) { 356 bool Utf8::Validate(const byte* bytes, size_t length) {
309 size_t cursor = 0; 357 size_t cursor = 0;
310 358
311 // Performance optimization: Skip over single-byte values first. 359 // Performance optimization: Skip over single-byte values first.
312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { 360 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {
313 ++cursor; 361 ++cursor;
314 } 362 }
315 363
316 while (cursor < length) { 364 while (cursor < length) {
317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); 365 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);
(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3495 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3448 + 3496 +
3449 kCanonicalizationRangeMultiStrings1Size * 3497 kCanonicalizationRangeMultiStrings1Size *
3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3498 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3451 + 3499 +
3452 kCanonicalizationRangeMultiStrings7Size * 3500 kCanonicalizationRangeMultiStrings7Size *
3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3501 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3454 } 3502 }
3455 3503
3456 } // namespace unibrow 3504 } // namespace unibrow
OLD | NEW
« src/unicode.h ('K') | « src/unicode.h ('k') | test/cctest/cctest.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698