OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode.h" | 7 #include "src/unicode.h" |
8 #include "src/unicode-inl.h" | 8 #include "src/unicode-inl.h" |
9 #include <stdio.h> | 9 #include <stdio.h> |
10 #include <stdlib.h> | 10 #include <stdlib.h> |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
183 default: | 183 default: |
184 return 0; | 184 return 0; |
185 } | 185 } |
186 return -1; | 186 return -1; |
187 } | 187 } |
188 } else { | 188 } else { |
189 return 0; | 189 return 0; |
190 } | 190 } |
191 } | 191 } |
192 | 192 |
193 | 193 static inline uint8_t NonASCIISequenceLength(byte first) { |
194 static inline size_t NonASCIISequenceLength(byte first) { | |
195 // clang-format off | 194 // clang-format off |
196 static const uint8_t lengths[256] = { | 195 static const uint8_t lengths[256] = { |
197 // The first 128 entries correspond to ASCII characters. | 196 // The first 128 entries correspond to ASCII characters. |
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
298 } | 297 } |
299 if (!IsContinuationCharacter(str[3])) { | 298 if (!IsContinuationCharacter(str[3])) { |
300 *cursor += 1; | 299 *cursor += 1; |
301 return kBadChar; | 300 return kBadChar; |
302 } | 301 } |
303 *cursor += 4; | 302 *cursor += 4; |
304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - | 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
305 0x03C82080; | 304 0x03C82080; |
306 } | 305 } |
307 | 306 |
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer& buffer) { | |
marja
2016/09/08 09:46:23
Much better now!
| |
308 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) | |
309 if (V8_LIKELY(next <= kMaxOneByteChar && buffer == 0)) { | |
310 return static_cast<uchar>(next); | |
311 } | |
312 | |
313 if (buffer == 0) { | |
314 // We're at the start of a new character. | |
315 uint32_t kind = NonASCIISequenceLength(next); | |
316 if (kind >= 2 && kind <= 4) { | |
317 // Start of 2..4 byte character, and no buffer. | |
318 | |
319 // The mask for the lower bits depends on the kind, and is | |
320 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that | |
321 // with one shift. | |
322 uint8_t mask = 0x7f >> kind; | |
323 | |
324 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value | |
325 // in the bottom three. | |
326 buffer = (kind - 1) << 24 | (next & mask); | |
327 return kIncomplete; | |
328 } else { | |
329 // No buffer, and not the start of a 1-byte char (handled at the | |
330 // beginning), and not the start of a 2..4 byte char? Bad char. | |
331 buffer = 0; | |
332 return kBadChar; | |
333 } | |
334 } else { | |
335 // We're inside of a character, as described by buffer. | |
336 if (IsContinuationCharacter(next)) { | |
337 // How many bytes (excluding this one) do we still expect? | |
338 uint8_t count = (buffer >> 24) - 1; | |
339 // Update the value. | |
340 uint32_t value = ((buffer & 0xffffff) << 6) | (next & 0x3F); | |
341 if (count) { | |
342 buffer = count << 24 | value; | |
343 return kIncomplete; | |
344 } else { | |
345 buffer = 0; | |
346 return value; | |
347 } | |
348 } else { | |
349 // Within a character, but not a continuation character? Bad char. | |
350 buffer = 0; | |
351 return kBadChar; | |
352 } | |
353 } | |
354 } | |
355 | |
308 bool Utf8::Validate(const byte* bytes, size_t length) { | 356 bool Utf8::Validate(const byte* bytes, size_t length) { |
309 size_t cursor = 0; | 357 size_t cursor = 0; |
310 | 358 |
311 // Performance optimization: Skip over single-byte values first. | 359 // Performance optimization: Skip over single-byte values first. |
312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { | 360 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { |
313 ++cursor; | 361 ++cursor; |
314 } | 362 } |
315 | 363 |
316 while (cursor < length) { | 364 while (cursor < length) { |
317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); | 365 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); |
(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3495 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3448 + | 3496 + |
3449 kCanonicalizationRangeMultiStrings1Size * | 3497 kCanonicalizationRangeMultiStrings1Size * |
3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3498 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3451 + | 3499 + |
3452 kCanonicalizationRangeMultiStrings7Size * | 3500 kCanonicalizationRangeMultiStrings7Size * |
3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3501 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3454 } | 3502 } |
3455 | 3503 |
3456 } // namespace unibrow | 3504 } // namespace unibrow |
OLD | NEW |