OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode.h" | 7 #include "src/unicode.h" |
8 #include "src/unicode-inl.h" | 8 #include "src/unicode-inl.h" |
9 #include <stdio.h> | 9 #include <stdio.h> |
10 #include <stdlib.h> | 10 #include <stdlib.h> |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
183 default: | 183 default: |
184 return 0; | 184 return 0; |
185 } | 185 } |
186 return -1; | 186 return -1; |
187 } | 187 } |
188 } else { | 188 } else { |
189 return 0; | 189 return 0; |
190 } | 190 } |
191 } | 191 } |
192 | 192 |
193 | 193 static inline uint8_t NonASCIISequenceLength(byte first) { |
194 static inline size_t NonASCIISequenceLength(byte first) { | |
195 // clang-format off | 194 // clang-format off |
196 static const uint8_t lengths[256] = { | 195 static const uint8_t lengths[256] = { |
197 // The first 128 entries correspond to ASCII characters. | 196 // The first 128 entries correspond to ASCII characters. |
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
298 } | 297 } |
299 if (!IsContinuationCharacter(str[3])) { | 298 if (!IsContinuationCharacter(str[3])) { |
300 *cursor += 1; | 299 *cursor += 1; |
301 return kBadChar; | 300 return kBadChar; |
302 } | 301 } |
303 *cursor += 4; | 302 *cursor += 4; |
304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - | 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
305 0x03C82080; | 304 0x03C82080; |
306 } | 305 } |
307 | 306 |
| 307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { |
| 308 DCHECK_NOT_NULL(buffer); |
| 309 |
| 310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) |
| 311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { |
| 312 return static_cast<uchar>(next); |
| 313 } |
| 314 |
| 315 if (*buffer == 0) { |
| 316 // We're at the start of a new character. |
| 317 uint32_t kind = NonASCIISequenceLength(next); |
| 318 if (kind >= 2 && kind <= 4) { |
| 319 // Start of 2..4 byte character, and no buffer. |
| 320 |
| 321 // The mask for the lower bits depends on the kind, and is |
| 322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that |
| 323 // with one shift. |
| 324 uint8_t mask = 0x7f >> kind; |
| 325 |
| 326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value |
| 327 // in the bottom three. |
| 328 *buffer = (kind - 1) << 24 | (next & mask); |
| 329 return kIncomplete; |
| 330 } else { |
| 331 // No buffer, and not the start of a 1-byte char (handled at the |
| 332 // beginning), and not the start of a 2..4 byte char? Bad char. |
| 333 *buffer = 0; |
| 334 return kBadChar; |
| 335 } |
| 336 } else { |
| 337 // We're inside of a character, as described by buffer. |
| 338 if (IsContinuationCharacter(next)) { |
| 339 // How many bytes (excluding this one) do we still expect? |
| 340 uint8_t count = (*buffer >> 24) - 1; |
| 341 // Update the value. |
| 342 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); |
| 343 if (count) { |
| 344 *buffer = count << 24 | value; |
| 345 return kIncomplete; |
| 346 } else { |
| 347 *buffer = 0; |
| 348 return value; |
| 349 } |
| 350 } else { |
| 351 // Within a character, but not a continuation character? Bad char. |
| 352 *buffer = 0; |
| 353 return kBadChar; |
| 354 } |
| 355 } |
| 356 } |
| 357 |
308 bool Utf8::Validate(const byte* bytes, size_t length) { | 358 bool Utf8::Validate(const byte* bytes, size_t length) { |
309 size_t cursor = 0; | 359 size_t cursor = 0; |
310 | 360 |
311 // Performance optimization: Skip over single-byte values first. | 361 // Performance optimization: Skip over single-byte values first. |
312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { | 362 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { |
313 ++cursor; | 363 ++cursor; |
314 } | 364 } |
315 | 365 |
316 while (cursor < length) { | 366 while (cursor < length) { |
317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); | 367 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); |
(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3497 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3448 + | 3498 + |
3449 kCanonicalizationRangeMultiStrings1Size * | 3499 kCanonicalizationRangeMultiStrings1Size * |
3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3500 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3451 + | 3501 + |
3452 kCanonicalizationRangeMultiStrings7Size * | 3502 kCanonicalizationRangeMultiStrings7Size * |
3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3503 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3454 } | 3504 } |
3455 | 3505 |
3456 } // namespace unibrow | 3506 } // namespace unibrow |
OLD | NEW |