OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode.h" | 7 #include "src/unicode.h" |
8 #include "src/unicode-inl.h" | 8 #include "src/unicode-inl.h" |
9 #include <stdio.h> | 9 #include <stdio.h> |
10 #include <stdlib.h> | 10 #include <stdlib.h> |
(...skipping 172 matching lines...) Loading... | |
183 default: | 183 default: |
184 return 0; | 184 return 0; |
185 } | 185 } |
186 return -1; | 186 return -1; |
187 } | 187 } |
188 } else { | 188 } else { |
189 return 0; | 189 return 0; |
190 } | 190 } |
191 } | 191 } |
192 | 192 |
193 | 193 static inline uint8_t NonASCIISequenceLength(byte first) { |
194 static inline size_t NonASCIISequenceLength(byte first) { | |
195 // clang-format off | 194 // clang-format off |
196 static const uint8_t lengths[256] = { | 195 static const uint8_t lengths[256] = { |
197 // The first 128 entries correspond to ASCII characters. | 196 // The first 128 entries correspond to ASCII characters. |
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
(...skipping 93 matching lines...) Loading... | |
298 } | 297 } |
299 if (!IsContinuationCharacter(str[3])) { | 298 if (!IsContinuationCharacter(str[3])) { |
300 *cursor += 1; | 299 *cursor += 1; |
301 return kBadChar; | 300 return kBadChar; |
302 } | 301 } |
303 *cursor += 4; | 302 *cursor += 4; |
304 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - | 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
305 0x03C82080; | 304 0x03C82080; |
306 } | 305 } |
307 | 306 |
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer& buffer) { | |
308 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) | |
309 if (V8_LIKELY(next <= kMaxOneByteChar && buffer == 0)) { | |
310 return static_cast<uchar>(next); | |
311 } | |
312 | |
313 // All other cases: | |
marja
2016/09/07 09:17:57
Why does it make sense to do NonAsciiSequenceLengt
vogelheim
2016/09/07 12:32:56
Done. Your version makes much more sense.
| |
314 uint32_t kind = NonASCIISequenceLength(next); | |
315 switch (kind) { | |
316 case 0: | |
317 if (IsContinuationCharacter(next)) { | |
318 // How many bytes (excluding this one) do we still expect? | |
319 uint8_t count = (buffer >> 24) - 1; | |
320 // Update the value. | |
321 uint32_t value = ((buffer & 0xffffff) << 6) | (next & 0x3F); | |
322 if (count) { | |
323 buffer = count << 24 | value; | |
324 return kIncomplete; | |
325 } else { | |
326 buffer = 0; | |
327 return value; | |
328 } | |
329 } else { | |
330 // Not a continuation character? Bad char. | |
331 buffer = 0; | |
332 return kBadChar; | |
333 } | |
334 case 2: | |
335 case 3: | |
336 case 4: | |
337 if (buffer == 0) { | |
338 // Start of 2..4 byte character, and no buffer. | |
339 | |
340 // The mask for the lower bits depends on the kind, and is | |
341 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that | |
342 // with one shift. | |
343 uint8_t mask = 0x7f >> kind; | |
344 | |
345 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value | |
346 // in the bottom three. | |
347 buffer = (kind - 1) << 24 | (next & mask); | |
348 return kIncomplete; | |
349 } else { | |
350 // Start of new character, but we still have a buffer? Bad char. | |
351 buffer = 0; | |
352 return kBadChar; | |
353 } | |
354 default: | |
355 UNREACHABLE(); | |
356 return kBadChar; | |
357 } | |
358 } | |
359 | |
308 bool Utf8::Validate(const byte* bytes, size_t length) { | 360 bool Utf8::Validate(const byte* bytes, size_t length) { |
309 size_t cursor = 0; | 361 size_t cursor = 0; |
310 | 362 |
311 // Performance optimization: Skip over single-byte values first. | 363 // Performance optimization: Skip over single-byte values first. |
312 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { | 364 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { |
313 ++cursor; | 365 ++cursor; |
314 } | 366 } |
315 | 367 |
316 while (cursor < length) { | 368 while (cursor < length) { |
317 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); | 369 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); |
(...skipping 3129 matching lines...) Loading... | |
3447 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3499 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3448 + | 3500 + |
3449 kCanonicalizationRangeMultiStrings1Size * | 3501 kCanonicalizationRangeMultiStrings1Size * |
3450 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3502 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3451 + | 3503 + |
3452 kCanonicalizationRangeMultiStrings7Size * | 3504 kCanonicalizationRangeMultiStrings7Size * |
3453 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3505 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3454 } | 3506 } |
3455 | 3507 |
3456 } // namespace unibrow | 3508 } // namespace unibrow |
OLD | NEW |