Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(271)

Side by Side Diff: src/unicode.cc

Issue 2391273002: Fix bad-char handling in streaming UTF-8 decoding. Also add a test. (Closed)
Patch Set: Improve comments. Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode.h" 7 #include "src/unicode.h"
8 #include "src/unicode-inl.h" 8 #include "src/unicode-inl.h"
9 #include <stdio.h> 9 #include <stdio.h>
10 #include <stdlib.h> 10 #include <stdlib.h>
(...skipping 315 matching lines...) Expand 10 before | Expand all | Expand 10 after
326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value 326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327 // in the bottom three. 327 // in the bottom three.
328 *buffer = (kind - 1) << 24 | (next & mask); 328 *buffer = (kind - 1) << 24 | (next & mask);
329 return kIncomplete; 329 return kIncomplete;
330 } else { 330 } else {
331 // No buffer, and not the start of a 1-byte char (handled at the 331 // No buffer, and not the start of a 1-byte char (handled at the
332 // beginning), and not the start of a 2..4 byte char? Bad char. 332 // beginning), and not the start of a 2..4 byte char? Bad char.
333 *buffer = 0; 333 *buffer = 0;
334 return kBadChar; 334 return kBadChar;
335 } 335 }
336 } else if (*buffer <= 0xff) {
337 // We have one unprocessed byte left (from the last else case in this if
338 // statement).
339 uchar previous = *buffer;
340 *buffer = 0;
341 uchar t = ValueOfIncremental(previous, buffer);
342 if (t == kIncomplete) {
343 // If we have an incomplete character, process both the previous and the
344 // next byte at once.
345 return ValueOfIncremental(next, buffer);
346 } else {
347 // Otherwise, process the previous byte and save the next byte for next
348 // time.
349 DCHECK_EQ(0, *buffer);
350 *buffer = next;
351 return t;
352 }
353 } else if (IsContinuationCharacter(next)) {
354 // We're inside of a character, as described by buffer.
355
356 // How many bytes (excluding this one) do we still expect?
357 uint8_t count = (*buffer >> 24) - 1;
358 // Update the value.
359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
360 if (count) {
361 *buffer = count << 24 | value;
362 return kIncomplete;
363 } else {
364 *buffer = 0;
365 return value;
366 }
336 } else { 367 } else {
337 // We're inside of a character, as described by buffer. 368 // Within a character, but not a continuation character? Then the
338 if (IsContinuationCharacter(next)) { 369 // previous char was a bad char. But we need to save the current
339 // How many bytes (excluding this one) do we still expect? 370 // one.
340 uint8_t count = (*buffer >> 24) - 1; 371 *buffer = next;
341 // Update the value. 372 return kBadChar;
342 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
343 if (count) {
344 *buffer = count << 24 | value;
345 return kIncomplete;
346 } else {
347 *buffer = 0;
348 return value;
349 }
350 } else {
351 // Within a character, but not a continuation character? Bad char.
352 *buffer = 0;
353 return kBadChar;
354 }
355 } 373 }
356 } 374 }
357 375
376 uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
377 DCHECK_NOT_NULL(buffer);
378 if (*buffer == 0) {
379 return kBufferEmpty;
380 } else {
381 // Process left-over chars. An incomplete char at the end maps to kBadChar.
382 uchar t = ValueOfIncremental(0, buffer);
383 return (t == kIncomplete) ? kBadChar : t;
384 }
385 }
386
358 bool Utf8::Validate(const byte* bytes, size_t length) { 387 bool Utf8::Validate(const byte* bytes, size_t length) {
359 size_t cursor = 0; 388 size_t cursor = 0;
360 389
361 // Performance optimization: Skip over single-byte values first. 390 // Performance optimization: Skip over single-byte values first.
362 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) { 391 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {
363 ++cursor; 392 ++cursor;
364 } 393 }
365 394
366 while (cursor < length) { 395 while (cursor < length) {
367 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor); 396 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);
(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after
3497 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3498 + 3527 +
3499 kCanonicalizationRangeMultiStrings1Size * 3528 kCanonicalizationRangeMultiStrings1Size *
3500 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3501 + 3530 +
3502 kCanonicalizationRangeMultiStrings7Size * 3531 kCanonicalizationRangeMultiStrings7Size *
3503 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3504 } 3533 }
3505 3534
3506 } // namespace unibrow 3535 } // namespace unibrow
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698