src/unicode.cc - Issue 2391273002: Fix bad-char handling in utf-8 streaming streams. Also add test.

Side by Side Diff: src/unicode.cc

Issue 2391273002: Fix bad-char handling in utf-8 streaming streams. Also add test. (Closed)

Patch Set: Improve comments. Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode.h"	7 #include "src/unicode.h"

8 #include "src/unicode-inl.h"	8 #include "src/unicode-inl.h"

9 #include <stdio.h>	9 #include <stdio.h>

10 #include <stdlib.h>	10 #include <stdlib.h>

(...skipping 315 matching lines...) Expand 10 before | Expand all | Expand 10 after
326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value	326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value

327 // in the bottom three.	327 // in the bottom three.

328 *buffer = (kind - 1) << 24 | (next & mask);	328 *buffer = (kind - 1) << 24 | (next & mask);

329 return kIncomplete;	329 return kIncomplete;

330 } else {	330 } else {

331 // No buffer, and not the start of a 1-byte char (handled at the	331 // No buffer, and not the start of a 1-byte char (handled at the

332 // beginning), and not the start of a 2..4 byte char? Bad char.	332 // beginning), and not the start of a 2..4 byte char? Bad char.

333 *buffer = 0;	333 *buffer = 0;

334 return kBadChar;	334 return kBadChar;

335 }	335 }

	336 } else if (*buffer <= 0xff) {

	337 // We have one unprocessed byte left (from the last else case in this if

	338 // statement).

	339 uchar previous = *buffer;

	340 *buffer = 0;

	341 uchar t = ValueOfIncremental(previous, buffer);

	342 if (t == kIncomplete) {

	343 // If we have an incomplete character, process both the previous and the

	344 // next byte at once.

	345 return ValueOfIncremental(next, buffer);

	346 } else {

	347 // Otherwise, process the previous byte and save the next byte for next

	348 // time.

	349 DCHECK_EQ(0, *buffer);

	350 *buffer = next;

	351 return t;

	352 }

	353 } else if (IsContinuationCharacter(next)) {

	354 // We're inside of a character, as described by buffer.

	355

	356 // How many bytes (excluding this one) do we still expect?

	357 uint8_t count = (*buffer >> 24) - 1;

	358 // Update the value.

	359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);

	360 if (count) {

	361 *buffer = count << 24 | value;

	362 return kIncomplete;

	363 } else {

	364 *buffer = 0;

	365 return value;

	366 }

336 } else {	367 } else {

337 // We're inside of a character, as described by buffer.	368 // Within a character, but not a continuation character? Then the

338 if (IsContinuationCharacter(next)) {	369 // previous char was a bad char. But we need to save the current

339 // How many bytes (excluding this one) do we still expect?	370 // one.

340 uint8_t count = (*buffer >> 24) - 1;	371 *buffer = next;

341 // Update the value.	372 return kBadChar;

342 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);

343 if (count) {

344 *buffer = count << 24 | value;

345 return kIncomplete;

346 } else {

347 *buffer = 0;

348 return value;

349 }

350 } else {

351 // Within a character, but not a continuation character? Bad char.

352 *buffer = 0;

353 return kBadChar;

354 }

355 }	373 }

356 }	374 }

357	375

	376 uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {

	377 DCHECK_NOT_NULL(buffer);

	378 if (*buffer == 0) {

	379 return kBufferEmpty;

	380 } else {

	381 // Process left-over chars. An incomplete char at the end maps to kBadChar.

	382 uchar t = ValueOfIncremental(0, buffer);

	383 return (t == kIncomplete) ? kBadChar : t;

	384 }

	385 }

	386

358 bool Utf8::Validate(const byte* bytes, size_t length) {	387 bool Utf8::Validate(const byte* bytes, size_t length) {

359 size_t cursor = 0;	388 size_t cursor = 0;

360	389

361 // Performance optimization: Skip over single-byte values first.	390 // Performance optimization: Skip over single-byte values first.

362 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {	391 while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {

363 ++cursor;	392 ++cursor;

364 }	393 }

365	394

366 while (cursor < length) {	395 while (cursor < length) {

367 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);	396 uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);

(...skipping 3129 matching lines...) Expand 10 before | Expand all | Expand 10 after
3497 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3498 +	3527 +

3499 kCanonicalizationRangeMultiStrings1Size *	3528 kCanonicalizationRangeMultiStrings1Size *

3500 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3501 +	3530 +

3502 kCanonicalizationRangeMultiStrings7Size *	3531 kCanonicalizationRangeMultiStrings7Size *

3503 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3504 }	3533 }

3505	3534

3506 } // namespace unibrow	3535 } // namespace unibrow

OLD	NEW

« src/parsing/scanner-character-streams.cc ('K') | « src/unicode.h ('k') | test/cctest/parsing/test-scanner-streams.cc » ('j') | no next file with comments »