test/cctest/parsing/test-scanner-streams.cc - Issue 2354973002: Handle Utf-8 BOM at beginning of an Utf-8 stream.

Side by Side Diff: test/cctest/parsing/test-scanner-streams.cc

Issue 2354973002: Handle Utf-8 BOM at beginning of an Utf-8 stream. (Closed)

Patch Set: Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/factory.h" // for i::Factory::NewExternalStringFrom*Byte	5 #include "src/factory.h" // for i::Factory::NewExternalStringFrom*Byte

6 #include "src/objects-inl.h"	6 #include "src/objects-inl.h"

7 #include "src/parsing/scanner-character-streams.h"	7 #include "src/parsing/scanner-character-streams.h"

8 #include "src/parsing/scanner.h"	8 #include "src/parsing/scanner.h"

9 #include "src/type-feedback-vector-inl.h" // for include "src/factory.h"	9 #include "src/type-feedback-vector-inl.h" // for include "src/factory.h"

10 #include "test/cctest/cctest.h"	10 #include "test/cctest/cctest.h"

(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after
102 v8::internal::ScannerStream::For(	102 v8::internal::ScannerStream::For(

103 &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));	103 &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));

104	104

105 // Read the data without dying.	105 // Read the data without dying.

106 v8::internal::uc32 c;	106 v8::internal::uc32 c;

107 do {	107 do {

108 c = stream->Advance();	108 c = stream->Advance();

109 } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);	109 } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);

110 }	110 }

111	111

	112 TEST(Utf8StreamBOM) {

	113 // Construct test string w/ UTF-8 BOM (byte order mark)

	114 char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};

	115 strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));

	116

	117 const char* chunks[] = {data, "\0"};

	118 ChunkSource chunk_source(chunks);

	119 std::unique_ptr<v8::internal::Utf16CharacterStream> stream(

	120 v8::internal::ScannerStream::For(

	121 &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));

	122

	123 // Read the data without tripping over the BOM.

	124 for (size_t i = 0; unicode_ucs2[i]; i++) {

	125 CHECK_EQ(unicode_ucs2[i], stream->Advance());

	126 }

	127 CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());

	128

	129 // Make sure seek works.

	130 stream->Seek(0);

	131 CHECK_EQ(unicode_ucs2[0], stream->Advance());

	132

	133 stream->Seek(5);

	134 CHECK_EQ(unicode_ucs2[5], stream->Advance());

	135 }

	136

	137 TEST(Utf8SplitBOM) {

	138 // Construct chunks with a BOM split into two chunks.

	139 char partial_bom[] = "\xef\xbb";

	140 char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
	marja (2016/09/20 18:52:30): ... how paranoid do we want to be, should there also be a test where it's all in one-byte chunks?
	vogelheim (2016/09/21 07:50:54): On 2016/09/20 18:52:30, marja wrote: "... how paranoid do we want to be, should there also be a test where it's all in one-byte chunks?" Done.
	141 strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));

	142

	143 const char* chunks[] = {partial_bom, data, "\0"};

	144 ChunkSource chunk_source(chunks);

	145 std::unique_ptr<v8::internal::Utf16CharacterStream> stream(

	146 v8::internal::ScannerStream::For(

	147 &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));

	148

	149 // Read the data without tripping over the BOM.

	150 for (size_t i = 0; unicode_ucs2[i]; i++) {

	151 CHECK_EQ(unicode_ucs2[i], stream->Advance());

	152 }

	153 }

	154

112 TEST(Utf8ChunkBoundaries) {	155 TEST(Utf8ChunkBoundaries) {

113 // Test utf-8 parsing at chunk boundaries.	156 // Test utf-8 parsing at chunk boundaries.

114	157

115 // Split the test string at each byte and pass it to the stream. This way,	158 // Split the test string at each byte and pass it to the stream. This way,

116 // we'll have a split at each possible boundary.	159 // we'll have a split at each possible boundary.

117 size_t len = strlen(unicode_utf8);	160 size_t len = strlen(unicode_utf8);

118 char buffer[arraysize(unicode_utf8) + 3];	161 char buffer[arraysize(unicode_utf8) + 3];

119 for (size_t i = 1; i < len; i++) {	162 for (size_t i = 1; i < len; i++) {

120 // Copy source string into buffer, splitting it at i.	163 // Copy source string into buffer, splitting it at i.

121 // Then add three chunks, 0..i-1, i..strlen-1, empty.	164 // Then add three chunks, 0..i-1, i..strlen-1, empty.

(...skipping 233 matching lines...) Expand 10 before | Expand all | Expand 10 after
355	398

356 // 4k large buffer.	399 // 4k large buffer.

357 char buffer[4096 + 1];	400 char buffer[4096 + 1];

358 for (unsigned i = 0; i < arraysize(buffer); i++) {	401 for (unsigned i = 0; i < arraysize(buffer); i++) {

359 buffer[i] = static_cast<char>(i & 0x7F);	402 buffer[i] = static_cast<char>(i & 0x7F);

360 }	403 }

361 buffer[arraysize(buffer) - 1] = '\0';	404 buffer[arraysize(buffer) - 1] = '\0';

362 TestCharacterStreams(buffer, arraysize(buffer) - 1);	405 TestCharacterStreams(buffer, arraysize(buffer) - 1);

363 TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);	406 TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);

364 }	407 }

OLD	NEW

« src/parsing/scanner-character-streams.cc ('K') | « src/parsing/scanner-character-streams.cc ('k') | no next file » | no next file with comments »