src/lexer/lexer-shell.cc - Issue 196943021: Experimental parser: add utf8tolatin1 conversion

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 196943021: Experimental parser: add utf8tolatin1 conversion (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2013 the V8 project authors. All rights reserved.	1 // Copyright 2013 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
46 #include "lexer/lexer.h"	46 #include "lexer/lexer.h"

47	47

48 using namespace v8::internal;	48 using namespace v8::internal;

49	49

50	50

51 enum Encoding {	51 enum Encoding {

52 LATIN1,	52 LATIN1,

53 UTF8,	53 UTF8,

54 UTF16,	54 UTF16,

55 UTF8TO16, // Convert stream via scanner input stream	55 UTF8TO16, // Convert stream via scanner input stream

56 UTF8TO16_PRECONVERT // Convert stream during file read	56 UTF8TOLATIN1, // Convert stream via scanner input stream

57 };	57 };

58	58

59	59

60 struct LexerShellSettings {	60 struct LexerShellSettings {

61 Encoding encoding;	61 Encoding encoding;

62 bool print_tokens;	62 bool print_tokens;

63 bool print_tokens_for_compare;	63 bool print_tokens_for_compare;

64 bool break_after_illegal;	64 bool break_after_illegal;

65 bool eos_test;	65 bool eos_test;

66 int repeat;	66 int repeat;

67 bool harmony_numeric_literals;	67 bool harmony_numeric_literals;

68 bool harmony_modules;	68 bool harmony_modules;

69 bool harmony_scoping;	69 bool harmony_scoping;

70 LexerShellSettings()	70 LexerShellSettings()

71 : encoding(LATIN1),	71 : encoding(LATIN1),

72 print_tokens(false),	72 print_tokens(false),

73 print_tokens_for_compare(false),	73 print_tokens_for_compare(false),

74 break_after_illegal(false),	74 break_after_illegal(false),

75 eos_test(false),	75 eos_test(false),

76 repeat(1),	76 repeat(1),

77 harmony_numeric_literals(false),	77 harmony_numeric_literals(false),

78 harmony_modules(false),	78 harmony_modules(false),

79 harmony_scoping(false) {}	79 harmony_scoping(false) {}

80 };	80 };

81	81

82	82

83 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,	83 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

84 unsigned* length) {	84 unsigned* length,

	85 bool* is_one_byte) {

85 const unsigned file_size = *length;	86 const unsigned file_size = *length;

86 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);	87 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);

87 const uint32_t kMaxUtf16Character = 0xffff;	88 const uint32_t kMaxUtf16Character = 0xffff;

88 // Get utf8 length.	89 // Get utf8 length.

89 unsigned utf16_chars = 0;	90 unsigned utf16_chars = 0;

	91 *is_one_byte = true;

90 {	92 {

91 unsigned position = 0;	93 unsigned position = 0;

92 while (position < file_size) {	94 while (position < file_size) {

93 uint32_t c = char_data[position];	95 uint32_t c = char_data[position];

94 if (c <= unibrow::Utf8::kMaxOneByteChar) {	96 if (c <= unibrow::Utf8::kMaxOneByteChar) {

95 position++;	97 position++;

96 } else {	98 } else {

	99 *is_one_byte = false;

97 c = unibrow::Utf8::CalculateValue(char_data + position,	100 c = unibrow::Utf8::CalculateValue(char_data + position,

98 file_size - position,	101 file_size - position,

99 &position);	102 &position);

100 }	103 }

101 if (c > kMaxUtf16Character) {	104 if (c > kMaxUtf16Character) {

102 utf16_chars += 2;	105 utf16_chars += 2;

103 } else {	106 } else {

104 utf16_chars += 1;	107 utf16_chars += 1;

105 }	108 }

106 }	109 }

(...skipping 16 matching lines...) Expand all Loading...
123 data[i++] = unibrow::Utf16::TrailSurrogate(c);	126 data[i++] = unibrow::Utf16::TrailSurrogate(c);

124 } else {	127 } else {

125 data[i++] = static_cast<uc16>(c);	128 data[i++] = static_cast<uc16>(c);

126 }	129 }

127 }	130 }

128 *length = 2 * utf16_chars;	131 *length = 2 * utf16_chars;

129 return data;	132 return data;

130 }	133 }

131	134

132	135

	136 static uint16_t* ConvertUtf16ToLatin1(const uint16_t* const data_in,

	137 unsigned* length) {

	138 const unsigned size = *length / 2 + *length % 2;

	139 uint16_t* data = new uint16_t[size];

	140 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);

	141 CopyChars(char_data, data_in, size);

	142 *length = size;

	143 return data;

	144 }

	145

	146

133 static uint16_t* Repeat(int repeat,	147 static uint16_t* Repeat(int repeat,

134 const uint16_t* const data_in,	148 const uint16_t* const data_in,

135 unsigned* length) {	149 unsigned* length) {

136 const unsigned file_size = *length;	150 const unsigned file_size = *length;

137 unsigned size = file_size * repeat;	151 unsigned size = file_size * repeat;

138 uint16_t* data = new uint16_t[size / 2 + size % 2];	152 uint16_t* data = new uint16_t[size / 2 + size % 2];

139 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);	153 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);

140 for (int i = 0; i < repeat; i++) {	154 for (int i = 0; i < repeat; i++) {

141 memcpy(&char_data[i * file_size], data_in, file_size);	155 memcpy(&char_data[i * file_size], data_in, file_size);

142 }	156 }

(...skipping 16 matching lines...) Expand all Loading...
159 i += fread(&char_data[i], 1, file_size - i, file);	173 i += fread(&char_data[i], 1, file_size - i, file);

160 }	174 }

161 fclose(file);	175 fclose(file);

162 *length = file_size;	176 *length = file_size;

163 return data;	177 return data;

164 }	178 }

165	179

166	180

167 static uint16_t* ReadFile(const char* name,	181 static uint16_t* ReadFile(const char* name,

168 const LexerShellSettings& settings,	182 const LexerShellSettings& settings,

169 unsigned* length) {	183 unsigned* length,

	184 Encoding* output_encoding) {

170 uint16_t* data = ReadFile(name, length);	185 uint16_t* data = ReadFile(name, length);

171 CHECK_GE(*length, 0);	186 CHECK_GE(*length, 0);

172 if (*length == 0) return data;	187 if (*length == 0) return data;

173	188

174 if (settings.encoding == UTF8TO16_PRECONVERT) {	189 *output_encoding = settings.encoding;

175 uint16_t* new_data = ConvertUtf8ToUtf16(data, length);	190

	191 if (settings.encoding == UTF8TO16 ||

	192 settings.encoding == UTF8TOLATIN1) {

	193 bool is_one_byte;

	194 uint16_t* new_data = ConvertUtf8ToUtf16(data, length, &is_one_byte);

	195 if (settings.encoding == UTF8TOLATIN1 && is_one_byte) {

	196 *output_encoding = LATIN1;

	197 } else {

	198 *output_encoding = UTF16;

	199 }

176 delete data;	200 delete data;

177 data = new_data;	201 data = new_data;

178 }	202 }

	203

	204 if (settings.encoding == UTF8TOLATIN1 && *output_encoding == LATIN1) {

	205 uint16_t* new_data = ConvertUtf16ToLatin1(data, length);

	206 delete data;

	207 data = new_data;

	208 }

179	209

180 if (settings.repeat > 1) {	210 if (settings.repeat > 1) {

181 uint16_t* new_data = Repeat(settings.repeat, data, length);	211 uint16_t* new_data = Repeat(settings.repeat, data, length);

182 delete data;	212 delete data;

183 data = new_data;	213 data = new_data;

184 }	214 }

185	215

186 return data;	216 return data;

187 }	217 }

188	218

(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
258 }	288 }

259	289

260 private:	290 private:

261 DISALLOW_COPY_AND_ASSIGN(TokenWithLocation);	291 DISALLOW_COPY_AND_ASSIGN(TokenWithLocation);

262 };	292 };

263	293

264	294

265 static TimeDelta RunLexer(const uint16_t* source,	295 static TimeDelta RunLexer(const uint16_t* source,

266 const uint8_t* source_end,	296 const uint8_t* source_end,

267 Isolate* isolate,	297 Isolate* isolate,

	298 Encoding output_encoding,

268 const LexerShellSettings& settings) {	299 const LexerShellSettings& settings) {

269 SmartPointer<Utf16CharacterStream> stream;	300 SmartPointer<Utf16CharacterStream> stream;

270 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);	301 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

271 int bytes = source_end - one_byte_source;	302 int bytes = source_end - one_byte_source;

272 switch (settings.encoding) {	303 switch (output_encoding) {

273 case UTF8TO16:

274 case UTF8:	304 case UTF8:

275 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));	305 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));

276 break;	306 break;

277 case UTF8TO16_PRECONVERT:

278 case UTF16: {	307 case UTF16: {

279 CHECK_EQ(0, bytes % 2);	308 CHECK_EQ(0, bytes % 2);

280 Handle<String> result = isolate->factory()->NewStringFromTwoByte(	309 Handle<String> result = isolate->factory()->NewStringFromTwoByte(

281 Vector<const uint16_t>(source, bytes / 2));	310 Vector<const uint16_t>(source, bytes / 2));

282 stream.Reset(	311 stream.Reset(

283 new GenericStringUtf16CharacterStream(result, 0, result->length()));	312 new GenericStringUtf16CharacterStream(result, 0, result->length()));

284 break;	313 break;

285 }	314 }

286 case LATIN1: {	315 case LATIN1: {

287 Handle<String> result = isolate->factory()->NewStringFromOneByte(	316 Handle<String> result = isolate->factory()->NewStringFromOneByte(

288 Vector<const uint8_t>(one_byte_source, bytes));	317 Vector<const uint8_t>(one_byte_source, bytes));

289 stream.Reset(	318 stream.Reset(

290 new GenericStringUtf16CharacterStream(result, 0, result->length()));	319 new GenericStringUtf16CharacterStream(result, 0, result->length()));

291 break;	320 break;

292 }	321 }

	322 case UTF8TO16:

	323 case UTF8TOLATIN1:

	324 CHECK(false);

293 }	325 }

294 Scanner scanner(isolate->unicode_cache());	326 Scanner scanner(isolate->unicode_cache());

295 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);	327 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);

296 scanner.SetHarmonyModules(settings.harmony_modules);	328 scanner.SetHarmonyModules(settings.harmony_modules);

297 scanner.SetHarmonyScoping(settings.harmony_scoping);	329 scanner.SetHarmonyScoping(settings.harmony_scoping);

298 ElapsedTimer timer;	330 ElapsedTimer timer;

299 std::vector<TokenWithLocation*> tokens;	331 std::vector<TokenWithLocation*> tokens;

300 timer.Start();	332 timer.Start();

301 scanner.Initialize(stream.get());	333 scanner.Initialize(stream.get());

302 Token::Value token;	334 Token::Value token;

(...skipping 30 matching lines...) Expand all Loading...
333 const LexerShellSettings& settings,	365 const LexerShellSettings& settings,

334 int truncate_by,	366 int truncate_by,

335 bool* can_truncate) {	367 bool* can_truncate) {

336 if (settings.print_tokens && !settings.print_tokens_for_compare) {	368 if (settings.print_tokens && !settings.print_tokens_for_compare) {

337 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);	369 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);

338 }	370 }

339 HandleScope handle_scope(isolate);	371 HandleScope handle_scope(isolate);

340 TimeDelta time;	372 TimeDelta time;

341 {	373 {

342 unsigned length_in_bytes;	374 unsigned length_in_bytes;

343 const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);	375 Encoding output_encoding;

	376 const uint16_t* buffer =

	377 ReadFile(fname, settings, &length_in_bytes, &output_encoding);

344 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);	378 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);

345 const uint8_t* buffer_end = &char_data[length_in_bytes];	379 const uint8_t* buffer_end = &char_data[length_in_bytes];

346 if (truncate_by > buffer_end - char_data) {	380 if (truncate_by > buffer_end - char_data) {

347 *can_truncate = false;	381 *can_truncate = false;

348 } else {	382 } else {

349 buffer_end -= truncate_by;	383 buffer_end -= truncate_by;

350 time = RunLexer(buffer, buffer_end, isolate, settings);	384 time = RunLexer(buffer, buffer_end, isolate, output_encoding, settings);

351 }	385 }

352 delete[] buffer;	386 delete[] buffer;

353 }	387 }

354	388

355 return time;	389 return time;

356 }	390 }

357	391

358	392

359 int main(int argc, char* argv[]) {	393 int main(int argc, char* argv[]) {

360 v8::V8::InitializeICU();	394 v8::V8::InitializeICU();

361 v8::V8::SetFlagsFromCommandLine(&argc, argv, true);	395 v8::V8::SetFlagsFromCommandLine(&argc, argv, true);

362 std::vector<std::string> fnames;	396 std::vector<std::string> fnames;

363 LexerShellSettings settings;	397 LexerShellSettings settings;

364 for (int i = 0; i < argc; ++i) {	398 for (int i = 0; i < argc; ++i) {

365 if (strcmp(argv[i], "--latin1") == 0) {	399 if (strcmp(argv[i], "--latin1") == 0) {

366 settings.encoding = LATIN1;	400 settings.encoding = LATIN1;

367 } else if (strcmp(argv[i], "--utf8") == 0) {	401 } else if (strcmp(argv[i], "--utf8") == 0) {

368 settings.encoding = UTF8;	402 settings.encoding = UTF8;

369 } else if (strcmp(argv[i], "--utf16") == 0) {	403 } else if (strcmp(argv[i], "--utf16") == 0) {

370 settings.encoding = UTF16;	404 settings.encoding = UTF16;

371 } else if (strcmp(argv[i], "--utf8to16") == 0) {	405 } else if (strcmp(argv[i], "--utf8to16") == 0) {

372 #ifdef V8_USE_GENERATED_LEXER	406 #ifdef V8_USE_GENERATED_LEXER

373 settings.encoding = UTF8TO16_PRECONVERT;	407 settings.encoding = UTF8TO16;

374 #else	408 #else

375 settings.encoding = UTF8TO16;	409 settings.encoding = UTF8;

	410 #endif

	411 } else if (strcmp(argv[i], "--utf8tolatin1") == 0) {

	412 #ifdef V8_USE_GENERATED_LEXER

	413 settings.encoding = UTF8TOLATIN1;

	414 #else

	415 settings.encoding = UTF8;

376 #endif	416 #endif

377 } else if (strcmp(argv[i], "--print-tokens") == 0) {	417 } else if (strcmp(argv[i], "--print-tokens") == 0) {

378 settings.print_tokens = true;	418 settings.print_tokens = true;

379 } else if (strcmp(argv[i], "--print-tokens-for-compare") == 0) {	419 } else if (strcmp(argv[i], "--print-tokens-for-compare") == 0) {

380 settings.print_tokens = true;	420 settings.print_tokens = true;

381 settings.print_tokens_for_compare = true;	421 settings.print_tokens_for_compare = true;

382 } else if (strcmp(argv[i], "--no-baseline") == 0) {	422 } else if (strcmp(argv[i], "--no-baseline") == 0) {

383 // Ignore.	423 // Ignore.

384 } else if (strcmp(argv[i], "--no-experimental") == 0) {	424 } else if (strcmp(argv[i], "--no-experimental") == 0) {

385 // Ignore.	425 // Ignore.

(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
424 ++truncate_by;	464 ++truncate_by;

425 } while (can_truncate);	465 } while (can_truncate);

426 }	466 }

427 if (!settings.print_tokens_for_compare) {	467 if (!settings.print_tokens_for_compare) {

428 printf("RunTime: %.f ms\n", total_time);	468 printf("RunTime: %.f ms\n", total_time);

429 }	469 }

430 }	470 }

431 v8::V8::Dispose();	471 v8::V8::Dispose();

432 return 0;	472 return 0;

433 }	473 }

OLD	NEW

« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »