Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(87)

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 196943021: Experimental parser: add utf8tolatin1 conversion (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 the V8 project authors. All rights reserved. 1 // Copyright 2013 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
46 #include "lexer/lexer.h" 46 #include "lexer/lexer.h"
47 47
48 using namespace v8::internal; 48 using namespace v8::internal;
49 49
50 50
// How the bytes of an input file are to be interpreted by the lexer shell.
//
// LATIN1, UTF8 and UTF16 are handed to RunLexer as-is. The two UTF8TO*
// values request a conversion that ReadFile performs while loading the file;
// ReadFile then reports the resulting LATIN1 or UTF16 encoding, and RunLexer
// treats a UTF8TO* value reaching it as a fatal error.
enum Encoding {
  LATIN1,
  UTF8,
  UTF16,
  UTF8TO16,     // UTF-8 on disk, converted to UTF-16 during file read
  UTF8TOLATIN1  // UTF-8 on disk, narrowed to one byte per character during
                // file read when the input allows it, otherwise UTF-16
};
58 58
59 59
60 struct LexerShellSettings { 60 struct LexerShellSettings {
61 Encoding encoding; 61 Encoding encoding;
62 bool print_tokens; 62 bool print_tokens;
63 bool print_tokens_for_compare; 63 bool print_tokens_for_compare;
64 bool break_after_illegal; 64 bool break_after_illegal;
65 bool eos_test; 65 bool eos_test;
66 int repeat; 66 int repeat;
67 bool harmony_numeric_literals; 67 bool harmony_numeric_literals;
68 bool harmony_modules; 68 bool harmony_modules;
69 bool harmony_scoping; 69 bool harmony_scoping;
70 LexerShellSettings() 70 LexerShellSettings()
71 : encoding(LATIN1), 71 : encoding(LATIN1),
72 print_tokens(false), 72 print_tokens(false),
73 print_tokens_for_compare(false), 73 print_tokens_for_compare(false),
74 break_after_illegal(false), 74 break_after_illegal(false),
75 eos_test(false), 75 eos_test(false),
76 repeat(1), 76 repeat(1),
77 harmony_numeric_literals(false), 77 harmony_numeric_literals(false),
78 harmony_modules(false), 78 harmony_modules(false),
79 harmony_scoping(false) {} 79 harmony_scoping(false) {}
80 }; 80 };
81 81
82 82
83 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in, 83 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,
84 unsigned* length) { 84 unsigned* length,
85 bool* is_one_byte) {
85 const unsigned file_size = *length; 86 const unsigned file_size = *length;
86 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in); 87 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);
87 const uint32_t kMaxUtf16Character = 0xffff; 88 const uint32_t kMaxUtf16Character = 0xffff;
88 // Get utf8 length. 89 // Get utf8 length.
89 unsigned utf16_chars = 0; 90 unsigned utf16_chars = 0;
91 *is_one_byte = true;
90 { 92 {
91 unsigned position = 0; 93 unsigned position = 0;
92 while (position < file_size) { 94 while (position < file_size) {
93 uint32_t c = char_data[position]; 95 uint32_t c = char_data[position];
94 if (c <= unibrow::Utf8::kMaxOneByteChar) { 96 if (c <= unibrow::Utf8::kMaxOneByteChar) {
95 position++; 97 position++;
96 } else { 98 } else {
99 *is_one_byte = false;
97 c = unibrow::Utf8::CalculateValue(char_data + position, 100 c = unibrow::Utf8::CalculateValue(char_data + position,
98 file_size - position, 101 file_size - position,
99 &position); 102 &position);
100 } 103 }
101 if (c > kMaxUtf16Character) { 104 if (c > kMaxUtf16Character) {
102 utf16_chars += 2; 105 utf16_chars += 2;
103 } else { 106 } else {
104 utf16_chars += 1; 107 utf16_chars += 1;
105 } 108 }
106 } 109 }
(...skipping 16 matching lines...) Expand all
123 data[i++] = unibrow::Utf16::TrailSurrogate(c); 126 data[i++] = unibrow::Utf16::TrailSurrogate(c);
124 } else { 127 } else {
125 data[i++] = static_cast<uc16>(c); 128 data[i++] = static_cast<uc16>(c);
126 } 129 }
127 } 130 }
128 *length = 2 * utf16_chars; 131 *length = 2 * utf16_chars;
129 return data; 132 return data;
130 } 133 }
131 134
132 135
136 static uint16_t* ConvertUtf16ToLatin1(const uint16_t* const data_in,
137 unsigned* length) {
138 const unsigned size = *length / 2 + *length % 2;
139 uint16_t* data = new uint16_t[size];
140 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
141 CopyChars(char_data, data_in, size);
142 *length = size;
143 return data;
144 }
145
146
133 static uint16_t* Repeat(int repeat, 147 static uint16_t* Repeat(int repeat,
134 const uint16_t* const data_in, 148 const uint16_t* const data_in,
135 unsigned* length) { 149 unsigned* length) {
136 const unsigned file_size = *length; 150 const unsigned file_size = *length;
137 unsigned size = file_size * repeat; 151 unsigned size = file_size * repeat;
138 uint16_t* data = new uint16_t[size / 2 + size % 2]; 152 uint16_t* data = new uint16_t[size / 2 + size % 2];
139 uint8_t* char_data = reinterpret_cast<uint8_t*>(data); 153 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
140 for (int i = 0; i < repeat; i++) { 154 for (int i = 0; i < repeat; i++) {
141 memcpy(&char_data[i * file_size], data_in, file_size); 155 memcpy(&char_data[i * file_size], data_in, file_size);
142 } 156 }
(...skipping 16 matching lines...) Expand all
159 i += fread(&char_data[i], 1, file_size - i, file); 173 i += fread(&char_data[i], 1, file_size - i, file);
160 } 174 }
161 fclose(file); 175 fclose(file);
162 *length = file_size; 176 *length = file_size;
163 return data; 177 return data;
164 } 178 }
165 179
166 180
167 static uint16_t* ReadFile(const char* name, 181 static uint16_t* ReadFile(const char* name,
168 const LexerShellSettings& settings, 182 const LexerShellSettings& settings,
169 unsigned* length) { 183 unsigned* length,
184 Encoding* output_encoding) {
170 uint16_t* data = ReadFile(name, length); 185 uint16_t* data = ReadFile(name, length);
171 CHECK_GE(*length, 0); 186 CHECK_GE(*length, 0);
172 if (*length == 0) return data; 187 if (*length == 0) return data;
173 188
174 if (settings.encoding == UTF8TO16_PRECONVERT) { 189 *output_encoding = settings.encoding;
175 uint16_t* new_data = ConvertUtf8ToUtf16(data, length); 190
191 if (settings.encoding == UTF8TO16 ||
192 settings.encoding == UTF8TOLATIN1) {
193 bool is_one_byte;
194 uint16_t* new_data = ConvertUtf8ToUtf16(data, length, &is_one_byte);
195 if (settings.encoding == UTF8TOLATIN1 && is_one_byte) {
196 *output_encoding = LATIN1;
197 } else {
198 *output_encoding = UTF16;
199 }
176 delete data; 200 delete data;
177 data = new_data; 201 data = new_data;
178 } 202 }
203
204 if (settings.encoding == UTF8TOLATIN1 && *output_encoding == LATIN1) {
205 uint16_t* new_data = ConvertUtf16ToLatin1(data, length);
206 delete data;
207 data = new_data;
208 }
179 209
180 if (settings.repeat > 1) { 210 if (settings.repeat > 1) {
181 uint16_t* new_data = Repeat(settings.repeat, data, length); 211 uint16_t* new_data = Repeat(settings.repeat, data, length);
182 delete data; 212 delete data;
183 data = new_data; 213 data = new_data;
184 } 214 }
185 215
186 return data; 216 return data;
187 } 217 }
188 218
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
258 } 288 }
259 289
260 private: 290 private:
261 DISALLOW_COPY_AND_ASSIGN(TokenWithLocation); 291 DISALLOW_COPY_AND_ASSIGN(TokenWithLocation);
262 }; 292 };
263 293
264 294
265 static TimeDelta RunLexer(const uint16_t* source, 295 static TimeDelta RunLexer(const uint16_t* source,
266 const uint8_t* source_end, 296 const uint8_t* source_end,
267 Isolate* isolate, 297 Isolate* isolate,
298 Encoding output_encoding,
268 const LexerShellSettings& settings) { 299 const LexerShellSettings& settings) {
269 SmartPointer<Utf16CharacterStream> stream; 300 SmartPointer<Utf16CharacterStream> stream;
270 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source); 301 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
271 int bytes = source_end - one_byte_source; 302 int bytes = source_end - one_byte_source;
272 switch (settings.encoding) { 303 switch (output_encoding) {
273 case UTF8TO16:
274 case UTF8: 304 case UTF8:
275 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes)); 305 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
276 break; 306 break;
277 case UTF8TO16_PRECONVERT:
278 case UTF16: { 307 case UTF16: {
279 CHECK_EQ(0, bytes % 2); 308 CHECK_EQ(0, bytes % 2);
280 Handle<String> result = isolate->factory()->NewStringFromTwoByte( 309 Handle<String> result = isolate->factory()->NewStringFromTwoByte(
281 Vector<const uint16_t>(source, bytes / 2)); 310 Vector<const uint16_t>(source, bytes / 2));
282 stream.Reset( 311 stream.Reset(
283 new GenericStringUtf16CharacterStream(result, 0, result->length())); 312 new GenericStringUtf16CharacterStream(result, 0, result->length()));
284 break; 313 break;
285 } 314 }
286 case LATIN1: { 315 case LATIN1: {
287 Handle<String> result = isolate->factory()->NewStringFromOneByte( 316 Handle<String> result = isolate->factory()->NewStringFromOneByte(
288 Vector<const uint8_t>(one_byte_source, bytes)); 317 Vector<const uint8_t>(one_byte_source, bytes));
289 stream.Reset( 318 stream.Reset(
290 new GenericStringUtf16CharacterStream(result, 0, result->length())); 319 new GenericStringUtf16CharacterStream(result, 0, result->length()));
291 break; 320 break;
292 } 321 }
322 case UTF8TO16:
323 case UTF8TOLATIN1:
324 CHECK(false);
293 } 325 }
294 Scanner scanner(isolate->unicode_cache()); 326 Scanner scanner(isolate->unicode_cache());
295 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals); 327 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);
296 scanner.SetHarmonyModules(settings.harmony_modules); 328 scanner.SetHarmonyModules(settings.harmony_modules);
297 scanner.SetHarmonyScoping(settings.harmony_scoping); 329 scanner.SetHarmonyScoping(settings.harmony_scoping);
298 ElapsedTimer timer; 330 ElapsedTimer timer;
299 std::vector<TokenWithLocation*> tokens; 331 std::vector<TokenWithLocation*> tokens;
300 timer.Start(); 332 timer.Start();
301 scanner.Initialize(stream.get()); 333 scanner.Initialize(stream.get());
302 Token::Value token; 334 Token::Value token;
(...skipping 30 matching lines...) Expand all
333 const LexerShellSettings& settings, 365 const LexerShellSettings& settings,
334 int truncate_by, 366 int truncate_by,
335 bool* can_truncate) { 367 bool* can_truncate) {
336 if (settings.print_tokens && !settings.print_tokens_for_compare) { 368 if (settings.print_tokens && !settings.print_tokens_for_compare) {
337 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by); 369 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);
338 } 370 }
339 HandleScope handle_scope(isolate); 371 HandleScope handle_scope(isolate);
340 TimeDelta time; 372 TimeDelta time;
341 { 373 {
342 unsigned length_in_bytes; 374 unsigned length_in_bytes;
343 const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes); 375 Encoding output_encoding;
376 const uint16_t* buffer =
377 ReadFile(fname, settings, &length_in_bytes, &output_encoding);
344 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer); 378 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);
345 const uint8_t* buffer_end = &char_data[length_in_bytes]; 379 const uint8_t* buffer_end = &char_data[length_in_bytes];
346 if (truncate_by > buffer_end - char_data) { 380 if (truncate_by > buffer_end - char_data) {
347 *can_truncate = false; 381 *can_truncate = false;
348 } else { 382 } else {
349 buffer_end -= truncate_by; 383 buffer_end -= truncate_by;
350 time = RunLexer(buffer, buffer_end, isolate, settings); 384 time = RunLexer(buffer, buffer_end, isolate, output_encoding, settings);
351 } 385 }
352 delete[] buffer; 386 delete[] buffer;
353 } 387 }
354 388
355 return time; 389 return time;
356 } 390 }
357 391
358 392
359 int main(int argc, char* argv[]) { 393 int main(int argc, char* argv[]) {
360 v8::V8::InitializeICU(); 394 v8::V8::InitializeICU();
361 v8::V8::SetFlagsFromCommandLine(&argc, argv, true); 395 v8::V8::SetFlagsFromCommandLine(&argc, argv, true);
362 std::vector<std::string> fnames; 396 std::vector<std::string> fnames;
363 LexerShellSettings settings; 397 LexerShellSettings settings;
364 for (int i = 0; i < argc; ++i) { 398 for (int i = 0; i < argc; ++i) {
365 if (strcmp(argv[i], "--latin1") == 0) { 399 if (strcmp(argv[i], "--latin1") == 0) {
366 settings.encoding = LATIN1; 400 settings.encoding = LATIN1;
367 } else if (strcmp(argv[i], "--utf8") == 0) { 401 } else if (strcmp(argv[i], "--utf8") == 0) {
368 settings.encoding = UTF8; 402 settings.encoding = UTF8;
369 } else if (strcmp(argv[i], "--utf16") == 0) { 403 } else if (strcmp(argv[i], "--utf16") == 0) {
370 settings.encoding = UTF16; 404 settings.encoding = UTF16;
371 } else if (strcmp(argv[i], "--utf8to16") == 0) { 405 } else if (strcmp(argv[i], "--utf8to16") == 0) {
372 #ifdef V8_USE_GENERATED_LEXER 406 #ifdef V8_USE_GENERATED_LEXER
373 settings.encoding = UTF8TO16_PRECONVERT; 407 settings.encoding = UTF8TO16;
374 #else 408 #else
375 settings.encoding = UTF8TO16; 409 settings.encoding = UTF8;
410 #endif
411 } else if (strcmp(argv[i], "--utf8tolatin1") == 0) {
412 #ifdef V8_USE_GENERATED_LEXER
413 settings.encoding = UTF8TOLATIN1;
414 #else
415 settings.encoding = UTF8;
376 #endif 416 #endif
377 } else if (strcmp(argv[i], "--print-tokens") == 0) { 417 } else if (strcmp(argv[i], "--print-tokens") == 0) {
378 settings.print_tokens = true; 418 settings.print_tokens = true;
379 } else if (strcmp(argv[i], "--print-tokens-for-compare") == 0) { 419 } else if (strcmp(argv[i], "--print-tokens-for-compare") == 0) {
380 settings.print_tokens = true; 420 settings.print_tokens = true;
381 settings.print_tokens_for_compare = true; 421 settings.print_tokens_for_compare = true;
382 } else if (strcmp(argv[i], "--no-baseline") == 0) { 422 } else if (strcmp(argv[i], "--no-baseline") == 0) {
383 // Ignore. 423 // Ignore.
384 } else if (strcmp(argv[i], "--no-experimental") == 0) { 424 } else if (strcmp(argv[i], "--no-experimental") == 0) {
385 // Ignore. 425 // Ignore.
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
424 ++truncate_by; 464 ++truncate_by;
425 } while (can_truncate); 465 } while (can_truncate);
426 } 466 }
427 if (!settings.print_tokens_for_compare) { 467 if (!settings.print_tokens_for_compare) {
428 printf("RunTime: %.f ms\n", total_time); 468 printf("RunTime: %.f ms\n", total_time);
429 } 469 }
430 } 470 }
431 v8::V8::Dispose(); 471 v8::V8::Dispose();
432 return 0; 472 return 0;
433 } 473 }
OLDNEW
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698