Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(380)

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 194613002: Experimental parser: proper utf16 conversion (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 the V8 project authors. All rights reserved. 1 // Copyright 2013 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
45 #include "scanner.h" 45 #include "scanner.h"
46 #include "lexer/lexer.h" 46 #include "lexer/lexer.h"
47 47
48 using namespace v8::internal; 48 using namespace v8::internal;
49 49
50 50
51 enum Encoding { 51 enum Encoding {
52 LATIN1, 52 LATIN1,
53 UTF8, 53 UTF8,
54 UTF16, 54 UTF16,
55 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers. 55 UTF8TO16, // Convert stream via scanner input stream
56 UTF8TO16_PRECONVERT // Convert stream during file read
56 }; 57 };
57 58
58 59
59 struct LexerShellSettings { 60 struct LexerShellSettings {
60 Encoding encoding; 61 Encoding encoding;
61 bool print_tokens; 62 bool print_tokens;
62 bool break_after_illegal; 63 bool break_after_illegal;
63 bool eos_test; 64 bool eos_test;
64 int repeat; 65 int repeat;
65 bool harmony_numeric_literals; 66 bool harmony_numeric_literals;
66 bool harmony_modules; 67 bool harmony_modules;
67 bool harmony_scoping; 68 bool harmony_scoping;
68 LexerShellSettings() 69 LexerShellSettings()
69 : encoding(LATIN1), 70 : encoding(LATIN1),
70 print_tokens(false), 71 print_tokens(false),
71 break_after_illegal(false), 72 break_after_illegal(false),
72 eos_test(false), 73 eos_test(false),
73 repeat(1), 74 repeat(1),
74 harmony_numeric_literals(false), 75 harmony_numeric_literals(false),
75 harmony_modules(false), 76 harmony_modules(false),
76 harmony_scoping(false) {} 77 harmony_scoping(false) {}
77 }; 78 };
78 79
79 80
80 static uint16_t* ReadFile(const char* name, const uint8_t** end, 81 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,
81 const LexerShellSettings& settings) { 82 unsigned* length) {
82 FILE* file = fopen(name, "rb"); 83 const unsigned file_size = *length;
83 CHECK(file != NULL); 84 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);
84 85 const uint32_t kMaxUtf16Character = 0xffff;
85 fseek(file, 0, SEEK_END); 86 // Get utf8 length.
86 unsigned file_size = ftell(file); 87 unsigned utf16_chars = 0;
87 rewind(file); 88 {
88
89 uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
90
91 uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
92 for (unsigned i = 0; i < file_size;) {
93 i += fread(&char_data[i], 1, file_size - i, file);
94 }
95 fclose(file);
96
97 if (settings.encoding == UTF8TO16) {
98 const uint32_t kMaxUtf16Character = 0xffff;
99 // Get utf8 length.
100 unsigned utf16_chars = 0;
101 {
102 unsigned position = 0;
103 while (position < file_size) {
104 uint32_t c = char_data[position];
105 if (c <= unibrow::Utf8::kMaxOneByteChar) {
106 position++;
107 } else {
108 c = unibrow::Utf8::CalculateValue(char_data + position,
109 file_size - position,
110 &position);
111 }
112 if (c > kMaxUtf16Character) {
113 utf16_chars += 2;
114 } else {
115 utf16_chars += 1;
116 }
117 }
118 }
119 // Write new buffer out.
120 uint16_t* data = new uint16_t[utf16_chars];
121 unsigned position = 0; 89 unsigned position = 0;
122 unsigned i = 0;
123 while (position < file_size) { 90 while (position < file_size) {
124 uint32_t c = char_data[position]; 91 uint32_t c = char_data[position];
125 if (c <= unibrow::Utf8::kMaxOneByteChar) { 92 if (c <= unibrow::Utf8::kMaxOneByteChar) {
126 position++; 93 position++;
127 } else { 94 } else {
128 c = unibrow::Utf8::CalculateValue(char_data + position, 95 c = unibrow::Utf8::CalculateValue(char_data + position,
129 file_size - position, 96 file_size - position,
130 &position); 97 &position);
131 } 98 }
132 if (c > kMaxUtf16Character) { 99 if (c > kMaxUtf16Character) {
133 data[i++] = unibrow::Utf16::LeadSurrogate(c); 100 utf16_chars += 2;
134 data[i++] = unibrow::Utf16::TrailSurrogate(c);
135 } else { 101 } else {
136 data[i++] = static_cast<uc16>(c); 102 utf16_chars += 1;
137 } 103 }
138 } 104 }
139 // Swap buffers.
140 delete two_byte_data;
141 file_size = utf16_chars * 2;
142 two_byte_data = data;
143 char_data = reinterpret_cast<uint8_t*>(two_byte_data);
144 } 105 }
145 106 // Write new buffer out.
146 // Duplicate buffer if necessary. 107 uint16_t* data = new uint16_t[utf16_chars];
147 if (settings.repeat > 1) { 108 unsigned position = 0;
148 unsigned size = file_size * settings.repeat; 109 unsigned i = 0;
149 uint16_t* data = new uint16_t[size / 2 + size % 2]; 110 while (position < file_size) {
150 char_data = reinterpret_cast<uint8_t*>(two_byte_data); 111 uint32_t c = char_data[position];
151 for (int i = 0; i < settings.repeat; i++) { 112 if (c <= unibrow::Utf8::kMaxOneByteChar) {
152 memcpy(&char_data[i * file_size], two_byte_data, file_size); 113 position++;
114 } else {
115 c = unibrow::Utf8::CalculateValue(char_data + position,
116 file_size - position,
117 &position);
153 } 118 }
154 delete two_byte_data; 119 if (c > kMaxUtf16Character) {
155 file_size = size; 120 data[i++] = unibrow::Utf16::LeadSurrogate(c);
156 two_byte_data = data; 121 data[i++] = unibrow::Utf16::TrailSurrogate(c);
122 } else {
123 data[i++] = static_cast<uc16>(c);
124 }
157 } 125 }
158 126 *length = 2 * utf16_chars;
159 *end = &char_data[file_size]; 127 return data;
160 return two_byte_data;
161 } 128 }
162 129
163 130
131 static uint16_t* Repeat(int repeat,
132 const uint16_t* const data_in,
133 unsigned* length) {
134 const unsigned file_size = *length;
135 unsigned size = file_size * repeat;
136 uint16_t* data = new uint16_t[size / 2 + size % 2];
137 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
138 for (int i = 0; i < repeat; i++) {
139 memcpy(&char_data[i * file_size], data_in, file_size);
140 }
141 *length = size;
142 return data;
143 }
144
145
146 static uint16_t* ReadFile(const char* name, unsigned* length) {
147 FILE* file = fopen(name, "rb");
148 CHECK(file != NULL);
149 // Get file size.
150 fseek(file, 0, SEEK_END);
151 unsigned file_size = ftell(file);
152 rewind(file);
153 // Read file contents.
154 uint16_t* data = new uint16_t[file_size / 2 + file_size % 2];
155 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
156 for (unsigned i = 0; i < file_size;) {
157 i += fread(&char_data[i], 1, file_size - i, file);
158 }
159 fclose(file);
160 *length = file_size;
161 return data;
162 }
163
164
165 static uint16_t* ReadFile(const char* name,
166 const LexerShellSettings& settings,
167 unsigned* length) {
168 uint16_t* data = ReadFile(name, length);
169 CHECK_GE(*length, 0);
170 if (*length == 0) return data;
171
172 if (settings.encoding == UTF8TO16_PRECONVERT) {
173 uint16_t* new_data = ConvertUtf8ToUtf16(data, length);
174 delete data;
175 data = new_data;
176 }
177
178 if (settings.repeat > 1) {
179 uint16_t* new_data = Repeat(settings.repeat, data, length);
180 delete data;
181 data = new_data;
182 }
183
184 return data;
185 }
186
187
164 struct TokenWithLocation { 188 struct TokenWithLocation {
165 Token::Value value; 189 Token::Value value;
166 size_t beg; 190 size_t beg;
167 size_t end; 191 size_t end;
168 std::vector<int> literal; 192 std::vector<int> literal;
169 bool is_ascii; 193 bool is_ascii;
170 // The location of the latest octal position when the token was seen. 194 // The location of the latest octal position when the token was seen.
171 int octal_beg; 195 int octal_beg;
172 int octal_end; 196 int octal_end;
173 TokenWithLocation() : 197 TokenWithLocation() :
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
236 260
237 static TimeDelta RunLexer(const uint16_t* source, 261 static TimeDelta RunLexer(const uint16_t* source,
238 const uint8_t* source_end, 262 const uint8_t* source_end,
239 Isolate* isolate, 263 Isolate* isolate,
240 std::vector<TokenWithLocation>* tokens, 264 std::vector<TokenWithLocation>* tokens,
241 const LexerShellSettings& settings) { 265 const LexerShellSettings& settings) {
242 SmartPointer<Utf16CharacterStream> stream; 266 SmartPointer<Utf16CharacterStream> stream;
243 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source); 267 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
244 int bytes = source_end - one_byte_source; 268 int bytes = source_end - one_byte_source;
245 switch (settings.encoding) { 269 switch (settings.encoding) {
270 case UTF8TO16:
246 case UTF8: 271 case UTF8:
247 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes)); 272 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
248 break; 273 break;
249 case UTF8TO16: 274 case UTF8TO16_PRECONVERT:
250 case UTF16: { 275 case UTF16: {
251 CHECK_EQ(0, bytes % 2); 276 CHECK_EQ(0, bytes % 2);
252 Handle<String> result = isolate->factory()->NewStringFromTwoByte( 277 Handle<String> result = isolate->factory()->NewStringFromTwoByte(
253 Vector<const uint16_t>(source, bytes / 2)); 278 Vector<const uint16_t>(source, bytes / 2));
254 stream.Reset( 279 stream.Reset(
255 new GenericStringUtf16CharacterStream(result, 0, result->length())); 280 new GenericStringUtf16CharacterStream(result, 0, result->length()));
256 break; 281 break;
257 } 282 }
258 case LATIN1: { 283 case LATIN1: {
259 Handle<String> result = isolate->factory()->NewStringFromOneByte( 284 Handle<String> result = isolate->factory()->NewStringFromOneByte(
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
293 const LexerShellSettings& settings, 318 const LexerShellSettings& settings,
294 int truncate_by, 319 int truncate_by,
295 bool* can_truncate) { 320 bool* can_truncate) {
296 if (settings.print_tokens) { 321 if (settings.print_tokens) {
297 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by); 322 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);
298 } 323 }
299 HandleScope handle_scope(isolate); 324 HandleScope handle_scope(isolate);
300 std::vector<TokenWithLocation> tokens; 325 std::vector<TokenWithLocation> tokens;
301 TimeDelta time; 326 TimeDelta time;
302 { 327 {
303 const uint8_t* buffer_end = 0; 328 unsigned length_in_bytes;
304 const uint16_t* buffer = ReadFile(fname, &buffer_end, settings); 329 const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);
305 if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) { 330 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);
331 const uint8_t* buffer_end = &char_data[length_in_bytes];
332 if (truncate_by > buffer_end - char_data) {
306 *can_truncate = false; 333 *can_truncate = false;
307 } else { 334 } else {
308 buffer_end -= truncate_by; 335 buffer_end -= truncate_by;
309 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings); 336 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);
310 } 337 }
311 delete[] buffer; 338 delete[] buffer;
312 } 339 }
313 if (settings.print_tokens) { 340 if (settings.print_tokens) {
314 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size())); 341 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));
315 for (size_t i = 0; i < tokens.size(); ++i) { 342 for (size_t i = 0; i < tokens.size(); ++i) {
(...skipping 14 matching lines...) Expand all
330 std::vector<std::string> fnames; 357 std::vector<std::string> fnames;
331 LexerShellSettings settings; 358 LexerShellSettings settings;
332 for (int i = 0; i < argc; ++i) { 359 for (int i = 0; i < argc; ++i) {
333 if (strcmp(argv[i], "--latin1") == 0) { 360 if (strcmp(argv[i], "--latin1") == 0) {
334 settings.encoding = LATIN1; 361 settings.encoding = LATIN1;
335 } else if (strcmp(argv[i], "--utf8") == 0) { 362 } else if (strcmp(argv[i], "--utf8") == 0) {
336 settings.encoding = UTF8; 363 settings.encoding = UTF8;
337 } else if (strcmp(argv[i], "--utf16") == 0) { 364 } else if (strcmp(argv[i], "--utf16") == 0) {
338 settings.encoding = UTF16; 365 settings.encoding = UTF16;
339 } else if (strcmp(argv[i], "--utf8to16") == 0) { 366 } else if (strcmp(argv[i], "--utf8to16") == 0) {
367 #ifdef V8_USE_GENERATED_LEXER
368 settings.encoding = UTF8TO16_PRECONVERT;
369 #else
340 settings.encoding = UTF8TO16; 370 settings.encoding = UTF8TO16;
371 #endif
341 } else if (strcmp(argv[i], "--print-tokens") == 0) { 372 } else if (strcmp(argv[i], "--print-tokens") == 0) {
342 settings.print_tokens = true; 373 settings.print_tokens = true;
343 } else if (strcmp(argv[i], "--no-baseline") == 0) { 374 } else if (strcmp(argv[i], "--no-baseline") == 0) {
344 // Ignore. 375 // Ignore.
345 } else if (strcmp(argv[i], "--no-experimental") == 0) { 376 } else if (strcmp(argv[i], "--no-experimental") == 0) {
346 // Ignore. 377 // Ignore.
347 } else if (strcmp(argv[i], "--no-check") == 0) { 378 } else if (strcmp(argv[i], "--no-check") == 0) {
348 // Ignore. 379 // Ignore.
349 } else if (strcmp(argv[i], "--break-after-illegal") == 0) { 380 } else if (strcmp(argv[i], "--break-after-illegal") == 0) {
350 settings.break_after_illegal = true; 381 settings.break_after_illegal = true;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
383 &can_truncate); 414 &can_truncate);
384 total_time += t.InMillisecondsF(); 415 total_time += t.InMillisecondsF();
385 ++truncate_by; 416 ++truncate_by;
386 } while (can_truncate); 417 } while (can_truncate);
387 } 418 }
388 printf("RunTime: %.f ms\n", total_time); 419 printf("RunTime: %.f ms\n", total_time);
389 } 420 }
390 v8::V8::Dispose(); 421 v8::V8::Dispose();
391 return 0; 422 return 0;
392 } 423 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698