| OLD | NEW |
| 1 // Copyright 2013 the V8 project authors. All rights reserved. | 1 // Copyright 2013 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 45 #include "scanner.h" | 45 #include "scanner.h" |
| 46 #include "lexer/lexer.h" | 46 #include "lexer/lexer.h" |
| 47 | 47 |
| 48 using namespace v8::internal; | 48 using namespace v8::internal; |
| 49 | 49 |
| 50 | 50 |
// Source encodings the shell can feed to the lexer.
enum Encoding {
  // Input is handed to the lexer as a one-byte string.
  LATIN1,
  // Input is UTF-8 and is read through a UTF-8 -> UTF-16 character stream.
  UTF8,
  // Input is already two-byte data.
  UTF16,
  // Convert stream via scanner input stream.
  UTF8TO16,
  // Convert stream during file read; afterwards the buffer is treated
  // like UTF16 input.
  UTF8TO16_PRECONVERT
};
| 57 | 58 |
| 58 | 59 |
| 59 struct LexerShellSettings { | 60 struct LexerShellSettings { |
| 60 Encoding encoding; | 61 Encoding encoding; |
| 61 bool print_tokens; | 62 bool print_tokens; |
| 62 bool break_after_illegal; | 63 bool break_after_illegal; |
| 63 bool eos_test; | 64 bool eos_test; |
| 64 int repeat; | 65 int repeat; |
| 65 bool harmony_numeric_literals; | 66 bool harmony_numeric_literals; |
| 66 bool harmony_modules; | 67 bool harmony_modules; |
| 67 bool harmony_scoping; | 68 bool harmony_scoping; |
| 68 LexerShellSettings() | 69 LexerShellSettings() |
| 69 : encoding(LATIN1), | 70 : encoding(LATIN1), |
| 70 print_tokens(false), | 71 print_tokens(false), |
| 71 break_after_illegal(false), | 72 break_after_illegal(false), |
| 72 eos_test(false), | 73 eos_test(false), |
| 73 repeat(1), | 74 repeat(1), |
| 74 harmony_numeric_literals(false), | 75 harmony_numeric_literals(false), |
| 75 harmony_modules(false), | 76 harmony_modules(false), |
| 76 harmony_scoping(false) {} | 77 harmony_scoping(false) {} |
| 77 }; | 78 }; |
| 78 | 79 |
| 79 | 80 |
| 80 static uint16_t* ReadFile(const char* name, const uint8_t** end, | 81 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in, |
| 81 const LexerShellSettings& settings) { | 82 unsigned* length) { |
| 82 FILE* file = fopen(name, "rb"); | 83 const unsigned file_size = *length; |
| 83 CHECK(file != NULL); | 84 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in); |
| 84 | 85 const uint32_t kMaxUtf16Character = 0xffff; |
| 85 fseek(file, 0, SEEK_END); | 86 // Get utf8 length. |
| 86 unsigned file_size = ftell(file); | 87 unsigned utf16_chars = 0; |
| 87 rewind(file); | 88 { |
| 88 | |
| 89 uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2]; | |
| 90 | |
| 91 uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data); | |
| 92 for (unsigned i = 0; i < file_size;) { | |
| 93 i += fread(&char_data[i], 1, file_size - i, file); | |
| 94 } | |
| 95 fclose(file); | |
| 96 | |
| 97 if (settings.encoding == UTF8TO16) { | |
| 98 const uint32_t kMaxUtf16Character = 0xffff; | |
| 99 // Get utf8 length. | |
| 100 unsigned utf16_chars = 0; | |
| 101 { | |
| 102 unsigned position = 0; | |
| 103 while (position < file_size) { | |
| 104 uint32_t c = char_data[position]; | |
| 105 if (c <= unibrow::Utf8::kMaxOneByteChar) { | |
| 106 position++; | |
| 107 } else { | |
| 108 c = unibrow::Utf8::CalculateValue(char_data + position, | |
| 109 file_size - position, | |
| 110 &position); | |
| 111 } | |
| 112 if (c > kMaxUtf16Character) { | |
| 113 utf16_chars += 2; | |
| 114 } else { | |
| 115 utf16_chars += 1; | |
| 116 } | |
| 117 } | |
| 118 } | |
| 119 // Write new buffer out. | |
| 120 uint16_t* data = new uint16_t[utf16_chars]; | |
| 121 unsigned position = 0; | 89 unsigned position = 0; |
| 122 unsigned i = 0; | |
| 123 while (position < file_size) { | 90 while (position < file_size) { |
| 124 uint32_t c = char_data[position]; | 91 uint32_t c = char_data[position]; |
| 125 if (c <= unibrow::Utf8::kMaxOneByteChar) { | 92 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 126 position++; | 93 position++; |
| 127 } else { | 94 } else { |
| 128 c = unibrow::Utf8::CalculateValue(char_data + position, | 95 c = unibrow::Utf8::CalculateValue(char_data + position, |
| 129 file_size - position, | 96 file_size - position, |
| 130 &position); | 97 &position); |
| 131 } | 98 } |
| 132 if (c > kMaxUtf16Character) { | 99 if (c > kMaxUtf16Character) { |
| 133 data[i++] = unibrow::Utf16::LeadSurrogate(c); | 100 utf16_chars += 2; |
| 134 data[i++] = unibrow::Utf16::TrailSurrogate(c); | |
| 135 } else { | 101 } else { |
| 136 data[i++] = static_cast<uc16>(c); | 102 utf16_chars += 1; |
| 137 } | 103 } |
| 138 } | 104 } |
| 139 // Swap buffers. | |
| 140 delete two_byte_data; | |
| 141 file_size = utf16_chars * 2; | |
| 142 two_byte_data = data; | |
| 143 char_data = reinterpret_cast<uint8_t*>(two_byte_data); | |
| 144 } | 105 } |
| 145 | 106 // Write new buffer out. |
| 146 // Duplicate buffer if necessary. | 107 uint16_t* data = new uint16_t[utf16_chars]; |
| 147 if (settings.repeat > 1) { | 108 unsigned position = 0; |
| 148 unsigned size = file_size * settings.repeat; | 109 unsigned i = 0; |
| 149 uint16_t* data = new uint16_t[size / 2 + size % 2]; | 110 while (position < file_size) { |
| 150 char_data = reinterpret_cast<uint8_t*>(two_byte_data); | 111 uint32_t c = char_data[position]; |
| 151 for (int i = 0; i < settings.repeat; i++) { | 112 if (c <= unibrow::Utf8::kMaxOneByteChar) { |
| 152 memcpy(&char_data[i * file_size], two_byte_data, file_size); | 113 position++; |
| 114 } else { |
| 115 c = unibrow::Utf8::CalculateValue(char_data + position, |
| 116 file_size - position, |
| 117 &position); |
| 153 } | 118 } |
| 154 delete two_byte_data; | 119 if (c > kMaxUtf16Character) { |
| 155 file_size = size; | 120 data[i++] = unibrow::Utf16::LeadSurrogate(c); |
| 156 two_byte_data = data; | 121 data[i++] = unibrow::Utf16::TrailSurrogate(c); |
| 122 } else { |
| 123 data[i++] = static_cast<uc16>(c); |
| 124 } |
| 157 } | 125 } |
| 158 | 126 *length = 2 * utf16_chars; |
| 159 *end = &char_data[file_size]; | 127 return data; |
| 160 return two_byte_data; | |
| 161 } | 128 } |
| 162 | 129 |
| 163 | 130 |
// Builds a new buffer holding `repeat` back-to-back copies of the input.
//
// repeat:  number of copies to concatenate.
// data_in: source buffer; left untouched.
// length:  in: size of data_in in bytes; out: size of the result in bytes.
//
// Returns a new[]-allocated array owned by the caller.
static uint16_t* Repeat(int repeat,
                        const uint16_t* const data_in,
                        unsigned* length) {
  const unsigned chunk_bytes = *length;
  const unsigned total_bytes = chunk_bytes * repeat;
  // Round up to whole uint16_t elements so an odd byte count still fits.
  uint16_t* result = new uint16_t[total_bytes / 2 + total_bytes % 2];
  uint8_t* write_pos = reinterpret_cast<uint8_t*>(result);
  for (int copies_left = repeat; copies_left > 0; copies_left--) {
    memcpy(write_pos, data_in, chunk_bytes);
    write_pos += chunk_bytes;
  }
  *length = total_bytes;
  return result;
}
| 144 |
| 145 |
| 146 static uint16_t* ReadFile(const char* name, unsigned* length) { |
| 147 FILE* file = fopen(name, "rb"); |
| 148 CHECK(file != NULL); |
| 149 // Get file size. |
| 150 fseek(file, 0, SEEK_END); |
| 151 unsigned file_size = ftell(file); |
| 152 rewind(file); |
| 153 // Read file contents. |
| 154 uint16_t* data = new uint16_t[file_size / 2 + file_size % 2]; |
| 155 uint8_t* char_data = reinterpret_cast<uint8_t*>(data); |
| 156 for (unsigned i = 0; i < file_size;) { |
| 157 i += fread(&char_data[i], 1, file_size - i, file); |
| 158 } |
| 159 fclose(file); |
| 160 *length = file_size; |
| 161 return data; |
| 162 } |
| 163 |
| 164 |
| 165 static uint16_t* ReadFile(const char* name, |
| 166 const LexerShellSettings& settings, |
| 167 unsigned* length) { |
| 168 uint16_t* data = ReadFile(name, length); |
| 169 CHECK_GE(*length, 0); |
| 170 if (*length == 0) return data; |
| 171 |
| 172 if (settings.encoding == UTF8TO16_PRECONVERT) { |
| 173 uint16_t* new_data = ConvertUtf8ToUtf16(data, length); |
| 174 delete data; |
| 175 data = new_data; |
| 176 } |
| 177 |
| 178 if (settings.repeat > 1) { |
| 179 uint16_t* new_data = Repeat(settings.repeat, data, length); |
| 180 delete data; |
| 181 data = new_data; |
| 182 } |
| 183 |
| 184 return data; |
| 185 } |
| 186 |
| 187 |
| 164 struct TokenWithLocation { | 188 struct TokenWithLocation { |
| 165 Token::Value value; | 189 Token::Value value; |
| 166 size_t beg; | 190 size_t beg; |
| 167 size_t end; | 191 size_t end; |
| 168 std::vector<int> literal; | 192 std::vector<int> literal; |
| 169 bool is_ascii; | 193 bool is_ascii; |
| 170 // The location of the latest octal position when the token was seen. | 194 // The location of the latest octal position when the token was seen. |
| 171 int octal_beg; | 195 int octal_beg; |
| 172 int octal_end; | 196 int octal_end; |
| 173 TokenWithLocation() : | 197 TokenWithLocation() : |
| (...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 236 | 260 |
| 237 static TimeDelta RunLexer(const uint16_t* source, | 261 static TimeDelta RunLexer(const uint16_t* source, |
| 238 const uint8_t* source_end, | 262 const uint8_t* source_end, |
| 239 Isolate* isolate, | 263 Isolate* isolate, |
| 240 std::vector<TokenWithLocation>* tokens, | 264 std::vector<TokenWithLocation>* tokens, |
| 241 const LexerShellSettings& settings) { | 265 const LexerShellSettings& settings) { |
| 242 SmartPointer<Utf16CharacterStream> stream; | 266 SmartPointer<Utf16CharacterStream> stream; |
| 243 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source); | 267 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source); |
| 244 int bytes = source_end - one_byte_source; | 268 int bytes = source_end - one_byte_source; |
| 245 switch (settings.encoding) { | 269 switch (settings.encoding) { |
| 270 case UTF8TO16: |
| 246 case UTF8: | 271 case UTF8: |
| 247 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes)); | 272 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes)); |
| 248 break; | 273 break; |
| 249 case UTF8TO16: | 274 case UTF8TO16_PRECONVERT: |
| 250 case UTF16: { | 275 case UTF16: { |
| 251 CHECK_EQ(0, bytes % 2); | 276 CHECK_EQ(0, bytes % 2); |
| 252 Handle<String> result = isolate->factory()->NewStringFromTwoByte( | 277 Handle<String> result = isolate->factory()->NewStringFromTwoByte( |
| 253 Vector<const uint16_t>(source, bytes / 2)); | 278 Vector<const uint16_t>(source, bytes / 2)); |
| 254 stream.Reset( | 279 stream.Reset( |
| 255 new GenericStringUtf16CharacterStream(result, 0, result->length())); | 280 new GenericStringUtf16CharacterStream(result, 0, result->length())); |
| 256 break; | 281 break; |
| 257 } | 282 } |
| 258 case LATIN1: { | 283 case LATIN1: { |
| 259 Handle<String> result = isolate->factory()->NewStringFromOneByte( | 284 Handle<String> result = isolate->factory()->NewStringFromOneByte( |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 293 const LexerShellSettings& settings, | 318 const LexerShellSettings& settings, |
| 294 int truncate_by, | 319 int truncate_by, |
| 295 bool* can_truncate) { | 320 bool* can_truncate) { |
| 296 if (settings.print_tokens) { | 321 if (settings.print_tokens) { |
| 297 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by); | 322 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by); |
| 298 } | 323 } |
| 299 HandleScope handle_scope(isolate); | 324 HandleScope handle_scope(isolate); |
| 300 std::vector<TokenWithLocation> tokens; | 325 std::vector<TokenWithLocation> tokens; |
| 301 TimeDelta time; | 326 TimeDelta time; |
| 302 { | 327 { |
| 303 const uint8_t* buffer_end = 0; | 328 unsigned length_in_bytes; |
| 304 const uint16_t* buffer = ReadFile(fname, &buffer_end, settings); | 329 const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes); |
| 305 if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) { | 330 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer); |
| 331 const uint8_t* buffer_end = &char_data[length_in_bytes]; |
| 332 if (truncate_by > buffer_end - char_data) { |
| 306 *can_truncate = false; | 333 *can_truncate = false; |
| 307 } else { | 334 } else { |
| 308 buffer_end -= truncate_by; | 335 buffer_end -= truncate_by; |
| 309 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings); | 336 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings); |
| 310 } | 337 } |
| 311 delete[] buffer; | 338 delete[] buffer; |
| 312 } | 339 } |
| 313 if (settings.print_tokens) { | 340 if (settings.print_tokens) { |
| 314 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size())); | 341 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size())); |
| 315 for (size_t i = 0; i < tokens.size(); ++i) { | 342 for (size_t i = 0; i < tokens.size(); ++i) { |
| (...skipping 14 matching lines...) Expand all Loading... |
| 330 std::vector<std::string> fnames; | 357 std::vector<std::string> fnames; |
| 331 LexerShellSettings settings; | 358 LexerShellSettings settings; |
| 332 for (int i = 0; i < argc; ++i) { | 359 for (int i = 0; i < argc; ++i) { |
| 333 if (strcmp(argv[i], "--latin1") == 0) { | 360 if (strcmp(argv[i], "--latin1") == 0) { |
| 334 settings.encoding = LATIN1; | 361 settings.encoding = LATIN1; |
| 335 } else if (strcmp(argv[i], "--utf8") == 0) { | 362 } else if (strcmp(argv[i], "--utf8") == 0) { |
| 336 settings.encoding = UTF8; | 363 settings.encoding = UTF8; |
| 337 } else if (strcmp(argv[i], "--utf16") == 0) { | 364 } else if (strcmp(argv[i], "--utf16") == 0) { |
| 338 settings.encoding = UTF16; | 365 settings.encoding = UTF16; |
| 339 } else if (strcmp(argv[i], "--utf8to16") == 0) { | 366 } else if (strcmp(argv[i], "--utf8to16") == 0) { |
| 367 #ifdef V8_USE_GENERATED_LEXER |
| 368 settings.encoding = UTF8TO16_PRECONVERT; |
| 369 #else |
| 340 settings.encoding = UTF8TO16; | 370 settings.encoding = UTF8TO16; |
| 371 #endif |
| 341 } else if (strcmp(argv[i], "--print-tokens") == 0) { | 372 } else if (strcmp(argv[i], "--print-tokens") == 0) { |
| 342 settings.print_tokens = true; | 373 settings.print_tokens = true; |
| 343 } else if (strcmp(argv[i], "--no-baseline") == 0) { | 374 } else if (strcmp(argv[i], "--no-baseline") == 0) { |
| 344 // Ignore. | 375 // Ignore. |
| 345 } else if (strcmp(argv[i], "--no-experimental") == 0) { | 376 } else if (strcmp(argv[i], "--no-experimental") == 0) { |
| 346 // Ignore. | 377 // Ignore. |
| 347 } else if (strcmp(argv[i], "--no-check") == 0) { | 378 } else if (strcmp(argv[i], "--no-check") == 0) { |
| 348 // Ignore. | 379 // Ignore. |
| 349 } else if (strcmp(argv[i], "--break-after-illegal") == 0) { | 380 } else if (strcmp(argv[i], "--break-after-illegal") == 0) { |
| 350 settings.break_after_illegal = true; | 381 settings.break_after_illegal = true; |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 383 &can_truncate); | 414 &can_truncate); |
| 384 total_time += t.InMillisecondsF(); | 415 total_time += t.InMillisecondsF(); |
| 385 ++truncate_by; | 416 ++truncate_by; |
| 386 } while (can_truncate); | 417 } while (can_truncate); |
| 387 } | 418 } |
| 388 printf("RunTime: %.f ms\n", total_time); | 419 printf("RunTime: %.f ms\n", total_time); |
| 389 } | 420 } |
| 390 v8::V8::Dispose(); | 421 v8::V8::Dispose(); |
| 391 return 0; | 422 return 0; |
| 392 } | 423 } |
| OLD | NEW |