Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(247)

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 192643003: Experimental parser: fix UTF8TO16 handling. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 the V8 project authors. All rights reserved. 1 // Copyright 2013 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 29 matching lines...) Expand all
40 #include "platform.h" 40 #include "platform.h"
41 #include "runtime.h" 41 #include "runtime.h"
42 #include "scanner-character-streams.h" 42 #include "scanner-character-streams.h"
43 #include "scopeinfo.h" 43 #include "scopeinfo.h"
44 #include "string-stream.h" 44 #include "string-stream.h"
45 #include "scanner.h" 45 #include "scanner.h"
46 #include "lexer/lexer.h" 46 #include "lexer/lexer.h"
47 47
48 using namespace v8::internal; 48 using namespace v8::internal;
49 49
50 static byte* ReadFile(const char* name, const byte** end, int repeat,
51 bool convert_to_utf16) {
52 FILE* file = fopen(name, "rb");
53 if (file == NULL) return NULL;
54
55 fseek(file, 0, SEEK_END);
56 int file_size = ftell(file);
57 rewind(file);
58
59 int size = file_size * repeat;
60
61 byte* chars = new byte[size];
62 for (int i = 0; i < file_size;) {
63 int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
64 i += read;
65 }
66 fclose(file);
67
68 for (int i = file_size; i < size; i++) {
69 chars[i] = chars[i - file_size];
70 }
71 *end = &chars[size];
72
73 if (!convert_to_utf16) return chars;
74
75 // Length of new_chars is not strictly accurate, but should be enough.
76 uint16_t* new_chars = new uint16_t[size];
77 {
78 Utf8ToUtf16CharacterStream stream(chars, size);
79 uint16_t* cursor = new_chars;
80 // uc32 c;
81 // The 32-bit char type is probably only so that we can have -1 as a return
82 // value. If the char is not -1, it should fit into 16 bits.
83 CHECK(false);
84 // while ((c = stream.Advance()) != -1) {
85 // *cursor++ = c;
86 // }
87 *end = reinterpret_cast<byte*>(cursor);
88 }
89 delete[] chars;
90 return reinterpret_cast<byte*>(new_chars);
91 }
92
93 50
94 enum Encoding { 51 enum Encoding {
95 LATIN1, 52 LATIN1,
96 UTF8, 53 UTF8,
97 UTF16, 54 UTF16,
98 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers. 55 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers.
99 }; 56 };
100 57
101 58
102 struct LexerShellSettings { 59 struct LexerShellSettings {
(...skipping 10 matching lines...) Expand all
113 print_tokens(false), 70 print_tokens(false),
114 break_after_illegal(false), 71 break_after_illegal(false),
115 eos_test(false), 72 eos_test(false),
116 repeat(1), 73 repeat(1),
117 harmony_numeric_literals(false), 74 harmony_numeric_literals(false),
118 harmony_modules(false), 75 harmony_modules(false),
119 harmony_scoping(false) {} 76 harmony_scoping(false) {}
120 }; 77 };
121 78
122 79
80 static uint16_t* ReadFile(const char* name, const uint8_t** end,
81 const LexerShellSettings& settings) {
82 FILE* file = fopen(name, "rb");
83 CHECK(file != NULL);
84
85 fseek(file, 0, SEEK_END);
86 unsigned file_size = ftell(file);
87 rewind(file);
88
89 uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
90
91 uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
92 for (unsigned i = 0; i < file_size;) {
93 i += fread(&char_data[i], 1, file_size - i, file);
94 }
95 fclose(file);
96
97 if (settings.encoding == UTF8TO16) {
98 const uint32_t kMaxUtf16Character = 0xffff;
99 // Get utf8 length.
100 unsigned utf16_chars = 0;
101 {
102 unsigned position = 0;
103 while (position < file_size) {
104 uint32_t c = char_data[position];
105 if (c <= unibrow::Utf8::kMaxOneByteChar) {
106 position++;
107 } else {
108 c = unibrow::Utf8::CalculateValue(char_data + position,
109 file_size - position,
110 &position);
111 }
112 if (c > kMaxUtf16Character) {
113 utf16_chars += 2;
114 } else {
115 utf16_chars += 1;
116 }
117 }
118 }
119 // Write new buffer out.
120 uint16_t* data = new uint16_t[utf16_chars];
121 unsigned position = 0;
122 unsigned i = 0;
123 while (position < file_size) {
124 uint32_t c = char_data[position];
125 if (c <= unibrow::Utf8::kMaxOneByteChar) {
126 position++;
127 } else {
128 c = unibrow::Utf8::CalculateValue(char_data + position,
129 file_size - position,
130 &position);
131 }
132 if (c > kMaxUtf16Character) {
133 data[i++] = unibrow::Utf16::LeadSurrogate(c);
134 data[i++] = unibrow::Utf16::TrailSurrogate(c);
135 } else {
136 data[i++] = static_cast<uc16>(c);
137 }
138 }
139 // Swap buffers.
140 delete two_byte_data;
141 file_size = utf16_chars * 2;
142 two_byte_data = data;
143 char_data = reinterpret_cast<uint8_t*>(two_byte_data);
144 }
145
146 // Duplicate buffer if necessary.
147 if (settings.repeat > 1) {
148 unsigned size = file_size * settings.repeat;
149 uint16_t* data = new uint16_t[size / 2 + size % 2];
150 char_data = reinterpret_cast<uint8_t*>(two_byte_data);
151 for (int i = 0; i < settings.repeat; i++) {
152 memcpy(&char_data[i * file_size], two_byte_data, file_size);
153 }
154 delete two_byte_data;
155 file_size = size;
156 two_byte_data = data;
157 }
158
159 *end = &char_data[file_size];
160 return two_byte_data;
161 }
162
163
123 struct TokenWithLocation { 164 struct TokenWithLocation {
124 Token::Value value; 165 Token::Value value;
125 size_t beg; 166 size_t beg;
126 size_t end; 167 size_t end;
127 std::vector<int> literal; 168 std::vector<int> literal;
128 bool is_ascii; 169 bool is_ascii;
129 // The location of the latest octal position when the token was seen. 170 // The location of the latest octal position when the token was seen.
130 int octal_beg; 171 int octal_beg;
131 int octal_end; 172 int octal_end;
132 TokenWithLocation() : 173 TokenWithLocation() :
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
186 if (scanner->is_literal_ascii()) { 227 if (scanner->is_literal_ascii()) {
187 result.literal = ToStdVector(scanner->literal_ascii_string()); 228 result.literal = ToStdVector(scanner->literal_ascii_string());
188 } else { 229 } else {
189 result.literal = ToStdVector(scanner->literal_utf16_string()); 230 result.literal = ToStdVector(scanner->literal_utf16_string());
190 } 231 }
191 } 232 }
192 return result; 233 return result;
193 } 234 }
194 235
195 236
196 static TimeDelta RunLexer(const byte* source, 237 static TimeDelta RunLexer(const uint16_t* source,
197 const byte* source_end, 238 const uint8_t* source_end,
198 Isolate* isolate, 239 Isolate* isolate,
199 std::vector<TokenWithLocation>* tokens, 240 std::vector<TokenWithLocation>* tokens,
200 const LexerShellSettings& settings) { 241 const LexerShellSettings& settings) {
201 SmartPointer<Utf16CharacterStream> stream; 242 SmartPointer<Utf16CharacterStream> stream;
243 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
244 int bytes = source_end - one_byte_source;
202 switch (settings.encoding) { 245 switch (settings.encoding) {
203 case UTF8: 246 case UTF8:
247 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
248 break;
204 case UTF8TO16: 249 case UTF8TO16:
205 stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end - source));
206 break;
207 case UTF16: { 250 case UTF16: {
251 CHECK_EQ(0, bytes % 2);
208 Handle<String> result = isolate->factory()->NewStringFromTwoByte( 252 Handle<String> result = isolate->factory()->NewStringFromTwoByte(
209 Vector<const uint16_t>( 253 Vector<const uint16_t>(source, bytes / 2));
210 reinterpret_cast<const uint16_t*>(source),
211 (source_end - source) / 2));
212 stream.Reset( 254 stream.Reset(
213 new GenericStringUtf16CharacterStream(result, 0, result->length())); 255 new GenericStringUtf16CharacterStream(result, 0, result->length()));
214 break; 256 break;
215 } 257 }
216 case LATIN1: { 258 case LATIN1: {
217 Handle<String> result = isolate->factory()->NewStringFromOneByte( 259 Handle<String> result = isolate->factory()->NewStringFromOneByte(
218 Vector<const uint8_t>(source, source_end - source)); 260 Vector<const uint8_t>(one_byte_source, bytes));
219 stream.Reset( 261 stream.Reset(
220 new GenericStringUtf16CharacterStream(result, 0, result->length())); 262 new GenericStringUtf16CharacterStream(result, 0, result->length()));
221 break; 263 break;
222 } 264 }
223 } 265 }
224 Scanner scanner(isolate->unicode_cache()); 266 Scanner scanner(isolate->unicode_cache());
225 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals); 267 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);
226 scanner.SetHarmonyModules(settings.harmony_modules); 268 scanner.SetHarmonyModules(settings.harmony_modules);
227 scanner.SetHarmonyScoping(settings.harmony_scoping); 269 scanner.SetHarmonyScoping(settings.harmony_scoping);
228 ElapsedTimer timer; 270 ElapsedTimer timer;
(...skipping 22 matching lines...) Expand all
251 const LexerShellSettings& settings, 293 const LexerShellSettings& settings,
252 int truncate_by, 294 int truncate_by,
253 bool* can_truncate) { 295 bool* can_truncate) {
254 if (settings.print_tokens) { 296 if (settings.print_tokens) {
255 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by); 297 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);
256 } 298 }
257 HandleScope handle_scope(isolate); 299 HandleScope handle_scope(isolate);
258 std::vector<TokenWithLocation> tokens; 300 std::vector<TokenWithLocation> tokens;
259 TimeDelta time; 301 TimeDelta time;
260 { 302 {
261 const byte* buffer_end = 0; 303 const uint8_t* buffer_end = 0;
262 const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat, false); 304 const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);
263 if (truncate_by > buffer_end - buffer) { 305 if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {
264 *can_truncate = false; 306 *can_truncate = false;
265 } else { 307 } else {
266 buffer_end -= truncate_by; 308 buffer_end -= truncate_by;
267 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings); 309 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);
268 } 310 }
269 delete[] buffer; 311 delete[] buffer;
270 } 312 }
271 if (settings.print_tokens) { 313 if (settings.print_tokens) {
272 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size())); 314 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));
273 for (size_t i = 0; i < tokens.size(); ++i) { 315 for (size_t i = 0; i < tokens.size(); ++i) {
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
341 &can_truncate); 383 &can_truncate);
342 total_time += t.InMillisecondsF(); 384 total_time += t.InMillisecondsF();
343 ++truncate_by; 385 ++truncate_by;
344 } while (can_truncate); 386 } while (can_truncate);
345 } 387 }
346 printf("RunTime: %.f ms\n", total_time); 388 printf("RunTime: %.f ms\n", total_time);
347 } 389 }
348 v8::V8::Dispose(); 390 v8::V8::Dispose();
349 return 0; 391 return 0;
350 } 392 }
OLDNEW
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698