src/lexer/lexer-shell.cc - Issue 194613002: Experimental parser: proper utf16 conversion

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 194613002: Experimental parser: proper utf16 conversion (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2013 the V8 project authors. All rights reserved.	1 // Copyright 2013 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
45 #include "scanner.h"	45 #include "scanner.h"

46 #include "lexer/lexer.h"	46 #include "lexer/lexer.h"

47	47

48 using namespace v8::internal;	48 using namespace v8::internal;

49	49

50	50

51 enum Encoding {	51 enum Encoding {

52 LATIN1,	52 LATIN1,

53 UTF8,	53 UTF8,

54 UTF16,	54 UTF16,

55 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers.	55 UTF8TO16, // Convert stream via scanner input stream

	56 UTF8TO16_PRECONVERT // Convert stream during file read

56 };	57 };

57	58

58	59

59 struct LexerShellSettings {	60 struct LexerShellSettings {

60 Encoding encoding;	61 Encoding encoding;

61 bool print_tokens;	62 bool print_tokens;

62 bool break_after_illegal;	63 bool break_after_illegal;

63 bool eos_test;	64 bool eos_test;

64 int repeat;	65 int repeat;

65 bool harmony_numeric_literals;	66 bool harmony_numeric_literals;

66 bool harmony_modules;	67 bool harmony_modules;

67 bool harmony_scoping;	68 bool harmony_scoping;

68 LexerShellSettings()	69 LexerShellSettings()

69 : encoding(LATIN1),	70 : encoding(LATIN1),

70 print_tokens(false),	71 print_tokens(false),

71 break_after_illegal(false),	72 break_after_illegal(false),

72 eos_test(false),	73 eos_test(false),

73 repeat(1),	74 repeat(1),

74 harmony_numeric_literals(false),	75 harmony_numeric_literals(false),

75 harmony_modules(false),	76 harmony_modules(false),

76 harmony_scoping(false) {}	77 harmony_scoping(false) {}

77 };	78 };

78	79

79	80

80 static uint16_t* ReadFile(const char* name, const uint8_t** end,	81 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

81 const LexerShellSettings& settings) {	82 unsigned* length) {

82 FILE* file = fopen(name, "rb");	83 const unsigned file_size = *length;

83 CHECK(file != NULL);	84 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);

84	85 const uint32_t kMaxUtf16Character = 0xffff;

85 fseek(file, 0, SEEK_END);	86 // Get utf8 length.

86 unsigned file_size = ftell(file);	87 unsigned utf16_chars = 0;

87 rewind(file);	88 {

88

89 uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];

90

91 uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);

92 for (unsigned i = 0; i < file_size;) {

93 i += fread(&char_data[i], 1, file_size - i, file);

94 }

95 fclose(file);

96

97 if (settings.encoding == UTF8TO16) {

98 const uint32_t kMaxUtf16Character = 0xffff;

99 // Get utf8 length.

100 unsigned utf16_chars = 0;

101 {

102 unsigned position = 0;

103 while (position < file_size) {

104 uint32_t c = char_data[position];

105 if (c <= unibrow::Utf8::kMaxOneByteChar) {

106 position++;

107 } else {

108 c = unibrow::Utf8::CalculateValue(char_data + position,

109 file_size - position,

110 &position);

111 }

112 if (c > kMaxUtf16Character) {

113 utf16_chars += 2;

114 } else {

115 utf16_chars += 1;

116 }

117 }

118 }

119 // Write new buffer out.

120 uint16_t* data = new uint16_t[utf16_chars];

121 unsigned position = 0;	89 unsigned position = 0;

122 unsigned i = 0;

123 while (position < file_size) {	90 while (position < file_size) {

124 uint32_t c = char_data[position];	91 uint32_t c = char_data[position];

125 if (c <= unibrow::Utf8::kMaxOneByteChar) {	92 if (c <= unibrow::Utf8::kMaxOneByteChar) {

126 position++;	93 position++;

127 } else {	94 } else {

128 c = unibrow::Utf8::CalculateValue(char_data + position,	95 c = unibrow::Utf8::CalculateValue(char_data + position,

129 file_size - position,	96 file_size - position,

130 &position);	97 &position);

131 }	98 }

132 if (c > kMaxUtf16Character) {	99 if (c > kMaxUtf16Character) {

133 data[i++] = unibrow::Utf16::LeadSurrogate(c);	100 utf16_chars += 2;

134 data[i++] = unibrow::Utf16::TrailSurrogate(c);

135 } else {	101 } else {

136 data[i++] = static_cast<uc16>(c);	102 utf16_chars += 1;

137 }	103 }

138 }	104 }

139 // Swap buffers.

140 delete two_byte_data;

141 file_size = utf16_chars * 2;

142 two_byte_data = data;

143 char_data = reinterpret_cast<uint8_t*>(two_byte_data);

144 }	105 }

145	106 // Write new buffer out.

146 // Duplicate buffer if necessary.	107 uint16_t* data = new uint16_t[utf16_chars];

147 if (settings.repeat > 1) {	108 unsigned position = 0;

148 unsigned size = file_size * settings.repeat;	109 unsigned i = 0;

149 uint16_t* data = new uint16_t[size / 2 + size % 2];	110 while (position < file_size) {

150 char_data = reinterpret_cast<uint8_t*>(two_byte_data);	111 uint32_t c = char_data[position];

151 for (int i = 0; i < settings.repeat; i++) {	112 if (c <= unibrow::Utf8::kMaxOneByteChar) {

152 memcpy(&char_data[i * file_size], two_byte_data, file_size);	113 position++;

	114 } else {

	115 c = unibrow::Utf8::CalculateValue(char_data + position,

	116 file_size - position,

	117 &position);

153 }	118 }

154 delete two_byte_data;	119 if (c > kMaxUtf16Character) {

155 file_size = size;	120 data[i++] = unibrow::Utf16::LeadSurrogate(c);

156 two_byte_data = data;	121 data[i++] = unibrow::Utf16::TrailSurrogate(c);

	122 } else {

	123 data[i++] = static_cast<uc16>(c);

	124 }

157 }	125 }

158	126 *length = 2 * utf16_chars;

159 *end = &char_data[file_size];	127 return data;

160 return two_byte_data;

161 }	128 }

162	129

163	130

	131 static uint16_t* Repeat(int repeat,

	132 const uint16_t* const data_in,

	133 unsigned* length) {

	134 const unsigned file_size = *length;

	135 unsigned size = file_size * repeat;

	136 uint16_t* data = new uint16_t[size / 2 + size % 2];

	137 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);

	138 for (int i = 0; i < repeat; i++) {

	139 memcpy(&char_data[i * file_size], data_in, file_size);

	140 }

	141 *length = size;

	142 return data;

	143 }

	144

	145

	146 static uint16_t* ReadFile(const char* name, unsigned* length) {

	147 FILE* file = fopen(name, "rb");

	148 CHECK(file != NULL);

	149 // Get file size.

	150 fseek(file, 0, SEEK_END);

	151 unsigned file_size = ftell(file);

	152 rewind(file);

	153 // Read file contents.

	154 uint16_t* data = new uint16_t[file_size / 2 + file_size % 2];

	155 uint8_t* char_data = reinterpret_cast<uint8_t*>(data);

	156 for (unsigned i = 0; i < file_size;) {

	157 i += fread(&char_data[i], 1, file_size - i, file);

	158 }

	159 fclose(file);

	160 *length = file_size;

	161 return data;

	162 }

	163

	164

	165 static uint16_t* ReadFile(const char* name,

	166 const LexerShellSettings& settings,

	167 unsigned* length) {

	168 uint16_t* data = ReadFile(name, length);

	169 CHECK_GE(*length, 0);

	170 if (*length == 0) return data;

	171

	172 if (settings.encoding == UTF8TO16_PRECONVERT) {

	173 uint16_t* new_data = ConvertUtf8ToUtf16(data, length);

	174 delete data;

	175 data = new_data;

	176 }

	177

	178 if (settings.repeat > 1) {

	179 uint16_t* new_data = Repeat(settings.repeat, data, length);

	180 delete data;

	181 data = new_data;

	182 }

	183

	184 return data;

	185 }

	186

	187

164 struct TokenWithLocation {	188 struct TokenWithLocation {

165 Token::Value value;	189 Token::Value value;

166 size_t beg;	190 size_t beg;

167 size_t end;	191 size_t end;

168 std::vector<int> literal;	192 std::vector<int> literal;

169 bool is_ascii;	193 bool is_ascii;

170 // The location of the latest octal position when the token was seen.	194 // The location of the latest octal position when the token was seen.

171 int octal_beg;	195 int octal_beg;

172 int octal_end;	196 int octal_end;

173 TokenWithLocation() :	197 TokenWithLocation() :

(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
236	260

237 static TimeDelta RunLexer(const uint16_t* source,	261 static TimeDelta RunLexer(const uint16_t* source,

238 const uint8_t* source_end,	262 const uint8_t* source_end,

239 Isolate* isolate,	263 Isolate* isolate,

240 std::vector<TokenWithLocation>* tokens,	264 std::vector<TokenWithLocation>* tokens,

241 const LexerShellSettings& settings) {	265 const LexerShellSettings& settings) {

242 SmartPointer<Utf16CharacterStream> stream;	266 SmartPointer<Utf16CharacterStream> stream;

243 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);	267 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

244 int bytes = source_end - one_byte_source;	268 int bytes = source_end - one_byte_source;

245 switch (settings.encoding) {	269 switch (settings.encoding) {

	270 case UTF8TO16:

246 case UTF8:	271 case UTF8:

247 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));	272 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));

248 break;	273 break;

249 case UTF8TO16:	274 case UTF8TO16_PRECONVERT:

250 case UTF16: {	275 case UTF16: {

251 CHECK_EQ(0, bytes % 2);	276 CHECK_EQ(0, bytes % 2);

252 Handle<String> result = isolate->factory()->NewStringFromTwoByte(	277 Handle<String> result = isolate->factory()->NewStringFromTwoByte(

253 Vector<const uint16_t>(source, bytes / 2));	278 Vector<const uint16_t>(source, bytes / 2));

254 stream.Reset(	279 stream.Reset(

255 new GenericStringUtf16CharacterStream(result, 0, result->length()));	280 new GenericStringUtf16CharacterStream(result, 0, result->length()));

256 break;	281 break;

257 }	282 }

258 case LATIN1: {	283 case LATIN1: {

259 Handle<String> result = isolate->factory()->NewStringFromOneByte(	284 Handle<String> result = isolate->factory()->NewStringFromOneByte(

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
293 const LexerShellSettings& settings,	318 const LexerShellSettings& settings,

294 int truncate_by,	319 int truncate_by,

295 bool* can_truncate) {	320 bool* can_truncate) {

296 if (settings.print_tokens) {	321 if (settings.print_tokens) {

297 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);	322 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);

298 }	323 }

299 HandleScope handle_scope(isolate);	324 HandleScope handle_scope(isolate);

300 std::vector<TokenWithLocation> tokens;	325 std::vector<TokenWithLocation> tokens;

301 TimeDelta time;	326 TimeDelta time;

302 {	327 {

303 const uint8_t* buffer_end = 0;	328 unsigned length_in_bytes;

304 const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);	329 const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);

305 if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {	330 const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);

	331 const uint8_t* buffer_end = &char_data[length_in_bytes];

	332 if (truncate_by > buffer_end - char_data) {

306 *can_truncate = false;	333 *can_truncate = false;

307 } else {	334 } else {

308 buffer_end -= truncate_by;	335 buffer_end -= truncate_by;

309 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);	336 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);

310 }	337 }

311 delete[] buffer;	338 delete[] buffer;

312 }	339 }

313 if (settings.print_tokens) {	340 if (settings.print_tokens) {

314 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));	341 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));

315 for (size_t i = 0; i < tokens.size(); ++i) {	342 for (size_t i = 0; i < tokens.size(); ++i) {

(...skipping 14 matching lines...) Expand all Loading...
330 std::vector<std::string> fnames;	357 std::vector<std::string> fnames;

331 LexerShellSettings settings;	358 LexerShellSettings settings;

332 for (int i = 0; i < argc; ++i) {	359 for (int i = 0; i < argc; ++i) {

333 if (strcmp(argv[i], "--latin1") == 0) {	360 if (strcmp(argv[i], "--latin1") == 0) {

334 settings.encoding = LATIN1;	361 settings.encoding = LATIN1;

335 } else if (strcmp(argv[i], "--utf8") == 0) {	362 } else if (strcmp(argv[i], "--utf8") == 0) {

336 settings.encoding = UTF8;	363 settings.encoding = UTF8;

337 } else if (strcmp(argv[i], "--utf16") == 0) {	364 } else if (strcmp(argv[i], "--utf16") == 0) {

338 settings.encoding = UTF16;	365 settings.encoding = UTF16;

339 } else if (strcmp(argv[i], "--utf8to16") == 0) {	366 } else if (strcmp(argv[i], "--utf8to16") == 0) {

	367 #ifdef V8_USE_GENERATED_LEXER

	368 settings.encoding = UTF8TO16_PRECONVERT;

	369 #else

340 settings.encoding = UTF8TO16;	370 settings.encoding = UTF8TO16;

	371 #endif

341 } else if (strcmp(argv[i], "--print-tokens") == 0) {	372 } else if (strcmp(argv[i], "--print-tokens") == 0) {

342 settings.print_tokens = true;	373 settings.print_tokens = true;

343 } else if (strcmp(argv[i], "--no-baseline") == 0) {	374 } else if (strcmp(argv[i], "--no-baseline") == 0) {

344 // Ignore.	375 // Ignore.

345 } else if (strcmp(argv[i], "--no-experimental") == 0) {	376 } else if (strcmp(argv[i], "--no-experimental") == 0) {

346 // Ignore.	377 // Ignore.

347 } else if (strcmp(argv[i], "--no-check") == 0) {	378 } else if (strcmp(argv[i], "--no-check") == 0) {

348 // Ignore.	379 // Ignore.

349 } else if (strcmp(argv[i], "--break-after-illegal") == 0) {	380 } else if (strcmp(argv[i], "--break-after-illegal") == 0) {

350 settings.break_after_illegal = true;	381 settings.break_after_illegal = true;

(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
383 &can_truncate);	414 &can_truncate);

384 total_time += t.InMillisecondsF();	415 total_time += t.InMillisecondsF();

385 ++truncate_by;	416 ++truncate_by;

386 } while (can_truncate);	417 } while (can_truncate);

387 }	418 }

388 printf("RunTime: %.f ms\n", total_time);	419 printf("RunTime: %.f ms\n", total_time);

389 }	420 }

390 v8::V8::Dispose();	421 v8::V8::Dispose();

391 return 0;	422 return 0;

392 }	423 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »