src/lexer/lexer-shell.cc - Issue 192643003: Experimental parser: fix UTF8TO16 handling.

Side by Side Diff: src/lexer/lexer-shell.cc

Issue 192643003: Experimental parser: fix UTF8TO16 handling. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2013 the V8 project authors. All rights reserved.	1 // Copyright 2013 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 29 matching lines...) Expand all Loading...
40 #include "platform.h"	40 #include "platform.h"

41 #include "runtime.h"	41 #include "runtime.h"

42 #include "scanner-character-streams.h"	42 #include "scanner-character-streams.h"

43 #include "scopeinfo.h"	43 #include "scopeinfo.h"

44 #include "string-stream.h"	44 #include "string-stream.h"

45 #include "scanner.h"	45 #include "scanner.h"

46 #include "lexer/lexer.h"	46 #include "lexer/lexer.h"

47	47

48 using namespace v8::internal;	48 using namespace v8::internal;

49	49

50 static byte* ReadFile(const char* name, const byte** end, int repeat,

51 bool convert_to_utf16) {

52 FILE* file = fopen(name, "rb");

53 if (file == NULL) return NULL;

54

55 fseek(file, 0, SEEK_END);

56 int file_size = ftell(file);

57 rewind(file);

58

59 int size = file_size * repeat;

60

61 byte* chars = new byte[size];

62 for (int i = 0; i < file_size;) {

63 int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));

64 i += read;

65 }

66 fclose(file);

67

68 for (int i = file_size; i < size; i++) {

69 chars[i] = chars[i - file_size];

70 }

71 *end = &chars[size];

72

73 if (!convert_to_utf16) return chars;

74

75 // Length of new_chars is not strictly accurate, but should be enough.

76 uint16_t* new_chars = new uint16_t[size];

77 {

78 Utf8ToUtf16CharacterStream stream(chars, size);

79 uint16_t* cursor = new_chars;

80 // uc32 c;

81 // The 32-bit char type is probably only so that we can have -1 as a return

82 // value. If the char is not -1, it should fit into 16 bits.

83 CHECK(false);

84 // while ((c = stream.Advance()) != -1) {

85 // *cursor++ = c;

86 // }

87 end = reinterpret_cast<byte>(cursor);

88 }

89 delete[] chars;

90 return reinterpret_cast<byte*>(new_chars);

91 }

92

93	50

94 enum Encoding {	51 enum Encoding {

95 LATIN1,	52 LATIN1,

96 UTF8,	53 UTF8,

97 UTF16,	54 UTF16,

98 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers.	55 UTF8TO16 // Read as UTF8, convert to UTF16 before giving it to the lexers.

99 };	56 };

100	57

101	58

102 struct LexerShellSettings {	59 struct LexerShellSettings {

(...skipping 10 matching lines...) Expand all Loading...
113 print_tokens(false),	70 print_tokens(false),

114 break_after_illegal(false),	71 break_after_illegal(false),

115 eos_test(false),	72 eos_test(false),

116 repeat(1),	73 repeat(1),

117 harmony_numeric_literals(false),	74 harmony_numeric_literals(false),

118 harmony_modules(false),	75 harmony_modules(false),

119 harmony_scoping(false) {}	76 harmony_scoping(false) {}

120 };	77 };

121	78

122	79

	80 static uint16_t* ReadFile(const char* name, const uint8_t** end,

	81 const LexerShellSettings& settings) {

	82 FILE* file = fopen(name, "rb");

	83 CHECK(file != NULL);

	84

	85 fseek(file, 0, SEEK_END);

	86 unsigned file_size = ftell(file);

	87 rewind(file);

	88

	89 uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];

	90

	91 uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);

	92 for (unsigned i = 0; i < file_size;) {

	93 i += fread(&char_data[i], 1, file_size - i, file);

	94 }

	95 fclose(file);

	96

	97 if (settings.encoding == UTF8TO16) {

	98 const uint32_t kMaxUtf16Character = 0xffff;

	99 // Get utf8 length.

	100 unsigned utf16_chars = 0;

	101 {

	102 unsigned position = 0;

	103 while (position < file_size) {

	104 uint32_t c = char_data[position];

	105 if (c <= unibrow::Utf8::kMaxOneByteChar) {

	106 position++;

	107 } else {

	108 c = unibrow::Utf8::CalculateValue(char_data + position,

	109 file_size - position,

	110 &position);

	111 }

	112 if (c > kMaxUtf16Character) {

	113 utf16_chars += 2;

	114 } else {

	115 utf16_chars += 1;

	116 }

	117 }

	118 }

	119 // Write new buffer out.

	120 uint16_t* data = new uint16_t[utf16_chars];

	121 unsigned position = 0;

	122 unsigned i = 0;

	123 while (position < file_size) {

	124 uint32_t c = char_data[position];

	125 if (c <= unibrow::Utf8::kMaxOneByteChar) {

	126 position++;

	127 } else {

	128 c = unibrow::Utf8::CalculateValue(char_data + position,

	129 file_size - position,

	130 &position);

	131 }

	132 if (c > kMaxUtf16Character) {

	133 data[i++] = unibrow::Utf16::LeadSurrogate(c);

	134 data[i++] = unibrow::Utf16::TrailSurrogate(c);

	135 } else {

	136 data[i++] = static_cast<uc16>(c);

	137 }

	138 }

	139 // Swap buffers.

	140 delete two_byte_data;

	141 file_size = utf16_chars * 2;

	142 two_byte_data = data;

	143 char_data = reinterpret_cast<uint8_t*>(two_byte_data);

	144 }

	145

	146 // Duplicate buffer if necessary.

	147 if (settings.repeat > 1) {

	148 unsigned size = file_size * settings.repeat;

	149 uint16_t* data = new uint16_t[size / 2 + size % 2];

	150 char_data = reinterpret_cast<uint8_t*>(two_byte_data);

	151 for (int i = 0; i < settings.repeat; i++) {

	152 memcpy(&char_data[i * file_size], two_byte_data, file_size);

	153 }

	154 delete two_byte_data;

	155 file_size = size;

	156 two_byte_data = data;

	157 }

	158

	159 *end = &char_data[file_size];

	160 return two_byte_data;

	161 }

	162

	163

123 struct TokenWithLocation {	164 struct TokenWithLocation {

124 Token::Value value;	165 Token::Value value;

125 size_t beg;	166 size_t beg;

126 size_t end;	167 size_t end;

127 std::vector<int> literal;	168 std::vector<int> literal;

128 bool is_ascii;	169 bool is_ascii;

129 // The location of the latest octal position when the token was seen.	170 // The location of the latest octal position when the token was seen.

130 int octal_beg;	171 int octal_beg;

131 int octal_end;	172 int octal_end;

132 TokenWithLocation() :	173 TokenWithLocation() :

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
186 if (scanner->is_literal_ascii()) {	227 if (scanner->is_literal_ascii()) {

187 result.literal = ToStdVector(scanner->literal_ascii_string());	228 result.literal = ToStdVector(scanner->literal_ascii_string());

188 } else {	229 } else {

189 result.literal = ToStdVector(scanner->literal_utf16_string());	230 result.literal = ToStdVector(scanner->literal_utf16_string());

190 }	231 }

191 }	232 }

192 return result;	233 return result;

193 }	234 }

194	235

195	236

196 static TimeDelta RunLexer(const byte* source,	237 static TimeDelta RunLexer(const uint16_t* source,

197 const byte* source_end,	238 const uint8_t* source_end,

198 Isolate* isolate,	239 Isolate* isolate,

199 std::vector<TokenWithLocation>* tokens,	240 std::vector<TokenWithLocation>* tokens,

200 const LexerShellSettings& settings) {	241 const LexerShellSettings& settings) {

201 SmartPointer<Utf16CharacterStream> stream;	242 SmartPointer<Utf16CharacterStream> stream;

	243 const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

	244 int bytes = source_end - one_byte_source;

202 switch (settings.encoding) {	245 switch (settings.encoding) {

203 case UTF8:	246 case UTF8:

	247 stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));

	248 break;

204 case UTF8TO16:	249 case UTF8TO16:

205 stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end - source));

206 break;

207 case UTF16: {	250 case UTF16: {

	251 CHECK_EQ(0, bytes % 2);

208 Handle<String> result = isolate->factory()->NewStringFromTwoByte(	252 Handle<String> result = isolate->factory()->NewStringFromTwoByte(

209 Vector<const uint16_t>(	253 Vector<const uint16_t>(source, bytes / 2));

210 reinterpret_cast<const uint16_t*>(source),

211 (source_end - source) / 2));

212 stream.Reset(	254 stream.Reset(

213 new GenericStringUtf16CharacterStream(result, 0, result->length()));	255 new GenericStringUtf16CharacterStream(result, 0, result->length()));

214 break;	256 break;

215 }	257 }

216 case LATIN1: {	258 case LATIN1: {

217 Handle<String> result = isolate->factory()->NewStringFromOneByte(	259 Handle<String> result = isolate->factory()->NewStringFromOneByte(

218 Vector<const uint8_t>(source, source_end - source));	260 Vector<const uint8_t>(one_byte_source, bytes));

219 stream.Reset(	261 stream.Reset(

220 new GenericStringUtf16CharacterStream(result, 0, result->length()));	262 new GenericStringUtf16CharacterStream(result, 0, result->length()));

221 break;	263 break;

222 }	264 }

223 }	265 }

224 Scanner scanner(isolate->unicode_cache());	266 Scanner scanner(isolate->unicode_cache());

225 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);	267 scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);

226 scanner.SetHarmonyModules(settings.harmony_modules);	268 scanner.SetHarmonyModules(settings.harmony_modules);

227 scanner.SetHarmonyScoping(settings.harmony_scoping);	269 scanner.SetHarmonyScoping(settings.harmony_scoping);

228 ElapsedTimer timer;	270 ElapsedTimer timer;

(...skipping 22 matching lines...) Expand all Loading...
251 const LexerShellSettings& settings,	293 const LexerShellSettings& settings,

252 int truncate_by,	294 int truncate_by,

253 bool* can_truncate) {	295 bool* can_truncate) {

254 if (settings.print_tokens) {	296 if (settings.print_tokens) {

255 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);	297 printf("Processing file %s, truncating by %d bytes\n", fname, truncate_by);

256 }	298 }

257 HandleScope handle_scope(isolate);	299 HandleScope handle_scope(isolate);

258 std::vector<TokenWithLocation> tokens;	300 std::vector<TokenWithLocation> tokens;

259 TimeDelta time;	301 TimeDelta time;

260 {	302 {

261 const byte* buffer_end = 0;	303 const uint8_t* buffer_end = 0;

262 const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat, false);	304 const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);

263 if (truncate_by > buffer_end - buffer) {	305 if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {

264 *can_truncate = false;	306 *can_truncate = false;

265 } else {	307 } else {

266 buffer_end -= truncate_by;	308 buffer_end -= truncate_by;

267 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);	309 time = RunLexer(buffer, buffer_end, isolate, &tokens, settings);

268 }	310 }

269 delete[] buffer;	311 delete[] buffer;

270 }	312 }

271 if (settings.print_tokens) {	313 if (settings.print_tokens) {

272 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));	314 printf("No of tokens:\t%d\n", static_cast<int>(tokens.size()));

273 for (size_t i = 0; i < tokens.size(); ++i) {	315 for (size_t i = 0; i < tokens.size(); ++i) {

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
341 &can_truncate);	383 &can_truncate);

342 total_time += t.InMillisecondsF();	384 total_time += t.InMillisecondsF();

343 ++truncate_by;	385 ++truncate_by;

344 } while (can_truncate);	386 } while (can_truncate);

345 }	387 }

346 printf("RunTime: %.f ms\n", total_time);	388 printf("RunTime: %.f ms\n", total_time);

347 }	389 }

348 v8::V8::Dispose();	390 v8::V8::Dispose();

349 return 0;	391 return 0;

350 }	392 }

OLD	NEW

« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »