OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 the V8 project authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "src/asmjs/asm-lexer.h" | |
6 | |
7 #include <stdlib.h> | |
8 | |
9 #include "src/objects.h" | |
marja
2017/03/15 12:34:49
Why is objects.h needed?
bradn
2017/03/16 00:21:46
There was a Handle<String> used in scanner.h inlin
| |
10 #include "src/parsing/scanner-character-streams.h" | |
vogelheim
2017/03/15 12:07:41
I don't see scanner-character-streams.h being used
bradn
2017/03/16 00:21:47
Dropped.
| |
11 #include "src/parsing/scanner.h" | |
marja
2017/03/15 12:34:49
Hmm, you're still including scanner.h even though
bradn
2017/03/16 00:21:47
That was in the header.
This is needed here becaus
marja
2017/03/16 17:05:33
My orig. comment suggested moving the streams out
| |
12 | |
13 namespace v8 { | |
14 namespace internal { | |
15 | |
namespace {
// Upper bound on the number of identifiers the lexer will assign ids to.
// Capping the count ensures that both global and local identifiers can be
// given a token id in the range of an int32_t.
constexpr int kMaxIdentifierCount = 0xf000000;
}  // namespace
21 | |
22 AsmJsLexer::AsmJsLexer() | |
23 : token_(0), | |
24 preceding_token_(0), | |
25 next_token_(0), | |
26 rewind_(false), | |
27 in_local_scope_(false), | |
28 global_count_(0), | |
29 double_value_(0.0), | |
30 unsigned_value_(0), | |
31 preceded_by_newline_(false) { | |
32 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name; | |
33 STDLIB_MATH_FUNCTION_LIST(V) | |
34 STDLIB_ARRAY_TYPE_LIST(V) | |
35 #undef V | |
36 #define V(name) property_names_[#name] = kToken_##name; | |
37 STDLIB_MATH_VALUE_LIST(V) | |
38 STDLIB_OTHER_LIST(V) | |
39 #undef V | |
40 #define V(name) global_names_[#name] = kToken_##name; | |
41 KEYWORD_NAME_LIST(V) | |
42 #undef V | |
43 } | |
44 | |
45 void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) { | |
46 stream_ = std::move(stream); | |
47 Next(); | |
48 } | |
49 | |
50 void AsmJsLexer::Next() { | |
vogelheim
2017/03/15 12:07:40
I find this method much nicer to read now. Thanks.
bradn
2017/03/16 00:21:47
:-)
| |
51 if (rewind_) { | |
52 preceding_token_ = token_; | |
53 token_ = next_token_; | |
54 next_token_ = 0; | |
55 rewind_ = false; | |
56 return; | |
57 } | |
58 | |
59 if (token_ == kEndOfInput || token_ == kParseError) { | |
60 return; | |
61 } | |
62 | |
63 #if DEBUG | |
64 if (FLAG_trace_asm_lexer) { | |
65 if (Token() != 0) { | |
vogelheim
2017/03/15 12:07:41
nitpick: No real problem here, but this logic is a
bradn
2017/03/16 00:21:47
Done.
| |
66 if (Token() == kDouble) { | |
67 PrintF("%lf ", AsDouble()); | |
68 } else if (Token() == kUnsigned) { | |
69 PrintF("%" PRIu64 " ", AsUnsigned()); | |
70 } else { | |
71 std::string name = Name(Token()); | |
72 PrintF("%s ", name.c_str()); | |
73 } | |
74 } | |
75 } | |
76 #endif | |
77 | |
78 preceded_by_newline_ = false; | |
79 preceding_token_ = token_; | |
80 for (;;) { | |
81 uc32 ch = stream_->Advance(); | |
82 switch (ch) { | |
83 case ' ': | |
84 case '\t': | |
85 case '\n': | |
86 case '\r': | |
87 // Ignore whitespace, track when we've passed a newline for optional | |
88 // semicolon support. | |
89 if (ch == '\n') { | |
vogelheim
2017/03/15 12:07:41
nitpick: This is weird. If you have a switch-case
bradn
2017/03/16 00:21:46
Hah, yeah good point (missed that in the refactor)
| |
90 preceded_by_newline_ = true; | |
91 } | |
92 break; | |
93 | |
94 case kEndOfInput: | |
95 token_ = kEndOfInput; | |
96 return; | |
97 | |
98 case '\'': | |
99 case '"': | |
100 ConsumeString(ch); | |
101 return; | |
102 | |
103 case '/': | |
104 ch = stream_->Advance(); | |
105 if (ch == '/') { | |
106 ConsumeCComment(); | |
107 } else if (ch == '*') { | |
108 ConsumeCPPComment(); | |
109 } else { | |
110 stream_->Back(); | |
111 token_ = '/'; | |
112 return; | |
113 } | |
114 // Breaks out of switch, but loops again (i.e. the case when we parsed | |
115 // a comment, but need to continue to look for the next token). | |
116 break; | |
117 | |
118 case '<': | |
119 case '>': | |
120 case '=': | |
121 case '!': | |
122 ConsumeCompareOrShift(ch); | |
123 return; | |
124 | |
125 default: | |
126 if (IsIdentifierStart(ch)) { | |
127 ConsumeIdentifier(ch); | |
128 } else if (IsNumberStart(ch)) { | |
129 ConsumeNumber(ch); | |
130 } else if (ch >= 32 && ch < 127) { | |
vogelheim
2017/03/15 12:07:41
[Not sure this is an issue, but... ]
How many of
bradn
2017/03/16 00:21:47
Listed out the single char ones.
| |
131 // Use fixed token IDs for ASCII. | |
132 token_ = ch; | |
133 } else { | |
134 // TODO(bradnelson): Support unicode (probably via UnicodeCache). | |
135 token_ = kParseError; | |
136 } | |
137 return; | |
138 } | |
139 } | |
140 } | |
141 | |
142 void AsmJsLexer::Rewind() { | |
143 DCHECK(!rewind_); | |
144 next_token_ = token_; | |
145 token_ = preceding_token_; | |
146 preceding_token_ = 0; | |
147 rewind_ = true; | |
148 preceded_by_newline_ = false; | |
149 identifier_string_.clear(); | |
150 } | |
151 | |
152 void AsmJsLexer::ResetLocals() { local_names_.clear(); } | |
153 | |
154 #if DEBUG | |
155 // Only used for debugging. | |
156 std::string AsmJsLexer::Name(token_t token) const { | |
157 // TODO(bradnelson): Make thread safe. | |
158 if (token >= 32 && token < 127) { | |
159 return std::string(1, static_cast<char>(token)); | |
160 } | |
161 for (auto& i : local_names_) { | |
162 if (i.second == token) { | |
163 return i.first.c_str(); | |
Karl
2017/03/15 15:04:13
Why not just:
return i.first;
bradn
2017/03/16 00:21:46
Done.
| |
164 } | |
165 } | |
166 for (auto& i : global_names_) { | |
167 if (i.second == token) { | |
168 return i.first.c_str(); | |
Karl
2017/03/15 15:04:13
Same here.
bradn
2017/03/16 00:21:47
Done.
| |
169 } | |
170 } | |
171 for (auto& i : property_names_) { | |
172 if (i.second == token) { | |
173 return i.first.c_str(); | |
Karl
2017/03/15 15:04:13
Same here.
bradn
2017/03/16 00:21:46
Done.
| |
174 } | |
175 } | |
176 switch (token) { | |
177 #define V(rawname, name) \ | |
178 case kToken_##name: \ | |
179 return rawname; | |
180 LONG_SYMBOL_NAME_LIST(V) | |
181 #undef V | |
182 default: | |
183 break; | |
184 } | |
185 if (token == kUnsigned) { | |
vogelheim
2017/03/15 12:07:40
Why not handle all of these inside the switch righ
bradn
2017/03/16 00:21:46
Done.
| |
186 return "{unsigned value}"; | |
187 } else if (token == kDouble) { | |
188 return "{double value}"; | |
189 } else if (token == kParseError) { | |
190 return "{parse error}"; | |
191 } else if (token == kEndOfInput) { | |
192 return "{end of input}"; | |
193 } | |
194 UNREACHABLE(); | |
195 return "{unreachable}"; | |
196 } | |
197 #endif | |
198 | |
199 int AsmJsLexer::GetPosition() const { return static_cast<int>(stream_->pos()); } | |
vogelheim
2017/03/15 12:07:40
Does this work if rewind_ is set? If not, maybe ad
bradn
2017/03/16 00:21:46
Done.
| |
200 | |
201 void AsmJsLexer::Seek(int pos) { | |
202 stream_->Seek(pos); | |
203 preceding_token_ = 0; | |
204 token_ = 0; | |
205 next_token_ = 0; | |
206 rewind_ = false; | |
207 Next(); | |
208 } | |
209 | |
210 void AsmJsLexer::ConsumeIdentifier(uc32 ch) { | |
211 // Consume characters while still part of the identifier. | |
212 identifier_string_ = ""; | |
vogelheim
2017/03/15 12:07:41
identifier_string_.clear();
(STL is bizarre, but.
bradn
2017/03/16 00:21:46
Yep. Done.
| |
213 while (IsIdentifierPart(ch)) { | |
214 identifier_string_ += ch; | |
215 ch = stream_->Advance(); | |
216 } | |
217 // Go back one for next time. | |
218 stream_->Back(); | |
219 | |
220 // Decode what the identifier means. | |
221 if (preceding_token_ == '.') { | |
222 auto i = property_names_.find(identifier_string_); | |
223 if (i != property_names_.end()) { | |
224 token_ = i->second; | |
225 return; | |
226 } | |
227 } else { | |
228 { | |
229 auto i = local_names_.find(identifier_string_); | |
230 if (i != local_names_.end()) { | |
231 token_ = i->second; | |
232 return; | |
233 } | |
234 } | |
235 if (!in_local_scope_) { | |
236 auto i = global_names_.find(identifier_string_); | |
237 if (i != global_names_.end()) { | |
238 token_ = i->second; | |
239 return; | |
240 } | |
241 } | |
242 } | |
243 if (preceding_token_ == '.') { | |
244 CHECK(global_count_ < kMaxIdentifierCount); | |
245 token_ = kGlobalsStart + global_count_++; | |
246 property_names_[identifier_string_] = token_; | |
247 } else if (in_local_scope_) { | |
248 CHECK(local_names_.size() < kMaxIdentifierCount); | |
249 token_ = kLocalsStart - static_cast<token_t>(local_names_.size()); | |
250 local_names_[identifier_string_] = token_; | |
251 } else { | |
252 CHECK(global_count_ < kMaxIdentifierCount); | |
253 token_ = kGlobalsStart + global_count_++; | |
254 global_names_[identifier_string_] = token_; | |
255 } | |
256 } | |
257 | |
258 void AsmJsLexer::ConsumeNumber(uc32 ch) { | |
259 std::string number; | |
260 number = ch; | |
261 bool has_dot = ch == '.'; | |
262 for (;;) { | |
263 ch = stream_->Advance(); | |
264 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || | |
265 (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' || | |
266 ((ch == '-' || ch == '+') && (number[number.size() - 1] == 'e' || | |
267 number[number.size() - 1] == 'E'))) { | |
268 // TODO(bradnelson): Test weird cases ending in -. | |
269 if (ch == '.') { | |
270 has_dot = true; | |
271 } | |
272 number += ch; | |
273 } else { | |
274 break; | |
275 } | |
276 } | |
277 stream_->Back(); | |
278 // Special case the most common number. | |
279 if (number == "0") { | |
280 unsigned_value_ = 0; | |
281 token_ = kUnsigned; | |
282 return; | |
283 } | |
284 // Pick out dot. | |
285 if (number == ".") { | |
286 token_ = '.'; | |
287 return; | |
288 } | |
289 // Decode numbers. | |
290 // TODO(bradnelson): Replace strto* with shared code with scanner.cc | |
291 char* end; | |
292 if (has_dot) { | |
293 double_value_ = strtod(number.c_str(), &end); | |
294 token_ = kDouble; | |
295 } else { | |
296 if (number.size() > 2 && number[0] == '0' && number[1] == 'x') { | |
297 // Decode 0x* as hex. | |
298 unsigned_value_ = strtoul(number.c_str() + 2, &end, 16); | |
299 } else if (number.size() > 1 && number[0] == '0') { | |
300 // Decode 0* as octal. | |
301 unsigned_value_ = strtoul(number.c_str() + 1, &end, 8); | |
302 } else { | |
303 // Decode the rest as double. | |
304 // This can come up in asm.js as for example 1e2 is used to encode 100. | |
305 double_value_ = strtod(number.c_str(), &end); | |
306 unsigned_value_ = static_cast<uint32_t>(double_value_); | |
307 } | |
308 token_ = kUnsigned; | |
309 } | |
310 // Check if string to number conversion didn't consume all the characters. | |
311 // This happens if the character filter let through something invalid | |
312 // like: 0123ef for example. | |
313 // TODO(bradnelson): Check if this happens often enough to be a perf problem. | |
314 if (end != number.c_str() + number.size()) { | |
315 // If things didn't parse fully, but start with a '.', back out the other | |
316 // characters and emit the '.' token. | |
317 if (number[0] == '.') { | |
318 for (size_t k = 1; k < number.size(); ++k) { | |
319 stream_->Back(); | |
320 } | |
321 token_ = '.'; | |
322 return; | |
323 } | |
324 // Anything else that doesn't parse is an error. | |
325 token_ = kParseError; | |
326 return; | |
327 } | |
328 } | |
329 | |
330 void AsmJsLexer::ConsumeCComment() { | |
331 for (;;) { | |
332 uc32 ch = stream_->Advance(); | |
333 if (ch == '\n' || ch == kEndOfInput) { | |
334 break; | |
335 } | |
336 } | |
337 } | |
338 | |
339 void AsmJsLexer::ConsumeCPPComment() { | |
340 for (;;) { | |
341 uc32 ch = stream_->Advance(); | |
342 if (ch == '*') { | |
vogelheim
2017/03/15 12:07:41
Your choice, but I think this if-branch would be a
bradn
2017/03/16 00:21:46
Ah, yeah, that's better.
Done.
| |
343 ch = stream_->Advance(); | |
344 if (ch == '/') { | |
345 break; | |
346 } | |
347 if (ch == '*') { | |
348 stream_->Back(); | |
349 } | |
350 } else if (ch == kEndOfInput) { | |
vogelheim
2017/03/15 12:07:40
I think this potentially swallows a syntax error w
bradn
2017/03/16 00:21:46
Ah, yes.
Fixed and added a test.
| |
351 break; | |
352 } | |
353 } | |
354 } | |
355 | |
356 void AsmJsLexer::ConsumeString(uc32 quote) { | |
357 // Only string allowed is 'use asm' / "use asm". | |
358 const char* expected = "use asm"; | |
359 for (; *expected != '\0'; ++expected) { | |
360 if (stream_->Advance() != *expected) { | |
361 token_ = kParseError; | |
362 return; | |
363 } | |
364 } | |
365 if (stream_->Advance() != quote) { | |
366 token_ = kParseError; | |
367 return; | |
368 } | |
369 token_ = kToken_UseAsm; | |
370 } | |
371 | |
372 void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) { | |
373 uc32 next_ch = stream_->Advance(); | |
374 if (next_ch == '=') { | |
375 switch (ch) { | |
376 case '<': | |
377 token_ = kToken_LE; | |
378 break; | |
379 case '>': | |
380 token_ = kToken_GE; | |
381 break; | |
382 case '=': | |
383 token_ = kToken_EQ; | |
384 break; | |
385 case '!': | |
386 token_ = kToken_NE; | |
387 break; | |
388 default: | |
389 UNREACHABLE(); | |
390 } | |
391 } else if (ch == '<' && next_ch == '<') { | |
392 token_ = kToken_SHL; | |
393 } else if (ch == '>' && next_ch == '>') { | |
394 if (stream_->Advance() == '>') { | |
395 token_ = kToken_SHR; | |
396 } else { | |
397 token_ = kToken_SAR; | |
398 stream_->Back(); | |
399 } | |
400 } else { | |
401 stream_->Back(); | |
402 token_ = ch; | |
403 } | |
404 } | |
405 | |
406 bool AsmJsLexer::IsIdentifierStart(uc32 ch) { | |
407 return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || | |
408 ch == '$'; | |
409 } | |
410 | |
411 bool AsmJsLexer::IsIdentifierPart(uc32 ch) { | |
412 return IsIdentifierStart(ch) || (ch >= '0' && ch <= '9'); | |
413 } | |
414 | |
415 bool AsmJsLexer::IsNumberStart(uc32 ch) { | |
416 return ch == '.' || (ch >= '0' && ch <= '9'); | |
417 } | |
418 | |
419 } // namespace internal | |
420 } // namespace v8 | |
OLD | NEW |