Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(92)

Side by Side Diff: src/asmjs/asm-lexer.cc

Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer. (Closed)
Patch Set: check Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/asmjs/asm-lexer.h"
6
7 #include <stdlib.h>
8
9 #include "src/objects.h"
marja 2017/03/15 12:34:49 Why is objects.h needed?
bradn 2017/03/16 00:21:46 There was a Handle<String> used in scanner.h inlin
10 #include "src/parsing/scanner-character-streams.h"
vogelheim 2017/03/15 12:07:41 I don't see scanner-character-streams.h being used
bradn 2017/03/16 00:21:47 Dropped.
11 #include "src/parsing/scanner.h"
marja 2017/03/15 12:34:49 Hmm, you're still including scanner.h even though
bradn 2017/03/16 00:21:47 That was in the header. This is needed here becaus
marja 2017/03/16 17:05:33 My orig. comment suggested moving the streams out
12
13 namespace v8 {
14 namespace internal {
15
namespace {

// Cap number of identifiers to ensure we can assign both global and
// local ones a token id in the range of an int32_t.
// (Entities in an anonymous namespace already have internal linkage, so no
// `static` is needed, and a namespace is not terminated by a semicolon.)
const int kMaxIdentifierCount = 0xf000000;

}  // namespace
21
22 AsmJsLexer::AsmJsLexer()
23 : token_(0),
24 preceding_token_(0),
25 next_token_(0),
26 rewind_(false),
27 in_local_scope_(false),
28 global_count_(0),
29 double_value_(0.0),
30 unsigned_value_(0),
31 preceded_by_newline_(false) {
32 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
33 STDLIB_MATH_FUNCTION_LIST(V)
34 STDLIB_ARRAY_TYPE_LIST(V)
35 #undef V
36 #define V(name) property_names_[#name] = kToken_##name;
37 STDLIB_MATH_VALUE_LIST(V)
38 STDLIB_OTHER_LIST(V)
39 #undef V
40 #define V(name) global_names_[#name] = kToken_##name;
41 KEYWORD_NAME_LIST(V)
42 #undef V
43 }
44
45 void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) {
46 stream_ = std::move(stream);
47 Next();
48 }
49
50 void AsmJsLexer::Next() {
vogelheim 2017/03/15 12:07:40 I find this method much nicer to read now. Thanks.
bradn 2017/03/16 00:21:47 :-)
51 if (rewind_) {
52 preceding_token_ = token_;
53 token_ = next_token_;
54 next_token_ = 0;
55 rewind_ = false;
56 return;
57 }
58
59 if (token_ == kEndOfInput || token_ == kParseError) {
60 return;
61 }
62
63 #if DEBUG
64 if (FLAG_trace_asm_lexer) {
65 if (Token() != 0) {
vogelheim 2017/03/15 12:07:41 nitpick: No real problem here, but this logic is a
bradn 2017/03/16 00:21:47 Done.
66 if (Token() == kDouble) {
67 PrintF("%lf ", AsDouble());
68 } else if (Token() == kUnsigned) {
69 PrintF("%" PRIu64 " ", AsUnsigned());
70 } else {
71 std::string name = Name(Token());
72 PrintF("%s ", name.c_str());
73 }
74 }
75 }
76 #endif
77
78 preceded_by_newline_ = false;
79 preceding_token_ = token_;
80 for (;;) {
81 uc32 ch = stream_->Advance();
82 switch (ch) {
83 case ' ':
84 case '\t':
85 case '\n':
86 case '\r':
87 // Ignore whitespace, track when we've passed a newline for optional
88 // semicolon support.
89 if (ch == '\n') {
vogelheim 2017/03/15 12:07:41 nitpick: This is weird. If you have a switch-case
bradn 2017/03/16 00:21:46 Hah, yeah good point (missed that in the refactor)
90 preceded_by_newline_ = true;
91 }
92 break;
93
94 case kEndOfInput:
95 token_ = kEndOfInput;
96 return;
97
98 case '\'':
99 case '"':
100 ConsumeString(ch);
101 return;
102
103 case '/':
104 ch = stream_->Advance();
105 if (ch == '/') {
106 ConsumeCComment();
107 } else if (ch == '*') {
108 ConsumeCPPComment();
109 } else {
110 stream_->Back();
111 token_ = '/';
112 return;
113 }
114 // Breaks out of switch, but loops again (i.e. the case when we parsed
115 // a comment, but need to continue to look for the next token).
116 break;
117
118 case '<':
119 case '>':
120 case '=':
121 case '!':
122 ConsumeCompareOrShift(ch);
123 return;
124
125 default:
126 if (IsIdentifierStart(ch)) {
127 ConsumeIdentifier(ch);
128 } else if (IsNumberStart(ch)) {
129 ConsumeNumber(ch);
130 } else if (ch >= 32 && ch < 127) {
vogelheim 2017/03/15 12:07:41 [Not sure this is an issue, but... ] How many of
bradn 2017/03/16 00:21:47 Listed out the single char ones.
131 // Use fixed token IDs for ASCII.
132 token_ = ch;
133 } else {
134 // TODO(bradnelson): Support unicode (probably via UnicodeCache).
135 token_ = kParseError;
136 }
137 return;
138 }
139 }
140 }
141
142 void AsmJsLexer::Rewind() {
143 DCHECK(!rewind_);
144 next_token_ = token_;
145 token_ = preceding_token_;
146 preceding_token_ = 0;
147 rewind_ = true;
148 preceded_by_newline_ = false;
149 identifier_string_.clear();
150 }
151
152 void AsmJsLexer::ResetLocals() { local_names_.clear(); }
153
#if DEBUG
// Only used for debugging.
// Returns a printable name for |token|: the character itself for printable
// ASCII tokens, the identifier text for tokens found in the local, global or
// property name tables (searched in that order), the raw spelling for
// multi-character symbols, or a "{...}" placeholder for the special tokens.
std::string AsmJsLexer::Name(token_t token) const {
  // TODO(bradnelson): Make thread safe.
  if (token >= 32 && token < 127) {
    return std::string(1, static_cast<char>(token));
  }
  // The tables map name -> token, so the reverse lookup is a linear scan.
  // Return the key directly rather than round-tripping through c_str().
  for (const auto& entry : local_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  for (const auto& entry : global_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  for (const auto& entry : property_names_) {
    if (entry.second == token) {
      return entry.first;
    }
  }
  switch (token) {
#define V(rawname, name) \
  case kToken_##name:    \
    return rawname;
    LONG_SYMBOL_NAME_LIST(V)
#undef V
    default:
      break;
  }
  if (token == kUnsigned) {
    return "{unsigned value}";
  }
  if (token == kDouble) {
    return "{double value}";
  }
  if (token == kParseError) {
    return "{parse error}";
  }
  if (token == kEndOfInput) {
    return "{end of input}";
  }
  UNREACHABLE();
  return "{unreachable}";
}
#endif
198
199 int AsmJsLexer::GetPosition() const { return static_cast<int>(stream_->pos()); }
vogelheim 2017/03/15 12:07:40 Does this work if rewind_ is set? If not, maybe ad
bradn 2017/03/16 00:21:46 Done.
200
201 void AsmJsLexer::Seek(int pos) {
202 stream_->Seek(pos);
203 preceding_token_ = 0;
204 token_ = 0;
205 next_token_ = 0;
206 rewind_ = false;
207 Next();
208 }
209
210 void AsmJsLexer::ConsumeIdentifier(uc32 ch) {
211 // Consume characters while still part of the identifier.
212 identifier_string_ = "";
vogelheim 2017/03/15 12:07:41 identifier_string_.clear(); (STL is bizarre, but.
bradn 2017/03/16 00:21:46 Yep. Done.
213 while (IsIdentifierPart(ch)) {
214 identifier_string_ += ch;
215 ch = stream_->Advance();
216 }
217 // Go back one for next time.
218 stream_->Back();
219
220 // Decode what the identifier means.
221 if (preceding_token_ == '.') {
222 auto i = property_names_.find(identifier_string_);
223 if (i != property_names_.end()) {
224 token_ = i->second;
225 return;
226 }
227 } else {
228 {
229 auto i = local_names_.find(identifier_string_);
230 if (i != local_names_.end()) {
231 token_ = i->second;
232 return;
233 }
234 }
235 if (!in_local_scope_) {
236 auto i = global_names_.find(identifier_string_);
237 if (i != global_names_.end()) {
238 token_ = i->second;
239 return;
240 }
241 }
242 }
243 if (preceding_token_ == '.') {
244 CHECK(global_count_ < kMaxIdentifierCount);
245 token_ = kGlobalsStart + global_count_++;
246 property_names_[identifier_string_] = token_;
247 } else if (in_local_scope_) {
248 CHECK(local_names_.size() < kMaxIdentifierCount);
249 token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
250 local_names_[identifier_string_] = token_;
251 } else {
252 CHECK(global_count_ < kMaxIdentifierCount);
253 token_ = kGlobalsStart + global_count_++;
254 global_names_[identifier_string_] = token_;
255 }
256 }
257
258 void AsmJsLexer::ConsumeNumber(uc32 ch) {
259 std::string number;
260 number = ch;
261 bool has_dot = ch == '.';
262 for (;;) {
263 ch = stream_->Advance();
264 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
265 (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' ||
266 ((ch == '-' || ch == '+') && (number[number.size() - 1] == 'e' ||
267 number[number.size() - 1] == 'E'))) {
268 // TODO(bradnelson): Test weird cases ending in -.
269 if (ch == '.') {
270 has_dot = true;
271 }
272 number += ch;
273 } else {
274 break;
275 }
276 }
277 stream_->Back();
278 // Special case the most common number.
279 if (number == "0") {
280 unsigned_value_ = 0;
281 token_ = kUnsigned;
282 return;
283 }
284 // Pick out dot.
285 if (number == ".") {
286 token_ = '.';
287 return;
288 }
289 // Decode numbers.
290 // TODO(bradnelson): Replace strto* with shared code with scanner.cc
291 char* end;
292 if (has_dot) {
293 double_value_ = strtod(number.c_str(), &end);
294 token_ = kDouble;
295 } else {
296 if (number.size() > 2 && number[0] == '0' && number[1] == 'x') {
297 // Decode 0x* as hex.
298 unsigned_value_ = strtoul(number.c_str() + 2, &end, 16);
299 } else if (number.size() > 1 && number[0] == '0') {
300 // Decode 0* as octal.
301 unsigned_value_ = strtoul(number.c_str() + 1, &end, 8);
302 } else {
303 // Decode the rest as double.
304 // This can come up in asm.js as for example 1e2 is used to encode 100.
305 double_value_ = strtod(number.c_str(), &end);
306 unsigned_value_ = static_cast<uint32_t>(double_value_);
307 }
308 token_ = kUnsigned;
309 }
310 // Check if string to number conversion didn't consume all the characters.
311 // This happens if the character filter let through something invalid
312 // like: 0123ef for example.
313 // TODO(bradnelson): Check if this happens often enough to be a perf problem.
314 if (end != number.c_str() + number.size()) {
315 // If things didn't parse fully, but start with a '.', back out the other
316 // characters and emit the '.' token.
317 if (number[0] == '.') {
318 for (size_t k = 1; k < number.size(); ++k) {
319 stream_->Back();
320 }
321 token_ = '.';
322 return;
323 }
324 // Anything else that doesn't parse is an error.
325 token_ = kParseError;
326 return;
327 }
328 }
329
330 void AsmJsLexer::ConsumeCComment() {
331 for (;;) {
332 uc32 ch = stream_->Advance();
333 if (ch == '\n' || ch == kEndOfInput) {
334 break;
335 }
336 }
337 }
338
339 void AsmJsLexer::ConsumeCPPComment() {
340 for (;;) {
341 uc32 ch = stream_->Advance();
342 if (ch == '*') {
vogelheim 2017/03/15 12:07:41 Your choice, but I think this if-branch would be a
bradn 2017/03/16 00:21:46 Ah, yeah, that's better. Done.
343 ch = stream_->Advance();
344 if (ch == '/') {
345 break;
346 }
347 if (ch == '*') {
348 stream_->Back();
349 }
350 } else if (ch == kEndOfInput) {
vogelheim 2017/03/15 12:07:40 I think this potentially swallows a syntax error w
bradn 2017/03/16 00:21:46 Ah, yes. Fixed and added a test.
351 break;
352 }
353 }
354 }
355
356 void AsmJsLexer::ConsumeString(uc32 quote) {
357 // Only string allowed is 'use asm' / "use asm".
358 const char* expected = "use asm";
359 for (; *expected != '\0'; ++expected) {
360 if (stream_->Advance() != *expected) {
361 token_ = kParseError;
362 return;
363 }
364 }
365 if (stream_->Advance() != quote) {
366 token_ = kParseError;
367 return;
368 }
369 token_ = kToken_UseAsm;
370 }
371
372 void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) {
373 uc32 next_ch = stream_->Advance();
374 if (next_ch == '=') {
375 switch (ch) {
376 case '<':
377 token_ = kToken_LE;
378 break;
379 case '>':
380 token_ = kToken_GE;
381 break;
382 case '=':
383 token_ = kToken_EQ;
384 break;
385 case '!':
386 token_ = kToken_NE;
387 break;
388 default:
389 UNREACHABLE();
390 }
391 } else if (ch == '<' && next_ch == '<') {
392 token_ = kToken_SHL;
393 } else if (ch == '>' && next_ch == '>') {
394 if (stream_->Advance() == '>') {
395 token_ = kToken_SHR;
396 } else {
397 token_ = kToken_SAR;
398 stream_->Back();
399 }
400 } else {
401 stream_->Back();
402 token_ = ch;
403 }
404 }
405
406 bool AsmJsLexer::IsIdentifierStart(uc32 ch) {
407 return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' ||
408 ch == '$';
409 }
410
411 bool AsmJsLexer::IsIdentifierPart(uc32 ch) {
412 return IsIdentifierStart(ch) || (ch >= '0' && ch <= '9');
413 }
414
415 bool AsmJsLexer::IsNumberStart(uc32 ch) {
416 return ch == '.' || (ch >= '0' && ch <= '9');
417 }
418
419 } // namespace internal
420 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698