Chromium Code Reviews| Index: src/asmjs/asm-lexer.cc | 
| diff --git a/src/asmjs/asm-lexer.cc b/src/asmjs/asm-lexer.cc | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..4970cc860fba490cd30cebf3b6d913a46b1838c3 | 
| --- /dev/null | 
| +++ b/src/asmjs/asm-lexer.cc | 
| @@ -0,0 +1,346 @@ | 
| +// Copyright 2017 the V8 project authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +#include "src/asmjs/asm-lexer.h" | 
| + | 
| +#include <stdlib.h> | 
| + | 
| +#include "src/objects-inl.h" | 
| +#include "src/parsing/scanner-character-streams.h" | 
| +#include "src/parsing/scanner.h" | 
| + | 
| +namespace v8 { | 
| +namespace internal { | 
| + | 
| +AsmJsLexer::AsmJsLexer(Isolate* isolate, Handle<Script> script, int start, | 
| + int end) | 
| + : script_(script), | 
| + source_(String::cast(script->source()), isolate), | 
| + stream_(ScannerStream::For(source_, start, end)), | 
| + token_(0), | 
| + last_token_(0), | 
| + next_token_(0), | 
| + rewind_(false), | 
| + local_(false), | 
| + global_count_(0), | 
| + double_value_(0.0), | 
| + unsigned_value_(0), | 
| + preceeded_by_newline_(false) { | 
| +#define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name; | 
| + STDLIB_MATH_FUNCTION_LIST(V) | 
| + STDLIB_ARRAY_TYPE_LIST(V) | 
| +#undef V | 
| +#define V(name) property_names_[#name] = kToken_##name; | 
| + STDLIB_MATH_VALUE_LIST(V) | 
| + STDLIB_OTHER_LIST(V) | 
| +#undef V | 
| +#define V(name) global_names_[#name] = kToken_##name; | 
| + KEYWORD_NAME_LIST(V) | 
| +#undef V | 
| + Next(); | 
| +} | 
| + | 
| +void AsmJsLexer::Next() { | 
| + if (rewind_) { | 
| + last_token_ = token_; | 
| + token_ = next_token_; | 
| + next_token_ = 0; | 
| + rewind_ = false; | 
| + return; | 
| + } | 
| + | 
| + if (token_ == kEndOfInput || token_ == kParseError) { | 
| + return; | 
| + } | 
| + | 
| +#if 0 | 
| 
 
vogelheim
2017/03/14 13:36:37
Please don't do this.
 
bradn
2017/03/15 07:53:03
Changed to a trace flag.
This ends up being useful
 
 | 
| + // Uncomment for debug raw token stream. | 
| + if (Token() != 0) { | 
| + if (Token() == kDouble) { | 
| + fprintf(stderr, "%lf ", AsDouble()); | 
| + } else if (Token() == kUnsigned) { | 
| + fprintf(stderr, "%lu ", AsUnsigned()); | 
| + } else { | 
| + fprintf(stderr, "%s ", Name(Token())); | 
| + } | 
| + } | 
| +#endif | 
| + | 
| + preceeded_by_newline_ = false; | 
| + last_token_ = token_; | 
| + for (;;) { | 
| + token_t ch = stream_->Advance(); | 
| 
 
vogelheim
2017/03/14 13:36:37
(Here & below.) Using token_t for individual chara
 
bradn
2017/03/15 07:53:02
Switched all of these inside to uc32.
 
 | 
| + if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') { | 
| 
 
Karl
2017/03/14 18:00:47
Would a switch statement be cleaner here?
 
bradn
2017/03/15 07:53:02
Done.
 
 | 
| + // Skip whitespace. | 
| + if (ch == '\n') { | 
| + preceeded_by_newline_ = true; | 
| + } | 
| + continue; | 
| + } else if (ch == kEndOfInput) { | 
| 
 
marja
2017/03/14 11:11:47
(general comment) The structure is getting a bit c
 
bradn
2017/03/15 07:53:03
Decomposed into more functions, hope that helps.
 
 | 
| + token_ = kEndOfInput; | 
| + break; | 
| 
 
marja
2017/03/14 11:11:47
Why break, why not return? (Now it's not trivial t
 
bradn
2017/03/15 07:53:03
Redone in more functions, avoids the break.
 
 | 
| + } else if (ch < 32 || ch >= 127) { | 
| 
 
Karl
2017/03/14 18:00:47
If you use a switch statement, either explicitly e
 
bradn
2017/03/15 07:53:02
Done.
 
 | 
| + // Disallow non-ascii for now. | 
| + token_ = kParseError; | 
| + break; | 
| + } else if (ch == '\'' || ch == '\"') { | 
| + // Only string allowed is 'use asm' / "use asm". | 
| + const char* use_asm = "use asm"; | 
| 
 
Karl
2017/03/14 18:00:47
Should this be a constexpr?
 
bradn
2017/03/15 07:53:03
Changed round.
 
 | 
| + const char* pos = use_asm; | 
| + while (*pos) { | 
| + token_t och = stream_->Advance(); | 
| 
 
vogelheim
2017/03/14 13:36:37
och ? [here & below]
 
bradn
2017/03/15 07:53:02
Renamed and refactored.
 
 | 
| + if (och != *pos) { | 
| + token_ = kParseError; | 
| + return; | 
| + } | 
| + ++pos; | 
| + } | 
| + token_t och = stream_->Advance(); | 
| + if (och != ch) { | 
| 
 
marja
2017/03/14 11:11:47
Lost here... what's this?
Ahh, it's checking the
 
bradn
2017/03/15 07:53:02
renamed variable to highlight that.
 
 | 
| + token_ = kParseError; | 
| + break; | 
| + } | 
| + token_ = kToken_UseAsm; | 
| + break; | 
| + } else if (ch == '/') { | 
| + ch = stream_->Advance(); | 
| + if (ch == '/') { | 
| + for (;;) { | 
| + ch = stream_->Advance(); | 
| + if (ch == '\n' || ch == kEndOfInput) { | 
| + break; | 
| + } | 
| + } | 
| + continue; | 
| + } else if (ch == '*') { | 
| + for (;;) { | 
| + ch = stream_->Advance(); | 
| + if (ch == '*') { | 
| + ch = stream_->Advance(); | 
| + if (ch == '/') { | 
| 
 
vogelheim
2017/03/14 13:36:37
+1 to Marja's comments. 
Also, would this work on
 
bradn
2017/03/15 07:53:02
Yeah, this was wrong. Factor to function and fixed
 
 | 
| + if (ch == '*') { | 
| 
 
marja
2017/03/14 11:11:47
if ch == '/' on the line above, it cannot be '*' h
 
bradn
2017/03/15 07:53:02
Oops, fixed.
 
 | 
| + stream_->Back(); | 
| + } | 
| + break; | 
| 
 
marja
2017/03/14 11:11:47
I'm lost here anyway, what's this block, what are
 
bradn
2017/03/15 07:53:02
This was meant to back up if you saw a * inside a
 
 | 
| + } | 
| + } else if (ch == kEndOfInput) { | 
| + break; | 
| + } | 
| + } | 
| + continue; | 
| + } else { | 
| 
 
marja
2017/03/14 11:11:47
No idea here anymore which if this else associates
 
bradn
2017/03/15 07:53:03
Restructured, should be more clear now.
 
 | 
| + stream_->Back(); | 
| + token_ = '/'; | 
| + break; | 
| + } | 
| + } else if (ch == '<' || ch == '>' || ch == '=' || ch == '!') { | 
| + token_t och = stream_->Advance(); | 
| + if (och == '=') { | 
| + if (ch == '<') { | 
| + token_ = kToken_LE; | 
| + break; | 
| + } else if (ch == '>') { | 
| + token_ = kToken_GE; | 
| + break; | 
| + } else if (ch == '=') { | 
| + token_ = kToken_EQ; | 
| + break; | 
| + } else if (ch == '!') { | 
| + token_ = kToken_NE; | 
| + break; | 
| + } else { | 
| + UNREACHABLE(); | 
| + } | 
| + } else if (ch == '<' && och == '<') { | 
| + token_ = kToken_SHL; | 
| + break; | 
| + } else if (ch == '>' && och == '>') { | 
| + token_t ooch = stream_->Advance(); | 
| 
 
vogelheim
2017/03/14 13:36:37
ooch ?
 
bradn
2017/03/15 07:53:02
Hah, terrible name, sorry, dropped variable comple
 
 | 
| + if (ooch == '>') { | 
| + token_ = kToken_SHR; | 
| + } else { | 
| + token_ = kToken_SAR; | 
| + stream_->Back(); | 
| + } | 
| + break; | 
| + } else { | 
| + stream_->Back(); | 
| + token_ = ch; | 
| + break; | 
| + } | 
| + } else if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || | 
| + ch == '_' || ch == '$') { | 
| 
 
vogelheim
2017/03/14 13:36:37
Could you introduce helper functions for the chara
 
bradn
2017/03/15 07:53:02
Done.
 
 | 
| + name_ = ch; | 
| + for (;;) { | 
| + ch = stream_->Advance(); | 
| + if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || | 
| + ch == '$' || (ch >= '0' && ch <= '9')) { | 
| + name_ += ch; | 
| + } else { | 
| + break; | 
| 
 
marja
2017/03/14 11:11:47
Why not
while(ch >= ...) {
  name_ += ch;
  ch =
 
bradn
2017/03/15 07:53:03
Done.
 
 | 
| + } | 
| + } | 
| + stream_->Back(); | 
| + if (last_token_ == '.') { | 
| + auto i = property_names_.find(name_); | 
| + if (i != property_names_.end()) { | 
| + token_ = i->second; | 
| + break; | 
| 
 
marja
2017/03/14 11:11:47
E.g,. here it would be less confusing to use retur
 
bradn
2017/03/15 07:53:03
Done.
 
 | 
| + } | 
| + } else { | 
| + { | 
| + auto i = local_names_.find(name_); | 
| + if (i != local_names_.end()) { | 
| + token_ = i->second; | 
| + break; | 
| + } | 
| + } | 
| + if (!local_) { | 
| 
 
marja
2017/03/14 11:11:47
What's local_?
 
bradn
2017/03/15 07:53:03
Renamed.
 
 | 
| + auto i = global_names_.find(name_); | 
| + if (i != global_names_.end()) { | 
| + token_ = i->second; | 
| + break; | 
| + } | 
| + } | 
| + } | 
| + if (last_token_ == '.') { | 
| + // TODO(bradnelson): Assert no overflow. | 
| + token_ = kGlobalsStart + global_count_++; | 
| + property_names_[name_] = token_; | 
| + } else if (local_) { | 
| + // TODO(bradnelson): Assert no overflow. | 
| + token_ = kLocalsStart - local_names_.size(); | 
| + local_names_[name_] = token_; | 
| + } else { | 
| + // TODO(bradnelson): Assert no overflow. | 
| + token_ = kGlobalsStart + global_count_++; | 
| + global_names_[name_] = token_; | 
| + } | 
| + break; | 
| + } else if (ch == '.' || (ch >= '0' && ch <= '9')) { | 
| + bool has_dot = ch == '.'; | 
| + name_ = ch; | 
| + for (;;) { | 
| + ch = stream_->Advance(); | 
| + if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || | 
| 
 
marja
2017/03/14 11:11:47
Would it be feasible to have a helper function for
 
bradn
2017/03/15 07:53:03
I've added a TODO to do this.
Might require some c
 
 | 
| + (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' || | 
| + ((ch == '-' || ch == '+') && (name_[name_.size() - 1] == 'e' || | 
| + name_[name_.size() - 1] == 'E'))) { | 
| + // TODO(bradnelson): Test weird cases ending in -. | 
| + if (ch == '.') { | 
| + has_dot = true; | 
| + } | 
| + name_ += ch; | 
| + } else { | 
| + break; | 
| + } | 
| + } | 
| + stream_->Back(); | 
| + // Special case the most common number. | 
| + if (name_ == "0") { | 
| + unsigned_value_ = 0; | 
| + token_ = kUnsigned; | 
| + break; | 
| + } | 
| + // Pick out dot. | 
| + if (name_ == ".") { | 
| + token_ = '.'; | 
| + break; | 
| + } | 
| + // Decode numbers. | 
| + char* end; | 
| + if (has_dot) { | 
| + double_value_ = strtod(name_.c_str(), &end); | 
| 
 
vogelheim
2017/03/14 13:36:37
strtod may depend on the current locale. Are you r
 
bradn
2017/03/15 07:53:03
Yeah, it's a fair point these probably aren't idea
 
vogelheim
2017/03/15 12:07:40
Note that this is a correctness, not a performance
 
vogelheim
2017/03/15 12:10:11
My gut feeling is that parser & scanner are tightl
 
 | 
| + token_ = kDouble; | 
| + } else { | 
| + if (name_.size() > 2 && name_[0] == '0' && name_[1] == 'x') { | 
| + unsigned_value_ = strtoul(name_.c_str() + 2, &end, 16); | 
| + } else if (name_.size() > 1 && name_[0] == '0') { | 
| + unsigned_value_ = strtoul(name_.c_str() + 1, &end, 8); | 
| + } else { | 
| + double_value_ = strtod(name_.c_str(), &end); | 
| + unsigned_value_ = static_cast<uint32_t>(double_value_); | 
| 
 
marja
2017/03/14 11:11:47
Why strtod if it's guaranteed to be an integer (no
 
bradn
2017/03/15 07:53:02
Asm.js uses 1e2 for 100 (as an integer :-)
Added a
 
 | 
| + } | 
| + token_ = kUnsigned; | 
| + } | 
| + if (end != name_.c_str() + name_.size()) { | 
| 
 
vogelheim
2017/03/14 13:36:37
I'm confused. When does this happen?
 
bradn
2017/03/15 07:53:03
When a number failed to parse, added a comment + e
 
 | 
| + // Handle mistaken parse of '.' as number. | 
| 
 
marja
2017/03/14 11:11:47
How does this relate to the "Pick out dot" above?
 
bradn
2017/03/15 07:53:02
Reworded.
The idea here is that if the number pars
 
 | 
| + if (name_[0] == '.') { | 
| + for (size_t k = 1; k < name_.size(); ++k) { | 
| + stream_->Back(); | 
| + } | 
| + token_ = '.'; | 
| + break; | 
| + } | 
| + token_ = kParseError; | 
| + return; | 
| + } | 
| + break; | 
| + } else { | 
| + token_ = ch; | 
| + break; | 
| + } | 
| + } | 
| +} | 
| + | 
| +void AsmJsLexer::Rewind() { | 
| + DCHECK(!rewind_); | 
| + next_token_ = token_; | 
| + token_ = last_token_; | 
| + last_token_ = 0; | 
| + rewind_ = true; | 
| 
 
vogelheim
2017/03/14 13:36:37
This doesn't update name_. Is this intentional?
 
vogelheim
2017/03/14 13:36:37
This doesn't update preceeded_by_newline_. Is this
 
bradn
2017/03/15 07:53:02
Clearing it for good measure here (didn't want to
 
bradn
2017/03/15 07:53:03
Clobbering for good measure here, also commented a
 
 | 
| +} | 
| + | 
| +void AsmJsLexer::ResetLocals() { local_names_.clear(); } | 
| + | 
| +const char* AsmJsLexer::Name(token_t token) const { | 
| + // TODO(bradnelson): Make thread safe (and maybe debug only). | 
| + if (token >= 32 && token < 127) { | 
| + static char chname[2]; | 
| + chname[0] = static_cast<char>(token); | 
| 
 
vogelheim
2017/03/14 13:36:37
chname[1] = '\0' ??
 
bradn
2017/03/15 07:53:02
Done.
Whoops.
 
 | 
| + return chname; | 
| + } | 
| + for (auto i = local_names_.begin(); i != local_names_.end(); ++i) { | 
| 
 
vogelheim
2017/03/14 13:36:37
style nitpick: I'd use the for(auto& i : local_nam
 
bradn
2017/03/15 07:53:03
Done.
 
 | 
| + if (i->second == token) { | 
| + return i->first.c_str(); | 
| + } | 
| + } | 
| + for (auto i = global_names_.begin(); i != global_names_.end(); ++i) { | 
| + if (i->second == token) { | 
| + return i->first.c_str(); | 
| + } | 
| + } | 
| + for (auto i = property_names_.begin(); i != property_names_.end(); ++i) { | 
| + if (i->second == token) { | 
| + return i->first.c_str(); | 
| + } | 
| + } | 
| + switch (token) { | 
| +#define V(rawname, name) \ | 
| + case kToken_##name: \ | 
| + return rawname; | 
| + LONG_SYMBOL_NAME_LIST(V) | 
| +#undef V | 
| + default: | 
| + break; | 
| + } | 
| + if (token == kUnsigned) { | 
| + return "{unsigned value}"; | 
| + } else if (token == kDouble) { | 
| + return "{double value}"; | 
| + } else if (token == kParseError) { | 
| + return "{parse error}"; | 
| + } else if (token == kEndOfInput) { | 
| + return "{end of input}"; | 
| + } | 
| + UNREACHABLE(); | 
| + return "{unreachable}"; | 
| +} | 
| + | 
| +int AsmJsLexer::position() const { return static_cast<int>(stream_->pos()); } | 
| + | 
| +void AsmJsLexer::Seek(int pos) { stream_->Seek(pos); } | 
| + | 
| +} // namespace internal | 
| +} // namespace v8 |