src/asmjs/asm-lexer.cc - Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer.

Unified Diff: src/asmjs/asm-lexer.cc

Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer. (Closed)

Patch Set: fix warning Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/asmjs/asm-lexer.cc

diff --git a/src/asmjs/asm-lexer.cc b/src/asmjs/asm-lexer.cc

new file mode 100644

index 0000000000000000000000000000000000000000..4970cc860fba490cd30cebf3b6d913a46b1838c3

--- /dev/null

+++ b/src/asmjs/asm-lexer.cc

@@ -0,0 +1,346 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "src/asmjs/asm-lexer.h"

+#include <stdlib.h>

+#include "src/objects-inl.h"

+#include "src/parsing/scanner-character-streams.h"

+#include "src/parsing/scanner.h"

+namespace v8 {

+namespace internal {

+AsmJsLexer::AsmJsLexer(Isolate* isolate, Handle<Script> script, int start,

+ int end)

+ : script_(script),

+ source_(String::cast(script->source()), isolate),

+ stream_(ScannerStream::For(source_, start, end)),

+ token_(0),

+ last_token_(0),

+ next_token_(0),

+ rewind_(false),

+ local_(false),

+ global_count_(0),

+ double_value_(0.0),

+ unsigned_value_(0),

+ preceeded_by_newline_(false) {

+#define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;

+ STDLIB_MATH_FUNCTION_LIST(V)

+ STDLIB_ARRAY_TYPE_LIST(V)

+#undef V

+#define V(name) property_names_[#name] = kToken_##name;

+ STDLIB_MATH_VALUE_LIST(V)

+ STDLIB_OTHER_LIST(V)

+#undef V

+#define V(name) global_names_[#name] = kToken_##name;

+ KEYWORD_NAME_LIST(V)

+#undef V

+ Next();

+void AsmJsLexer::Next() {

+ if (rewind_) {

+ last_token_ = token_;

+ token_ = next_token_;

+ next_token_ = 0;

+ rewind_ = false;

+ return;

+ }

+ if (token_ == kEndOfInput || token_ == kParseError) {

+ return;

+ }

+#if 0

vogelheim 2017/03/14 13:36:37 Please don't do this.

bradn 2017/03/15 07:53:03 Changed to a trace flag. This ends up being useful

+ // Uncomment for debug raw token stream.

+ if (Token() != 0) {

+ if (Token() == kDouble) {

+ fprintf(stderr, "%lf ", AsDouble());

+ } else if (Token() == kUnsigned) {

+ fprintf(stderr, "%lu ", AsUnsigned());

+ } else {

+ fprintf(stderr, "%s ", Name(Token()));

+ }

+#endif

+ preceeded_by_newline_ = false;

+ last_token_ = token_;

+ for (;;) {

+ token_t ch = stream_->Advance();

vogelheim 2017/03/14 13:36:37 (Here & below.) Using token_t for individual chara

bradn 2017/03/15 07:53:02 Switched all of these inside to uc32.

+ if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {

Karl 2017/03/14 18:00:47 Would a switch statement be cleaner here?

bradn 2017/03/15 07:53:02 Done.

+ // Skip whitespace.

+ if (ch == '\n') {

+ preceeded_by_newline_ = true;

+ }

+ continue;

+ } else if (ch == kEndOfInput) {

marja 2017/03/14 11:11:47 (general comment) The structure is getting a bit c

bradn 2017/03/15 07:53:03 Decomposed into more functions, hope that helps.

+ token_ = kEndOfInput;

+ break;

marja 2017/03/14 11:11:47 Why break, why not return? (Now it's not trivial t

bradn 2017/03/15 07:53:03 Redone in more functions, avoids the break.

+ } else if (ch < 32 || ch >= 127) {

Karl 2017/03/14 18:00:47 If you use a switch statement, either explicitly e

bradn 2017/03/15 07:53:02 Done.

+ // Disallow non-ascii for now.

+ token_ = kParseError;

+ break;

+ } else if (ch == '\'' || ch == '\"') {

+ // Only string allowed is 'use asm' / "use asm".

+ const char* use_asm = "use asm";

Karl 2017/03/14 18:00:47 Should this be a constexpr?

bradn 2017/03/15 07:53:03 Changed round.

+ const char* pos = use_asm;

+ while (*pos) {

+ token_t och = stream_->Advance();

vogelheim 2017/03/14 13:36:37 och ? [here & below]

bradn 2017/03/15 07:53:02 Renamed and refactored.

+ if (och != *pos) {

+ token_ = kParseError;

+ return;

+ }

+ ++pos;

+ }

+ token_t och = stream_->Advance();

+ if (och != ch) {

marja 2017/03/14 11:11:47 Lost here... what's this? Ahh, it's checking the

bradn 2017/03/15 07:53:02 renamed variable to highlight that.

+ token_ = kParseError;

+ break;

+ }

+ token_ = kToken_UseAsm;

+ break;

+ } else if (ch == '/') {

+ ch = stream_->Advance();

+ if (ch == '/') {

+ for (;;) {

+ ch = stream_->Advance();

+ if (ch == '\n' || ch == kEndOfInput) {

+ break;

+ }

+ continue;

+ } else if (ch == '*') {

+ for (;;) {

+ ch = stream_->Advance();

+ if (ch == '*') {

+ ch = stream_->Advance();

+ if (ch == '/') {

vogelheim 2017/03/14 13:36:37 +1 to Marja's comments. Also, would this work on

bradn 2017/03/15 07:53:02 Yeah, this was wrong. Factor to function and fixed

+ if (ch == '*') {

marja 2017/03/14 11:11:47 if ch == '/' on the line above, it cannot be '*' h

bradn 2017/03/15 07:53:02 Oops, fixed.

+ stream_->Back();

+ }

+ break;

marja 2017/03/14 11:11:47 I'm lost here anyway, what's this block, what are

bradn 2017/03/15 07:53:02 This was meant to back up if you saw a * inside a

+ }

+ } else if (ch == kEndOfInput) {

+ break;

+ }

+ continue;

+ } else {

marja 2017/03/14 11:11:47 No idea here anymore which if this else associates

bradn 2017/03/15 07:53:03 Restructured, should be more clear now.

+ stream_->Back();

+ token_ = '/';

+ break;

+ }

+ } else if (ch == '<' || ch == '>' || ch == '=' || ch == '!') {

+ token_t och = stream_->Advance();

+ if (och == '=') {

+ if (ch == '<') {

+ token_ = kToken_LE;

+ break;

+ } else if (ch == '>') {

+ token_ = kToken_GE;

+ break;

+ } else if (ch == '=') {

+ token_ = kToken_EQ;

+ break;

+ } else if (ch == '!') {

+ token_ = kToken_NE;

+ break;

+ } else {

+ UNREACHABLE();

+ }

+ } else if (ch == '<' && och == '<') {

+ token_ = kToken_SHL;

+ break;

+ } else if (ch == '>' && och == '>') {

+ token_t ooch = stream_->Advance();

vogelheim 2017/03/14 13:36:37 ooch ?

bradn 2017/03/15 07:53:02 Hah, terrible name, sorry, dropped variable comple

+ if (ooch == '>') {

+ token_ = kToken_SHR;

+ } else {

+ token_ = kToken_SAR;

+ stream_->Back();

+ }

+ break;

+ } else {

+ stream_->Back();

+ token_ = ch;

+ break;

+ }

+ } else if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||

+ ch == '_' || ch == '$') {

vogelheim 2017/03/14 13:36:37 Could you introduce helper functions for the chara

bradn 2017/03/15 07:53:02 Done.

+ name_ = ch;

+ for (;;) {

+ ch = stream_->Advance();

+ if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' ||

+ ch == '$' || (ch >= '0' && ch <= '9')) {

+ name_ += ch;

+ } else {

+ break;

marja 2017/03/14 11:11:47 Why not while(ch >= ...) { name_ += ch; ch =

bradn 2017/03/15 07:53:03 Done.

+ }

+ stream_->Back();

+ if (last_token_ == '.') {

+ auto i = property_names_.find(name_);

+ if (i != property_names_.end()) {

+ token_ = i->second;

+ break;

marja 2017/03/14 11:11:47 E.g,. here it would be less confusing to use retur

bradn 2017/03/15 07:53:03 Done.

+ }

+ } else {

+ {

+ auto i = local_names_.find(name_);

+ if (i != local_names_.end()) {

+ token_ = i->second;

+ break;

+ }

+ if (!local_) {

marja 2017/03/14 11:11:47 What's local_?

bradn 2017/03/15 07:53:03 Renamed.

+ auto i = global_names_.find(name_);

+ if (i != global_names_.end()) {

+ token_ = i->second;

+ break;

+ }

+ if (last_token_ == '.') {

+ // TODO(bradnelson): Assert no overflow.

+ token_ = kGlobalsStart + global_count_++;

+ property_names_[name_] = token_;

+ } else if (local_) {

+ // TODO(bradnelson): Assert no overflow.

+ token_ = kLocalsStart - local_names_.size();

+ local_names_[name_] = token_;

+ } else {

+ // TODO(bradnelson): Assert no overflow.

+ token_ = kGlobalsStart + global_count_++;

+ global_names_[name_] = token_;

+ }

+ break;

+ } else if (ch == '.' || (ch >= '0' && ch <= '9')) {

+ bool has_dot = ch == '.';

+ name_ = ch;

+ for (;;) {

+ ch = stream_->Advance();

+ if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||

marja 2017/03/14 11:11:47 Would it be feasible to have a helper function for

bradn 2017/03/15 07:53:03 I've added a TODO to do this. Might require some c

+ (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' ||

+ ((ch == '-' || ch == '+') && (name_[name_.size() - 1] == 'e' ||

+ name_[name_.size() - 1] == 'E'))) {

+ // TODO(bradnelson): Test weird cases ending in -.

+ if (ch == '.') {

+ has_dot = true;

+ }

+ name_ += ch;

+ } else {

+ break;

+ }

+ stream_->Back();

+ // Special case the most common number.

+ if (name_ == "0") {

+ unsigned_value_ = 0;

+ token_ = kUnsigned;

+ break;

+ }

+ // Pick out dot.

+ if (name_ == ".") {

+ token_ = '.';

+ break;

+ }

+ // Decode numbers.

+ char* end;

+ if (has_dot) {

+ double_value_ = strtod(name_.c_str(), &end);

vogelheim 2017/03/14 13:36:37 strtod may depend on the current locale. Are you r

bradn 2017/03/15 07:53:03 Yeah, it's a fair point these probably aren't idea

vogelheim 2017/03/15 12:07:40 Note that this is a correctness, not a performance

vogelheim 2017/03/15 12:10:11 My gut feeling is that parser & scanner are tightl

My gut feeling is that parser & scanner are tightly coupled, and we should keep this work + existing parser either entirely separate or entirely integrated. Performance considerations: I suspect that for the asm.js case, both scanner & parser would pretty much always stay in the 'fast paths', and a re-write of those parts won't actually buy us that much overall. Note that what we call the 'parser' is really the parser + AST builder, and what we call the 'preparser' is the full parser + an 'empty backend'. The AST building is what I think you're trying to get rid of, and I indeed can't see any value in it for the asm.js-to-wasm conversion. I don't think the parsing part of the parser is actually that slow: A rough performance estimate of preparser-vs-parser is 4:1, meaning that ~75% of "parse" time is actually spent in the AST. If you were to implement a class AsmJsParser : public ParserBase<AsmJSParser> {...} similar to the PreParser, I'd expect the resulting parsing speed to be roughly comparable to the pre-parser, for an approximate 4x performance win. In fairness: ParserBase wasn't really meant to be used that way, so 1, it's a huge amount of boilerplate to re-use it and 2, there'd almost certainly be some corner cases we didn't think of that and that you'd stumble over and that would require additional work to resolve. Also, 3, I think you're already quite heavily invested into the idea of having a separate parser, so I'm not sure this is really up for consideration. One more thing: Notice that the current parser/lexer already performs similar tricks to yours, but somewhat differently: There are all-ASCII fast paths. Identifiers are being de-duped & internalized in the AstValueFactory, rather than in the Scanner itself, but in both cases you end up with one insertion into a table and then just comparing the int/pointer to do equality comparison.

+ token_ = kDouble;

+ } else {

+ if (name_.size() > 2 && name_[0] == '0' && name_[1] == 'x') {

+ unsigned_value_ = strtoul(name_.c_str() + 2, &end, 16);

+ } else if (name_.size() > 1 && name_[0] == '0') {

+ unsigned_value_ = strtoul(name_.c_str() + 1, &end, 8);

+ } else {

+ double_value_ = strtod(name_.c_str(), &end);

+ unsigned_value_ = static_cast<uint32_t>(double_value_);

marja 2017/03/14 11:11:47 Why strtod if it's guaranteed to be an integer (no

bradn 2017/03/15 07:53:02 Asm.js uses 1e2 for 100 (as an integer :-) Added a

+ }

+ token_ = kUnsigned;

+ }

+ if (end != name_.c_str() + name_.size()) {

vogelheim 2017/03/14 13:36:37 I'm confused. When does this happen?

bradn 2017/03/15 07:53:03 When a number failed to parse, added a comment + e

+ // Handle mistaken parse of '.' as number.

marja 2017/03/14 11:11:47 How does this relate to the "Pick out dot" above?

bradn 2017/03/15 07:53:02 Reworded. The idea here is that if the number pars

+ if (name_[0] == '.') {

+ for (size_t k = 1; k < name_.size(); ++k) {

+ stream_->Back();

+ }

+ token_ = '.';

+ break;

+ }

+ token_ = kParseError;

+ return;

+ }

+ break;

+ } else {

+ token_ = ch;

+ break;

+ }

+void AsmJsLexer::Rewind() {

+ DCHECK(!rewind_);

+ next_token_ = token_;

+ token_ = last_token_;

+ last_token_ = 0;

+ rewind_ = true;

vogelheim 2017/03/14 13:36:37 This doesn't update name_. Is this intentional?

vogelheim 2017/03/14 13:36:37 This doesn't update preceeded_by_newline_. Is this

bradn 2017/03/15 07:53:02 Clearing it for good measure here (didn't want to

bradn 2017/03/15 07:53:03 Clobbering for good measure here, also commented a

+void AsmJsLexer::ResetLocals() { local_names_.clear(); }

+const char* AsmJsLexer::Name(token_t token) const {

+ // TODO(bradnelson): Make thread safe (and maybe debug only).

+ if (token >= 32 && token < 127) {

+ static char chname[2];

+ chname[0] = static_cast<char>(token);

vogelheim 2017/03/14 13:36:37 chname[1] = '\0' ??

bradn 2017/03/15 07:53:02 Done. Whoops.

+ return chname;

+ }

+ for (auto i = local_names_.begin(); i != local_names_.end(); ++i) {

vogelheim 2017/03/14 13:36:37 style nitpick: I'd use the for(auto& i : local_nam

bradn 2017/03/15 07:53:03 Done.

+ if (i->second == token) {

+ return i->first.c_str();

+ }

+ for (auto i = global_names_.begin(); i != global_names_.end(); ++i) {

+ if (i->second == token) {

+ return i->first.c_str();

+ }

+ for (auto i = property_names_.begin(); i != property_names_.end(); ++i) {

+ if (i->second == token) {

+ return i->first.c_str();

+ }

+ switch (token) {

+#define V(rawname, name) \

+ case kToken_##name: \

+ return rawname;

+ LONG_SYMBOL_NAME_LIST(V)

+#undef V

+ default:

+ break;

+ }

+ if (token == kUnsigned) {

+ return "{unsigned value}";

+ } else if (token == kDouble) {

+ return "{double value}";

+ } else if (token == kParseError) {

+ return "{parse error}";

+ } else if (token == kEndOfInput) {

+ return "{end of input}";

+ }

+ UNREACHABLE();

+ return "{unreachable}";

+int AsmJsLexer::position() const { return static_cast<int>(stream_->pos()); }

+void AsmJsLexer::Seek(int pos) { stream_->Seek(pos); }

+} // namespace internal

+} // namespace v8

« src/asmjs/asm-lexer.h ('K') | « src/asmjs/asm-lexer.h ('k') | src/asmjs/asm-names.h » ('j') | src/asmjs/asm-names.h » ('J')