src/asmjs/asm-lexer.cc - Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer.

Unified Diff: src/asmjs/asm-lexer.cc

Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer. (Closed)

Patch Set: check Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/asmjs/asm-lexer.cc

diff --git a/src/asmjs/asm-lexer.cc b/src/asmjs/asm-lexer.cc

new file mode 100644

index 0000000000000000000000000000000000000000..3e49edd4bd71a33207f1d5f7e94e821e409dd7b2

--- /dev/null

+++ b/src/asmjs/asm-lexer.cc

@@ -0,0 +1,420 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "src/asmjs/asm-lexer.h"

+#include <stdlib.h>

+#include "src/objects.h"

marja 2017/03/15 12:34:49 Why is objects.h needed?

bradn 2017/03/16 00:21:46 There was a Handle<String> used in scanner.h inlin

+#include "src/parsing/scanner-character-streams.h"

vogelheim 2017/03/15 12:07:41 I don't see scanner-character-streams.h being used

bradn 2017/03/16 00:21:47 Dropped.

+#include "src/parsing/scanner.h"

marja 2017/03/15 12:34:49 Hmm, you're still including scanner.h even though

bradn 2017/03/16 00:21:47 That was in the header. This is needed here becaus

marja 2017/03/16 17:05:33 My orig. comment suggested moving the streams out

+namespace v8 {

+namespace internal {

+namespace {
+
+// Upper bound on how many identifiers the lexer hands out token ids for.
+// Cap number of identifiers to ensure we can assign both global and
+// local ones a token id in the range of an int32_t.
+// (No 'static' needed: the unnamed namespace already gives internal linkage.)
+const int kMaxIdentifierCount = 0xf000000;
+
+}  // namespace

+// Creates a lexer with no input stream attached, all token state zeroed, and
+// the name tables pre-populated with every name known up front.
+AsmJsLexer::AsmJsLexer()
+    : token_(0),
+      preceding_token_(0),
+      next_token_(0),
+      rewind_(false),
+      in_local_scope_(false),
+      global_count_(0),
+      double_value_(0.0),
+      unsigned_value_(0),
+      preceded_by_newline_(false) {
+  // Stdlib math functions and array types: only the name (first list
+  // argument) is used here, the remaining three arguments are ignored.
+#define REGISTER_PROPERTY(name, _unused1, _unused2, _unused3) \
+  property_names_[#name] = kToken_##name;
+  STDLIB_MATH_FUNCTION_LIST(REGISTER_PROPERTY)
+  STDLIB_ARRAY_TYPE_LIST(REGISTER_PROPERTY)
+#undef REGISTER_PROPERTY
+
+  // Stdlib math values and the remaining stdlib names, also registered as
+  // property names.
+#define REGISTER_PROPERTY(name) property_names_[#name] = kToken_##name;
+  STDLIB_MATH_VALUE_LIST(REGISTER_PROPERTY)
+  STDLIB_OTHER_LIST(REGISTER_PROPERTY)
+#undef REGISTER_PROPERTY
+
+  // Keywords are registered in the global name table.
+#define REGISTER_GLOBAL(name) global_names_[#name] = kToken_##name;
+  KEYWORD_NAME_LIST(REGISTER_GLOBAL)
+#undef REGISTER_GLOBAL
+}

+// Takes ownership of |stream| as the lexer's input, then scans the first
+// token right away via Next().
+void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) {
+  stream_ = std::move(stream);
+
+  // Prime the lexer with the first token of the new input.
+  Next();
+}

+void AsmJsLexer::Next() {  // Advances the lexer by one token; the result is left in token_.

vogelheim 2017/03/15 12:07:40 I find this method much nicer to read now. Thanks.

bradn 2017/03/16 00:21:47 :-)

+ if (rewind_) {  // A Rewind() is pending: replay the stashed token instead of reading the stream.

+ preceding_token_ = token_;

+ token_ = next_token_;

+ next_token_ = 0;

+ rewind_ = false;

+ return;

+ }

+ if (token_ == kEndOfInput || token_ == kParseError) {  // Both states are sticky: nothing further is scanned.

+ return;

+ }

+#if DEBUG

+ if (FLAG_trace_asm_lexer) {  // Debug-only trace of Token() before it is replaced below.

+ if (Token() != 0) {

vogelheim 2017/03/15 12:07:41 nitpick: No real problem here, but this logic is a

bradn 2017/03/16 00:21:47 Done.

+ if (Token() == kDouble) {

+ PrintF("%lf ", AsDouble());

+ } else if (Token() == kUnsigned) {

+ PrintF("%" PRIu64 " ", AsUnsigned());

+ } else {

+ std::string name = Name(Token());

+ PrintF("%s ", name.c_str());

+ }

+#endif

+ preceded_by_newline_ = false;

+ preceding_token_ = token_;

+ for (;;) {  // Runs until a case returns; whitespace and comments 'break' out of the switch and iterate again.

+ uc32 ch = stream_->Advance();

+ switch (ch) {

+ case ' ':

+ case '\t':

+ case '\n':

+ case '\r':

+ // Ignore whitespace, track when we've passed a newline for optional

+ // semicolon support.

+ if (ch == '\n') {

vogelheim 2017/03/15 12:07:41 nitpick: This is weird. If you have a switch-case

bradn 2017/03/16 00:21:46 Hah, yeah good point (missed that in the refactor)

+ preceded_by_newline_ = true;

+ }

+ break;

+ case kEndOfInput:

+ token_ = kEndOfInput;

+ return;

+ case '\'':

+ case '"':

+ ConsumeString(ch);  // The opening quote is passed along so the matching closing quote can be required.

+ return;

+ case '/':

+ ch = stream_->Advance();

+ if (ch == '/') {  // "//": comment running to the end of the line.

+ ConsumeCComment();

+ } else if (ch == '*') {  // "/*": comment running to the matching "*/".

+ ConsumeCPPComment();

+ } else {

+ stream_->Back();  // Not a comment: un-read the lookahead and emit a plain '/' token.

+ token_ = '/';

+ return;

+ }

+ // Breaks out of switch, but loops again (i.e. the case when we parsed

+ // a comment, but need to continue to look for the next token).

+ break;

+ case '<':

+ case '>':

+ case '=':

+ case '!':

+ ConsumeCompareOrShift(ch);

+ return;

+ default:

+ if (IsIdentifierStart(ch)) {

+ ConsumeIdentifier(ch);

+ } else if (IsNumberStart(ch)) {  // Also taken for a leading '.', which ConsumeNumber may turn into a '.' token.

+ ConsumeNumber(ch);

+ } else if (ch >= 32 && ch < 127) {

vogelheim 2017/03/15 12:07:41 [Not sure this is an issue, but... ] How many of

bradn 2017/03/16 00:21:47 Listed out the single char ones.

+ // Use fixed token IDs for ASCII.

+ token_ = ch;

+ } else {

+ // TODO(bradnelson): Support unicode (probably via UnicodeCache).

+ token_ = kParseError;

+ }

+ return;

+ }

+// Steps back by exactly one token. The current token is stashed in
+// next_token_ so the following Next() call can return it without reading the
+// stream, and the preceding token becomes the current one again. Rewinding
+// twice in a row is not supported (see the DCHECK).
+void AsmJsLexer::Rewind() {
+  DCHECK(!rewind_);
+
+  // Shift the token window back by one slot.
+  next_token_ = token_;
+  token_ = preceding_token_;
+  preceding_token_ = 0;
+
+  // Reset the per-token side state and mark the rewind as pending.
+  identifier_string_.clear();
+  preceded_by_newline_ = false;
+  rewind_ = true;
+}

+// Forgets every local identifier registered so far by emptying local_names_.
+void AsmJsLexer::ResetLocals() {
+  local_names_.clear();
+}

+#if DEBUG

+// Only used for debugging.

+std::string AsmJsLexer::Name(token_t token) const {  // Maps a token id back to a printable name.

+ // TODO(bradnelson): Make thread safe.

+ if (token >= 32 && token < 127) {  // Tokens in this range are the character code of a single ASCII character.

+ return std::string(1, static_cast<char>(token));

+ }

+ for (auto& i : local_names_) {  // Reverse lookup by linear scan: the tables are keyed by name, not by token.

+ if (i.second == token) {

+ return i.first.c_str();

Karl 2017/03/15 15:04:13 Why not just: return i.first;

bradn 2017/03/16 00:21:46 Done.

+ }

+ for (auto& i : global_names_) {

+ if (i.second == token) {

+ return i.first.c_str();

Karl 2017/03/15 15:04:13 Same here.

bradn 2017/03/16 00:21:47 Done.

+ }

+ for (auto& i : property_names_) {

+ if (i.second == token) {

+ return i.first.c_str();

Karl 2017/03/15 15:04:13 Same here.

bradn 2017/03/16 00:21:46 Done.

+ }

+ switch (token) {  // One case per LONG_SYMBOL_NAME_LIST entry, each returning that entry's raw spelling.

+#define V(rawname, name) \

+ case kToken_##name: \

+ return rawname;

+ LONG_SYMBOL_NAME_LIST(V)

+#undef V

+ default:

+ break;

+ }

+ if (token == kUnsigned) {  // Remaining special tokens get fixed placeholder names.

vogelheim 2017/03/15 12:07:40 Why not handle all of these inside the switch righ

bradn 2017/03/16 00:21:46 Done.

+ return "{unsigned value}";

+ } else if (token == kDouble) {

+ return "{double value}";

+ } else if (token == kParseError) {

+ return "{parse error}";

+ } else if (token == kEndOfInput) {

+ return "{end of input}";

+ }

+ UNREACHABLE();  // Reached only for a token id that none of the lookups above recognised.

+ return "{unreachable}";

+#endif

+// Returns the current position of the underlying character stream, narrowed
+// to int.
+//
+// Must not be called while a Rewind() is pending: Rewind() only swaps the
+// token fields and leaves the stream where it was, so the stream position
+// would belong to the stashed token rather than to the current one.
+int AsmJsLexer::GetPosition() const {
+  DCHECK(!rewind_);
+  return static_cast<int>(stream_->pos());
+}

vogelheim 2017/03/15 12:07:40 Does this work if rewind_ is set? If not, maybe ad

bradn 2017/03/16 00:21:46 Done.

+// Repositions the input stream at |pos|, discards all token state (including
+// any pending rewind) and scans the token found at the new position.
+void AsmJsLexer::Seek(int pos) {
+  // Drop everything remembered about previously scanned tokens.
+  rewind_ = false;
+  preceding_token_ = token_ = next_token_ = 0;
+
+  // Move the stream and load the first token from there.
+  stream_->Seek(pos);
+  Next();
+}

+void AsmJsLexer::ConsumeIdentifier(uc32 ch) {  // Scans an identifier starting with |ch| and stores its token id in token_.

+ // Consume characters while still part of the identifier.

+ identifier_string_ = "";

vogelheim 2017/03/15 12:07:41 identifier_string_.clear(); (STL is bizarre, but.

bradn 2017/03/16 00:21:46 Yep. Done.

+ while (IsIdentifierPart(ch)) {

+ identifier_string_ += ch;

+ ch = stream_->Advance();

+ }

+ // Go back one for next time.

+ stream_->Back();

+ // Decode what the identifier means.

+ if (preceding_token_ == '.') {  // Directly after a '.', only the property name table is consulted.

+ auto i = property_names_.find(identifier_string_);

+ if (i != property_names_.end()) {

+ token_ = i->second;

+ return;

+ }

+ } else {

+ {

+ auto i = local_names_.find(identifier_string_);  // Local names are tried first.

+ if (i != local_names_.end()) {

+ token_ = i->second;

+ return;

+ }

+ if (!in_local_scope_) {  // The global table is only searched while not in local scope.

+ auto i = global_names_.find(identifier_string_);

+ if (i != global_names_.end()) {

+ token_ = i->second;

+ return;

+ }

+ if (preceding_token_ == '.') {  // Name not seen before: allocate a fresh token id in the matching table.

+ CHECK(global_count_ < kMaxIdentifierCount);

+ token_ = kGlobalsStart + global_count_++;  // New property names draw from the same counter as globals.

+ property_names_[identifier_string_] = token_;

+ } else if (in_local_scope_) {

+ CHECK(local_names_.size() < kMaxIdentifierCount);

+ token_ = kLocalsStart - static_cast<token_t>(local_names_.size());  // Local ids count downwards from kLocalsStart.

+ local_names_[identifier_string_] = token_;

+ } else {

+ CHECK(global_count_ < kMaxIdentifierCount);

+ token_ = kGlobalsStart + global_count_++;  // Global ids count upwards from kGlobalsStart.

+ global_names_[identifier_string_] = token_;

+ }

+// Scans a number literal (or a lone '.') whose first character |ch| has
+// already been consumed by the caller. On return token_ is one of:
+//   kUnsigned   - value stored in unsigned_value_,
+//   kDouble     - value stored in double_value_,
+//   '.'         - the input was a dot that does not start a number,
+//   kParseError - the characters do not form a valid literal, or a literal
+//                 without a dot does not fit into a uint32_t.
+void AsmJsLexer::ConsumeNumber(uc32 ch) {
+  // Largest value that can be converted to uint32_t without undefined
+  // behaviour (UINT32_MAX, exactly representable as a double).
+  const double kMaxUint32AsDouble = 4294967295.0;
+
+  std::string number;
+  number = ch;
+  bool has_dot = ch == '.';
+  // Greedily collect every character that could belong to a decimal, hex,
+  // octal or exponent-form literal; the strto* calls below decide whether the
+  // collected text really is one.
+  for (;;) {
+    ch = stream_->Advance();
+    if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
+        (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' ||
+        ((ch == '-' || ch == '+') && (number[number.size() - 1] == 'e' ||
+                                      number[number.size() - 1] == 'E'))) {
+      // TODO(bradnelson): Test weird cases ending in -.
+      if (ch == '.') {
+        has_dot = true;
+      }
+      number += ch;
+    } else {
+      break;
+    }
+  }
+  // Un-read the character that ended the literal.
+  stream_->Back();
+  // Special case the most common number.
+  if (number == "0") {
+    unsigned_value_ = 0;
+    token_ = kUnsigned;
+    return;
+  }
+  // Pick out dot.
+  if (number == ".") {
+    token_ = '.';
+    return;
+  }
+  // Decode numbers.
+  // TODO(bradnelson): Replace strto* with shared code with scanner.cc
+  char* end;
+  if (has_dot) {
+    double_value_ = strtod(number.c_str(), &end);
+    token_ = kDouble;
+  } else {
+    if (number.size() > 2 && number[0] == '0' && number[1] == 'x') {
+      // Decode 0x* as hex.
+      unsigned_value_ = strtoul(number.c_str() + 2, &end, 16);
+    } else if (number.size() > 1 && number[0] == '0') {
+      // Decode 0* as octal.
+      unsigned_value_ = strtoul(number.c_str() + 1, &end, 8);
+    } else {
+      // Decode the rest as double.
+      // This can come up in asm.js as for example 1e2 is used to encode 100.
+      double_value_ = strtod(number.c_str(), &end);
+      // Converting a double that is out of range for uint32_t (e.g. 1e100 or
+      // 4294967296) is undefined behaviour, so reject it instead. The value
+      // cannot be negative or NaN here: the literal has no leading sign and
+      // the character filter above admits no "nan"/"inf" spellings.
+      if (double_value_ > kMaxUint32AsDouble) {
+        token_ = kParseError;
+        return;
+      }
+      unsigned_value_ = static_cast<uint32_t>(double_value_);
+    }
+    token_ = kUnsigned;
+  }
+  // Check if string to number conversion didn't consume all the characters.
+  // This happens if the character filter let through something invalid
+  // like: 0123ef for example.
+  // TODO(bradnelson): Check if this happens often enough to be a perf problem.
+  if (end != number.c_str() + number.size()) {
+    // If things didn't parse fully, but start with a '.', back out the other
+    // characters and emit the '.' token.
+    if (number[0] == '.') {
+      for (size_t k = 1; k < number.size(); ++k) {
+        stream_->Back();
+      }
+      token_ = '.';
+      return;
+    }
+    // Anything else that doesn't parse is an error.
+    token_ = kParseError;
+    return;
+  }
+}

+// Skips the rest of a "//" comment; the caller has already consumed the
+// leading "//". (Despite its name, this is the handler Next() invokes for
+// the single-line comment form.) Stops after the terminating newline or at
+// end of input.
+void AsmJsLexer::ConsumeCComment() {
+  for (;;) {
+    uc32 ch = stream_->Advance();
+    if (ch == '\n') {
+      // The newline that ends the comment is consumed here, so Next()'s
+      // whitespace handling never sees it. Record it now; otherwise a
+      // statement followed by a trailing comment would not count as being
+      // followed by a newline for optional semicolon support.
+      preceded_by_newline_ = true;
+      return;
+    }
+    if (ch == kEndOfInput) {
+      return;
+    }
+  }
+}

+// Skips the rest of a "/* ... */" comment; the caller has already consumed
+// the leading "/*". (Despite its name, this is the handler Next() invokes for
+// the block comment form.) Stops after the closing "*/" or at end of input.
+// NOTE(review): an unterminated comment is not reported as an error here;
+// signalling it would need a change to the caller as well.
+void AsmJsLexer::ConsumeCPPComment() {
+  for (;;) {
+    uc32 ch = stream_->Advance();
+    if (ch == '*') {

vogelheim 2017/03/15 12:07:41 Your choice, but I think this if-branch would be a

bradn 2017/03/16 00:21:46 Ah, yeah, that's better. Done.

+      ch = stream_->Advance();
+      if (ch == '/') {
+        break;
+      }
+      if (ch == '*') {
+        // Could be the '*' of the closing "*/": look at it again next round.
+        stream_->Back();
+      }
+    }
+    if (ch == '\n') {
+      // A block comment that spans a line break separates tokens like a
+      // newline does, so record it for optional semicolon support.
+      preceded_by_newline_ = true;
+    } else if (ch == kEndOfInput) {

vogelheim 2017/03/15 12:07:40 I think this potentially swallows a syntax error w

bradn 2017/03/16 00:21:46 Ah, yes. Fixed and added a test.

+      break;
+    }
+  }
+}

+// Scans a string literal whose opening |quote| character has already been
+// consumed. The body must be exactly "use asm" followed by the same quote
+// character; token_ then becomes kToken_UseAsm. Anything else yields
+// kParseError.
+void AsmJsLexer::ConsumeString(uc32 quote) {
+  // Only string allowed is 'use asm' / "use asm".
+  const char* cursor = "use asm";
+  while (*cursor != '\0') {
+    if (stream_->Advance() != *cursor) {
+      token_ = kParseError;
+      return;
+    }
+    ++cursor;
+  }
+
+  // The literal has to be closed by the quote character that opened it.
+  if (stream_->Advance() == quote) {
+    token_ = kToken_UseAsm;
+  } else {
+    token_ = kParseError;
+  }
+}

+// Scans an operator starting with |ch|, which is one of '<', '>', '=' or '!'
+// and has already been consumed. Recognises "<=", ">=", "==", "!=", "<<",
+// ">>" and ">>>"; otherwise |ch| itself becomes the token and the lookahead
+// character is put back.
+void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) {
+  const uc32 lookahead = stream_->Advance();
+
+  // Two-character comparison operators: <ch>=.
+  if (lookahead == '=') {
+    if (ch == '<') {
+      token_ = kToken_LE;
+    } else if (ch == '>') {
+      token_ = kToken_GE;
+    } else if (ch == '=') {
+      token_ = kToken_EQ;
+    } else if (ch == '!') {
+      token_ = kToken_NE;
+    } else {
+      UNREACHABLE();
+    }
+    return;
+  }
+
+  // Left shift: <<.
+  if (ch == '<' && lookahead == '<') {
+    token_ = kToken_SHL;
+    return;
+  }
+
+  // Right shifts: >>> (kToken_SHR) and >> (kToken_SAR).
+  if (ch == '>' && lookahead == '>') {
+    if (stream_->Advance() == '>') {
+      token_ = kToken_SHR;
+    } else {
+      stream_->Back();
+      token_ = kToken_SAR;
+    }
+    return;
+  }
+
+  // Plain single-character operator.
+  stream_->Back();
+  token_ = ch;
+}

+// Returns true if |ch| may begin an identifier: an ASCII letter, '_' or '$'.
+bool AsmJsLexer::IsIdentifierStart(uc32 ch) {
+  if (ch == '_' || ch == '$') {
+    return true;
+  }
+  const bool is_upper = ch >= 'A' && ch <= 'Z';
+  const bool is_lower = ch >= 'a' && ch <= 'z';
+  return is_upper || is_lower;
+}

+// Returns true if |ch| may appear inside an identifier: anything that may
+// start one, plus the ASCII digits.
+bool AsmJsLexer::IsIdentifierPart(uc32 ch) {
+  const bool is_digit = ch >= '0' && ch <= '9';
+  return is_digit || IsIdentifierStart(ch);
+}

+// Returns true if |ch| may begin a number literal: an ASCII digit or '.'.
+bool AsmJsLexer::IsNumberStart(uc32 ch) {
+  const bool is_digit = ch >= '0' && ch <= '9';
+  return is_digit || ch == '.';
+}

+} // namespace internal

+} // namespace v8

« src/asmjs/asm-lexer.h ('K') | « src/asmjs/asm-lexer.h ('k') | src/asmjs/asm-names.h » ('j') | test/unittests/unittests.gyp » ('J')