Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(291)

Unified Diff: src/asmjs/asm-lexer.cc

Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer. (Closed)
Patch Set: check Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/asmjs/asm-lexer.cc
diff --git a/src/asmjs/asm-lexer.cc b/src/asmjs/asm-lexer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e49edd4bd71a33207f1d5f7e94e821e409dd7b2
--- /dev/null
+++ b/src/asmjs/asm-lexer.cc
@@ -0,0 +1,420 @@
+// Copyright 2017 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/asmjs/asm-lexer.h"
+
+#include <stdlib.h>
+
+#include "src/objects.h"
marja 2017/03/15 12:34:49 Why is objects.h needed?
bradn 2017/03/16 00:21:46 There was a Handle<String> used in scanner.h inlin
+#include "src/parsing/scanner-character-streams.h"
vogelheim 2017/03/15 12:07:41 I don't see scanner-character-streams.h being used
bradn 2017/03/16 00:21:47 Dropped.
+#include "src/parsing/scanner.h"
marja 2017/03/15 12:34:49 Hmm, you're still including scanner.h even though
bradn 2017/03/16 00:21:47 That was in the header. This is needed here becaus
marja 2017/03/16 17:05:33 My orig. comment suggested moving the streams out
+
+namespace v8 {
+namespace internal {
+
+namespace {
+// Cap number of identifiers to ensure we can assign both global and
+// local ones a token id in the range of an int32_t.
+// (No `static` needed: the anonymous namespace already gives this constant
+// internal linkage.)
+const int kMaxIdentifierCount = 0xf000000;
+}  // namespace
+
+// Creates a lexer with no stream attached and seeds the name tables: the
+// STDLIB_* lists populate property_names_ (consulted for identifiers that
+// follow a '.'), and KEYWORD_NAME_LIST populates global_names_.
+AsmJsLexer::AsmJsLexer()
+    : token_(0),
+      preceding_token_(0),
+      next_token_(0),
+      rewind_(false),
+      in_local_scope_(false),
+      global_count_(0),
+      double_value_(0.0),
+      unsigned_value_(0),
+      preceded_by_newline_(false) {
+  // These two lists pass three extra columns per entry that are not needed
+  // for building the name table.
+#define REGISTER_PROPERTY_WITH_EXTRAS(name, _unused1, _unused2, _unused3) \
+  property_names_[#name] = kToken_##name;
+  STDLIB_MATH_FUNCTION_LIST(REGISTER_PROPERTY_WITH_EXTRAS)
+  STDLIB_ARRAY_TYPE_LIST(REGISTER_PROPERTY_WITH_EXTRAS)
+#undef REGISTER_PROPERTY_WITH_EXTRAS
+
+  // Single-column property lists.
+#define REGISTER_PROPERTY(name) property_names_[#name] = kToken_##name;
+  STDLIB_MATH_VALUE_LIST(REGISTER_PROPERTY)
+  STDLIB_OTHER_LIST(REGISTER_PROPERTY)
+#undef REGISTER_PROPERTY
+
+  // Keywords are registered as pre-defined global names.
+#define REGISTER_KEYWORD(name) global_names_[#name] = kToken_##name;
+  KEYWORD_NAME_LIST(REGISTER_KEYWORD)
+#undef REGISTER_KEYWORD
+}
+
+// Takes ownership of |stream| as the lexer's input and immediately scans the
+// first token from it via Next().
+void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) {
+ stream_ = std::move(stream);
+ Next();
+}
+
+// Advances the lexer by one token and stores the result in token_.
+//
+// - If a Rewind() is pending, the token parked in next_token_ is replayed and
+//   the stream is not read.
+// - Once token_ is kEndOfInput or kParseError the lexer stays on that token.
+// - Otherwise whitespace and comments are skipped and the next token is
+//   scanned from stream_; preceded_by_newline_ records whether a '\n' was
+//   seen in the skipped whitespace.
+void AsmJsLexer::Next() {
vogelheim 2017/03/15 12:07:40 I find this method much nicer to read now. Thanks.
bradn 2017/03/16 00:21:47 :-)
+ // Replay the token saved by Rewind() instead of scanning.
+ if (rewind_) {
+ preceding_token_ = token_;
+ token_ = next_token_;
+ next_token_ = 0;
+ rewind_ = false;
+ return;
+ }
+
+ // End of input and parse errors are sticky.
+ if (token_ == kEndOfInput || token_ == kParseError) {
+ return;
+ }
+
+#if DEBUG
+ // Trace the token that is about to be left behind (skipped while it is
+ // still 0).
+ if (FLAG_trace_asm_lexer) {
+ if (Token() != 0) {
vogelheim 2017/03/15 12:07:41 nitpick: No real problem here, but this logic is a
bradn 2017/03/16 00:21:47 Done.
+ if (Token() == kDouble) {
+ PrintF("%lf ", AsDouble());
+ } else if (Token() == kUnsigned) {
+ PrintF("%" PRIu64 " ", AsUnsigned());
+ } else {
+ std::string name = Name(Token());
+ PrintF("%s ", name.c_str());
+ }
+ }
+ }
+#endif
+
+ preceded_by_newline_ = false;
+ preceding_token_ = token_;
+ // Loop until a token is produced; whitespace and comments `break` out of
+ // the switch and go around again, everything else returns.
+ for (;;) {
+ uc32 ch = stream_->Advance();
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\r':
+ // Ignore whitespace, track when we've passed a newline for optional
+ // semicolon support.
+ if (ch == '\n') {
vogelheim 2017/03/15 12:07:41 nitpick: This is weird. If you have a switch-case
bradn 2017/03/16 00:21:46 Hah, yeah good point (missed that in the refactor)
+ preceded_by_newline_ = true;
+ }
+ break;
+
+ case kEndOfInput:
+ token_ = kEndOfInput;
+ return;
+
+ case '\'':
+ case '"':
+ // The opening quote is passed along so the matching closing quote can
+ // be checked.
+ ConsumeString(ch);
+ return;
+
+ case '/':
+ // One character of lookahead: "//" and "/*" start comments, anything
+ // else makes this a plain '/' token.
+ ch = stream_->Advance();
+ if (ch == '/') {
+ ConsumeCComment();
+ } else if (ch == '*') {
+ ConsumeCPPComment();
+ } else {
+ stream_->Back();
+ token_ = '/';
+ return;
+ }
+ // Breaks out of switch, but loops again (i.e. the case when we parsed
+ // a comment, but need to continue to look for the next token).
+ break;
+
+ case '<':
+ case '>':
+ case '=':
+ case '!':
+ ConsumeCompareOrShift(ch);
+ return;
+
+ default:
+ if (IsIdentifierStart(ch)) {
+ ConsumeIdentifier(ch);
+ } else if (IsNumberStart(ch)) {
+ ConsumeNumber(ch);
+ } else if (ch >= 32 && ch < 127) {
vogelheim 2017/03/15 12:07:41 [Not sure this is an issue, but... ] How many of
bradn 2017/03/16 00:21:47 Listed out the single char ones.
+ // Use fixed token IDs for ASCII.
+ token_ = ch;
+ } else {
+ // TODO(bradnelson): Support unicode (probably via UnicodeCache).
+ token_ = kParseError;
+ }
+ return;
+ }
+ }
+}
+
+// Steps back by one token. The current token is parked in next_token_ so that
+// the following Next() call can replay it without reading the stream. Only a
+// single step back is supported (enforced by the DCHECK).
+void AsmJsLexer::Rewind() {
+  DCHECK(!rewind_);
+  rewind_ = true;
+
+  // Shift the token window back by one; the token before the previous one is
+  // not remembered.
+  next_token_ = token_;
+  token_ = preceding_token_;
+  preceding_token_ = 0;
+
+  // Reset the per-token scratch state.
+  identifier_string_.clear();
+  preceded_by_newline_ = false;
+}
+
+// Forgets every local identifier registered so far; the global and property
+// name tables are left untouched.
+void AsmJsLexer::ResetLocals() {
+  local_names_.clear();
+}
+
+#if DEBUG
+// Only used for debugging.
+// Returns a printable spelling for |token|: the character itself for
+// printable ASCII tokens, the identifier text for names found in the local,
+// global or property tables (searched in that order), the raw spelling for
+// the long symbols, or a "{...}" placeholder for the special tokens.
+std::string AsmJsLexer::Name(token_t token) const {
+  // TODO(bradnelson): Make thread safe.
+  if (token >= 32 && token < 127) {
+    return std::string(1, static_cast<char>(token));
+  }
+  // The tables map name -> token, so a reverse lookup is a linear scan.
+  for (const auto& entry : local_names_) {
+    if (entry.second == token) {
+      return entry.first;
Karl 2017/03/15 15:04:13 Why not just: return i.first;
bradn 2017/03/16 00:21:46 Done.
+    }
+  }
+  for (const auto& entry : global_names_) {
+    if (entry.second == token) {
+      return entry.first;
Karl 2017/03/15 15:04:13 Same here.
bradn 2017/03/16 00:21:47 Done.
+    }
+  }
+  for (const auto& entry : property_names_) {
+    if (entry.second == token) {
+      return entry.first;
Karl 2017/03/15 15:04:13 Same here.
bradn 2017/03/16 00:21:46 Done.
+    }
+  }
+  switch (token) {
+#define V(rawname, name) \
+  case kToken_##name:    \
+    return rawname;
+    LONG_SYMBOL_NAME_LIST(V)
+#undef V
+    case kUnsigned:
vogelheim 2017/03/15 12:07:40 Why not handle all of these inside the switch righ
bradn 2017/03/16 00:21:46 Done.
+      return "{unsigned value}";
+    case kDouble:
+      return "{double value}";
+    case kParseError:
+      return "{parse error}";
+    case kEndOfInput:
+      return "{end of input}";
+    default:
+      break;
+  }
+  UNREACHABLE();
+  return "{unreachable}";
+}
+#endif
+
+// Reports the current position of the underlying character stream, narrowed
+// to an int.
+// NOTE(review): Rewind() does not move the stream, so while a rewind is
+// pending this position presumably lies past the current token - confirm with
+// callers before relying on it in that state.
+int AsmJsLexer::GetPosition() const {
+  const auto stream_position = stream_->pos();
+  return static_cast<int>(stream_position);
+}
vogelheim 2017/03/15 12:07:40 Does this work if rewind_ is set? If not, maybe ad
bradn 2017/03/16 00:21:46 Done.
+
+// Repositions the stream at |pos|, discards all buffered token state and then
+// scans the first token found at the new position.
+void AsmJsLexer::Seek(int pos) {
+  // Drop any pending rewind together with the three-token window.
+  rewind_ = false;
+  next_token_ = 0;
+  token_ = 0;
+  preceding_token_ = 0;
+
+  stream_->Seek(pos);
+  Next();
+}
+
+// Scans the rest of an identifier whose first character is |ch| and resolves
+// it to a token id. Names not seen before are assigned a fresh id and recorded
+// in the table that matches the current context:
+//   - after a '.'    -> property_names_ (id taken from the global counter),
+//   - in local scope -> local_names_ (ids count downwards from kLocalsStart),
+//   - otherwise      -> global_names_.
+void AsmJsLexer::ConsumeIdentifier(uc32 ch) {
+  // Collect characters for as long as they belong to the identifier.
+  identifier_string_.clear();
vogelheim 2017/03/15 12:07:41 identifier_string_.clear(); (STL is bizarre, but.
bradn 2017/03/16 00:21:46 Yep. Done.
+  for (; IsIdentifierPart(ch); ch = stream_->Advance()) {
+    identifier_string_ += ch;
+  }
+  // The loop read one character past the identifier; hand it back.
+  stream_->Back();
+
+  // Property access: only the property table is consulted.
+  if (preceding_token_ == '.') {
+    auto known_property = property_names_.find(identifier_string_);
+    if (known_property != property_names_.end()) {
+      token_ = known_property->second;
+      return;
+    }
+    CHECK(global_count_ < kMaxIdentifierCount);
+    token_ = kGlobalsStart + global_count_++;
+    property_names_[identifier_string_] = token_;
+    return;
+  }
+
+  // Already-known locals are checked first, in any scope.
+  auto known_local = local_names_.find(identifier_string_);
+  if (known_local != local_names_.end()) {
+    token_ = known_local->second;
+    return;
+  }
+
+  // Inside a local scope the global table is not consulted: an unknown name
+  // becomes a new local.
+  if (in_local_scope_) {
+    CHECK(local_names_.size() < kMaxIdentifierCount);
+    token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
+    local_names_[identifier_string_] = token_;
+    return;
+  }
+
+  // Global scope: reuse a known global or register a new one.
+  auto known_global = global_names_.find(identifier_string_);
+  if (known_global != global_names_.end()) {
+    token_ = known_global->second;
+    return;
+  }
+  CHECK(global_count_ < kMaxIdentifierCount);
+  token_ = kGlobalsStart + global_count_++;
+  global_names_[identifier_string_] = token_;
+}
+
+// Scans a numeric literal whose first character is |ch| (a digit or '.') and
+// sets token_ to one of:
+//   - kUnsigned   : integer literal, value stored in unsigned_value_,
+//   - kDouble     : literal with a '.' or a non-integral value, stored in
+//                   double_value_,
+//   - '.'         : a dot that does not start a number,
+//   - kParseError : malformed literal, or an integer that does not fit into
+//                   32 unsigned bits.
+void AsmJsLexer::ConsumeNumber(uc32 ch) {
+  // Largest value an integer literal may have.
+  const unsigned long long kMaxUnsigned = 0xffffffffULL;
+
+  std::string number(1, static_cast<char>(ch));
+  bool has_dot = ch == '.';
+  for (;;) {
+    ch = stream_->Advance();
+    const char last = number[number.size() - 1];
+    // In a hex literal 'e'/'E' is a digit rather than an exponent marker, so
+    // a sign that follows it starts a new token (e.g. "0xe+1").
+    const bool is_hex =
+        number.size() > 1 && number[0] == '0' && number[1] == 'x';
+    const bool is_exponent_sign =
+        (ch == '-' || ch == '+') && (last == 'e' || last == 'E') && !is_hex;
+    if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
+        (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' ||
+        is_exponent_sign) {
+      // TODO(bradnelson): Test weird cases ending in -.
+      if (ch == '.') {
+        has_dot = true;
+      }
+      number += static_cast<char>(ch);
+    } else {
+      break;
+    }
+  }
+  // The loop read one character past the literal; hand it back.
+  stream_->Back();
+  // Special case the most common number.
+  if (number == "0") {
+    unsigned_value_ = 0;
+    token_ = kUnsigned;
+    return;
+  }
+  // Pick out dot.
+  if (number == ".") {
+    token_ = '.';
+    return;
+  }
+  // Decode numbers.
+  // TODO(bradnelson): Replace strto* with shared code with scanner.cc
+  const char* const text = number.c_str();
+  char* end = nullptr;
+  unsigned long long integer = 0;
+  bool decoded_as_double = false;
+  if (has_dot) {
+    double_value_ = strtod(text, &end);
+    decoded_as_double = true;
+  } else if (number.size() > 2 && number[0] == '0' && number[1] == 'x') {
+    // Decode 0x* as hex. strtoull is used so that values above 32 bits are
+    // detected on every platform instead of being truncated.
+    integer = strtoull(text + 2, &end, 16);
+  } else if (number.size() > 1 && number[0] == '0') {
+    // Decode 0* as octal.
+    integer = strtoull(text + 1, &end, 8);
+  } else {
+    // Decode the rest as double.
+    // This can come up in asm.js as for example 1e2 is used to encode 100.
+    double_value_ = strtod(text, &end);
+    decoded_as_double = true;
+  }
+  // Check if string to number conversion didn't consume all the characters.
+  // This happens if the character filter let through something invalid
+  // like: 0123ef for example.
+  // TODO(bradnelson): Check if this happens often enough to be a perf problem.
+  if (end != text + number.size()) {
+    // If things didn't parse fully, but start with a '.', back out the other
+    // characters and emit the '.' token.
+    if (number[0] == '.') {
+      for (size_t k = 1; k < number.size(); ++k) {
+        stream_->Back();
+      }
+      token_ = '.';
+      return;
+    }
+    // Anything else that doesn't parse is an error.
+    token_ = kParseError;
+    return;
+  }
+  if (has_dot) {
+    token_ = kDouble;
+    return;
+  }
+  if (decoded_as_double) {
+    // Converting an out-of-range double to an integer is undefined behavior,
+    // so reject such literals first. The negated form also rejects NaN.
+    if (!(double_value_ <= static_cast<double>(kMaxUnsigned))) {
+      token_ = kParseError;
+      return;
+    }
+    integer = static_cast<unsigned long long>(double_value_);
+    // A dot-less literal such as 1e-2 is not an integer; report it as a
+    // double instead of silently truncating it.
+    if (static_cast<double>(integer) != double_value_) {
+      token_ = kDouble;
+      return;
+    }
+  } else if (integer > kMaxUnsigned) {
+    token_ = kParseError;
+    return;
+  }
+  unsigned_value_ = static_cast<uint32_t>(integer);
+  token_ = kUnsigned;
+}
+
+// Skips the remainder of a single-line comment. Despite the name, Next()
+// calls this after seeing "//". Scanning stops after the terminating '\n' or
+// at the end of input.
+void AsmJsLexer::ConsumeCComment() {
+  for (;;) {
+    const uc32 ch = stream_->Advance();
+    if (ch == '\n') {
+      // The newline is consumed here, so the whitespace handling in Next()
+      // never sees it; record it for optional semicolon support.
+      preceded_by_newline_ = true;
+      return;
+    }
+    if (ch == kEndOfInput) {
+      return;
+    }
+  }
+}
+
+// Skips the remainder of a block comment. Despite the name, Next() calls this
+// after seeing "/*". Scanning stops after the closing "*/" or at the end of
+// input. A '\n' inside the comment is recorded in preceded_by_newline_ so a
+// comment spanning lines still counts as a line break for optional semicolon
+// support.
+void AsmJsLexer::ConsumeCPPComment() {
+  for (;;) {
+    uc32 ch = stream_->Advance();
+    // Any run of '*' may be terminated by the closing '/'.
+    while (ch == '*') {
vogelheim 2017/03/15 12:07:41 Your choice, but I think this if-branch would be a
bradn 2017/03/16 00:21:46 Ah, yeah, that's better. Done.
+      ch = stream_->Advance();
+      if (ch == '/') {
+        return;
+      }
+    }
+    if (ch == '\n') {
+      preceded_by_newline_ = true;
+    }
+    if (ch == kEndOfInput) {
vogelheim 2017/03/15 12:07:40 I think this potentially swallows a syntax error w
bradn 2017/03/16 00:21:46 Ah, yes. Fixed and added a test.
+      // TODO: an unterminated comment is still accepted silently here;
+      // reporting it needs a way to signal failure back to Next().
+      return;
+    }
+  }
+}
+
+// Scans a string literal whose opening quote character |quote| has already
+// been consumed. The only accepted content is the directive text followed by
+// the same quote character; this yields kToken_UseAsm, anything else yields
+// kParseError.
+void AsmJsLexer::ConsumeString(uc32 quote) {
+  // Only string allowed is 'use asm' / "use asm".
+  static const char kDirective[] = "use asm";
+  const size_t kDirectiveLength = sizeof(kDirective) - 1;
+
+  // Match the directive one character at a time, bailing out on the first
+  // mismatch.
+  for (size_t index = 0; index < kDirectiveLength; ++index) {
+    if (stream_->Advance() != kDirective[index]) {
+      token_ = kParseError;
+      return;
+    }
+  }
+
+  // The literal must be closed by the quote that opened it.
+  if (stream_->Advance() == quote) {
+    token_ = kToken_UseAsm;
+  } else {
+    token_ = kParseError;
+  }
+}
+
+// Scans an operator that starts with |ch|, one of '<', '>', '=' or '!'.
+// Recognizes "<=", ">=", "==", "!=", "<<", ">>" and ">>>"; otherwise the
+// lookahead character is handed back and |ch| itself becomes the token.
+void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) {
+  const uc32 second = stream_->Advance();
+
+  // Two-character comparison operators.
+  if (second == '=') {
+    switch (ch) {
+      case '<':
+        token_ = kToken_LE;
+        return;
+      case '>':
+        token_ = kToken_GE;
+        return;
+      case '=':
+        token_ = kToken_EQ;
+        return;
+      case '!':
+        token_ = kToken_NE;
+        return;
+      default:
+        UNREACHABLE();
+    }
+    return;
+  }
+
+  // Left shift.
+  if (ch == '<' && second == '<') {
+    token_ = kToken_SHL;
+    return;
+  }
+
+  // Right shifts: a third '>' selects kToken_SHR, otherwise it is kToken_SAR
+  // and the extra character is handed back.
+  if (ch == '>' && second == '>') {
+    const uc32 third = stream_->Advance();
+    if (third == '>') {
+      token_ = kToken_SHR;
+      return;
+    }
+    stream_->Back();
+    token_ = kToken_SAR;
+    return;
+  }
+
+  // Plain single-character operator.
+  stream_->Back();
+  token_ = ch;
+}
+
+// Returns true if |ch| may begin an identifier: an ASCII letter, '_' or '$'.
+bool AsmJsLexer::IsIdentifierStart(uc32 ch) {
+  if (ch == '_' || ch == '$') {
+    return true;
+  }
+  const bool is_upper = ch >= 'A' && ch <= 'Z';
+  const bool is_lower = ch >= 'a' && ch <= 'z';
+  return is_upper || is_lower;
+}
+
+// Returns true if |ch| may appear inside an identifier: anything that may
+// start one, or an ASCII digit.
+bool AsmJsLexer::IsIdentifierPart(uc32 ch) {
+  if (IsIdentifierStart(ch)) {
+    return true;
+  }
+  return ch >= '0' && ch <= '9';
+}
+
+// Returns true if |ch| may begin a numeric literal: an ASCII digit or '.'.
+bool AsmJsLexer::IsNumberStart(uc32 ch) {
+  if (ch == '.') {
+    return true;
+  }
+  const bool is_digit = ch >= '0' && ch <= '9';
+  return is_digit;
+}
+
+} // namespace internal
+} // namespace v8

Powered by Google App Engine
This is Rietveld 408576698