Chromium Code Reviews| Index: src/asmjs/asm-lexer.cc |
| diff --git a/src/asmjs/asm-lexer.cc b/src/asmjs/asm-lexer.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..3e49edd4bd71a33207f1d5f7e94e821e409dd7b2 |
| --- /dev/null |
| +++ b/src/asmjs/asm-lexer.cc |
| @@ -0,0 +1,420 @@ |
| +// Copyright 2017 the V8 project authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "src/asmjs/asm-lexer.h" |
| + |
| +#include <stdlib.h> |
| + |
| +#include "src/objects.h" |
|
marja
2017/03/15 12:34:49
Why is objects.h needed?
bradn
2017/03/16 00:21:46
There was a Handle<String> used in scanner.h inlin
|
| +#include "src/parsing/scanner-character-streams.h" |
|
vogelheim
2017/03/15 12:07:41
I don't see scanner-character-streams.h being used
bradn
2017/03/16 00:21:47
Dropped.
|
| +#include "src/parsing/scanner.h" |
|
marja
2017/03/15 12:34:49
Hmm, you're still including scanner.h even though
bradn
2017/03/16 00:21:47
That was in the header.
This is needed here becaus
marja
2017/03/16 17:05:33
My orig. comment suggested moving the streams out
|
| + |
| +namespace v8 { |
| +namespace internal { |
| + |
| +namespace {
| +// Upper bound on the number of identifiers the lexer hands out ids for.
| +// Capping the count ensures that both global and local identifiers can be
| +// assigned a token id that stays within the range of an int32_t.
| +const int kMaxIdentifierCount = 0xf000000;
| +}  // namespace
| + |
| +AsmJsLexer::AsmJsLexer()  // Zero-initializes all lexer state and pre-registers the fixed name tables.
| + : token_(0),
| + preceding_token_(0),
| + next_token_(0),
| + rewind_(false),
| + in_local_scope_(false),
| + global_count_(0),
| + double_value_(0.0),
| + unsigned_value_(0),
| + preceded_by_newline_(false) {
| +#define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
| + STDLIB_MATH_FUNCTION_LIST(V)  // Each listed name becomes a property name mapped to its kToken_<name>.
| + STDLIB_ARRAY_TYPE_LIST(V)
| +#undef V
| +#define V(name) property_names_[#name] = kToken_##name;
| + STDLIB_MATH_VALUE_LIST(V)  // Single-argument lists; also registered as property names.
| + STDLIB_OTHER_LIST(V)
| +#undef V
| +#define V(name) global_names_[#name] = kToken_##name;
| + KEYWORD_NAME_LIST(V)  // Keywords are registered as global names so identifier lookup yields their fixed token.
| +#undef V
| +}
| + |
| +// Takes ownership of |stream| as the lexer's input and immediately scans the
| +// first token from it.
| +void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) {
| +  stream_ = std::move(stream);
| +  // Prime the lexer so that a current token is available right away.
| +  Next();
| +}
| + |
| +void AsmJsLexer::Next() {  // Advances by one token: token_ receives the new token, preceding_token_ the old one.

vogelheim
2017/03/15 12:07:40
I find this method much nicer to read now. Thanks.
bradn
2017/03/16 00:21:47
:-)
|
| + if (rewind_) {  // A Rewind() is pending: replay the stashed token without reading from the stream.
| + preceding_token_ = token_;
| + token_ = next_token_;
| + next_token_ = 0;
| + rewind_ = false;
| + return;
| + }
| +
| + if (token_ == kEndOfInput || token_ == kParseError) {  // Both states are sticky: further calls change nothing.
| + return;
| + }
| +
| +#if DEBUG
| + if (FLAG_trace_asm_lexer) {  // Tracing prints the token being left behind, i.e. the current one before advancing.
| + if (Token() != 0) {

vogelheim
2017/03/15 12:07:41
nitpick: No real problem here, but this logic is a
bradn
2017/03/16 00:21:47
Done.
|
| + if (Token() == kDouble) {
| + PrintF("%lf ", AsDouble());
| + } else if (Token() == kUnsigned) {
| + PrintF("%" PRIu64 " ", AsUnsigned());
| + } else {
| + std::string name = Name(Token());
| + PrintF("%s ", name.c_str());
| + }
| + }
| + }
| +#endif
| +
| + preceded_by_newline_ = false;  // Recomputed below for the token about to be scanned.
| + preceding_token_ = token_;
| + for (;;) {  // Loops only to skip whitespace and comments; every real token returns from inside the switch.
| + uc32 ch = stream_->Advance();
| + switch (ch) {
| + case ' ':
| + case '\t':
| + case '\n':
| + case '\r':
| + // Ignore whitespace, track when we've passed a newline for optional
| + // semicolon support.
| + if (ch == '\n') {

vogelheim
2017/03/15 12:07:41
nitpick: This is weird. If you have a switch-case
bradn
2017/03/16 00:21:46
Hah, yeah good point (missed that in the refactor)
|
| + preceded_by_newline_ = true;
| + }
| + break;
| +
| + case kEndOfInput:
| + token_ = kEndOfInput;
| + return;
| +
| + case '\'':
| + case '"':
| + ConsumeString(ch);  // Sets token_ itself: kToken_UseAsm or kParseError.
| + return;
| +
| + case '/':
| + ch = stream_->Advance();  // One character of lookahead decides between comment and division.
| + if (ch == '/') {
| + ConsumeCComment();  // "//" line comment (the helper skips to end of line, despite its name).
| + } else if (ch == '*') {
| + ConsumeCPPComment();  // "/*" block comment (the helper skips to the closing "*/", despite its name).
| + } else {
| + stream_->Back();  // Not a comment: un-read the lookahead and emit '/' as a token.
| + token_ = '/';
| + return;
| + }
| + // Breaks out of switch, but loops again (i.e. the case when we parsed
| + // a comment, but need to continue to look for the next token).
| + break;
| +
| + case '<':
| + case '>':
| + case '=':
| + case '!':
| + ConsumeCompareOrShift(ch);  // Handles both the two/three-character operators and the plain single character.
| + return;
| +
| + default:
| + if (IsIdentifierStart(ch)) {
| + ConsumeIdentifier(ch);
| + } else if (IsNumberStart(ch)) {  // Also taken for a leading '.', which ConsumeNumber may turn into a '.' token.
| + ConsumeNumber(ch);
| + } else if (ch >= 32 && ch < 127) {

vogelheim
2017/03/15 12:07:41
[Not sure this is an issue, but... ]
How many of
bradn
2017/03/16 00:21:47
Listed out the single char ones.
|
| + // Use fixed token IDs for ASCII: the token id is the character code itself.
| + token_ = ch;
| + } else {
| + // TODO(bradnelson): Support unicode (probably via UnicodeCache).
| + token_ = kParseError;
| + }
| + return;
| + }
| + }
| +}
| + |
| +// Un-reads the current token: it is parked in next_token_ and the preceding
| +// token becomes current again. Only one step of rewind is supported at a
| +// time; the following Next() replays the parked token instead of reading
| +// from the stream.
| +void AsmJsLexer::Rewind() {
| +  DCHECK(!rewind_);
| +  rewind_ = true;
| +  // Shift the token window back by one position.
| +  next_token_ = token_;
| +  token_ = preceding_token_;
| +  preceding_token_ = 0;
| +  // Side state belonging to the un-read token is dropped.
| +  identifier_string_.clear();
| +  preceded_by_newline_ = false;
| +}
| + |
| +// Forgets every local identifier registered so far.
| +void AsmJsLexer::ResetLocals() {
| +  local_names_.clear();
| +}
| + |
| +#if DEBUG
| +// Only used for debugging: maps a token id back to a human-readable string.
| +std::string AsmJsLexer::Name(token_t token) const {
| + // TODO(bradnelson): Make thread safe.
| + if (token >= 32 && token < 127) {  // Printable ASCII tokens are named by their own character.
| + return std::string(1, static_cast<char>(token));
| + }
| + for (auto& i : local_names_) {  // Reverse lookup by linear scan: locals first, then globals, then properties.
| + if (i.second == token) {
| + return i.first.c_str();

Karl
2017/03/15 15:04:13
Why not just:
return i.first;
bradn
2017/03/16 00:21:46
Done.
|
| + }
| + }
| + for (auto& i : global_names_) {
| + if (i.second == token) {
| + return i.first.c_str();

Karl
2017/03/15 15:04:13
Same here.
bradn
2017/03/16 00:21:47
Done.
|
| + }
| + }
| + for (auto& i : property_names_) {
| + if (i.second == token) {
| + return i.first.c_str();

Karl
2017/03/15 15:04:13
Same here.
bradn
2017/03/16 00:21:46
Done.
|
| + }
| + }
| + switch (token) {  // Symbols from LONG_SYMBOL_NAME_LIST map to their raw spelling.
| +#define V(rawname, name) \
| + case kToken_##name: \
| + return rawname;
| + LONG_SYMBOL_NAME_LIST(V)
| +#undef V
| + default:
| + break;
| + }
| + if (token == kUnsigned) {  // Value-carrying and sentinel tokens get descriptive placeholders.

vogelheim
2017/03/15 12:07:40
Why not handle all of these inside the switch righ
bradn
2017/03/16 00:21:46
Done.
|
| + return "{unsigned value}";
| + } else if (token == kDouble) {
| + return "{double value}";
| + } else if (token == kParseError) {
| + return "{parse error}";
| + } else if (token == kEndOfInput) {
| + return "{end of input}";
| + }
| + UNREACHABLE();  // Reached only for a token id that none of the lookups above recognise.
| + return "{unreachable}";
| +}
| +#endif
| + |
| +// Reports the current offset of the underlying character stream, narrowed to
| +// an int.
| +// NOTE(review): this reads the raw stream position, which presumably does not
| +// account for a pending Rewind() -- confirm callers never ask while rewound.
| +int AsmJsLexer::GetPosition() const {
| +  return static_cast<int>(stream_->pos());
| +}
|
vogelheim
2017/03/15 12:07:40
Does this work if rewind_ is set? If not, maybe ad
bradn
2017/03/16 00:21:46
Done.
|
| + |
| +// Repositions the input stream at |pos|, discards all token state (current,
| +// preceding and pending-rewind) and scans the token found at the new
| +// position.
| +void AsmJsLexer::Seek(int pos) {
| +  stream_->Seek(pos);
| +  // Nothing buffered before the seek is meaningful any more.
| +  rewind_ = false;
| +  next_token_ = 0;
| +  token_ = 0;
| +  preceding_token_ = 0;
| +  Next();
| +}
| + |
| +void AsmJsLexer::ConsumeIdentifier(uc32 ch) {  // Scans an identifier starting at |ch| and sets token_ to its (possibly newly assigned) id.
| + // Consume characters while still part of the identifier; the text is kept in identifier_string_.
| + identifier_string_ = "";

vogelheim
2017/03/15 12:07:41
identifier_string_.clear();
(STL is bizarre, but.
bradn
2017/03/16 00:21:46
Yep. Done.
|
| + while (IsIdentifierPart(ch)) {
| + identifier_string_ += ch;
| + ch = stream_->Advance();
| + }
| + // Go back one for next time: the character that ended the identifier belongs to the next token.
| + stream_->Back();
| +
| + // Decode what the identifier means: first try to find an id already assigned to this name.
| + if (preceding_token_ == '.') {  // Directly after '.', only the property name table is consulted.
| + auto i = property_names_.find(identifier_string_);
| + if (i != property_names_.end()) {
| + token_ = i->second;
| + return;
| + }
| + } else {
| + {
| + auto i = local_names_.find(identifier_string_);  // Locals are checked first, regardless of scope.
| + if (i != local_names_.end()) {
| + token_ = i->second;
| + return;
| + }
| + }
| + if (!in_local_scope_) {  // Globals (including keywords) are only looked up outside local scope.
| + auto i = global_names_.find(identifier_string_);
| + if (i != global_names_.end()) {
| + token_ = i->second;
| + return;
| + }
| + }
| + }
| + if (preceding_token_ == '.') {  // Unknown name: assign a fresh id and remember it in the matching table.
| + CHECK(global_count_ < kMaxIdentifierCount);
| + token_ = kGlobalsStart + global_count_++;  // New property names draw their ids from the same counter as globals.
| + property_names_[identifier_string_] = token_;
| + } else if (in_local_scope_) {
| + CHECK(local_names_.size() < kMaxIdentifierCount);
| + token_ = kLocalsStart - static_cast<token_t>(local_names_.size());  // Local ids count downwards from kLocalsStart.
| + local_names_[identifier_string_] = token_;
| + } else {
| + CHECK(global_count_ < kMaxIdentifierCount);
| + token_ = kGlobalsStart + global_count_++;  // Global ids count upwards from kGlobalsStart.
| + global_names_[identifier_string_] = token_;
| + }
| +}
| + |
| +// Scans a numeric literal (or a lone '.') whose first character |ch| has
| +// already been consumed, and sets token_ to one of:
| +//   - kDouble     (value in double_value_) if the literal contains a '.',
| +//   - kUnsigned   (value in unsigned_value_) otherwise,
| +//   - '.'         if the text is just a dot, or starts with a dot but does
| +//                 not form a number (the extra characters are un-read),
| +//   - kParseError if the text is not a well-formed number, or is a dot-less
| +//                 literal whose value does not fit in 32 unsigned bits.
| +void AsmJsLexer::ConsumeNumber(uc32 ch) {
| +  // Largest value a dot-less literal is allowed to have.
| +  const unsigned long long kMaxUnsignedValue = 0xffffffffull;
| +
| +  std::string number;
| +  number = ch;
| +  bool has_dot = ch == '.';
| +  bool has_hex_prefix = false;
| +  for (;;) {
| +    ch = stream_->Advance();
| +    const char last = number[number.size() - 1];
| +    // A sign only belongs to the literal as part of a decimal exponent. In a
| +    // hex literal 'e'/'E' is an ordinary digit, so for input like "0xe-5"
| +    // the '-' must be left for the next token.
| +    const bool is_exponent_sign = (ch == '-' || ch == '+') &&
| +                                  !has_hex_prefix &&
| +                                  (last == 'e' || last == 'E');
| +    if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
| +        (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'x' ||
| +        is_exponent_sign) {
| +      // TODO(bradnelson): Test weird cases ending in -.
| +      if (ch == '.') {
| +        has_dot = true;
| +      }
| +      if (ch == 'x') {
| +        has_hex_prefix = true;
| +      }
| +      number += ch;
| +    } else {
| +      break;
| +    }
| +  }
| +  // The character that ended the literal belongs to the next token.
| +  stream_->Back();
| +  // Special case the most common number.
| +  if (number == "0") {
| +    unsigned_value_ = 0;
| +    token_ = kUnsigned;
| +    return;
| +  }
| +  // Pick out dot.
| +  if (number == ".") {
| +    token_ = '.';
| +    return;
| +  }
| +  // Decode numbers.
| +  // TODO(bradnelson): Replace strto* with shared code with scanner.cc
| +  char* end;
| +  if (has_dot) {
| +    double_value_ = strtod(number.c_str(), &end);
| +    token_ = kDouble;
| +  } else {
| +    unsigned long long value = 0;
| +    if (number.size() > 2 && number[0] == '0' && number[1] == 'x') {
| +      // Decode 0x* as hex. strtoull saturates on overflow, so an oversized
| +      // literal is caught by the range check below.
| +      value = strtoull(number.c_str() + 2, &end, 16);
| +    } else if (number.size() > 1 && number[0] == '0') {
| +      // Decode 0* as octal.
| +      value = strtoull(number.c_str() + 1, &end, 8);
| +    } else {
| +      // Decode the rest as double.
| +      // This can come up in asm.js as for example 1e2 is used to encode 100.
| +      double_value_ = strtod(number.c_str(), &end);
| +      // Converting an out-of-range double to an integer type is undefined
| +      // behaviour, so reject it before the cast (this also covers infinity).
| +      if (!(double_value_ <= static_cast<double>(kMaxUnsignedValue))) {
| +        token_ = kParseError;
| +        return;
| +      }
| +      value = static_cast<unsigned long long>(double_value_);
| +    }
| +    // Do not silently truncate literals that need more than 32 bits.
| +    if (value > kMaxUnsignedValue) {
| +      token_ = kParseError;
| +      return;
| +    }
| +    unsigned_value_ = static_cast<uint32_t>(value);
| +    token_ = kUnsigned;
| +  }
| +  // Check if string to number conversion didn't consume all the characters.
| +  // This happens if the character filter let through something invalid
| +  // like: 0123ef for example.
| +  // TODO(bradnelson): Check if this happens often enough to be a perf problem.
| +  if (end != number.c_str() + number.size()) {
| +    // If things didn't parse fully, but start with a '.', back out the other
| +    // characters and emit the '.' token.
| +    if (number[0] == '.') {
| +      for (size_t k = 1; k < number.size(); ++k) {
| +        stream_->Back();
| +      }
| +      token_ = '.';
| +      return;
| +    }
| +    // Anything else that doesn't parse is an error.
| +    token_ = kParseError;
| +    return;
| +  }
| +}
| + |
| +// Skips the remainder of a "//" line comment; the leading "//" has already
| +// been consumed by the caller. Stops after the terminating newline or at end
| +// of input.
| +// NOTE: despite the name, this handles the line-comment form; the "/* */"
| +// form is handled by ConsumeCPPComment().
| +void AsmJsLexer::ConsumeCComment() {
| +  for (;;) {
| +    uc32 ch = stream_->Advance();
| +    if (ch == '\n') {
| +      // The newline is swallowed here and never reaches the whitespace case
| +      // in Next(), so record it now; otherwise a token that follows a line
| +      // comment would not be seen as preceded by a newline.
| +      preceded_by_newline_ = true;
| +      return;
| +    }
| +    if (ch == kEndOfInput) {
| +      return;
| +    }
| +  }
| +}
| + |
| +void AsmJsLexer::ConsumeCPPComment() {  // Skips a block comment up to and including the closing "*/" (called after "/*" was consumed, despite the name).
| + for (;;) {  // NOTE(review): hitting end of input just ends the loop (no error is reported), and newlines inside the comment are not recorded.
| + uc32 ch = stream_->Advance();
| + if (ch == '*') {  // Possible start of the terminator: look at one more character.

vogelheim
2017/03/15 12:07:41
Your choice, but I think this if-branch would be a
bradn
2017/03/16 00:21:46
Ah, yeah, that's better.
Done.
|
| + ch = stream_->Advance();
| + if (ch == '/') {
| + break;
| + }
| + if (ch == '*') {
| + stream_->Back();
| + }
| + } else if (ch == kEndOfInput) {

vogelheim
2017/03/15 12:07:40
I think this potentially swallows a syntax error w
bradn
2017/03/16 00:21:46
Ah, yes.
Fixed and added a test.
|
| + break;
| + }
| + }
| +}
| + |
| +// Scans a string literal whose opening |quote| has already been consumed.
| +// The only string accepted is the directive 'use asm' / "use asm", closed by
| +// the same quote character that opened it; this yields kToken_UseAsm. Any
| +// other content yields kParseError.
| +void AsmJsLexer::ConsumeString(uc32 quote) {
| +  const char kDirective[] = "use asm";
| +  // Assume failure until the complete directive and closing quote are seen.
| +  token_ = kParseError;
| +  for (const char* cursor = kDirective; *cursor != '\0'; ++cursor) {
| +    if (stream_->Advance() != *cursor) {
| +      return;
| +    }
| +  }
| +  if (stream_->Advance() != quote) {
| +    return;
| +  }
| +  token_ = kToken_UseAsm;
| +}
| + |
| +// Scans an operator that starts with |ch| (one of '<', '>', '=', '!'), which
| +// has already been consumed. Recognizes "<=", ">=", "==", "!=", "<<", ">>"
| +// and ">>>"; otherwise the single character |ch| itself is the token.
| +void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) {
| +  const uc32 lookahead = stream_->Advance();
| +
| +  // Two-character comparisons: <=, >=, ==, !=.
| +  if (lookahead == '=') {
| +    if (ch == '<') {
| +      token_ = kToken_LE;
| +    } else if (ch == '>') {
| +      token_ = kToken_GE;
| +    } else if (ch == '=') {
| +      token_ = kToken_EQ;
| +    } else if (ch == '!') {
| +      token_ = kToken_NE;
| +    } else {
| +      UNREACHABLE();
| +    }
| +    return;
| +  }
| +
| +  // Left shift: <<.
| +  if (ch == '<' && lookahead == '<') {
| +    token_ = kToken_SHL;
| +    return;
| +  }
| +
| +  // Right shifts: >>> (logical) or >> (arithmetic).
| +  if (ch == '>' && lookahead == '>') {
| +    if (stream_->Advance() == '>') {
| +      token_ = kToken_SHR;
| +      return;
| +    }
| +    // Third character is not part of the operator; give it back.
| +    stream_->Back();
| +    token_ = kToken_SAR;
| +    return;
| +  }
| +
| +  // Plain single-character token; un-read the lookahead.
| +  stream_->Back();
| +  token_ = ch;
| +}
| + |
| +// Returns true if |ch| may begin an identifier: an ASCII letter, '_' or '$'.
| +bool AsmJsLexer::IsIdentifierStart(uc32 ch) {
| +  if (ch == '_' || ch == '$') {
| +    return true;
| +  }
| +  const bool is_upper = ch >= 'A' && ch <= 'Z';
| +  const bool is_lower = ch >= 'a' && ch <= 'z';
| +  return is_upper || is_lower;
| +}
| + |
| +// Returns true if |ch| may appear inside an identifier: anything that may
| +// start one, or an ASCII digit.
| +bool AsmJsLexer::IsIdentifierPart(uc32 ch) {
| +  if (IsIdentifierStart(ch)) {
| +    return true;
| +  }
| +  return ch >= '0' && ch <= '9';
| +}
| + |
| +// Returns true if |ch| may begin a numeric literal: an ASCII digit or '.'.
| +bool AsmJsLexer::IsNumberStart(uc32 ch) {
| +  if (ch == '.') {
| +    return true;
| +  }
| +  return ch >= '0' && ch <= '9';
| +}
| + |
| +} // namespace internal |
| +} // namespace v8 |