src/asmjs/asm-lexer.cc - Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer.

Side by Side Diff: src/asmjs/asm-lexer.cc

Issue 2751693002: [wasm][asm.js] Adding custom asm.js lexer. (Closed)

Patch Set: check Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2017 the V8 project authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "src/asmjs/asm-lexer.h"

	6

	7 #include <stdlib.h>

	8

	9 #include "src/objects.h"
	marja 2017/03/15 12:34:49: Why is objects.h needed? — bradn 2017/03/16 00:21:46: There was a Handle<String> used inline in scanner.h. Moved into the .cc file.
	10 #include "src/parsing/scanner-character-streams.h"
	vogelheim 2017/03/15 12:07:41: I don't see scanner-character-streams.h being used. Am I missing something? — bradn 2017/03/16 00:21:47: Dropped.
	11 #include "src/parsing/scanner.h"
	marja 2017/03/15 12:34:49: Hmm, you're still including scanner.h even though my previous comment about it was marked done. — bradn 2017/03/16 00:21:47: That was in the header. This is needed here because the Utf16CharacterStream's declaration is in scanner.h. — marja 2017/03/16 17:05:33: My original comment suggested moving the streams out of scanner.h. It's weird that a completely separate scanner needs to include scanner.h. But as I can see this CL is close to landing, would doing a follow-up be OK?
	12

	13 namespace v8 {

	14 namespace internal {

	15

	namespace {
	// Upper bound on the number of identifiers the lexer will hand out token ids
	// for. Capping the count ensures that both global and local identifiers can
	// be assigned a token id in the range of an int32_t.
	constexpr int kMaxIdentifierCount = 0xf000000;
	}  // namespace

	21

	22 AsmJsLexer::AsmJsLexer()

	23 : token_(0),

	24 preceding_token_(0),

	25 next_token_(0),

	26 rewind_(false),

	27 in_local_scope_(false),

	28 global_count_(0),

	29 double_value_(0.0),

	30 unsigned_value_(0),

	31 preceded_by_newline_(false) {

	32 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;

	33 STDLIB_MATH_FUNCTION_LIST(V)

	34 STDLIB_ARRAY_TYPE_LIST(V)

	35 #undef V

	36 #define V(name) property_names_[#name] = kToken_##name;

	37 STDLIB_MATH_VALUE_LIST(V)

	38 STDLIB_OTHER_LIST(V)

	39 #undef V

	40 #define V(name) global_names_[#name] = kToken_##name;

	41 KEYWORD_NAME_LIST(V)

	42 #undef V

	43 }

	44

	45 void AsmJsLexer::SetStream(std::unique_ptr<Utf16CharacterStream> stream) {

	46 stream_ = std::move(stream);

	47 Next();

	48 }

	49

	50 void AsmJsLexer::Next() {
	vogelheim 2017/03/15 12:07:40 I find this method much nicer to read now. Thanks. I find this method much nicer to read now. Thanks. bradn 2017/03/16 00:21:47 :-) Show quoted text On 2017/03/15 12:07:40, vogelheim wrote: > I find this method much nicer to read now. Thanks. :-)
	51 if (rewind_) {

	52 preceding_token_ = token_;

	53 token_ = next_token_;

	54 next_token_ = 0;

	55 rewind_ = false;

	56 return;

	57 }

	58

	59 if (token_ == kEndOfInput \|\| token_ == kParseError) {

	60 return;

	61 }

	62

	63 #if DEBUG

	64 if (FLAG_trace_asm_lexer) {

	65 if (Token() != 0) {
	vogelheim 2017/03/15 12:07:41 nitpick: No real problem here, but this logic is a nitpick: No real problem here, but this logic is a little weird. I think I'd either just handle a 0 token in Name, or just have the last else being an "else if (Token() != 0)". (Maybe Name(0) should be "{uninitialized}". I don't think 0 is a valid unicode code point.) bradn 2017/03/16 00:21:47 Done. Show quoted text On 2017/03/15 12:07:41, vogelheim wrote: > nitpick: No real problem here, but this logic is a little weird. I think I'd > either just handle a 0 token in Name, or just have the last else being an "else > if (Token() != 0)". > > (Maybe Name(0) should be "{uninitialized}". I don't think 0 is a valid unicode > code point.) Done.
	66 if (Token() == kDouble) {

	67 PrintF("%lf ", AsDouble());

	68 } else if (Token() == kUnsigned) {

	69 PrintF("%" PRIu64 " ", AsUnsigned());

	70 } else {

	71 std::string name = Name(Token());

	72 PrintF("%s ", name.c_str());

	73 }

	74 }

	75 }

	76 #endif

	77

	78 preceded_by_newline_ = false;

	79 preceding_token_ = token_;

	80 for (;;) {

	81 uc32 ch = stream_->Advance();

	82 switch (ch) {

	83 case ' ':

	84 case '\t':

	85 case '\n':

	86 case '\r':

	87 // Ignore whitespace, track when we've passed a newline for optional

	88 // semicolon support.

	89 if (ch == '\n') {
	vogelheim 2017/03/15 12:07:41 nitpick: This is weird. If you have a switch-case nitpick: This is weird. If you have a switch-case anyhow, and if you want to treat one branch separately, why not just have a separate case for it? case '\n': preceded_... break; case ' ': case '\t': .... bradn 2017/03/16 00:21:46 Hah, yeah good point (missed that in the refactor) Show quoted text On 2017/03/15 12:07:41, vogelheim wrote: > nitpick: This is weird. If you have a switch-case anyhow, and if you want to > treat one branch separately, why not just have a separate case for it? > > case '\n': > preceded_... > break; > case ' ': > case '\t': > .... Hah, yeah good point (missed that in the refactor). Done.
	90 preceded_by_newline_ = true;

	91 }

	92 break;

	93

	94 case kEndOfInput:

	95 token_ = kEndOfInput;

	96 return;

	97

	98 case '\'':

	99 case '"':

	100 ConsumeString(ch);

	101 return;

	102

	103 case '/':

	104 ch = stream_->Advance();

	105 if (ch == '/') {

	106 ConsumeCComment();

	107 } else if (ch == '*') {

	108 ConsumeCPPComment();

	109 } else {

	110 stream_->Back();

	111 token_ = '/';

	112 return;

	113 }

	114 // Breaks out of switch, but loops again (i.e. the case when we parsed

	115 // a comment, but need to continue to look for the next token).

	116 break;

	117

	118 case '<':

	119 case '>':

	120 case '=':

	121 case '!':

	122 ConsumeCompareOrShift(ch);

	123 return;

	124

	125 default:

	126 if (IsIdentifierStart(ch)) {

	127 ConsumeIdentifier(ch);

	128 } else if (IsNumberStart(ch)) {

	129 ConsumeNumber(ch);

	130 } else if (ch >= 32 && ch < 127) {
	vogelheim 2017/03/15 12:07:41 [Not sure this is an issue, but... ] How many of [Not sure this is an issue, but... ] How many of these are legitimate tokens anyhow? Alphabetic chars + numeric ones are handles above, as are some operators. So these would be some legitimate tokens (+, -, , square + curly + round braces), but some others would just be syntax errors (#)? bradn* 2017/03/16 00:21:47 Listed out the single char ones. Show quoted text On 2017/03/15 12:07:41, vogelheim wrote: > [Not sure this is an issue, but... ] > > How many of these are legitimate tokens anyhow? Alphabetic chars + numeric ones > are handles above, as are some operators. So these would be some legitimate > tokens (+, -, *, square + curly + round braces), but some others would just be > syntax errors (#)? Listed out the single char ones.
	131 // Use fixed token IDs for ASCII.

	132 token_ = ch;

	133 } else {

	134 // TODO(bradnelson): Support unicode (probably via UnicodeCache).

	135 token_ = kParseError;

	136 }

	137 return;

	138 }

	139 }

	140 }

	141

	142 void AsmJsLexer::Rewind() {

	143 DCHECK(!rewind_);

	144 next_token_ = token_;

	145 token_ = preceding_token_;

	146 preceding_token_ = 0;

	147 rewind_ = true;

	148 preceded_by_newline_ = false;

	149 identifier_string_.clear();

	150 }

	151

	152 void AsmJsLexer::ResetLocals() { local_names_.clear(); }

	153

	#if DEBUG
	// Returns a printable spelling of |token|. Only used for debugging.
	//
	// Printable ASCII tokens map to their character; identifiers are found by a
	// reverse (linear) search of the local, global and property name tables;
	// multi-character symbols and the special tokens are handled by the switch.
	// Any other token value is a programming error.
	std::string AsmJsLexer::Name(token_t token) const {
	  // TODO(bradnelson): Make thread safe.
	  if (token >= 32 && token < 127) {
	    return std::string(1, static_cast<char>(token));
	  }
	  // The tables map name -> token, so the name has to be searched for by value.
	  // Return the stored string directly rather than via c_str().
	  for (const auto& entry : local_names_) {
	    if (entry.second == token) return entry.first;
	  }
	  for (const auto& entry : global_names_) {
	    if (entry.second == token) return entry.first;
	  }
	  for (const auto& entry : property_names_) {
	    if (entry.second == token) return entry.first;
	  }
	  switch (token) {
	#define V(rawname, name) \
	  case kToken_##name:    \
	    return rawname;
	    LONG_SYMBOL_NAME_LIST(V)
	#undef V
	    case kUnsigned:
	      return "{unsigned value}";
	    case kDouble:
	      return "{double value}";
	    case kParseError:
	      return "{parse error}";
	    case kEndOfInput:
	      return "{end of input}";
	    default:
	      break;
	  }
	  UNREACHABLE();
	  return "{unreachable}";
	}
	#endif

	198

	199 int AsmJsLexer::GetPosition() const { return static_cast<int>(stream_->pos()); }
	vogelheim 2017/03/15 12:07:40: Does this work if rewind_ is set? If not, maybe add DCHECK(!rewind_). — bradn 2017/03/16 00:21:46: Done.
	200

	201 void AsmJsLexer::Seek(int pos) {

	202 stream_->Seek(pos);

	203 preceding_token_ = 0;

	204 token_ = 0;

	205 next_token_ = 0;

	206 rewind_ = false;

	207 Next();

	208 }

	209

	210 void AsmJsLexer::ConsumeIdentifier(uc32 ch) {

	211 // Consume characters while still part of the identifier.

	212 identifier_string_ = "";
	vogelheim 2017/03/15 12:07:41 identifier_string_.clear(); (STL is bizarre, but. identifier_string_.clear(); (STL is bizarre, but.. The way you've written it is very clear to read, but unfortunately will actually call a copy operation, and will iterate over the empty input string, only to figure out that it's empty. Not sure if compilers are being clever about it, but .clear() always does the right thing.) bradn 2017/03/16 00:21:46 Yep. Done. Show quoted text On 2017/03/15 12:07:41, vogelheim wrote: > identifier_string_.clear(); > > (STL is bizarre, but.. The way you've written it is very clear to read, but > unfortunately will actually call a copy operation, and will iterate over the > empty input string, only to figure out that it's empty. Not sure if compilers > are being clever about it, but .clear() always does the right thing.) Yep. Done.
	213 while (IsIdentifierPart(ch)) {

	214 identifier_string_ += ch;

	215 ch = stream_->Advance();

	216 }

	217 // Go back one for next time.

	218 stream_->Back();

	219

	220 // Decode what the identifier means.

	221 if (preceding_token_ == '.') {

	222 auto i = property_names_.find(identifier_string_);

	223 if (i != property_names_.end()) {

	224 token_ = i->second;

	225 return;

	226 }

	227 } else {

	228 {

	229 auto i = local_names_.find(identifier_string_);

	230 if (i != local_names_.end()) {

	231 token_ = i->second;

	232 return;

	233 }

	234 }

	235 if (!in_local_scope_) {

	236 auto i = global_names_.find(identifier_string_);

	237 if (i != global_names_.end()) {

	238 token_ = i->second;

	239 return;

	240 }

	241 }

	242 }

	243 if (preceding_token_ == '.') {

	244 CHECK(global_count_ < kMaxIdentifierCount);

	245 token_ = kGlobalsStart + global_count_++;

	246 property_names_[identifier_string_] = token_;

	247 } else if (in_local_scope_) {

	248 CHECK(local_names_.size() < kMaxIdentifierCount);

	249 token_ = kLocalsStart - static_cast<token_t>(local_names_.size());

	250 local_names_[identifier_string_] = token_;

	251 } else {

	252 CHECK(global_count_ < kMaxIdentifierCount);

	253 token_ = kGlobalsStart + global_count_++;

	254 global_names_[identifier_string_] = token_;

	255 }

	256 }

	257

	258 void AsmJsLexer::ConsumeNumber(uc32 ch) {

	259 std::string number;

	260 number = ch;

	261 bool has_dot = ch == '.';

	262 for (;;) {

	263 ch = stream_->Advance();

	264 if ((ch >= '0' && ch <= '9') \|\| (ch >= 'a' && ch <= 'f') \|\|

	265 (ch >= 'A' && ch <= 'F') \|\| ch == '.' \|\| ch == 'x' \|\|

	266 ((ch == '-' \|\| ch == '+') && (number[number.size() - 1] == 'e' \|\|

	267 number[number.size() - 1] == 'E'))) {

	268 // TODO(bradnelson): Test weird cases ending in -.

	269 if (ch == '.') {

	270 has_dot = true;

	271 }

	272 number += ch;

	273 } else {

	274 break;

	275 }

	276 }

	277 stream_->Back();

	278 // Special case the most common number.

	279 if (number == "0") {

	280 unsigned_value_ = 0;

	281 token_ = kUnsigned;

	282 return;

	283 }

	284 // Pick out dot.

	285 if (number == ".") {

	286 token_ = '.';

	287 return;

	288 }

	289 // Decode numbers.

	290 // TODO(bradnelson): Replace strto* with shared code with scanner.cc

	291 char* end;

	292 if (has_dot) {

	293 double_value_ = strtod(number.c_str(), &end);

	294 token_ = kDouble;

	295 } else {

	296 if (number.size() > 2 && number[0] == '0' && number[1] == 'x') {

	297 // Decode 0x* as hex.

	298 unsigned_value_ = strtoul(number.c_str() + 2, &end, 16);

	299 } else if (number.size() > 1 && number[0] == '0') {

	300 // Decode 0* as octal.

	301 unsigned_value_ = strtoul(number.c_str() + 1, &end, 8);

	302 } else {

	303 // Decode the rest as double.

	304 // This can come up in asm.js as for example 1e2 is used to encode 100.

	305 double_value_ = strtod(number.c_str(), &end);

	306 unsigned_value_ = static_cast<uint32_t>(double_value_);

	307 }

	308 token_ = kUnsigned;

	309 }

	310 // Check if string to number conversion didn't consume all the characters.

	311 // This happens if the character filter let through something invalid

	312 // like: 0123ef for example.

	313 // TODO(bradnelson): Check if this happens often enough to be a perf problem.

	314 if (end != number.c_str() + number.size()) {

	315 // If things didn't parse fully, but start with a '.', back out the other

	316 // characters and emit the '.' token.

	317 if (number[0] == '.') {

	318 for (size_t k = 1; k < number.size(); ++k) {

	319 stream_->Back();

	320 }

	321 token_ = '.';

	322 return;

	323 }

	324 // Anything else that doesn't parse is an error.

	325 token_ = kParseError;

	326 return;

	327 }

	328 }

	329

	330 void AsmJsLexer::ConsumeCComment() {

	331 for (;;) {

	332 uc32 ch = stream_->Advance();

	333 if (ch == '\n' \|\| ch == kEndOfInput) {

	334 break;

	335 }

	336 }

	337 }

	338

	339 void AsmJsLexer::ConsumeCPPComment() {

	340 for (;;) {

	341 uc32 ch = stream_->Advance();

	342 if (ch == '*') {
	vogelheim 2017/03/15 12:07:41 Your choice, but I think this if-branch would be a Your choice, but I think this if-branch would be a bit more compact if replaced with this: while (ch == '') { ch = stream_->Advance(); if (ch == '/' \|\| ch == kEndOfInput) return; } bradn* 2017/03/16 00:21:46 Ah, yeah, that's better. Done. Show quoted text On 2017/03/15 12:07:41, vogelheim wrote: > Your choice, but I think this if-branch would be a bit more compact if replaced > with this: > > while (ch == '*') { > ch = stream_->Advance(); > if (ch == '/' \|\| ch == kEndOfInput) > return; > } Ah, yeah, that's better. Done.
	343 ch = stream_->Advance();

	344 if (ch == '/') {

	345 break;

	346 }

	347 if (ch == '*') {

	348 stream_->Back();

	349 }

	350 } else if (ch == kEndOfInput) {
	vogelheim 2017/03/15 12:07:40 I think this potentially swallows a syntax error w I think this potentially swallows a syntax error w/ an unclosed comment at the end of a file. (E.g. xxx /* yyy <EOF>) bradn 2017/03/16 00:21:46 Ah, yes. Fixed and added a test. Show quoted text On 2017/03/15 12:07:40, vogelheim wrote: > I think this potentially swallows a syntax error w/ an unclosed comment at the > end of a file. (E.g. xxx /* yyy <EOF>) Ah, yes. Fixed and added a test.
	351 break;

	352 }

	353 }

	354 }

	355

	356 void AsmJsLexer::ConsumeString(uc32 quote) {

	357 // Only string allowed is 'use asm' / "use asm".

	358 const char* expected = "use asm";

	359 for (; *expected != '\0'; ++expected) {

	360 if (stream_->Advance() != *expected) {

	361 token_ = kParseError;

	362 return;

	363 }

	364 }

	365 if (stream_->Advance() != quote) {

	366 token_ = kParseError;

	367 return;

	368 }

	369 token_ = kToken_UseAsm;

	370 }

	371

	372 void AsmJsLexer::ConsumeCompareOrShift(uc32 ch) {

	373 uc32 next_ch = stream_->Advance();

	374 if (next_ch == '=') {

	375 switch (ch) {

	376 case '<':

	377 token_ = kToken_LE;

	378 break;

	379 case '>':

	380 token_ = kToken_GE;

	381 break;

	382 case '=':

	383 token_ = kToken_EQ;

	384 break;

	385 case '!':

	386 token_ = kToken_NE;

	387 break;

	388 default:

	389 UNREACHABLE();

	390 }

	391 } else if (ch == '<' && next_ch == '<') {

	392 token_ = kToken_SHL;

	393 } else if (ch == '>' && next_ch == '>') {

	394 if (stream_->Advance() == '>') {

	395 token_ = kToken_SHR;

	396 } else {

	397 token_ = kToken_SAR;

	398 stream_->Back();

	399 }

	400 } else {

	401 stream_->Back();

	402 token_ = ch;

	403 }

	404 }

	405

	406 bool AsmJsLexer::IsIdentifierStart(uc32 ch) {

	407 return (ch >= 'A' && ch <= 'Z') \|\| (ch >= 'a' && ch <= 'z') \|\| ch == '_' \|\|

	408 ch == '$';

	409 }

	410

	411 bool AsmJsLexer::IsIdentifierPart(uc32 ch) {

	412 return IsIdentifierStart(ch) \|\| (ch >= '0' && ch <= '9');

	413 }

	414

	415 bool AsmJsLexer::IsNumberStart(uc32 ch) {

	416 return ch == '.' \|\| (ch >= '0' && ch <= '9');

	417 }

	418

	419 } // namespace internal

	420 } // namespace v8

OLD	NEW

« src/asmjs/asm-lexer.h ('K') | « src/asmjs/asm-lexer.h ('k') | src/asmjs/asm-names.h » ('j') | test/unittests/unittests.gyp » ('J')