OLD | NEW |
---|---|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "tools/gn/tokenizer.h" | 5 #include "tools/gn/tokenizer.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
8 #include "base/strings/string_util.h" | |
8 #include "tools/gn/input_file.h" | 9 #include "tools/gn/input_file.h" |
9 | 10 |
10 namespace { | 11 namespace { |
11 | 12 |
12 bool CouldBeTwoCharOperatorBegin(char c) { | 13 bool CouldBeTwoCharOperatorBegin(char c) { |
13 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || | 14 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || |
14 c == '+' || c == '|' || c == '&'; | 15 c == '+' || c == '|' || c == '&'; |
15 } | 16 } |
16 | 17 |
17 bool CouldBeTwoCharOperatorEnd(char c) { | 18 bool CouldBeTwoCharOperatorEnd(char c) { |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
100 } | 101 } |
101 size_t token_begin = cur_; | 102 size_t token_begin = cur_; |
102 AdvanceToEndOfToken(location, type); | 103 AdvanceToEndOfToken(location, type); |
103 if (has_error()) | 104 if (has_error()) |
104 break; | 105 break; |
105 size_t token_end = cur_; | 106 size_t token_end = cur_; |
106 | 107 |
107 base::StringPiece token_value(&input_.data()[token_begin], | 108 base::StringPiece token_value(&input_.data()[token_begin], |
108 token_end - token_begin); | 109 token_end - token_begin); |
109 | 110 |
110 if (type == Token::UNCLASSIFIED_OPERATOR) | 111 if (type == Token::UNCLASSIFIED_OPERATOR) { |
111 type = GetSpecificOperatorType(token_value); | 112 type = GetSpecificOperatorType(token_value); |
112 if (type == Token::IDENTIFIER) { | 113 } else if (type == Token::IDENTIFIER) { |
113 if (token_value == "if") | 114 if (token_value == "if") |
114 type = Token::IF; | 115 type = Token::IF; |
115 else if (token_value == "else") | 116 else if (token_value == "else") |
116 type = Token::ELSE; | 117 type = Token::ELSE; |
117 else if (token_value == "true") | 118 else if (token_value == "true") |
118 type = Token::TRUE_TOKEN; | 119 type = Token::TRUE_TOKEN; |
119 else if (token_value == "false") | 120 else if (token_value == "false") |
120 type = Token::FALSE_TOKEN; | 121 type = Token::FALSE_TOKEN; |
122 } else if (type == Token::UNCLASSIFIED_COMMENT) { | |
123 // Find back to the previous \n, and trim. If it's only whitespace, then | |
124 // this is on a line alone, otherwise it's a suffix comment. | |
125 size_t newline_location = input_.find_last_of('\n', token_begin); | |
126 base::StringPiece to_newline = input_.substr( | |
127 newline_location + 1, token_begin - (newline_location + 1)); | |
128 std::string trimmed; | |
129 // TODO(scottmg): Should write TrimWhitespace for StringPiece. | |
130 base::TrimWhitespace(to_newline.as_string(), base::TRIM_ALL, &trimmed); | |
brettw
2014/09/23 21:33:15
I'd like to resolve this if possible, the tokenizer […truncated in review export]
scottmg
2014/09/23 22:15:37
Done. (as special function, that makes more sense) […truncated in review export]
| |
131 if (trimmed.empty()) | |
132 type = Token::LINE_COMMENT; | |
133 else | |
134 type = Token::SUFFIX_COMMENT; | |
121 } | 135 } |
122 | 136 |
123 // TODO(brettw) This just strips comments from the token stream. This | 137 tokens_.push_back(Token(location, type, token_value)); |
124 // is probably wrong, they should be removed at a later stage so we can | |
125 // do things like rewrite the file. But this makes the parser simpler and | |
126 // is OK for now. | |
127 if (type != Token::COMMENT) | |
128 tokens_.push_back(Token(location, type, token_value)); | |
129 } | 138 } |
130 if (err_->has_error()) | 139 if (err_->has_error()) |
131 tokens_.clear(); | 140 tokens_.clear(); |
132 return tokens_; | 141 return tokens_; |
133 } | 142 } |
134 | 143 |
135 // static | 144 // static |
136 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { | 145 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { |
137 DCHECK_GT(n, 0); | 146 DCHECK_GT(n, 0); |
138 | 147 |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
192 return Token::LEFT_BRACE; | 201 return Token::LEFT_BRACE; |
193 if (next_char == '}') | 202 if (next_char == '}') |
194 return Token::RIGHT_BRACE; | 203 return Token::RIGHT_BRACE; |
195 | 204 |
196 if (next_char == '.') | 205 if (next_char == '.') |
197 return Token::DOT; | 206 return Token::DOT; |
198 if (next_char == ',') | 207 if (next_char == ',') |
199 return Token::COMMA; | 208 return Token::COMMA; |
200 | 209 |
201 if (next_char == '#') | 210 if (next_char == '#') |
202 return Token::COMMENT; | 211 return Token::UNCLASSIFIED_COMMENT; |
203 | 212 |
204 // For the case of '-' differentiate between a negative number and anything | 213 // For the case of '-' differentiate between a negative number and anything |
205 // else. | 214 // else. |
206 if (next_char == '-') { | 215 if (next_char == '-') { |
207 if (!CanIncrement()) | 216 if (!CanIncrement()) |
208 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of | 217 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of |
209 // file. | 218 // file. |
210 char following_char = input_[cur_ + 1]; | 219 char following_char = input_[cur_ + 1]; |
211 if (IsAsciiDigit(following_char)) | 220 if (IsAsciiDigit(following_char)) |
212 return Token::INTEGER; | 221 return Token::INTEGER; |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
278 case Token::RIGHT_BRACKET: | 287 case Token::RIGHT_BRACKET: |
279 case Token::LEFT_BRACE: | 288 case Token::LEFT_BRACE: |
280 case Token::RIGHT_BRACE: | 289 case Token::RIGHT_BRACE: |
281 case Token::LEFT_PAREN: | 290 case Token::LEFT_PAREN: |
282 case Token::RIGHT_PAREN: | 291 case Token::RIGHT_PAREN: |
283 case Token::DOT: | 292 case Token::DOT: |
284 case Token::COMMA: | 293 case Token::COMMA: |
285 Advance(); // All are one char. | 294 Advance(); // All are one char. |
286 break; | 295 break; |
287 | 296 |
288 case Token::COMMENT: | 297 case Token::UNCLASSIFIED_COMMENT: |
289 // Eat to EOL. | 298 // Eat to EOL. |
290 while (!at_end() && !IsCurrentNewline()) | 299 while (!at_end() && !IsCurrentNewline()) |
291 Advance(); | 300 Advance(); |
292 break; | 301 break; |
293 | 302 |
294 case Token::INVALID: | 303 case Token::INVALID: |
295 default: | 304 default: |
296 *err_ = Err(location, "Everything is all messed up", | 305 *err_ = Err(location, "Everything is all messed up", |
297 "Please insert system disk in drive A: and press any key."); | 306 "Please insert system disk in drive A: and press any key."); |
298 NOTREACHED(); | 307 NOTREACHED(); |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
332 if (IsCurrentNewline()) { | 341 if (IsCurrentNewline()) { |
333 line_number_++; | 342 line_number_++; |
334 char_in_line_ = 1; | 343 char_in_line_ = 1; |
335 } else { | 344 } else { |
336 char_in_line_++; | 345 char_in_line_++; |
337 } | 346 } |
338 cur_++; | 347 cur_++; |
339 } | 348 } |
340 | 349 |
341 Location Tokenizer::GetCurrentLocation() const { | 350 Location Tokenizer::GetCurrentLocation() const { |
342 return Location(input_file_, line_number_, char_in_line_); | 351 return Location(input_file_, line_number_, char_in_line_, cur_); |
343 } | 352 } |
344 | 353 |
345 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { | 354 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { |
346 std::string help; | 355 std::string help; |
347 if (cur_char() == ';') { | 356 if (cur_char() == ';') { |
348 // Semicolon. | 357 // Semicolon. |
349 help = "Semicolons are not needed, delete this one."; | 358 help = "Semicolons are not needed, delete this one."; |
350 } else if (cur_char() == '\t') { | 359 } else if (cur_char() == '\t') { |
351 // Tab. | 360 // Tab. |
352 help = "You got a tab character in here. Tabs are evil. " | 361 help = "You got a tab character in here. Tabs are evil. " |
353 "Convert to spaces."; | 362 "Convert to spaces."; |
354 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && | 363 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && |
355 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { | 364 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { |
356 // Different types of comments. | 365 // Different types of comments. |
357 help = "Comments should start with # instead"; | 366 help = "Comments should start with # instead"; |
358 } else { | 367 } else { |
359 help = "I have no idea what this is."; | 368 help = "I have no idea what this is."; |
360 } | 369 } |
361 | 370 |
362 return Err(location, "Invalid token.", help); | 371 return Err(location, "Invalid token.", help); |
363 } | 372 } |
OLD | NEW |