OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "tools/gn/tokenizer.h" |
| 6 |
| 7 #include "base/logging.h" |
| 8 #include "tools/gn/input_file.h" |
| 9 |
namespace {

// True for characters that may appear in an integer literal: a leading
// minus sign or a decimal digit.
bool IsNumberChar(char c) {
  return c == '-' || ('0' <= c && c <= '9');
}

// True for characters that can begin a two-character operator such as
// "==", "<=", "+=", "||" or "&&".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}

// True for characters that can end a two-character operator.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}

// True for characters that form a complete operator by themselves.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}

// True for characters that can start any operator, one- or two-character.
bool CouldBeOperator(char c) {
  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
}

// True for the list-separator character.
bool IsSeparatorChar(char c) {
  return c == ',';
}

// True for characters that open or close a scope: parentheses, brackets,
// and braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}

}  // namespace
| 43 |
// Constructs a tokenizer over |input_file|'s contents. Errors found while
// tokenizing are reported through |err|. Both pointers must outlive this
// object. Line and character-in-line counters are 1-based.
Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
    : input_file_(input_file),
      input_(input_file->contents()),
      err_(err),
      cur_(0),
      line_number_(1),
      char_in_line_(1) {
}
| 52 |
// Nothing to release: |input_file_| and |err_| are borrowed, not owned.
Tokenizer::~Tokenizer() {
}
| 55 |
| 56 // static |
| 57 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { |
| 58 Tokenizer t(input_file, err); |
| 59 return t.Run(); |
| 60 } |
| 61 |
| 62 std::vector<Token> Tokenizer::Run() { |
| 63 std::vector<Token> tokens; |
| 64 while (!done()) { |
| 65 AdvanceToNextToken(); |
| 66 if (done()) |
| 67 break; |
| 68 Location location = GetCurrentLocation(); |
| 69 |
| 70 Token::Type type = ClassifyCurrent(); |
| 71 if (type == Token::INVALID) { |
| 72 *err_ = GetErrorForInvalidToken(location); |
| 73 break; |
| 74 } |
| 75 size_t token_begin = cur_; |
| 76 AdvanceToEndOfToken(location, type); |
| 77 if (has_error()) |
| 78 break; |
| 79 size_t token_end = cur_; |
| 80 |
| 81 // TODO(brettw) This just strips comments from the token stream. This |
| 82 // is probably wrong, they should be removed at a later stage so we can |
| 83 // do things like rewrite the file. But this makes the parser simpler and |
| 84 // is OK for now. |
| 85 if (type != Token::COMMENT) { |
| 86 tokens.push_back(Token( |
| 87 location, |
| 88 type, |
| 89 base::StringPiece(&input_.data()[token_begin], |
| 90 token_end - token_begin))); |
| 91 } |
| 92 } |
| 93 if (err_->has_error()) |
| 94 tokens.clear(); |
| 95 return tokens; |
| 96 } |
| 97 |
| 98 // static |
| 99 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { |
| 100 int cur_line = 1; |
| 101 size_t cur_byte = 0; |
| 102 |
| 103 DCHECK(n > 0); |
| 104 |
| 105 if (n == 1) |
| 106 return 0; |
| 107 |
| 108 while (cur_byte < buf.size()) { |
| 109 if (IsNewline(buf, cur_byte)) { |
| 110 cur_line++; |
| 111 if (cur_line == n) |
| 112 return cur_byte + 1; |
| 113 } |
| 114 cur_byte++; |
| 115 } |
| 116 return -1; |
| 117 } |
| 118 |
// static
// Returns whether the character at |offset| in |buffer| terminates a line.
// |offset| must be within the buffer.
bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
  DCHECK(offset < buffer.size());
  // We may need more logic here to handle different line ending styles.
  return buffer[offset] == '\n';
}
| 125 |
| 126 |
// Skips whitespace so that |cur_| rests on the first character of the next
// token, or at the end of the input.
void Tokenizer::AdvanceToNextToken() {
  while (!at_end() && IsCurrentWhitespace())
    Advance();
}
| 131 |
| 132 Token::Type Tokenizer::ClassifyCurrent() const { |
| 133 DCHECK(!at_end()); |
| 134 char next_char = cur_char(); |
| 135 if (next_char >= '0' && next_char <= '9') |
| 136 return Token::INTEGER; |
| 137 if (next_char == '"') |
| 138 return Token::STRING; |
| 139 |
| 140 // Note: '-' handled specially below. |
| 141 if (next_char != '-' && CouldBeOperator(next_char)) |
| 142 return Token::OPERATOR; |
| 143 |
| 144 if (IsIdentifierFirstChar(next_char)) |
| 145 return Token::IDENTIFIER; |
| 146 |
| 147 if (IsScoperChar(next_char)) |
| 148 return Token::SCOPER; |
| 149 |
| 150 if (IsSeparatorChar(next_char)) |
| 151 return Token::SEPARATOR; |
| 152 |
| 153 if (next_char == '#') |
| 154 return Token::COMMENT; |
| 155 |
| 156 // For the case of '-' differentiate between a negative number and anything |
| 157 // else. |
| 158 if (next_char == '-') { |
| 159 if (!CanIncrement()) |
| 160 return Token::OPERATOR; // Just the minus before end of file. |
| 161 char following_char = input_[cur_ + 1]; |
| 162 if (following_char >= '0' && following_char <= '9') |
| 163 return Token::INTEGER; |
| 164 return Token::OPERATOR; |
| 165 } |
| 166 |
| 167 return Token::INVALID; |
| 168 } |
| 169 |
// Moves |cur_| from the first character of a token already classified as
// |type| to one character past its end, validating the token's form along
// the way. On a malformed token, sets |*err_|; the caller (Run) checks
// has_error() after this returns. |location| is the token's starting
// location, used to build error ranges.
void Tokenizer::AdvanceToEndOfToken(const Location& location,
                                    Token::Type type) {
  switch (type) {
    case Token::INTEGER:
      do {
        Advance();
      } while (!at_end() && IsNumberChar(cur_char()));
      if (!at_end()) {
        // Require the char after a number to be some kind of space, scope,
        // or operator.
        char c = cur_char();
        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
            !IsScoperChar(c) && !IsSeparatorChar(c)) {
          *err_ = Err(GetCurrentLocation(),
                      "This is not a valid number.",
                      "Learn to count.");
          // Highlight the number.
          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
        }
      }
      break;

    case Token::STRING: {
      char initial = cur_char();
      Advance();  // Advance past initial "
      for (;;) {
        if (at_end()) {
          // Ran off the end of input without seeing a closing quote.
          *err_ = Err(LocationRange(location,
                          Location(input_file_, line_number_, char_in_line_)),
                     "Unterminated string literal.",
                     "Don't leave me hanging like this!");
          break;
        }
        if (IsCurrentStringTerminator(initial)) {
          Advance();  // Skip past last "
          break;
        } else if (cur_char() == '\n') {
          // Note: the error is recorded but scanning continues to the
          // closing quote (or end of input, whose error above would then
          // replace this one).
          *err_ = Err(LocationRange(location,
                                    GetCurrentLocation()),
                      "Newline in string constant.");
        }
        Advance();
      }
      break;
    }

    case Token::OPERATOR:
      // Some operators are two characters, some are one.
      if (CouldBeTwoCharOperatorBegin(cur_char())) {
        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
          Advance();
      }
      Advance();
      break;

    case Token::IDENTIFIER:
      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
        Advance();
      break;

    case Token::SCOPER:
    case Token::SEPARATOR:
      Advance();  // All are one char.
      break;

    case Token::COMMENT:
      // Eat to EOL.
      while (!at_end() && !IsCurrentNewline())
        Advance();
      break;

    case Token::INVALID:
      // Run() stops before calling this for INVALID tokens, so this case
      // should be unreachable.
      *err_ = Err(location, "Everything is all messed up",
                  "Please insert system disk in drive A: and press any key.");
      NOTREACHED();
      return;
  }
}
| 248 |
| 249 bool Tokenizer::IsCurrentWhitespace() const { |
| 250 DCHECK(!at_end()); |
| 251 char c = input_[cur_]; |
| 252 // Note that tab (0x09) is illegal. |
| 253 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; |
| 254 } |
| 255 |
| 256 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { |
| 257 DCHECK(!at_end()); |
| 258 if (cur_char() != quote_char) |
| 259 return false; |
| 260 |
| 261 // Check for escaping. \" is not a string terminator, but \\" is. Count |
| 262 // the number of preceeding backslashes. |
| 263 int num_backslashes = 0; |
| 264 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) |
| 265 num_backslashes++; |
| 266 |
| 267 // Even backslashes mean that they were escaping each other and don't count |
| 268 // as escaping this quote. |
| 269 return (num_backslashes % 2) == 0; |
| 270 } |
| 271 |
// Returns whether the character at |cur_| is a newline.
bool Tokenizer::IsCurrentNewline() const {
  return IsNewline(input_, cur_);
}
| 275 |
| 276 void Tokenizer::Advance() { |
| 277 DCHECK(cur_ < input_.size()); |
| 278 if (IsCurrentNewline()) { |
| 279 line_number_++; |
| 280 char_in_line_ = 1; |
| 281 } else { |
| 282 char_in_line_++; |
| 283 } |
| 284 cur_++; |
| 285 } |
| 286 |
// Returns the current 1-based line/character position as a Location.
Location Tokenizer::GetCurrentLocation() const {
  return Location(input_file_, line_number_, char_in_line_);
}
| 290 |
| 291 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { |
| 292 std::string help; |
| 293 if (cur_char() == ';') { |
| 294 // Semicolon. |
| 295 help = "Semicolons are not needed, delete this one."; |
| 296 } else if (cur_char() == '\t') { |
| 297 // Tab. |
| 298 help = "You got a tab character in here. Tabs are evil. " |
| 299 "Convert to spaces."; |
| 300 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && |
| 301 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { |
| 302 // Different types of comments. |
| 303 help = "Comments should start with # instead"; |
| 304 } else { |
| 305 help = "I have no idea what this is."; |
| 306 } |
| 307 |
| 308 return Err(location, "Invalid token.", help); |
| 309 } |
OLD | NEW |