tools/gn/tokenizer.cc - Issue 21114002: Add initial prototype for the GN meta-buildsystem.

Unified Diff: tools/gn/tokenizer.cc

Issue 21114002: Add initial prototype for the GN meta-buildsystem. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: add owners and readme Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: tools/gn/tokenizer.cc

diff --git a/tools/gn/tokenizer.cc b/tools/gn/tokenizer.cc

new file mode 100644

index 0000000000000000000000000000000000000000..971f56b820a7a349c0878fa373cfbde2490ca36d

--- /dev/null

+++ b/tools/gn/tokenizer.cc

@@ -0,0 +1,309 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "tools/gn/tokenizer.h"

+#include "base/logging.h"

+#include "tools/gn/input_file.h"

+namespace {

+// Returns true if |c| can continue an integer literal, i.e. it is an ASCII
+// digit. A leading '-' is deliberately not matched: ClassifyCurrent() is what
+// recognizes a negative literal, and AdvanceToEndOfToken() steps over the
+// first character before consulting this predicate. Accepting '-' here would
+// glue input such as "1-2" into a single bogus INTEGER token.
+bool IsNumberChar(char c) {
+  return c >= '0' && c <= '9';
+}

+// Returns true if |c| may be the first character of a two-character operator.
+bool CouldBeTwoCharOperatorBegin(char c) {
+  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
+         c == '+' || c == '|' || c == '&';
+}

+// Returns true if |c| may be the second character of a two-character
+// operator.
+bool CouldBeTwoCharOperatorEnd(char c) {
+  return c == '=' || c == '|' || c == '&';
+}

+// Returns true if |c| on its own may form a complete one-character operator.
+bool CouldBeOneCharOperator(char c) {
+  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
+         c == ':' || c == '|' || c == '&' || c == '-';
+}

+// Returns true if |c| may start any operator, whether one or two characters
+// long.
+bool CouldBeOperator(char c) {
+  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
+}

+// Returns true if |c| is the separator character (a comma).
+bool IsSeparatorChar(char c) {
+  return c == ',';
+}

+// Returns true if |c| is one of the bracketing characters that are classified
+// as SCOPER tokens: parentheses, square brackets, or curly braces.
+bool IsScoperChar(char c) {
+  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
+}

+} // namespace

+// Sets up a tokenizer over the contents of |input_file|, positioned at the
+// first byte (line 1, character 1). Errors found while tokenizing are written
+// to |err|.
+Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
+    : input_file_(input_file),
+      input_(input_file->contents()),
+      err_(err),
+      cur_(0),
+      line_number_(1),
+      char_in_line_(1) {
+}

+// The destructor has an empty body.
+Tokenizer::~Tokenizer() {
+}

+// static
+// Convenience entry point: builds a temporary Tokenizer for |input_file| and
+// returns the result of Run(). Errors are reported through |err|.
+std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
+  Tokenizer t(input_file, err);
+  return t.Run();
+}

+// Tokenizes the input from the current position and returns the tokens found.
+// Comment tokens are dropped from the result. Tokenizing stops at the first
+// invalid token or error; if |err_| holds an error when the loop finishes, an
+// empty vector is returned.
+std::vector<Token> Tokenizer::Run() {
+  std::vector<Token> tokens;
+  while (!done()) {
+    AdvanceToNextToken();
+    if (done())
+      break;
+    // Remember where the token starts so it can be attached to the Token and
+    // to any error reported for it.
+    Location location = GetCurrentLocation();
+    Token::Type type = ClassifyCurrent();
+    if (type == Token::INVALID) {
+      *err_ = GetErrorForInvalidToken(location);
+      break;
+    }
+    size_t token_begin = cur_;
+    AdvanceToEndOfToken(location, type);
+    if (has_error())
+      break;
+    size_t token_end = cur_;
+    // TODO(brettw) This just strips comments from the token stream. This
+    // is probably wrong, they should be removed at a later stage so we can
+    // do things like rewrite the file. But this makes the parser simpler and
+    // is OK for now.
+    if (type != Token::COMMENT) {
+      // The token's value is a view into |input_|; no bytes are copied.
+      tokens.push_back(Token(
+          location,
+          type,
+          base::StringPiece(&input_.data()[token_begin],
+                            token_end - token_begin)));
+    }
+  }
+  if (err_->has_error())
+    tokens.clear();
+  return tokens;
+}

+// static
+// Returns the byte offset within |buf| at which the 1-based line |n| begins.
+// Line 1 always begins at offset 0. If |buf| contains fewer than |n| lines,
+// returns static_cast<size_t>(-1) as a "not found" sentinel.
+size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
+  DCHECK(n > 0);
+  if (n == 1)
+    return 0;
+  int cur_line = 1;
+  size_t cur_byte = 0;
+  while (cur_byte < buf.size()) {
+    if (IsNewline(buf, cur_byte)) {
+      cur_line++;
+      // The requested line begins on the byte right after this newline.
+      if (cur_line == n)
+        return cur_byte + 1;
+    }
+    cur_byte++;
+  }
+  // Spell out the conversion instead of relying on an implicit int -> size_t
+  // wraparound of -1.
+  return static_cast<size_t>(-1);
+}

+// static
+// Returns true if the byte at |offset| in |buffer| is a line feed ('\n').
+// |offset| must be less than buffer.size().
+bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
+  DCHECK(offset < buffer.size());
+  // We may need more logic here to handle different line ending styles.
+  return buffer[offset] == '\n';
+}

+// Skips over whitespace, leaving the cursor on the first non-whitespace
+// character or at the end of the input.
+void Tokenizer::AdvanceToNextToken() {
+  while (!at_end() && IsCurrentWhitespace())
+    Advance();
+}

+// Decides what kind of token starts at the current character, without
+// consuming any input. Returns Token::INVALID if the character cannot start
+// any known token. Must not be called at the end of the input.
+Token::Type Tokenizer::ClassifyCurrent() const {
+  DCHECK(!at_end());
+  char next_char = cur_char();
+  if (next_char >= '0' && next_char <= '9')
+    return Token::INTEGER;
+  if (next_char == '"')
+    return Token::STRING;
+  // Note: '-' handled specially below.
+  if (next_char != '-' && CouldBeOperator(next_char))
+    return Token::OPERATOR;
+  if (IsIdentifierFirstChar(next_char))
+    return Token::IDENTIFIER;
+  if (IsScoperChar(next_char))
+    return Token::SCOPER;
+  if (IsSeparatorChar(next_char))
+    return Token::SEPARATOR;
+  if (next_char == '#')
+    return Token::COMMENT;
+  // For the case of '-' differentiate between a negative number and anything
+  // else.
+  if (next_char == '-') {
+    if (!CanIncrement())
+      return Token::OPERATOR;  // Just the minus before end of file.
+    char following_char = input_[cur_ + 1];
+    if (following_char >= '0' && following_char <= '9')
+      return Token::INTEGER;
+    return Token::OPERATOR;
+  }
+  return Token::INVALID;
+}

+// Moves the cursor past the token of kind |type| that starts at the current
+// position. |location| is where the token begins and is used to build the
+// highlighted range of any error. Problems are recorded in |*err_|.
+void Tokenizer::AdvanceToEndOfToken(const Location& location,
+                                    Token::Type type) {
+  switch (type) {
+    case Token::INTEGER:
+      // The first character (a digit or a leading '-') was already accepted
+      // by ClassifyCurrent(), so always step over it before testing the rest.
+      do {
+        Advance();
+      } while (!at_end() && IsNumberChar(cur_char()));
+      if (!at_end()) {
+        // Require the char after a number to be some kind of space, scope,
+        // or operator.
+        char c = cur_char();
+        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
+            !IsScoperChar(c) && !IsSeparatorChar(c)) {
+          *err_ = Err(GetCurrentLocation(),
+                      "This is not a valid number.",
+                      "Learn to count.");
+          // Highlight the number.
+          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
+        }
+      }
+      break;
+    case Token::STRING: {
+      char initial = cur_char();
+      Advance();  // Advance past initial "
+      for (;;) {
+        if (at_end()) {
+          *err_ = Err(LocationRange(location, GetCurrentLocation()),
+                      "Unterminated string literal.",
+                      "Don't leave me hanging like this!");
+          break;
+        }
+        if (IsCurrentStringTerminator(initial)) {
+          Advance();  // Skip past last "
+          break;
+        } else if (cur_char() == '\n') {
+          // Record the error but keep scanning; the loop still ends at the
+          // closing quote or at the end of the input.
+          *err_ = Err(LocationRange(location,
+                                    GetCurrentLocation()),
+                      "Newline in string constant.");
+        }
+        Advance();
+      }
+      break;
+    }
+    case Token::OPERATOR:
+      // Some operators are two characters, some are one.
+      if (CouldBeTwoCharOperatorBegin(cur_char())) {
+        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
+          Advance();
+      }
+      Advance();
+      break;
+    case Token::IDENTIFIER:
+      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
+        Advance();
+      break;
+    case Token::SCOPER:
+    case Token::SEPARATOR:
+      Advance();  // All are one char.
+      break;
+    case Token::COMMENT:
+      // Eat to EOL.
+      while (!at_end() && !IsCurrentNewline())
+        Advance();
+      break;
+    case Token::INVALID:
+      // Run() filters out INVALID before calling this function, so reaching
+      // this case is not expected.
+      *err_ = Err(location, "Everything is all messed up",
+                  "Please insert system disk in drive A: and press any key.");
+      NOTREACHED();
+      return;
+  }
+}

+// Returns true if the current character is whitespace: line feed, vertical
+// tab, form feed, carriage return, or space. Must not be called at the end of
+// the input.
+bool Tokenizer::IsCurrentWhitespace() const {
+  DCHECK(!at_end());
+  char c = input_[cur_];
+  // Note that tab (0x09) is illegal.
+  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
+}

+// Returns true if the current character is an unescaped |quote_char|, i.e.
+// one that ends the string literal being scanned. Must not be called at the
+// end of the input.
+bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
+  DCHECK(!at_end());
+  if (cur_char() != quote_char)
+    return false;
+  // Check for escaping. \" is not a string terminator, but \\" is. Count
+  // the number of preceding backslashes. The scan walks backwards with an
+  // unsigned index so that offsets beyond INT_MAX are not truncated.
+  size_t num_backslashes = 0;
+  for (size_t i = cur_; i > 0 && input_[i - 1] == '\\'; i--)
+    num_backslashes++;
+  // Even backslashes mean that they were escaping each other and don't count
+  // as escaping this quote.
+  return (num_backslashes % 2) == 0;
+}

+// Returns true if the character under the cursor is a newline, as defined by
+// IsNewline().
+bool Tokenizer::IsCurrentNewline() const {
+  return IsNewline(input_, cur_);
+}

+// Moves the cursor forward by one byte and keeps the line/character counters
+// in sync: stepping over a newline starts a new line at character 1, and any
+// other byte bumps the character position. Must not be called at the end of
+// the input.
+void Tokenizer::Advance() {
+  DCHECK(cur_ < input_.size());
+  if (IsCurrentNewline()) {
+    line_number_++;
+    char_in_line_ = 1;
+  } else {
+    char_in_line_++;
+  }
+  cur_++;
+}

+// Returns the cursor position as a Location built from the input file and the
+// current line and character counters.
+Location Tokenizer::GetCurrentLocation() const {
+  return Location(input_file_, line_number_, char_in_line_);
+}

+Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {

+ std::string help;

+ if (cur_char() == ';') {

+ // Semicolon.

+ help = "Semicolons are not needed, delete this one.";

+ } else if (cur_char() == '\t') {

+ // Tab.

+ help = "You got a tab character in here. Tabs are evil. "

+ "Convert to spaces.";

+ } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&

+ (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {

+ // Different types of comments.

+ help = "Comments should start with # instead";

+ } else {

+ help = "I have no idea what this is.";

+ }

+ return Err(location, "Invalid token.", help);

« no previous file with comments | « tools/gn/tokenizer.h ('k') | tools/gn/tokenizer_unittest.cc » ('j') | no next file with comments »