Index: tools/gn/tokenizer.cc
diff --git a/tools/gn/tokenizer.cc b/tools/gn/tokenizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..971f56b820a7a349c0878fa373cfbde2490ca36d
--- /dev/null
+++ b/tools/gn/tokenizer.cc
@@ -0,0 +1,309 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "tools/gn/tokenizer.h"
+
+#include "base/logging.h"
+#include "tools/gn/input_file.h"
+
+namespace {
+
+bool IsNumberChar(char c) {
+  return c == '-' || (c >= '0' && c <= '9');
+}
+
+bool CouldBeTwoCharOperatorBegin(char c) {
+  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
+         c == '+' || c == '|' || c == '&';
+}
+
+bool CouldBeTwoCharOperatorEnd(char c) {
+  return c == '=' || c == '|' || c == '&';
+}
+
+bool CouldBeOneCharOperator(char c) {
+  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
+         c == ':' || c == '|' || c == '&' || c == '-';
+}
+
+bool CouldBeOperator(char c) {
+  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
+}
+
+bool IsSeparatorChar(char c) {
+  return c == ',';
+}
+
+bool IsScoperChar(char c) {
+  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
+}
+
+}  // namespace
+
+Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
+    : input_file_(input_file),
+      input_(input_file->contents()),
+      err_(err),
+      cur_(0),
+      line_number_(1),
+      char_in_line_(1) {
+}
+
+Tokenizer::~Tokenizer() {
+}
+
+// static
+std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
+  Tokenizer t(input_file, err);
+  return t.Run();
+}
+
+std::vector<Token> Tokenizer::Run() {
+  std::vector<Token> tokens;
+  while (!done()) {
+    AdvanceToNextToken();
+    if (done())
+      break;
+    Location location = GetCurrentLocation();
+
+    Token::Type type = ClassifyCurrent();
+    if (type == Token::INVALID) {
+      *err_ = GetErrorForInvalidToken(location);
+      break;
+    }
+    size_t token_begin = cur_;
+    AdvanceToEndOfToken(location, type);
+    if (has_error())
+      break;
+    size_t token_end = cur_;
+
+    // TODO(brettw) This just strips comments from the token stream. This
+    // is probably wrong, they should be removed at a later stage so we can
+    // do things like rewrite the file. But this makes the parser simpler and
+    // is OK for now.
+    if (type != Token::COMMENT) {
+      tokens.push_back(Token(
+          location,
+          type,
+          base::StringPiece(&input_.data()[token_begin],
+                            token_end - token_begin)));
+    }
+  }
+  if (err_->has_error())
+    tokens.clear();
+  return tokens;
+}
+
+// static
+size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
+  int cur_line = 1;
+  size_t cur_byte = 0;
+
+  DCHECK(n > 0);
+
+  if (n == 1)
+    return 0;
+
+  while (cur_byte < buf.size()) {
+    if (IsNewline(buf, cur_byte)) {
+      cur_line++;
+      if (cur_line == n)
+        return cur_byte + 1;
+    }
+    cur_byte++;
+  }
+  return -1;
+}
+
+// static
+bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
+  DCHECK(offset < buffer.size());
+  // We may need more logic here to handle different line ending styles.
+  return buffer[offset] == '\n';
+}
+
+
+void Tokenizer::AdvanceToNextToken() {
+  while (!at_end() && IsCurrentWhitespace())
+    Advance();
+}
+
+Token::Type Tokenizer::ClassifyCurrent() const {
+  DCHECK(!at_end());
+  char next_char = cur_char();
+  if (next_char >= '0' && next_char <= '9')
+    return Token::INTEGER;
+  if (next_char == '"')
+    return Token::STRING;
+
+  // Note: '-' handled specially below.
+  if (next_char != '-' && CouldBeOperator(next_char))
+    return Token::OPERATOR;
+
+  if (IsIdentifierFirstChar(next_char))
+    return Token::IDENTIFIER;
+
+  if (IsScoperChar(next_char))
+    return Token::SCOPER;
+
+  if (IsSeparatorChar(next_char))
+    return Token::SEPARATOR;
+
+  if (next_char == '#')
+    return Token::COMMENT;
+
+  // For the case of '-' differentiate between a negative number and anything
+  // else.
+  if (next_char == '-') {
+    if (!CanIncrement())
+      return Token::OPERATOR;  // Just the minus before end of file.
+    char following_char = input_[cur_ + 1];
+    if (following_char >= '0' && following_char <= '9')
+      return Token::INTEGER;
+    return Token::OPERATOR;
+  }
+
+  return Token::INVALID;
+}
+
+void Tokenizer::AdvanceToEndOfToken(const Location& location,
+                                    Token::Type type) {
+  switch (type) {
+    case Token::INTEGER:
+      do {
+        Advance();
+      } while (!at_end() && IsNumberChar(cur_char()));
+      if (!at_end()) {
+        // Require the char after a number to be some kind of space, scope,
+        // or operator.
+        char c = cur_char();
+        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
+            !IsScoperChar(c) && !IsSeparatorChar(c)) {
+          *err_ = Err(GetCurrentLocation(),
+                      "This is not a valid number.",
+                      "Learn to count.");
+          // Highlight the number.
+          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
+        }
+      }
+      break;
+
+    case Token::STRING: {
+      char initial = cur_char();
+      Advance();  // Advance past initial "
+      for (;;) {
+        if (at_end()) {
+          *err_ = Err(LocationRange(location,
+                          Location(input_file_, line_number_, char_in_line_)),
+                      "Unterminated string literal.",
+                      "Don't leave me hanging like this!");
+          break;
+        }
+        if (IsCurrentStringTerminator(initial)) {
+          Advance();  // Skip past last "
+          break;
+        } else if (cur_char() == '\n') {
+          *err_ = Err(LocationRange(location,
+                                    GetCurrentLocation()),
+                      "Newline in string constant.");
+        }
+        Advance();
+      }
+      break;
+    }
+
+    case Token::OPERATOR:
+      // Some operators are two characters, some are one.
+      if (CouldBeTwoCharOperatorBegin(cur_char())) {
+        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
+          Advance();
+      }
+      Advance();
+      break;
+
+    case Token::IDENTIFIER:
+      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
+        Advance();
+      break;
+
+    case Token::SCOPER:
+    case Token::SEPARATOR:
+      Advance();  // All are one char.
+      break;
+
+    case Token::COMMENT:
+      // Eat to EOL.
+      while (!at_end() && !IsCurrentNewline())
+        Advance();
+      break;
+
+    case Token::INVALID:
+      *err_ = Err(location, "Everything is all messed up",
+                  "Please insert system disk in drive A: and press any key.");
+      NOTREACHED();
+      return;
+  }
+}
+
+bool Tokenizer::IsCurrentWhitespace() const {
+  DCHECK(!at_end());
+  char c = input_[cur_];
+  // Note that tab (0x09) is illegal.
+  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
+}
+
+bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
+  DCHECK(!at_end());
+  if (cur_char() != quote_char)
+    return false;
+
+  // Check for escaping. \" is not a string terminator, but \\" is. Count
+  // the number of preceding backslashes.
+  int num_backslashes = 0;
+  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
+    num_backslashes++;
+
+  // Even backslashes mean that they were escaping each other and don't count
+  // as escaping this quote.
+  return (num_backslashes % 2) == 0;
+}
+
+bool Tokenizer::IsCurrentNewline() const {
+  return IsNewline(input_, cur_);
+}
+
+void Tokenizer::Advance() {
+  DCHECK(cur_ < input_.size());
+  if (IsCurrentNewline()) {
+    line_number_++;
+    char_in_line_ = 1;
+  } else {
+    char_in_line_++;
+  }
+  cur_++;
+}
+
+Location Tokenizer::GetCurrentLocation() const {
+  return Location(input_file_, line_number_, char_in_line_);
+}
+
+Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
+  std::string help;
+  if (cur_char() == ';') {
+    // Semicolon.
+    help = "Semicolons are not needed, delete this one.";
+  } else if (cur_char() == '\t') {
+    // Tab.
+    help = "You got a tab character in here. Tabs are evil. "
+           "Convert to spaces.";
+  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
+             (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
+    // Different types of comments.
+    help = "Comments should start with # instead";
+  } else {
+    help = "I have no idea what this is.";
+  }
+
+  return Err(location, "Invalid token.", help);
+}
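
Not part of the patch: a minimal caller sketch for the two public entry points this file defines, Tokenizer::Tokenize() and Tokenizer::ByteOffsetOfNthLine(). The helper names below are illustrative only, and the "tools/gn/err.h" include path is an assumption (the patch only shows that Err and InputFile exist); how an InputFile gets its contents is outside this diff, so the sketch takes one that has already been loaded.

// Sketch only: drives the entry points defined in tokenizer.cc.
#include <vector>

#include "tools/gn/err.h"         // Assumed header name for Err.
#include "tools/gn/input_file.h"
#include "tools/gn/tokenizer.h"   // Brings in Token and base::StringPiece.

// Tokenizes an already-loaded InputFile. Run() clears the token vector and
// fills in *err on failure, so an empty result plus err->has_error() marks
// a bad input.
std::vector<Token> TokenizeFile(const InputFile* file, Err* err) {
  std::vector<Token> tokens = Tokenizer::Tokenize(file, err);
  if (err->has_error())
    return std::vector<Token>();  // Already cleared by Run(); be explicit.
  return tokens;
}

// ByteOffsetOfNthLine() is 1-based: for this buffer, line 3 ("line three")
// starts at byte offset 18, one past the second '\n'.
size_t ThirdLineOffset() {
  base::StringPiece buf("line one\nline two\nline three\n");
  return Tokenizer::ByteOffsetOfNthLine(buf, 3);  // Returns 18.
}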
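
Also not part of the patch: the quote-escaping rule in IsCurrentStringTerminator() is the one subtle bit of the scanner, so here is the same backslash-counting logic restated as a free function that can be eyeballed or unit-tested in isolation. The function name and the use of std::string are illustrative, not anything this file defines.

// A quote character terminates a string only when it is preceded by an even
// number of backslashes (they escape each other in pairs).
#include <cstddef>
#include <string>

bool QuoteTerminates(const std::string& s, size_t quote_pos) {
  int num_backslashes = 0;
  for (int i = static_cast<int>(quote_pos) - 1; i >= 0 && s[i] == '\\'; i--)
    num_backslashes++;
  return (num_backslashes % 2) == 0;
}

// QuoteTerminates("a\\\"", 2)   -> false: one backslash, the quote is escaped.
// QuoteTerminates("a\\\\\"", 3) -> true:  two backslashes escape each other,
//                                  so the quote ends the string.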