Index: mojom/lexer.cc |
diff --git a/mojom/lexer.cc b/mojom/lexer.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..e55e2fbca0b2f5160a69e8b37257b74c2cd861bd |
--- /dev/null |
+++ b/mojom/lexer.cc |
@@ -0,0 +1,420 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "mojom/lexer.h" |
+ |
+#include <map> |
+#include <string> |
+ |
+#include "base/lazy_instance.h" |
+ |
+namespace mojo { |
+namespace mojom { |
+ |
+namespace { |
+ |
// Holds the mapping from mojom keyword spelling to its token type. Built
// once, lazily, via base::LazyInstance and accessed through Keywords().
class KeywordsDict {
 public:
  KeywordsDict();

 private:
  std::map<std::string, mojom::TokenType> keywords_;
  // Keywords() is the sole accessor; it reaches directly into keywords_.
  friend std::map<std::string, mojom::TokenType>& Keywords();

  DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
// Lazily-constructed singleton; construction populates the keyword table.
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;

// Returns the shared keyword table, constructing it on first use.
std::map<std::string, mojom::TokenType>& Keywords() {
  return g_keywords.Get().keywords_;
}
+ |
+KeywordsDict::KeywordsDict() { |
+ keywords_["import"] = TokenType::IMPORT; |
+ keywords_["module"] = TokenType::MODULE; |
+ keywords_["struct"] = TokenType::STRUCT; |
+ keywords_["union"] = TokenType::UNION; |
+ keywords_["interface"] = TokenType::INTERFACE; |
+ keywords_["enum"] = TokenType::ENUM; |
+ keywords_["const"] = TokenType::CONST; |
+ keywords_["true"] = TokenType::TRUE; |
+ keywords_["false"] = TokenType::FALSE; |
+ keywords_["default"] = TokenType::DEFAULT; |
+} |
+ |
// Non-localized version of isalpha: ASCII letters only.
bool IsAlpha(char c) {
  if (c >= 'a' && c <= 'z')
    return true;
  return c >= 'A' && c <= 'Z';
}
+ |
// Non-localized version of isdigit: ASCII decimal digits only.
bool IsDigit(char c) {
  return !(c < '0' || c > '9');
}
+ |
// Non-localized version of isxdigit: 0-9, a-f, A-F.
bool IsHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
+ |
// Non-localized version of isalnum: ASCII letters and digits.
bool IsAlnum(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9');
}
+ |
// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
class MojomLexer {
 public:
  explicit MojomLexer(const std::string& source);
  ~MojomLexer();

  // Returns the list of tokens in the source file. Tokenizing stops at the
  // first error token, which (if present) is the last element returned.
  std::vector<Token> Tokenize();

 private:
  // The GetNextToken.* functions all return true if they could find a token
  // (even an error token) and false otherwise.
  bool GetNextToken(Token* result);
  bool GetNextTokenSingleChar(Token* result);
  bool GetNextTokenEqualsOrResponse(Token* result);
  bool GetNextTokenIdentifier(Token* result);
  bool GetNextTokenDecConst(Token* result);
  bool GetNextTokenHexConst(Token* result);
  bool GetNextTokenOrdinal(Token* result);
  bool GetNextTokenStringLiteral(Token* result);

  // Consumes whitespace: spaces, tabs, carriage returns and newlines.
  void ConsumeSkippable();
  // Consumes a (possibly empty) run of decimal digits.
  void ConsumeDigits();
  // Consumes a newline and updates the line bookkeeping.
  void ConsumeEol();
  // Advances num characters within the current line.
  void Consume(size_t num);

  // True if offset_ + offset_plus is at or past the end of source_.
  bool eos(size_t offset_plus) {
    return offset_ + offset_plus >= source_.size();
  }

  const std::string source_;  // The text being tokenized.
  size_t offset_;             // Current position within source_.
  size_t line_no_;            // Zero-based line number of the current position.
  size_t offset_in_line_;     // Position within the current line.

  DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};
+ |
+std::vector<Token> MojomLexer::Tokenize() { |
+ offset_ = 0; |
+ line_no_ = 0; |
+ offset_in_line_ = 0; |
+ |
+ std::vector<Token> result; |
+ Token cur; |
+ while (GetNextToken(&cur)) { |
+ result.push_back(cur); |
+ |
+ // As soon as an error token is found, stop tokenizing. |
+ if (cur.error()) { |
+ break; |
+ } |
+ } |
+ |
+ return result; |
+} |
+ |
+bool MojomLexer::GetNextToken(Token* result) { |
+ // Skip all spaces which may be in front of the next token. |
+ ConsumeSkippable(); |
+ |
+ // If we found the end of the source signal that is so. |
+ if (eos(0)) |
+ return false; |
+ |
+ // Save the current position in the source code. |
+ result->char_pos = offset_; |
+ result->line_no = line_no_; |
+ result->line_pos = offset_in_line_; |
+ |
+ if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) || |
+ GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) || |
+ GetNextTokenDecConst(result) || GetNextTokenDecConst(result) || |
+ GetNextTokenOrdinal(result) || GetNextTokenStringLiteral(result)) |
+ return true; |
+ |
+ result->token = source_.substr(offset_, 1); |
+ result->token_type = TokenType::ERROR_ILLEGAL_CHAR; |
+ return true; |
+} |
+ |
+void MojomLexer::ConsumeSkippable() { |
+ if (eos(0)) |
+ return; |
+ |
+ bool found_non_space = false; |
+ while (!found_non_space && !eos(0)) { |
+ switch (source_[offset_]) { |
+ case ' ': |
+ case '\t': |
+ case '\r': |
+ Consume(1); |
+ break; |
+ case '\n': |
+ ConsumeEol(); |
+ break; |
+ default: |
+ found_non_space = true; |
+ break; |
+ } |
+ } |
+} |
+ |
+// Finds all single-character tokens except for '='. |
+bool MojomLexer::GetNextTokenSingleChar(Token* result) { |
+ switch (source_[offset_]) { |
+ case '(': |
+ result->token_type = TokenType::LPAREN; |
+ break; |
+ case ')': |
+ result->token_type = TokenType::RPAREN; |
+ break; |
+ case '[': |
+ result->token_type = TokenType::LBRACKET; |
+ break; |
+ case ']': |
+ result->token_type = TokenType::RBRACKET; |
+ break; |
+ case '{': |
+ result->token_type = TokenType::LBRACE; |
+ break; |
+ case '}': |
+ result->token_type = TokenType::RBRACE; |
+ break; |
+ case '<': |
+ result->token_type = TokenType::LANGLE; |
+ break; |
+ case '>': |
+ result->token_type = TokenType::RANGLE; |
+ break; |
+ case ';': |
+ result->token_type = TokenType::SEMI; |
+ break; |
+ case ',': |
+ result->token_type = TokenType::COMMA; |
+ break; |
+ case '.': |
+ result->token_type = TokenType::DOT; |
+ break; |
+ case '-': |
+ result->token_type = TokenType::MINUS; |
+ break; |
+ case '+': |
+ result->token_type = TokenType::PLUS; |
+ break; |
+ case '&': |
+ result->token_type = TokenType::AMP; |
+ break; |
+ case '?': |
+ result->token_type = TokenType::QSTN; |
+ break; |
+ default: |
+ return false; |
+ break; |
+ } |
+ |
+ result->token = source_.substr(offset_, 1); |
+ Consume(1); |
+ return true; |
+} |
+ |
+// Finds '=' or '=>'. |
+bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) { |
+ if (source_[offset_] != '=') |
+ return false; |
+ Consume(1); |
+ |
+ if (eos(0) || source_[offset_] != '>') { |
+ result->token_type = TokenType::EQUALS; |
+ result->token = "="; |
+ } else { |
+ result->token_type = TokenType::RESPONSE; |
+ result->token = "=>"; |
+ Consume(1); |
+ } |
+ return true; |
+} |
+ |
+// valid C identifiers (K&R2: A.2.3) |
+bool MojomLexer::GetNextTokenIdentifier(Token* result) { |
+ char c = source_[offset_]; |
+ |
+ // Identifiers start with a letter or underscore. |
+ if (!(IsAlpha(c) || c == '_')) |
+ return false; |
+ size_t start_offset = offset_; |
+ |
+ // Identifiers contain letters numbers and underscores. |
+ while (!eos(0) && (IsAlnum(source_[offset_]) || c == '_')) |
+ Consume(1); |
+ |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ result->token_type = TokenType::IDENTIFIER; |
+ |
+ if (Keywords().count(result->token)) |
+ result->token_type = Keywords()[result->token]; |
+ |
+ return true; |
+} |
+ |
+// integer constants (K&R2: A.2.5.1) dec |
+// floating constants (K&R2: A.2.5.3) |
+bool MojomLexer::GetNextTokenDecConst(Token* result) { |
+ if (!IsDigit(source_[offset_])) |
+ return false; |
+ |
+ result->token_type = TokenType::INT_CONST_DEC; |
+ // If the number starts with a zero and is not a floating point number. |
+ if (source_[offset_] == '0' && |
+ (eos(1) || (source_[offset_] == 'e' && source_[offset_] == 'E' && |
+ source_[offset_] == '.'))) { |
+ // TODO(azani): Catch and error on octal. |
+ result->token = "0"; |
+ Consume(1); |
+ return true; |
+ } |
+ |
+ size_t start_offset = offset_; |
+ |
+ // First, we consume all the digits. |
+ ConsumeDigits(); |
+ |
+ // If there is a fractional part, we consume the . and the following digits. |
+ if (!eos(0) && source_[offset_] == '.') { |
+ result->token_type = TokenType::FLOAT_CONST; |
+ Consume(1); |
+ ConsumeDigits(); |
+ } |
+ |
+ // If there is an exponential part, we consume the e and the following digits. |
+ if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) { |
+ if (!eos(2) && (source_[offset_ + 1] == '-' || source_[offset_ + 1]) && |
+ IsDigit(source_[offset_ + 2])) { |
+ result->token_type = TokenType::FLOAT_CONST; |
+ Consume(2); // Consume e/E and +/- |
+ ConsumeDigits(); |
+ } else if (!eos(1) && IsDigit(source_[offset_ + 1])) { |
+ result->token_type = TokenType::FLOAT_CONST; |
+ Consume(1); // Consume e/E |
+ ConsumeDigits(); |
+ } |
+ } |
+ |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ return true; |
+} |
+ |
+// integer constants (K&R2: A.2.5.1) hex |
+bool MojomLexer::GetNextTokenHexConst(Token* result) { |
+ // Hex numbers start with a 0, x and then some hex numeral. |
+ if (eos(2) || source_[offset_] != '0' || |
+ (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') || |
+ !IsHexDigit(source_[offset_ + 2])) |
+ return false; |
+ |
+ result->token_type = TokenType::INT_CONST_HEX; |
+ size_t start_offset = offset_; |
+ Consume(2); |
+ |
+ while (IsHexDigit(source_[offset_])) |
+ Consume(1); |
+ |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ return true; |
+} |
+ |
+bool MojomLexer::GetNextTokenOrdinal(Token* result) { |
+ // Ordinals start with '@' and then some digit. |
+ if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1])) |
+ return false; |
+ size_t start_offset = offset_; |
+ // Consumes '@'. |
+ Consume(1); |
+ |
+ result->token_type = TokenType::ORDINAL; |
+ ConsumeDigits(); |
+ |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ return true; |
+} |
+ |
+bool MojomLexer::GetNextTokenStringLiteral(Token* result) { |
+ // Ordinals start with '@' and then some digit. |
+ if (source_[offset_] != '"') |
+ return false; |
+ |
+ size_t start_offset = offset_; |
+ // Consumes '"'. |
+ Consume(1); |
+ |
+ while (source_[offset_] != '"') { |
+ if (source_[offset_] == '\n' || eos(0)) { |
+ result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL; |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ return true; |
+ } |
+ |
+ // This block will be skipped if the backslash is at the end of the source. |
+ if (source_[offset_] == '\\' && !eos(1)) { |
+ // Consume the backslash. This will ensure \" is consumed. |
+ Consume(1); |
+ } |
+ Consume(1); |
+ } |
+ // Consume the closing doublequotes. |
+ Consume(1); |
+ |
+ result->token_type = TokenType::STRING_LITERAL; |
+ |
+ result->token = source_.substr(start_offset, offset_ - start_offset); |
+ return true; |
+} |
+ |
+void MojomLexer::ConsumeDigits() { |
+ while (!eos(0) && IsDigit(source_[offset_])) |
+ Consume(1); |
+} |
+ |
+void MojomLexer::ConsumeEol() { |
+ ++offset_; |
+ ++line_no_; |
+ offset_in_line_ = 0; |
+} |
+ |
+void MojomLexer::Consume(size_t num) { |
+ offset_ += num; |
+ offset_in_line_ += num; |
+} |
+ |
// Stashes a copy of the source text and resets the cursor to the start;
// Tokenize() re-zeroes the cursor again before each run.
MojomLexer::MojomLexer(const std::string& source)
    : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}

MojomLexer::~MojomLexer() {
}
+ |
+} // namespace |
+ |
// A default-constructed Token is an ERROR_UNKNOWN token at position zero;
// the lexer overwrites every field before handing tokens to callers.
Token::Token()
    : token_type(TokenType::ERROR_UNKNOWN),
      char_pos(0),
      line_no(0),
      line_pos(0) {
}

Token::~Token() {
}
+ |
+// Accepts the text of a mojom file and returns the ordered list of tokens |
+// found in the file. |
+std::vector<Token> Tokenize(const std::string& source) { |
+ return MojomLexer(source).Tokenize(); |
+} |
+ |
+} // namespace mojom |
+} // namespace mojo |