Chromium Code Reviews

Unified Diff: mojom/lexer.cc

Issue 1034083003: Mojom lexer. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 9 months ago
Index: mojom/lexer.cc
diff --git a/mojom/lexer.cc b/mojom/lexer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e55e2fbca0b2f5160a69e8b37257b74c2cd861bd
--- /dev/null
+++ b/mojom/lexer.cc
@@ -0,0 +1,422 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "mojom/lexer.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "base/lazy_instance.h"
+#include "base/macros.h"
+
+namespace mojo {
+namespace mojom {
+
+namespace {
+
+class KeywordsDict {
+ public:
+ KeywordsDict();
+
+ private:
+ std::map<std::string, mojom::TokenType> keywords_;
+ friend std::map<std::string, mojom::TokenType>& Keywords();
+
+ DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
+};
+static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;
+
+std::map<std::string, mojom::TokenType>& Keywords() {
+ return g_keywords.Get().keywords_;
+}
+
+KeywordsDict::KeywordsDict() {
+ keywords_["import"] = TokenType::IMPORT;
+ keywords_["module"] = TokenType::MODULE;
+ keywords_["struct"] = TokenType::STRUCT;
+ keywords_["union"] = TokenType::UNION;
+ keywords_["interface"] = TokenType::INTERFACE;
+ keywords_["enum"] = TokenType::ENUM;
+ keywords_["const"] = TokenType::CONST;
+ keywords_["true"] = TokenType::TRUE;
+ keywords_["false"] = TokenType::FALSE;
+ keywords_["default"] = TokenType::DEFAULT;
+}
+
+// Non-localized version of isalpha.
+bool IsAlpha(char c) {
+ return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'));
+}
+
+// Non-localized version of isdigit.
+bool IsDigit(char c) {
+ return ('0' <= c && c <= '9');
+}
+
+bool IsHexDigit(char c) {
+ return (IsDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'));
+}
+
+// Non-localized version of isalnum.
+bool IsAlnum(char c) {
+ return IsAlpha(c) || IsDigit(c);
+}
+
+// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
+class MojomLexer {
+ public:
+ explicit MojomLexer(const std::string& source);
+ ~MojomLexer();
+
+ // Returns the list of tokens in the source file.
+ std::vector<Token> Tokenize();
+
+ private:
+ // The GetNextToken.* functions all return true if they could find a token
+ // (even an error token) and false otherwise.
+ bool GetNextToken(Token* result);
+ bool GetNextTokenSingleChar(Token* result);
+ bool GetNextTokenEqualsOrResponse(Token* result);
+ bool GetNextTokenIdentifier(Token* result);
+ bool GetNextTokenDecConst(Token* result);
+ bool GetNextTokenHexConst(Token* result);
+ bool GetNextTokenOrdinal(Token* result);
+ bool GetNextTokenStringLiteral(Token* result);
+
+ void ConsumeSkippable();
+ void ConsumeDigits();
+ void ConsumeEol();
+ void Consume(size_t num);
+
+ bool eos(size_t offset_plus) {
+ return offset_ + offset_plus >= source_.size();
+ }
+
+ const std::string source_;
+ size_t offset_;
+ size_t line_no_;
+ size_t offset_in_line_;
+
+ DISALLOW_COPY_AND_ASSIGN(MojomLexer);
+};
+
+std::vector<Token> MojomLexer::Tokenize() {
+ offset_ = 0;
+ line_no_ = 0;
+ offset_in_line_ = 0;
+
+ std::vector<Token> result;
+ Token cur;
+ while (GetNextToken(&cur)) {
+ result.push_back(cur);
+
+ // As soon as an error token is found, stop tokenizing.
+ if (cur.error()) {
+ break;
+ }
+ }
+
+ return result;
+}
+
+bool MojomLexer::GetNextToken(Token* result) {
+  // Skip any whitespace in front of the next token.
+ ConsumeSkippable();
+
+  // If we have reached the end of the source, signal that no token was found.
+ if (eos(0))
+ return false;
+
+ // Save the current position in the source code.
+ result->char_pos = offset_;
+ result->line_no = line_no_;
+ result->line_pos = offset_in_line_;
+
+  if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) ||
+      GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) ||
+      GetNextTokenDecConst(result) || GetNextTokenOrdinal(result) ||
+      GetNextTokenStringLiteral(result))
+ return true;
+
+ result->token = source_.substr(offset_, 1);
+ result->token_type = TokenType::ERROR_ILLEGAL_CHAR;
+ return true;
+}
+
+void MojomLexer::ConsumeSkippable() {
+ if (eos(0))
+ return;
+
+ bool found_non_space = false;
+ while (!found_non_space && !eos(0)) {
+ switch (source_[offset_]) {
+ case ' ':
+ case '\t':
+ case '\r':
+ Consume(1);
+ break;
+ case '\n':
+ ConsumeEol();
+ break;
+ default:
+ found_non_space = true;
+ break;
+ }
+ }
+}
+
+// Finds all single-character tokens except for '='.
+bool MojomLexer::GetNextTokenSingleChar(Token* result) {
+ switch (source_[offset_]) {
+ case '(':
+ result->token_type = TokenType::LPAREN;
+ break;
+ case ')':
+ result->token_type = TokenType::RPAREN;
+ break;
+ case '[':
+ result->token_type = TokenType::LBRACKET;
+ break;
+ case ']':
+ result->token_type = TokenType::RBRACKET;
+ break;
+ case '{':
+ result->token_type = TokenType::LBRACE;
+ break;
+ case '}':
+ result->token_type = TokenType::RBRACE;
+ break;
+ case '<':
+ result->token_type = TokenType::LANGLE;
+ break;
+ case '>':
+ result->token_type = TokenType::RANGLE;
+ break;
+ case ';':
+ result->token_type = TokenType::SEMI;
+ break;
+ case ',':
+ result->token_type = TokenType::COMMA;
+ break;
+ case '.':
+ result->token_type = TokenType::DOT;
+ break;
+ case '-':
+ result->token_type = TokenType::MINUS;
+ break;
+ case '+':
+ result->token_type = TokenType::PLUS;
+ break;
+ case '&':
+ result->token_type = TokenType::AMP;
+ break;
+ case '?':
+ result->token_type = TokenType::QSTN;
+ break;
+ default:
+ return false;
+ break;
+ }
+
+ result->token = source_.substr(offset_, 1);
+ Consume(1);
+ return true;
+}
+
+// Finds '=' or '=>'.
+bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) {
+ if (source_[offset_] != '=')
+ return false;
+ Consume(1);
+
+ if (eos(0) || source_[offset_] != '>') {
+ result->token_type = TokenType::EQUALS;
+ result->token = "=";
+ } else {
+ result->token_type = TokenType::RESPONSE;
+ result->token = "=>";
+ Consume(1);
+ }
+ return true;
+}
+
+// valid C identifiers (K&R2: A.2.3)
+bool MojomLexer::GetNextTokenIdentifier(Token* result) {
+ char c = source_[offset_];
+
+ // Identifiers start with a letter or underscore.
+ if (!(IsAlpha(c) || c == '_'))
+ return false;
+ size_t start_offset = offset_;
+
+  // Identifiers contain letters, digits, and underscores.
+  while (!eos(0) && (IsAlnum(source_[offset_]) || source_[offset_] == '_'))
+ Consume(1);
+
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ result->token_type = TokenType::IDENTIFIER;
+
+ if (Keywords().count(result->token))
+ result->token_type = Keywords()[result->token];
+
+ return true;
+}
+
+// integer constants (K&R2: A.2.5.1) dec
+// floating constants (K&R2: A.2.5.3)
+bool MojomLexer::GetNextTokenDecConst(Token* result) {
+ if (!IsDigit(source_[offset_]))
+ return false;
+
+ result->token_type = TokenType::INT_CONST_DEC;
+  // A leading zero that does not start a floating-point number is just 0.
+  if (source_[offset_] == '0' &&
+      (eos(1) || (source_[offset_ + 1] != 'e' && source_[offset_ + 1] != 'E' &&
+                  source_[offset_ + 1] != '.'))) {
+ // TODO(azani): Catch and error on octal.
+ result->token = "0";
+ Consume(1);
+ return true;
+ }
+
+ size_t start_offset = offset_;
+
+ // First, we consume all the digits.
+ ConsumeDigits();
+
+ // If there is a fractional part, we consume the . and the following digits.
+ if (!eos(0) && source_[offset_] == '.') {
+ result->token_type = TokenType::FLOAT_CONST;
+ Consume(1);
+ ConsumeDigits();
+ }
+
+ // If there is an exponential part, we consume the e and the following digits.
+ if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) {
+    if (!eos(2) && IsDigit(source_[offset_ + 2]) &&
+        (source_[offset_ + 1] == '-' || source_[offset_ + 1] == '+')) {
+ result->token_type = TokenType::FLOAT_CONST;
+ Consume(2); // Consume e/E and +/-
+ ConsumeDigits();
+ } else if (!eos(1) && IsDigit(source_[offset_ + 1])) {
+ result->token_type = TokenType::FLOAT_CONST;
+ Consume(1); // Consume e/E
+ ConsumeDigits();
+ }
+ }
+
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ return true;
+}
+
+// integer constants (K&R2: A.2.5.1) hex
+bool MojomLexer::GetNextTokenHexConst(Token* result) {
+  // Hex numbers start with '0x' or '0X' followed by at least one hex digit.
+ if (eos(2) || source_[offset_] != '0' ||
+ (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') ||
+ !IsHexDigit(source_[offset_ + 2]))
+ return false;
+
+ result->token_type = TokenType::INT_CONST_HEX;
+ size_t start_offset = offset_;
+ Consume(2);
+
+  while (!eos(0) && IsHexDigit(source_[offset_]))
+ Consume(1);
+
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ return true;
+}
+
+bool MojomLexer::GetNextTokenOrdinal(Token* result) {
+ // Ordinals start with '@' and then some digit.
+ if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1]))
+ return false;
+ size_t start_offset = offset_;
+ // Consumes '@'.
+ Consume(1);
+
+ result->token_type = TokenType::ORDINAL;
+ ConsumeDigits();
+
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ return true;
+}
+
+bool MojomLexer::GetNextTokenStringLiteral(Token* result) {
+  // String literals start and end with double quotes.
+ if (source_[offset_] != '"')
+ return false;
+
+ size_t start_offset = offset_;
+ // Consumes '"'.
+ Consume(1);
+
+ while (source_[offset_] != '"') {
+ if (source_[offset_] == '\n' || eos(0)) {
+ result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL;
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ return true;
+ }
+
+ // This block will be skipped if the backslash is at the end of the source.
+ if (source_[offset_] == '\\' && !eos(1)) {
+ // Consume the backslash. This will ensure \" is consumed.
+ Consume(1);
+ }
+ Consume(1);
+ }
+  // Consume the closing double quote.
+ Consume(1);
+
+ result->token_type = TokenType::STRING_LITERAL;
+
+ result->token = source_.substr(start_offset, offset_ - start_offset);
+ return true;
+}
+
+void MojomLexer::ConsumeDigits() {
+ while (!eos(0) && IsDigit(source_[offset_]))
+ Consume(1);
+}
+
+void MojomLexer::ConsumeEol() {
+ ++offset_;
+ ++line_no_;
+ offset_in_line_ = 0;
+}
+
+void MojomLexer::Consume(size_t num) {
+ offset_ += num;
+ offset_in_line_ += num;
+}
+
+MojomLexer::MojomLexer(const std::string& source)
+ : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
+}
+
+MojomLexer::~MojomLexer() {
+}
+
+} // namespace
+
+Token::Token()
+ : token_type(TokenType::ERROR_UNKNOWN),
+ char_pos(0),
+ line_no(0),
+ line_pos(0) {
+}
+
+Token::~Token() {
+}
+
+// Accepts the text of a mojom file and returns the ordered list of tokens
+// found in the file.
+std::vector<Token> Tokenize(const std::string& source) {
+ return MojomLexer(source).Tokenize();
+}
+
+} // namespace mojom
+} // namespace mojo
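
For reviewers who want to exercise the patch locally, here is a minimal, illustrative sketch of how the public Tokenize() API might be driven. It is not part of the patch; it only assumes the Token fields used in this file (token, line_no, line_pos) and the mojo::mojom::Tokenize() entry point declared in mojom/lexer.h. The sample mojom source text is made up for illustration.

  #include <iostream>
  #include <string>
  #include <vector>

  #include "mojom/lexer.h"

  int main() {
    const std::string source =
        "interface Calculator { Add(int32 a) => (int32 sum); };";
    // Tokenize() returns the tokens in source order; if an error token is
    // produced, it is the last element of the vector.
    std::vector<mojo::mojom::Token> tokens = mojo::mojom::Tokenize(source);
    for (const mojo::mojom::Token& token : tokens) {
      std::cout << token.line_no << ":" << token.line_pos << "\t"
                << token.token << "\n";
    }
    return 0;
  }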