// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "mojom/lexer.h"

#include <map>
#include <string>
#include <vector>

#include "base/lazy_instance.h"
#include "base/macros.h"

namespace mojo {
namespace mojom {

namespace {

class KeywordsDict {
 public:
  KeywordsDict();

 private:
  std::map<std::string, mojom::TokenType> keywords_;
  friend std::map<std::string, mojom::TokenType>& Keywords();

  DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
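// base::LazyInstance constructs the KeywordsDict on first access, so the
// keyword table is built exactly once, on demand, instead of at static
// initialization time.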
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;

std::map<std::string, mojom::TokenType>& Keywords() {
  return g_keywords.Get().keywords_;
}
KeywordsDict::KeywordsDict() {
  keywords_["import"] = TokenType::IMPORT;
  keywords_["module"] = TokenType::MODULE;
  keywords_["struct"] = TokenType::STRUCT;
  keywords_["union"] = TokenType::UNION;
  keywords_["interface"] = TokenType::INTERFACE;
  keywords_["enum"] = TokenType::ENUM;
  keywords_["const"] = TokenType::CONST;
  keywords_["true"] = TokenType::TRUE;
  keywords_["false"] = TokenType::FALSE;
  keywords_["default"] = TokenType::DEFAULT;
}

// Non-localized version of isalpha.
bool IsAlpha(char c) {
  return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'));
}

// Non-localized version of isdigit.
bool IsDigit(char c) {
  return ('0' <= c && c <= '9');
}

bool IsHexDigit(char c) {
  return (IsDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'));
}

// Non-localized version of isalnum.
bool IsAlnum(char c) {
  return IsAlpha(c) || IsDigit(c);
}

// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
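// Typical use (mirrored by the Tokenize() helper at the end of this file):
//   MojomLexer(source).Tokenize();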
class MojomLexer {
 public:
  explicit MojomLexer(const std::string& source);
  ~MojomLexer();

  // Returns the list of tokens in the source file.
  std::vector<Token> Tokenize();

 private:
  // The GetNextToken.* functions all return true if they could find a token
  // (even an error token) and false otherwise.
  bool GetNextToken(Token* result);
  bool GetNextTokenSingleChar(Token* result);
  bool GetNextTokenEqualsOrResponse(Token* result);
  bool GetNextTokenIdentifier(Token* result);
  bool GetNextTokenDecConst(Token* result);
  bool GetNextTokenHexConst(Token* result);
  bool GetNextTokenOrdinal(Token* result);
  bool GetNextTokenStringLiteral(Token* result);

  void ConsumeSkippable();
  void ConsumeDigits();
  void ConsumeEol();
  void Consume(size_t num);

  bool eos(size_t offset_plus) {
    return offset_ + offset_plus >= source_.size();
  }

  const std::string source_;
  size_t offset_;
  size_t line_no_;
  size_t offset_in_line_;

  DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};

std::vector<Token> MojomLexer::Tokenize() {
  offset_ = 0;
  line_no_ = 0;
  offset_in_line_ = 0;

  std::vector<Token> result;
  Token cur;
  while (GetNextToken(&cur)) {
    result.push_back(cur);

    // As soon as an error token is found, stop tokenizing.
    if (cur.error()) {
      break;
    }
  }

  return result;
}

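// Tries each token recognizer in turn. The order matters: GetNextTokenHexConst
// must run before GetNextTokenDecConst because a hex constant also begins with
// '0', and GetNextTokenSingleChar deliberately does not handle '=' so that
// GetNextTokenEqualsOrResponse can distinguish '=' from '=>'.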
bool MojomLexer::GetNextToken(Token* result) {
  // Skip any whitespace that may precede the next token.
  ConsumeSkippable();

  // If we have reached the end of the source, signal that no token was found.
  if (eos(0))
    return false;

  // Save the current position in the source code.
  result->char_pos = offset_;
  result->line_no = line_no_;
  result->line_pos = offset_in_line_;

  if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) ||
      GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) ||
      GetNextTokenDecConst(result) || GetNextTokenOrdinal(result) ||
      GetNextTokenStringLiteral(result))
    return true;

  result->token = source_.substr(offset_, 1);
  result->token_type = TokenType::ERROR_ILLEGAL_CHAR;
  return true;
}

void MojomLexer::ConsumeSkippable() {
  if (eos(0))
    return;

  bool found_non_space = false;
  while (!found_non_space && !eos(0)) {
    switch (source_[offset_]) {
      case ' ':
      case '\t':
      case '\r':
        Consume(1);
        break;
      case '\n':
        ConsumeEol();
        break;
      default:
        found_non_space = true;
        break;
    }
  }
}

// Finds all single-character tokens except for '='.
bool MojomLexer::GetNextTokenSingleChar(Token* result) {
  switch (source_[offset_]) {
    case '(':
      result->token_type = TokenType::LPAREN;
      break;
    case ')':
      result->token_type = TokenType::RPAREN;
      break;
    case '[':
      result->token_type = TokenType::LBRACKET;
      break;
    case ']':
      result->token_type = TokenType::RBRACKET;
      break;
    case '{':
      result->token_type = TokenType::LBRACE;
      break;
    case '}':
      result->token_type = TokenType::RBRACE;
      break;
    case '<':
      result->token_type = TokenType::LANGLE;
      break;
    case '>':
      result->token_type = TokenType::RANGLE;
      break;
    case ';':
      result->token_type = TokenType::SEMI;
      break;
    case ',':
      result->token_type = TokenType::COMMA;
      break;
    case '.':
      result->token_type = TokenType::DOT;
      break;
    case '-':
      result->token_type = TokenType::MINUS;
      break;
    case '+':
      result->token_type = TokenType::PLUS;
      break;
    case '&':
      result->token_type = TokenType::AMP;
      break;
    case '?':
      result->token_type = TokenType::QSTN;
      break;
    default:
      return false;
  }

  result->token = source_.substr(offset_, 1);
  Consume(1);
  return true;
}

// Finds '=' or '=>'.
bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) {
  if (source_[offset_] != '=')
    return false;
  Consume(1);

  if (eos(0) || source_[offset_] != '>') {
    result->token_type = TokenType::EQUALS;
    result->token = "=";
  } else {
    result->token_type = TokenType::RESPONSE;
    result->token = "=>";
    Consume(1);
  }
  return true;
}

// Valid C identifiers (K&R2: A.2.3).
bool MojomLexer::GetNextTokenIdentifier(Token* result) {
  char c = source_[offset_];

  // Identifiers start with a letter or underscore.
  if (!(IsAlpha(c) || c == '_'))
    return false;
  size_t start_offset = offset_;

  // Identifiers contain letters, numbers, and underscores.
  while (!eos(0) && (IsAlnum(source_[offset_]) || source_[offset_] == '_'))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  result->token_type = TokenType::IDENTIFIER;

  if (Keywords().count(result->token))
    result->token_type = Keywords()[result->token];

  return true;
}

// Integer constants, decimal (K&R2: A.2.5.1), and
// floating constants (K&R2: A.2.5.3).
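// Accepts, for example, "123" as INT_CONST_DEC and "12.5", "3e10" or
// "1.5e-3" as FLOAT_CONST.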
bool MojomLexer::GetNextTokenDecConst(Token* result) {
  if (!IsDigit(source_[offset_]))
    return false;

  result->token_type = TokenType::INT_CONST_DEC;
  // If the number starts with a zero and is not a floating point number, it
  // is just the constant 0.
  if (source_[offset_] == '0' &&
      (eos(1) || (source_[offset_ + 1] != 'e' && source_[offset_ + 1] != 'E' &&
                  source_[offset_ + 1] != '.'))) {
    // TODO(azani): Catch and error on octal.
    result->token = "0";
    Consume(1);
    return true;
  }

  size_t start_offset = offset_;

  // First, we consume all the digits.
  ConsumeDigits();

  // If there is a fractional part, consume the '.' and the following digits.
  if (!eos(0) && source_[offset_] == '.') {
    result->token_type = TokenType::FLOAT_CONST;
    Consume(1);
    ConsumeDigits();
  }

  // If there is an exponent part, consume the 'e' or 'E', the optional sign
  // and the following digits.
  if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) {
    if (!eos(2) &&
        (source_[offset_ + 1] == '-' || source_[offset_ + 1] == '+') &&
        IsDigit(source_[offset_ + 2])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(2);  // Consume e/E and +/-.
      ConsumeDigits();
    } else if (!eos(1) && IsDigit(source_[offset_ + 1])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(1);  // Consume e/E.
      ConsumeDigits();
    }
  }

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

// Integer constants, hexadecimal (K&R2: A.2.5.1).
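// Accepts, for example, "0x1f" or "0XAB" as INT_CONST_HEX.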
bool MojomLexer::GetNextTokenHexConst(Token* result) {
  // Hex numbers start with '0', then 'x' or 'X', then at least one hex digit.
  if (eos(2) || source_[offset_] != '0' ||
      (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') ||
      !IsHexDigit(source_[offset_ + 2]))
    return false;

  result->token_type = TokenType::INT_CONST_HEX;
  size_t start_offset = offset_;
  Consume(2);

  while (!eos(0) && IsHexDigit(source_[offset_]))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenOrdinal(Token* result) {
  // Ordinals start with '@' and then some digit.
  if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1]))
    return false;
  size_t start_offset = offset_;
  // Consumes '@'.
  Consume(1);

  result->token_type = TokenType::ORDINAL;
  ConsumeDigits();

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenStringLiteral(Token* result) {
  // String literals start with a double quote.
  if (source_[offset_] != '"')
    return false;

  size_t start_offset = offset_;
  // Consumes the opening '"'.
  Consume(1);

  while (!eos(0) && source_[offset_] != '"') {
    // A newline before the closing quote means the literal is unterminated.
    if (source_[offset_] == '\n')
      break;

    // A backslash escapes the next character, so consume the backslash as
    // well. This ensures an escaped quote (\") is consumed. It is skipped if
    // the backslash is the last character in the source.
    if (source_[offset_] == '\\' && !eos(1)) {
      Consume(1);
    }
    Consume(1);
  }

  // Hitting the end of the source or a newline without finding the closing
  // quote means the string literal is unterminated.
  if (eos(0) || source_[offset_] != '"') {
    result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL;
    result->token = source_.substr(start_offset, offset_ - start_offset);
    return true;
  }

  // Consume the closing double quote.
  Consume(1);

  result->token_type = TokenType::STRING_LITERAL;
  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

void MojomLexer::ConsumeDigits() {
  while (!eos(0) && IsDigit(source_[offset_]))
    Consume(1);
}

void MojomLexer::ConsumeEol() {
  ++offset_;
  ++line_no_;
  offset_in_line_ = 0;
}

void MojomLexer::Consume(size_t num) {
  offset_ += num;
  offset_in_line_ += num;
}

MojomLexer::MojomLexer(const std::string& source)
    : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}

MojomLexer::~MojomLexer() {
}

}  // namespace

Token::Token()
    : token_type(TokenType::ERROR_UNKNOWN),
      char_pos(0),
      line_no(0),
      line_pos(0) {
}

Token::~Token() {
}

// Accepts the text of a mojom file and returns the ordered list of tokens
// found in the file.
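// For example (illustrative):
//   Tokenize("struct Foo { bar.Baz b@0; };");
// produces STRUCT, IDENTIFIER("Foo"), LBRACE, IDENTIFIER("bar"), DOT,
// IDENTIFIER("Baz"), IDENTIFIER("b"), ORDINAL("@0"), SEMI, RBRACE and SEMI
// tokens, in that order.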
std::vector<Token> Tokenize(const std::string& source) {
  return MojomLexer(source).Tokenize();
}

}  // namespace mojom
}  // namespace mojo