| OLD | NEW |
| (Empty) |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "mojom/lexer.h" | |
| 6 | |
| 7 #include <map> | |
| 8 #include <string> | |
| 9 | |
| 10 #include "base/lazy_instance.h" | |
| 11 | |
| 12 namespace mojo { | |
| 13 namespace mojom { | |
| 14 | |
| 15 namespace { | |
| 16 | |
// Holds the map from mojom keyword strings (e.g. "struct") to their token
// types. Constructed lazily through g_keywords; use Keywords() to access
// the map.
class KeywordsDict {
 public:
  KeywordsDict();

 private:
  std::map<std::string, mojom::TokenType> keywords_;
  // Keywords() is the sole accessor for keywords_.
  friend std::map<std::string, mojom::TokenType>& Keywords();

  DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
// Lazily-initialized singleton holding the keyword table; LazyInstance
// avoids a static initializer at program startup.
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;

// Returns the keyword-string-to-token-type map, creating it on first use.
std::map<std::string, mojom::TokenType>& Keywords() {
  return g_keywords.Get().keywords_;
}
| 32 | |
| 33 KeywordsDict::KeywordsDict() { | |
| 34 keywords_["import"] = TokenType::IMPORT; | |
| 35 keywords_["module"] = TokenType::MODULE; | |
| 36 keywords_["struct"] = TokenType::STRUCT; | |
| 37 keywords_["union"] = TokenType::UNION; | |
| 38 keywords_["interface"] = TokenType::INTERFACE; | |
| 39 keywords_["enum"] = TokenType::ENUM; | |
| 40 keywords_["const"] = TokenType::CONST; | |
| 41 keywords_["true"] = TokenType::TRUE; | |
| 42 keywords_["false"] = TokenType::FALSE; | |
| 43 keywords_["default"] = TokenType::DEFAULT; | |
| 44 } | |
| 45 | |
// Locale-independent replacement for isalpha: ASCII letters only.
bool IsAlpha(char c) {
  if (c >= 'a' && c <= 'z')
    return true;
  return c >= 'A' && c <= 'Z';
}
| 50 | |
// Locale-independent replacement for isdigit: ASCII decimal digits only.
bool IsDigit(char c) {
  return c >= '0' && c <= '9';
}
| 55 | |
// Locale-independent replacement for isxdigit: ASCII hexadecimal digits.
bool IsHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  return ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}
| 59 | |
// Locale-independent replacement for isalnum: ASCII letters and digits.
bool IsAlnum(char c) {
  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
         ('0' <= c && c <= '9');
}
| 64 | |
// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
class MojomLexer {
 public:
  explicit MojomLexer(const std::string& source);
  ~MojomLexer();

  // Returns the list of tokens in the source file. If an error token is
  // produced it is the last element of the returned list.
  std::vector<Token> Tokenize();

 private:
  // The GetNextToken.* functions all return true if they could find a token
  // (even an error token) and false otherwise.
  bool GetNextToken(Token* result);
  bool GetNextTokenSingleChar(Token* result);
  bool GetNextTokenEqualsOrResponse(Token* result);
  bool GetNextTokenIdentifier(Token* result);
  bool GetNextTokenDecConst(Token* result);
  bool GetNextTokenHexConst(Token* result);
  bool GetNextTokenOrdinal(Token* result);
  bool GetNextTokenStringLiteral(Token* result);

  // Advances past whitespace and newlines.
  void ConsumeSkippable();
  // Advances past a run of decimal digits.
  void ConsumeDigits();
  // Advances past a newline, updating line bookkeeping.
  void ConsumeEol();
  // Advances the cursor num characters within the current line.
  void Consume(size_t num);

  // True if offset_ + offset_plus is at or beyond the end of source_.
  bool eos(size_t offset_plus) {
    return offset_ + offset_plus >= source_.size();
  }

  const std::string source_;  // The mojom source text being tokenized.
  size_t offset_;             // Current position within source_.
  size_t line_no_;            // Current line number (zero-based).
  size_t offset_in_line_;     // Current column within the current line.

  DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};
| 102 | |
| 103 std::vector<Token> MojomLexer::Tokenize() { | |
| 104 offset_ = 0; | |
| 105 line_no_ = 0; | |
| 106 offset_in_line_ = 0; | |
| 107 | |
| 108 std::vector<Token> result; | |
| 109 Token cur; | |
| 110 while (GetNextToken(&cur)) { | |
| 111 result.push_back(cur); | |
| 112 | |
| 113 // As soon as an error token is found, stop tokenizing. | |
| 114 if (cur.error()) { | |
| 115 break; | |
| 116 } | |
| 117 } | |
| 118 | |
| 119 return result; | |
| 120 } | |
| 121 | |
| 122 bool MojomLexer::GetNextToken(Token* result) { | |
| 123 // Skip all spaces which may be in front of the next token. | |
| 124 ConsumeSkippable(); | |
| 125 | |
| 126 // If we found the end of the source signal that is so. | |
| 127 if (eos(0)) | |
| 128 return false; | |
| 129 | |
| 130 // Save the current position in the source code. | |
| 131 result->char_pos = offset_; | |
| 132 result->line_no = line_no_; | |
| 133 result->line_pos = offset_in_line_; | |
| 134 | |
| 135 if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) || | |
| 136 GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) || | |
| 137 GetNextTokenDecConst(result) || GetNextTokenDecConst(result) || | |
| 138 GetNextTokenOrdinal(result) || GetNextTokenStringLiteral(result)) | |
| 139 return true; | |
| 140 | |
| 141 result->token = source_.substr(offset_, 1); | |
| 142 result->token_type = TokenType::ERROR_ILLEGAL_CHAR; | |
| 143 return true; | |
| 144 } | |
| 145 | |
| 146 void MojomLexer::ConsumeSkippable() { | |
| 147 if (eos(0)) | |
| 148 return; | |
| 149 | |
| 150 bool found_non_space = false; | |
| 151 while (!found_non_space && !eos(0)) { | |
| 152 switch (source_[offset_]) { | |
| 153 case ' ': | |
| 154 case '\t': | |
| 155 case '\r': | |
| 156 Consume(1); | |
| 157 break; | |
| 158 case '\n': | |
| 159 ConsumeEol(); | |
| 160 break; | |
| 161 default: | |
| 162 found_non_space = true; | |
| 163 break; | |
| 164 } | |
| 165 } | |
| 166 } | |
| 167 | |
| 168 // Finds all single-character tokens except for '='. | |
| 169 bool MojomLexer::GetNextTokenSingleChar(Token* result) { | |
| 170 switch (source_[offset_]) { | |
| 171 case '(': | |
| 172 result->token_type = TokenType::LPAREN; | |
| 173 break; | |
| 174 case ')': | |
| 175 result->token_type = TokenType::RPAREN; | |
| 176 break; | |
| 177 case '[': | |
| 178 result->token_type = TokenType::LBRACKET; | |
| 179 break; | |
| 180 case ']': | |
| 181 result->token_type = TokenType::RBRACKET; | |
| 182 break; | |
| 183 case '{': | |
| 184 result->token_type = TokenType::LBRACE; | |
| 185 break; | |
| 186 case '}': | |
| 187 result->token_type = TokenType::RBRACE; | |
| 188 break; | |
| 189 case '<': | |
| 190 result->token_type = TokenType::LANGLE; | |
| 191 break; | |
| 192 case '>': | |
| 193 result->token_type = TokenType::RANGLE; | |
| 194 break; | |
| 195 case ';': | |
| 196 result->token_type = TokenType::SEMI; | |
| 197 break; | |
| 198 case ',': | |
| 199 result->token_type = TokenType::COMMA; | |
| 200 break; | |
| 201 case '.': | |
| 202 result->token_type = TokenType::DOT; | |
| 203 break; | |
| 204 case '-': | |
| 205 result->token_type = TokenType::MINUS; | |
| 206 break; | |
| 207 case '+': | |
| 208 result->token_type = TokenType::PLUS; | |
| 209 break; | |
| 210 case '&': | |
| 211 result->token_type = TokenType::AMP; | |
| 212 break; | |
| 213 case '?': | |
| 214 result->token_type = TokenType::QSTN; | |
| 215 break; | |
| 216 default: | |
| 217 return false; | |
| 218 break; | |
| 219 } | |
| 220 | |
| 221 result->token = source_.substr(offset_, 1); | |
| 222 Consume(1); | |
| 223 return true; | |
| 224 } | |
| 225 | |
| 226 // Finds '=' or '=>'. | |
| 227 bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) { | |
| 228 if (source_[offset_] != '=') | |
| 229 return false; | |
| 230 Consume(1); | |
| 231 | |
| 232 if (eos(0) || source_[offset_] != '>') { | |
| 233 result->token_type = TokenType::EQUALS; | |
| 234 result->token = "="; | |
| 235 } else { | |
| 236 result->token_type = TokenType::RESPONSE; | |
| 237 result->token = "=>"; | |
| 238 Consume(1); | |
| 239 } | |
| 240 return true; | |
| 241 } | |
| 242 | |
| 243 // valid C identifiers (K&R2: A.2.3) | |
| 244 bool MojomLexer::GetNextTokenIdentifier(Token* result) { | |
| 245 char c = source_[offset_]; | |
| 246 | |
| 247 // Identifiers start with a letter or underscore. | |
| 248 if (!(IsAlpha(c) || c == '_')) | |
| 249 return false; | |
| 250 size_t start_offset = offset_; | |
| 251 | |
| 252 // Identifiers contain letters numbers and underscores. | |
| 253 while (!eos(0) && (IsAlnum(source_[offset_]) || c == '_')) | |
| 254 Consume(1); | |
| 255 | |
| 256 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 257 result->token_type = TokenType::IDENTIFIER; | |
| 258 | |
| 259 if (Keywords().count(result->token)) | |
| 260 result->token_type = Keywords()[result->token]; | |
| 261 | |
| 262 return true; | |
| 263 } | |
| 264 | |
| 265 // integer constants (K&R2: A.2.5.1) dec | |
| 266 // floating constants (K&R2: A.2.5.3) | |
| 267 bool MojomLexer::GetNextTokenDecConst(Token* result) { | |
| 268 if (!IsDigit(source_[offset_])) | |
| 269 return false; | |
| 270 | |
| 271 result->token_type = TokenType::INT_CONST_DEC; | |
| 272 // If the number starts with a zero and is not a floating point number. | |
| 273 if (source_[offset_] == '0' && | |
| 274 (eos(1) || (source_[offset_] == 'e' && source_[offset_] == 'E' && | |
| 275 source_[offset_] == '.'))) { | |
| 276 // TODO(azani): Catch and error on octal. | |
| 277 result->token = "0"; | |
| 278 Consume(1); | |
| 279 return true; | |
| 280 } | |
| 281 | |
| 282 size_t start_offset = offset_; | |
| 283 | |
| 284 // First, we consume all the digits. | |
| 285 ConsumeDigits(); | |
| 286 | |
| 287 // If there is a fractional part, we consume the . and the following digits. | |
| 288 if (!eos(0) && source_[offset_] == '.') { | |
| 289 result->token_type = TokenType::FLOAT_CONST; | |
| 290 Consume(1); | |
| 291 ConsumeDigits(); | |
| 292 } | |
| 293 | |
| 294 // If there is an exponential part, we consume the e and the following digits. | |
| 295 if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) { | |
| 296 if (!eos(2) && (source_[offset_ + 1] == '-' || source_[offset_ + 1]) && | |
| 297 IsDigit(source_[offset_ + 2])) { | |
| 298 result->token_type = TokenType::FLOAT_CONST; | |
| 299 Consume(2); // Consume e/E and +/- | |
| 300 ConsumeDigits(); | |
| 301 } else if (!eos(1) && IsDigit(source_[offset_ + 1])) { | |
| 302 result->token_type = TokenType::FLOAT_CONST; | |
| 303 Consume(1); // Consume e/E | |
| 304 ConsumeDigits(); | |
| 305 } | |
| 306 } | |
| 307 | |
| 308 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 309 return true; | |
| 310 } | |
| 311 | |
| 312 // integer constants (K&R2: A.2.5.1) hex | |
| 313 bool MojomLexer::GetNextTokenHexConst(Token* result) { | |
| 314 // Hex numbers start with a 0, x and then some hex numeral. | |
| 315 if (eos(2) || source_[offset_] != '0' || | |
| 316 (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') || | |
| 317 !IsHexDigit(source_[offset_ + 2])) | |
| 318 return false; | |
| 319 | |
| 320 result->token_type = TokenType::INT_CONST_HEX; | |
| 321 size_t start_offset = offset_; | |
| 322 Consume(2); | |
| 323 | |
| 324 while (IsHexDigit(source_[offset_])) | |
| 325 Consume(1); | |
| 326 | |
| 327 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 328 return true; | |
| 329 } | |
| 330 | |
| 331 bool MojomLexer::GetNextTokenOrdinal(Token* result) { | |
| 332 // Ordinals start with '@' and then some digit. | |
| 333 if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1])) | |
| 334 return false; | |
| 335 size_t start_offset = offset_; | |
| 336 // Consumes '@'. | |
| 337 Consume(1); | |
| 338 | |
| 339 result->token_type = TokenType::ORDINAL; | |
| 340 ConsumeDigits(); | |
| 341 | |
| 342 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 343 return true; | |
| 344 } | |
| 345 | |
| 346 bool MojomLexer::GetNextTokenStringLiteral(Token* result) { | |
| 347 // Ordinals start with '@' and then some digit. | |
| 348 if (source_[offset_] != '"') | |
| 349 return false; | |
| 350 | |
| 351 size_t start_offset = offset_; | |
| 352 // Consumes '"'. | |
| 353 Consume(1); | |
| 354 | |
| 355 while (source_[offset_] != '"') { | |
| 356 if (source_[offset_] == '\n' || eos(0)) { | |
| 357 result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL; | |
| 358 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 359 return true; | |
| 360 } | |
| 361 | |
| 362 // This block will be skipped if the backslash is at the end of the source. | |
| 363 if (source_[offset_] == '\\' && !eos(1)) { | |
| 364 // Consume the backslash. This will ensure \" is consumed. | |
| 365 Consume(1); | |
| 366 } | |
| 367 Consume(1); | |
| 368 } | |
| 369 // Consume the closing doublequotes. | |
| 370 Consume(1); | |
| 371 | |
| 372 result->token_type = TokenType::STRING_LITERAL; | |
| 373 | |
| 374 result->token = source_.substr(start_offset, offset_ - start_offset); | |
| 375 return true; | |
| 376 } | |
| 377 | |
| 378 void MojomLexer::ConsumeDigits() { | |
| 379 while (!eos(0) && IsDigit(source_[offset_])) | |
| 380 Consume(1); | |
| 381 } | |
| 382 | |
// Consumes a newline: advances the cursor, bumps the line counter and
// resets the column — which is why Consume() is not used here.
void MojomLexer::ConsumeEol() {
  ++offset_;
  ++line_no_;
  offset_in_line_ = 0;
}
| 388 | |
// Advances the cursor num characters within the current line, keeping the
// column position (offset_in_line_) in sync with offset_.
void MojomLexer::Consume(size_t num) {
  offset_ += num;
  offset_in_line_ += num;
}
| 393 | |
// Copies the source text and positions the lexer at its beginning.
MojomLexer::MojomLexer(const std::string& source)
    : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}
| 397 | |
// Nothing to release explicitly; all members clean up after themselves.
MojomLexer::~MojomLexer() {
}
| 400 | |
| 401 } // namespace | |
| 402 | |
// A token starts out as ERROR_UNKNOWN until a lexer routine classifies it
// and fills in its position fields.
Token::Token()
    : token_type(TokenType::ERROR_UNKNOWN),
      char_pos(0),
      line_no(0),
      line_pos(0) {
}
| 409 | |
// Nothing to release explicitly.
Token::~Token() {
}
| 412 | |
| 413 // Accepts the text of a mojom file and returns the ordered list of tokens | |
| 414 // found in the file. | |
| 415 std::vector<Token> Tokenize(const std::string& source) { | |
| 416 return MojomLexer(source).Tokenize(); | |
| 417 } | |
| 418 | |
| 419 } // namespace mojom | |
| 420 } // namespace mojo | |
| OLD | NEW |