// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "mojom/lexer.h"

#include <map>
#include <string>
#include <vector>

#include "base/lazy_instance.h"
#include "base/macros.h"

namespace mojo {
namespace mojom {

namespace {

class KeywordsDict {
 public:
  KeywordsDict();

 private:
  std::map<std::string, mojom::TokenType> keywords_;
  friend std::map<std::string, mojom::TokenType>& Keywords();

  DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
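// base::LazyInstance constructs the KeywordsDict on first access, so the
// keyword table is built exactly once, on demand, instead of at static
// initialization time.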
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;

std::map<std::string, mojom::TokenType>& Keywords() {
  return g_keywords.Get().keywords_;
}
KeywordsDict::KeywordsDict() {
  keywords_["import"] = TokenType::IMPORT;
  keywords_["module"] = TokenType::MODULE;
  keywords_["struct"] = TokenType::STRUCT;
  keywords_["union"] = TokenType::UNION;
  keywords_["interface"] = TokenType::INTERFACE;
  keywords_["enum"] = TokenType::ENUM;
  keywords_["const"] = TokenType::CONST;
  keywords_["true"] = TokenType::TRUE;
  keywords_["false"] = TokenType::FALSE;
  keywords_["default"] = TokenType::DEFAULT;
}

// Non-localized version of isalpha.
bool IsAlpha(char c) {
  return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'));
}

// Non-localized version of isdigit.
bool IsDigit(char c) {
  return ('0' <= c && c <= '9');
}

bool IsHexDigit(char c) {
  return (IsDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'));
}

// Non-localized version of isalnum.
bool IsAlnum(char c) {
  return IsAlpha(c) || IsDigit(c);
}

// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
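// Typical use (mirrored by the Tokenize() helper at the end of this file):
//   MojomLexer(source).Tokenize();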
class MojomLexer {
 public:
  explicit MojomLexer(const std::string& source);
  ~MojomLexer();

  // Returns the list of tokens in the source file.
  std::vector<Token> Tokenize();

 private:
  // The GetNextToken.* functions all return true if they could find a token
  // (even an error token) and false otherwise.
  bool GetNextToken(Token* result);
  bool GetNextTokenSingleChar(Token* result);
  bool GetNextTokenEqualsOrResponse(Token* result);
  bool GetNextTokenIdentifier(Token* result);
  bool GetNextTokenDecConst(Token* result);
  bool GetNextTokenHexConst(Token* result);
  bool GetNextTokenOrdinal(Token* result);
  bool GetNextTokenStringLiteral(Token* result);

  void ConsumeSkippable();
  void ConsumeDigits();
  void ConsumeEol();
  void Consume(size_t num);

  bool eos(size_t offset_plus) {
    return offset_ + offset_plus >= source_.size();
  }

  const std::string source_;
  size_t offset_;
  size_t line_no_;
  size_t offset_in_line_;

  DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};

std::vector<Token> MojomLexer::Tokenize() {
  offset_ = 0;
  line_no_ = 0;
  offset_in_line_ = 0;

  std::vector<Token> result;
  Token cur;
  while (GetNextToken(&cur)) {
    result.push_back(cur);

    // As soon as an error token is found, stop tokenizing.
    if (cur.error()) {
      break;
    }
  }

  return result;
}

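// Tries each token recognizer in turn. The order matters: GetNextTokenHexConst
// must run before GetNextTokenDecConst because a hex constant also begins with
// '0', and GetNextTokenSingleChar deliberately does not handle '=' so that
// GetNextTokenEqualsOrResponse can distinguish '=' from '=>'.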
bool MojomLexer::GetNextToken(Token* result) {
  // Skip any whitespace that may precede the next token.
  ConsumeSkippable();

  // If we have reached the end of the source, signal that no token was found.
  if (eos(0))
    return false;

  // Save the current position in the source code.
  result->char_pos = offset_;
  result->line_no = line_no_;
  result->line_pos = offset_in_line_;

  if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) ||
      GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) ||
      GetNextTokenDecConst(result) || GetNextTokenOrdinal(result) ||
      GetNextTokenStringLiteral(result))
    return true;

  result->token = source_.substr(offset_, 1);
  result->token_type = TokenType::ERROR_ILLEGAL_CHAR;
  return true;
}

void MojomLexer::ConsumeSkippable() {
  if (eos(0))
    return;

  bool found_non_space = false;
  while (!found_non_space && !eos(0)) {
    switch (source_[offset_]) {
      case ' ':
      case '\t':
      case '\r':
        Consume(1);
        break;
      case '\n':
        ConsumeEol();
        break;
      default:
        found_non_space = true;
        break;
    }
  }
}

// Finds all single-character tokens except for '='.
bool MojomLexer::GetNextTokenSingleChar(Token* result) {
  switch (source_[offset_]) {
    case '(':
      result->token_type = TokenType::LPAREN;
      break;
    case ')':
      result->token_type = TokenType::RPAREN;
      break;
    case '[':
      result->token_type = TokenType::LBRACKET;
      break;
    case ']':
      result->token_type = TokenType::RBRACKET;
      break;
    case '{':
      result->token_type = TokenType::LBRACE;
      break;
    case '}':
      result->token_type = TokenType::RBRACE;
      break;
    case '<':
      result->token_type = TokenType::LANGLE;
      break;
    case '>':
      result->token_type = TokenType::RANGLE;
      break;
    case ';':
      result->token_type = TokenType::SEMI;
      break;
    case ',':
      result->token_type = TokenType::COMMA;
      break;
    case '.':
      result->token_type = TokenType::DOT;
      break;
    case '-':
      result->token_type = TokenType::MINUS;
      break;
    case '+':
      result->token_type = TokenType::PLUS;
      break;
    case '&':
      result->token_type = TokenType::AMP;
      break;
    case '?':
      result->token_type = TokenType::QSTN;
      break;
    default:
      return false;
  }

  result->token = source_.substr(offset_, 1);
  Consume(1);
  return true;
}

// Finds '=' or '=>'.
bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) {
  if (source_[offset_] != '=')
    return false;
  Consume(1);

  if (eos(0) || source_[offset_] != '>') {
    result->token_type = TokenType::EQUALS;
    result->token = "=";
  } else {
    result->token_type = TokenType::RESPONSE;
    result->token = "=>";
    Consume(1);
  }
  return true;
}

// Valid C identifiers (K&R2: A.2.3).
bool MojomLexer::GetNextTokenIdentifier(Token* result) {
  char c = source_[offset_];

  // Identifiers start with a letter or underscore.
  if (!(IsAlpha(c) || c == '_'))
    return false;
  size_t start_offset = offset_;

  // Identifiers contain letters, numbers, and underscores.
  while (!eos(0) && (IsAlnum(source_[offset_]) || source_[offset_] == '_'))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  result->token_type = TokenType::IDENTIFIER;

  if (Keywords().count(result->token))
    result->token_type = Keywords()[result->token];

  return true;
}

// Integer constants, decimal (K&R2: A.2.5.1), and
// floating constants (K&R2: A.2.5.3).
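// Accepts, for example, "123" as INT_CONST_DEC and "12.5", "3e10" or
// "1.5e-3" as FLOAT_CONST.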
bool MojomLexer::GetNextTokenDecConst(Token* result) {
  if (!IsDigit(source_[offset_]))
    return false;

  result->token_type = TokenType::INT_CONST_DEC;
  // If the number starts with a zero and is not a floating point number, it
  // is just the constant 0.
  if (source_[offset_] == '0' &&
      (eos(1) || (source_[offset_ + 1] != 'e' && source_[offset_ + 1] != 'E' &&
                  source_[offset_ + 1] != '.'))) {
    // TODO(azani): Catch and error on octal.
    result->token = "0";
    Consume(1);
    return true;
  }

  size_t start_offset = offset_;

  // First, we consume all the digits.
  ConsumeDigits();

  // If there is a fractional part, consume the '.' and the following digits.
  if (!eos(0) && source_[offset_] == '.') {
    result->token_type = TokenType::FLOAT_CONST;
    Consume(1);
    ConsumeDigits();
  }

  // If there is an exponent part, consume the 'e' or 'E', the optional sign
  // and the following digits.
  if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) {
    if (!eos(2) &&
        (source_[offset_ + 1] == '-' || source_[offset_ + 1] == '+') &&
        IsDigit(source_[offset_ + 2])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(2);  // Consume e/E and +/-.
      ConsumeDigits();
    } else if (!eos(1) && IsDigit(source_[offset_ + 1])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(1);  // Consume e/E.
      ConsumeDigits();
    }
  }

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

// Integer constants, hexadecimal (K&R2: A.2.5.1).
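// Accepts, for example, "0x1f" or "0XAB" as INT_CONST_HEX.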
bool MojomLexer::GetNextTokenHexConst(Token* result) {
  // Hex numbers start with '0', then 'x' or 'X', then at least one hex digit.
  if (eos(2) || source_[offset_] != '0' ||
      (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') ||
      !IsHexDigit(source_[offset_ + 2]))
    return false;

  result->token_type = TokenType::INT_CONST_HEX;
  size_t start_offset = offset_;
  Consume(2);

  while (!eos(0) && IsHexDigit(source_[offset_]))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenOrdinal(Token* result) {
  // Ordinals start with '@' and then some digit.
  if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1]))
    return false;
  size_t start_offset = offset_;
  // Consumes '@'.
  Consume(1);

  result->token_type = TokenType::ORDINAL;
  ConsumeDigits();

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenStringLiteral(Token* result) {
  // String literals start with a double quote.
  if (source_[offset_] != '"')
    return false;

  size_t start_offset = offset_;
  // Consumes the opening '"'.
  Consume(1);

  while (!eos(0) && source_[offset_] != '"') {
    // A newline before the closing quote means the literal is unterminated.
    if (source_[offset_] == '\n')
      break;

    // A backslash escapes the next character, so consume the backslash as
    // well. This ensures an escaped quote (\") is consumed. It is skipped if
    // the backslash is the last character in the source.
    if (source_[offset_] == '\\' && !eos(1)) {
      Consume(1);
    }
    Consume(1);
  }

  // Hitting the end of the source or a newline without finding the closing
  // quote means the string literal is unterminated.
  if (eos(0) || source_[offset_] != '"') {
    result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL;
    result->token = source_.substr(start_offset, offset_ - start_offset);
    return true;
  }

  // Consume the closing double quote.
  Consume(1);

  result->token_type = TokenType::STRING_LITERAL;
  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

void MojomLexer::ConsumeDigits() {
  while (!eos(0) && IsDigit(source_[offset_]))
    Consume(1);
}

void MojomLexer::ConsumeEol() {
  ++offset_;
  ++line_no_;
  offset_in_line_ = 0;
}

void MojomLexer::Consume(size_t num) {
  offset_ += num;
  offset_in_line_ += num;
}

MojomLexer::MojomLexer(const std::string& source)
    : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}

MojomLexer::~MojomLexer() {
}

}  // namespace

Token::Token()
    : token_type(TokenType::ERROR_UNKNOWN),
      char_pos(0),
      line_no(0),
      line_pos(0) {
}

Token::~Token() {
}

// Accepts the text of a mojom file and returns the ordered list of tokens
// found in the file.
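// For example (illustrative):
//   Tokenize("struct Foo { bar.Baz b@0; };");
// produces STRUCT, IDENTIFIER("Foo"), LBRACE, IDENTIFIER("bar"), DOT,
// IDENTIFIER("Baz"), IDENTIFIER("b"), ORDINAL("@0"), SEMI, RBRACE and SEMI
// tokens, in that order.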
std::vector<Token> Tokenize(const std::string& source) {
  return MojomLexer(source).Tokenize();
}

}  // namespace mojom
}  // namespace mojo