// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "mojom/lexer.h"

#include <map>
#include <string>

#include "base/lazy_instance.h"

namespace mojo {
namespace mojom {

namespace {

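// Maps mojom keyword strings to their corresponding token types. The table is
// built once and accessed through the Keywords() helper below.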
class KeywordsDict {
 public:
  KeywordsDict();

 private:
  std::map<std::string, mojom::TokenType> keywords_;
  friend std::map<std::string, mojom::TokenType>& Keywords();

  DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
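// Lazily-constructed singleton instance holding the keyword table.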
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;

std::map<std::string, mojom::TokenType>& Keywords() {
  return g_keywords.Get().keywords_;
}

KeywordsDict::KeywordsDict() {
  keywords_["import"] = TokenType::IMPORT;
  keywords_["module"] = TokenType::MODULE;
  keywords_["struct"] = TokenType::STRUCT;
  keywords_["union"] = TokenType::UNION;
  keywords_["interface"] = TokenType::INTERFACE;
  keywords_["enum"] = TokenType::ENUM;
  keywords_["const"] = TokenType::CONST;
  keywords_["true"] = TokenType::TRUE;
  keywords_["false"] = TokenType::FALSE;
  keywords_["default"] = TokenType::DEFAULT;
}

// Non-localized version of isalpha.
bool IsAlpha(char c) {
  return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'));
}

// Non-localized version of isdigit.
bool IsDigit(char c) {
  return ('0' <= c && c <= '9');
}

// Non-localized version of isxdigit.
bool IsHexDigit(char c) {
  return (IsDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'));
}

// Non-localized version of isalnum.
bool IsAlnum(char c) {
  return IsAlpha(c) || IsDigit(c);
}

// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
class MojomLexer {
 public:
  explicit MojomLexer(const std::string& source);
  ~MojomLexer();

  // Returns the list of tokens in the source file.
  std::vector<Token> Tokenize();

 private:
  // The GetNextToken.* functions all return true if they could find a token
  // (even an error token) and false otherwise.
  bool GetNextToken(Token* result);
  bool GetNextTokenSingleChar(Token* result);
  bool GetNextTokenEqualsOrResponse(Token* result);
  bool GetNextTokenIdentifier(Token* result);
  bool GetNextTokenDecConst(Token* result);
  bool GetNextTokenHexConst(Token* result);
  bool GetNextTokenOrdinal(Token* result);
  bool GetNextTokenStringLiteral(Token* result);

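  // Helpers that advance the lexer's position; see the definitions below.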
  void ConsumeSkippable();
  void ConsumeDigits();
  void ConsumeEol();
  void Consume(size_t num);

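  // Returns true if offset_ + offset_plus is at or beyond the end of the
  // source.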
  bool eos(size_t offset_plus) {
    return offset_ + offset_plus >= source_.size();
  }

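  // The source text and the lexer's current position in it: the absolute
  // character offset, the zero-based line number, and the offset within the
  // current line.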
  const std::string source_;
  size_t offset_;
  size_t line_no_;
  size_t offset_in_line_;

  DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};

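// Scans the source from the beginning, accumulating tokens until the end of
// the source is reached or an error token is produced.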
std::vector<Token> MojomLexer::Tokenize() {
  offset_ = 0;
  line_no_ = 0;
  offset_in_line_ = 0;

  std::vector<Token> result;
  Token cur;
  while (GetNextToken(&cur)) {
    result.push_back(cur);

    // As soon as an error token is found, stop tokenizing.
    if (cur.error()) {
      break;
    }
  }

  return result;
}

bool MojomLexer::GetNextToken(Token* result) {
  // Skip any whitespace which may be in front of the next token.
  ConsumeSkippable();

  // If we have reached the end of the source, signal that to the caller.
  if (eos(0))
    return false;

  // Save the current position in the source code.
  result->char_pos = offset_;
  result->line_no = line_no_;
  result->line_pos = offset_in_line_;

  if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) ||
      GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) ||
      GetNextTokenDecConst(result) || GetNextTokenOrdinal(result) ||
      GetNextTokenStringLiteral(result))
    return true;

  result->token = source_.substr(offset_, 1);
  result->token_type = TokenType::ERROR_ILLEGAL_CHAR;
  return true;
}

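// Consumes spaces, tabs, carriage returns and newlines, updating the line
// counters as newlines are consumed.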
void MojomLexer::ConsumeSkippable() {
  if (eos(0))
    return;

  bool found_non_space = false;
  while (!found_non_space && !eos(0)) {
    switch (source_[offset_]) {
      case ' ':
      case '\t':
      case '\r':
        Consume(1);
        break;
      case '\n':
        ConsumeEol();
        break;
      default:
        found_non_space = true;
        break;
    }
  }
}

// Finds all single-character tokens except for '='.
bool MojomLexer::GetNextTokenSingleChar(Token* result) {
  switch (source_[offset_]) {
    case '(':
      result->token_type = TokenType::LPAREN;
      break;
    case ')':
      result->token_type = TokenType::RPAREN;
      break;
    case '[':
      result->token_type = TokenType::LBRACKET;
      break;
    case ']':
      result->token_type = TokenType::RBRACKET;
      break;
    case '{':
      result->token_type = TokenType::LBRACE;
      break;
    case '}':
      result->token_type = TokenType::RBRACE;
      break;
    case '<':
      result->token_type = TokenType::LANGLE;
      break;
    case '>':
      result->token_type = TokenType::RANGLE;
      break;
    case ';':
      result->token_type = TokenType::SEMI;
      break;
    case ',':
      result->token_type = TokenType::COMMA;
      break;
    case '.':
      result->token_type = TokenType::DOT;
      break;
    case '-':
      result->token_type = TokenType::MINUS;
      break;
    case '+':
      result->token_type = TokenType::PLUS;
      break;
    case '&':
      result->token_type = TokenType::AMP;
      break;
    case '?':
      result->token_type = TokenType::QSTN;
      break;
    default:
      return false;
  }

  result->token = source_.substr(offset_, 1);
  Consume(1);
  return true;
}

// Finds '=' or '=>'.
bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) {
  if (source_[offset_] != '=')
    return false;
  Consume(1);

  if (eos(0) || source_[offset_] != '>') {
    result->token_type = TokenType::EQUALS;
    result->token = "=";
  } else {
    result->token_type = TokenType::RESPONSE;
    result->token = "=>";
    Consume(1);
  }
  return true;
}

// Valid C identifiers (K&R2: A.2.3).
bool MojomLexer::GetNextTokenIdentifier(Token* result) {
  char c = source_[offset_];

  // Identifiers start with a letter or underscore.
  if (!(IsAlpha(c) || c == '_'))
    return false;
  size_t start_offset = offset_;

  // Identifiers contain letters, numbers and underscores.
  while (!eos(0) && (IsAlnum(source_[offset_]) || source_[offset_] == '_'))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  result->token_type = TokenType::IDENTIFIER;

  if (Keywords().count(result->token))
    result->token_type = Keywords()[result->token];

  return true;
}

// Integer constants (K&R2: A.2.5.1) dec,
// floating constants (K&R2: A.2.5.3).
bool MojomLexer::GetNextTokenDecConst(Token* result) {
  if (!IsDigit(source_[offset_]))
    return false;

  result->token_type = TokenType::INT_CONST_DEC;
  // If the number starts with a zero and is not a floating point number.
  if (source_[offset_] == '0' &&
      (eos(1) || (source_[offset_ + 1] != 'e' && source_[offset_ + 1] != 'E' &&
                  source_[offset_ + 1] != '.'))) {
    // TODO(azani): Catch and error on octal.
    result->token = "0";
    Consume(1);
    return true;
  }

  size_t start_offset = offset_;

  // First, we consume all the digits.
  ConsumeDigits();

  // If there is a fractional part, we consume the . and the following digits.
  if (!eos(0) && source_[offset_] == '.') {
    result->token_type = TokenType::FLOAT_CONST;
    Consume(1);
    ConsumeDigits();
  }

  // If there is an exponent, we consume the e/E, the optional sign and the
  // following digits.
  if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) {
    if (!eos(2) &&
        (source_[offset_ + 1] == '-' || source_[offset_ + 1] == '+') &&
        IsDigit(source_[offset_ + 2])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(2);  // Consume e/E and +/-.
      ConsumeDigits();
    } else if (!eos(1) && IsDigit(source_[offset_ + 1])) {
      result->token_type = TokenType::FLOAT_CONST;
      Consume(1);  // Consume e/E.
      ConsumeDigits();
    }
  }

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

// Integer constants (K&R2: A.2.5.1) hex.
bool MojomLexer::GetNextTokenHexConst(Token* result) {
  // Hex numbers start with '0', then 'x' or 'X', then at least one hex digit.
  if (eos(2) || source_[offset_] != '0' ||
      (source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') ||
      !IsHexDigit(source_[offset_ + 2]))
    return false;

  result->token_type = TokenType::INT_CONST_HEX;
  size_t start_offset = offset_;
  Consume(2);

  while (IsHexDigit(source_[offset_]))
    Consume(1);

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenOrdinal(Token* result) {
  // Ordinals start with '@' and then some digit.
  if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1]))
    return false;
  size_t start_offset = offset_;
  // Consumes '@'.
  Consume(1);

  result->token_type = TokenType::ORDINAL;
  ConsumeDigits();

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

bool MojomLexer::GetNextTokenStringLiteral(Token* result) {
  // String literals start with a double quote.
  if (source_[offset_] != '"')
    return false;

  size_t start_offset = offset_;
  // Consumes '"'.
  Consume(1);

  while (source_[offset_] != '"') {
    if (source_[offset_] == '\n' || eos(0)) {
      result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL;
      result->token = source_.substr(start_offset, offset_ - start_offset);
      return true;
    }

    // This block will be skipped if the backslash is at the end of the source.
    if (source_[offset_] == '\\' && !eos(1)) {
      // Consume the backslash. This will ensure \" is consumed.
      Consume(1);
    }
    Consume(1);
  }
  // Consume the closing double quote.
  Consume(1);

  result->token_type = TokenType::STRING_LITERAL;

  result->token = source_.substr(start_offset, offset_ - start_offset);
  return true;
}

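// Consumes a (possibly empty) run of decimal digits.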
void MojomLexer::ConsumeDigits() {
  while (!eos(0) && IsDigit(source_[offset_]))
    Consume(1);
}

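// Consumes a newline: advances past it, increments the line counter and
// resets the offset within the line.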
void MojomLexer::ConsumeEol() {
  ++offset_;
  ++line_no_;
  offset_in_line_ = 0;
}

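// Advances the current position by num characters within the current line.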
void MojomLexer::Consume(size_t num) {
  offset_ += num;
  offset_in_line_ += num;
}

MojomLexer::MojomLexer(const std::string& source)
    : source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}

MojomLexer::~MojomLexer() {
}

}  // namespace

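// A default-constructed Token is an ERROR_UNKNOWN token at position zero.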
Token::Token()
    : token_type(TokenType::ERROR_UNKNOWN),
      char_pos(0),
      line_no(0),
      line_pos(0) {
}

Token::~Token() {
}

// Accepts the text of a mojom file and returns the ordered list of tokens
// found in the file.
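//
// Example usage (a sketch; the mojom snippet below is hypothetical):
//   std::vector<Token> tokens = Tokenize("interface Foo { Frob(); };");
//   if (!tokens.empty() && tokens.back().error()) {
//     // The last token describes the lexing error.
//   }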
std::vector<Token> Tokenize(const std::string& source) {
  return MojomLexer(source).Tokenize();
}

}  // namespace mojom
}  // namespace mojo