// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "tools/gn/tokenizer.h"

#include "base/logging.h"
#include "tools/gn/input_file.h"

namespace {

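// Returns true if |c| can appear in an integer literal: a leading '-' or a
// decimal digit.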
bool IsNumberChar(char c) {
  return c == '-' || (c >= '0' && c <= '9');
}

bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}

bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}

bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}

bool CouldBeOperator(char c) {
  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
}

bool IsSeparatorChar(char c) {
  return c == ',';
}

bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}

}  // namespace

Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
    : input_file_(input_file),
      input_(input_file->contents()),
      err_(err),
      cur_(0),
      line_number_(1),
      char_in_line_(1) {
}

Tokenizer::~Tokenizer() {
}

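// Convenience entry point: tokenizes all of |input_file| in one call,
// reporting any problem through |err|.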
// static
std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
  Tokenizer t(input_file, err);
  return t.Run();
}

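// Produces the token stream for the input. Repeatedly skips whitespace,
// classifies the token starting at the current character, advances past it,
// and appends it to the result. Comment tokens are dropped, and the result
// is cleared if an error was recorded.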
std::vector<Token> Tokenizer::Run() {
  std::vector<Token> tokens;
  while (!done()) {
    AdvanceToNextToken();
    if (done())
      break;
    Location location = GetCurrentLocation();

    Token::Type type = ClassifyCurrent();
    if (type == Token::INVALID) {
      *err_ = GetErrorForInvalidToken(location);
      break;
    }
    size_t token_begin = cur_;
    AdvanceToEndOfToken(location, type);
    if (has_error())
      break;
    size_t token_end = cur_;

    // TODO(brettw) This just strips comments from the token stream. This
    // is probably wrong, they should be removed at a later stage so we can
    // do things like rewrite the file. But this makes the parser simpler and
    // is OK for now.
    if (type != Token::COMMENT) {
      tokens.push_back(Token(
          location,
          type,
          base::StringPiece(&input_.data()[token_begin],
                            token_end - token_begin)));
    }
  }
  if (err_->has_error())
    tokens.clear();
  return tokens;
}

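// Returns the byte offset within |buf| at which 1-based line |n| begins, or
// size_t(-1) if the buffer contains fewer than |n| lines.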
// static
size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
  int cur_line = 1;
  size_t cur_byte = 0;

  DCHECK(n > 0);

  if (n == 1)
    return 0;

  while (cur_byte < buf.size()) {
    if (IsNewline(buf, cur_byte)) {
      cur_line++;
      if (cur_line == n)
        return cur_byte + 1;
    }
    cur_byte++;
  }
  return static_cast<size_t>(-1);
}

// static
bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
  DCHECK(offset < buffer.size());
  // We may need more logic here to handle different line ending styles.
  return buffer[offset] == '\n';
}

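// Skips whitespace so that cur_ points at the first character of the next
// token, or at the end of input if only whitespace remains.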
void Tokenizer::AdvanceToNextToken() {
  while (!at_end() && IsCurrentWhitespace())
    Advance();
}

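// Determines the type of the token starting at the current character by
// looking at one character (two for '-', which may begin either a negative
// integer or an operator).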
Token::Type Tokenizer::ClassifyCurrent() const {
  DCHECK(!at_end());
  char next_char = cur_char();
  if (next_char >= '0' && next_char <= '9')
    return Token::INTEGER;
  if (next_char == '"')
    return Token::STRING;

  // Note: '-' handled specially below.
  if (next_char != '-' && CouldBeOperator(next_char))
    return Token::OPERATOR;

  if (IsIdentifierFirstChar(next_char))
    return Token::IDENTIFIER;

  if (IsScoperChar(next_char))
    return Token::SCOPER;

  if (IsSeparatorChar(next_char))
    return Token::SEPARATOR;

  if (next_char == '#')
    return Token::COMMENT;

  // For the case of '-', differentiate between a negative number and anything
  // else.
  if (next_char == '-') {
    if (!CanIncrement())
      return Token::OPERATOR;  // Just the minus before end of file.
    char following_char = input_[cur_ + 1];
    if (following_char >= '0' && following_char <= '9')
      return Token::INTEGER;
    return Token::OPERATOR;
  }

  return Token::INVALID;
}

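// Moves cur_ to the first character past the token of the given |type| that
// begins at the current position, recording an error if the token is
// malformed (e.g. a bad number or an unterminated string).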
void Tokenizer::AdvanceToEndOfToken(const Location& location,
                                    Token::Type type) {
  switch (type) {
    case Token::INTEGER:
      do {
        Advance();
      } while (!at_end() && IsNumberChar(cur_char()));
      if (!at_end()) {
        // Require the char after a number to be some kind of space, scope,
        // or operator.
        char c = cur_char();
        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
            !IsScoperChar(c) && !IsSeparatorChar(c)) {
          *err_ = Err(GetCurrentLocation(),
                      "This is not a valid number.",
                      "Learn to count.");
          // Highlight the number.
          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
        }
      }
      break;

    case Token::STRING: {
      char initial = cur_char();
      Advance();  // Advance past initial "
      for (;;) {
        if (at_end()) {
          *err_ = Err(LocationRange(location,
                          Location(input_file_, line_number_, char_in_line_)),
                      "Unterminated string literal.",
                      "Don't leave me hanging like this!");
          break;
        }
        if (IsCurrentStringTerminator(initial)) {
          Advance();  // Skip past last "
          break;
        } else if (cur_char() == '\n') {
          *err_ = Err(LocationRange(location, GetCurrentLocation()),
                      "Newline in string constant.");
        }
        Advance();
      }
      break;
    }

    case Token::OPERATOR:
      // Some operators are two characters, some are one.
      if (CouldBeTwoCharOperatorBegin(cur_char())) {
        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
          Advance();
      }
      Advance();
      break;

    case Token::IDENTIFIER:
      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
        Advance();
      break;

    case Token::SCOPER:
    case Token::SEPARATOR:
      Advance();  // All are one char.
      break;

    case Token::COMMENT:
      // Eat to EOL.
      while (!at_end() && !IsCurrentNewline())
        Advance();
      break;

    case Token::INVALID:
      *err_ = Err(location, "Everything is all messed up",
                  "Please insert system disk in drive A: and press any key.");
      NOTREACHED();
      return;
  }
}

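// Returns true if the current character is whitespace. Tab is deliberately
// excluded; it classifies as an invalid token and is reported by
// GetErrorForInvalidToken().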
bool Tokenizer::IsCurrentWhitespace() const {
  DCHECK(!at_end());
  char c = input_[cur_];
  // Note that tab (0x09) is illegal.
  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
}

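// Returns true if the current character is a |quote_char| that actually ends
// the string, i.e. one that is not escaped by a backslash.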
bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
  DCHECK(!at_end());
  if (cur_char() != quote_char)
    return false;

  // Check for escaping. \" is not a string terminator, but \\" is. Count
  // the number of preceding backslashes.
  int num_backslashes = 0;
  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
    num_backslashes++;

  // Even backslashes mean that they were escaping each other and don't count
  // as escaping this quote.
  return (num_backslashes % 2) == 0;
}

bool Tokenizer::IsCurrentNewline() const {
  return IsNewline(input_, cur_);
}

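// Moves to the next character, keeping line_number_ and char_in_line_ in
// sync so error locations stay accurate.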
void Tokenizer::Advance() {
  DCHECK(cur_ < input_.size());
  if (IsCurrentNewline()) {
    line_number_++;
    char_in_line_ = 1;
  } else {
    char_in_line_++;
  }
  cur_++;
}

Location Tokenizer::GetCurrentLocation() const {
  return Location(input_file_, line_number_, char_in_line_);
}

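// Builds the error for a character that could not be classified, with help
// text tailored to common mistakes: stray semicolons, tab characters, and
// C-style comments.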
Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
  std::string help;
  if (cur_char() == ';') {
    // Semicolon.
    help = "Semicolons are not needed, delete this one.";
  } else if (cur_char() == '\t') {
    // Tab.
    help = "You got a tab character in here. Tabs are evil. "
           "Convert to spaces.";
  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
             (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
    // Different types of comments.
    help = "Comments should start with # instead";
  } else {
    help = "I have no idea what this is.";
  }

  return Err(location, "Invalid token.", help);
}