OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "tools/gn/tokenizer.h" |
| 6 |
| 7 #include "base/logging.h" |
| 8 #include "tools/gn/input_file.h" |
| 9 |
namespace {

// True for characters that may appear in an integer literal: a leading
// minus sign or a decimal digit.
bool IsNumberChar(char c) {
  return c == '-' || ('0' <= c && c <= '9');
}

// True for characters that can begin a two-character operator such as
// "==", "<=", "+=", "||" or "&&".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}

// True for characters that can end a two-character operator.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}

// True for characters that form a complete operator by themselves.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}

// True for characters that can start any operator, one- or two-character.
bool CouldBeOperator(char c) {
  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
}

// True for the list-separator character.
bool IsSeparatorChar(char c) {
  return c == ',';
}

// True for characters that open or close a scope: parentheses, brackets,
// and braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}

}  // namespace
| 43 |
// Constructs a tokenizer over |input_file|'s contents. Errors found while
// tokenizing are reported through |err|. Both pointers must outlive this
// object. Line and character-in-line counters are 1-based.
Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
    : input_file_(input_file),
      input_(input_file->contents()),
      err_(err),
      cur_(0),
      line_number_(1),
      char_in_line_(1) {
}
| 52 |
// Nothing to release: |input_file_| and |err_| are borrowed, not owned.
Tokenizer::~Tokenizer() {
}
| 55 |
| 56 // static |
| 57 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { |
| 58 Tokenizer t(input_file, err); |
| 59 return t.Run(); |
| 60 } |
| 61 |
| 62 std::vector<Token> Tokenizer::Run() { |
| 63 std::vector<Token> tokens; |
| 64 while (!done()) { |
| 65 AdvanceToNextToken(); |
| 66 if (done()) |
| 67 break; |
| 68 Location location = GetCurrentLocation(); |
| 69 |
| 70 Token::Type type = ClassifyCurrent(); |
| 71 if (type == Token::INVALID) { |
| 72 *err_ = GetErrorForInvalidToken(location); |
| 73 break; |
| 74 } |
| 75 size_t token_begin = cur_; |
| 76 AdvanceToEndOfToken(location, type); |
| 77 if (has_error()) |
| 78 break; |
| 79 size_t token_end = cur_; |
| 80 |
| 81 // TODO(brettw) This just strips comments from the token stream. This |
| 82 // is probably wrong, they should be removed at a later stage so we can |
| 83 // do things like rewrite the file. But this makes the parser simpler and |
| 84 // is OK for now. |
| 85 if (type != Token::COMMENT) { |
| 86 tokens.push_back(Token( |
| 87 location, |
| 88 type, |
| 89 base::StringPiece(&input_.data()[token_begin], |
| 90 token_end - token_begin))); |
| 91 } |
| 92 } |
| 93 if (err_->has_error()) |
| 94 tokens.clear(); |
| 95 return tokens; |
| 96 } |
| 97 |
| 98 // static |
| 99 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { |
| 100 int cur_line = 1; |
| 101 size_t cur_byte = 0; |
| 102 |
| 103 DCHECK(n > 0); |
| 104 |
| 105 if (n == 1) |
| 106 return 0; |
| 107 |
| 108 while (cur_byte < buf.size()) { |
| 109 if (IsNewline(buf, cur_byte)) { |
| 110 cur_line++; |
| 111 if (cur_line == n) |
| 112 return cur_byte + 1; |
| 113 } |
| 114 cur_byte++; |
| 115 } |
| 116 return -1; |
| 117 } |
| 118 |
// static
// Returns whether the character at |offset| in |buffer| terminates a line.
// |offset| must be within the buffer.
bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
  DCHECK(offset < buffer.size());
  // We may need more logic here to handle different line ending styles.
  return buffer[offset] == '\n';
}
| 125 |
| 126 |
// Skips whitespace so that |cur_| rests on the first character of the next
// token, or at the end of the input.
void Tokenizer::AdvanceToNextToken() {
  while (!at_end() && IsCurrentWhitespace())
    Advance();
}
| 131 |
| 132 Token::Type Tokenizer::ClassifyCurrent() const { |
| 133 DCHECK(!at_end()); |
| 134 char next_char = cur_char(); |
| 135 if (next_char >= '0' && next_char <= '9') |
| 136 return Token::INTEGER; |
| 137 if (next_char == '"') |
| 138 return Token::STRING; |
| 139 |
| 140 // Note: '-' handled specially below. |
| 141 if (next_char != '-' && CouldBeOperator(next_char)) |
| 142 return Token::OPERATOR; |
| 143 |
| 144 if (IsIdentifierFirstChar(next_char)) |
| 145 return Token::IDENTIFIER; |
| 146 |
| 147 if (IsScoperChar(next_char)) |
| 148 return Token::SCOPER; |
| 149 |
| 150 if (IsSeparatorChar(next_char)) |
| 151 return Token::SEPARATOR; |
| 152 |
| 153 if (next_char == '#') |
| 154 return Token::COMMENT; |
| 155 |
| 156 // For the case of '-' differentiate between a negative number and anything |
| 157 // else. |
| 158 if (next_char == '-') { |
| 159 if (!CanIncrement()) |
| 160 return Token::OPERATOR; // Just the minus before end of file. |
| 161 char following_char = input_[cur_ + 1]; |
| 162 if (following_char >= '0' && following_char <= '9') |
| 163 return Token::INTEGER; |
| 164 return Token::OPERATOR; |
| 165 } |
| 166 |
| 167 return Token::INVALID; |
| 168 } |
| 169 |
// Moves |cur_| from the first character of a token already classified as
// |type| to one character past its end, validating the token's form along
// the way. On a malformed token, sets |*err_|; the caller (Run) checks
// has_error() after this returns. |location| is the token's starting
// location, used to build error ranges.
void Tokenizer::AdvanceToEndOfToken(const Location& location,
                                    Token::Type type) {
  switch (type) {
    case Token::INTEGER:
      do {
        Advance();
      } while (!at_end() && IsNumberChar(cur_char()));
      if (!at_end()) {
        // Require the char after a number to be some kind of space, scope,
        // or operator.
        char c = cur_char();
        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
            !IsScoperChar(c) && !IsSeparatorChar(c)) {
          *err_ = Err(GetCurrentLocation(),
                      "This is not a valid number.",
                      "Learn to count.");
          // Highlight the number.
          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
        }
      }
      break;

    case Token::STRING: {
      char initial = cur_char();
      Advance();  // Advance past initial "
      for (;;) {
        if (at_end()) {
          // Ran off the end of input without seeing a closing quote.
          *err_ = Err(LocationRange(location,
                          Location(input_file_, line_number_, char_in_line_)),
                     "Unterminated string literal.",
                     "Don't leave me hanging like this!");
          break;
        }
        if (IsCurrentStringTerminator(initial)) {
          Advance();  // Skip past last "
          break;
        } else if (cur_char() == '\n') {
          // Note: the error is recorded but scanning continues to the
          // closing quote (or end of input, whose error above would then
          // replace this one).
          *err_ = Err(LocationRange(location,
                                    GetCurrentLocation()),
                      "Newline in string constant.");
        }
        Advance();
      }
      break;
    }

    case Token::OPERATOR:
      // Some operators are two characters, some are one.
      if (CouldBeTwoCharOperatorBegin(cur_char())) {
        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
          Advance();
      }
      Advance();
      break;

    case Token::IDENTIFIER:
      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
        Advance();
      break;

    case Token::SCOPER:
    case Token::SEPARATOR:
      Advance();  // All are one char.
      break;

    case Token::COMMENT:
      // Eat to EOL.
      while (!at_end() && !IsCurrentNewline())
        Advance();
      break;

    case Token::INVALID:
      // Run() stops before calling this for INVALID tokens, so this case
      // should be unreachable.
      *err_ = Err(location, "Everything is all messed up",
                  "Please insert system disk in drive A: and press any key.");
      NOTREACHED();
      return;
  }
}
| 248 |
| 249 bool Tokenizer::IsCurrentWhitespace() const { |
| 250 DCHECK(!at_end()); |
| 251 char c = input_[cur_]; |
| 252 // Note that tab (0x09) is illegal. |
| 253 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; |
| 254 } |
| 255 |
| 256 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { |
| 257 DCHECK(!at_end()); |
| 258 if (cur_char() != quote_char) |
| 259 return false; |
| 260 |
| 261 // Check for escaping. \" is not a string terminator, but \\" is. Count |
| 262 // the number of preceeding backslashes. |
| 263 int num_backslashes = 0; |
| 264 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) |
| 265 num_backslashes++; |
| 266 |
| 267 // Even backslashes mean that they were escaping each other and don't count |
| 268 // as escaping this quote. |
| 269 return (num_backslashes % 2) == 0; |
| 270 } |
| 271 |
// Returns whether the character at |cur_| is a newline.
bool Tokenizer::IsCurrentNewline() const {
  return IsNewline(input_, cur_);
}
| 275 |
| 276 void Tokenizer::Advance() { |
| 277 DCHECK(cur_ < input_.size()); |
| 278 if (IsCurrentNewline()) { |
| 279 line_number_++; |
| 280 char_in_line_ = 1; |
| 281 } else { |
| 282 char_in_line_++; |
| 283 } |
| 284 cur_++; |
| 285 } |
| 286 |
// Returns the current 1-based line/character position as a Location.
Location Tokenizer::GetCurrentLocation() const {
  return Location(input_file_, line_number_, char_in_line_);
}
| 290 |
| 291 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { |
| 292 std::string help; |
| 293 if (cur_char() == ';') { |
| 294 // Semicolon. |
| 295 help = "Semicolons are not needed, delete this one."; |
| 296 } else if (cur_char() == '\t') { |
| 297 // Tab. |
| 298 help = "You got a tab character in here. Tabs are evil. " |
| 299 "Convert to spaces."; |
| 300 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && |
| 301 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { |
| 302 // Different types of comments. |
| 303 help = "Comments should start with # instead"; |
| 304 } else { |
| 305 help = "I have no idea what this is."; |
| 306 } |
| 307 |
| 308 return Err(location, "Invalid token.", help); |
| 309 } |
OLD | NEW |