Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 // | |
| 5 // To use the lexer, call Tokenize with the source string to obtain | |
|
rudominer
2015/10/12 18:06:05
To use the lexer, call...
azani
2015/10/13 00:23:46
Done.
| |
| 6 // a TokenStream. The lexer is run concurrently so you should be able to | |
| 7 // use the TokenStream before the lexer is done with the source. | |
| 8 // | |
| 9 // The lexer is implemented as a state machine. The states are represented | |
| 10 // by functions (the stateFn type) which accept a lexer and return the | |
| 11 // new state. | |
| 12 // | |
| 13 // Most states also have an isFooStart function which helps determine if | |
| 14 // a transition to Foo is appropriate. Those functions accept a single | |
| 15 // rune as a parameter and return true if the state machine should | |
| 16 // transition to state Foo. Some states do not have such functions on | |
| 17 // account of the transition condition being trivial. | |
| 18 // | |
| 19 // The lexer implementation was inspired by | |
| 20 // http://cuddle.googlecode.com/hg/talk/lex.html | |
| 21 | |
| 22 package lexer | |
| 23 | |
| 24 import ( | |
| 25 "unicode/utf8" | |
| 26 ) | |
| 27 | |
| 28 // Tokenize accepts a source string and returns the tokens which compose | |
|
rudominer
2015/10/12 18:06:05
The phrasing belies the asynchronous nature. Also
azani
2015/10/13 00:23:46
Done.
| |
| 29 // the source string. | |
| 30 func Tokenize(source string) TokenStream { | |
| 31 tokens := make(chan Token) | |
| 32 l := lexer{source: source, tokens: tokens} | |
| 33 go l.run() | |
| 34 return &TokenChan{tokenChan: tokens} | |
| 35 } | |
| 36 | |
| 37 type lexer struct { | |
| 38 // source is the source code to be lexed. | |
|
rudominer
2015/10/12 18:06:05
nit: You don't need to repeat the variable name wi
azani
2015/10/12 18:18:07
Hey Matt, could you settle that question for us? I
| |
| 39 source string | |
| 40 | |
| 41 // offset is the number of characters that have been consumed. | |
|
rudominer
2015/10/12 18:06:05
Let's follow the go standard terminology and avoid
azani
2015/10/13 00:23:45
Done.
| |
| 42 offset int | |
| 43 | |
| 44 // lineno is the current line number. | |
| 45 lineNo int | |
| 46 | |
| 47 // lineOffset is how many characters have been consumed since the | |
| 48 // beginning of the line. | |
| 49 lineOffset int | |
|
rudominer
2015/10/12 18:06:05
I think it would be clearer if we didn't over use
azani
2015/10/13 00:23:45
Done.
| |
| 50 | |
| 51 // start is the index in source where the current token begins. | |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenStartIndex
or currentTokenStart?
azani
2015/10/13 00:23:46
Done.
| |
| 52 start int | |
| 53 | |
| 54 // lineStart is the line number on which the current token begins. | |
| 55 lineStart int | |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenLineNumber?
azani
2015/10/13 00:23:45
Done.
| |
| 56 | |
| 57 // lineOffsetStart is the number of characters since the beginning | |
| 58 // of the line where the current token begins. | |
| 59 lineOffsetStart int | |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenColumn?
azani
2015/10/13 00:23:45
Done.
| |
| 60 | |
| 61 // tokens is a channel on which the found tokens are emitted. | |
|
rudominer
2015/10/12 18:06:05
maybe "to which the found..."
azani
2015/10/13 00:23:45
Done.
| |
| 62 tokens chan Token | |
| 63 } | |
| 64 | |
| 65 // CurText returns the consumed part of the current token. | |
| 66 func (l *lexer) CurText() string { | |
| 67 return l.source[l.start:l.offset] | |
| 68 } | |
| 69 | |
| 70 // emitToken emits the current token and begins the new token. | |
|
rudominer
2015/10/12 18:06:05
begins a new token
azani
2015/10/13 00:23:46
Done.
| |
| 71 func (l *lexer) emitToken(tokenType TokenKind) { | |
| 72 l.tokens <- Token{ | |
| 73 Kind: tokenType, | |
| 74 Text: l.source[l.start:l.offset], | |
| 75 CharPos: l.start, | |
| 76 LineNo: l.lineStart, | |
| 77 LinePos: l.lineOffsetStart} | |
| 78 l.startToken() | |
| 79 } | |
| 80 | |
| 81 // startToken starts the new token. | |
|
rudominer
2015/10/12 18:06:05
starts a new token
azani
2015/10/13 00:23:45
Done.
| |
| 82 func (l *lexer) startToken() { | |
| 83 l.start = l.offset | |
| 84 l.lineStart = l.lineNo | |
| 85 l.lineOffsetStart = l.lineOffset | |
| 86 } | |
| 87 | |
| 88 // Consume consumes the next rune in the source. | |
| 89 func (l *lexer) Consume() { | |
| 90 c, width := utf8.DecodeRuneInString(l.source[l.offset:]) | |
| 91 if width == 0 { | |
|
rudominer
2015/10/12 18:06:05
What is your thinking here? Can width=0 without c
azani
2015/10/13 00:23:45
RuneError is just a special rune. It is handled th
| |
| 92 width = 1 | |
|
mattr
2015/10/12 18:04:13
This struck me as strange, this means the input wa
azani
2015/10/13 00:23:45
Basically, in all cases where this branch is taken
| |
| 93 } | |
| 94 | |
| 95 if c == '\n' { | |
| 96 l.lineNo += 1 | |
| 97 l.lineOffset = 0 | |
| 98 } else { | |
| 99 l.lineOffset += width | |
| 100 } | |
| 101 l.offset += width | |
| 102 } | |
| 103 | |
| 104 // Peek returns the next rune in the source. | |
| 105 func (l *lexer) Peek() rune { | |
| 106 char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) | |
|
rudominer
2015/10/12 18:06:05
What if char == RuneError?
azani
2015/10/13 00:23:45
We treat it like any other rune. It looks like Run
| |
| 107 return char | |
| 108 } | |
| 109 | |
| 110 // IsEos returns true if the whole source has been consumed false | |
| 111 // otherwise. | |
| 112 func (l *lexer) IsEos() bool { | |
| 113 return l.offset >= len(l.source) | |
|
rudominer
2015/10/12 18:06:05
Note that this depends on l.offset being a number
azani
2015/10/13 00:23:46
Yes, since strings index based on bytes, not runes
| |
| 114 } | |
| 115 | |
| 116 // run is the lexer's main loop. | |
| 117 func (l *lexer) run() { | |
| 118 // This is a state machine. | |
|
rudominer
2015/10/12 18:06:06
"This" is ambiguous here. Maybe "We are implementi
azani
2015/10/13 00:23:45
Done.
| |
| 119 // lexRoot is the beginning state. | |
| 120 // nil is the end state. | |
| 121 // States are functions which are called on the lexer. They return the | |
| 122 // next state. | |
| 123 for state := lexRoot; state != nil; { | |
| 124 state = state(l) | |
| 125 } | |
| 126 close(l.tokens) | |
| 127 } | |
| 128 | |
| 129 // A stateFn represents a state in the lexer state machine. | |
| 130 type stateFn func(*lexer) stateFn | |
| 131 | |
| 132 // This is the beginning state and also the state in between tokens. | |
|
rudominer
2015/10/12 18:06:05
Nit. In my understanding of a DFA there is no "sta
azani
2015/10/13 00:23:45
I meant returned to after most tokens are emitted.
| |
| 133 func lexRoot(l *lexer) stateFn { | |
| 134 if l.offset >= len(l.source) { | |
|
rudominer
2015/10/12 18:06:05
You have a function IsEos() for this, no?
azani
2015/10/13 00:23:45
Done.
| |
| 135 return nil | |
| 136 } | |
| 137 | |
| 138 switch c := l.Peek(); { | |
| 139 case isSingleCharTokens(c): | |
| 140 return lexSingleCharTokens | |
| 141 case isEqualsOrResponseStart(c): | |
| 142 return lexEqualsOrResponse | |
| 143 case isNameStart(c): | |
| 144 return lexName | |
| 145 case isOrdinalStart(c): | |
| 146 return lexOrdinal | |
| 147 case isNumberStart(c): | |
| 148 return lexNumber | |
| 149 case isStringStart(c): | |
| 150 return lexString | |
| 151 case isSkippable(c): | |
| 152 return lexSkip | |
| 153 case isMaybeComment(c): | |
| 154 return lexComment | |
| 155 } | |
| 156 | |
| 157 l.Consume() | |
| 158 l.emitToken(ERROR_ILLEGAL_CHAR) | |
| 159 return nil | |
| 160 } | |
| 161 | |
// isSkippable returns true for the whitespace runes that separate
// tokens: space, tab, carriage return and newline.
func isSkippable(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
| 166 | |
| 167 // lexSkip consumes skippable runes. | |
| 168 func lexSkip(l *lexer) stateFn { | |
| 169 for isSkippable(l.Peek()) { | |
| 170 l.Consume() | |
| 171 } | |
| 172 l.startToken() | |
| 173 return lexRoot | |
| 174 } | |
| 175 | |
| 176 // singleCharTokens is a map of single-rune tokens. | |
| 177 var singleCharTokens = map[rune]TokenKind{ | |
| 178 '(': LPAREN, | |
| 179 ')': RPAREN, | |
| 180 '[': LBRACKET, | |
| 181 ']': RBRACKET, | |
| 182 '{': LBRACE, | |
| 183 '}': RBRACE, | |
| 184 '<': LANGLE, | |
| 185 '>': RANGLE, | |
| 186 ';': SEMI, | |
| 187 ',': COMMA, | |
| 188 '.': DOT, | |
| 189 '-': MINUS, | |
| 190 '+': PLUS, | |
| 191 '&': AMP, | |
| 192 '?': QSTN, | |
| 193 } | |
| 194 | |
| 195 // isSingleCharTokens returns true if the rune is a single character token. | |
| 196 func isSingleCharTokens(c rune) bool { | |
| 197 _, ok := singleCharTokens[c] | |
| 198 return ok | |
| 199 } | |
| 200 | |
| 201 // lexSingleCharTokens lexes single character tokens. | |
| 202 func lexSingleCharTokens(l *lexer) stateFn { | |
| 203 c := l.Peek() | |
| 204 l.Consume() | |
| 205 t, _ := singleCharTokens[c] | |
| 206 l.emitToken(t) | |
| 207 | |
| 208 return lexRoot | |
| 209 } | |
| 210 | |
// isEqualsOrResponseStart returns true if the rune corresponds to the
// beginning of either the '=' or '=>' tokens.
func isEqualsOrResponseStart(c rune) bool {
	return c == '='
}
| 216 | |
| 217 // lexEqualsOrResponse lexes the '=' or the '=>' token. | |
| 218 func lexEqualsOrResponse(l *lexer) stateFn { | |
| 219 l.Consume() | |
| 220 | |
| 221 if l.Peek() == '>' { | |
| 222 l.Consume() | |
| 223 l.emitToken(RESPONSE) | |
| 224 } else { | |
| 225 l.emitToken(EQUALS) | |
| 226 } | |
| 227 | |
| 228 return lexRoot | |
| 229 } | |
| 230 | |
// isAlpha returns true if the rune is an ASCII letter.
func isAlpha(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
| 235 | |
// isDigit returns true if the rune is an ASCII decimal digit.
func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
| 240 | |
// isHexDigit returns true if the rune is a hexadecimal digit.
func isHexDigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}
| 245 | |
// isNameStart returns true if the rune can begin a NAME token: an ASCII
// letter or an underscore.
func isNameStart(c rune) bool {
	if c == '_' {
		return true
	}
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}
| 250 | |
| 251 // keywordTokens maps keywords to their associate tokens. | |
| 252 var keywordTokens = map[string]TokenKind{ | |
| 253 "import": IMPORT, | |
| 254 "module": MODULE, | |
| 255 "struct": STRUCT, | |
| 256 "union": UNION, | |
| 257 "interface": INTERFACE, | |
| 258 "enum": ENUM, | |
| 259 "const": CONST, | |
| 260 "true": TRUE, | |
| 261 "false": FALSE, | |
| 262 "default": DEFAULT, | |
| 263 } | |
| 264 | |
| 265 // lexName lexes valid C identifiers. (K&R2: A.2.3) | |
| 266 func lexName(l *lexer) stateFn { | |
| 267 l.Consume() | |
| 268 | |
| 269 // isNameRune returns true if the rune is valid in a NAME token. | |
| 270 isNameRune := func(c rune) bool { | |
| 271 return isAlpha(c) || isDigit(c) || c == '_' | |
| 272 } | |
| 273 | |
| 274 for isNameRune(l.Peek()) { | |
| 275 l.Consume() | |
| 276 } | |
| 277 | |
| 278 // Emit the appropriate keyword token if the current name is a | |
| 279 // keyword or a NAME token otherwise. | |
| 280 if token, found := keywordTokens[l.CurText()]; found { | |
| 281 l.emitToken(token) | |
| 282 } else { | |
| 283 l.emitToken(NAME) | |
| 284 } | |
| 285 | |
| 286 return lexRoot | |
| 287 } | |
| 288 | |
// isOrdinalStart returns true if the rune starts an ORDINAL token.
func isOrdinalStart(c rune) bool {
	return c == '@'
}
| 294 | |
| 295 // lexOrdinal lexes an ORDINAL token. Ordinals are a '@' followed by one | |
| 296 // or more digits. | |
| 297 func lexOrdinal(l *lexer) stateFn { | |
| 298 // Consume the '@'. | |
| 299 l.Consume() | |
| 300 | |
| 301 for isDigit(l.Peek()) { | |
| 302 l.Consume() | |
| 303 } | |
| 304 | |
| 305 l.emitToken(ORDINAL) | |
| 306 | |
| 307 return lexRoot | |
| 308 } | |
| 309 | |
// isNumberStart returns true if the rune begins a number token. Every
// number, including a hexadecimal one, starts with a decimal digit
// (hexadecimals start with 0).
func isNumberStart(c rune) bool {
	return '0' <= c && c <= '9'
}
| 315 | |
| 316 // lexNumber lexes a number token. | |
| 317 func lexNumber(l *lexer) stateFn { | |
| 318 // If the number starts with 0 it cannot be a decimal integer. | |
| 319 if l.Peek() == '0' { | |
| 320 return lexNumberStartWithZero | |
| 321 } | |
| 322 return lexDec | |
| 323 } | |
| 324 | |
| 325 // lexDec lexes a base-10 number. | |
| 326 func lexDec(l *lexer) stateFn { | |
| 327 for isDigit(l.Peek()) { | |
| 328 l.Consume() | |
| 329 } | |
| 330 | |
| 331 // If a decimal part is found, transition to the decimal state. | |
| 332 if isDecimalPartStart(l.Peek()) { | |
| 333 return lexDecimalPart | |
| 334 } | |
| 335 | |
| 336 l.emitToken(INT_CONST_DEC) | |
| 337 | |
| 338 return lexRoot | |
| 339 } | |
| 340 | |
| 341 // lexNumberStartWithZero lexes hexadecimals, some floats or 0. | |
| 342 func lexNumberStartWithZero(l *lexer) stateFn { | |
| 343 // Consume the starting 0 | |
| 344 l.Consume() | |
| 345 | |
| 346 // Here we check to see whether we are in the hexadecimal or floating | |
| 347 // point case. | |
| 348 switch c := l.Peek(); { | |
| 349 case c == 'x' || c == 'X': | |
| 350 return lexHexNumber | |
| 351 case isDecimalPartStart(c): | |
| 352 return lexDecimalPart | |
| 353 } | |
| 354 | |
| 355 // Found a naked 0. | |
| 356 l.emitToken(INT_CONST_DEC) | |
| 357 | |
| 358 return lexRoot | |
| 359 } | |
| 360 | |
| 361 // lexHexNumber lexes hexadecimal integers. | |
| 362 func lexHexNumber(l *lexer) stateFn { | |
| 363 // Consume the x or X | |
| 364 l.Consume() | |
| 365 | |
| 366 for isHexDigit(l.Peek()) { | |
| 367 l.Consume() | |
| 368 } | |
| 369 | |
| 370 l.emitToken(INT_CONST_HEX) | |
| 371 | |
| 372 return lexRoot | |
| 373 } | |
| 374 | |
// isDecimalPartStart returns true if the rune can start the fractional
// or exponent part of a floating-point number.
func isDecimalPartStart(c rune) bool {
	switch c {
	case '.', 'e', 'E':
		return true
	}
	return false
}
| 380 | |
| 381 // lexDecimalPart lexes the decimal part of a floating point number. | |
| 382 func lexDecimalPart(l *lexer) stateFn { | |
| 383 // Consume '.' or 'e' or 'E' | |
| 384 l.Consume() | |
| 385 | |
| 386 if c := l.Peek(); c == '+' || c == '-' { | |
| 387 l.Consume() | |
| 388 } | |
| 389 | |
| 390 for isDigit(l.Peek()) { | |
| 391 l.Consume() | |
| 392 } | |
| 393 | |
| 394 l.emitToken(FLOAT_CONST) | |
| 395 | |
| 396 return lexRoot | |
| 397 } | |
| 398 | |
// isStringStart returns true if the rune starts a string literal.
func isStringStart(c rune) bool {
	return c == '"'
}
| 403 | |
| 404 // lexString lexes a quoted string. | |
| 405 func lexString(l *lexer) stateFn { | |
| 406 // Consume opening quotes. | |
| 407 l.Consume() | |
| 408 | |
| 409 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { | |
| 410 if l.Peek() == '\\' { | |
| 411 // If we see an escape character consume whatever follow s blindly. | |
| 412 // TODO(azani): Consider parsing escape sequences. | |
| 413 l.Consume() | |
| 414 } | |
| 415 l.Consume() | |
| 416 } | |
| 417 | |
| 418 if l.IsEos() || l.Peek() == '\n' { | |
| 419 l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL) | |
| 420 return nil | |
| 421 } | |
| 422 | |
| 423 // Consume the closing quotes | |
| 424 l.Consume() | |
| 425 | |
| 426 l.emitToken(STRING_LITERAL) | |
| 427 | |
| 428 return lexRoot | |
| 429 } | |
| 430 | |
// isMaybeComment returns true if the rune may begin a comment.
func isMaybeComment(c rune) bool {
	return c == '/'
}
| 436 | |
| 437 // lexComment consumes a single-line or multi-line comment. | |
| 438 func lexComment(l *lexer) stateFn { | |
| 439 // Consume the '/'. | |
| 440 l.Consume() | |
| 441 | |
| 442 switch l.Peek() { | |
| 443 case '/': | |
| 444 return lexSingleLineComment | |
| 445 case '*': | |
| 446 return lexMultiLineComment | |
| 447 } | |
| 448 | |
| 449 l.emitToken(ERROR_ILLEGAL_CHAR) | |
| 450 return nil | |
| 451 } | |
| 452 | |
| 453 // lexSingleLineComment consumes a single line comment. | |
| 454 func lexSingleLineComment(l *lexer) stateFn { | |
| 455 // Consume the '/' | |
| 456 l.Consume() | |
| 457 | |
| 458 for !l.IsEos() && l.Peek() != '\n' { | |
| 459 l.Consume() | |
| 460 } | |
| 461 | |
| 462 l.startToken() | |
| 463 return lexRoot | |
| 464 } | |
| 465 | |
| 466 // lexMultiLineComment consumes a multi-line comment. | |
| 467 func lexMultiLineComment(l *lexer) stateFn { | |
| 468 // Consume the '*'. | |
| 469 l.Consume() | |
| 470 | |
| 471 for !l.IsEos() { | |
| 472 if l.Peek() == '*' { | |
| 473 return lexPossibleEndOfComment | |
| 474 } | |
| 475 l.Consume() | |
| 476 } | |
| 477 | |
| 478 l.emitToken(ERROR_UNTERMINATED_COMMENT) | |
| 479 return nil | |
| 480 } | |
| 481 | |
| 482 // lexPossibleEndOfComment consumes the possible end of a multiline | |
| 483 // comment and determines whether the comment in fact ended or not. | |
| 484 func lexPossibleEndOfComment(l *lexer) stateFn { | |
| 485 // Consume the '*' | |
| 486 l.Consume() | |
| 487 | |
| 488 if l.IsEos() { | |
| 489 l.emitToken(ERROR_UNTERMINATED_COMMENT) | |
| 490 return nil | |
| 491 } | |
| 492 | |
| 493 if l.Peek() == '/' { | |
| 494 l.Consume() | |
| 495 l.startToken() | |
| 496 return lexRoot | |
| 497 } | |
| 498 | |
| 499 return lexMultiLineComment | |
| 500 } | |
| OLD | NEW |