Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 // | |
| 5 // To use the lexer, call Tokenize with the source string to obtain | |
| 6 // a TokenStream. The lexer is run concurrently so you should be able to | |
| 7 // use the TokenStream before the lexer is done with the source. | |
| 8 // | |
| 9 // The lexer is implemented as a state machine. The states are represented | |
| 10 // by functions (the stateFn type) which accept a lexer and return the | |
| 11 // new state. | |
| 12 // | |
| 13 // Most states also have an isFooStart function which helps determine if | |
| 14 // a transition to Foo is appropriate. Those functions accept a single | |
| 15 // rune as a parameter and return true if the state machine should | |
| 16 // transition to state Foo. Some states do not have such functions on | |
| 17 // account of the transition condition being trivial. | |
| 18 // | |
| 19 // The lexer implementation was inspired by | |
| 20 // http://cuddle.googlecode.com/hg/talk/lex.html | |
| 21 | |
| 22 package lexer | |
| 23 | |
| 24 import ( | |
| 25 "unicode/utf8" | |
| 26 ) | |
| 27 | |
| 28 // Tokenize accepts a source string and parses it into a stream of tokens which | |
| 29 // can be read from the returned TokenStream. | |
| 30 func Tokenize(source string) TokenStream { | |
| 31 tokens := make(chan Token) | |
| 32 l := lexer{source: source, tokens: tokens} | |
| 33 go l.run() | |
| 34 return &TokenChan{tokenChan: tokens} | |
| 35 } | |
| 36 | |
| 37 type lexer struct { | |
| 38 // source is the source code to be lexed. | |
| 39 source string | |
| 40 | |
| 41 // offset is the number of bytes that have been consumed. | |
| 42 offset int | |
| 43 | |
| 44 // tokens is a channel to which the found tokens are emitted. | |
| 45 tokens chan Token | |
| 46 | |
| 47 // sourcePos is the number of runes that have been consumed. | |
| 48 sourcePos int | |
| 49 | |
| 50 // lineno is the current line number. | |
| 51 lineNo int | |
| 52 | |
| 53 // linePos is how many runes have been consumed since the beginning of t he | |
| 54 // line. | |
| 55 linePos int | |
| 56 | |
| 57 // curTokenOffset is the number of bytes consumed prior to the beginning of | |
| 58 // the current token. | |
| 59 curTokenOffset int | |
| 60 | |
| 61 // curTokenSourcePos is the number of runes consumed prior to the beginn ing of | |
| 62 // the current token. | |
| 63 curTokenSourcePos int | |
| 64 | |
| 65 // curTokenLineNo is the line number on which the current token begins. | |
| 66 curTokenLineNo int | |
| 67 | |
| 68 // curTokenLinePos is the number of runes since the beginning of the lin e | |
| 69 // where the current token begins. | |
| 70 curTokenLinePos int | |
| 71 } | |
| 72 | |
| 73 // CurText returns the consumed part of the current token. | |
| 74 func (l *lexer) CurText() string { | |
| 75 return l.source[l.curTokenOffset:l.offset] | |
| 76 } | |
| 77 | |
| 78 // emitToken emits the current token and begins a new token. | |
| 79 func (l *lexer) emitToken(tokenType TokenKind) { | |
| 80 l.tokens <- Token{ | |
| 81 Kind: tokenType, | |
| 82 Text: l.source[l.curTokenOffset:l.offset], | |
| 83 CharPos: l.curTokenSourcePos, | |
| 84 LineNo: l.curTokenLineNo, | |
| 85 LinePos: l.curTokenLinePos} | |
| 86 l.beginToken() | |
| 87 } | |
| 88 | |
| 89 // beginToken begins the new token. | |
| 90 func (l *lexer) beginToken() { | |
| 91 l.curTokenOffset = l.offset | |
| 92 l.curTokenSourcePos = l.sourcePos | |
| 93 l.curTokenLineNo = l.lineNo | |
| 94 l.curTokenLinePos = l.linePos | |
| 95 } | |
| 96 | |
| 97 // Consume consumes the next rune in the source. | |
| 98 func (l *lexer) Consume() { | |
| 99 if l.IsEos() { | |
| 100 return | |
| 101 } | |
| 102 | |
| 103 c, width := utf8.DecodeRuneInString(l.source[l.offset:]) | |
| 104 if width == 0 { | |
|
mattr
2015/10/14 19:29:26
I guess my question was, why increment width here
azani
2015/10/14 22:29:41
It turns out it's not necessary... Gone now and th
| |
| 105 width = 1 | |
| 106 } | |
| 107 | |
| 108 if c == '\n' { | |
| 109 l.lineNo += 1 | |
| 110 l.linePos = 0 | |
| 111 } else { | |
| 112 l.linePos += 1 | |
| 113 } | |
| 114 l.offset += width | |
| 115 l.sourcePos += 1 | |
| 116 } | |
| 117 | |
| 118 // Peek returns the next rune in the source. | |
| 119 func (l *lexer) Peek() rune { | |
| 120 // At the end of the string, there is no sane answer to Peek. | |
| 121 if l.IsEos() { | |
| 122 return utf8.RuneError | |
| 123 } | |
| 124 | |
| 125 // If RuneError is returned, it will be handled as any other rune, likel y | |
| 126 // resulting in an ErrorIllegalChar token being emitted. | |
| 127 char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) | |
| 128 return char | |
| 129 } | |
| 130 | |
| 131 // IsEos returns true if the whole source has been consumed false | |
| 132 // otherwise. | |
| 133 func (l *lexer) IsEos() bool { | |
| 134 return l.offset >= len(l.source) | |
| 135 } | |
| 136 | |
| 137 // run is the lexer's main loop. | |
| 138 func (l *lexer) run() { | |
| 139 // We are implementing a state machine. | |
| 140 // lexRoot is the beginning state. | |
| 141 // nil is the end state. | |
| 142 // States are functions which are called on the lexer. They return the | |
| 143 // next state. | |
| 144 for state := lexRoot; state != nil; { | |
| 145 state = state(l) | |
| 146 } | |
| 147 close(l.tokens) | |
| 148 } | |
| 149 | |
| 150 // A stateFn represents a state in the lexer state machine. | |
| 151 type stateFn func(*lexer) stateFn | |
| 152 | |
| 153 // This is the beginning state and also the state which is returned to after | |
| 154 // most tokens are emitted. | |
| 155 func lexRoot(l *lexer) stateFn { | |
| 156 if l.IsEos() { | |
| 157 return nil | |
| 158 } | |
| 159 | |
| 160 switch c := l.Peek(); { | |
| 161 case isSingleCharTokens(c): | |
| 162 return lexSingleCharTokens | |
| 163 case isEqualsOrResponseStart(c): | |
| 164 return lexEqualsOrResponse | |
| 165 case isNameStart(c): | |
| 166 return lexName | |
| 167 case isOrdinalStart(c): | |
| 168 return lexOrdinal | |
| 169 case isNumberStart(c): | |
| 170 return lexNumber | |
| 171 case isStringStart(c): | |
| 172 return lexString | |
| 173 case isSkippable(c): | |
| 174 return lexSkip | |
| 175 case isMaybeComment(c): | |
| 176 return lexComment | |
| 177 } | |
| 178 | |
| 179 l.Consume() | |
| 180 l.emitToken(ErrorIllegalChar) | |
| 181 return nil | |
| 182 } | |
| 183 | |
| 184 // isSkippable determines if a rune is skippable. | |
| 185 func isSkippable(c rune) bool { | |
| 186 return c == ' ' || c == '\t' || c == '\r' || c == '\n' | |
| 187 } | |
| 188 | |
| 189 // lexSkip consumes skippable runes. | |
| 190 func lexSkip(l *lexer) stateFn { | |
| 191 for isSkippable(l.Peek()) { | |
| 192 l.Consume() | |
| 193 } | |
| 194 l.beginToken() | |
| 195 return lexRoot | |
| 196 } | |
| 197 | |
| 198 // singleCharTokens is a map of single-rune tokens. | |
| 199 var singleCharTokens = map[rune]TokenKind{ | |
| 200 '(': LParen, | |
| 201 ')': RParen, | |
| 202 '[': LBracket, | |
| 203 ']': RBracket, | |
| 204 '{': LBrace, | |
| 205 '}': RBrace, | |
| 206 '<': LAngle, | |
| 207 '>': RAngle, | |
| 208 ';': Semi, | |
| 209 ',': Comma, | |
| 210 '.': Dot, | |
| 211 '-': Minus, | |
| 212 '+': Plus, | |
| 213 '&': Amp, | |
| 214 '?': Qstn, | |
| 215 } | |
| 216 | |
| 217 // isSingleCharTokens returns true if the rune is a single character token. | |
| 218 func isSingleCharTokens(c rune) bool { | |
| 219 _, ok := singleCharTokens[c] | |
| 220 return ok | |
| 221 } | |
| 222 | |
| 223 // lexSingleCharTokens lexes single character tokens. | |
| 224 func lexSingleCharTokens(l *lexer) stateFn { | |
| 225 c := l.Peek() | |
| 226 l.Consume() | |
| 227 t, _ := singleCharTokens[c] | |
| 228 l.emitToken(t) | |
| 229 | |
| 230 return lexRoot | |
| 231 } | |
| 232 | |
| 233 // isEqualsOrResponseStart returns true if the rune corresponds to be | |
| 234 // beginning of either the '=' or '=>' tokens. | |
| 235 func isEqualsOrResponseStart(c rune) bool { | |
| 236 return c == '=' | |
| 237 } | |
| 238 | |
| 239 // lexEqualsOrResponse lexes the '=' or the '=>' token. | |
| 240 func lexEqualsOrResponse(l *lexer) stateFn { | |
| 241 l.Consume() | |
| 242 | |
| 243 if l.Peek() == '>' { | |
| 244 l.Consume() | |
| 245 l.emitToken(Response) | |
| 246 } else { | |
| 247 l.emitToken(Equals) | |
| 248 } | |
| 249 | |
| 250 return lexRoot | |
| 251 } | |
| 252 | |
| 253 // isAlpha returns true if the rune is a letter of the alphabet. | |
| 254 func isAlpha(c rune) bool { | |
| 255 return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) | |
| 256 } | |
| 257 | |
| 258 // isDigit returns true if the rune is a digit. | |
| 259 func isDigit(c rune) bool { | |
| 260 return ('0' <= c && c <= '9') | |
| 261 } | |
| 262 | |
| 263 // isHexDigit returns true if the rune is a hexadecimal digit. | |
| 264 func isHexDigit(c rune) bool { | |
| 265 return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') | |
| 266 } | |
| 267 | |
| 268 // isNameStart returns true if the rune is the beginning of a Name token. | |
| 269 func isNameStart(c rune) bool { | |
| 270 return isAlpha(c) || c == '_' | |
| 271 } | |
| 272 | |
| 273 // keywordTokens maps keywords to their associate tokens. | |
| 274 var keywordTokens = map[string]TokenKind{ | |
| 275 "import": Import, | |
| 276 "module": Module, | |
| 277 "struct": Struct, | |
| 278 "union": Union, | |
| 279 "interface": Interface, | |
| 280 "enum": Enum, | |
| 281 "const": Const, | |
| 282 "true": True, | |
| 283 "false": False, | |
| 284 "default": Default, | |
| 285 } | |
| 286 | |
| 287 // lexName lexes valid C identifiers. (K&R2: A.2.3) | |
| 288 func lexName(l *lexer) stateFn { | |
| 289 l.Consume() | |
| 290 | |
| 291 // isNameRune returns true if the rune is valid in a Name token. | |
| 292 isNameRune := func(c rune) bool { | |
| 293 return isAlpha(c) || isDigit(c) || c == '_' | |
| 294 } | |
| 295 | |
| 296 for isNameRune(l.Peek()) { | |
| 297 l.Consume() | |
| 298 } | |
| 299 | |
| 300 // Emit the appropriate keyword token if the current name is a | |
| 301 // keyword or a Name token otherwise. | |
| 302 if token, found := keywordTokens[l.CurText()]; found { | |
| 303 l.emitToken(token) | |
| 304 } else { | |
| 305 l.emitToken(Name) | |
| 306 } | |
| 307 | |
| 308 return lexRoot | |
| 309 } | |
| 310 | |
| 311 // isOrdinalStart returns true if the rune is the beginning of an Ordinal | |
| 312 // token. | |
| 313 func isOrdinalStart(c rune) bool { | |
| 314 return '@' == c | |
| 315 } | |
| 316 | |
| 317 // lexOrdinal lexes an Ordinal token. Ordinals are a '@' followed by one | |
| 318 // or more digits. | |
| 319 func lexOrdinal(l *lexer) stateFn { | |
| 320 // Consume the '@'. | |
| 321 l.Consume() | |
| 322 | |
| 323 for isDigit(l.Peek()) { | |
| 324 l.Consume() | |
| 325 } | |
| 326 | |
| 327 l.emitToken(Ordinal) | |
| 328 | |
| 329 return lexRoot | |
| 330 } | |
| 331 | |
| 332 // isNumberStart returns true if the rune is the beginning of a number. | |
| 333 func isNumberStart(c rune) bool { | |
| 334 // Even hexadecimals must begin with a digit (namely 0). | |
| 335 return isDigit(c) | |
| 336 } | |
| 337 | |
| 338 // lexNumber lexes a number token. | |
| 339 func lexNumber(l *lexer) stateFn { | |
| 340 // If the number begins with 0 it cannot be a decimal integer. | |
| 341 if l.Peek() == '0' { | |
| 342 return lexNumberStartWithZero | |
| 343 } | |
| 344 return lexDec | |
| 345 } | |
| 346 | |
| 347 // lexDec lexes a base-10 number. | |
| 348 func lexDec(l *lexer) stateFn { | |
| 349 for isDigit(l.Peek()) { | |
| 350 l.Consume() | |
| 351 } | |
| 352 | |
| 353 // If a decimal part is found, transition to the decimal state. | |
| 354 if isDecimalPartStart(l.Peek()) { | |
| 355 return lexDecimalPart | |
| 356 } | |
| 357 | |
| 358 l.emitToken(IntConstDec) | |
| 359 | |
| 360 return lexRoot | |
| 361 } | |
| 362 | |
| 363 // lexNumberStartWithZero lexes hexadecimals, some floats or 0. | |
| 364 func lexNumberStartWithZero(l *lexer) stateFn { | |
| 365 // Consume the leading 0 | |
| 366 l.Consume() | |
| 367 | |
| 368 // Here we check to see whether we are in the hexadecimal or floating | |
| 369 // point case. | |
| 370 switch c := l.Peek(); { | |
| 371 case c == 'x' || c == 'X': | |
| 372 return lexHexNumber | |
| 373 case isDecimalPartStart(c): | |
| 374 return lexDecimalPart | |
| 375 } | |
| 376 | |
| 377 // Found a naked 0. | |
| 378 l.emitToken(IntConstDec) | |
| 379 | |
| 380 return lexRoot | |
| 381 } | |
| 382 | |
| 383 // lexHexNumber lexes hexadecimal integers. | |
| 384 func lexHexNumber(l *lexer) stateFn { | |
| 385 // Consume the x or X | |
| 386 l.Consume() | |
| 387 | |
| 388 for isHexDigit(l.Peek()) { | |
| 389 l.Consume() | |
| 390 } | |
| 391 | |
| 392 l.emitToken(IntConstHex) | |
| 393 | |
| 394 return lexRoot | |
| 395 } | |
| 396 | |
| 397 // isDecimalPartStart returns true if the rune represents the beginning of | |
| 398 // the decimal part of a floating point number. | |
| 399 func isDecimalPartStart(c rune) bool { | |
| 400 return c == '.' || c == 'e' || c == 'E' | |
| 401 } | |
| 402 | |
| 403 // lexDecimalPart lexes the decimal part of a floating point number. | |
| 404 func lexDecimalPart(l *lexer) stateFn { | |
| 405 // Consume '.' or 'e' or 'E' | |
| 406 l.Consume() | |
| 407 | |
| 408 if c := l.Peek(); c == '+' || c == '-' { | |
| 409 l.Consume() | |
| 410 } | |
| 411 | |
| 412 for isDigit(l.Peek()) { | |
| 413 l.Consume() | |
| 414 } | |
| 415 | |
| 416 l.emitToken(FloatConst) | |
| 417 | |
| 418 return lexRoot | |
| 419 } | |
| 420 | |
| 421 // isStringStart returns true if the rune represents the beginning of a string. | |
| 422 func isStringStart(c rune) bool { | |
| 423 return '"' == c | |
| 424 } | |
| 425 | |
| 426 // lexString lexes a quoted string. | |
| 427 func lexString(l *lexer) stateFn { | |
| 428 // Consume opening quotes. | |
| 429 l.Consume() | |
| 430 | |
| 431 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { | |
| 432 if l.Peek() == '\\' { | |
| 433 // If we see an escape character consume whatever follow s blindly. | |
| 434 // TODO(azani): Consider parsing escape sequences. | |
| 435 l.Consume() | |
| 436 } | |
| 437 l.Consume() | |
| 438 } | |
| 439 | |
| 440 if l.IsEos() || l.Peek() == '\n' { | |
| 441 l.emitToken(ErrorUnterminatedStringLiteral) | |
| 442 return nil | |
| 443 } | |
| 444 | |
| 445 // Consume the closing quotes | |
| 446 l.Consume() | |
| 447 | |
| 448 l.emitToken(StringLiteral) | |
| 449 | |
| 450 return lexRoot | |
| 451 } | |
| 452 | |
| 453 // isMaybeComment returns true if the rune may be the beginning of a | |
| 454 // comment. | |
| 455 func isMaybeComment(c rune) bool { | |
| 456 return c == '/' | |
| 457 } | |
| 458 | |
| 459 // lexComment consumes a single-line or multi-line comment. | |
| 460 func lexComment(l *lexer) stateFn { | |
| 461 // Consume the '/'. | |
| 462 l.Consume() | |
| 463 | |
| 464 switch l.Peek() { | |
| 465 case '/': | |
| 466 return lexSingleLineComment | |
| 467 case '*': | |
| 468 return lexMultiLineComment | |
| 469 } | |
| 470 | |
| 471 l.emitToken(ErrorIllegalChar) | |
| 472 return nil | |
| 473 } | |
| 474 | |
| 475 // lexSingleLineComment consumes a single line comment. | |
| 476 func lexSingleLineComment(l *lexer) stateFn { | |
| 477 // Consume the '/' | |
| 478 l.Consume() | |
| 479 | |
| 480 for !l.IsEos() && l.Peek() != '\n' { | |
| 481 l.Consume() | |
| 482 } | |
| 483 | |
| 484 l.beginToken() | |
| 485 return lexRoot | |
| 486 } | |
| 487 | |
| 488 // lexMultiLineComment consumes a multi-line comment. | |
| 489 func lexMultiLineComment(l *lexer) stateFn { | |
| 490 // Consume the '*'. | |
| 491 l.Consume() | |
| 492 | |
| 493 for !l.IsEos() { | |
| 494 if l.Peek() == '*' { | |
| 495 return lexPossibleEndOfComment | |
| 496 } | |
| 497 l.Consume() | |
| 498 } | |
| 499 | |
| 500 l.emitToken(ErrorUnterminatedComment) | |
| 501 return nil | |
| 502 } | |
| 503 | |
| 504 // lexPossibleEndOfComment consumes the possible end of a multiline | |
| 505 // comment and determines whether the comment in fact ended or not. | |
| 506 func lexPossibleEndOfComment(l *lexer) stateFn { | |
| 507 // Consume the '*' | |
| 508 l.Consume() | |
| 509 | |
| 510 if l.IsEos() { | |
| 511 l.emitToken(ErrorUnterminatedComment) | |
| 512 return nil | |
| 513 } | |
| 514 | |
| 515 if l.Peek() == '/' { | |
| 516 l.Consume() | |
| 517 l.beginToken() | |
| 518 return lexRoot | |
| 519 } | |
| 520 | |
| 521 return lexMultiLineComment | |
| 522 } | |
| OLD | NEW |