OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 // |
| 5 // To use the lexer, call Tokenize with the source string to obtain |
| 6 // a TokenStream. The lexer is run concurrently so you should be able to |
| 7 // use the TokenStream before the lexer is done with the source. |
| 8 // |
| 9 // The lexer is implemented as a state machine. The states are represented |
| 10 // by functions (the stateFn type) which accept a lexer and return the |
| 11 // new state. |
| 12 // |
| 13 // Most states also have an isFooStart function which helps determine if |
| 14 // a transition to Foo is appropriate. Those functions accept a single |
| 15 // rune as a parameter and return true if the state machine should |
| 16 // transition to state Foo. Some states do not have such functions on |
| 17 // account of the transition condition being trivial. |
| 18 // |
| 19 // The lexer implementation was inspired by |
| 20 // http://cuddle.googlecode.com/hg/talk/lex.html |
| 21 |
| 22 package lexer |
| 23 |
| 24 import ( |
| 25 "unicode/utf8" |
| 26 ) |
| 27 |
| 28 // Tokenize accepts a source string and parses it into a stream of tokens which |
| 29 // can be read from the returned TokenStream. |
| 30 func Tokenize(source string) TokenStream { |
| 31 tokens := make(chan Token) |
| 32 l := lexer{source: source, tokens: tokens} |
| 33 go l.run() |
| 34 return &TokenChan{tokenChan: tokens} |
| 35 } |
| 36 |
// lexer holds the state of the tokenizer: the input, the current read
// position expressed in bytes, runes, and line/column coordinates, and the
// start position of the token currently being accumulated.
type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes that have been consumed.
	offset int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token

	// sourcePos is the number of runes that have been consumed.
	sourcePos int

	// lineNo is the current line number (incremented each time a '\n' is
	// consumed).
	lineNo int

	// linePos is how many runes have been consumed since the beginning of
	// the line.
	linePos int

	// curTokenOffset is the number of bytes consumed prior to the
	// beginning of the current token.
	curTokenOffset int

	// curTokenSourcePos is the number of runes consumed prior to the
	// beginning of the current token.
	curTokenSourcePos int

	// curTokenLineNo is the line number on which the current token begins.
	curTokenLineNo int

	// curTokenLinePos is the number of runes since the beginning of the
	// line where the current token begins.
	curTokenLinePos int
}
| 72 |
| 73 // CurText returns the consumed part of the current token. |
| 74 func (l *lexer) CurText() string { |
| 75 return l.source[l.curTokenOffset:l.offset] |
| 76 } |
| 77 |
| 78 // emitToken emits the current token and begins a new token. |
| 79 func (l *lexer) emitToken(tokenType TokenKind) { |
| 80 l.tokens <- Token{ |
| 81 Kind: tokenType, |
| 82 Text: l.source[l.curTokenOffset:l.offset], |
| 83 CharPos: l.curTokenSourcePos, |
| 84 LineNo: l.curTokenLineNo, |
| 85 LinePos: l.curTokenLinePos} |
| 86 l.beginToken() |
| 87 } |
| 88 |
// beginToken begins the new token: the current consumption point (byte
// offset, rune position, line number and column) is recorded as the start
// of the next token to be emitted.
func (l *lexer) beginToken() {
	l.curTokenOffset = l.offset
	l.curTokenSourcePos = l.sourcePos
	l.curTokenLineNo = l.lineNo
	l.curTokenLinePos = l.linePos
}
| 96 |
| 97 // Consume consumes the next rune in the source. |
| 98 func (l *lexer) Consume() { |
| 99 if l.IsEos() { |
| 100 return |
| 101 } |
| 102 |
| 103 c, width := utf8.DecodeRuneInString(l.source[l.offset:]) |
| 104 |
| 105 if c == '\n' { |
| 106 l.lineNo += 1 |
| 107 l.linePos = 0 |
| 108 } else { |
| 109 l.linePos += 1 |
| 110 } |
| 111 l.offset += width |
| 112 l.sourcePos += 1 |
| 113 } |
| 114 |
| 115 // Peek returns the next rune in the source. |
| 116 func (l *lexer) Peek() rune { |
| 117 // At the end of the string, there is no sane answer to Peek. |
| 118 if l.IsEos() { |
| 119 return utf8.RuneError |
| 120 } |
| 121 |
| 122 // If RuneError is returned, it will be handled as any other rune, likel
y |
| 123 // resulting in an ErrorIllegalChar token being emitted. |
| 124 char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) |
| 125 return char |
| 126 } |
| 127 |
| 128 // IsEos returns true if the whole source has been consumed false |
| 129 // otherwise. |
| 130 func (l *lexer) IsEos() bool { |
| 131 return l.offset >= len(l.source) |
| 132 } |
| 133 |
| 134 // run is the lexer's main loop. |
| 135 func (l *lexer) run() { |
| 136 // We are implementing a state machine. |
| 137 // lexRoot is the beginning state. |
| 138 // nil is the end state. |
| 139 // States are functions which are called on the lexer. They return the |
| 140 // next state. |
| 141 for state := lexRoot; state != nil; { |
| 142 state = state(l) |
| 143 } |
| 144 close(l.tokens) |
| 145 } |
| 146 |
// A stateFn represents a state in the lexer state machine. It consumes
// input from the lexer and returns the next state, or nil to terminate.
type stateFn func(*lexer) stateFn
| 149 |
| 150 // This is the beginning state and also the state which is returned to after |
| 151 // most tokens are emitted. |
| 152 func lexRoot(l *lexer) stateFn { |
| 153 if l.IsEos() { |
| 154 return nil |
| 155 } |
| 156 |
| 157 switch c := l.Peek(); { |
| 158 case isSingleCharTokens(c): |
| 159 return lexSingleCharTokens |
| 160 case isEqualsOrResponseStart(c): |
| 161 return lexEqualsOrResponse |
| 162 case isNameStart(c): |
| 163 return lexName |
| 164 case isOrdinalStart(c): |
| 165 return lexOrdinal |
| 166 case isNumberStart(c): |
| 167 return lexNumber |
| 168 case isStringStart(c): |
| 169 return lexString |
| 170 case isSkippable(c): |
| 171 return lexSkip |
| 172 case isMaybeComment(c): |
| 173 return lexComment |
| 174 } |
| 175 |
| 176 l.Consume() |
| 177 l.emitToken(ErrorIllegalChar) |
| 178 return nil |
| 179 } |
| 180 |
// isSkippable reports whether c is whitespace that the lexer discards.
func isSkippable(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
| 185 |
| 186 // lexSkip consumes skippable runes. |
| 187 func lexSkip(l *lexer) stateFn { |
| 188 for isSkippable(l.Peek()) { |
| 189 l.Consume() |
| 190 } |
| 191 l.beginToken() |
| 192 return lexRoot |
| 193 } |
| 194 |
// singleCharTokens maps each rune that forms a complete token on its own
// to the kind of token it produces.
var singleCharTokens = map[rune]TokenKind{
	'(': LParen,
	')': RParen,
	'[': LBracket,
	']': RBracket,
	'{': LBrace,
	'}': RBrace,
	'<': LAngle,
	'>': RAngle,
	';': Semi,
	',': Comma,
	'.': Dot,
	'-': Minus,
	'+': Plus,
	'&': Amp,
	'?': Qstn,
}
| 213 |
| 214 // isSingleCharTokens returns true if the rune is a single character token. |
| 215 func isSingleCharTokens(c rune) bool { |
| 216 _, ok := singleCharTokens[c] |
| 217 return ok |
| 218 } |
| 219 |
| 220 // lexSingleCharTokens lexes single character tokens. |
| 221 func lexSingleCharTokens(l *lexer) stateFn { |
| 222 c := l.Peek() |
| 223 l.Consume() |
| 224 t, _ := singleCharTokens[c] |
| 225 l.emitToken(t) |
| 226 |
| 227 return lexRoot |
| 228 } |
| 229 |
// isEqualsOrResponseStart reports whether c is the beginning of either
// the '=' or the '=>' token.
func isEqualsOrResponseStart(c rune) bool {
	return '=' == c
}
| 235 |
| 236 // lexEqualsOrResponse lexes the '=' or the '=>' token. |
| 237 func lexEqualsOrResponse(l *lexer) stateFn { |
| 238 l.Consume() |
| 239 |
| 240 if l.Peek() == '>' { |
| 241 l.Consume() |
| 242 l.emitToken(Response) |
| 243 } else { |
| 244 l.emitToken(Equals) |
| 245 } |
| 246 |
| 247 return lexRoot |
| 248 } |
| 249 |
// isAlpha reports whether c is an ASCII letter.
func isAlpha(c rune) bool {
	lower := 'a' <= c && c <= 'z'
	upper := 'A' <= c && c <= 'Z'
	return lower || upper
}
| 254 |
// isDigit reports whether c is a decimal digit.
func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
| 259 |
// isHexDigit reports whether c is a hexadecimal digit: a decimal digit or
// a letter in a-f / A-F.
func isHexDigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}
| 264 |
// isNameStart reports whether c may begin a Name token: an ASCII letter
// or an underscore.
func isNameStart(c rune) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'
}
| 269 |
// keywordTokens maps keywords to their associated tokens. Keywords lex
// exactly like names, so lexName consults this table before emitting.
var keywordTokens = map[string]TokenKind{
	"import":    Import,
	"module":    Module,
	"struct":    Struct,
	"union":     Union,
	"interface": Interface,
	"enum":      Enum,
	"const":     Const,
	"true":      True,
	"false":     False,
	"default":   Default,
}
| 283 |
| 284 // lexName lexes valid C identifiers. (K&R2: A.2.3) |
| 285 func lexName(l *lexer) stateFn { |
| 286 l.Consume() |
| 287 |
| 288 // isNameRune returns true if the rune is valid in a Name token. |
| 289 isNameRune := func(c rune) bool { |
| 290 return isAlpha(c) || isDigit(c) || c == '_' |
| 291 } |
| 292 |
| 293 for isNameRune(l.Peek()) { |
| 294 l.Consume() |
| 295 } |
| 296 |
| 297 // Emit the appropriate keyword token if the current name is a |
| 298 // keyword or a Name token otherwise. |
| 299 if token, found := keywordTokens[l.CurText()]; found { |
| 300 l.emitToken(token) |
| 301 } else { |
| 302 l.emitToken(Name) |
| 303 } |
| 304 |
| 305 return lexRoot |
| 306 } |
| 307 |
// isOrdinalStart reports whether c is the beginning of an Ordinal token.
func isOrdinalStart(c rune) bool {
	return c == '@'
}
| 313 |
| 314 // lexOrdinal lexes an Ordinal token. Ordinals are a '@' followed by one |
| 315 // or more digits. |
| 316 func lexOrdinal(l *lexer) stateFn { |
| 317 // Consume the '@'. |
| 318 l.Consume() |
| 319 |
| 320 for isDigit(l.Peek()) { |
| 321 l.Consume() |
| 322 } |
| 323 |
| 324 l.emitToken(Ordinal) |
| 325 |
| 326 return lexRoot |
| 327 } |
| 328 |
// isNumberStart reports whether c can begin a number token. Every numeric
// literal — even a hexadecimal one — starts with a digit (namely 0).
func isNumberStart(c rune) bool {
	return c >= '0' && c <= '9'
}
| 334 |
| 335 // lexNumber lexes a number token. |
| 336 func lexNumber(l *lexer) stateFn { |
| 337 // If the number begins with 0 it cannot be a decimal integer. |
| 338 if l.Peek() == '0' { |
| 339 return lexNumberStartWithZero |
| 340 } |
| 341 return lexDec |
| 342 } |
| 343 |
| 344 // lexDec lexes a base-10 number. |
| 345 func lexDec(l *lexer) stateFn { |
| 346 for isDigit(l.Peek()) { |
| 347 l.Consume() |
| 348 } |
| 349 |
| 350 // If a decimal part is found, transition to the decimal state. |
| 351 if isDecimalPartStart(l.Peek()) { |
| 352 return lexDecimalPart |
| 353 } |
| 354 |
| 355 l.emitToken(IntConstDec) |
| 356 |
| 357 return lexRoot |
| 358 } |
| 359 |
| 360 // lexNumberStartWithZero lexes hexadecimals, some floats or 0. |
| 361 func lexNumberStartWithZero(l *lexer) stateFn { |
| 362 // Consume the leading 0 |
| 363 l.Consume() |
| 364 |
| 365 // Here we check to see whether we are in the hexadecimal or floating |
| 366 // point case. |
| 367 switch c := l.Peek(); { |
| 368 case c == 'x' || c == 'X': |
| 369 return lexHexNumber |
| 370 case isDecimalPartStart(c): |
| 371 return lexDecimalPart |
| 372 } |
| 373 |
| 374 // Found a naked 0. |
| 375 l.emitToken(IntConstDec) |
| 376 |
| 377 return lexRoot |
| 378 } |
| 379 |
| 380 // lexHexNumber lexes hexadecimal integers. |
| 381 func lexHexNumber(l *lexer) stateFn { |
| 382 // Consume the x or X |
| 383 l.Consume() |
| 384 |
| 385 for isHexDigit(l.Peek()) { |
| 386 l.Consume() |
| 387 } |
| 388 |
| 389 l.emitToken(IntConstHex) |
| 390 |
| 391 return lexRoot |
| 392 } |
| 393 |
// isDecimalPartStart reports whether c begins the fractional or exponent
// part of a floating point literal.
func isDecimalPartStart(c rune) bool {
	switch c {
	case '.', 'e', 'E':
		return true
	}
	return false
}
| 399 |
| 400 // lexDecimalPart lexes the decimal part of a floating point number. |
| 401 func lexDecimalPart(l *lexer) stateFn { |
| 402 // Consume '.' or 'e' or 'E' |
| 403 l.Consume() |
| 404 |
| 405 if c := l.Peek(); c == '+' || c == '-' { |
| 406 l.Consume() |
| 407 } |
| 408 |
| 409 for isDigit(l.Peek()) { |
| 410 l.Consume() |
| 411 } |
| 412 |
| 413 l.emitToken(FloatConst) |
| 414 |
| 415 return lexRoot |
| 416 } |
| 417 |
// isStringStart reports whether c begins a quoted string literal.
func isStringStart(c rune) bool {
	return c == '"'
}
| 422 |
| 423 // lexString lexes a quoted string. |
| 424 func lexString(l *lexer) stateFn { |
| 425 // Consume opening quotes. |
| 426 l.Consume() |
| 427 |
| 428 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { |
| 429 if l.Peek() == '\\' { |
| 430 // If we see an escape character consume whatever follow
s blindly. |
| 431 // TODO(azani): Consider parsing escape sequences. |
| 432 l.Consume() |
| 433 } |
| 434 l.Consume() |
| 435 } |
| 436 |
| 437 if l.IsEos() || l.Peek() == '\n' { |
| 438 l.emitToken(ErrorUnterminatedStringLiteral) |
| 439 return nil |
| 440 } |
| 441 |
| 442 // Consume the closing quotes |
| 443 l.Consume() |
| 444 |
| 445 l.emitToken(StringLiteral) |
| 446 |
| 447 return lexRoot |
| 448 } |
| 449 |
// isMaybeComment reports whether c could begin a comment ("//" or "/*").
func isMaybeComment(c rune) bool {
	return '/' == c
}
| 455 |
| 456 // lexComment consumes a single-line or multi-line comment. |
| 457 func lexComment(l *lexer) stateFn { |
| 458 // Consume the '/'. |
| 459 l.Consume() |
| 460 |
| 461 switch l.Peek() { |
| 462 case '/': |
| 463 return lexSingleLineComment |
| 464 case '*': |
| 465 return lexMultiLineComment |
| 466 } |
| 467 |
| 468 l.emitToken(ErrorIllegalChar) |
| 469 return nil |
| 470 } |
| 471 |
| 472 // lexSingleLineComment consumes a single line comment. |
| 473 func lexSingleLineComment(l *lexer) stateFn { |
| 474 // Consume the '/' |
| 475 l.Consume() |
| 476 |
| 477 for !l.IsEos() && l.Peek() != '\n' { |
| 478 l.Consume() |
| 479 } |
| 480 |
| 481 l.beginToken() |
| 482 return lexRoot |
| 483 } |
| 484 |
| 485 // lexMultiLineComment consumes a multi-line comment. |
| 486 func lexMultiLineComment(l *lexer) stateFn { |
| 487 // Consume the '*'. |
| 488 l.Consume() |
| 489 |
| 490 for !l.IsEos() { |
| 491 if l.Peek() == '*' { |
| 492 return lexPossibleEndOfComment |
| 493 } |
| 494 l.Consume() |
| 495 } |
| 496 |
| 497 l.emitToken(ErrorUnterminatedComment) |
| 498 return nil |
| 499 } |
| 500 |
| 501 // lexPossibleEndOfComment consumes the possible end of a multiline |
| 502 // comment and determines whether the comment in fact ended or not. |
| 503 func lexPossibleEndOfComment(l *lexer) stateFn { |
| 504 // Consume the '*' |
| 505 l.Consume() |
| 506 |
| 507 if l.IsEos() { |
| 508 l.emitToken(ErrorUnterminatedComment) |
| 509 return nil |
| 510 } |
| 511 |
| 512 if l.Peek() == '/' { |
| 513 l.Consume() |
| 514 l.beginToken() |
| 515 return lexRoot |
| 516 } |
| 517 |
| 518 return lexMultiLineComment |
| 519 } |
OLD | NEW |