Chromium Code Reviews
| Index: mojom/mojom_parser/lexer/lexer.go |
| diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..f918b3bc4eb6e8a4f76ada2f477ce84b01b6fb82 |
| --- /dev/null |
| +++ b/mojom/mojom_parser/lexer/lexer.go |
| @@ -0,0 +1,500 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| +// |
| +// To call the lexer, call Tokenize with the source string to obtain |
|
rudominer
2015/10/12 18:06:05
To use the lexer, call...
azani
2015/10/13 00:23:46
Done.
|
| +// a TokenStream. The lexer is run concurrently so you should be able to |
| +// use the TokenStream before the lexer is done with the source. |
| +// |
| +// The lexer is implemented as a state machine. The states are represented |
| +// by functions (the stateFn type) which accept a lexer and return the |
| +// new state. |
| +// |
| +// Most states also have an isFooStart function which helps determine if |
| +// a transition to Foo is appropriate. Those functions accept a single |
| +// rune as a parameter and return true if the state machine should |
| +// transition to state Foo. Some states do not have such functions on |
| +// account of the transition condition being trivial. |
| +// |
| +// The lexer implementation was inspired by |
| +// http://cuddle.googlecode.com/hg/talk/lex.html |
| + |
| +package lexer |
| + |
| +import ( |
| + "unicode/utf8" |
| +) |
| + |
| +// Tokenize accepts a source string and returns the tokens which compose |
|
rudominer
2015/10/12 18:06:05
The phrasing belies the asynchronous nature. Also
azani
2015/10/13 00:23:46
Done.
|
| +// the source string. |
| +func Tokenize(source string) TokenStream { |
| + tokens := make(chan Token) |
| + l := lexer{source: source, tokens: tokens} |
| + go l.run() |
| + return &TokenChan{tokenChan: tokens} |
| +} |
| + |
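TokenStream and TokenChan are defined elsewhere in this package, so the following is only a minimal, self-contained sketch of the producer/consumer shape Tokenize relies on (stand-in names, not this package's API):

    package main

    import "fmt"

    // Token stands in for the lexer's Token type.
    type Token struct{ Text string }

    // tokenize mirrors Tokenize's shape: start the producer in a
    // goroutine and hand back the channel immediately, so tokens can
    // be consumed while lexing is still in progress.
    func tokenize(words []string) <-chan Token {
        out := make(chan Token)
        go func() {
            defer close(out) // end-of-stream, as run() signals via close
            for _, w := range words {
                out <- Token{Text: w}
            }
        }()
        return out
    }

    func main() {
        for tok := range tokenize([]string{"module", "foo", ";"}) {
            fmt.Println(tok.Text)
        }
    }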
| +type lexer struct { |
| + // source is the source code to be lexed. |
|
rudominer
2015/10/12 18:06:05
nit: You don't need to repeat the variable name wi
azani
2015/10/12 18:18:07
Hey Matt, could you settle that question for us? I
|
| + source string |
| + |
| + // offset is the number of characters that have been consumed. |
|
rudominer
2015/10/12 18:06:05
Let's follow the go standard terminology and avoid
azani
2015/10/13 00:23:45
Done.
|
| + offset int |
| + |
| + // lineNo is the current line number. |
| + lineNo int |
| + |
| + // lineOffset is how many characters have been consumed since the |
| + // beginning of the line. |
| + lineOffset int |
|
rudominer
2015/10/12 18:06:05
I think it would be clearer if we didn't over use
azani
2015/10/13 00:23:45
Done.
|
| + |
| + // start is the index in source where the current token begins. |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenStartIndex
or currentTokenStart?
azani
2015/10/13 00:23:46
Done.
|
| + start int |
| + |
| + // lineStart is the line number on which the current token begins. |
| + lineStart int |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenLineNumber?
azani
2015/10/13 00:23:45
Done.
|
| + |
| + // lineOffsetStart is the number of characters since the beginning |
| + // of the line where the current token begins. |
| + lineOffsetStart int |
|
rudominer
2015/10/12 18:06:05
Maybe currentTokenColumn?
azani
2015/10/13 00:23:45
Done.
|
| + |
| + // tokens is a channel on which the found tokens are emitted. |
|
rudominer
2015/10/12 18:06:05
maybe "to which the found..."
azani
2015/10/13 00:23:45
Done.
|
| + tokens chan Token |
| +} |
| + |
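A worked example of how these fields move together: for the source "ab\ncd", consuming 'a' and 'b' leaves offset == 2, lineNo == 0, lineOffset == 2; consuming the newline gives offset == 3, lineNo == 1, lineOffset == 0; consuming 'c' then gives offset == 4, lineNo == 1, lineOffset == 1. Note that lineNo is zero-based.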
| +// CurText returns the consumed part of the current token. |
| +func (l *lexer) CurText() string { |
| + return l.source[l.start:l.offset] |
| +} |
| + |
| +// emitToken emits the current token and begins the new token. |
|
rudominer
2015/10/12 18:06:05
begins a new token
azani
2015/10/13 00:23:46
Done.
|
| +func (l *lexer) emitToken(tokenType TokenKind) { |
| + l.tokens <- Token{ |
| + Kind: tokenType, |
| + Text: l.source[l.start:l.offset], |
| + CharPos: l.start, |
| + LineNo: l.lineStart, |
| + LinePos: l.lineOffsetStart} |
| + l.startToken() |
| +} |
| + |
| +// startToken starts the new token. |
|
rudominer
2015/10/12 18:06:05
starts a new token
azani
2015/10/13 00:23:45
Done.
|
| +func (l *lexer) startToken() { |
| + l.start = l.offset |
| + l.lineStart = l.lineNo |
| + l.lineOffsetStart = l.lineOffset |
| +} |
| + |
| +// Consume consumes the next rune in the source. |
| +func (l *lexer) Consume() { |
| + c, width := utf8.DecodeRuneInString(l.source[l.offset:]) |
| + if width == 0 { |
|
rudominer
2015/10/12 18:06:05
What is your thinking here? Can width=0 without c
azani
2015/10/13 00:23:45
RuneError is just a special rune. It is handled th
|
| + // width == 0 means the source is fully consumed; there is |
| + // nothing to advance past. Returning early keeps offset from |
| + // running past len(l.source), which would make emitToken panic |
| + // when it slices l.source. |
| + return |
|
mattr
2015/10/12 18:04:13
This struck me as strange, this means the input wa
azani
2015/10/13 00:23:45
Basically, in all cases where this branch is taken
|
| + } |
| + |
| + if c == '\n' { |
| + l.lineNo += 1 |
| + l.lineOffset = 0 |
| + } else { |
| + l.lineOffset += width |
| + } |
| + l.offset += width |
| +} |
| + |
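On the width == 0 question in the thread above: utf8.DecodeRuneInString returns width 0 only for empty input, i.e. when the source is fully consumed, which is why Consume can simply return in that case. An invalid byte instead decodes to RuneError with width 1, so Consume still makes one byte of progress on malformed input. A stand-alone check of the standard library behavior:

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        // Empty input (source fully consumed): width is 0.
        r, w := utf8.DecodeRuneInString("")
        fmt.Println(r == utf8.RuneError, w) // true 0

        // Invalid encoding: RuneError with width 1, so a consumer
        // still advances one byte at a time.
        r, w = utf8.DecodeRuneInString("\xff")
        fmt.Println(r == utf8.RuneError, w) // true 1
    }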
| +// Peek returns the next rune in the source. |
| +func (l *lexer) Peek() rune { |
| + char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) |
|
rudominer
2015/10/12 18:06:05
What if char == RuneError?
azani
2015/10/13 00:23:45
We treat it like any other rune. It looks like Run
|
| + return char |
| +} |
| + |
| +// IsEos returns true if the whole source has been consumed, false |
| +// otherwise. |
| +func (l *lexer) IsEos() bool { |
| + return l.offset >= len(l.source) |
|
rudominer
2015/10/12 18:06:05
Note that this depends on l.offset being a number
azani
2015/10/13 00:23:46
Yes, since strings index based on bytes, not runes
|
| +} |
| + |
| +// run is the lexer's main loop. |
| +func (l *lexer) run() { |
| + // This is a state machine. |
|
rudominer
2015/10/12 18:06:06
"This" is ambiguous here. Maybe "We are implementi
azani
2015/10/13 00:23:45
Done.
|
| + // lexRoot is the beginning state. |
| + // nil is the end state. |
| + // States are functions which are called on the lexer. They return the |
| + // next state. |
| + for state := lexRoot; state != nil; { |
| + state = state(l) |
| + } |
| + close(l.tokens) |
| +} |
| + |
| +// A stateFn represents a state in the lexer state machine. |
| +type stateFn func(*lexer) stateFn |
| + |
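run plus stateFn is the function-as-state pattern from the talk linked in the header comment: each state does some work and returns the next state, and the driver loops until nil. A minimal sketch with toy states (the counter and state names are invented for illustration):

    package main

    import "fmt"

    // counter stands in for the lexer; states advance it.
    type counter struct{ n int }

    type stateFn func(*counter) stateFn

    func even(c *counter) stateFn {
        fmt.Println(c.n, "is even")
        c.n++
        if c.n > 4 {
            return nil // nil is the end state, as in run()
        }
        return odd
    }

    func odd(c *counter) stateFn {
        fmt.Println(c.n, "is odd")
        c.n++
        return even
    }

    func main() {
        // The same driver loop as run(): call the current state and
        // let it pick the next one.
        c := &counter{}
        for state := even; state != nil; {
            state = state(c)
        }
    }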
| +// This is the beginning state and also the state in between tokens. |
|
rudominer
2015/10/12 18:06:05
Nit. In my understanding of a DFA there is no "sta
azani
2015/10/13 00:23:45
I meant returned to after most tokens are emitted.
|
| +func lexRoot(l *lexer) stateFn { |
| + if l.offset >= len(l.source) { |
|
rudominer
2015/10/12 18:06:05
You have a function IsEos() for this, no?
azani
2015/10/13 00:23:45
Done.
|
| + return nil |
| + } |
| + |
| + switch c := l.Peek(); { |
| + case isSingleCharTokens(c): |
| + return lexSingleCharTokens |
| + case isEqualsOrResponseStart(c): |
| + return lexEqualsOrResponse |
| + case isNameStart(c): |
| + return lexName |
| + case isOrdinalStart(c): |
| + return lexOrdinal |
| + case isNumberStart(c): |
| + return lexNumber |
| + case isStringStart(c): |
| + return lexString |
| + case isSkippable(c): |
| + return lexSkip |
| + case isMaybeComment(c): |
| + return lexComment |
| + } |
| + |
| + l.Consume() |
| + l.emitToken(ERROR_ILLEGAL_CHAR) |
| + return nil |
| +} |
| + |
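A worked trace for the source "module foo;": lexRoot peeks 'm' and dispatches to lexName, which emits MODULE via the keyword lookup; back in lexRoot, the space routes to lexSkip; lexName then emits NAME for "foo"; lexSingleCharTokens emits SEMI; finally lexRoot sees the end of the source and returns nil, ending run's loop and closing the token channel.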
| +// isSkippable determines if a rune is skippable. |
| +func isSkippable(c rune) bool { |
| + return c == ' ' || c == '\t' || c == '\r' || c == '\n' |
| +} |
| + |
| +// lexSkip consumes skippable runes. |
| +func lexSkip(l *lexer) stateFn { |
| + for isSkippable(l.Peek()) { |
| + l.Consume() |
| + } |
| + l.startToken() |
| + return lexRoot |
| +} |
| + |
| +// singleCharTokens is a map of single-rune tokens. |
| +var singleCharTokens = map[rune]TokenKind{ |
| + '(': LPAREN, |
| + ')': RPAREN, |
| + '[': LBRACKET, |
| + ']': RBRACKET, |
| + '{': LBRACE, |
| + '}': RBRACE, |
| + '<': LANGLE, |
| + '>': RANGLE, |
| + ';': SEMI, |
| + ',': COMMA, |
| + '.': DOT, |
| + '-': MINUS, |
| + '+': PLUS, |
| + '&': AMP, |
| + '?': QSTN, |
| +} |
| + |
| +// isSingleCharTokens returns true if the rune is a single character token. |
| +func isSingleCharTokens(c rune) bool { |
| + _, ok := singleCharTokens[c] |
| + return ok |
| +} |
| + |
| +// lexSingleCharTokens lexes single character tokens. |
| +func lexSingleCharTokens(l *lexer) stateFn { |
| + c := l.Peek() |
| + l.Consume() |
| + t := singleCharTokens[c] |
| + l.emitToken(t) |
| + |
| + return lexRoot |
| +} |
| + |
| +// isEqualsOrResponseStart returns true if the rune corresponds to the |
| +// beginning of either the '=' or '=>' tokens. |
| +func isEqualsOrResponseStart(c rune) bool { |
| + return c == '=' |
| +} |
| + |
| +// lexEqualsOrResponse lexes the '=' or the '=>' token. |
| +func lexEqualsOrResponse(l *lexer) stateFn { |
| + l.Consume() |
| + |
| + if l.Peek() == '>' { |
| + l.Consume() |
| + l.emitToken(RESPONSE) |
| + } else { |
| + l.emitToken(EQUALS) |
| + } |
| + |
| + return lexRoot |
| +} |
| + |
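For example, in "x => y" this state consumes '=', peeks '>', consumes it, and emits a single RESPONSE token; in "x = y" the peek sees a space, so only EQUALS is emitted and the space is left for lexSkip. One rune of lookahead suffices because no other token in this lexer starts with '='.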
| +// isAlpha returns true if the rune is a letter of the alphabet. |
| +func isAlpha(c rune) bool { |
| + return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) |
| +} |
| + |
| +// isDigit returns true if the rune is a digit. |
| +func isDigit(c rune) bool { |
| + return ('0' <= c && c <= '9') |
| +} |
| + |
| +// isHexDigit returns true if the rune is a hexadecimal digit. |
| +func isHexDigit(c rune) bool { |
| + return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') |
| +} |
| + |
| +// isNameStart returns true if the rune is the beginning of a NAME token. |
| +func isNameStart(c rune) bool { |
| + return isAlpha(c) || c == '_' |
| +} |
| + |
| +// keywordTokens maps keywords to their associated tokens. |
| +var keywordTokens = map[string]TokenKind{ |
| + "import": IMPORT, |
| + "module": MODULE, |
| + "struct": STRUCT, |
| + "union": UNION, |
| + "interface": INTERFACE, |
| + "enum": ENUM, |
| + "const": CONST, |
| + "true": TRUE, |
| + "false": FALSE, |
| + "default": DEFAULT, |
| +} |
| + |
| +// lexName lexes valid C identifiers. (K&R2: A.2.3) |
| +func lexName(l *lexer) stateFn { |
| + l.Consume() |
| + |
| + // isNameRune returns true if the rune is valid in a NAME token. |
| + isNameRune := func(c rune) bool { |
| + return isAlpha(c) || isDigit(c) || c == '_' |
| + } |
| + |
| + for isNameRune(l.Peek()) { |
| + l.Consume() |
| + } |
| + |
| + // Emit the appropriate keyword token if the current name is a |
| + // keyword, or a NAME token otherwise. |
| + if token, found := keywordTokens[l.CurText()]; found { |
| + l.emitToken(token) |
| + } else { |
| + l.emitToken(NAME) |
| + } |
| + |
| + return lexRoot |
| +} |
| + |
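Once the maximal identifier has been scanned, the keyword test is a single map lookup. A self-contained sketch of that rule, with string values standing in for the TokenKind constants:

    package main

    import "fmt"

    // keywords mirrors keywordTokens; the values are just labels here.
    var keywords = map[string]string{
        "struct": "STRUCT",
        "enum":   "ENUM",
    }

    // classify applies lexName's rule: scan first, then let one map
    // lookup decide between a keyword kind and a generic NAME.
    func classify(ident string) string {
        if kind, found := keywords[ident]; found {
            return kind
        }
        return "NAME"
    }

    func main() {
        fmt.Println(classify("struct"))   // STRUCT
        fmt.Println(classify("MyStruct")) // NAME
    }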
| +// isOrdinalStart returns true if the rune is the start of an ORDINAL |
| +// token. |
| +func isOrdinalStart(c rune) bool { |
| + return '@' == c |
| +} |
| + |
| +// lexOrdinal lexes an ORDINAL token. Ordinals are an '@' followed by |
| +// one or more digits. |
| +func lexOrdinal(l *lexer) stateFn { |
| + // Consume the '@'. |
| + l.Consume() |
| + |
| + for isDigit(l.Peek()) { |
| + l.Consume() |
| + } |
| + |
| + l.emitToken(ORDINAL) |
| + |
| + return lexRoot |
| +} |
| + |
| +// isNumberStart returns true if the rune is the beginning of a number. |
| +func isNumberStart(c rune) bool { |
| + // Even hexadecimals must start with a digit (namely 0). |
| + return isDigit(c) |
| +} |
| + |
| +// lexNumber lexes a number token. |
| +func lexNumber(l *lexer) stateFn { |
| + // If the number starts with 0, it is either 0 itself, a hexadecimal, |
| + // or a float, but not a multi-digit decimal integer. |
| + if l.Peek() == '0' { |
| + return lexNumberStartWithZero |
| + } |
| + return lexDec |
| +} |
| + |
| +// lexDec lexes a base-10 number. |
| +func lexDec(l *lexer) stateFn { |
| + for isDigit(l.Peek()) { |
| + l.Consume() |
| + } |
| + |
| + // If a decimal part is found, transition to the decimal state. |
| + if isDecimalPartStart(l.Peek()) { |
| + return lexDecimalPart |
| + } |
| + |
| + l.emitToken(INT_CONST_DEC) |
| + |
| + return lexRoot |
| +} |
| + |
| +// lexNumberStartWithZero lexes hexadecimals, some floats, or 0. |
| +func lexNumberStartWithZero(l *lexer) stateFn { |
| + // Consume the starting 0 |
| + l.Consume() |
| + |
| + // Check whether this is the hexadecimal or the floating point case. |
| + switch c := l.Peek(); { |
| + case c == 'x' || c == 'X': |
| + return lexHexNumber |
| + case isDecimalPartStart(c): |
| + return lexDecimalPart |
| + } |
| + |
| + // Found a naked 0. |
| + l.emitToken(INT_CONST_DEC) |
| + |
| + return lexRoot |
| +} |
| + |
| +// lexHexNumber lexes hexadecimal integers. |
| +func lexHexNumber(l *lexer) stateFn { |
| + // Consume the x or X |
| + l.Consume() |
| + |
| + for isHexDigit(l.Peek()) { |
| + l.Consume() |
| + } |
| + |
| + l.emitToken(INT_CONST_HEX) |
| + |
| + return lexRoot |
| +} |
| + |
| +// isDecimalPartStart returns true if the rune represents the start of |
| +// the decimal part of a floating point number. |
| +func isDecimalPartStart(c rune) bool { |
| + return c == '.' || c == 'e' || c == 'E' |
| +} |
| + |
| +// lexDecimalPart lexes the decimal part of a floating point number. |
| +func lexDecimalPart(l *lexer) stateFn { |
| + // Consume '.' or 'e' or 'E' |
| + l.Consume() |
| + |
| + if c := l.Peek(); c == '+' || c == '-' { |
| + l.Consume() |
| + } |
| + |
| + for isDigit(l.Peek()) { |
| + l.Consume() |
| + } |
| + |
| + l.emitToken(FLOAT_CONST) |
| + |
| + return lexRoot |
| +} |
| + |
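Taken together, the number states classify a literal by its leading '0' and by whether a '.', 'e', or 'E' appears. A simplified stand-alone mirror of that dispatch (no validation; the kind names are illustrative labels):

    package main

    import "fmt"

    // classifyNumber compresses lexNumber, lexNumberStartWithZero,
    // lexHexNumber, lexDec, and lexDecimalPart into one function.
    func classifyNumber(s string) string {
        if s == "0" {
            return "INT_CONST_DEC" // a naked 0
        }
        if len(s) > 1 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X') {
            return "INT_CONST_HEX"
        }
        for _, c := range s {
            if c == '.' || c == 'e' || c == 'E' {
                return "FLOAT_CONST"
            }
        }
        return "INT_CONST_DEC"
    }

    func main() {
        for _, s := range []string{"0", "0x1F", "42", "3.14", "5e-3", "0.5"} {
            fmt.Println(s, "->", classifyNumber(s))
        }
    }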
| +// isStringStart returns true if the rune represents the start of a string. |
| +func isStringStart(c rune) bool { |
| + return '"' == c |
| +} |
| + |
| +// lexString lexes a quoted string. |
| +func lexString(l *lexer) stateFn { |
| + // Consume opening quotes. |
| + l.Consume() |
| + |
| + for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { |
| + if l.Peek() == '\\' { |
| + // If we see an escape character, consume whatever follows blindly. |
| + // TODO(azani): Consider parsing escape sequences. |
| + l.Consume() |
| + } |
| + l.Consume() |
| + } |
| + |
| + if l.IsEos() || l.Peek() == '\n' { |
| + l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL) |
| + return nil |
| + } |
| + |
| + // Consume the closing quotes |
| + l.Consume() |
| + |
| + l.emitToken(STRING_LITERAL) |
| + |
| + return lexRoot |
| +} |
| + |
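The backslash rule means an escaped quote cannot close the literal. A byte-wise mirror of the scanning loop (the real code walks runes, and the helper below is invented for illustration):

    package main

    import "fmt"

    // scanString returns the literal's length in bytes and whether it
    // was terminated. After a backslash the next byte is consumed
    // blindly, exactly as in lexString.
    func scanString(s string) (length int, terminated bool) {
        i := 1 // skip the opening quote
        for i < len(s) && s[i] != '"' && s[i] != '\n' {
            if s[i] == '\\' {
                i++ // skip the escape character itself
            }
            i++ // consume the escaped (or plain) byte
        }
        if i >= len(s) || s[i] == '\n' {
            return i, false // unterminated
        }
        return i + 1, true // count the closing quote
    }

    func main() {
        n, ok := scanString(`"a\"b"`)
        fmt.Println(n, ok) // 6 true: the escaped quote stays inside
    }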
| +// isMaybeComment returns true if the rune may be the beginning of a |
| +// comment. |
| +func isMaybeComment(c rune) bool { |
| + return c == '/' |
| +} |
| + |
| +// lexComment consumes a single-line or multi-line comment. |
| +func lexComment(l *lexer) stateFn { |
| + // Consume the '/'. |
| + l.Consume() |
| + |
| + switch l.Peek() { |
| + case '/': |
| + return lexSingleLineComment |
| + case '*': |
| + return lexMultiLineComment |
| + } |
| + |
| + l.emitToken(ERROR_ILLEGAL_CHAR) |
| + return nil |
| +} |
| + |
| +// lexSingleLineComment consumes a single-line comment. |
| +func lexSingleLineComment(l *lexer) stateFn { |
| + // Consume the '/' |
| + l.Consume() |
| + |
| + for !l.IsEos() && l.Peek() != '\n' { |
| + l.Consume() |
| + } |
| + |
| + l.startToken() |
| + return lexRoot |
| +} |
| + |
| +// lexMultiLineComment consumes a multi-line comment. |
| +func lexMultiLineComment(l *lexer) stateFn { |
| + // Consume the '*'. |
| + l.Consume() |
| + |
| + for !l.IsEos() { |
| + if l.Peek() == '*' { |
| + return lexPossibleEndOfComment |
| + } |
| + l.Consume() |
| + } |
| + |
| + l.emitToken(ERROR_UNTERMINATED_COMMENT) |
| + return nil |
| +} |
| + |
| +// lexPossibleEndOfComment consumes the possible end of a multi-line |
| +// comment and determines whether the comment did in fact end. |
| +func lexPossibleEndOfComment(l *lexer) stateFn { |
| + // Consume the '*' |
| + l.Consume() |
| + |
| + if l.IsEos() { |
| + l.emitToken(ERROR_UNTERMINATED_COMMENT) |
| + return nil |
| + } |
| + |
| + if l.Peek() == '/' { |
| + l.Consume() |
| + l.startToken() |
| + return lexRoot |
| + } |
| + |
| + // A further '*' may still begin the comment terminator (as in "**/"), |
| + // so stay in this state; returning to lexMultiLineComment here would |
| + // consume that '*' blindly and miss the end of the comment. |
| + if l.Peek() == '*' { |
| + return lexPossibleEndOfComment |
| + } |
| + |
| + return lexMultiLineComment |
| +} |
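A worked example for the '*' check above: in "/* a **/", lexMultiLineComment stops at the first '*' and hands off; this state consumes it, peeks the second '*', and stays put; the next pass consumes that '*', peeks '/', consumes it, and returns to lexRoot. Returning straight to lexMultiLineComment instead would consume the second '*' blindly and the terminator would be missed.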