Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..f918b3bc4eb6e8a4f76ada2f477ce84b01b6fb82
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,500 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To call the lexer, call Tokenize with the source string to obtain
rudominer
2015/10/12 18:06:05
To use the lexer, call...
azani
2015/10/13 00:23:46
Done.
+// a TokenStream. The lexer is run concurrently so you should be able to
+// use the TokenStream before the lexer is done with the source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune as a parameter and return true if the state machine should
+// transition to state Foo. Some states do not have such functions on
+// account of the transition condition being trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+	"unicode/utf8"
+)
+
+// Tokenize accepts a source string and returns the tokens which compose
rudominer
2015/10/12 18:06:05
The phrasing belies the asynchronous nature.
azani
2015/10/13 00:23:46
Done.
+// the source string.
+func Tokenize(source string) TokenStream {
+	tokens := make(chan Token)
+	l := lexer{source: source, tokens: tokens}
+	go l.run()
+	return &TokenChan{tokenChan: tokens}
+}
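Tokenize returns as soon as the lexing goroutine is launched, so callers can consume tokens while lexing is still in progress. A minimal, self-contained sketch of the same producer/consumer pattern (the types below are stand-ins, not this package's Token or TokenStream):

    package main

    import "fmt"

    // token is a stand-in for the package's Token type.
    type token struct {
        kind, text string
    }

    // tokenize mirrors the shape of Tokenize above: start the producer
    // goroutine, then immediately return the receive side of the channel.
    func tokenize(source string) <-chan token {
        out := make(chan token)
        go func() {
            defer close(out)
            // A real lexer would walk source rune by rune; this stub
            // emits a single dummy token.
            out <- token{kind: "NAME", text: source}
        }()
        return out
    }

    func main() {
        // The loop ends when the producer closes the channel, just as
        // run() closes l.tokens once it reaches the nil state.
        for t := range tokenize("foo") {
            fmt.Printf("%s(%q)\n", t.kind, t.text)
        }
    }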
+
+type lexer struct {
+	// source is the source code to be lexed.
rudominer
2015/10/12 18:06:05
nit: You don't need to repeat the variable name within the comment.
azani
2015/10/12 18:18:07
Hey Matt, could you settle that question for us?
+	source string
+
+	// offset is the number of characters that have been consumed.
rudominer
2015/10/12 18:06:05
Let's follow the go standard terminology and avoid "characters" here; this is a count of bytes.
azani
2015/10/13 00:23:45
Done.
+	offset int
+
+	// lineNo is the current line number.
+	lineNo int
+
+	// lineOffset is how many characters have been consumed since the
+	// beginning of the line.
+	lineOffset int
rudominer
2015/10/12 18:06:05
I think it would be clearer if we didn't overuse the word "offset".
azani
2015/10/13 00:23:45
Done.
+
+	// start is the index in source where the current token begins.
rudominer
2015/10/12 18:06:05
Maybe currentTokenStartIndex
or currentTokenStart?
azani
2015/10/13 00:23:46
Done.
+	start int
+
+	// lineStart is the line number on which the current token begins.
+	lineStart int
rudominer
2015/10/12 18:06:05
Maybe currentTokenLineNumber?
azani
2015/10/13 00:23:45
Done.
+
+	// lineOffsetStart is the number of characters since the beginning
+	// of the line where the current token begins.
+	lineOffsetStart int
rudominer
2015/10/12 18:06:05
Maybe currentTokenColumn?
azani
2015/10/13 00:23:45
Done.
+
+	// tokens is a channel on which the found tokens are emitted.
rudominer
2015/10/12 18:06:05
maybe "to which the found..."
azani
2015/10/13 00:23:45
Done.
+	tokens chan Token
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+	return l.source[l.start:l.offset]
+}
+
+// emitToken emits the current token and begins the new token.
rudominer
2015/10/12 18:06:05
begins a new token
azani
2015/10/13 00:23:46
Done.
+func (l *lexer) emitToken(tokenType TokenKind) {
+	l.tokens <- Token{
+		Kind:    tokenType,
+		Text:    l.source[l.start:l.offset],
+		CharPos: l.start,
+		LineNo:  l.lineStart,
+		LinePos: l.lineOffsetStart}
+	l.startToken()
+}
+
+// startToken starts the new token.
rudominer
2015/10/12 18:06:05
starts a new token
azani
2015/10/13 00:23:45
Done.
+func (l *lexer) startToken() {
+	l.start = l.offset
+	l.lineStart = l.lineNo
+	l.lineOffsetStart = l.lineOffset
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+	if width == 0 {
rudominer
2015/10/12 18:06:05
What is your thinking here? Can width=0 without c == RuneError?
azani
2015/10/13 00:23:45
RuneError is just a special rune. It is handled the same way as any other rune.
+		width = 1
mattr
2015/10/12 18:04:13
This struck me as strange, this means the input was not valid UTF-8?
azani
2015/10/13 00:23:45
Basically, in all cases where this branch is taken we still want to advance by at least one byte so the lexer keeps making progress.
+	}
+
+	if c == '\n' {
+		l.lineNo += 1
+		l.lineOffset = 0
+	} else {
+		l.lineOffset += width
+	}
+	l.offset += width
+}
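On the width == 0 question above: utf8.DecodeRuneInString returns (RuneError, 0) only for an empty input string and (RuneError, 1) for an invalid encoding, so forcing width to 1 simply guarantees that Consume always advances. A quick standalone check:

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        // A valid multi-byte rune: width is its encoded length in bytes.
        r, w := utf8.DecodeRuneInString("é=1")
        fmt.Println(string(r), w) // é 2

        // Empty input: the only case where width is 0.
        r, w = utf8.DecodeRuneInString("")
        fmt.Println(r == utf8.RuneError, w) // true 0

        // Invalid UTF-8: RuneError with width 1, so progress is still made.
        r, w = utf8.DecodeRuneInString("\xff")
        fmt.Println(r == utf8.RuneError, w) // true 1
    }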
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
rudominer
2015/10/12 18:06:05
What if char == RuneError?
azani
2015/10/13 00:23:45
We treat it like any other rune. It looks like RuneError is handled correctly by the rest of the lexer.
+	return char
+}
+
+// IsEos returns true if the whole source has been consumed, false
+// otherwise.
+func (l *lexer) IsEos() bool {
+	return l.offset >= len(l.source)
rudominer
2015/10/12 18:06:05
Note that this depends on l.offset being a number of bytes rather than a number of runes.
azani
2015/10/13 00:23:46
Yes, since strings index based on bytes, not runes.
+}
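As the exchange above notes, offset counts bytes, which matches how Go strings index and measure length; a multi-byte rune spans several offsets. For example:

    package main

    import "fmt"

    func main() {
        s := "π=1" // 'π' encodes as two bytes in UTF-8
        fmt.Println(len(s)) // 4, not 3
        fmt.Println(s[2:3]) // "=": byte offset 2, even though it is rune #1
    }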
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+	// This is a state machine.
rudominer
2015/10/12 18:06:06
"This" is ambiguous here. Maybe "We are implementing a state machine."
azani
2015/10/13 00:23:45
Done.
+	// lexRoot is the beginning state.
+	// nil is the end state.
+	// States are functions which are called on the lexer. They return the
+	// next state.
+	for state := lexRoot; state != nil; {
+		state = state(l)
+	}
+	close(l.tokens)
+}
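The stateFn pattern is easier to see at toy scale. A self-contained two-state machine with the same driver loop as run() (names are illustrative only, not part of this CL):

    package main

    import "fmt"

    type machine struct{ steps int }

    // A state does some work and returns the next state; nil halts.
    type stateFn func(*machine) stateFn

    func stateA(m *machine) stateFn {
        m.steps++
        fmt.Println("A")
        return stateB
    }

    func stateB(m *machine) stateFn {
        m.steps++
        fmt.Println("B")
        if m.steps >= 4 {
            return nil
        }
        return stateA
    }

    func main() {
        m := &machine{}
        // Same loop as run(): call the current state until one returns nil.
        for state := stateA; state != nil; {
            state = state(m)
        }
    }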
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// This is the beginning state and also the state in between tokens.
rudominer
2015/10/12 18:06:05
Nit. In my understanding of a DFA there is no "state in between tokens".
azani
2015/10/13 00:23:45
I meant returned to after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+	if l.offset >= len(l.source) {
rudominer
2015/10/12 18:06:05
You have a function IsEos() for this, no?
azani
2015/10/13 00:23:45
Done.
+		return nil
+	}
+
+	switch c := l.Peek(); {
+	case isSingleCharTokens(c):
+		return lexSingleCharTokens
+	case isEqualsOrResponseStart(c):
+		return lexEqualsOrResponse
+	case isNameStart(c):
+		return lexName
+	case isOrdinalStart(c):
+		return lexOrdinal
+	case isNumberStart(c):
+		return lexNumber
+	case isStringStart(c):
+		return lexString
+	case isSkippable(c):
+		return lexSkip
+	case isMaybeComment(c):
+		return lexComment
+	}
+
+	l.Consume()
+	l.emitToken(ERROR_ILLEGAL_CHAR)
+	return nil
+}
+
+// isSkippable determines if a rune is skippable.
+func isSkippable(c rune) bool {
+	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+	for isSkippable(l.Peek()) {
+		l.Consume()
+	}
+	l.startToken()
+	return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+	'(': LPAREN,
+	')': RPAREN,
+	'[': LBRACKET,
+	']': RBRACKET,
+	'{': LBRACE,
+	'}': RBRACE,
+	'<': LANGLE,
+	'>': RANGLE,
+	';': SEMI,
+	',': COMMA,
+	'.': DOT,
+	'-': MINUS,
+	'+': PLUS,
+	'&': AMP,
+	'?': QSTN,
+}
+
+// isSingleCharTokens returns true if the rune is a single character token.
+func isSingleCharTokens(c rune) bool {
+	_, ok := singleCharTokens[c]
+	return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+	c := l.Peek()
+	l.Consume()
+	t, _ := singleCharTokens[c]
+	l.emitToken(t)
+
+	return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+	return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+	l.Consume()
+
+	if l.Peek() == '>' {
+		l.Consume()
+		l.emitToken(RESPONSE)
+	} else {
+		l.emitToken(EQUALS)
+	}
+
+	return lexRoot
+}
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+	return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+	return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a NAME token.
+func isNameStart(c rune) bool {
+	return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+	"import":    IMPORT,
+	"module":    MODULE,
+	"struct":    STRUCT,
+	"union":     UNION,
+	"interface": INTERFACE,
+	"enum":      ENUM,
+	"const":     CONST,
+	"true":      TRUE,
+	"false":     FALSE,
+	"default":   DEFAULT,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+	l.Consume()
+
+	// isNameRune returns true if the rune is valid in a NAME token.
+	isNameRune := func(c rune) bool {
+		return isAlpha(c) || isDigit(c) || c == '_'
+	}
+
+	for isNameRune(l.Peek()) {
+		l.Consume()
+	}
+
+	// Emit the appropriate keyword token if the current name is a
+	// keyword or a NAME token otherwise.
+	if token, found := keywordTokens[l.CurText()]; found {
+		l.emitToken(token)
+	} else {
+		l.emitToken(NAME)
+	}
+
+	return lexRoot
+}
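The keyword check matches the whole identifier, so a name that merely starts with a keyword stays a NAME. A condensed stand-alone illustration of that lookup (string kinds stand in for TokenKind):

    package main

    import "fmt"

    var keywords = map[string]string{
        "struct": "STRUCT",
        "import": "IMPORT",
    }

    // classify mirrors the tail of lexName: a keyword map hit wins,
    // otherwise the identifier is a plain NAME.
    func classify(ident string) string {
        if kind, ok := keywords[ident]; ok {
            return kind
        }
        return "NAME"
    }

    func main() {
        fmt.Println(classify("struct"))  // STRUCT
        fmt.Println(classify("Foo"))     // NAME
        fmt.Println(classify("structs")) // NAME: no prefix matching
    }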
+
+// isOrdinalStart returns true if the rune is the start of an ORDINAL
+// token.
+func isOrdinalStart(c rune) bool {
+	return '@' == c
+}
+
+// lexOrdinal lexes an ORDINAL token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+	// Consume the '@'.
+	l.Consume()
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(ORDINAL)
+
+	return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+	// Even hexadecimals must start with a digit (namely 0).
+	return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+	// If the number starts with 0 it cannot be a decimal integer.
+	if l.Peek() == '0' {
+		return lexNumberStartWithZero
+	}
+	return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	// If a decimal part is found, transition to the decimal state.
+	if isDecimalPartStart(l.Peek()) {
+		return lexDecimalPart
+	}
+
+	l.emitToken(INT_CONST_DEC)
+
+	return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+	// Consume the starting 0
+	l.Consume()
+
+	// Here we check to see whether we are in the hexadecimal or floating
+	// point case.
+	switch c := l.Peek(); {
+	case c == 'x' || c == 'X':
+		return lexHexNumber
+	case isDecimalPartStart(c):
+		return lexDecimalPart
+	}
+
+	// Found a naked 0.
+	l.emitToken(INT_CONST_DEC)
+
+	return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+	// Consume the x or X
+	l.Consume()
+
+	for isHexDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(INT_CONST_HEX)
+
+	return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the start of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+	return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+	// Consume '.' or 'e' or 'E'
+	l.Consume()
+
+	if c := l.Peek(); c == '+' || c == '-' {
+		l.Consume()
+	}
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(FLOAT_CONST)
+
+	return lexRoot
+}
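Taken together, the number states assign each literal exactly one kind. A condensed restatement of the decision tree over an already-scanned literal (string kinds stand in for the real TokenKind values):

    package main

    import (
        "fmt"
        "strings"
    )

    // kindOf collapses lexNumber, lexNumberStartWithZero, lexDec and
    // lexDecimalPart into a single decision over the finished literal.
    func kindOf(lit string) string {
        switch {
        case strings.HasPrefix(lit, "0x") || strings.HasPrefix(lit, "0X"):
            return "INT_CONST_HEX"
        case strings.ContainsAny(lit, ".eE"):
            return "FLOAT_CONST"
        default:
            return "INT_CONST_DEC"
        }
    }

    func main() {
        for _, lit := range []string{"0", "42", "0x1F", "3.14", "1e-3"} {
            fmt.Println(lit, "->", kindOf(lit))
        }
    }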
+
+// isStringStart returns true if the rune represents the start of a string.
+func isStringStart(c rune) bool {
+	return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+	// Consume opening quotes.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+		if l.Peek() == '\\' {
+			// If we see an escape character consume whatever follows blindly.
+			// TODO(azani): Consider parsing escape sequences.
+			l.Consume()
+		}
+		l.Consume()
+	}
+
+	if l.IsEos() || l.Peek() == '\n' {
+		l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL)
+		return nil
+	}
+
+	// Consume the closing quotes
+	l.Consume()
+
+	l.emitToken(STRING_LITERAL)
+
+	return lexRoot
+}
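Because escapes are consumed blindly, the emitted STRING_LITERAL text is the raw source slice, quotes and backslashes included. A simplified byte-oriented version of the same scan (the real lexer works on runes, but every delimiter involved is ASCII):

    package main

    import "fmt"

    // scanString returns the index just past the closing quote of the
    // literal starting at s[0], or -1 if it is unterminated.
    func scanString(s string) int {
        i := 1 // skip the opening quote
        for i < len(s) && s[i] != '"' && s[i] != '\n' {
            if s[i] == '\\' {
                i++ // skip the backslash; take the escaped byte blindly
            }
            i++
        }
        if i >= len(s) || s[i] == '\n' {
            return -1
        }
        return i + 1 // step past the closing quote
    }

    func main() {
        src := `"a \" b" rest`
        if end := scanString(src); end >= 0 {
            fmt.Println(src[:end]) // "a \" b" (backslash kept verbatim)
        }
    }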
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+	return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+	// Consume the '/'.
+	l.Consume()
+
+	switch l.Peek() {
+	case '/':
+		return lexSingleLineComment
+	case '*':
+		return lexMultiLineComment
+	}
+
+	l.emitToken(ERROR_ILLEGAL_CHAR)
+	return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+	// Consume the '/'
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '\n' {
+		l.Consume()
+	}
+
+	l.startToken()
+	return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	for !l.IsEos() {
+		if l.Peek() == '*' {
+			return lexPossibleEndOfComment
+		}
+		l.Consume()
+	}
+
+	l.emitToken(ERROR_UNTERMINATED_COMMENT)
+	return nil
+}
+
+// lexPossibleEndOfComment consumes the possible end of a multiline
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+	// Consume the '*'
+	l.Consume()
+
+	if l.IsEos() {
+		l.emitToken(ERROR_UNTERMINATED_COMMENT)
+		return nil
+	}
+
+	if l.Peek() == '/' {
+		l.Consume()
+		l.startToken()
+		return lexRoot
+	}
+
+	return lexMultiLineComment
+}
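Note that both comment paths end with startToken rather than emitToken, so well-formed comments never appear in the token stream. The back-and-forth between lexMultiLineComment and lexPossibleEndOfComment amounts to scanning for the first "*/"; a compact equivalent:

    package main

    import (
        "fmt"
        "strings"
    )

    // scanBlockComment returns the index just past "*/", or -1 if the
    // comment never terminates (the ERROR_UNTERMINATED_COMMENT case).
    // s starts just after the opening "/*".
    func scanBlockComment(s string) int {
        if i := strings.Index(s, "*/"); i >= 0 {
            return i + 2
        }
        return -1
    }

    func main() {
        fmt.Println(scanBlockComment(" body */ rest")) // 8
        fmt.Println(scanBlockComment(" never ends"))   // -1
    }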