Chromium Code Reviews

Unified Diff: mojom/mojom_parser/lexer/lexer.go

Issue 1387893002: New lexer for mojom written in go. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 2 months ago
Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..f918b3bc4eb6e8a4f76ada2f477ce84b01b6fb82
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,500 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To call the lexer, call Tokenize with the source string to obtain
rudominer 2015/10/12 18:06:05 To use the lexer, call...
azani 2015/10/13 00:23:46 Done.
+// a TokenStream. The lexer is run concurrently so you should be able to
+// use the TokenStream before the lexer is done with the source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune as a parameter and return true if the state machine should
+// transition to state Foo. Some states do not have such functions on
+// account of the transition condition being trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+ "unicode/utf8"
+)
+
+// Tokenize accepts a source string and returns the tokens which compose
rudominer 2015/10/12 18:06:05 The phrasing belies the asynchronous nature. Also
azani 2015/10/13 00:23:46 Done.
+// the source string.
+func Tokenize(source string) TokenStream {
+ tokens := make(chan Token)
+ l := lexer{source: source, tokens: tokens}
+ go l.run()
+ return &TokenChan{tokenChan: tokens}
+}
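[Editorial note: because the channel is unbuffered, the lexing goroutine and the consumer proceed in lock-step, which is why the returned TokenStream is usable before lexing completes. A minimal standalone sketch of the same producer/consumer shape; names such as miniToken are illustrative, not this package's API.]

    package main

    import "fmt"

    // miniToken is an illustrative stand-in for the package's Token type.
    type miniToken struct{ Text string }

    // tokenize mirrors Tokenize's shape: start a producer goroutine and
    // hand the consumer the receiving end of the channel immediately.
    func tokenize(words []string) <-chan miniToken {
        out := make(chan miniToken)
        go func() {
            defer close(out) // analogous to close(l.tokens) in run()
            for _, w := range words {
                out <- miniToken{Text: w} // blocks until the consumer reads
            }
        }()
        return out
    }

    func main() {
        for tok := range tokenize([]string{"interface", "Foo", "{"}) {
            fmt.Println(tok.Text)
        }
    }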
+
+type lexer struct {
+ // source is the source code to be lexed.
rudominer 2015/10/12 18:06:05 nit: You don't need to repeat the variable name wi
azani 2015/10/12 18:18:07 Hey Matt, could you settle that question for us? I
+ source string
+
+ // offset is the number of characters that have been consumed.
rudominer 2015/10/12 18:06:05 Let's follow the go standard terminology and avoid
azani 2015/10/13 00:23:45 Done.
+ offset int
+
+ // lineno is the current line number.
+ lineNo int
+
+ // lineOffset is how many characters have been consumed since the
+ // beginning of the line.
+ lineOffset int
rudominer 2015/10/12 18:06:05 I think it would be clearer if we didn't over use
azani 2015/10/13 00:23:45 Done.
+
+ // start is the index in source where the current token begins.
rudominer 2015/10/12 18:06:05 Maybe currentTokenStartIndex or currentTokenStart?
azani 2015/10/13 00:23:46 Done.
+ start int
+
+ // lineStart is the line number on which the current token begins.
+ lineStart int
rudominer 2015/10/12 18:06:05 Maybe currentTokenLineNumber?
azani 2015/10/13 00:23:45 Done.
+
+ // lineOffsetStart is the number of characters since the beginning
+ // of the line where the current token begins.
+ lineOffsetStart int
rudominer 2015/10/12 18:06:05 Maybe currentTokenColumn?
azani 2015/10/13 00:23:45 Done.
+
+ // tokens is a channel on which the found tokens are emitted.
rudominer 2015/10/12 18:06:05 maybe "to which the found..."
azani 2015/10/13 00:23:45 Done.
+ tokens chan Token
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+ return l.source[l.start:l.offset]
+}
+
+// emitToken emits the current token and begins the new token.
rudominer 2015/10/12 18:06:05 begins a new token
azani 2015/10/13 00:23:46 Done.
+func (l *lexer) emitToken(tokenType TokenKind) {
+ l.tokens <- Token{
+ Kind: tokenType,
+ Text: l.source[l.start:l.offset],
+ CharPos: l.start,
+ LineNo: l.lineStart,
+ LinePos: l.lineOffsetStart}
+ l.startToken()
+}
+
+// startToken starts the new token.
rudominer 2015/10/12 18:06:05 starts a new token
azani 2015/10/13 00:23:45 Done.
+func (l *lexer) startToken() {
+ l.start = l.offset
+ l.lineStart = l.lineNo
+ l.lineOffsetStart = l.lineOffset
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+ c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+ if width == 0 {
rudominer 2015/10/12 18:06:05 What is your thinking here? Can width=0 without c == utf8.RuneError?
azani 2015/10/13 00:23:45 RuneError is just a special rune. It is handled the same way as any other rune.
+ width = 1
mattr 2015/10/12 18:04:13 This struck me as strange, this means the input was not valid UTF-8?
azani 2015/10/13 00:23:45 Basically, in all cases where this branch is taken
+ }
+
+ if c == '\n' {
+ l.lineNo += 1
+ l.lineOffset = 0
+ } else {
+ l.lineOffset += width
+ }
+ l.offset += width
+}
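[Editorial note on the width == 0 discussion above: utf8.DecodeRuneInString returns (RuneError, 0) only for an empty string and (RuneError, 1) for invalid UTF-8, so the fallback keeps offset advancing. A standalone check of the library's documented behavior; this is an illustration, not part of the patch.]

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        // Empty input: RuneError with width 0.
        r, w := utf8.DecodeRuneInString("")
        fmt.Println(r == utf8.RuneError, w) // true 0

        // Invalid UTF-8: RuneError with width 1, so a lexer that adds
        // the width to its offset still makes progress.
        r, w = utf8.DecodeRuneInString("\x80abc")
        fmt.Println(r == utf8.RuneError, w) // true 1

        // Valid multi-byte rune: the width is its byte count.
        r, w = utf8.DecodeRuneInString("é")
        fmt.Println(string(r), w) // é 2
    }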
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+ char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
rudominer 2015/10/12 18:06:05 What if char == RuneError?
azani 2015/10/13 00:23:45 We treat it like any other rune. It looks like Run
+ return char
+}
+
+// IsEos returns true if the whole source has been consumed, false
+// otherwise.
+func (l *lexer) IsEos() bool {
+ return l.offset >= len(l.source)
rudominer 2015/10/12 18:06:05 Note that this depends on l.offset being a number of bytes, not runes.
azani 2015/10/13 00:23:46 Yes, since strings index based on bytes, not runes
+}
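[Editorial note: as the thread above says, this comparison is correct because offset accumulates byte widths and len on a Go string also counts bytes. A quick illustration, not from the patch.]

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        s := "héllo"
        fmt.Println(len(s))                    // 6: len counts bytes
        fmt.Println(utf8.RuneCountInString(s)) // 5: rune count
        // Summing DecodeRuneInString widths over s yields 6, matching
        // len(s), so comparing a byte offset to len(s) is consistent.
    }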
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+ // This is a state machine.
rudominer 2015/10/12 18:06:06 "This" is ambiguous here. Maybe "We are implementing a state machine."
azani 2015/10/13 00:23:45 Done.
+ // lexRoot is the beginning state.
+ // nil is the end state.
+ // States are functions which are called on the lexer. They return the
+ // next state.
+ for state := lexRoot; state != nil; {
+ state = state(l)
+ }
+ close(l.tokens)
+}
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
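[Editorial note: this is the function-as-state pattern from the talk linked in the file header. A toy standalone version with illustrative names, showing how returning nil terminates the loop in run().]

    package main

    import "fmt"

    // Toy state machine, illustrative only.
    type counter struct{ n int }

    type toyState func(*counter) toyState

    func stateA(c *counter) toyState   { c.n++; return stateB }
    func stateB(c *counter) toyState   { c.n++; return stateEnd }
    func stateEnd(c *counter) toyState { c.n++; return nil } // nil ends the machine

    func main() {
        c := &counter{}
        for state := stateA; state != nil; {
            state = state(c)
        }
        fmt.Println(c.n) // 3
    }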
+
+// This is the beginning state and also the state in between tokens.
rudominer 2015/10/12 18:06:05 Nit. In my understanding of a DFA there is no "state in between tokens".
azani 2015/10/13 00:23:45 I meant returned to after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+ if l.offset >= len(l.source) {
rudominer 2015/10/12 18:06:05 You have a function IsEos() for this, no?
azani 2015/10/13 00:23:45 Done.
+ return nil
+ }
+
+ switch c := l.Peek(); {
+ case isSingleCharTokens(c):
+ return lexSingleCharTokens
+ case isEqualsOrResponseStart(c):
+ return lexEqualsOrResponse
+ case isNameStart(c):
+ return lexName
+ case isOrdinalStart(c):
+ return lexOrdinal
+ case isNumberStart(c):
+ return lexNumber
+ case isStringStart(c):
+ return lexString
+ case isSkippable(c):
+ return lexSkip
+ case isMaybeComment(c):
+ return lexComment
+ }
+
+ l.Consume()
+ l.emitToken(ERROR_ILLEGAL_CHAR)
+ return nil
+}
+
+// isSkippable determines if a rune is skippable.
+func isSkippable(c rune) bool {
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+ for isSkippable(l.Peek()) {
+ l.Consume()
+ }
+ l.startToken()
+ return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+ '(': LPAREN,
+ ')': RPAREN,
+ '[': LBRACKET,
+ ']': RBRACKET,
+ '{': LBRACE,
+ '}': RBRACE,
+ '<': LANGLE,
+ '>': RANGLE,
+ ';': SEMI,
+ ',': COMMA,
+ '.': DOT,
+ '-': MINUS,
+ '+': PLUS,
+ '&': AMP,
+ '?': QSTN,
+}
+
+// isSingleCharTokens returns true if the rune is a single character token.
+func isSingleCharTokens(c rune) bool {
+ _, ok := singleCharTokens[c]
+ return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+ c := l.Peek()
+ l.Consume()
+ t := singleCharTokens[c]
+ l.emitToken(t)
+
+ return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+ return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+ l.Consume()
+
+ if l.Peek() == '>' {
+ l.Consume()
+ l.emitToken(RESPONSE)
+ } else {
+ l.emitToken(EQUALS)
+ }
+
+ return lexRoot
+}
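[Editorial note: the pattern here, consume the shared prefix and use one rune of lookahead to pick the longer token, generalizes to any pair like '=' and '=>'. A byte-oriented standalone stand-in, not the patch's code.]

    package main

    import "fmt"

    // scanEquals picks RESPONSE for "=>" and EQUALS for a bare "=",
    // mirroring lexEqualsOrResponse (illustrative stand-in).
    func scanEquals(s string) (kind, text string) {
        if len(s) >= 2 && s[0] == '=' && s[1] == '>' {
            return "RESPONSE", "=>"
        }
        return "EQUALS", "="
    }

    func main() {
        fmt.Println(scanEquals("=> reply")) // RESPONSE =>
        fmt.Println(scanEquals("= 3"))      // EQUALS =
    }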
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+ return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+ return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+ return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a NAME token.
+func isNameStart(c rune) bool {
+ return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+ "import": IMPORT,
+ "module": MODULE,
+ "struct": STRUCT,
+ "union": UNION,
+ "interface": INTERFACE,
+ "enum": ENUM,
+ "const": CONST,
+ "true": TRUE,
+ "false": FALSE,
+ "default": DEFAULT,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+ l.Consume()
+
+ // isNameRune returns true if the rune is valid in a NAME token.
+ isNameRune := func(c rune) bool {
+ return isAlpha(c) || isDigit(c) || c == '_'
+ }
+
+ for isNameRune(l.Peek()) {
+ l.Consume()
+ }
+
+ // Emit the appropriate keyword token if the current name is a
+ // keyword or a NAME token otherwise.
+ if token, found := keywordTokens[l.CurText()]; found {
+ l.emitToken(token)
+ } else {
+ l.emitToken(NAME)
+ }
+
+ return lexRoot
+}
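[Editorial note: detecting keywords by a map lookup after scanning the whole identifier, as above, avoids a separate keyword state. The same idea in miniature; names are illustrative.]

    package main

    import "fmt"

    var keywords = map[string]string{"struct": "STRUCT", "enum": "ENUM"}

    // classify returns the keyword kind when the scanned word is a
    // keyword, and NAME otherwise, mirroring lexName's final step.
    func classify(word string) string {
        if kind, ok := keywords[word]; ok {
            return kind
        }
        return "NAME"
    }

    func main() {
        fmt.Println(classify("struct")) // STRUCT
        fmt.Println(classify("Foo"))    // NAME
    }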
+
+// isOrdinalStart returns true if the rune is the start of an ORDINAL
+// token.
+func isOrdinalStart(c rune) bool {
+ return '@' == c
+}
+
+// lexOrdinal lexes an ORDINAL token. Ordinals are a '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+ // Consume the '@'.
+ l.Consume()
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(ORDINAL)
+
+ return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+ // Even hexadecimals must start with a digit (namely 0).
+ return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+ // If the number starts with 0 it cannot be a decimal integer.
+ if l.Peek() == '0' {
+ return lexNumberStartWithZero
+ }
+ return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ // If a decimal part is found, transition to the decimal state.
+ if isDecimalPartStart(l.Peek()) {
+ return lexDecimalPart
+ }
+
+ l.emitToken(INT_CONST_DEC)
+
+ return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats, or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+ // Consume the starting 0
+ l.Consume()
+
+ // Here we check to see whether we are in the hexadecimal or floating
+ // point case.
+ switch c := l.Peek(); {
+ case c == 'x' || c == 'X':
+ return lexHexNumber
+ case isDecimalPartStart(c):
+ return lexDecimalPart
+ }
+
+ // Found a naked 0.
+ l.emitToken(INT_CONST_DEC)
+
+ return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+ // Consume the x or X
+ l.Consume()
+
+ for isHexDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(INT_CONST_HEX)
+
+ return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the start of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+ return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+ // Consume '.' or 'e' or 'E'
+ l.Consume()
+
+ if c := l.Peek(); c == '+' || c == '-' {
+ l.Consume()
+ }
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(FLOAT_CONST)
+
+ return lexRoot
+}
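[Editorial note: a few traces through the number states clarify the transitions. This is my reading of the code above, not tool output.]

    // Illustrative traces, not part of the patch:
    //   "42"    lexNumber -> lexDec                      => INT_CONST_DEC "42"
    //   "0"     lexNumber -> lexNumberStartWithZero      => INT_CONST_DEC "0"
    //   "0x1F"  lexNumberStartWithZero -> lexHexNumber   => INT_CONST_HEX "0x1F"
    //   "0.5"   lexNumberStartWithZero -> lexDecimalPart => FLOAT_CONST "0.5"
    //   "3e-2"  lexDec -> lexDecimalPart                 => FLOAT_CONST "3e-2"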
+
+// isStringStart returns true if the rune represents the start of a string.
+func isStringStart(c rune) bool {
+ return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+ // Consume opening quotes.
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+ if l.Peek() == '\\' {
+ // If we see an escape character consume whatever follows blindly.
+ // TODO(azani): Consider parsing escape sequences.
+ l.Consume()
+ }
+ l.Consume()
+ }
+
+ if l.IsEos() || l.Peek() == '\n' {
+ l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL)
+ return nil
+ }
+
+ // Consume the closing quotes
+ l.Consume()
+
+ l.emitToken(STRING_LITERAL)
+
+ return lexRoot
+}
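[Editorial note: because a backslash consumes the following rune blindly, an escaped quote does not terminate the literal. A byte-oriented standalone illustration of that rule; not the patch's code, which works on runes and emits tokens.]

    package main

    import "fmt"

    // scanString mimics lexString's escape handling: a backslash skips
    // the next byte blindly, so \" does not end the literal.
    func scanString(s string) (string, bool) {
        if len(s) == 0 || s[0] != '"' {
            return "", false
        }
        for i := 1; i < len(s); i++ {
            switch s[i] {
            case '\\':
                i++ // skip the escaped character
            case '"':
                return s[:i+1], true
            case '\n':
                return "", false // unterminated: newline in literal
            }
        }
        return "", false // unterminated: end of input
    }

    func main() {
        lit, ok := scanString(`"a\"b" rest`)
        fmt.Println(lit, ok) // "a\"b" true
    }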
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+ return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+ // Consume the '/'.
+ l.Consume()
+
+ switch l.Peek() {
+ case '/':
+ return lexSingleLineComment
+ case '*':
+ return lexMultiLineComment
+ }
+
+ l.emitToken(ERROR_ILLEGAL_CHAR)
+ return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+ // Consume the '/'
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '\n' {
+ l.Consume()
+ }
+
+ l.startToken()
+ return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+ // Consume the '*'.
+ l.Consume()
+
+ for !l.IsEos() {
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+ l.Consume()
+ }
+
+ l.emitToken(ERROR_UNTERMINATED_COMMENT)
+ return nil
+}
+
+// lexPossibleEndOfComment consumes the possible end of a multiline
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+ // Consume the '*'
+ l.Consume()
+
+ if l.IsEos() {
+ l.emitToken(ERROR_UNTERMINATED_COMMENT)
+ return nil
+ }
+
+ if l.Peek() == '/' {
+ l.Consume()
+ l.startToken()
+ return lexRoot
+ }
+
+ return lexMultiLineComment
+}
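[Editorial note: the two comment states cooperate: every '*' jumps to lexPossibleEndOfComment, and if no '/' follows, control falls back into lexMultiLineComment. A compact standalone equivalent of that search; illustrative, not the patch's code.]

    package main

    import "fmt"

    // findCommentEnd returns the index just past the terminating "*/",
    // scanning the body of a "/*" comment the way the two states above
    // do together.
    func findCommentEnd(body string) (int, bool) {
        for i := 0; i+1 < len(body); i++ {
            if body[i] == '*' && body[i+1] == '/' {
                return i + 2, true
            }
        }
        return 0, false // unterminated comment
    }

    func main() {
        fmt.Println(findCommentEnd("* one ** two */ tail")) // 15 true
        fmt.Println(findCommentEnd("* never closed *"))     // 0 false
    }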
