Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..f918b3bc4eb6e8a4f76ada2f477ce84b01b6fb82
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,500 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To call the lexer, call Tokenize with the source string to obtain
rudominer
2015/10/12 18:06:05
To use the lexer, call...
azani
2015/10/13 00:23:46
Done.
+// a TokenStream. The lexer is run concurrently so you should be able to
+// use the TokenStream before the lexer is done with the source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune as a parameter and return true if the state machine should
+// transition to state Foo. Some states do not have such functions on
+// account of the transition condition being trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+	"unicode/utf8"
+)
+
+// Tokenize accepts a source string and returns the tokens which compose
rudominer
2015/10/12 18:06:05
The phrasing belies the asynchronous nature.
azani
2015/10/13 00:23:46
Done.
+// the source string.
+func Tokenize(source string) TokenStream {
+	tokens := make(chan Token)
+	l := lexer{source: source, tokens: tokens}
+	go l.run()
+	return &TokenChan{tokenChan: tokens}
+}
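Tokenize returns as soon as the lexing goroutine is launched, so callers can consume tokens while lexing is still in progress. A minimal, self-contained sketch of the same producer/consumer pattern (the types below are stand-ins, not this package's Token or TokenStream):

    package main

    import "fmt"

    // token is a stand-in for the package's Token type.
    type token struct {
        kind, text string
    }

    // tokenize mirrors the shape of Tokenize above: start the producer
    // goroutine, then immediately return the receive side of the channel.
    func tokenize(source string) <-chan token {
        out := make(chan token)
        go func() {
            defer close(out)
            // A real lexer would walk source rune by rune; this stub
            // emits a single dummy token.
            out <- token{kind: "NAME", text: source}
        }()
        return out
    }

    func main() {
        // The loop ends when the producer closes the channel, just as
        // run() closes l.tokens once it reaches the nil state.
        for t := range tokenize("foo") {
            fmt.Printf("%s(%q)\n", t.kind, t.text)
        }
    }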
+
+type lexer struct {
+	// source is the source code to be lexed.
rudominer
2015/10/12 18:06:05
nit: You don't need to repeat the variable name within the comment.
azani
2015/10/12 18:18:07
Hey Matt, could you settle that question for us?
+	source string
+
+	// offset is the number of characters that have been consumed.
rudominer
2015/10/12 18:06:05
Let's follow the go standard terminology and avoid "characters" here; this is a count of bytes.
azani
2015/10/13 00:23:45
Done.
+	offset int
+
+	// lineNo is the current line number.
+	lineNo int
+
+	// lineOffset is how many characters have been consumed since the
+	// beginning of the line.
+	lineOffset int
rudominer
2015/10/12 18:06:05
I think it would be clearer if we didn't overuse the word "offset".
azani
2015/10/13 00:23:45
Done.
+
+	// start is the index in source where the current token begins.
rudominer
2015/10/12 18:06:05
Maybe currentTokenStartIndex
or currentTokenStart?
azani
2015/10/13 00:23:46
Done.
+	start int
+
+	// lineStart is the line number on which the current token begins.
+	lineStart int
rudominer
2015/10/12 18:06:05
Maybe currentTokenLineNumber?
azani
2015/10/13 00:23:45
Done.
+
+	// lineOffsetStart is the number of characters since the beginning
+	// of the line where the current token begins.
+	lineOffsetStart int
rudominer
2015/10/12 18:06:05
Maybe currentTokenColumn?
azani
2015/10/13 00:23:45
Done.
+
+	// tokens is a channel on which the found tokens are emitted.
rudominer
2015/10/12 18:06:05
maybe "to which the found..."
azani
2015/10/13 00:23:45
Done.
+	tokens chan Token
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+	return l.source[l.start:l.offset]
+}
+
+// emitToken emits the current token and begins the new token.
rudominer
2015/10/12 18:06:05
begins a new token
azani
2015/10/13 00:23:46
Done.
+func (l *lexer) emitToken(tokenType TokenKind) {
+	l.tokens <- Token{
+		Kind:    tokenType,
+		Text:    l.source[l.start:l.offset],
+		CharPos: l.start,
+		LineNo:  l.lineStart,
+		LinePos: l.lineOffsetStart}
+	l.startToken()
+}
+
+// startToken starts the new token.
rudominer
2015/10/12 18:06:05
starts a new token
azani
2015/10/13 00:23:45
Done.
+func (l *lexer) startToken() {
+	l.start = l.offset
+	l.lineStart = l.lineNo
+	l.lineOffsetStart = l.lineOffset
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+	if width == 0 {
rudominer
2015/10/12 18:06:05
What is your thinking here? Can width=0 without c == RuneError?
azani
2015/10/13 00:23:45
RuneError is just a special rune. It is handled the same way as any other rune.
+		width = 1
mattr
2015/10/12 18:04:13
This struck me as strange, this means the input was not valid UTF-8?
azani
2015/10/13 00:23:45
Basically, in all cases where this branch is taken we still want to advance by at least one byte so the lexer keeps making progress.
+	}
+
+	if c == '\n' {
+		l.lineNo += 1
+		l.lineOffset = 0
+	} else {
+		l.lineOffset += width
+	}
+	l.offset += width
+}
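On the width == 0 question above: utf8.DecodeRuneInString returns (RuneError, 0) only for an empty input string and (RuneError, 1) for an invalid encoding, so forcing width to 1 simply guarantees that Consume always advances. A quick standalone check:

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        // A valid multi-byte rune: width is its encoded length in bytes.
        r, w := utf8.DecodeRuneInString("é=1")
        fmt.Println(string(r), w) // é 2

        // Empty input: the only case where width is 0.
        r, w = utf8.DecodeRuneInString("")
        fmt.Println(r == utf8.RuneError, w) // true 0

        // Invalid UTF-8: RuneError with width 1, so progress is still made.
        r, w = utf8.DecodeRuneInString("\xff")
        fmt.Println(r == utf8.RuneError, w) // true 1
    }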
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
rudominer
2015/10/12 18:06:05
What if char == RuneError?
azani
2015/10/13 00:23:45
We treat it like any other rune. It looks like RuneError is handled correctly by the rest of the lexer.
+	return char
+}
+
+// IsEos returns true if the whole source has been consumed, false
+// otherwise.
+func (l *lexer) IsEos() bool {
+	return l.offset >= len(l.source)
rudominer
2015/10/12 18:06:05
Note that this depends on l.offset being a number of bytes rather than a number of runes.
azani
2015/10/13 00:23:46
Yes, since strings index based on bytes, not runes.
+}
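As the exchange above notes, offset counts bytes, which matches how Go strings index and measure length; a multi-byte rune spans several offsets. For example:

    package main

    import "fmt"

    func main() {
        s := "π=1" // 'π' encodes as two bytes in UTF-8
        fmt.Println(len(s)) // 4, not 3
        fmt.Println(s[2:3]) // "=": byte offset 2, even though it is rune #1
    }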
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+	// This is a state machine.
rudominer
2015/10/12 18:06:06
"This" is ambiguous here. Maybe "We are implementing a state machine."
azani
2015/10/13 00:23:45
Done.
+	// lexRoot is the beginning state.
+	// nil is the end state.
+	// States are functions which are called on the lexer. They return the
+	// next state.
+	for state := lexRoot; state != nil; {
+		state = state(l)
+	}
+	close(l.tokens)
+}
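The stateFn pattern is easier to see at toy scale. A self-contained two-state machine with the same driver loop as run() (names are illustrative only, not part of this CL):

    package main

    import "fmt"

    type machine struct{ steps int }

    // A state does some work and returns the next state; nil halts.
    type stateFn func(*machine) stateFn

    func stateA(m *machine) stateFn {
        m.steps++
        fmt.Println("A")
        return stateB
    }

    func stateB(m *machine) stateFn {
        m.steps++
        fmt.Println("B")
        if m.steps >= 4 {
            return nil
        }
        return stateA
    }

    func main() {
        m := &machine{}
        // Same loop as run(): call the current state until one returns nil.
        for state := stateA; state != nil; {
            state = state(m)
        }
    }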
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// This is the beginning state and also the state in between tokens.
rudominer
2015/10/12 18:06:05
Nit. In my understanding of a DFA there is no "state in between tokens".
azani
2015/10/13 00:23:45
I meant returned to after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+	if l.offset >= len(l.source) {
rudominer
2015/10/12 18:06:05
You have a function IsEos() for this, no?
azani
2015/10/13 00:23:45
Done.
+		return nil
+	}
+
+	switch c := l.Peek(); {
+	case isSingleCharTokens(c):
+		return lexSingleCharTokens
+	case isEqualsOrResponseStart(c):
+		return lexEqualsOrResponse
+	case isNameStart(c):
+		return lexName
+	case isOrdinalStart(c):
+		return lexOrdinal
+	case isNumberStart(c):
+		return lexNumber
+	case isStringStart(c):
+		return lexString
+	case isSkippable(c):
+		return lexSkip
+	case isMaybeComment(c):
+		return lexComment
+	}
+
+	l.Consume()
+	l.emitToken(ERROR_ILLEGAL_CHAR)
+	return nil
+}
+
+// isSkippable determines if a rune is skippable.
+func isSkippable(c rune) bool {
+	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+	for isSkippable(l.Peek()) {
+		l.Consume()
+	}
+	l.startToken()
+	return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+	'(': LPAREN,
+	')': RPAREN,
+	'[': LBRACKET,
+	']': RBRACKET,
+	'{': LBRACE,
+	'}': RBRACE,
+	'<': LANGLE,
+	'>': RANGLE,
+	';': SEMI,
+	',': COMMA,
+	'.': DOT,
+	'-': MINUS,
+	'+': PLUS,
+	'&': AMP,
+	'?': QSTN,
+}
+
+// isSingleCharTokens returns true if the rune is a single character token.
+func isSingleCharTokens(c rune) bool {
+	_, ok := singleCharTokens[c]
+	return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+	c := l.Peek()
+	l.Consume()
+	t, _ := singleCharTokens[c]
+	l.emitToken(t)
+
+	return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+	return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+	l.Consume()
+
+	if l.Peek() == '>' {
+		l.Consume()
+		l.emitToken(RESPONSE)
+	} else {
+		l.emitToken(EQUALS)
+	}
+
+	return lexRoot
+}
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+	return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+	return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a NAME token.
+func isNameStart(c rune) bool {
+	return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+	"import":    IMPORT,
+	"module":    MODULE,
+	"struct":    STRUCT,
+	"union":     UNION,
+	"interface": INTERFACE,
+	"enum":      ENUM,
+	"const":     CONST,
+	"true":      TRUE,
+	"false":     FALSE,
+	"default":   DEFAULT,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+	l.Consume()
+
+	// isNameRune returns true if the rune is valid in a NAME token.
+	isNameRune := func(c rune) bool {
+		return isAlpha(c) || isDigit(c) || c == '_'
+	}
+
+	for isNameRune(l.Peek()) {
+		l.Consume()
+	}
+
+	// Emit the appropriate keyword token if the current name is a
+	// keyword or a NAME token otherwise.
+	if token, found := keywordTokens[l.CurText()]; found {
+		l.emitToken(token)
+	} else {
+		l.emitToken(NAME)
+	}
+
+	return lexRoot
+}
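The keyword check matches the whole identifier, so a name that merely starts with a keyword stays a NAME. A condensed stand-alone illustration of that lookup (string kinds stand in for TokenKind):

    package main

    import "fmt"

    var keywords = map[string]string{
        "struct": "STRUCT",
        "import": "IMPORT",
    }

    // classify mirrors the tail of lexName: a keyword map hit wins,
    // otherwise the identifier is a plain NAME.
    func classify(ident string) string {
        if kind, ok := keywords[ident]; ok {
            return kind
        }
        return "NAME"
    }

    func main() {
        fmt.Println(classify("struct"))  // STRUCT
        fmt.Println(classify("Foo"))     // NAME
        fmt.Println(classify("structs")) // NAME: no prefix matching
    }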
+
+// isOrdinalStart returns true if the rune is the start of an ORDINAL
+// token.
+func isOrdinalStart(c rune) bool {
+	return '@' == c
+}
+
+// lexOrdinal lexes an ORDINAL token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+	// Consume the '@'.
+	l.Consume()
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(ORDINAL)
+
+	return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+	// Even hexadecimals must start with a digit (namely 0).
+	return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+	// If the number starts with 0 it cannot be a decimal integer.
+	if l.Peek() == '0' {
+		return lexNumberStartWithZero
+	}
+	return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	// If a decimal part is found, transition to the decimal state.
+	if isDecimalPartStart(l.Peek()) {
+		return lexDecimalPart
+	}
+
+	l.emitToken(INT_CONST_DEC)
+
+	return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+	// Consume the starting 0
+	l.Consume()
+
+	// Here we check to see whether we are in the hexadecimal or floating
+	// point case.
+	switch c := l.Peek(); {
+	case c == 'x' || c == 'X':
+		return lexHexNumber
+	case isDecimalPartStart(c):
+		return lexDecimalPart
+	}
+
+	// Found a naked 0.
+	l.emitToken(INT_CONST_DEC)
+
+	return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+	// Consume the x or X
+	l.Consume()
+
+	for isHexDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(INT_CONST_HEX)
+
+	return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the start of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+	return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+	// Consume '.' or 'e' or 'E'
+	l.Consume()
+
+	if c := l.Peek(); c == '+' || c == '-' {
+		l.Consume()
+	}
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(FLOAT_CONST)
+
+	return lexRoot
+}
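Taken together, the number states assign each literal exactly one kind. A condensed restatement of the decision tree over an already-scanned literal (string kinds stand in for the real TokenKind values):

    package main

    import (
        "fmt"
        "strings"
    )

    // kindOf collapses lexNumber, lexNumberStartWithZero, lexDec and
    // lexDecimalPart into a single decision over the finished literal.
    func kindOf(lit string) string {
        switch {
        case strings.HasPrefix(lit, "0x") || strings.HasPrefix(lit, "0X"):
            return "INT_CONST_HEX"
        case strings.ContainsAny(lit, ".eE"):
            return "FLOAT_CONST"
        default:
            return "INT_CONST_DEC"
        }
    }

    func main() {
        for _, lit := range []string{"0", "42", "0x1F", "3.14", "1e-3"} {
            fmt.Println(lit, "->", kindOf(lit))
        }
    }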
+
+// isStringStart returns true if the rune represents the start of a string.
+func isStringStart(c rune) bool {
+	return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+	// Consume opening quotes.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+		if l.Peek() == '\\' {
+			// If we see an escape character consume whatever follows blindly.
+			// TODO(azani): Consider parsing escape sequences.
+			l.Consume()
+		}
+		l.Consume()
+	}
+
+	if l.IsEos() || l.Peek() == '\n' {
+		l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL)
+		return nil
+	}
+
+	// Consume the closing quotes
+	l.Consume()
+
+	l.emitToken(STRING_LITERAL)
+
+	return lexRoot
+}
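Because escapes are consumed blindly, the emitted STRING_LITERAL text is the raw source slice, quotes and backslashes included. A simplified byte-oriented version of the same scan (the real lexer works on runes, but every delimiter involved is ASCII):

    package main

    import "fmt"

    // scanString returns the index just past the closing quote of the
    // literal starting at s[0], or -1 if it is unterminated.
    func scanString(s string) int {
        i := 1 // skip the opening quote
        for i < len(s) && s[i] != '"' && s[i] != '\n' {
            if s[i] == '\\' {
                i++ // skip the backslash; take the escaped byte blindly
            }
            i++
        }
        if i >= len(s) || s[i] == '\n' {
            return -1
        }
        return i + 1 // step past the closing quote
    }

    func main() {
        src := `"a \" b" rest`
        if end := scanString(src); end >= 0 {
            fmt.Println(src[:end]) // "a \" b" (backslash kept verbatim)
        }
    }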
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+	return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+	// Consume the '/'.
+	l.Consume()
+
+	switch l.Peek() {
+	case '/':
+		return lexSingleLineComment
+	case '*':
+		return lexMultiLineComment
+	}
+
+	l.emitToken(ERROR_ILLEGAL_CHAR)
+	return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+	// Consume the '/'
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '\n' {
+		l.Consume()
+	}
+
+	l.startToken()
+	return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	for !l.IsEos() {
+		if l.Peek() == '*' {
+			return lexPossibleEndOfComment
+		}
+		l.Consume()
+	}
+
+	l.emitToken(ERROR_UNTERMINATED_COMMENT)
+	return nil
+}
+
+// lexPossibleEndOfComment consumes the possible end of a multiline
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+	// Consume the '*'
+	l.Consume()
+
+	if l.IsEos() {
+		l.emitToken(ERROR_UNTERMINATED_COMMENT)
+		return nil
+	}
+
+	if l.Peek() == '/' {
+		l.Consume()
+		l.startToken()
+		return lexRoot
+	}
+
+	return lexMultiLineComment
+}
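Note that both comment paths end with startToken rather than emitToken, so well-formed comments never appear in the token stream. The back-and-forth between lexMultiLineComment and lexPossibleEndOfComment amounts to scanning for the first "*/"; a compact equivalent:

    package main

    import (
        "fmt"
        "strings"
    )

    // scanBlockComment returns the index just past "*/", or -1 if the
    // comment never terminates (the ERROR_UNTERMINATED_COMMENT case).
    // s starts just after the opening "/*".
    func scanBlockComment(s string) int {
        if i := strings.Index(s, "*/"); i >= 0 {
            return i + 2
        }
        return -1
    }

    func main() {
        fmt.Println(scanBlockComment(" body */ rest")) // 8
        fmt.Println(scanBlockComment(" never ends"))   // -1
    }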