Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..b210b2bbeed5039abb13a78b6b690194230b1f81
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,519 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To use the lexer, call Tokenize with the source string to obtain
+// a TokenStream. The lexer runs concurrently in its own goroutine, so the
+// TokenStream can be consumed before the lexer is done with the source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune as a parameter and return true if the state machine should
+// transition to state Foo. Some states do not have such functions on
+// account of the transition condition being trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+	"unicode/utf8"
+)
+
+// Tokenize accepts a source string and lexes it into a stream of tokens which
+// can be read from the returned TokenStream.
+func Tokenize(source string) TokenStream {
+	tokens := make(chan Token)
+	l := lexer{source: source, tokens: tokens}
+	go l.run()
+	return &TokenChan{tokenChan: tokens}
+}
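+
+// A usage sketch. The TokenStream interface and token kinds are defined
+// elsewhere in this package; the PeekNext/ConsumeNext methods and the EOF
+// kind below are assumptions made for illustration only:
+//
+//	ts := Tokenize("module foo;")
+//	for ts.PeekNext().Kind != EOF { // EOF: assumed end-of-stream sentinel.
+//		t := ts.PeekNext()
+//		// t.Text, t.LineNo and t.LinePos describe and locate the token.
+//		ts.ConsumeNext()
+//	}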
+
+type lexer struct {
+	// source is the source code to be lexed.
+	source string
+
+	// offset is the number of bytes that have been consumed.
+	offset int
+
+	// tokens is a channel to which the found tokens are emitted.
+	tokens chan Token
+
+	// sourcePos is the number of runes that have been consumed.
+	sourcePos int
+
+	// lineNo is the current line number.
+	lineNo int
+
+	// linePos is how many runes have been consumed since the beginning of the
+	// line.
+	linePos int
+
+	// curTokenOffset is the number of bytes consumed prior to the beginning of
+	// the current token.
+	curTokenOffset int
+
+	// curTokenSourcePos is the number of runes consumed prior to the beginning of
+	// the current token.
+	curTokenSourcePos int
+
+	// curTokenLineNo is the line number on which the current token begins.
+	curTokenLineNo int
+
+	// curTokenLinePos is the number of runes since the beginning of the line
+	// where the current token begins.
+	curTokenLinePos int
+}
+
+// CurText returns the text of the current token consumed so far.
+func (l *lexer) CurText() string {
+	return l.source[l.curTokenOffset:l.offset]
+}
+
+// emitToken emits the current token and begins a new token.
+func (l *lexer) emitToken(tokenType TokenKind) {
+	l.tokens <- Token{
+		Kind:    tokenType,
+		Text:    l.source[l.curTokenOffset:l.offset],
+		CharPos: l.curTokenSourcePos,
+		LineNo:  l.curTokenLineNo,
+		LinePos: l.curTokenLinePos}
+	l.beginToken()
+}
+
+// beginToken marks the beginning of a new token at the current position.
+func (l *lexer) beginToken() {
+	l.curTokenOffset = l.offset
+	l.curTokenSourcePos = l.sourcePos
+	l.curTokenLineNo = l.lineNo
+	l.curTokenLinePos = l.linePos
+}
+
+// Consume consumes the next rune in the source.
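+// Note that offset advances by the rune's byte width while sourcePos and
+// linePos advance by one rune; e.g. consuming the two-byte rune 'é'
+// increases offset by 2 but sourcePos by only 1.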
+func (l *lexer) Consume() {
+	if l.IsEos() {
+		return
+	}
+
+	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+
+	if c == '\n' {
+		l.lineNo++
+		l.linePos = 0
+	} else {
+		l.linePos++
+	}
+	l.offset += width
+	l.sourcePos++
+}
+
+// Peek returns the next rune in the source without consuming it.
+func (l *lexer) Peek() rune {
+	// At the end of the string, there is no sane answer to Peek.
+	if l.IsEos() {
+		return utf8.RuneError
+	}
+
+	// If RuneError is returned, it will be handled as any other rune, likely
+	// resulting in an ErrorIllegalChar token being emitted.
+	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
+	return char
+}
+
+// IsEos returns true if the whole source has been consumed and false
+// otherwise.
+func (l *lexer) IsEos() bool {
+	return l.offset >= len(l.source)
+}
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+	// We are implementing a state machine.
+	// lexRoot is the beginning state.
+	// nil is the end state.
+	// States are functions which are called on the lexer. They return the
+	// next state.
+	for state := lexRoot; state != nil; {
+		state = state(l)
+	}
+	close(l.tokens)
+}
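+
+// For example, lexing "module foo;" walks the states
+// lexRoot -> lexName (emits Module) -> lexRoot -> lexSkip -> lexRoot ->
+// lexName (emits Name) -> lexRoot -> lexSingleCharTokens (emits Semi) ->
+// lexRoot -> nil.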
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// lexRoot is the beginning state and also the state to which the machine
+// returns after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+	if l.IsEos() {
+		return nil
+	}
+
+	switch c := l.Peek(); {
+	case isSingleCharTokens(c):
+		return lexSingleCharTokens
+	case isEqualsOrResponseStart(c):
+		return lexEqualsOrResponse
+	case isNameStart(c):
+		return lexName
+	case isOrdinalStart(c):
+		return lexOrdinal
+	case isNumberStart(c):
+		return lexNumber
+	case isStringStart(c):
+		return lexString
+	case isSkippable(c):
+		return lexSkip
+	case isMaybeComment(c):
+		return lexComment
+	}
+
+	l.Consume()
+	l.emitToken(ErrorIllegalChar)
+	return nil
+}
+
+// isSkippable returns true if the rune is whitespace that can be skipped.
+func isSkippable(c rune) bool {
+	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+	for isSkippable(l.Peek()) {
+		l.Consume()
+	}
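+	// No token is emitted for the skipped runes; resetting the token start
+	// simply drops them.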
+	l.beginToken()
+	return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+	'(': LParen,
+	')': RParen,
+	'[': LBracket,
+	']': RBracket,
+	'{': LBrace,
+	'}': RBrace,
+	'<': LAngle,
+	'>': RAngle,
+	';': Semi,
+	',': Comma,
+	'.': Dot,
+	'-': Minus,
+	'+': Plus,
+	'&': Amp,
+	'?': Qstn,
+}
+
+// isSingleCharTokens returns true if the rune is a single-character token.
+func isSingleCharTokens(c rune) bool {
+	_, ok := singleCharTokens[c]
+	return ok
+}
+
+// lexSingleCharTokens lexes single-character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+	c := l.Peek()
+	l.Consume()
+	t := singleCharTokens[c]
+	l.emitToken(t)
+
+	return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+	return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+	l.Consume()
+
+	if l.Peek() == '>' {
+		l.Consume()
+		l.emitToken(Response)
+	} else {
+		l.emitToken(Equals)
+	}
+
+	return lexRoot
+}
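+
+// For example, "a=>b" lexes to Name, Response, Name, while "a = b" lexes to
+// Name, Equals, Name.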
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+	return '0' <= c && c <= '9'
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a Name token.
+func isNameStart(c rune) bool {
+	return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+	"import":    Import,
+	"module":    Module,
+	"struct":    Struct,
+	"union":     Union,
+	"interface": Interface,
+	"enum":      Enum,
+	"const":     Const,
+	"true":      True,
+	"false":     False,
+	"default":   Default,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+	l.Consume()
+
+	// isNameRune returns true if the rune is valid in a Name token.
+	isNameRune := func(c rune) bool {
+		return isAlpha(c) || isDigit(c) || c == '_'
+	}
+
+	for isNameRune(l.Peek()) {
+		l.Consume()
+	}
+
+	// Emit the appropriate keyword token if the current text is a keyword,
+	// or a Name token otherwise.
+	if token, found := keywordTokens[l.CurText()]; found {
+		l.emitToken(token)
+	} else {
+		l.emitToken(Name)
+	}
+
+	return lexRoot
+}
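+
+// For example, "interface" lexes to an Interface token, while "interfaces"
+// and "Interface" both lex to Name tokens.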
+
+// isOrdinalStart returns true if the rune is the beginning of an Ordinal
+// token.
+func isOrdinalStart(c rune) bool {
+	return c == '@'
+}
+
+// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+	// Consume the '@'.
+	l.Consume()
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(Ordinal)
+
+	return lexRoot
+}
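+
+// For example, in "field@3" the "@3" lexes to a single Ordinal token.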
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+	// Even hexadecimals must begin with a digit (namely 0).
+	return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+	// A number that begins with 0 is either 0 itself, a hexadecimal, or a
+	// float, so it needs special handling.
+	if l.Peek() == '0' {
+		return lexNumberStartWithZero
+	}
+	return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	// If a decimal part is found, transition to the decimal state.
+	if isDecimalPartStart(l.Peek()) {
+		return lexDecimalPart
+	}
+
+	l.emitToken(IntConstDec)
+
+	return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats, or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+	// Consume the leading 0.
+	l.Consume()
+
+	// Check whether we are in the hexadecimal or floating-point case.
+	switch c := l.Peek(); {
+	case c == 'x' || c == 'X':
+		return lexHexNumber
+	case isDecimalPartStart(c):
+		return lexDecimalPart
+	}
+
+	// Found a naked 0.
+	l.emitToken(IntConstDec)
+
+	return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+	// Consume the x or X.
+	l.Consume()
+
+	for isHexDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(IntConstHex)
+
+	return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the beginning of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+	return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+	// Consume '.' or 'e' or 'E'.
+	l.Consume()
+
+	if c := l.Peek(); c == '+' || c == '-' {
+		l.Consume()
+	}
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(FloatConst)
+
+	return lexRoot
+}
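+
+// For example, "42" lexes to IntConstDec, "0x1F" to IntConstHex, and "3.14",
+// "1e-10" and "0.5" each to FloatConst.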
+
+// isStringStart returns true if the rune represents the beginning of a string.
+func isStringStart(c rune) bool {
+	return c == '"'
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+	// Consume the opening quote.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+		if l.Peek() == '\\' {
+			// If we see an escape character, consume whatever follows blindly.
+			// TODO(azani): Consider parsing escape sequences.
+			l.Consume()
+		}
+		l.Consume()
+	}
+
+	if l.IsEos() || l.Peek() == '\n' {
+		l.emitToken(ErrorUnterminatedStringLiteral)
+		return nil
+	}
+
+	// Consume the closing quote.
+	l.Consume()
+
+	l.emitToken(StringLiteral)
+
+	return lexRoot
+}
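+
+// Note that the emitted StringLiteral text includes the surrounding quotes
+// and leaves escape sequences unprocessed.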
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+	return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+	// Consume the '/'.
+	l.Consume()
+
+	switch l.Peek() {
+	case '/':
+		return lexSingleLineComment
+	case '*':
+		return lexMultiLineComment
+	}
+
+	l.emitToken(ErrorIllegalChar)
+	return nil
+}
+
+// lexSingleLineComment consumes a single-line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+	// Consume the second '/'.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '\n' {
+		l.Consume()
+	}
+
+	l.beginToken()
+	return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	for !l.IsEos() {
+		if l.Peek() == '*' {
+			return lexPossibleEndOfComment
+		}
+		l.Consume()
+	}
+
+	l.emitToken(ErrorUnterminatedComment)
+	return nil
+}
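+
+// Note that multi-line comments do not nest: in "/* a /* b */ c */" the
+// comment ends at the first "*/" and " c */" is lexed as ordinary tokens.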
+
+// lexPossibleEndOfComment consumes the possible end of a multi-line
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	if l.IsEos() {
+		l.emitToken(ErrorUnterminatedComment)
+		return nil
+	}
+
+	if l.Peek() == '/' {
+		l.Consume()
+		l.beginToken()
+		return lexRoot
+	}
+
+	return lexMultiLineComment
+}