| Index: mojom/mojom_parser/lexer/lexer.go
|
| diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..b210b2bbeed5039abb13a78b6b690194230b1f81
|
| --- /dev/null
|
| +++ b/mojom/mojom_parser/lexer/lexer.go
|
| @@ -0,0 +1,519 @@
|
| +// Copyright 2015 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +//
|
| +// To use the lexer, call Tokenize with the source string to obtain
|
| +// a TokenStream. The lexer runs in its own goroutine, so the TokenStream
|
| +// may be used before the lexer has finished consuming the source.
|
| +//
|
| +// The lexer is implemented as a state machine. The states are represented
|
| +// by functions (the stateFn type) which accept a lexer and return the
|
| +// new state.
|
| +//
|
| +// Most states also have an isFooStart function which helps determine if
|
| +// a transition to Foo is appropriate. Those functions accept a single
|
| +// rune as a parameter and return true if the state machine should
|
| +// transition to state Foo. Some states have no such function because the
|
| +// transition condition is trivial.
|
| +//
|
| +// The lexer implementation was inspired by
|
| +// http://cuddle.googlecode.com/hg/talk/lex.html
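|
| +//
|
| +// A minimal usage sketch (the Token and TokenStream types are defined
|
| +// elsewhere in this package and are not shown in this file):
|
| +//
|
| +//   ts := lexer.Tokenize("module foo;")
|
| +//   // Tokens may be read from ts while the lexing goroutine is running.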
|
| +
|
| +package lexer
|
| +
|
| +import (
|
| + "unicode/utf8"
|
| +)
|
| +
|
| +// Tokenize accepts a source string and lexes it into a stream of tokens which
|
| +// can be read from the returned TokenStream.
|
| +func Tokenize(source string) TokenStream {
|
| + tokens := make(chan Token)
|
| + l := lexer{source: source, tokens: tokens}
|
| + go l.run()
|
| + return &TokenChan{tokenChan: tokens}
|
| +}
|
| +
|
| +type lexer struct {
|
| + // source is the source code to be lexed.
|
| + source string
|
| +
|
| + // offset is the number of bytes that have been consumed.
|
| + offset int
|
| +
|
| + // tokens is a channel to which the found tokens are emitted.
|
| + tokens chan Token
|
| +
|
| + // sourcePos is the number of runes that have been consumed.
|
| + sourcePos int
|
| +
|
| + // lineNo is the current line number (zero-based).
|
| + lineNo int
|
| +
|
| + // linePos is how many runes have been consumed since the beginning of the
|
| + // line.
|
| + linePos int
|
| +
|
| + // curTokenOffset is the number of bytes consumed prior to the beginning of
|
| + // the current token.
|
| + curTokenOffset int
|
| +
|
| + // curTokenSourcePos is the number of runes consumed prior to the beginning of
|
| + // the current token.
|
| + curTokenSourcePos int
|
| +
|
| + // curTokenLineNo is the line number on which the current token begins.
|
| + curTokenLineNo int
|
| +
|
| + // curTokenLinePos is the number of runes since the beginning of the line
|
| + // where the current token begins.
|
| + curTokenLinePos int
|
| +}
|
| +
|
| +// CurText returns the consumed part of the current token.
|
| +func (l *lexer) CurText() string {
|
| + return l.source[l.curTokenOffset:l.offset]
|
| +}
|
| +
|
| +// emitToken emits the current token and begins a new token.
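|
| +// For example, lexing the keyword "module" at the very start of a source
|
| +// string emits Token{Kind: Module, Text: "module", CharPos: 0, LineNo: 0,
|
| +// LinePos: 0}.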
|
| +func (l *lexer) emitToken(tokenType TokenKind) {
|
| + l.tokens <- Token{
|
| + Kind: tokenType,
|
| + Text: l.source[l.curTokenOffset:l.offset],
|
| + CharPos: l.curTokenSourcePos,
|
| + LineNo: l.curTokenLineNo,
|
| + LinePos: l.curTokenLinePos}
|
| + l.beginToken()
|
| +}
|
| +
|
| +// beginToken marks the current position as the start of a new token.
|
| +func (l *lexer) beginToken() {
|
| + l.curTokenOffset = l.offset
|
| + l.curTokenSourcePos = l.sourcePos
|
| + l.curTokenLineNo = l.lineNo
|
| + l.curTokenLinePos = l.linePos
|
| +}
|
| +
|
| +// Consume consumes the next rune in the source.
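|
| +// For illustration: Tokenize creates a zero-valued lexer, so line numbering
|
| +// starts at 0. After consuming the three runes of "a\nb", offset and
|
| +// sourcePos are both 3, lineNo is 1, and linePos is 1 (only "b" has been
|
| +// consumed on the current line).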
|
| +func (l *lexer) Consume() {
|
| + if l.IsEos() {
|
| + return
|
| + }
|
| +
|
| + c, width := utf8.DecodeRuneInString(l.source[l.offset:])
|
| +
|
| + if c == '\n' {
|
| + l.lineNo += 1
|
| + l.linePos = 0
|
| + } else {
|
| + l.linePos += 1
|
| + }
|
| + l.offset += width
|
| + l.sourcePos += 1
|
| +}
|
| +
|
| +// Peek returns the next rune in the source.
|
| +func (l *lexer) Peek() rune {
|
| + // At the end of the string, there is no sane answer to Peek.
|
| + if l.IsEos() {
|
| + return utf8.RuneError
|
| + }
|
| +
|
| + // If RuneError is returned, it will be handled as any other rune, likely
|
| + // resulting in an ErrorIllegalChar token being emitted.
|
| + char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
|
| + return char
|
| +}
|
| +
|
| +// IsEos returns true if the whole source has been consumed and false
|
| +// otherwise.
|
| +func (l *lexer) IsEos() bool {
|
| + return l.offset >= len(l.source)
|
| +}
|
| +
|
| +// run is the lexer's main loop.
|
| +func (l *lexer) run() {
|
| + // We are implementing a state machine.
|
| + // lexRoot is the beginning state.
|
| + // nil is the end state.
|
| + // States are functions which are called on the lexer. They return the
|
| + // next state.
|
| + for state := lexRoot; state != nil; {
|
| + state = state(l)
|
| + }
|
| + close(l.tokens)
|
| +}
|
| +
|
| +// A stateFn represents a state in the lexer state machine.
|
| +type stateFn func(*lexer) stateFn
|
| +
|
| +// lexRoot is the beginning state and also the state to which the lexer
|
| +// returns after most tokens are emitted.
|
| +func lexRoot(l *lexer) stateFn {
|
| + if l.IsEos() {
|
| + return nil
|
| + }
|
| +
|
| + switch c := l.Peek(); {
|
| + case isSingleCharTokens(c):
|
| + return lexSingleCharTokens
|
| + case isEqualsOrResponseStart(c):
|
| + return lexEqualsOrResponse
|
| + case isNameStart(c):
|
| + return lexName
|
| + case isOrdinalStart(c):
|
| + return lexOrdinal
|
| + case isNumberStart(c):
|
| + return lexNumber
|
| + case isStringStart(c):
|
| + return lexString
|
| + case isSkippable(c):
|
| + return lexSkip
|
| + case isMaybeComment(c):
|
| + return lexComment
|
| + }
|
| +
|
| + l.Consume()
|
| + l.emitToken(ErrorIllegalChar)
|
| + return nil
|
| +}
|
| +
|
| +// isSkippable determines if a rune is skippable.
|
| +func isSkippable(c rune) bool {
|
| + return c == ' ' || c == '\t' || c == '\r' || c == '\n'
|
| +}
|
| +
|
| +// lexSkip consumes skippable runes.
|
| +func lexSkip(l *lexer) stateFn {
|
| + for isSkippable(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| + l.beginToken()
|
| + return lexRoot
|
| +}
|
| +
|
| +// singleCharTokens maps each single-rune token to its TokenKind.
|
| +var singleCharTokens = map[rune]TokenKind{
|
| + '(': LParen,
|
| + ')': RParen,
|
| + '[': LBracket,
|
| + ']': RBracket,
|
| + '{': LBrace,
|
| + '}': RBrace,
|
| + '<': LAngle,
|
| + '>': RAngle,
|
| + ';': Semi,
|
| + ',': Comma,
|
| + '.': Dot,
|
| + '-': Minus,
|
| + '+': Plus,
|
| + '&': Amp,
|
| + '?': Qstn,
|
| +}
|
| +
|
| +// isSingleCharTokens returns true if the rune is a single character token.
|
| +func isSingleCharTokens(c rune) bool {
|
| + _, ok := singleCharTokens[c]
|
| + return ok
|
| +}
|
| +
|
| +// lexSingleCharTokens lexes single character tokens.
|
| +func lexSingleCharTokens(l *lexer) stateFn {
|
| + c := l.Peek()
|
| + l.Consume()
|
| + t := singleCharTokens[c]
|
| + l.emitToken(t)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isEqualsOrResponseStart returns true if the rune corresponds to the
|
| +// beginning of either the '=' or '=>' token.
|
| +func isEqualsOrResponseStart(c rune) bool {
|
| + return c == '='
|
| +}
|
| +
|
| +// lexEqualsOrResponse lexes the '=' or the '=>' token.
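|
| +// For example, "=" is emitted as an Equals token and "=>" as a Response
|
| +// token.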
|
| +func lexEqualsOrResponse(l *lexer) stateFn {
|
| + l.Consume()
|
| +
|
| + if l.Peek() == '>' {
|
| + l.Consume()
|
| + l.emitToken(Response)
|
| + } else {
|
| + l.emitToken(Equals)
|
| + }
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isAlpha returns true if the rune is an ASCII letter.
|
| +func isAlpha(c rune) bool {
|
| + return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
|
| +}
|
| +
|
| +// isDigit returns true if the rune is a digit.
|
| +func isDigit(c rune) bool {
|
| + return ('0' <= c && c <= '9')
|
| +}
|
| +
|
| +// isHexDigit returns true if the rune is a hexadecimal digit.
|
| +func isHexDigit(c rune) bool {
|
| + return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
|
| +}
|
| +
|
| +// isNameStart returns true if the rune is the beginning of a Name token.
|
| +func isNameStart(c rune) bool {
|
| + return isAlpha(c) || c == '_'
|
| +}
|
| +
|
| +// keywordTokens maps keywords to their associated token kinds.
|
| +var keywordTokens = map[string]TokenKind{
|
| + "import": Import,
|
| + "module": Module,
|
| + "struct": Struct,
|
| + "union": Union,
|
| + "interface": Interface,
|
| + "enum": Enum,
|
| + "const": Const,
|
| + "true": True,
|
| + "false": False,
|
| + "default": Default,
|
| +}
|
| +
|
| +// lexName lexes valid C identifiers. (K&R2: A.2.3)
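|
| +// For example, "struct" is emitted with the Struct keyword kind, while
|
| +// "structs" or "my_field2" are emitted as Name tokens.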
|
| +func lexName(l *lexer) stateFn {
|
| + l.Consume()
|
| +
|
| + // isNameRune returns true if the rune is valid in a Name token.
|
| + isNameRune := func(c rune) bool {
|
| + return isAlpha(c) || isDigit(c) || c == '_'
|
| + }
|
| +
|
| + for isNameRune(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| +
|
| + // Emit the appropriate keyword token if the current text is a keyword;
|
| + // otherwise emit a Name token.
|
| + if token, found := keywordTokens[l.CurText()]; found {
|
| + l.emitToken(token)
|
| + } else {
|
| + l.emitToken(Name)
|
| + }
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isOrdinalStart returns true if the rune is the beginning of an Ordinal
|
| +// token.
|
| +func isOrdinalStart(c rune) bool {
|
| + return '@' == c
|
| +}
|
| +
|
| +// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
|
| +// or more digits.
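|
| +// For example, "@3" is emitted as a single Ordinal token with Text "@3".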
|
| +func lexOrdinal(l *lexer) stateFn {
|
| + // Consume the '@'.
|
| + l.Consume()
|
| +
|
| + for isDigit(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| +
|
| + l.emitToken(Ordinal)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isNumberStart returns true if the rune is the beginning of a number.
|
| +func isNumberStart(c rune) bool {
|
| + // Even hexadecimals must begin with a digit (namely 0).
|
| + return isDigit(c)
|
| +}
|
| +
|
| +// lexNumber lexes a number token.
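|
| +// For example, "42" is emitted as IntConstDec, "0x2A" as IntConstHex,
|
| +// "0.5" and "1e-3" as FloatConst, and a bare "0" as IntConstDec.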
|
| +func lexNumber(l *lexer) stateFn {
|
| + // A number beginning with 0 is either 0 itself, a hexadecimal or a float.
|
| + if l.Peek() == '0' {
|
| + return lexNumberStartWithZero
|
| + }
|
| + return lexDec
|
| +}
|
| +
|
| +// lexDec lexes a base-10 number.
|
| +func lexDec(l *lexer) stateFn {
|
| + for isDigit(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| +
|
| + // If a decimal part is found, transition to the decimal state.
|
| + if isDecimalPartStart(l.Peek()) {
|
| + return lexDecimalPart
|
| + }
|
| +
|
| + l.emitToken(IntConstDec)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// lexNumberStartWithZero lexes hexadecimals, some floats, or a bare 0.
|
| +func lexNumberStartWithZero(l *lexer) stateFn {
|
| + // Consume the leading 0
|
| + l.Consume()
|
| +
|
| + // Here we check to see whether we are in the hexadecimal or floating
|
| + // point case.
|
| + switch c := l.Peek(); {
|
| + case c == 'x' || c == 'X':
|
| + return lexHexNumber
|
| + case isDecimalPartStart(c):
|
| + return lexDecimalPart
|
| + }
|
| +
|
| + // Found a naked 0.
|
| + l.emitToken(IntConstDec)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// lexHexNumber lexes hexadecimal integers.
|
| +func lexHexNumber(l *lexer) stateFn {
|
| + // Consume the x or X
|
| + l.Consume()
|
| +
|
| + for isHexDigit(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| +
|
| + l.emitToken(IntConstHex)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isDecimalPartStart returns true if the rune represents the beginning of
|
| +// the decimal part of a floating point number.
|
| +func isDecimalPartStart(c rune) bool {
|
| + return c == '.' || c == 'e' || c == 'E'
|
| +}
|
| +
|
| +// lexDecimalPart lexes the decimal part of a floating point number.
|
| +func lexDecimalPart(l *lexer) stateFn {
|
| + // Consume '.' or 'e' or 'E'
|
| + l.Consume()
|
| +
|
| + if c := l.Peek(); c == '+' || c == '-' {
|
| + l.Consume()
|
| + }
|
| +
|
| + for isDigit(l.Peek()) {
|
| + l.Consume()
|
| + }
|
| +
|
| + l.emitToken(FloatConst)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isStringStart returns true if the rune represents the beginning of a string.
|
| +func isStringStart(c rune) bool {
|
| + return '"' == c
|
| +}
|
| +
|
| +// lexString lexes a quoted string.
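|
| +// The emitted StringLiteral's Text includes the surrounding quotes, and
|
| +// escape sequences are left uninterpreted: the backslash and the rune that
|
| +// follows it are consumed verbatim. Hitting end-of-source or a newline
|
| +// before the closing quote yields ErrorUnterminatedStringLiteral.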
|
| +func lexString(l *lexer) stateFn {
|
| + // Consume opening quotes.
|
| + l.Consume()
|
| +
|
| + for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
|
| + if l.Peek() == '\\' {
|
| + // If we see an escape character consume whatever follows blindly.
|
| + // TODO(azani): Consider parsing escape sequences.
|
| + l.Consume()
|
| + }
|
| + l.Consume()
|
| + }
|
| +
|
| + if l.IsEos() || l.Peek() == '\n' {
|
| + l.emitToken(ErrorUnterminatedStringLiteral)
|
| + return nil
|
| + }
|
| +
|
| + // Consume the closing quotes
|
| + l.Consume()
|
| +
|
| + l.emitToken(StringLiteral)
|
| +
|
| + return lexRoot
|
| +}
|
| +
|
| +// isMaybeComment returns true if the rune may be the beginning of a
|
| +// comment.
|
| +func isMaybeComment(c rune) bool {
|
| + return c == '/'
|
| +}
|
| +
|
| +// lexComment consumes a single-line or multi-line comment.
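|
| +// Comments are consumed but never emitted as tokens: once a "//" or "/*"
|
| +// comment has been consumed, beginToken discards the text. A lone '/' that
|
| +// is not followed by '/' or '*' produces an ErrorIllegalChar token.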
|
| +func lexComment(l *lexer) stateFn {
|
| + // Consume the '/'.
|
| + l.Consume()
|
| +
|
| + switch l.Peek() {
|
| + case '/':
|
| + return lexSingleLineComment
|
| + case '*':
|
| + return lexMultiLineComment
|
| + }
|
| +
|
| + l.emitToken(ErrorIllegalChar)
|
| + return nil
|
| +}
|
| +
|
| +// lexSingleLineComment consumes a single-line comment.
|
| +func lexSingleLineComment(l *lexer) stateFn {
|
| + // Consume the '/'
|
| + l.Consume()
|
| +
|
| + for !l.IsEos() && l.Peek() != '\n' {
|
| + l.Consume()
|
| + }
|
| +
|
| + l.beginToken()
|
| + return lexRoot
|
| +}
|
| +
|
| +// lexMultiLineComment consumes a multi-line comment.
|
| +func lexMultiLineComment(l *lexer) stateFn {
|
| + // Consume the '*'.
|
| + l.Consume()
|
| +
|
| + for !l.IsEos() {
|
| + if l.Peek() == '*' {
|
| + return lexPossibleEndOfComment
|
| + }
|
| + l.Consume()
|
| + }
|
| +
|
| + l.emitToken(ErrorUnterminatedComment)
|
| + return nil
|
| +}
|
| +
|
| +// lexPossibleEndOfComment consumes a '*' inside a multi-line comment and
|
| +// determines whether it marks the end of the comment.
|
| +func lexPossibleEndOfComment(l *lexer) stateFn {
|
| + // Consume the '*'
|
| + l.Consume()
|
| +
|
| + if l.IsEos() {
|
| + l.emitToken(ErrorUnterminatedComment)
|
| + return nil
|
| + }
|
| +
|
| + if l.Peek() == '/' {
|
| + l.Consume()
|
| + l.beginToken()
|
| + return lexRoot
|
| + }
|
| +
|
| + return lexMultiLineComment
|
| +}
|
|
|