Chromium Code Reviews

Unified Diff: mojom/mojom_parser/lexer/lexer.go

Issue 1387893002: New lexer for mojom written in go. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 2 months ago
Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..b210b2bbeed5039abb13a78b6b690194230b1f81
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,519 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To use the lexer, call Tokenize with the source string to obtain
+// a TokenStream. The lexer runs in its own goroutine, so the returned
+// TokenStream can be consumed before the lexer has finished with the
+// source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune and return true if the state machine should transition to state
+// Foo. Some states lack such a function because their transition
+// condition is trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+ "unicode/utf8"
+)
+
+// Tokenize accepts a source string and lexes it into a stream of tokens
+// which can be read from the returned TokenStream.
+func Tokenize(source string) TokenStream {
+ tokens := make(chan Token)
+ l := lexer{source: source, tokens: tokens}
+ go l.run()
+ return &TokenChan{tokenChan: tokens}
+}
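+
+// A minimal usage sketch (illustrative only; the read API of TokenStream is
+// defined in a sibling file and is assumed here rather than shown):
+//
+//   ts := Tokenize("module foo;")
+//   // Read tokens from ts one at a time until the stream is exhausted.
+//   // Each Token carries Kind, Text, CharPos, LineNo and LinePos.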
+
+type lexer struct {
+ // source is the source code to be lexed.
+ source string
+
+ // offset is the number of bytes that have been consumed.
+ offset int
+
+ // tokens is a channel to which the found tokens are emitted.
+ tokens chan Token
+
+ // sourcePos is the number of runes that have been consumed.
+ sourcePos int
+
+ // lineNo is the current line number.
+ lineNo int
+
+ // linePos is how many runes have been consumed since the beginning of the
+ // line.
+ linePos int
+
+ // curTokenOffset is the number of bytes consumed prior to the beginning of
+ // the current token.
+ curTokenOffset int
+
+ // curTokenSourcePos is the number of runes consumed prior to the beginning of
+ // the current token.
+ curTokenSourcePos int
+
+ // curTokenLineNo is the line number on which the current token begins.
+ curTokenLineNo int
+
+ // curTokenLinePos is the number of runes since the beginning of the line
+ // where the current token begins.
+ curTokenLinePos int
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+ return l.source[l.curTokenOffset:l.offset]
+}
+
+// emitToken emits the current token and begins a new token.
+func (l *lexer) emitToken(tokenType TokenKind) {
+ l.tokens <- Token{
+ Kind: tokenType,
+ Text: l.source[l.curTokenOffset:l.offset],
+ CharPos: l.curTokenSourcePos,
+ LineNo: l.curTokenLineNo,
+ LinePos: l.curTokenLinePos}
+ l.beginToken()
+}
+
+// beginToken marks the beginning of a new token at the current position.
+func (l *lexer) beginToken() {
+ l.curTokenOffset = l.offset
+ l.curTokenSourcePos = l.sourcePos
+ l.curTokenLineNo = l.lineNo
+ l.curTokenLinePos = l.linePos
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+ if l.IsEos() {
+ return
+ }
+
+ c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+
+ if c == '\n' {
+ l.lineNo++
+ l.linePos = 0
+ } else {
+ l.linePos++
+ }
+ l.offset += width
+ l.sourcePos++
+}
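+
+// As an illustration, consuming the two-byte rune 'é' advances offset by 2
+// (bytes) but sourcePos and linePos by only 1 (runes), so the CharPos and
+// LinePos recorded on emitted tokens count runes rather than bytes.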
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+ // At the end of the string, there is no sane answer to Peek.
+ if l.IsEos() {
+ return utf8.RuneError
+ }
+
+ // If RuneError is returned, it will be handled as any other rune, likely
+ // resulting in an ErrorIllegalChar token being emitted.
+ char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
+ return char
+}
+
+// IsEos returns true if the whole source has been consumed, false
+// otherwise.
+func (l *lexer) IsEos() bool {
+ return l.offset >= len(l.source)
+}
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+ // We are implementing a state machine.
+ // lexRoot is the beginning state.
+ // nil is the end state.
+ // States are functions which are called on the lexer. They return the
+ // next state.
+ for state := lexRoot; state != nil; {
+ state = state(l)
+ }
+ close(l.tokens)
+}
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
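+
+// As an illustration, lexing the source "=>" proceeds as follows: lexRoot
+// peeks '=' and returns lexEqualsOrResponse, which consumes '=', peeks '>',
+// consumes it, emits a Response token and returns lexRoot; lexRoot then
+// finds the end of the source and returns nil, stopping the loop in run.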
+
+// lexRoot is the beginning state and also the state to which the lexer
+// returns after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+ if l.IsEos() {
+ return nil
+ }
+
+ switch c := l.Peek(); {
+ case isSingleCharTokens(c):
+ return lexSingleCharTokens
+ case isEqualsOrResponseStart(c):
+ return lexEqualsOrResponse
+ case isNameStart(c):
+ return lexName
+ case isOrdinalStart(c):
+ return lexOrdinal
+ case isNumberStart(c):
+ return lexNumber
+ case isStringStart(c):
+ return lexString
+ case isSkippable(c):
+ return lexSkip
+ case isMaybeComment(c):
+ return lexComment
+ }
+
+ l.Consume()
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// isSkippable returns true if the rune is whitespace that separates tokens:
+// space, tab, carriage return or newline.
+func isSkippable(c rune) bool {
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+ for isSkippable(l.Peek()) {
+ l.Consume()
+ }
+ l.beginToken()
+ return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+ '(': LParen,
+ ')': RParen,
+ '[': LBracket,
+ ']': RBracket,
+ '{': LBrace,
+ '}': RBrace,
+ '<': LAngle,
+ '>': RAngle,
+ ';': Semi,
+ ',': Comma,
+ '.': Dot,
+ '-': Minus,
+ '+': Plus,
+ '&': Amp,
+ '?': Qstn,
+}
+
+// isSingleCharTokens returns true if the rune forms a single-character token.
+func isSingleCharTokens(c rune) bool {
+ _, ok := singleCharTokens[c]
+ return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+ c := l.Peek()
+ l.Consume()
+ t := singleCharTokens[c]
+ l.emitToken(t)
+
+ return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+ return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+ l.Consume()
+
+ if l.Peek() == '>' {
+ l.Consume()
+ l.emitToken(Response)
+ } else {
+ l.emitToken(Equals)
+ }
+
+ return lexRoot
+}
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+ return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+ return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+ return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a Name token.
+func isNameStart(c rune) bool {
+ return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+ "import": Import,
+ "module": Module,
+ "struct": Struct,
+ "union": Union,
+ "interface": Interface,
+ "enum": Enum,
+ "const": Const,
+ "true": True,
+ "false": False,
+ "default": Default,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+ l.Consume()
+
+ // isNameRune returns true if the rune is valid in a Name token.
+ isNameRune := func(c rune) bool {
+ return isAlpha(c) || isDigit(c) || c == '_'
+ }
+
+ for isNameRune(l.Peek()) {
+ l.Consume()
+ }
+
+ // Emit the corresponding keyword token if the current text is a keyword,
+ // or a Name token otherwise.
+ if token, found := keywordTokens[l.CurText()]; found {
+ l.emitToken(token)
+ } else {
+ l.emitToken(Name)
+ }
+
+ return lexRoot
+}
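+
+// For example, "interface" is emitted as the Interface keyword token, while
+// "interface2" and "_foo" are emitted as Name tokens.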
+
+// isOrdinalStart returns true if the rune is the beginning of an Ordinal
+// token.
+func isOrdinalStart(c rune) bool {
+ return '@' == c
+}
+
+// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+ // Consume the '@'.
+ l.Consume()
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(Ordinal)
+
+ return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+ // Even hexadecimals must begin with a digit (namely 0).
+ return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+ // A number that begins with 0 is the literal 0 itself, a hexadecimal or
+ // a float; it cannot be a multi-digit decimal integer.
+ if l.Peek() == '0' {
+ return lexNumberStartWithZero
+ }
+ return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ // If a decimal part is found, transition to the decimal state.
+ if isDecimalPartStart(l.Peek()) {
+ return lexDecimalPart
+ }
+
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats, or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+ // Consume the leading 0.
+ l.Consume()
+
+ // Here we check to see whether we are in the hexadecimal or floating
+ // point case.
+ switch c := l.Peek(); {
+ case c == 'x' || c == 'X':
+ return lexHexNumber
+ case isDecimalPartStart(c):
+ return lexDecimalPart
+ }
+
+ // Found a naked 0.
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
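+
+// For example, "0" is emitted as IntConstDec, "0x1f" and "0XAB" as
+// IntConstHex, and "0.5" as FloatConst.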
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+ // Consume the 'x' or 'X'.
+ l.Consume()
+
+ for isHexDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(IntConstHex)
+
+ return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the beginning of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+ return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part (the fraction or the exponent) of a
+// floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+ // Consume the '.' or the 'e' or 'E'.
+ exponent := l.Peek() != '.'
+ l.Consume()
+
+ // A sign is only permitted immediately after the 'e' or 'E'.
+ if c := l.Peek(); exponent && (c == '+' || c == '-') {
+ l.Consume()
+ }
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ // A fractional part may itself be followed by an exponent part.
+ if c := l.Peek(); !exponent && (c == 'e' || c == 'E') {
+ return lexDecimalPart
+ }
+
+ l.emitToken(FloatConst)
+
+ return lexRoot
+}
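+
+// For example, "10.5", "1e10" and "0.5E-2" each lex to a single FloatConst
+// token; the fraction-then-exponent transition above is what keeps
+// "0.5E-2" together as one token.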
+
+// isStringStart returns true if the rune represents the beginning of a string.
+func isStringStart(c rune) bool {
+ return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+ // Consume the opening quote.
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+ if l.Peek() == '\\' {
+ // If we see an escape character consume whatever follows blindly.
+ // TODO(azani): Consider parsing escape sequences.
+ l.Consume()
+ }
+ l.Consume()
+ }
+
+ if l.IsEos() || l.Peek() == '\n' {
+ l.emitToken(ErrorUnterminatedStringLiteral)
+ return nil
+ }
+
+ // Consume the closing quote.
+ l.Consume()
+
+ l.emitToken(StringLiteral)
+
+ return lexRoot
+}
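+
+// For example, the six-character input "a\"b" (quote, 'a', backslash, quote,
+// 'b', quote) lexes to a single StringLiteral token because the escaped
+// quote does not terminate the string, whereas a string still open at a
+// newline or the end of the source yields ErrorUnterminatedStringLiteral.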
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+ return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+ // Consume the '/'.
+ l.Consume()
+
+ switch l.Peek() {
+ case '/':
+ return lexSingleLineComment
+ case '*':
+ return lexMultiLineComment
+ }
+
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+ // Consume the second '/'.
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '\n' {
+ l.Consume()
+ }
+
+ l.beginToken()
+ return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+ // Consume the '*' that opened the comment or, on re-entry from
+ // lexPossibleEndOfComment, an ordinary comment rune.
+ l.Consume()
+
+ for !l.IsEos() {
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+ l.Consume()
+ }
+
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+}
+
+// lexPossibleEndOfComment consumes what may be the end of a multi-line
+// comment and determines whether the comment has in fact ended.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+ // Consume the '*'
+ l.Consume()
+
+ if l.IsEos() {
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+ }
+
+ switch l.Peek() {
+ case '/':
+ // The comment is over.
+ l.Consume()
+ l.beginToken()
+ return lexRoot
+ case '*':
+ // Another '*' may still immediately precede the closing '/'; without
+ // this case an input such as "/* a **/" would be reported as an
+ // unterminated comment.
+ return lexPossibleEndOfComment
+ }
+
+ return lexMultiLineComment
+}
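+
+// For example, in "/* a **/" the first '*' of "**/" moves the machine to
+// lexPossibleEndOfComment, which consumes it, sees another '*' and stays in
+// this state, consumes that '*' as well, then sees '/' and ends the comment.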