Index: mojom/mojom_parser/lexer/lexer.go
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000000000000000000000000000000000000..b210b2bbeed5039abb13a78b6b690194230b1f81
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,519 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To use the lexer, call Tokenize with the source string to obtain
+// a TokenStream. The lexer runs concurrently in its own goroutine, so the
+// TokenStream can be consumed before the lexer is done with the source.
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which helps determine if
+// a transition to Foo is appropriate. Those functions accept a single
+// rune as a parameter and return true if the state machine should
+// transition to state Foo. Some states do not have such functions on
+// account of the transition condition being trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
+
+package lexer
+
+import (
+	"unicode/utf8"
+)
+
+// Tokenize accepts a source string and lexes it into a stream of tokens which
+// can be read from the returned TokenStream.
+func Tokenize(source string) TokenStream {
+	tokens := make(chan Token)
+	l := lexer{source: source, tokens: tokens}
+	go l.run()
+	return &TokenChan{tokenChan: tokens}
+}
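+
+// A usage sketch. The TokenStream interface and token kinds are defined
+// elsewhere in this package; the PeekNext/ConsumeNext methods and the EOF
+// kind below are assumptions made for illustration only:
+//
+//	ts := Tokenize("module foo;")
+//	for ts.PeekNext().Kind != EOF { // EOF: assumed end-of-stream sentinel.
+//		t := ts.PeekNext()
+//		// t.Text, t.LineNo and t.LinePos describe and locate the token.
+//		ts.ConsumeNext()
+//	}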
+
+type lexer struct {
+	// source is the source code to be lexed.
+	source string
+
+	// offset is the number of bytes that have been consumed.
+	offset int
+
+	// tokens is a channel to which the found tokens are emitted.
+	tokens chan Token
+
+	// sourcePos is the number of runes that have been consumed.
+	sourcePos int
+
+	// lineNo is the current line number.
+	lineNo int
+
+	// linePos is how many runes have been consumed since the beginning of the
+	// line.
+	linePos int
+
+	// curTokenOffset is the number of bytes consumed prior to the beginning of
+	// the current token.
+	curTokenOffset int
+
+	// curTokenSourcePos is the number of runes consumed prior to the beginning of
+	// the current token.
+	curTokenSourcePos int
+
+	// curTokenLineNo is the line number on which the current token begins.
+	curTokenLineNo int
+
+	// curTokenLinePos is the number of runes since the beginning of the line
+	// where the current token begins.
+	curTokenLinePos int
+}
+
+// CurText returns the text of the current token consumed so far.
+func (l *lexer) CurText() string {
+	return l.source[l.curTokenOffset:l.offset]
+}
+
+// emitToken emits the current token and begins a new token.
+func (l *lexer) emitToken(tokenType TokenKind) {
+	l.tokens <- Token{
+		Kind:    tokenType,
+		Text:    l.source[l.curTokenOffset:l.offset],
+		CharPos: l.curTokenSourcePos,
+		LineNo:  l.curTokenLineNo,
+		LinePos: l.curTokenLinePos}
+	l.beginToken()
+}
+
+// beginToken marks the beginning of a new token at the current position.
+func (l *lexer) beginToken() {
+	l.curTokenOffset = l.offset
+	l.curTokenSourcePos = l.sourcePos
+	l.curTokenLineNo = l.lineNo
+	l.curTokenLinePos = l.linePos
+}
+
+// Consume consumes the next rune in the source.
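+// Note that offset advances by the rune's byte width while sourcePos and
+// linePos advance by one rune; e.g. consuming the two-byte rune 'é'
+// increases offset by 2 but sourcePos by only 1.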
+func (l *lexer) Consume() {
+	if l.IsEos() {
+		return
+	}
+
+	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+
+	if c == '\n' {
+		l.lineNo++
+		l.linePos = 0
+	} else {
+		l.linePos++
+	}
+	l.offset += width
+	l.sourcePos++
+}
+
+// Peek returns the next rune in the source without consuming it.
+func (l *lexer) Peek() rune {
+	// At the end of the string, there is no sane answer to Peek.
+	if l.IsEos() {
+		return utf8.RuneError
+	}
+
+	// If RuneError is returned, it will be handled as any other rune, likely
+	// resulting in an ErrorIllegalChar token being emitted.
+	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
+	return char
+}
+
+// IsEos returns true if the whole source has been consumed and false
+// otherwise.
+func (l *lexer) IsEos() bool {
+	return l.offset >= len(l.source)
+}
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+	// We are implementing a state machine.
+	// lexRoot is the beginning state.
+	// nil is the end state.
+	// States are functions which are called on the lexer. They return the
+	// next state.
+	for state := lexRoot; state != nil; {
+		state = state(l)
+	}
+	close(l.tokens)
+}
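+
+// For example, lexing "module foo;" walks the states
+// lexRoot -> lexName (emits Module) -> lexRoot -> lexSkip -> lexRoot ->
+// lexName (emits Name) -> lexRoot -> lexSingleCharTokens (emits Semi) ->
+// lexRoot -> nil.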
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// lexRoot is the beginning state and also the state to which the machine
+// returns after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+	if l.IsEos() {
+		return nil
+	}
+
+	switch c := l.Peek(); {
+	case isSingleCharTokens(c):
+		return lexSingleCharTokens
+	case isEqualsOrResponseStart(c):
+		return lexEqualsOrResponse
+	case isNameStart(c):
+		return lexName
+	case isOrdinalStart(c):
+		return lexOrdinal
+	case isNumberStart(c):
+		return lexNumber
+	case isStringStart(c):
+		return lexString
+	case isSkippable(c):
+		return lexSkip
+	case isMaybeComment(c):
+		return lexComment
+	}
+
+	l.Consume()
+	l.emitToken(ErrorIllegalChar)
+	return nil
+}
+
+// isSkippable returns true if the rune is whitespace that can be skipped.
+func isSkippable(c rune) bool {
+	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+	for isSkippable(l.Peek()) {
+		l.Consume()
+	}
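+	// No token is emitted for the skipped runes; resetting the token start
+	// simply drops them.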
+	l.beginToken()
+	return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+	'(': LParen,
+	')': RParen,
+	'[': LBracket,
+	']': RBracket,
+	'{': LBrace,
+	'}': RBrace,
+	'<': LAngle,
+	'>': RAngle,
+	';': Semi,
+	',': Comma,
+	'.': Dot,
+	'-': Minus,
+	'+': Plus,
+	'&': Amp,
+	'?': Qstn,
+}
+
+// isSingleCharTokens returns true if the rune is a single-character token.
+func isSingleCharTokens(c rune) bool {
+	_, ok := singleCharTokens[c]
+	return ok
+}
+
+// lexSingleCharTokens lexes single-character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+	c := l.Peek()
+	l.Consume()
+	t := singleCharTokens[c]
+	l.emitToken(t)
+
+	return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+	return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+	l.Consume()
+
+	if l.Peek() == '>' {
+		l.Consume()
+		l.emitToken(Response)
+	} else {
+		l.emitToken(Equals)
+	}
+
+	return lexRoot
+}
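+
+// For example, "a=>b" lexes to Name, Response, Name, while "a = b" lexes to
+// Name, Equals, Name.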
+
+// isAlpha returns true if the rune is a letter of the alphabet.
+func isAlpha(c rune) bool {
+	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+	return '0' <= c && c <= '9'
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a Name token.
+func isNameStart(c rune) bool {
+	return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+	"import":    Import,
+	"module":    Module,
+	"struct":    Struct,
+	"union":     Union,
+	"interface": Interface,
+	"enum":      Enum,
+	"const":     Const,
+	"true":      True,
+	"false":     False,
+	"default":   Default,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+	l.Consume()
+
+	// isNameRune returns true if the rune is valid in a Name token.
+	isNameRune := func(c rune) bool {
+		return isAlpha(c) || isDigit(c) || c == '_'
+	}
+
+	for isNameRune(l.Peek()) {
+		l.Consume()
+	}
+
+	// Emit the appropriate keyword token if the current text is a keyword,
+	// or a Name token otherwise.
+	if token, found := keywordTokens[l.CurText()]; found {
+		l.emitToken(token)
+	} else {
+		l.emitToken(Name)
+	}
+
+	return lexRoot
+}
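+
+// For example, "interface" lexes to an Interface token, while "interfaces"
+// and "Interface" both lex to Name tokens.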
+
+// isOrdinalStart returns true if the rune is the beginning of an Ordinal
+// token.
+func isOrdinalStart(c rune) bool {
+	return c == '@'
+}
+
+// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+	// Consume the '@'.
+	l.Consume()
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(Ordinal)
+
+	return lexRoot
+}
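+
+// For example, in "field@3" the "@3" lexes to a single Ordinal token.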
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+	// Even hexadecimals must begin with a digit (namely 0).
+	return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+	// A number that begins with 0 is either 0 itself, a hexadecimal, or a
+	// float, so it needs special handling.
+	if l.Peek() == '0' {
+		return lexNumberStartWithZero
+	}
+	return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	// If a decimal part is found, transition to the decimal state.
+	if isDecimalPartStart(l.Peek()) {
+		return lexDecimalPart
+	}
+
+	l.emitToken(IntConstDec)
+
+	return lexRoot
+}
+
+// lexNumberStartWithZero lexes hexadecimals, some floats, or 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+	// Consume the leading 0.
+	l.Consume()
+
+	// Check whether we are in the hexadecimal or floating-point case.
+	switch c := l.Peek(); {
+	case c == 'x' || c == 'X':
+		return lexHexNumber
+	case isDecimalPartStart(c):
+		return lexDecimalPart
+	}
+
+	// Found a naked 0.
+	l.emitToken(IntConstDec)
+
+	return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+	// Consume the x or X.
+	l.Consume()
+
+	for isHexDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(IntConstHex)
+
+	return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the beginning of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+	return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+	// Consume '.' or 'e' or 'E'.
+	l.Consume()
+
+	if c := l.Peek(); c == '+' || c == '-' {
+		l.Consume()
+	}
+
+	for isDigit(l.Peek()) {
+		l.Consume()
+	}
+
+	l.emitToken(FloatConst)
+
+	return lexRoot
+}
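+
+// For example, "42" lexes to IntConstDec, "0x1F" to IntConstHex, and "3.14",
+// "1e-10" and "0.5" each to FloatConst.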
+
+// isStringStart returns true if the rune represents the beginning of a string.
+func isStringStart(c rune) bool {
+	return c == '"'
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+	// Consume the opening quote.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+		if l.Peek() == '\\' {
+			// If we see an escape character, consume whatever follows blindly.
+			// TODO(azani): Consider parsing escape sequences.
+			l.Consume()
+		}
+		l.Consume()
+	}
+
+	if l.IsEos() || l.Peek() == '\n' {
+		l.emitToken(ErrorUnterminatedStringLiteral)
+		return nil
+	}
+
+	// Consume the closing quote.
+	l.Consume()
+
+	l.emitToken(StringLiteral)
+
+	return lexRoot
+}
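+
+// Note that the emitted StringLiteral text includes the surrounding quotes
+// and leaves escape sequences unprocessed.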
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+	return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+	// Consume the '/'.
+	l.Consume()
+
+	switch l.Peek() {
+	case '/':
+		return lexSingleLineComment
+	case '*':
+		return lexMultiLineComment
+	}
+
+	l.emitToken(ErrorIllegalChar)
+	return nil
+}
+
+// lexSingleLineComment consumes a single-line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+	// Consume the second '/'.
+	l.Consume()
+
+	for !l.IsEos() && l.Peek() != '\n' {
+		l.Consume()
+	}
+
+	l.beginToken()
+	return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	for !l.IsEos() {
+		if l.Peek() == '*' {
+			return lexPossibleEndOfComment
+		}
+		l.Consume()
+	}
+
+	l.emitToken(ErrorUnterminatedComment)
+	return nil
+}
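+
+// Note that multi-line comments do not nest: in "/* a /* b */ c */" the
+// comment ends at the first "*/" and " c */" is lexed as ordinary tokens.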
+
+// lexPossibleEndOfComment consumes the possible end of a multi-line
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+	// Consume the '*'.
+	l.Consume()
+
+	if l.IsEos() {
+		l.emitToken(ErrorUnterminatedComment)
+		return nil
+	}
+
+	if l.Peek() == '/' {
+		l.Consume()
+		l.beginToken()
+		return lexRoot
+	}
+
+	return lexMultiLineComment
+}