OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 // |
| 5 // To use the lexer, call Tokenize with the source string to obtain |
| 6 // a TokenStream. The lexer is run concurrently so you should be able to |
| 7 // use the TokenStream before the lexer is done with the source. |
| 8 // |
| 9 // The lexer is implemented as a state machine. The states are represented |
| 10 // by functions (the stateFn type) which accept a lexer and return the |
| 11 // new state. |
| 12 // |
| 13 // Most states also have an isFooStart function which helps determine if |
| 14 // a transition to Foo is appropriate. Those functions accept a single |
| 15 // rune as a parameter and return true if the state machine should |
| 16 // transition to state Foo. Some states do not have such functions on |
| 17 // account of the transition condition being trivial. |
| 18 // |
| 19 // The lexer implementation was inspired by |
| 20 // http://cuddle.googlecode.com/hg/talk/lex.html |
| 21 |
| 22 package lexer |
| 23 |
| 24 import ( |
| 25 "unicode/utf8" |
| 26 ) |
| 27 |
| 28 // Tokenize accepts a source string and parses it into a stream of tokens which |
| 29 // can be read from the returned TokenStream. |
| 30 func Tokenize(source string) TokenStream { |
| 31 tokens := make(chan Token) |
| 32 l := lexer{source: source, tokens: tokens} |
| 33 go l.run() |
| 34 return &TokenChan{tokenChan: tokens} |
| 35 } |
| 36 |
// lexer holds the state of the tokenizer: the input, the current read
// position expressed in bytes, runes, and line/column coordinates, and the
// start position of the token currently being accumulated.
type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes that have been consumed.
	offset int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token

	// sourcePos is the number of runes that have been consumed.
	sourcePos int

	// lineNo is the current line number (incremented each time a '\n' is
	// consumed).
	lineNo int

	// linePos is how many runes have been consumed since the beginning of
	// the line.
	linePos int

	// curTokenOffset is the number of bytes consumed prior to the
	// beginning of the current token.
	curTokenOffset int

	// curTokenSourcePos is the number of runes consumed prior to the
	// beginning of the current token.
	curTokenSourcePos int

	// curTokenLineNo is the line number on which the current token begins.
	curTokenLineNo int

	// curTokenLinePos is the number of runes since the beginning of the
	// line where the current token begins.
	curTokenLinePos int
}
| 72 |
| 73 // CurText returns the consumed part of the current token. |
| 74 func (l *lexer) CurText() string { |
| 75 return l.source[l.curTokenOffset:l.offset] |
| 76 } |
| 77 |
| 78 // emitToken emits the current token and begins a new token. |
| 79 func (l *lexer) emitToken(tokenType TokenKind) { |
| 80 l.tokens <- Token{ |
| 81 Kind: tokenType, |
| 82 Text: l.source[l.curTokenOffset:l.offset], |
| 83 CharPos: l.curTokenSourcePos, |
| 84 LineNo: l.curTokenLineNo, |
| 85 LinePos: l.curTokenLinePos} |
| 86 l.beginToken() |
| 87 } |
| 88 |
// beginToken begins the new token: the current consumption point (byte
// offset, rune position, line number and column) is recorded as the start
// of the next token to be emitted.
func (l *lexer) beginToken() {
	l.curTokenOffset = l.offset
	l.curTokenSourcePos = l.sourcePos
	l.curTokenLineNo = l.lineNo
	l.curTokenLinePos = l.linePos
}
| 96 |
| 97 // Consume consumes the next rune in the source. |
| 98 func (l *lexer) Consume() { |
| 99 if l.IsEos() { |
| 100 return |
| 101 } |
| 102 |
| 103 c, width := utf8.DecodeRuneInString(l.source[l.offset:]) |
| 104 |
| 105 if c == '\n' { |
| 106 l.lineNo += 1 |
| 107 l.linePos = 0 |
| 108 } else { |
| 109 l.linePos += 1 |
| 110 } |
| 111 l.offset += width |
| 112 l.sourcePos += 1 |
| 113 } |
| 114 |
| 115 // Peek returns the next rune in the source. |
| 116 func (l *lexer) Peek() rune { |
| 117 // At the end of the string, there is no sane answer to Peek. |
| 118 if l.IsEos() { |
| 119 return utf8.RuneError |
| 120 } |
| 121 |
| 122 // If RuneError is returned, it will be handled as any other rune, likel
y |
| 123 // resulting in an ErrorIllegalChar token being emitted. |
| 124 char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) |
| 125 return char |
| 126 } |
| 127 |
| 128 // IsEos returns true if the whole source has been consumed false |
| 129 // otherwise. |
| 130 func (l *lexer) IsEos() bool { |
| 131 return l.offset >= len(l.source) |
| 132 } |
| 133 |
| 134 // run is the lexer's main loop. |
| 135 func (l *lexer) run() { |
| 136 // We are implementing a state machine. |
| 137 // lexRoot is the beginning state. |
| 138 // nil is the end state. |
| 139 // States are functions which are called on the lexer. They return the |
| 140 // next state. |
| 141 for state := lexRoot; state != nil; { |
| 142 state = state(l) |
| 143 } |
| 144 close(l.tokens) |
| 145 } |
| 146 |
// A stateFn represents a state in the lexer state machine. It consumes
// input from the lexer and returns the next state, or nil to terminate.
type stateFn func(*lexer) stateFn
| 149 |
| 150 // This is the beginning state and also the state which is returned to after |
| 151 // most tokens are emitted. |
| 152 func lexRoot(l *lexer) stateFn { |
| 153 if l.IsEos() { |
| 154 return nil |
| 155 } |
| 156 |
| 157 switch c := l.Peek(); { |
| 158 case isSingleCharTokens(c): |
| 159 return lexSingleCharTokens |
| 160 case isEqualsOrResponseStart(c): |
| 161 return lexEqualsOrResponse |
| 162 case isNameStart(c): |
| 163 return lexName |
| 164 case isOrdinalStart(c): |
| 165 return lexOrdinal |
| 166 case isNumberStart(c): |
| 167 return lexNumber |
| 168 case isStringStart(c): |
| 169 return lexString |
| 170 case isSkippable(c): |
| 171 return lexSkip |
| 172 case isMaybeComment(c): |
| 173 return lexComment |
| 174 } |
| 175 |
| 176 l.Consume() |
| 177 l.emitToken(ErrorIllegalChar) |
| 178 return nil |
| 179 } |
| 180 |
// isSkippable reports whether c is whitespace that the lexer discards.
func isSkippable(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
| 185 |
| 186 // lexSkip consumes skippable runes. |
| 187 func lexSkip(l *lexer) stateFn { |
| 188 for isSkippable(l.Peek()) { |
| 189 l.Consume() |
| 190 } |
| 191 l.beginToken() |
| 192 return lexRoot |
| 193 } |
| 194 |
// singleCharTokens maps each rune that forms a complete token on its own
// to the kind of token it produces.
var singleCharTokens = map[rune]TokenKind{
	'(': LParen,
	')': RParen,
	'[': LBracket,
	']': RBracket,
	'{': LBrace,
	'}': RBrace,
	'<': LAngle,
	'>': RAngle,
	';': Semi,
	',': Comma,
	'.': Dot,
	'-': Minus,
	'+': Plus,
	'&': Amp,
	'?': Qstn,
}
| 213 |
| 214 // isSingleCharTokens returns true if the rune is a single character token. |
| 215 func isSingleCharTokens(c rune) bool { |
| 216 _, ok := singleCharTokens[c] |
| 217 return ok |
| 218 } |
| 219 |
| 220 // lexSingleCharTokens lexes single character tokens. |
| 221 func lexSingleCharTokens(l *lexer) stateFn { |
| 222 c := l.Peek() |
| 223 l.Consume() |
| 224 t, _ := singleCharTokens[c] |
| 225 l.emitToken(t) |
| 226 |
| 227 return lexRoot |
| 228 } |
| 229 |
// isEqualsOrResponseStart reports whether c is the beginning of either
// the '=' or the '=>' token.
func isEqualsOrResponseStart(c rune) bool {
	return '=' == c
}
| 235 |
| 236 // lexEqualsOrResponse lexes the '=' or the '=>' token. |
| 237 func lexEqualsOrResponse(l *lexer) stateFn { |
| 238 l.Consume() |
| 239 |
| 240 if l.Peek() == '>' { |
| 241 l.Consume() |
| 242 l.emitToken(Response) |
| 243 } else { |
| 244 l.emitToken(Equals) |
| 245 } |
| 246 |
| 247 return lexRoot |
| 248 } |
| 249 |
// isAlpha reports whether c is an ASCII letter.
func isAlpha(c rune) bool {
	lower := 'a' <= c && c <= 'z'
	upper := 'A' <= c && c <= 'Z'
	return lower || upper
}
| 254 |
// isDigit reports whether c is a decimal digit.
func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
| 259 |
// isHexDigit reports whether c is a hexadecimal digit: a decimal digit or
// a letter in a-f / A-F.
func isHexDigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}
| 264 |
// isNameStart reports whether c may begin a Name token: an ASCII letter
// or an underscore.
func isNameStart(c rune) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'
}
| 269 |
// keywordTokens maps keywords to their associated tokens. Keywords lex
// exactly like names, so lexName consults this table before emitting.
var keywordTokens = map[string]TokenKind{
	"import":    Import,
	"module":    Module,
	"struct":    Struct,
	"union":     Union,
	"interface": Interface,
	"enum":      Enum,
	"const":     Const,
	"true":      True,
	"false":     False,
	"default":   Default,
}
| 283 |
| 284 // lexName lexes valid C identifiers. (K&R2: A.2.3) |
| 285 func lexName(l *lexer) stateFn { |
| 286 l.Consume() |
| 287 |
| 288 // isNameRune returns true if the rune is valid in a Name token. |
| 289 isNameRune := func(c rune) bool { |
| 290 return isAlpha(c) || isDigit(c) || c == '_' |
| 291 } |
| 292 |
| 293 for isNameRune(l.Peek()) { |
| 294 l.Consume() |
| 295 } |
| 296 |
| 297 // Emit the appropriate keyword token if the current name is a |
| 298 // keyword or a Name token otherwise. |
| 299 if token, found := keywordTokens[l.CurText()]; found { |
| 300 l.emitToken(token) |
| 301 } else { |
| 302 l.emitToken(Name) |
| 303 } |
| 304 |
| 305 return lexRoot |
| 306 } |
| 307 |
// isOrdinalStart reports whether c is the beginning of an Ordinal token.
func isOrdinalStart(c rune) bool {
	return c == '@'
}
| 313 |
| 314 // lexOrdinal lexes an Ordinal token. Ordinals are a '@' followed by one |
| 315 // or more digits. |
| 316 func lexOrdinal(l *lexer) stateFn { |
| 317 // Consume the '@'. |
| 318 l.Consume() |
| 319 |
| 320 for isDigit(l.Peek()) { |
| 321 l.Consume() |
| 322 } |
| 323 |
| 324 l.emitToken(Ordinal) |
| 325 |
| 326 return lexRoot |
| 327 } |
| 328 |
// isNumberStart reports whether c can begin a number token. Every numeric
// literal — even a hexadecimal one — starts with a digit (namely 0).
func isNumberStart(c rune) bool {
	return c >= '0' && c <= '9'
}
| 334 |
| 335 // lexNumber lexes a number token. |
| 336 func lexNumber(l *lexer) stateFn { |
| 337 // If the number begins with 0 it cannot be a decimal integer. |
| 338 if l.Peek() == '0' { |
| 339 return lexNumberStartWithZero |
| 340 } |
| 341 return lexDec |
| 342 } |
| 343 |
| 344 // lexDec lexes a base-10 number. |
| 345 func lexDec(l *lexer) stateFn { |
| 346 for isDigit(l.Peek()) { |
| 347 l.Consume() |
| 348 } |
| 349 |
| 350 // If a decimal part is found, transition to the decimal state. |
| 351 if isDecimalPartStart(l.Peek()) { |
| 352 return lexDecimalPart |
| 353 } |
| 354 |
| 355 l.emitToken(IntConstDec) |
| 356 |
| 357 return lexRoot |
| 358 } |
| 359 |
| 360 // lexNumberStartWithZero lexes hexadecimals, some floats or 0. |
| 361 func lexNumberStartWithZero(l *lexer) stateFn { |
| 362 // Consume the leading 0 |
| 363 l.Consume() |
| 364 |
| 365 // Here we check to see whether we are in the hexadecimal or floating |
| 366 // point case. |
| 367 switch c := l.Peek(); { |
| 368 case c == 'x' || c == 'X': |
| 369 return lexHexNumber |
| 370 case isDecimalPartStart(c): |
| 371 return lexDecimalPart |
| 372 } |
| 373 |
| 374 // Found a naked 0. |
| 375 l.emitToken(IntConstDec) |
| 376 |
| 377 return lexRoot |
| 378 } |
| 379 |
| 380 // lexHexNumber lexes hexadecimal integers. |
| 381 func lexHexNumber(l *lexer) stateFn { |
| 382 // Consume the x or X |
| 383 l.Consume() |
| 384 |
| 385 for isHexDigit(l.Peek()) { |
| 386 l.Consume() |
| 387 } |
| 388 |
| 389 l.emitToken(IntConstHex) |
| 390 |
| 391 return lexRoot |
| 392 } |
| 393 |
// isDecimalPartStart reports whether c begins the fractional or exponent
// part of a floating point literal.
func isDecimalPartStart(c rune) bool {
	switch c {
	case '.', 'e', 'E':
		return true
	}
	return false
}
| 399 |
| 400 // lexDecimalPart lexes the decimal part of a floating point number. |
| 401 func lexDecimalPart(l *lexer) stateFn { |
| 402 // Consume '.' or 'e' or 'E' |
| 403 l.Consume() |
| 404 |
| 405 if c := l.Peek(); c == '+' || c == '-' { |
| 406 l.Consume() |
| 407 } |
| 408 |
| 409 for isDigit(l.Peek()) { |
| 410 l.Consume() |
| 411 } |
| 412 |
| 413 l.emitToken(FloatConst) |
| 414 |
| 415 return lexRoot |
| 416 } |
| 417 |
// isStringStart reports whether c begins a quoted string literal.
func isStringStart(c rune) bool {
	return c == '"'
}
| 422 |
| 423 // lexString lexes a quoted string. |
| 424 func lexString(l *lexer) stateFn { |
| 425 // Consume opening quotes. |
| 426 l.Consume() |
| 427 |
| 428 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { |
| 429 if l.Peek() == '\\' { |
| 430 // If we see an escape character consume whatever follow
s blindly. |
| 431 // TODO(azani): Consider parsing escape sequences. |
| 432 l.Consume() |
| 433 } |
| 434 l.Consume() |
| 435 } |
| 436 |
| 437 if l.IsEos() || l.Peek() == '\n' { |
| 438 l.emitToken(ErrorUnterminatedStringLiteral) |
| 439 return nil |
| 440 } |
| 441 |
| 442 // Consume the closing quotes |
| 443 l.Consume() |
| 444 |
| 445 l.emitToken(StringLiteral) |
| 446 |
| 447 return lexRoot |
| 448 } |
| 449 |
// isMaybeComment reports whether c could begin a comment ("//" or "/*").
func isMaybeComment(c rune) bool {
	return '/' == c
}
| 455 |
| 456 // lexComment consumes a single-line or multi-line comment. |
| 457 func lexComment(l *lexer) stateFn { |
| 458 // Consume the '/'. |
| 459 l.Consume() |
| 460 |
| 461 switch l.Peek() { |
| 462 case '/': |
| 463 return lexSingleLineComment |
| 464 case '*': |
| 465 return lexMultiLineComment |
| 466 } |
| 467 |
| 468 l.emitToken(ErrorIllegalChar) |
| 469 return nil |
| 470 } |
| 471 |
| 472 // lexSingleLineComment consumes a single line comment. |
| 473 func lexSingleLineComment(l *lexer) stateFn { |
| 474 // Consume the '/' |
| 475 l.Consume() |
| 476 |
| 477 for !l.IsEos() && l.Peek() != '\n' { |
| 478 l.Consume() |
| 479 } |
| 480 |
| 481 l.beginToken() |
| 482 return lexRoot |
| 483 } |
| 484 |
| 485 // lexMultiLineComment consumes a multi-line comment. |
| 486 func lexMultiLineComment(l *lexer) stateFn { |
| 487 // Consume the '*'. |
| 488 l.Consume() |
| 489 |
| 490 for !l.IsEos() { |
| 491 if l.Peek() == '*' { |
| 492 return lexPossibleEndOfComment |
| 493 } |
| 494 l.Consume() |
| 495 } |
| 496 |
| 497 l.emitToken(ErrorUnterminatedComment) |
| 498 return nil |
| 499 } |
| 500 |
| 501 // lexPossibleEndOfComment consumes the possible end of a multiline |
| 502 // comment and determines whether the comment in fact ended or not. |
| 503 func lexPossibleEndOfComment(l *lexer) stateFn { |
| 504 // Consume the '*' |
| 505 l.Consume() |
| 506 |
| 507 if l.IsEos() { |
| 508 l.emitToken(ErrorUnterminatedComment) |
| 509 return nil |
| 510 } |
| 511 |
| 512 if l.Peek() == '/' { |
| 513 l.Consume() |
| 514 l.beginToken() |
| 515 return lexRoot |
| 516 } |
| 517 |
| 518 return lexMultiLineComment |
| 519 } |
OLD | NEW |