// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// To use the lexer, call Tokenize with the source string to obtain
// a TokenStream. The lexer is run concurrently, so you should be able to
// use the TokenStream before the lexer is done with the source.
//
// The lexer is implemented as a state machine. The states are represented
// by functions (the stateFn type) which accept a lexer and return the
// new state.
//
// Most states also have an isFooStart function which helps determine if
// a transition to Foo is appropriate. Those functions accept a single
// rune as a parameter and return true if the state machine should
// transition to state Foo. Some states do not have such functions on
// account of the transition condition being trivial.
//
// The lexer implementation was inspired by
// http://cuddle.googlecode.com/hg/talk/lex.html

package lexer

import (
	"unicode/utf8"
)

// Tokenize accepts a source string and parses it into a stream of tokens which
// can be read from the returned TokenStream.
func Tokenize(source string) TokenStream {
	tokens := make(chan Token)
	l := lexer{source: source, tokens: tokens}
	go l.run()
	return &TokenChan{tokenChan: tokens}
}
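
// Example usage (a sketch only: TokenStream is defined elsewhere in this
// package, and the PeekNext/ConsumeNext accessors shown below are assumed
// rather than confirmed by this file):
//
//	ts := Tokenize("interface Frobinator { Frobinate(); };")
//	token := ts.PeekNext() // inspect the next token without consuming it
//	ts.ConsumeNext()       // advance past it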

type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes that have been consumed.
	offset int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token

	// sourcePos is the number of runes that have been consumed.
	sourcePos int

	// lineNo is the current line number.
	lineNo int

	// linePos is how many runes have been consumed since the beginning of the
	// line.
	linePos int

	// curTokenOffset is the number of bytes consumed prior to the beginning of
	// the current token.
	curTokenOffset int

	// curTokenSourcePos is the number of runes consumed prior to the beginning
	// of the current token.
	curTokenSourcePos int

	// curTokenLineNo is the line number on which the current token begins.
	curTokenLineNo int

	// curTokenLinePos is the number of runes since the beginning of the line
	// where the current token begins.
	curTokenLinePos int
}
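
// offset counts bytes while sourcePos counts runes, so the two diverge on
// multi-byte UTF-8 input. A small worked example (behavior derived from
// Consume below):
//
//	l := lexer{source: "é=1"} // 'é' is two bytes in UTF-8
//	l.Consume()               // offset == 2, sourcePos == 1
//	l.Consume()               // offset == 3, sourcePos == 2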

// CurText returns the consumed part of the current token.
func (l *lexer) CurText() string {
	return l.source[l.curTokenOffset:l.offset]
}

// emitToken emits the current token and begins a new token.
func (l *lexer) emitToken(tokenType TokenKind) {
	l.tokens <- Token{
		Kind:    tokenType,
		Text:    l.source[l.curTokenOffset:l.offset],
		CharPos: l.curTokenSourcePos,
		LineNo:  l.curTokenLineNo,
		LinePos: l.curTokenLinePos}
	l.beginToken()
}

// beginToken begins the new token.
func (l *lexer) beginToken() {
	l.curTokenOffset = l.offset
	l.curTokenSourcePos = l.sourcePos
	l.curTokenLineNo = l.lineNo
	l.curTokenLinePos = l.linePos
}

// Consume consumes the next rune in the source.
func (l *lexer) Consume() {
	if l.IsEos() {
		return
	}

	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
	if width == 0 {
		// [Review thread anchored on the line above]
		// mattr (2015/10/14 19:29:26): I guess my question was, why increment width here
		// azani (2015/10/14 22:29:41): It turns out it's not necessary... Gone now and th
		width = 1
	}

	if c == '\n' {
		l.lineNo += 1
		l.linePos = 0
	} else {
		l.linePos += 1
	}
	l.offset += width
	l.sourcePos += 1
}
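
// How Consume tracks line positions (derived from the code above): a '\n'
// advances lineNo and resets linePos; any other rune advances linePos.
//
//	l := lexer{source: "a\nb"}
//	l.Consume() // 'a':  lineNo == 0, linePos == 1
//	l.Consume() // '\n': lineNo == 1, linePos == 0
//	l.Consume() // 'b':  lineNo == 1, linePos == 1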

// Peek returns the next rune in the source.
func (l *lexer) Peek() rune {
	// At the end of the string, there is no sane answer to Peek.
	if l.IsEos() {
		return utf8.RuneError
	}

	// If RuneError is returned, it will be handled as any other rune, likely
	// resulting in an ErrorIllegalChar token being emitted.
	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
	return char
}

// IsEos returns true if the whole source has been consumed, false
// otherwise.
func (l *lexer) IsEos() bool {
	return l.offset >= len(l.source)
}

// run is the lexer's main loop.
func (l *lexer) run() {
	// We are implementing a state machine.
	// lexRoot is the beginning state.
	// nil is the end state.
	// States are functions which are called on the lexer. They return the
	// next state.
	for state := lexRoot; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}

// A stateFn represents a state in the lexer state machine.
type stateFn func(*lexer) stateFn

// lexRoot is the beginning state and also the state to which the machine
// returns after most tokens are emitted.
func lexRoot(l *lexer) stateFn {
	if l.IsEos() {
		return nil
	}

	switch c := l.Peek(); {
	case isSingleCharTokens(c):
		return lexSingleCharTokens
	case isEqualsOrResponseStart(c):
		return lexEqualsOrResponse
	case isNameStart(c):
		return lexName
	case isOrdinalStart(c):
		return lexOrdinal
	case isNumberStart(c):
		return lexNumber
	case isStringStart(c):
		return lexString
	case isSkippable(c):
		return lexSkip
	case isMaybeComment(c):
		return lexComment
	}

	l.Consume()
	l.emitToken(ErrorIllegalChar)
	return nil
}

// isSkippable determines if a rune is skippable.
func isSkippable(c rune) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

// lexSkip consumes skippable runes.
func lexSkip(l *lexer) stateFn {
	for isSkippable(l.Peek()) {
		l.Consume()
	}
	l.beginToken()
	return lexRoot
}

// singleCharTokens is a map of single-rune tokens.
var singleCharTokens = map[rune]TokenKind{
	'(': LParen,
	')': RParen,
	'[': LBracket,
	']': RBracket,
	'{': LBrace,
	'}': RBrace,
	'<': LAngle,
	'>': RAngle,
	';': Semi,
	',': Comma,
	'.': Dot,
	'-': Minus,
	'+': Plus,
	'&': Amp,
	'?': Qstn,
}

// isSingleCharTokens returns true if the rune is a single-character token.
func isSingleCharTokens(c rune) bool {
	_, ok := singleCharTokens[c]
	return ok
}

// lexSingleCharTokens lexes single-character tokens.
func lexSingleCharTokens(l *lexer) stateFn {
	c := l.Peek()
	l.Consume()
	t := singleCharTokens[c]
	l.emitToken(t)

	return lexRoot
}

// isEqualsOrResponseStart returns true if the rune corresponds to the
// beginning of either the '=' or '=>' tokens.
func isEqualsOrResponseStart(c rune) bool {
	return c == '='
}

// lexEqualsOrResponse lexes the '=' or the '=>' token.
func lexEqualsOrResponse(l *lexer) stateFn {
	l.Consume()

	if l.Peek() == '>' {
		l.Consume()
		l.emitToken(Response)
	} else {
		l.emitToken(Equals)
	}

	return lexRoot
}

// isAlpha returns true if the rune is a letter of the alphabet.
func isAlpha(c rune) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// isDigit returns true if the rune is a digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// isHexDigit returns true if the rune is a hexadecimal digit.
func isHexDigit(c rune) bool {
	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
}

// isNameStart returns true if the rune is the beginning of a Name token.
func isNameStart(c rune) bool {
	return isAlpha(c) || c == '_'
}

// keywordTokens maps keywords to their associated tokens.
var keywordTokens = map[string]TokenKind{
	"import":    Import,
	"module":    Module,
	"struct":    Struct,
	"union":     Union,
	"interface": Interface,
	"enum":      Enum,
	"const":     Const,
	"true":      True,
	"false":     False,
	"default":   Default,
}

// lexName lexes valid C identifiers. (K&R2: A.2.3)
func lexName(l *lexer) stateFn {
	l.Consume()

	// isNameRune returns true if the rune is valid in a Name token.
	isNameRune := func(c rune) bool {
		return isAlpha(c) || isDigit(c) || c == '_'
	}

	for isNameRune(l.Peek()) {
		l.Consume()
	}

	// Emit the appropriate keyword token if the current name is a
	// keyword, or a Name token otherwise.
	if token, found := keywordTokens[l.CurText()]; found {
		l.emitToken(token)
	} else {
		l.emitToken(Name)
	}

	return lexRoot
}
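
// For example, lexName emits Interface for "interface" but Name for
// "interfaces" or "Frobinator": only an exact match of the whole
// identifier hits the keywordTokens table.
//
//	Tokenize("interface Frobinator") // emits Interface, then Name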

// isOrdinalStart returns true if the rune is the beginning of an Ordinal
// token.
func isOrdinalStart(c rune) bool {
	return '@' == c
}

// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
// or more digits.
func lexOrdinal(l *lexer) stateFn {
	// Consume the '@'.
	l.Consume()

	for isDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(Ordinal)

	return lexRoot
}

// isNumberStart returns true if the rune is the beginning of a number.
func isNumberStart(c rune) bool {
	// Even hexadecimals must begin with a digit (namely 0).
	return isDigit(c)
}

// lexNumber lexes a number token.
func lexNumber(l *lexer) stateFn {
	// If the number begins with 0, it cannot be a decimal integer.
	if l.Peek() == '0' {
		return lexNumberStartWithZero
	}
	return lexDec
}

// lexDec lexes a base-10 number.
func lexDec(l *lexer) stateFn {
	for isDigit(l.Peek()) {
		l.Consume()
	}

	// If a decimal part is found, transition to the decimal state.
	if isDecimalPartStart(l.Peek()) {
		return lexDecimalPart
	}

	l.emitToken(IntConstDec)

	return lexRoot
}

// lexNumberStartWithZero lexes hexadecimals, some floats or 0.
func lexNumberStartWithZero(l *lexer) stateFn {
	// Consume the leading 0.
	l.Consume()

	// Here we check to see whether we are in the hexadecimal or floating
	// point case.
	switch c := l.Peek(); {
	case c == 'x' || c == 'X':
		return lexHexNumber
	case isDecimalPartStart(c):
		return lexDecimalPart
	}

	// Found a naked 0.
	l.emitToken(IntConstDec)

	return lexRoot
}

// lexHexNumber lexes hexadecimal integers.
func lexHexNumber(l *lexer) stateFn {
	// Consume the x or X.
	l.Consume()

	for isHexDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(IntConstHex)

	return lexRoot
}

// isDecimalPartStart returns true if the rune represents the beginning of
// the decimal part of a floating point number.
func isDecimalPartStart(c rune) bool {
	return c == '.' || c == 'e' || c == 'E'
}

// lexDecimalPart lexes the decimal part of a floating point number.
func lexDecimalPart(l *lexer) stateFn {
	// Consume '.' or 'e' or 'E'.
	l.Consume()

	if c := l.Peek(); c == '+' || c == '-' {
		l.Consume()
	}

	for isDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(FloatConst)

	return lexRoot
}
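
// Examples of how the number states combine (all derivable from the code
// above):
//
//	Tokenize("42")   // IntConstDec "42"
//	Tokenize("0")    // IntConstDec "0" (the naked-zero case)
//	Tokenize("0x2A") // IntConstHex "0x2A"
//	Tokenize("3.14") // FloatConst "3.14"
//	Tokenize("1e-9") // FloatConst "1e-9"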

// isStringStart returns true if the rune represents the beginning of a string.
func isStringStart(c rune) bool {
	return '"' == c
}

// lexString lexes a quoted string.
func lexString(l *lexer) stateFn {
	// Consume opening quotes.
	l.Consume()

	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
		if l.Peek() == '\\' {
			// If we see an escape character, consume whatever follows blindly.
			// TODO(azani): Consider parsing escape sequences.
			l.Consume()
		}
		l.Consume()
	}

	if l.IsEos() || l.Peek() == '\n' {
		l.emitToken(ErrorUnterminatedStringLiteral)
		return nil
	}

	// Consume the closing quotes.
	l.Consume()

	l.emitToken(StringLiteral)

	return lexRoot
}
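
// For example (backquotes delimit Go source, so the lexed text includes
// the double quotes):
//
//	Tokenize(`"say \"hi\""`) // StringLiteral with text `"say \"hi\""`
//	Tokenize(`"abc`)         // ErrorUnterminatedStringLiteral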

// isMaybeComment returns true if the rune may be the beginning of a
// comment.
func isMaybeComment(c rune) bool {
	return c == '/'
}

// lexComment consumes a single-line or multi-line comment.
func lexComment(l *lexer) stateFn {
	// Consume the '/'.
	l.Consume()

	switch l.Peek() {
	case '/':
		return lexSingleLineComment
	case '*':
		return lexMultiLineComment
	}

	l.emitToken(ErrorIllegalChar)
	return nil
}

// lexSingleLineComment consumes a single-line comment.
func lexSingleLineComment(l *lexer) stateFn {
	// Consume the second '/'.
	l.Consume()

	for !l.IsEos() && l.Peek() != '\n' {
		l.Consume()
	}

	l.beginToken()
	return lexRoot
}

// lexMultiLineComment consumes a multi-line comment.
func lexMultiLineComment(l *lexer) stateFn {
	// Consume the '*' (or, when re-entered from lexPossibleEndOfComment,
	// the rune that followed a lone '*').
	l.Consume()

	for !l.IsEos() {
		if l.Peek() == '*' {
			return lexPossibleEndOfComment
		}
		l.Consume()
	}

	l.emitToken(ErrorUnterminatedComment)
	return nil
}

// lexPossibleEndOfComment consumes the possible end of a multi-line
// comment and determines whether the comment in fact ended or not.
func lexPossibleEndOfComment(l *lexer) stateFn {
	// Consume the '*'.
	l.Consume()

	if l.IsEos() {
		l.emitToken(ErrorUnterminatedComment)
		return nil
	}

	if l.Peek() == '/' {
		l.Consume()
		l.beginToken()
		return lexRoot
	}

	return lexMultiLineComment
}
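
// A '*' inside a comment only terminates it when followed by '/', which is
// why lexPossibleEndOfComment falls back to lexMultiLineComment. For
// example:
//
//	Tokenize("/* a * b */ x") // comment skipped; only Name "x" is emitted
//	Tokenize("/* a *")        // ErrorUnterminatedComment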