Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(24)

Side by Side Diff: mojom/mojom_parser/lexer/lexer.go

Issue 1387893002: New lexer for mojom written in go. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | mojom/mojom_parser/lexer/lexer_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // To use the lexer, call Tokenize with the source string to obtain
6 // a TokenStream. The lexer is run concurrently so you should be able to
7 // use the TokenStream before the lexer is done with the source.
8 //
9 // The lexer is implemented as a state machine. The states are represented
10 // by functions (the stateFn type) which accept a lexer and return the
11 // new state.
12 //
13 // Most states also have an isFooStart function which helps determine if
14 // a transition to Foo is appropriate. Those functions accept a single
15 // rune as a parameter and return true if the state machine should
16 // transition to state Foo. Some states do not have such functions on
17 // account of the transition condition being trivial.
18 //
19 // The lexer implementation was inspired by
20 // http://cuddle.googlecode.com/hg/talk/lex.html
21
22 package lexer
23
24 import (
25 "unicode/utf8"
26 )
27
28 // Tokenize accepts a source string and parses it into a stream of tokens which
29 // can be read from the returned TokenStream.
30 func Tokenize(source string) TokenStream {
31 tokens := make(chan Token)
32 l := lexer{source: source, tokens: tokens}
33 go l.run()
34 return &TokenChan{tokenChan: tokens}
35 }
36
// lexer holds the state of a single tokenization run over one source
// string. It tracks positions in three parallel units: bytes (for slicing
// the source), runes (for reporting character positions), and line/column
// (for reporting locations). The curToken* fields snapshot those counters
// at the point where the token currently being scanned began.
type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes that have been consumed.
	offset int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token

	// sourcePos is the number of runes that have been consumed.
	sourcePos int

	// lineNo is the current line number (zero-based; it starts at the Go
	// zero value and is incremented on each '\n').
	lineNo int

	// linePos is how many runes have been consumed since the beginning of
	// the line.
	linePos int

	// curTokenOffset is the number of bytes consumed prior to the
	// beginning of the current token.
	curTokenOffset int

	// curTokenSourcePos is the number of runes consumed prior to the
	// beginning of the current token.
	curTokenSourcePos int

	// curTokenLineNo is the line number on which the current token begins.
	curTokenLineNo int

	// curTokenLinePos is the number of runes since the beginning of the
	// line where the current token begins.
	curTokenLinePos int
}
72
73 // CurText returns the consumed part of the current token.
74 func (l *lexer) CurText() string {
75 return l.source[l.curTokenOffset:l.offset]
76 }
77
78 // emitToken emits the current token and begins a new token.
79 func (l *lexer) emitToken(tokenType TokenKind) {
80 l.tokens <- Token{
81 Kind: tokenType,
82 Text: l.source[l.curTokenOffset:l.offset],
83 CharPos: l.curTokenSourcePos,
84 LineNo: l.curTokenLineNo,
85 LinePos: l.curTokenLinePos}
86 l.beginToken()
87 }
88
89 // beginToken begins the new token.
90 func (l *lexer) beginToken() {
91 l.curTokenOffset = l.offset
92 l.curTokenSourcePos = l.sourcePos
93 l.curTokenLineNo = l.lineNo
94 l.curTokenLinePos = l.linePos
95 }
96
97 // Consume consumes the next rune in the source.
98 func (l *lexer) Consume() {
99 if l.IsEos() {
100 return
101 }
102
103 c, width := utf8.DecodeRuneInString(l.source[l.offset:])
104
105 if c == '\n' {
106 l.lineNo += 1
107 l.linePos = 0
108 } else {
109 l.linePos += 1
110 }
111 l.offset += width
112 l.sourcePos += 1
113 }
114
115 // Peek returns the next rune in the source.
116 func (l *lexer) Peek() rune {
117 // At the end of the string, there is no sane answer to Peek.
118 if l.IsEos() {
119 return utf8.RuneError
120 }
121
122 // If RuneError is returned, it will be handled as any other rune, likel y
123 // resulting in an ErrorIllegalChar token being emitted.
124 char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
125 return char
126 }
127
128 // IsEos returns true if the whole source has been consumed false
129 // otherwise.
130 func (l *lexer) IsEos() bool {
131 return l.offset >= len(l.source)
132 }
133
134 // run is the lexer's main loop.
135 func (l *lexer) run() {
136 // We are implementing a state machine.
137 // lexRoot is the beginning state.
138 // nil is the end state.
139 // States are functions which are called on the lexer. They return the
140 // next state.
141 for state := lexRoot; state != nil; {
142 state = state(l)
143 }
144 close(l.tokens)
145 }
146
// A stateFn represents a state in the lexer state machine: it consumes
// some input from the lexer and returns the next state, or nil to halt.
type stateFn func(*lexer) stateFn
149
150 // This is the beginning state and also the state which is returned to after
151 // most tokens are emitted.
152 func lexRoot(l *lexer) stateFn {
153 if l.IsEos() {
154 return nil
155 }
156
157 switch c := l.Peek(); {
158 case isSingleCharTokens(c):
159 return lexSingleCharTokens
160 case isEqualsOrResponseStart(c):
161 return lexEqualsOrResponse
162 case isNameStart(c):
163 return lexName
164 case isOrdinalStart(c):
165 return lexOrdinal
166 case isNumberStart(c):
167 return lexNumber
168 case isStringStart(c):
169 return lexString
170 case isSkippable(c):
171 return lexSkip
172 case isMaybeComment(c):
173 return lexComment
174 }
175
176 l.Consume()
177 l.emitToken(ErrorIllegalChar)
178 return nil
179 }
180
// isSkippable reports whether the rune is whitespace the lexer discards:
// space, tab, carriage return or newline.
func isSkippable(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
185
186 // lexSkip consumes skippable runes.
187 func lexSkip(l *lexer) stateFn {
188 for isSkippable(l.Peek()) {
189 l.Consume()
190 }
191 l.beginToken()
192 return lexRoot
193 }
194
// singleCharTokens maps each rune that is a complete token by itself to
// the TokenKind emitted for it (see lexSingleCharTokens).
var singleCharTokens = map[rune]TokenKind{
	'(': LParen,
	')': RParen,
	'[': LBracket,
	']': RBracket,
	'{': LBrace,
	'}': RBrace,
	'<': LAngle,
	'>': RAngle,
	';': Semi,
	',': Comma,
	'.': Dot,
	'-': Minus,
	'+': Plus,
	'&': Amp,
	'?': Qstn,
}
213
214 // isSingleCharTokens returns true if the rune is a single character token.
215 func isSingleCharTokens(c rune) bool {
216 _, ok := singleCharTokens[c]
217 return ok
218 }
219
220 // lexSingleCharTokens lexes single character tokens.
221 func lexSingleCharTokens(l *lexer) stateFn {
222 c := l.Peek()
223 l.Consume()
224 t, _ := singleCharTokens[c]
225 l.emitToken(t)
226
227 return lexRoot
228 }
229
// isEqualsOrResponseStart reports whether the rune begins either the '='
// or the '=>' token.
func isEqualsOrResponseStart(c rune) bool {
	return '=' == c
}
235
236 // lexEqualsOrResponse lexes the '=' or the '=>' token.
237 func lexEqualsOrResponse(l *lexer) stateFn {
238 l.Consume()
239
240 if l.Peek() == '>' {
241 l.Consume()
242 l.emitToken(Response)
243 } else {
244 l.emitToken(Equals)
245 }
246
247 return lexRoot
248 }
249
// isAlpha reports whether the rune is an ASCII letter.
func isAlpha(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
254
// isDigit reports whether the rune is an ASCII decimal digit.
func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
259
// isHexDigit reports whether the rune is a hexadecimal digit in either
// case: 0-9, a-f or A-F.
func isHexDigit(c rune) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}
264
// isNameStart reports whether the rune may begin a Name token: an ASCII
// letter or an underscore.
func isNameStart(c rune) bool {
	if c == '_' {
		return true
	}
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}
269
// keywordTokens maps mojom keywords to their associated token kinds. A
// lexed identifier not present in this map is emitted as a plain Name
// token (see lexName).
var keywordTokens = map[string]TokenKind{
	"import":    Import,
	"module":    Module,
	"struct":    Struct,
	"union":     Union,
	"interface": Interface,
	"enum":      Enum,
	"const":     Const,
	"true":      True,
	"false":     False,
	"default":   Default,
}
283
284 // lexName lexes valid C identifiers. (K&R2: A.2.3)
285 func lexName(l *lexer) stateFn {
286 l.Consume()
287
288 // isNameRune returns true if the rune is valid in a Name token.
289 isNameRune := func(c rune) bool {
290 return isAlpha(c) || isDigit(c) || c == '_'
291 }
292
293 for isNameRune(l.Peek()) {
294 l.Consume()
295 }
296
297 // Emit the appropriate keyword token if the current name is a
298 // keyword or a Name token otherwise.
299 if token, found := keywordTokens[l.CurText()]; found {
300 l.emitToken(token)
301 } else {
302 l.emitToken(Name)
303 }
304
305 return lexRoot
306 }
307
// isOrdinalStart reports whether the rune begins an Ordinal token.
func isOrdinalStart(c rune) bool {
	return c == '@'
}
313
314 // lexOrdinal lexes an Ordinal token. Ordinals are a '@' followed by one
315 // or more digits.
316 func lexOrdinal(l *lexer) stateFn {
317 // Consume the '@'.
318 l.Consume()
319
320 for isDigit(l.Peek()) {
321 l.Consume()
322 }
323
324 l.emitToken(Ordinal)
325
326 return lexRoot
327 }
328
// isNumberStart reports whether the rune begins a number token. Even
// hexadecimal literals start with a digit (namely 0).
func isNumberStart(c rune) bool {
	return c >= '0' && c <= '9'
}
334
335 // lexNumber lexes a number token.
336 func lexNumber(l *lexer) stateFn {
337 // If the number begins with 0 it cannot be a decimal integer.
338 if l.Peek() == '0' {
339 return lexNumberStartWithZero
340 }
341 return lexDec
342 }
343
344 // lexDec lexes a base-10 number.
345 func lexDec(l *lexer) stateFn {
346 for isDigit(l.Peek()) {
347 l.Consume()
348 }
349
350 // If a decimal part is found, transition to the decimal state.
351 if isDecimalPartStart(l.Peek()) {
352 return lexDecimalPart
353 }
354
355 l.emitToken(IntConstDec)
356
357 return lexRoot
358 }
359
360 // lexNumberStartWithZero lexes hexadecimals, some floats or 0.
361 func lexNumberStartWithZero(l *lexer) stateFn {
362 // Consume the leading 0
363 l.Consume()
364
365 // Here we check to see whether we are in the hexadecimal or floating
366 // point case.
367 switch c := l.Peek(); {
368 case c == 'x' || c == 'X':
369 return lexHexNumber
370 case isDecimalPartStart(c):
371 return lexDecimalPart
372 }
373
374 // Found a naked 0.
375 l.emitToken(IntConstDec)
376
377 return lexRoot
378 }
379
380 // lexHexNumber lexes hexadecimal integers.
381 func lexHexNumber(l *lexer) stateFn {
382 // Consume the x or X
383 l.Consume()
384
385 for isHexDigit(l.Peek()) {
386 l.Consume()
387 }
388
389 l.emitToken(IntConstHex)
390
391 return lexRoot
392 }
393
// isDecimalPartStart reports whether the rune begins the fractional or
// exponent part of a floating point number.
func isDecimalPartStart(c rune) bool {
	switch c {
	case '.', 'e', 'E':
		return true
	}
	return false
}
399
400 // lexDecimalPart lexes the decimal part of a floating point number.
401 func lexDecimalPart(l *lexer) stateFn {
402 // Consume '.' or 'e' or 'E'
403 l.Consume()
404
405 if c := l.Peek(); c == '+' || c == '-' {
406 l.Consume()
407 }
408
409 for isDigit(l.Peek()) {
410 l.Consume()
411 }
412
413 l.emitToken(FloatConst)
414
415 return lexRoot
416 }
417
// isStringStart reports whether the rune begins a string literal.
func isStringStart(c rune) bool {
	return c == '"'
}
422
423 // lexString lexes a quoted string.
424 func lexString(l *lexer) stateFn {
425 // Consume opening quotes.
426 l.Consume()
427
428 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
429 if l.Peek() == '\\' {
430 // If we see an escape character consume whatever follow s blindly.
431 // TODO(azani): Consider parsing escape sequences.
432 l.Consume()
433 }
434 l.Consume()
435 }
436
437 if l.IsEos() || l.Peek() == '\n' {
438 l.emitToken(ErrorUnterminatedStringLiteral)
439 return nil
440 }
441
442 // Consume the closing quotes
443 l.Consume()
444
445 l.emitToken(StringLiteral)
446
447 return lexRoot
448 }
449
// isMaybeComment reports whether the rune may begin a comment ('/').
func isMaybeComment(c rune) bool {
	return '/' == c
}
455
456 // lexComment consumes a single-line or multi-line comment.
457 func lexComment(l *lexer) stateFn {
458 // Consume the '/'.
459 l.Consume()
460
461 switch l.Peek() {
462 case '/':
463 return lexSingleLineComment
464 case '*':
465 return lexMultiLineComment
466 }
467
468 l.emitToken(ErrorIllegalChar)
469 return nil
470 }
471
472 // lexSingleLineComment consumes a single line comment.
473 func lexSingleLineComment(l *lexer) stateFn {
474 // Consume the '/'
475 l.Consume()
476
477 for !l.IsEos() && l.Peek() != '\n' {
478 l.Consume()
479 }
480
481 l.beginToken()
482 return lexRoot
483 }
484
485 // lexMultiLineComment consumes a multi-line comment.
486 func lexMultiLineComment(l *lexer) stateFn {
487 // Consume the '*'.
488 l.Consume()
489
490 for !l.IsEos() {
491 if l.Peek() == '*' {
492 return lexPossibleEndOfComment
493 }
494 l.Consume()
495 }
496
497 l.emitToken(ErrorUnterminatedComment)
498 return nil
499 }
500
501 // lexPossibleEndOfComment consumes the possible end of a multiline
502 // comment and determines whether the comment in fact ended or not.
503 func lexPossibleEndOfComment(l *lexer) stateFn {
504 // Consume the '*'
505 l.Consume()
506
507 if l.IsEos() {
508 l.emitToken(ErrorUnterminatedComment)
509 return nil
510 }
511
512 if l.Peek() == '/' {
513 l.Consume()
514 l.beginToken()
515 return lexRoot
516 }
517
518 return lexMultiLineComment
519 }
OLDNEW
« no previous file with comments | « no previous file | mojom/mojom_parser/lexer/lexer_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698