Chromium Code Reviews

Side by Side Diff: mojom/mojom_parser/lexer/lexer.go

Issue 1387893002: New lexer for mojom written in go. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 2 months ago
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // To use the lexer, call Tokenize with the source string to obtain
6 // a TokenStream. The lexer runs in its own goroutine, so the TokenStream
7 // can be used before the lexer has finished consuming the source.
8 //
9 // The lexer is implemented as a state machine. The states are represented
10 // by functions (the stateFn type) which accept a lexer and return the
11 // new state.
12 //
13 // Most states also have an isFooStart function which helps determine if
14 // a transition to Foo is appropriate. Those functions accept a single
15 // rune as a parameter and return true if the state machine should
16 // transition to state Foo. Some states do not have such functions on
17 // account of the transition condition being trivial.
18 //
19 // The lexer implementation was inspired by
20 // http://cuddle.googlecode.com/hg/talk/lex.html
21
22 package lexer
23
24 import (
25 "unicode/utf8"
26 )
27
28 // Tokenize accepts a source string and lexes it into a stream of tokens which
29 // can be read from the returned TokenStream.
30 func Tokenize(source string) TokenStream {
31 tokens := make(chan Token)
32 l := lexer{source: source, tokens: tokens}
33 go l.run()
34 return &TokenChan{tokenChan: tokens}
35 }
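
Tokenize returns immediately while a goroutine produces tokens in the background, which is what the header comment means by using the TokenStream before lexing is done. A minimal, self-contained sketch of that producer/consumer pattern follows; all names are illustrative, not the real TokenStream API:

package main

import "fmt"

// produce mirrors the shape of Tokenize: start a goroutine that emits
// values on a channel and return the receiving end immediately.
func produce(words []string) <-chan string {
	out := make(chan string)
	go func() {
		for _, w := range words {
			out <- w // blocks until the consumer is ready, like l.tokens <- Token{...}
		}
		close(out) // end of stream, like close(l.tokens) in run()
	}()
	return out
}

func main() {
	// The caller consumes while the producer is still running.
	for w := range produce([]string{"interface", "Foo", "{", "}"}) {
		fmt.Println(w)
	}
}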
36
37 type lexer struct {
38 // source is the source code to be lexed.
39 source string
40
41 // offset is the number of bytes that have been consumed.
42 offset int
43
44 // tokens is a channel to which the found tokens are emitted.
45 tokens chan Token
46
47 // sourcePos is the number of runes that have been consumed.
48 sourcePos int
49
50 // lineno is the current line number.
51 lineNo int
52
53 // linePos is how many runes have been consumed since the beginning of the
54 // line.
55 linePos int
56
57 // curTokenOffset is the number of bytes consumed prior to the beginning of
58 // the current token.
59 curTokenOffset int
60
61 // curTokenSourcePos is the number of runes consumed prior to the beginning
62 // of the current token.
63 curTokenSourcePos int
64
65 // curTokenLineNo is the line number on which the current token begins.
66 curTokenLineNo int
67
68 // curTokenLinePos is the number of runes since the beginning of the line
69 // where the current token begins.
70 curTokenLinePos int
71 }
72
73 // CurText returns the consumed part of the current token.
74 func (l *lexer) CurText() string {
75 return l.source[l.curTokenOffset:l.offset]
76 }
77
78 // emitToken emits the current token and begins a new token.
79 func (l *lexer) emitToken(tokenType TokenKind) {
80 l.tokens <- Token{
81 Kind: tokenType,
82 Text: l.source[l.curTokenOffset:l.offset],
83 CharPos: l.curTokenSourcePos,
84 LineNo: l.curTokenLineNo,
85 LinePos: l.curTokenLinePos}
86 l.beginToken()
87 }
88
89 // beginToken begins the new token.
90 func (l *lexer) beginToken() {
91 l.curTokenOffset = l.offset
92 l.curTokenSourcePos = l.sourcePos
93 l.curTokenLineNo = l.lineNo
94 l.curTokenLinePos = l.linePos
95 }
96
97 // Consume consumes the next rune in the source.
98 func (l *lexer) Consume() {
99 if l.IsEos() {
100 return
101 }
102
103 c, width := utf8.DecodeRuneInString(l.source[l.offset:])
104 if width == 0 {
mattr 2015/10/14 19:29:26 I guess my question was, why increment width here
azani 2015/10/14 22:29:41 It turns out it's not necessary... Gone now and th
105 width = 1
106 }
107
108 if c == '\n' {
109 l.lineNo += 1
110 l.linePos = 0
111 } else {
112 l.linePos += 1
113 }
114 l.offset += width
115 l.sourcePos += 1
116 }
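
Consume advances offset by the rune's byte width but sourcePos by exactly one, because UTF-8 encodes runes in one to four bytes. A standalone sketch showing utf8.DecodeRuneInString and why the two counters diverge on non-ASCII input:

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	s := "a¢€" // one 1-byte, one 2-byte, and one 3-byte rune
	runeIndex := 0
	for byteOffset := 0; byteOffset < len(s); {
		r, width := utf8.DecodeRuneInString(s[byteOffset:])
		fmt.Printf("%q: byte offset %d, rune index %d, width %d\n",
			r, byteOffset, runeIndex, width)
		byteOffset += width // like l.offset += width
		runeIndex++         // like l.sourcePos += 1
	}
}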
117
118 // Peek returns the next rune in the source.
119 func (l *lexer) Peek() rune {
120 // At the end of the string, there is no sane answer to Peek.
121 if l.IsEos() {
122 return utf8.RuneError
123 }
124
125 // If RuneError is returned, it will be handled as any other rune, likely
126 // resulting in an ErrorIllegalChar token being emitted.
127 char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
128 return char
129 }
130
131 // IsEos returns true if the whole source has been consumed, false
132 // otherwise.
133 func (l *lexer) IsEos() bool {
134 return l.offset >= len(l.source)
135 }
136
137 // run is the lexer's main loop.
138 func (l *lexer) run() {
139 // We are implementing a state machine.
140 // lexRoot is the beginning state.
141 // nil is the end state.
142 // States are functions which are called on the lexer. They return the
143 // next state.
144 for state := lexRoot; state != nil; {
145 state = state(l)
146 }
147 close(l.tokens)
148 }
149
150 // A stateFn represents a state in the lexer state machine.
151 type stateFn func(*lexer) stateFn
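
The run loop and the stateFn type follow the pattern from the talk linked in the header: each state function does some work and returns the next state, with nil as the terminal state. A reduced, self-contained sketch of the same machine shape, with purely illustrative states:

package main

import "fmt"

type machine struct {
	input []string
	pos   int
}

type stateFn func(*machine) stateFn

// stateEven and stateOdd alternate; returning nil ends the loop,
// exactly as run() above treats nil as the end state.
func stateEven(m *machine) stateFn {
	if m.pos >= len(m.input) {
		return nil
	}
	fmt.Println("even:", m.input[m.pos])
	m.pos++
	return stateOdd
}

func stateOdd(m *machine) stateFn {
	if m.pos >= len(m.input) {
		return nil
	}
	fmt.Println("odd:", m.input[m.pos])
	m.pos++
	return stateEven
}

func main() {
	m := &machine{input: []string{"a", "b", "c"}}
	for state := stateEven; state != nil; {
		state = state(m)
	}
}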
152
153 // This is the beginning state and also the state which is returned to after
154 // most tokens are emitted.
155 func lexRoot(l *lexer) stateFn {
156 if l.IsEos() {
157 return nil
158 }
159
160 switch c := l.Peek(); {
161 case isSingleCharTokens(c):
162 return lexSingleCharTokens
163 case isEqualsOrResponseStart(c):
164 return lexEqualsOrResponse
165 case isNameStart(c):
166 return lexName
167 case isOrdinalStart(c):
168 return lexOrdinal
169 case isNumberStart(c):
170 return lexNumber
171 case isStringStart(c):
172 return lexString
173 case isSkippable(c):
174 return lexSkip
175 case isMaybeComment(c):
176 return lexComment
177 }
178
179 l.Consume()
180 l.emitToken(ErrorIllegalChar)
181 return nil
182 }
183
184 // isSkippable returns true if the rune is insignificant whitespace.
185 func isSkippable(c rune) bool {
186 return c == ' ' || c == '\t' || c == '\r' || c == '\n'
187 }
188
189 // lexSkip consumes skippable runes.
190 func lexSkip(l *lexer) stateFn {
191 for isSkippable(l.Peek()) {
192 l.Consume()
193 }
194 l.beginToken()
195 return lexRoot
196 }
197
198 // singleCharTokens is a map of single-rune tokens.
199 var singleCharTokens = map[rune]TokenKind{
200 '(': LParen,
201 ')': RParen,
202 '[': LBracket,
203 ']': RBracket,
204 '{': LBrace,
205 '}': RBrace,
206 '<': LAngle,
207 '>': RAngle,
208 ';': Semi,
209 ',': Comma,
210 '.': Dot,
211 '-': Minus,
212 '+': Plus,
213 '&': Amp,
214 '?': Qstn,
215 }
216
217 // isSingleCharTokens returns true if the rune is a single character token.
218 func isSingleCharTokens(c rune) bool {
219 _, ok := singleCharTokens[c]
220 return ok
221 }
222
223 // lexSingleCharTokens lexes single character tokens.
224 func lexSingleCharTokens(l *lexer) stateFn {
225 c := l.Peek()
226 l.Consume()
227 t := singleCharTokens[c]
228 l.emitToken(t)
229
230 return lexRoot
231 }
232
233 // isEqualsOrResponseStart returns true if the rune corresponds to the
234 // beginning of either the '=' or '=>' tokens.
235 func isEqualsOrResponseStart(c rune) bool {
236 return c == '='
237 }
238
239 // lexEqualsOrResponse lexes the '=' or the '=>' token.
240 func lexEqualsOrResponse(l *lexer) stateFn {
241 l.Consume()
242
243 if l.Peek() == '>' {
244 l.Consume()
245 l.emitToken(Response)
246 } else {
247 l.emitToken(Equals)
248 }
249
250 return lexRoot
251 }
252
253 // isAlpha returns true if the rune is a letter of the alphabet.
254 func isAlpha(c rune) bool {
255 return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
256 }
257
258 // isDigit returns true if the rune is a digit.
259 func isDigit(c rune) bool {
260 return ('0' <= c && c <= '9')
261 }
262
263 // isHexDigit returns true if the rune is a hexadecimal digit.
264 func isHexDigit(c rune) bool {
265 return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
266 }
267
268 // isNameStart returns true if the rune is the beginning of a Name token.
269 func isNameStart(c rune) bool {
270 return isAlpha(c) || c == '_'
271 }
272
273 // keywordTokens maps keywords to their associated tokens.
274 var keywordTokens = map[string]TokenKind{
275 "import": Import,
276 "module": Module,
277 "struct": Struct,
278 "union": Union,
279 "interface": Interface,
280 "enum": Enum,
281 "const": Const,
282 "true": True,
283 "false": False,
284 "default": Default,
285 }
286
287 // lexName lexes valid C identifiers. (K&R2: A.2.3)
288 func lexName(l *lexer) stateFn {
289 l.Consume()
290
291 // isNameRune returns true if the rune is valid in a Name token.
292 isNameRune := func(c rune) bool {
293 return isAlpha(c) || isDigit(c) || c == '_'
294 }
295
296 for isNameRune(l.Peek()) {
297 l.Consume()
298 }
299
300 // Emit the appropriate keyword token if the current name is a
301 // keyword or a Name token otherwise.
302 if token, found := keywordTokens[l.CurText()]; found {
303 l.emitToken(token)
304 } else {
305 l.emitToken(Name)
306 }
307
308 return lexRoot
309 }
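
lexName consumes the longest possible identifier first and only then consults keywordTokens, so a keyword followed by more name runes (e.g. "structs") lexes as a single Name rather than a keyword plus a suffix. A small sketch of that lookup, with an illustrative map and kind names:

package main

import "fmt"

// keywords stands in for keywordTokens; the kind strings are illustrative.
var keywords = map[string]string{"struct": "Struct", "enum": "Enum"}

// classify looks up a fully-consumed identifier, as lexName does after
// its consume loop finishes.
func classify(ident string) string {
	if kind, ok := keywords[ident]; ok {
		return kind
	}
	return "Name"
}

func main() {
	fmt.Println(classify("struct"))  // Struct
	fmt.Println(classify("structs")) // Name: the whole identifier was consumed first
}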
310
311 // isOrdinalStart returns true if the rune is the beginning of an Ordinal
312 // token.
313 func isOrdinalStart(c rune) bool {
314 return '@' == c
315 }
316
317 // lexOrdinal lexes an Ordinal token. Ordinals are a '@' followed by one
318 // or more digits.
319 func lexOrdinal(l *lexer) stateFn {
320 // Consume the '@'.
321 l.Consume()
322
323 for isDigit(l.Peek()) {
324 l.Consume()
325 }
326
327 l.emitToken(Ordinal)
328
329 return lexRoot
330 }
331
332 // isNumberStart returns true if the rune is the beginning of a number.
333 func isNumberStart(c rune) bool {
334 // Even hexadecimals must begin with a digit (namely 0).
335 return isDigit(c)
336 }
337
338 // lexNumber lexes a number token.
339 func lexNumber(l *lexer) stateFn {
340 // If the number begins with 0 it is not a multi-digit decimal integer.
341 if l.Peek() == '0' {
342 return lexNumberStartWithZero
343 }
344 return lexDec
345 }
346
347 // lexDec lexes a base-10 number.
348 func lexDec(l *lexer) stateFn {
349 for isDigit(l.Peek()) {
350 l.Consume()
351 }
352
353 // If a decimal part is found, transition to the decimal state.
354 if isDecimalPartStart(l.Peek()) {
355 return lexDecimalPart
356 }
357
358 l.emitToken(IntConstDec)
359
360 return lexRoot
361 }
362
363 // lexNumberStartWithZero lexes hexadecimals, floats that start with 0, or a lone 0.
364 func lexNumberStartWithZero(l *lexer) stateFn {
365 // Consume the leading 0
366 l.Consume()
367
368 // Here we check to see whether we are in the hexadecimal or floating
369 // point case.
370 switch c := l.Peek(); {
371 case c == 'x' || c == 'X':
372 return lexHexNumber
373 case isDecimalPartStart(c):
374 return lexDecimalPart
375 }
376
377 // Found a naked 0.
378 l.emitToken(IntConstDec)
379
380 return lexRoot
381 }
382
383 // lexHexNumber lexes hexadecimal integers.
384 func lexHexNumber(l *lexer) stateFn {
385 // Consume the x or X
386 l.Consume()
387
388 for isHexDigit(l.Peek()) {
389 l.Consume()
390 }
391
392 l.emitToken(IntConstHex)
393
394 return lexRoot
395 }
396
397 // isDecimalPartStart returns true if the rune represents the beginning of
398 // the decimal part of a floating point number.
399 func isDecimalPartStart(c rune) bool {
400 return c == '.' || c == 'e' || c == 'E'
401 }
402
403 // lexDecimalPart lexes the decimal part of a floating point number.
404 func lexDecimalPart(l *lexer) stateFn {
405 // Consume '.' or 'e' or 'E'
406 l.Consume()
407
408 if c := l.Peek(); c == '+' || c == '-' {
409 l.Consume()
410 }
411
412 for isDigit(l.Peek()) {
413 l.Consume()
414 }
415
416 l.emitToken(FloatConst)
417
418 return lexRoot
419 }
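
Together, lexNumber, lexNumberStartWithZero, lexHexNumber, lexDec, and lexDecimalPart classify a literal by its prefix and by whether a '.', 'e', or 'E' appears. A standalone sketch of the resulting classification; the function is illustrative, not part of the lexer, though the kind names match the TokenKinds emitted above:

package main

import (
	"fmt"
	"strings"
)

// numberKind reproduces the dispatch decisions of the number states
// for an already-extracted literal. Illustrative only.
func numberKind(lit string) string {
	switch {
	case strings.HasPrefix(lit, "0x"), strings.HasPrefix(lit, "0X"):
		return "IntConstHex"
	case strings.ContainsAny(lit, ".eE"):
		return "FloatConst"
	default:
		return "IntConstDec"
	}
}

func main() {
	for _, lit := range []string{"42", "0", "0x1F", "3.14", "1e-9"} {
		fmt.Printf("%-5s -> %s\n", lit, numberKind(lit))
	}
}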
420
421 // isStringStart returns true if the rune represents the beginning of a string.
422 func isStringStart(c rune) bool {
423 return '"' == c
424 }
425
426 // lexString lexes a quoted string.
427 func lexString(l *lexer) stateFn {
428 // Consume opening quotes.
429 l.Consume()
430
431 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
432 if l.Peek() == '\\' {
433 // If we see an escape character, consume whatever follows blindly.
434 // TODO(azani): Consider parsing escape sequences.
435 l.Consume()
436 }
437 l.Consume()
438 }
439
440 if l.IsEos() || l.Peek() == '\n' {
441 l.emitToken(ErrorUnterminatedStringLiteral)
442 return nil
443 }
444
445 // Consume the closing quotes
446 l.Consume()
447
448 l.emitToken(StringLiteral)
449
450 return lexRoot
451 }
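
lexString's escape handling is deliberately blind: after a backslash, the next rune is consumed no matter what it is, so an escaped quote cannot terminate the literal. A small standalone scanner applying the same rule (byte-oriented and illustrative only):

package main

import "fmt"

// spanString returns how many bytes a double-quoted literal occupies at
// the start of s, applying the same blind-escape rule as lexString, or
// -1 if the literal is unterminated.
func spanString(s string) int {
	for i := 1; i < len(s); i++ {
		switch s[i] {
		case '\\':
			i++ // blindly skip whatever follows the escape
		case '"':
			return i + 1 // closing quote found
		case '\n':
			return -1 // the line ends before the literal closes
		}
	}
	return -1
}

func main() {
	fmt.Println(spanString(`"a \" b" tail`)) // 8: the escaped quote does not close it
	fmt.Println(spanString(`"oops`))         // -1: unterminated
}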
452
453 // isMaybeComment returns true if the rune may be the beginning of a
454 // comment.
455 func isMaybeComment(c rune) bool {
456 return c == '/'
457 }
458
459 // lexComment consumes a single-line or multi-line comment.
460 func lexComment(l *lexer) stateFn {
461 // Consume the '/'.
462 l.Consume()
463
464 switch l.Peek() {
465 case '/':
466 return lexSingleLineComment
467 case '*':
468 return lexMultiLineComment
469 }
470
471 l.emitToken(ErrorIllegalChar)
472 return nil
473 }
474
475 // lexSingleLineComment consumes a single line comment.
476 func lexSingleLineComment(l *lexer) stateFn {
477 // Consume the '/'
478 l.Consume()
479
480 for !l.IsEos() && l.Peek() != '\n' {
481 l.Consume()
482 }
483
484 l.beginToken()
485 return lexRoot
486 }
487
488 // lexMultiLineComment consumes a multi-line comment.
489 func lexMultiLineComment(l *lexer) stateFn {
490 // Consume the '*'.
491 l.Consume()
492
493 for !l.IsEos() {
494 if l.Peek() == '*' {
495 return lexPossibleEndOfComment
496 }
497 l.Consume()
498 }
499
500 l.emitToken(ErrorUnterminatedComment)
501 return nil
502 }
503
504 // lexPossibleEndOfComment consumes the possible end of a multiline
505 // comment and determines whether the comment has in fact ended.
506 func lexPossibleEndOfComment(l *lexer) stateFn {
507 // Consume the '*'
508 l.Consume()
509
510 if l.IsEos() {
511 l.emitToken(ErrorUnterminatedComment)
512 return nil
513 }
514
515 if l.Peek() == '/' {
516 l.Consume()
517 l.beginToken()
518 return lexRoot
519 }
520
521 return lexMultiLineComment
522 }