Chromium Code Reviews

Side by Side Diff: mojom/mojom_parser/lexer/lexer.go

Issue 1387893002: New lexer for mojom written in go. (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 2 months ago
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // To call the lexer, call Tokenize with the source string to obtain
rudominer 2015/10/12 18:06:05 To use the lexer, call...
azani 2015/10/13 00:23:46 Done.
6 // a TokenStream. The lexer is run concurrently so you should be able to
7 // use the TokenStream before the lexer is done with the source.
8 //
9 // The lexer is implemented as a state machine. The states are represented
10 // by functions (the stateFn type) which accept a lexer and return the
11 // new state.
12 //
13 // Most states also have an isFooStart function which helps determine if
14 // a transition to Foo is appropriate. Those functions accept a single
15 // rune as a parameter and return true if the state machine should
16 // transition to state Foo. Some states do not have such functions on
17 // account of the transition condition being trivial.
18 //
19 // The lexer implementation was inspired by
20 // http://cuddle.googlecode.com/hg/talk/lex.html
21
22 package lexer
23
24 import (
25 "unicode/utf8"
26 )
27
28 // Tokenize accepts a source string and returns the tokens which compose
rudominer 2015/10/12 18:06:05 The phrasing belies the asynchronous nature. Also
azani 2015/10/13 00:23:46 Done.
29 // the source string.
30 func Tokenize(source string) TokenStream {
31 tokens := make(chan Token)
32 l := lexer{source: source, tokens: tokens}
33 go l.run()
34 return &TokenChan{tokenChan: tokens}
35 }
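
The header comment describes two techniques worth seeing in isolation: states modeled as functions that return the next state, and tokens emitted on a channel so the consumer can start reading before lexing finishes. Below is a minimal, self-contained sketch of that pattern, not part of this CL; all names (toy, item, stateRoot, stateNumber) are invented for illustration.

package main

import "fmt"

// item is a toy token: a kind plus the matched text.
type item struct{ kind, text string }

// toy is a stripped-down lexer over a plain ASCII string.
type toy struct {
    src   string
    pos   int // byte offset of the next unconsumed character
    start int // byte offset where the current token began
    out   chan item
}

// toyState mirrors stateFn: a state is a function returning the next state.
type toyState func(*toy) toyState

func (t *toy) emit(kind string) {
    t.out <- item{kind, t.src[t.start:t.pos]}
    t.start = t.pos
}

// stateRoot skips non-digits and dispatches to stateNumber on a digit.
func stateRoot(t *toy) toyState {
    if t.pos >= len(t.src) {
        return nil // end state
    }
    if c := t.src[t.pos]; '0' <= c && c <= '9' {
        return stateNumber
    }
    t.pos++ // anything else is skipped, not emitted
    t.start = t.pos
    return stateRoot
}

// stateNumber consumes a run of digits and emits it as one item.
func stateNumber(t *toy) toyState {
    for t.pos < len(t.src) && '0' <= t.src[t.pos] && t.src[t.pos] <= '9' {
        t.pos++
    }
    t.emit("NUMBER")
    return stateRoot
}

func main() {
    t := &toy{src: "a 12 b 345", out: make(chan item)}
    go func() { // run the state machine concurrently, as run() does below
        for state := toyState(stateRoot); state != nil; {
            state = state(t)
        }
        close(t.out)
    }()
    for it := range t.out { // prints: NUMBER 12, then NUMBER 345
        fmt.Println(it.kind, it.text)
    }
}
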
36
37 type lexer struct {
38 // source is the source code to be lexed.
rudominer 2015/10/12 18:06:05 nit: You don't need to repeat the variable name wi
azani 2015/10/12 18:18:07 Hey Matt, could you settle that question for us? I
39 source string
40
41 // offset is the number of characters that have been consumed.
rudominer 2015/10/12 18:06:05 Let's follow the go standard terminology and avoid
azani 2015/10/13 00:23:45 Done.
42 offset int
43
44 // lineno is the current line number.
45 lineNo int
46
47 // lineOffset is how many characters have been consumed since the
48 // beginning of the line.
49 lineOffset int
rudominer 2015/10/12 18:06:05 I think it would be clearer if we didn't over use
azani 2015/10/13 00:23:45 Done.
50
51 // start is the index in source where the current token begins.
rudominer 2015/10/12 18:06:05 Maybe currentTokenStartIndex or currentTokenStart?
azani 2015/10/13 00:23:46 Done.
52 start int
53
54 // lineStart is the line number on which the current token begins.
55 lineStart int
rudominer 2015/10/12 18:06:05 Maybe currentTokenLineNumber?
azani 2015/10/13 00:23:45 Done.
56
57 // lineOffsetStart is the number of characters since the beginning
58 // of the line where the current token begins.
59 lineOffsetStart int
rudominer 2015/10/12 18:06:05 Maybe currentTokenColumn?
azani 2015/10/13 00:23:45 Done.
60
61 // tokens is a channel on which the found tokens are emitted.
rudominer 2015/10/12 18:06:05 maybe "to which the found..."
azani 2015/10/13 00:23:45 Done.
62 tokens chan Token
63 }
64
65 // CurText returns the consumed part of the current token.
66 func (l *lexer) CurText() string {
67 return l.source[l.start:l.offset]
68 }
69
70 // emitToken emits the current token and begins the new token.
rudominer 2015/10/12 18:06:05 begins a new token
azani 2015/10/13 00:23:46 Done.
71 func (l *lexer) emitToken(tokenType TokenKind) {
72 l.tokens <- Token{
73 Kind: tokenType,
74 Text: l.source[l.start:l.offset],
75 CharPos: l.start,
76 LineNo: l.lineStart,
77 LinePos: l.lineOffsetStart}
78 l.startToken()
79 }
80
81 // startToken starts the new token.
rudominer 2015/10/12 18:06:05 starts a new token
azani 2015/10/13 00:23:45 Done.
82 func (l *lexer) startToken() {
83 l.start = l.offset
84 l.lineStart = l.lineNo
85 l.lineOffsetStart = l.lineOffset
86 }
87
88 // Consume consumes the next rune in the source.
89 func (l *lexer) Consume() {
90 c, width := utf8.DecodeRuneInString(l.source[l.offset:])
91 if width == 0 {
rudominer 2015/10/12 18:06:05 What is your thinking here? Can width=0 without c
azani 2015/10/13 00:23:45 RuneError is just a special rune. It is handled th
92 width = 1
mattr 2015/10/12 18:04:13 This struck me as strange, this means the input wa
azani 2015/10/13 00:23:45 Basically, in all cases where this branch is taken
93 }
94
95 if c == '\n' {
96 l.lineNo += 1
97 l.lineOffset = 0
98 } else {
99 l.lineOffset += width
100 }
101 l.offset += width
102 }
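
The width == 0 exchange above comes down to documented unicode/utf8 behavior: DecodeRuneInString returns (RuneError, 0) only for empty input, i.e. end-of-source here, and (RuneError, 1) for an invalid encoding; in both cases advancing by at least one byte keeps Consume from looping forever. A standalone demonstration:

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    // Empty input: the only case where width == 0.
    r, w := utf8.DecodeRuneInString("")
    fmt.Println(r == utf8.RuneError, w) // true 0

    // Invalid encoding: RuneError with width 1, so a caller that
    // advances by width still moves past the bad byte.
    r, w = utf8.DecodeRuneInString("\xff")
    fmt.Println(r == utf8.RuneError, w) // true 1

    // A valid multi-byte rune reports its actual width in bytes.
    r, w = utf8.DecodeRuneInString("é")
    fmt.Println(string(r), w) // é 2
}
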
103
104 // Peek returns the next rune in the source.
105 func (l *lexer) Peek() rune {
106 char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
rudominer 2015/10/12 18:06:05 What if char == RuneError?
azani 2015/10/13 00:23:45 We treat it like any other rune. It looks like Run
107 return char
108 }
109
110 // IsEos returns true if the whole source has been consumed, false
111 // otherwise.
112 func (l *lexer) IsEos() bool {
113 return l.offset >= len(l.source)
rudominer 2015/10/12 18:06:05 Note that this depends on l.offset being a number
azani 2015/10/13 00:23:46 Yes, since strings index based on bytes, not runes
114 }
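
As the reply above says, Go strings index and measure in bytes, not runes, so l.offset is a byte offset and comparing it against len(l.source) is consistent. A quick standalone illustration:

package main

import (
    "fmt"
    "unicode/utf8"
)

func main() {
    s := "héllo"
    fmt.Println(len(s))                    // 6: len counts bytes ('é' is 2 bytes)
    fmt.Println(utf8.RuneCountInString(s)) // 5: runes
    // Slicing by byte offsets, as the lexer does, stays correct as
    // long as every offset lands on a rune boundary.
    fmt.Println(s[:3]) // "hé"
}
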
115
116 // run is the lexer's main loop.
117 func (l *lexer) run() {
118 // This is a state machine.
rudominer 2015/10/12 18:06:06 "This" is ambiguous here. Maybe "We are implementi
azani 2015/10/13 00:23:45 Done.
119 // lexRoot is the beginning state.
120 // nil is the end state.
121 // States are functions which are called on the lexer. They return the
122 // next state.
123 for state := lexRoot; state != nil; {
124 state = state(l)
125 }
126 close(l.tokens)
127 }
128
129 // A stateFn represents a state in the lexer state machine.
130 type stateFn func(*lexer) stateFn
131
132 // This is the beginning state and also the state in between tokens.
rudominer 2015/10/12 18:06:05 Nit. In my understanding of a DFA there is no "sta
azani 2015/10/13 00:23:45 I meant returned to after most tokens are emitted.
133 func lexRoot(l *lexer) stateFn {
134 if l.offset >= len(l.source) {
rudominer 2015/10/12 18:06:05 You have a function IsEos() for this, no?
azani 2015/10/13 00:23:45 Done.
135 return nil
136 }
137
138 switch c := l.Peek(); {
139 case isSingleCharTokens(c):
140 return lexSingleCharTokens
141 case isEqualsOrResponseStart(c):
142 return lexEqualsOrResponse
143 case isNameStart(c):
144 return lexName
145 case isOrdinalStart(c):
146 return lexOrdinal
147 case isNumberStart(c):
148 return lexNumber
149 case isStringStart(c):
150 return lexString
151 case isSkippable(c):
152 return lexSkip
153 case isMaybeComment(c):
154 return lexComment
155 }
156
157 l.Consume()
158 l.emitToken(ERROR_ILLEGAL_CHAR)
159 return nil
160 }
161
162 // isSkippable determines if a rune is skippable.
163 func isSkippable(c rune) bool {
164 return c == ' ' || c == '\t' || c == '\r' || c == '\n'
165 }
166
167 // lexSkip consumes skippable runes.
168 func lexSkip(l *lexer) stateFn {
169 for isSkippable(l.Peek()) {
170 l.Consume()
171 }
172 l.startToken()
173 return lexRoot
174 }
175
176 // singleCharTokens is a map of single-rune tokens.
177 var singleCharTokens = map[rune]TokenKind{
178 '(': LPAREN,
179 ')': RPAREN,
180 '[': LBRACKET,
181 ']': RBRACKET,
182 '{': LBRACE,
183 '}': RBRACE,
184 '<': LANGLE,
185 '>': RANGLE,
186 ';': SEMI,
187 ',': COMMA,
188 '.': DOT,
189 '-': MINUS,
190 '+': PLUS,
191 '&': AMP,
192 '?': QSTN,
193 }
194
195 // isSingleCharTokens returns true if the rune is a single character token.
196 func isSingleCharTokens(c rune) bool {
197 _, ok := singleCharTokens[c]
198 return ok
199 }
200
201 // lexSingleCharTokens lexes single character tokens.
202 func lexSingleCharTokens(l *lexer) stateFn {
203 c := l.Peek()
204 l.Consume()
205 t := singleCharTokens[c]
206 l.emitToken(t)
207
208 return lexRoot
209 }
210
211 // isEqualsOrResponseStart returns true if the rune corresponds to the
212 // beginning of either the '=' or '=>' tokens.
213 func isEqualsOrResponseStart(c rune) bool {
214 return c == '='
215 }
216
217 // lexEqualsOrResponse lexes the '=' or the '=>' token.
218 func lexEqualsOrResponse(l *lexer) stateFn {
219 l.Consume()
220
221 if l.Peek() == '>' {
222 l.Consume()
223 l.emitToken(RESPONSE)
224 } else {
225 l.emitToken(EQUALS)
226 }
227
228 return lexRoot
229 }
230
231 // isAlpha returns true if the rune is a letter of the alphabet.
232 func isAlpha(c rune) bool {
233 return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
234 }
235
236 // isDigit returns true if the rune is a digit.
237 func isDigit(c rune) bool {
238 return ('0' <= c && c <= '9')
239 }
240
241 // isHexDigit returns true if the rune is a hexadecimal digit.
242 func isHexDigit(c rune) bool {
243 return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
244 }
245
246 // isNameStart returns true if the rune is the beginning of a NAME token.
247 func isNameStart(c rune) bool {
248 return isAlpha(c) || c == '_'
249 }
250
251 // keywordTokens maps keywords to their associated tokens.
252 var keywordTokens = map[string]TokenKind{
253 "import": IMPORT,
254 "module": MODULE,
255 "struct": STRUCT,
256 "union": UNION,
257 "interface": INTERFACE,
258 "enum": ENUM,
259 "const": CONST,
260 "true": TRUE,
261 "false": FALSE,
262 "default": DEFAULT,
263 }
264
265 // lexName lexes valid C identifiers. (K&R2: A.2.3)
266 func lexName(l *lexer) stateFn {
267 l.Consume()
268
269 // isNameRune returns true if the rune is valid in a NAME token.
270 isNameRune := func(c rune) bool {
271 return isAlpha(c) || isDigit(c) || c == '_'
272 }
273
274 for isNameRune(l.Peek()) {
275 l.Consume()
276 }
277
278 // Emit the appropriate keyword token if the current name is a
279 // keyword or a NAME token otherwise.
280 if token, found := keywordTokens[l.CurText()]; found {
281 l.emitToken(token)
282 } else {
283 l.emitToken(NAME)
284 }
285
286 return lexRoot
287 }
288
289 // isOrdinalStart returns true if the rune is the start of an ORDINAL
290 // token.
291 func isOrdinalStart(c rune) bool {
292 return '@' == c
293 }
294
295 // lexOrdinal lexes an ORDINAL token. Ordinals are a '@' followed by one
296 // or more digits.
297 func lexOrdinal(l *lexer) stateFn {
298 // Consume the '@'.
299 l.Consume()
300
301 for isDigit(l.Peek()) {
302 l.Consume()
303 }
304
305 l.emitToken(ORDINAL)
306
307 return lexRoot
308 }
309
310 // isNumberStart returns true if the rune is the beginning of a number.
311 func isNumberStart(c rune) bool {
312 // Even hexadecimals must start with a digit (namely 0).
313 return isDigit(c)
314 }
315
316 // lexNumber lexes a number token.
317 func lexNumber(l *lexer) stateFn {
318 // If the number starts with 0 it cannot be a decimal integer.
319 if l.Peek() == '0' {
320 return lexNumberStartWithZero
321 }
322 return lexDec
323 }
324
325 // lexDec lexes a base-10 number.
326 func lexDec(l *lexer) stateFn {
327 for isDigit(l.Peek()) {
328 l.Consume()
329 }
330
331 // If a decimal part is found, transition to the decimal state.
332 if isDecimalPartStart(l.Peek()) {
333 return lexDecimalPart
334 }
335
336 l.emitToken(INT_CONST_DEC)
337
338 return lexRoot
339 }
340
341 // lexNumberStartWithZero lexes hexadecimals, some floats or 0.
342 func lexNumberStartWithZero(l *lexer) stateFn {
343 // Consume the starting 0
344 l.Consume()
345
346 // Here we check to see whether we are in the hexadecimal or floating
347 // point case.
348 switch c := l.Peek(); {
349 case c == 'x' || c == 'X':
350 return lexHexNumber
351 case isDecimalPartStart(c):
352 return lexDecimalPart
353 }
354
355 // Found a naked 0.
356 l.emitToken(INT_CONST_DEC)
357
358 return lexRoot
359 }
360
361 // lexHexNumber lexes hexadecimal integers.
362 func lexHexNumber(l *lexer) stateFn {
363 // Consume the x or X
364 l.Consume()
365
366 for isHexDigit(l.Peek()) {
367 l.Consume()
368 }
369
370 l.emitToken(INT_CONST_HEX)
371
372 return lexRoot
373 }
374
375 // isDecimalPartStart returns true if the rune represents the start of
376 // the decimal part of a floating point number.
377 func isDecimalPartStart(c rune) bool {
378 return c == '.' || c == 'e' || c == 'E'
379 }
380
381 // lexDecimalPart lexes the decimal part of a floating point number.
382 func lexDecimalPart(l *lexer) stateFn {
383 // Consume '.' or 'e' or 'E'
384 l.Consume()
385
386 if c := l.Peek(); c == '+' || c == '-' {
387 l.Consume()
388 }
389
390 for isDigit(l.Peek()) {
391 l.Consume()
392 }
393
394 l.emitToken(FLOAT_CONST)
395
396 return lexRoot
397 }
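
To make the number-state transitions concrete, here is a test-style sketch that drives the lexer exactly the way Tokenize wires it up. It is hypothetical, written as if it sat in this package next to lexer_test.go with "fmt" imported, and it assumes TokenKind prints readably with %v:

// Expected kinds: "42" and "0" -> INT_CONST_DEC, "0x1F" -> INT_CONST_HEX,
// "3.14" and "1e-5" -> FLOAT_CONST.
func printNumberTokens() {
    for _, src := range []string{"42", "0", "0x1F", "3.14", "1e-5"} {
        tokens := make(chan Token)
        l := lexer{source: src, tokens: tokens}
        go l.run()
        for t := range tokens {
            fmt.Printf("%-6q -> %v %q\n", src, t.Kind, t.Text)
        }
    }
}
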
398
399 // isStringStart returns true if the rune represents the start of a string.
400 func isStringStart(c rune) bool {
401 return '"' == c
402 }
403
404 // lexString lexes a quoted string.
405 func lexString(l *lexer) stateFn {
406 // Consume opening quotes.
407 l.Consume()
408
409 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
410 if l.Peek() == '\\' {
411 // If we see an escape character, consume whatever follows blindly.
412 // TODO(azani): Consider parsing escape sequences.
413 l.Consume()
414 }
415 l.Consume()
416 }
417
418 if l.IsEos() || l.Peek() == '\n' {
419 l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL)
420 return nil
421 }
422
423 // Consume the closing quotes
424 l.Consume()
425
426 l.emitToken(STRING_LITERAL)
427
428 return lexRoot
429 }
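
The same driving pattern shows the two interesting string cases: because escapes are consumed blindly, \" does not terminate the literal, and the emitted Text keeps the surrounding quotes since emitToken slices source[start:offset]. Again a hypothetical sketch, not part of the CL:

func printStringTokens() {
    for _, src := range []string{`"say \"hi\""`, `"unterminated`} {
        tokens := make(chan Token)
        l := lexer{source: src, tokens: tokens}
        go l.run()
        for t := range tokens {
            // First source: STRING_LITERAL with Text `"say \"hi\""`.
            // Second source: ERROR_UNTERMINATED_STRING_LITERAL.
            fmt.Printf("%v %q\n", t.Kind, t.Text)
        }
    }
}
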
430
431 // isMaybeComment returns true if the rune may be the beginning of a
432 // comment.
433 func isMaybeComment(c rune) bool {
434 return c == '/'
435 }
436
437 // lexComment consumes a single-line or multi-line comment.
438 func lexComment(l *lexer) stateFn {
439 // Consume the '/'.
440 l.Consume()
441
442 switch l.Peek() {
443 case '/':
444 return lexSingleLineComment
445 case '*':
446 return lexMultiLineComment
447 }
448
449 l.emitToken(ERROR_ILLEGAL_CHAR)
450 return nil
451 }
452
453 // lexSingleLineComment consumes a single line comment.
454 func lexSingleLineComment(l *lexer) stateFn {
455 // Consume the '/'
456 l.Consume()
457
458 for !l.IsEos() && l.Peek() != '\n' {
459 l.Consume()
460 }
461
462 l.startToken()
463 return lexRoot
464 }
465
466 // lexMultiLineComment consumes a multi-line comment.
467 func lexMultiLineComment(l *lexer) stateFn {
468 // Consume the '*'.
469 l.Consume()
470
471 for !l.IsEos() {
472 if l.Peek() == '*' {
473 return lexPossibleEndOfComment
474 }
475 l.Consume()
476 }
477
478 l.emitToken(ERROR_UNTERMINATED_COMMENT)
479 return nil
480 }
481
482 // lexPossibleEndOfComment consumes the possible end of a multiline
483 // comment and determines whether the comment in fact ended or not.
484 func lexPossibleEndOfComment(l *lexer) stateFn {
485 // Consume the '*'
486 l.Consume()
487
488 if l.IsEos() {
489 l.emitToken(ERROR_UNTERMINATED_COMMENT)
490 return nil
491 }
492
493 if l.Peek() == '/' {
494 l.Consume()
495 l.startToken()
496 return lexRoot
497 }
498
499 return lexMultiLineComment
500 }
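
One behavior worth pinning down: both comment paths end in startToken() without ever calling emitToken(), so comments, like whitespace, leave no trace in the token stream. A final hypothetical sketch in the same style:

// Yields exactly one token, INT_CONST_DEC "42"; both comments vanish.
func printCommentBehavior() {
    tokens := make(chan Token)
    l := lexer{source: "/* doc */ 42 // trailing", tokens: tokens}
    go l.run()
    for t := range tokens {
        fmt.Printf("%v %q\n", t.Kind, t.Text)
    }
}
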