OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 // | |
5 // To use the lexer, call Tokenize with the source string to obtain
rudominer
2015/10/12 18:06:05
To use the lexer, call...
azani
2015/10/13 00:23:46
Done.
| |
6 // a TokenStream. The lexer is run concurrently so you should be able to | |
7 // use the TokenStream before the lexer is done with the source. | |
8 // | |
9 // The lexer is implemented as a state machine. The states are represented | |
10 // by functions (the stateFn type) which accept a lexer and return the | |
11 // new state. | |
12 // | |
13 // Most states also have an isFooStart function which helps determine if | |
14 // a transition to Foo is appropriate. Those functions accept a single | |
15 // rune as a parameter and return true if the state machine should | |
16 // transition to state Foo. Some states do not have such functions on | |
17 // account of the transition condition being trivial. | |
18 // | |
19 // The lexer implementation was inspired by | |
20 // http://cuddle.googlecode.com/hg/talk/lex.html | |
21 | |
22 package lexer | |
23 | |
24 import ( | |
25 "unicode/utf8" | |
26 ) | |
27 | |
28 // Tokenize accepts a source string and returns the tokens which compose | |
rudominer
2015/10/12 18:06:05
The phrasing belies the asynchronous nature. Also
azani
2015/10/13 00:23:46
Done.
| |
29 // the source string. | |
30 func Tokenize(source string) TokenStream { | |
31 tokens := make(chan Token) | |
32 l := lexer{source: source, tokens: tokens} | |
33 go l.run() | |
34 return &TokenChan{tokenChan: tokens} | |
35 } | |
36 | |
// lexer holds the state of a single tokenization run over one source
// string. It is driven by run() and emits tokens on the tokens channel.
type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes of source that have been consumed.
	// It counts bytes, not runes, so it can be used to index source.
	offset int

	// lineNo is the line number of the rune about to be consumed. It
	// starts at 0 and is incremented each time a '\n' is consumed.
	lineNo int

	// lineOffset is how many bytes have been consumed since the
	// beginning of the current line.
	lineOffset int

	// start is the byte index in source where the current token begins.
	start int

	// lineStart is the line number on which the current token begins.
	lineStart int

	// lineOffsetStart is the number of bytes between the beginning of
	// the line and the start of the current token (the token's column).
	lineOffsetStart int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token
}
64 | |
// CurText returns the text of the current (not yet emitted) token: the
// source consumed since the last call to startToken or emitToken.
func (l *lexer) CurText() string {
	return l.source[l.start:l.offset]
}
69 | |
// emitToken emits the current token (everything consumed since the last
// token boundary) on the tokens channel and begins a new token at the
// current position. tokenType is the kind to stamp on the emitted token.
func (l *lexer) emitToken(tokenType TokenKind) {
	l.tokens <- Token{
		Kind:    tokenType,
		Text:    l.source[l.start:l.offset],
		CharPos: l.start,
		LineNo:  l.lineStart,
		LinePos: l.lineOffsetStart}
	l.startToken()
}
80 | |
// startToken starts a new token: the next token will begin at the
// current offset, line number and line offset.
func (l *lexer) startToken() {
	l.start = l.offset
	l.lineStart = l.lineNo
	l.lineOffsetStart = l.lineOffset
}
87 | |
88 // Consume consumes the next rune in the source. | |
89 func (l *lexer) Consume() { | |
90 c, width := utf8.DecodeRuneInString(l.source[l.offset:]) | |
91 if width == 0 { | |
rudominer
2015/10/12 18:06:05
What is your thinking here? Can width=0 without c
azani
2015/10/13 00:23:45
RuneError is just a special rune. It is handled th
| |
92 width = 1 | |
mattr
2015/10/12 18:04:13
This struck me as strange, this means the input wa
azani
2015/10/13 00:23:45
Basically, in all cases where this branch is taken
| |
93 } | |
94 | |
95 if c == '\n' { | |
96 l.lineNo += 1 | |
97 l.lineOffset = 0 | |
98 } else { | |
99 l.lineOffset += width | |
100 } | |
101 l.offset += width | |
102 } | |
103 | |
104 // Peek returns the next rune in the source. | |
105 func (l *lexer) Peek() rune { | |
106 char, _ := utf8.DecodeRuneInString(l.source[l.offset:]) | |
rudominer
2015/10/12 18:06:05
What if char == RuneError?
azani
2015/10/13 00:23:45
We treat it like any other rune. It looks like Run
| |
107 return char | |
108 } | |
109 | |
// IsEos returns true if the whole source has been consumed, false
// otherwise. offset counts bytes, so comparing against len(source)
// (also bytes) is correct.
func (l *lexer) IsEos() bool {
	return l.offset >= len(l.source)
}
115 | |
116 // run is the lexer's main loop. | |
117 func (l *lexer) run() { | |
118 // This is a state machine. | |
rudominer
2015/10/12 18:06:06
"This" is ambiguous here. Maybe "We are implementi
azani
2015/10/13 00:23:45
Done.
| |
119 // lexRoot is the beginning state. | |
120 // nil is the end state. | |
121 // States are functions which are called on the lexer. They return the | |
122 // next state. | |
123 for state := lexRoot; state != nil; { | |
124 state = state(l) | |
125 } | |
126 close(l.tokens) | |
127 } | |
128 | |
// A stateFn represents a state in the lexer state machine. It consumes
// input from the lexer and returns the next state, or nil to halt.
type stateFn func(*lexer) stateFn
131 | |
132 // This is the beginning state and also the state in between tokens. | |
rudominer
2015/10/12 18:06:05
Nit. In my understanding of a DFA there is no "sta
azani
2015/10/13 00:23:45
I meant returned to after most tokens are emitted.
| |
133 func lexRoot(l *lexer) stateFn { | |
134 if l.offset >= len(l.source) { | |
rudominer
2015/10/12 18:06:05
You have a function IsEos() for this, no?
azani
2015/10/13 00:23:45
Done.
| |
135 return nil | |
136 } | |
137 | |
138 switch c := l.Peek(); { | |
139 case isSingleCharTokens(c): | |
140 return lexSingleCharTokens | |
141 case isEqualsOrResponseStart(c): | |
142 return lexEqualsOrResponse | |
143 case isNameStart(c): | |
144 return lexName | |
145 case isOrdinalStart(c): | |
146 return lexOrdinal | |
147 case isNumberStart(c): | |
148 return lexNumber | |
149 case isStringStart(c): | |
150 return lexString | |
151 case isSkippable(c): | |
152 return lexSkip | |
153 case isMaybeComment(c): | |
154 return lexComment | |
155 } | |
156 | |
157 l.Consume() | |
158 l.emitToken(ERROR_ILLEGAL_CHAR) | |
159 return nil | |
160 } | |
161 | |
// isSkippable returns true if the rune is whitespace that separates
// tokens and is otherwise ignored by the lexer.
func isSkippable(c rune) bool {
	switch c {
	case ' ', '\t', '\r', '\n':
		return true
	default:
		return false
	}
}
166 | |
167 // lexSkip consumes skippable runes. | |
168 func lexSkip(l *lexer) stateFn { | |
169 for isSkippable(l.Peek()) { | |
170 l.Consume() | |
171 } | |
172 l.startToken() | |
173 return lexRoot | |
174 } | |
175 | |
// singleCharTokens maps each rune that is a complete token by itself to
// its token kind.
var singleCharTokens = map[rune]TokenKind{
	'(': LPAREN,
	')': RPAREN,
	'[': LBRACKET,
	']': RBRACKET,
	'{': LBRACE,
	'}': RBRACE,
	'<': LANGLE,
	'>': RANGLE,
	';': SEMI,
	',': COMMA,
	'.': DOT,
	'-': MINUS,
	'+': PLUS,
	'&': AMP,
	'?': QSTN,
}
194 | |
195 // isSingleCharTokens returns true if the rune is a single character token. | |
196 func isSingleCharTokens(c rune) bool { | |
197 _, ok := singleCharTokens[c] | |
198 return ok | |
199 } | |
200 | |
201 // lexSingleCharTokens lexes single character tokens. | |
202 func lexSingleCharTokens(l *lexer) stateFn { | |
203 c := l.Peek() | |
204 l.Consume() | |
205 t, _ := singleCharTokens[c] | |
206 l.emitToken(t) | |
207 | |
208 return lexRoot | |
209 } | |
210 | |
// isEqualsOrResponseStart returns true if the rune corresponds to the
// beginning of either the '=' or '=>' tokens.
func isEqualsOrResponseStart(c rune) bool {
	return '=' == c
}
216 | |
217 // lexEqualsOrResponse lexes the '=' or the '=>' token. | |
218 func lexEqualsOrResponse(l *lexer) stateFn { | |
219 l.Consume() | |
220 | |
221 if l.Peek() == '>' { | |
222 l.Consume() | |
223 l.emitToken(RESPONSE) | |
224 } else { | |
225 l.emitToken(EQUALS) | |
226 } | |
227 | |
228 return lexRoot | |
229 } | |
230 | |
// isAlpha returns true if the rune is an ASCII letter.
func isAlpha(c rune) bool {
	switch {
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	default:
		return false
	}
}
235 | |
// isDigit returns true if the rune is an ASCII decimal digit.
func isDigit(c rune) bool {
	return c >= '0' && c <= '9'
}
240 | |
// isHexDigit returns true if the rune is a hexadecimal digit
// (0-9, a-f or A-F).
func isHexDigit(c rune) bool {
	switch {
	case c >= '0' && c <= '9':
		return true
	case c >= 'a' && c <= 'f':
		return true
	case c >= 'A' && c <= 'F':
		return true
	default:
		return false
	}
}
245 | |
// isNameStart returns true if the rune can begin a NAME token: an ASCII
// letter or an underscore.
func isNameStart(c rune) bool {
	if c == '_' {
		return true
	}
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}
250 | |
// keywordTokens maps keywords to their associated token kinds. Names
// not in this map are lexed as NAME tokens.
var keywordTokens = map[string]TokenKind{
	"import":    IMPORT,
	"module":    MODULE,
	"struct":    STRUCT,
	"union":     UNION,
	"interface": INTERFACE,
	"enum":      ENUM,
	"const":     CONST,
	"true":      TRUE,
	"false":     FALSE,
	"default":   DEFAULT,
}
264 | |
265 // lexName lexes valid C identifiers. (K&R2: A.2.3) | |
266 func lexName(l *lexer) stateFn { | |
267 l.Consume() | |
268 | |
269 // isNameRune returns true if the rune is valid in a NAME token. | |
270 isNameRune := func(c rune) bool { | |
271 return isAlpha(c) || isDigit(c) || c == '_' | |
272 } | |
273 | |
274 for isNameRune(l.Peek()) { | |
275 l.Consume() | |
276 } | |
277 | |
278 // Emit the appropriate keyword token if the current name is a | |
279 // keyword or a NAME token otherwise. | |
280 if token, found := keywordTokens[l.CurText()]; found { | |
281 l.emitToken(token) | |
282 } else { | |
283 l.emitToken(NAME) | |
284 } | |
285 | |
286 return lexRoot | |
287 } | |
288 | |
// isOrdinalStart returns true if the rune begins an ORDINAL token.
func isOrdinalStart(c rune) bool {
	return c == '@'
}
294 | |
295 // lexOrdinal lexes an ORDINAL token. Ordinals are a '@' followed by one | |
296 // or more digits. | |
297 func lexOrdinal(l *lexer) stateFn { | |
298 // Consume the '@'. | |
299 l.Consume() | |
300 | |
301 for isDigit(l.Peek()) { | |
302 l.Consume() | |
303 } | |
304 | |
305 l.emitToken(ORDINAL) | |
306 | |
307 return lexRoot | |
308 } | |
309 | |
// isNumberStart returns true if the rune can begin a number literal.
// Even hexadecimal literals start with a digit (namely 0).
func isNumberStart(c rune) bool {
	return c >= '0' && c <= '9'
}
315 | |
316 // lexNumber lexes a number token. | |
317 func lexNumber(l *lexer) stateFn { | |
318 // If the number starts with 0 it cannot be a decimal integer. | |
319 if l.Peek() == '0' { | |
320 return lexNumberStartWithZero | |
321 } | |
322 return lexDec | |
323 } | |
324 | |
325 // lexDec lexes a base-10 number. | |
326 func lexDec(l *lexer) stateFn { | |
327 for isDigit(l.Peek()) { | |
328 l.Consume() | |
329 } | |
330 | |
331 // If a decimal part is found, transition to the decimal state. | |
332 if isDecimalPartStart(l.Peek()) { | |
333 return lexDecimalPart | |
334 } | |
335 | |
336 l.emitToken(INT_CONST_DEC) | |
337 | |
338 return lexRoot | |
339 } | |
340 | |
341 // lexNumberStartWithZero lexes hexadecimals, some floats or 0. | |
342 func lexNumberStartWithZero(l *lexer) stateFn { | |
343 // Consume the starting 0 | |
344 l.Consume() | |
345 | |
346 // Here we check to see whether we are in the hexadecimal or floating | |
347 // point case. | |
348 switch c := l.Peek(); { | |
349 case c == 'x' || c == 'X': | |
350 return lexHexNumber | |
351 case isDecimalPartStart(c): | |
352 return lexDecimalPart | |
353 } | |
354 | |
355 // Found a naked 0. | |
356 l.emitToken(INT_CONST_DEC) | |
357 | |
358 return lexRoot | |
359 } | |
360 | |
361 // lexHexNumber lexes hexadecimal integers. | |
362 func lexHexNumber(l *lexer) stateFn { | |
363 // Consume the x or X | |
364 l.Consume() | |
365 | |
366 for isHexDigit(l.Peek()) { | |
367 l.Consume() | |
368 } | |
369 | |
370 l.emitToken(INT_CONST_HEX) | |
371 | |
372 return lexRoot | |
373 } | |
374 | |
// isDecimalPartStart returns true if the rune begins the fractional
// part or exponent of a floating-point literal.
func isDecimalPartStart(c rune) bool {
	switch c {
	case '.', 'e', 'E':
		return true
	default:
		return false
	}
}
380 | |
381 // lexDecimalPart lexes the decimal part of a floating point number. | |
382 func lexDecimalPart(l *lexer) stateFn { | |
383 // Consume '.' or 'e' or 'E' | |
384 l.Consume() | |
385 | |
386 if c := l.Peek(); c == '+' || c == '-' { | |
387 l.Consume() | |
388 } | |
389 | |
390 for isDigit(l.Peek()) { | |
391 l.Consume() | |
392 } | |
393 | |
394 l.emitToken(FLOAT_CONST) | |
395 | |
396 return lexRoot | |
397 } | |
398 | |
// isStringStart returns true if the rune begins a string literal.
func isStringStart(c rune) bool {
	return c == '"'
}
403 | |
404 // lexString lexes a quoted string. | |
405 func lexString(l *lexer) stateFn { | |
406 // Consume opening quotes. | |
407 l.Consume() | |
408 | |
409 for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' { | |
410 if l.Peek() == '\\' { | |
411 // If we see an escape character consume whatever follow s blindly. | |
412 // TODO(azani): Consider parsing escape sequences. | |
413 l.Consume() | |
414 } | |
415 l.Consume() | |
416 } | |
417 | |
418 if l.IsEos() || l.Peek() == '\n' { | |
419 l.emitToken(ERROR_UNTERMINATED_STRING_LITERAL) | |
420 return nil | |
421 } | |
422 | |
423 // Consume the closing quotes | |
424 l.Consume() | |
425 | |
426 l.emitToken(STRING_LITERAL) | |
427 | |
428 return lexRoot | |
429 } | |
430 | |
// isMaybeComment returns true if the rune may begin a comment ('//' or
// '/*'); a lone '/' is an error, decided later by lexComment.
func isMaybeComment(c rune) bool {
	return '/' == c
}
436 | |
437 // lexComment consumes a single-line or multi-line comment. | |
438 func lexComment(l *lexer) stateFn { | |
439 // Consume the '/'. | |
440 l.Consume() | |
441 | |
442 switch l.Peek() { | |
443 case '/': | |
444 return lexSingleLineComment | |
445 case '*': | |
446 return lexMultiLineComment | |
447 } | |
448 | |
449 l.emitToken(ERROR_ILLEGAL_CHAR) | |
450 return nil | |
451 } | |
452 | |
453 // lexSingleLineComment consumes a single line comment. | |
454 func lexSingleLineComment(l *lexer) stateFn { | |
455 // Consume the '/' | |
456 l.Consume() | |
457 | |
458 for !l.IsEos() && l.Peek() != '\n' { | |
459 l.Consume() | |
460 } | |
461 | |
462 l.startToken() | |
463 return lexRoot | |
464 } | |
465 | |
466 // lexMultiLineComment consumes a multi-line comment. | |
467 func lexMultiLineComment(l *lexer) stateFn { | |
468 // Consume the '*'. | |
469 l.Consume() | |
470 | |
471 for !l.IsEos() { | |
472 if l.Peek() == '*' { | |
473 return lexPossibleEndOfComment | |
474 } | |
475 l.Consume() | |
476 } | |
477 | |
478 l.emitToken(ERROR_UNTERMINATED_COMMENT) | |
479 return nil | |
480 } | |
481 | |
482 // lexPossibleEndOfComment consumes the possible end of a multiline | |
483 // comment and determines whether the comment in fact ended or not. | |
484 func lexPossibleEndOfComment(l *lexer) stateFn { | |
485 // Consume the '*' | |
486 l.Consume() | |
487 | |
488 if l.IsEos() { | |
489 l.emitToken(ERROR_UNTERMINATED_COMMENT) | |
490 return nil | |
491 } | |
492 | |
493 if l.Peek() == '/' { | |
494 l.Consume() | |
495 l.startToken() | |
496 return lexRoot | |
497 } | |
498 | |
499 return lexMultiLineComment | |
500 } | |
OLD | NEW |