// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// To use the lexer, call Tokenize with the source string to obtain
// a TokenStream. The lexer is run concurrently, so you should be able to
// use the TokenStream before the lexer is done with the source.
//
// The lexer is implemented as a state machine. The states are represented
// by functions (the stateFn type) which accept a lexer and return the
// new state.
//
// Most states also have an isFooStart function which helps determine if
// a transition to Foo is appropriate. Those functions accept a single
// rune as a parameter and return true if the state machine should
// transition to state Foo. Some states do not have such functions on
// account of the transition condition being trivial.
//
// The lexer implementation was inspired by
// http://cuddle.googlecode.com/hg/talk/lex.html

package lexer

import (
	"unicode/utf8"
)

// Tokenize accepts a source string and parses it into a stream of tokens which
// can be read from the returned TokenStream.
func Tokenize(source string) TokenStream {
	tokens := make(chan Token)
	l := lexer{source: source, tokens: tokens}
	go l.run()
	return &TokenChan{tokenChan: tokens}
}
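
// Example usage (a sketch only: TokenStream is defined elsewhere in this
// package, and the PeekNext/ConsumeNext accessors shown below are assumed
// rather than confirmed by this file):
//
//	ts := Tokenize("interface Frobinator { Frobinate(); };")
//	token := ts.PeekNext() // inspect the next token without consuming it
//	ts.ConsumeNext()       // advance past it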

type lexer struct {
	// source is the source code to be lexed.
	source string

	// offset is the number of bytes that have been consumed.
	offset int

	// tokens is a channel to which the found tokens are emitted.
	tokens chan Token

	// sourcePos is the number of runes that have been consumed.
	sourcePos int

	// lineNo is the current line number.
	lineNo int

	// linePos is how many runes have been consumed since the beginning of the
	// line.
	linePos int

	// curTokenOffset is the number of bytes consumed prior to the beginning of
	// the current token.
	curTokenOffset int

	// curTokenSourcePos is the number of runes consumed prior to the beginning
	// of the current token.
	curTokenSourcePos int

	// curTokenLineNo is the line number on which the current token begins.
	curTokenLineNo int

	// curTokenLinePos is the number of runes since the beginning of the line
	// where the current token begins.
	curTokenLinePos int
}
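
// offset counts bytes while sourcePos counts runes, so the two diverge on
// multi-byte UTF-8 input. A small worked example (behavior derived from
// Consume below):
//
//	l := lexer{source: "é=1"} // 'é' is two bytes in UTF-8
//	l.Consume()               // offset == 2, sourcePos == 1
//	l.Consume()               // offset == 3, sourcePos == 2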

// CurText returns the consumed part of the current token.
func (l *lexer) CurText() string {
	return l.source[l.curTokenOffset:l.offset]
}

// emitToken emits the current token and begins a new token.
func (l *lexer) emitToken(tokenType TokenKind) {
	l.tokens <- Token{
		Kind:    tokenType,
		Text:    l.source[l.curTokenOffset:l.offset],
		CharPos: l.curTokenSourcePos,
		LineNo:  l.curTokenLineNo,
		LinePos: l.curTokenLinePos}
	l.beginToken()
}

// beginToken begins the new token.
func (l *lexer) beginToken() {
	l.curTokenOffset = l.offset
	l.curTokenSourcePos = l.sourcePos
	l.curTokenLineNo = l.lineNo
	l.curTokenLinePos = l.linePos
}

// Consume consumes the next rune in the source.
func (l *lexer) Consume() {
	if l.IsEos() {
		return
	}

	c, width := utf8.DecodeRuneInString(l.source[l.offset:])
	if width == 0 {
		// [Review thread anchored on the line above]
		// mattr (2015/10/14 19:29:26): I guess my question was, why increment width here
		// azani (2015/10/14 22:29:41): It turns out it's not necessary... Gone now and th
		width = 1
	}

	if c == '\n' {
		l.lineNo += 1
		l.linePos = 0
	} else {
		l.linePos += 1
	}
	l.offset += width
	l.sourcePos += 1
}
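
// How Consume tracks line positions (derived from the code above): a '\n'
// advances lineNo and resets linePos; any other rune advances linePos.
//
//	l := lexer{source: "a\nb"}
//	l.Consume() // 'a':  lineNo == 0, linePos == 1
//	l.Consume() // '\n': lineNo == 1, linePos == 0
//	l.Consume() // 'b':  lineNo == 1, linePos == 1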

// Peek returns the next rune in the source.
func (l *lexer) Peek() rune {
	// At the end of the string, there is no sane answer to Peek.
	if l.IsEos() {
		return utf8.RuneError
	}

	// If RuneError is returned, it will be handled as any other rune, likely
	// resulting in an ErrorIllegalChar token being emitted.
	char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
	return char
}

// IsEos returns true if the whole source has been consumed, false
// otherwise.
func (l *lexer) IsEos() bool {
	return l.offset >= len(l.source)
}

// run is the lexer's main loop.
func (l *lexer) run() {
	// We are implementing a state machine.
	// lexRoot is the beginning state.
	// nil is the end state.
	// States are functions which are called on the lexer. They return the
	// next state.
	for state := lexRoot; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}

// A stateFn represents a state in the lexer state machine.
type stateFn func(*lexer) stateFn

// lexRoot is the beginning state and also the state to which the machine
// returns after most tokens are emitted.
func lexRoot(l *lexer) stateFn {
	if l.IsEos() {
		return nil
	}

	switch c := l.Peek(); {
	case isSingleCharTokens(c):
		return lexSingleCharTokens
	case isEqualsOrResponseStart(c):
		return lexEqualsOrResponse
	case isNameStart(c):
		return lexName
	case isOrdinalStart(c):
		return lexOrdinal
	case isNumberStart(c):
		return lexNumber
	case isStringStart(c):
		return lexString
	case isSkippable(c):
		return lexSkip
	case isMaybeComment(c):
		return lexComment
	}

	l.Consume()
	l.emitToken(ErrorIllegalChar)
	return nil
}

// isSkippable determines if a rune is skippable.
func isSkippable(c rune) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

// lexSkip consumes skippable runes.
func lexSkip(l *lexer) stateFn {
	for isSkippable(l.Peek()) {
		l.Consume()
	}
	l.beginToken()
	return lexRoot
}

// singleCharTokens is a map of single-rune tokens.
var singleCharTokens = map[rune]TokenKind{
	'(': LParen,
	')': RParen,
	'[': LBracket,
	']': RBracket,
	'{': LBrace,
	'}': RBrace,
	'<': LAngle,
	'>': RAngle,
	';': Semi,
	',': Comma,
	'.': Dot,
	'-': Minus,
	'+': Plus,
	'&': Amp,
	'?': Qstn,
}

// isSingleCharTokens returns true if the rune is a single-character token.
func isSingleCharTokens(c rune) bool {
	_, ok := singleCharTokens[c]
	return ok
}

// lexSingleCharTokens lexes single-character tokens.
func lexSingleCharTokens(l *lexer) stateFn {
	c := l.Peek()
	l.Consume()
	t := singleCharTokens[c]
	l.emitToken(t)

	return lexRoot
}

// isEqualsOrResponseStart returns true if the rune corresponds to the
// beginning of either the '=' or '=>' tokens.
func isEqualsOrResponseStart(c rune) bool {
	return c == '='
}

// lexEqualsOrResponse lexes the '=' or the '=>' token.
func lexEqualsOrResponse(l *lexer) stateFn {
	l.Consume()

	if l.Peek() == '>' {
		l.Consume()
		l.emitToken(Response)
	} else {
		l.emitToken(Equals)
	}

	return lexRoot
}

// isAlpha returns true if the rune is a letter of the alphabet.
func isAlpha(c rune) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// isDigit returns true if the rune is a digit.
func isDigit(c rune) bool {
	return '0' <= c && c <= '9'
}

// isHexDigit returns true if the rune is a hexadecimal digit.
func isHexDigit(c rune) bool {
	return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
}

// isNameStart returns true if the rune is the beginning of a Name token.
func isNameStart(c rune) bool {
	return isAlpha(c) || c == '_'
}

// keywordTokens maps keywords to their associated tokens.
var keywordTokens = map[string]TokenKind{
	"import":    Import,
	"module":    Module,
	"struct":    Struct,
	"union":     Union,
	"interface": Interface,
	"enum":      Enum,
	"const":     Const,
	"true":      True,
	"false":     False,
	"default":   Default,
}

// lexName lexes valid C identifiers. (K&R2: A.2.3)
func lexName(l *lexer) stateFn {
	l.Consume()

	// isNameRune returns true if the rune is valid in a Name token.
	isNameRune := func(c rune) bool {
		return isAlpha(c) || isDigit(c) || c == '_'
	}

	for isNameRune(l.Peek()) {
		l.Consume()
	}

	// Emit the appropriate keyword token if the current name is a
	// keyword, or a Name token otherwise.
	if token, found := keywordTokens[l.CurText()]; found {
		l.emitToken(token)
	} else {
		l.emitToken(Name)
	}

	return lexRoot
}
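
// For example, lexName emits Interface for "interface" but Name for
// "interfaces" or "Frobinator": only an exact match of the whole
// identifier hits the keywordTokens table.
//
//	Tokenize("interface Frobinator") // emits Interface, then Name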

// isOrdinalStart returns true if the rune is the beginning of an Ordinal
// token.
func isOrdinalStart(c rune) bool {
	return '@' == c
}

// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
// or more digits.
func lexOrdinal(l *lexer) stateFn {
	// Consume the '@'.
	l.Consume()

	for isDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(Ordinal)

	return lexRoot
}

// isNumberStart returns true if the rune is the beginning of a number.
func isNumberStart(c rune) bool {
	// Even hexadecimals must begin with a digit (namely 0).
	return isDigit(c)
}

// lexNumber lexes a number token.
func lexNumber(l *lexer) stateFn {
	// If the number begins with 0, it cannot be a decimal integer.
	if l.Peek() == '0' {
		return lexNumberStartWithZero
	}
	return lexDec
}

// lexDec lexes a base-10 number.
func lexDec(l *lexer) stateFn {
	for isDigit(l.Peek()) {
		l.Consume()
	}

	// If a decimal part is found, transition to the decimal state.
	if isDecimalPartStart(l.Peek()) {
		return lexDecimalPart
	}

	l.emitToken(IntConstDec)

	return lexRoot
}

// lexNumberStartWithZero lexes hexadecimals, some floats or 0.
func lexNumberStartWithZero(l *lexer) stateFn {
	// Consume the leading 0.
	l.Consume()

	// Here we check to see whether we are in the hexadecimal or floating
	// point case.
	switch c := l.Peek(); {
	case c == 'x' || c == 'X':
		return lexHexNumber
	case isDecimalPartStart(c):
		return lexDecimalPart
	}

	// Found a naked 0.
	l.emitToken(IntConstDec)

	return lexRoot
}

// lexHexNumber lexes hexadecimal integers.
func lexHexNumber(l *lexer) stateFn {
	// Consume the x or X.
	l.Consume()

	for isHexDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(IntConstHex)

	return lexRoot
}

// isDecimalPartStart returns true if the rune represents the beginning of
// the decimal part of a floating point number.
func isDecimalPartStart(c rune) bool {
	return c == '.' || c == 'e' || c == 'E'
}

// lexDecimalPart lexes the decimal part of a floating point number.
func lexDecimalPart(l *lexer) stateFn {
	// Consume '.' or 'e' or 'E'.
	l.Consume()

	if c := l.Peek(); c == '+' || c == '-' {
		l.Consume()
	}

	for isDigit(l.Peek()) {
		l.Consume()
	}

	l.emitToken(FloatConst)

	return lexRoot
}
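
// Examples of how the number states combine (all derivable from the code
// above):
//
//	Tokenize("42")   // IntConstDec "42"
//	Tokenize("0")    // IntConstDec "0" (the naked-zero case)
//	Tokenize("0x2A") // IntConstHex "0x2A"
//	Tokenize("3.14") // FloatConst "3.14"
//	Tokenize("1e-9") // FloatConst "1e-9"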

// isStringStart returns true if the rune represents the beginning of a string.
func isStringStart(c rune) bool {
	return '"' == c
}

// lexString lexes a quoted string.
func lexString(l *lexer) stateFn {
	// Consume opening quotes.
	l.Consume()

	for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
		if l.Peek() == '\\' {
			// If we see an escape character, consume whatever follows blindly.
			// TODO(azani): Consider parsing escape sequences.
			l.Consume()
		}
		l.Consume()
	}

	if l.IsEos() || l.Peek() == '\n' {
		l.emitToken(ErrorUnterminatedStringLiteral)
		return nil
	}

	// Consume the closing quotes.
	l.Consume()

	l.emitToken(StringLiteral)

	return lexRoot
}
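
// For example (backquotes delimit Go source, so the lexed text includes
// the double quotes):
//
//	Tokenize(`"say \"hi\""`) // StringLiteral with text `"say \"hi\""`
//	Tokenize(`"abc`)         // ErrorUnterminatedStringLiteral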

// isMaybeComment returns true if the rune may be the beginning of a
// comment.
func isMaybeComment(c rune) bool {
	return c == '/'
}

// lexComment consumes a single-line or multi-line comment.
func lexComment(l *lexer) stateFn {
	// Consume the '/'.
	l.Consume()

	switch l.Peek() {
	case '/':
		return lexSingleLineComment
	case '*':
		return lexMultiLineComment
	}

	l.emitToken(ErrorIllegalChar)
	return nil
}

// lexSingleLineComment consumes a single-line comment.
func lexSingleLineComment(l *lexer) stateFn {
	// Consume the second '/'.
	l.Consume()

	for !l.IsEos() && l.Peek() != '\n' {
		l.Consume()
	}

	l.beginToken()
	return lexRoot
}

// lexMultiLineComment consumes a multi-line comment.
func lexMultiLineComment(l *lexer) stateFn {
	// Consume the '*' (or, when re-entered from lexPossibleEndOfComment,
	// the rune that followed a lone '*').
	l.Consume()

	for !l.IsEos() {
		if l.Peek() == '*' {
			return lexPossibleEndOfComment
		}
		l.Consume()
	}

	l.emitToken(ErrorUnterminatedComment)
	return nil
}

// lexPossibleEndOfComment consumes the possible end of a multi-line
// comment and determines whether the comment in fact ended or not.
func lexPossibleEndOfComment(l *lexer) stateFn {
	// Consume the '*'.
	l.Consume()

	if l.IsEos() {
		l.emitToken(ErrorUnterminatedComment)
		return nil
	}

	if l.Peek() == '/' {
		l.Consume()
		l.beginToken()
		return lexRoot
	}

	return lexMultiLineComment
}
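
// A '*' inside a comment only terminates it when followed by '/', which is
// why lexPossibleEndOfComment falls back to lexMultiLineComment. For
// example:
//
//	Tokenize("/* a * b */ x") // comment skipped; only Name "x" is emitted
//	Tokenize("/* a *")        // ErrorUnterminatedComment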