OLD | NEW |
| (Empty) |
1 # ----------------------------------------------------------------------------- | |
2 # ply: lex.py | |
3 # | |
4 # Copyright (C) 2001-2011, | |
5 # David M. Beazley (Dabeaz LLC) | |
6 # All rights reserved. | |
7 # | |
8 # Redistribution and use in source and binary forms, with or without | |
9 # modification, are permitted provided that the following conditions are | |
10 # met: | |
11 # | |
12 # * Redistributions of source code must retain the above copyright notice, | |
13 # this list of conditions and the following disclaimer. | |
14 # * Redistributions in binary form must reproduce the above copyright notice, | |
15 # this list of conditions and the following disclaimer in the documentation | |
16 # and/or other materials provided with the distribution. | |
17 # * Neither the name of the David Beazley or Dabeaz LLC may be used to | |
18 # endorse or promote products derived from this software without | |
19 # specific prior written permission. | |
20 # | |
21 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
24 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
25 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
26 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
27 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
28 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
29 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
30 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
31 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
32 # ----------------------------------------------------------------------------- | |
33 | |
34 __version__ = "3.4" | |
35 __tabversion__ = "3.2" # Version of table file used | |
36 | |
37 import re, sys, types, copy, os | |
38 | |
39 # This tuple contains known string types | |
# Tuple of the string types known to the running interpreter.  The
# ``types`` module only exposes StringType/UnicodeType on Python 2; when
# they are missing we fall back to the Python 3 text and bytes types.
if hasattr(types, "StringType") and hasattr(types, "UnicodeType"):
    # Python 2.x
    StringTypes = (types.StringType, types.UnicodeType)
else:
    # Python 3.x
    StringTypes = (str, bytes)
46 | |
47 # Extract the code attribute of a function. Different implementations | |
48 # are for Python 2/3 compatibility. | |
49 | |
if sys.version_info[0] >= 3:
    def func_code(f):
        """Return the code object of function or method *f* (Python 3 spelling)."""
        return f.__code__
else:
    def func_code(f):
        """Return the code object of function or method *f* (Python 2 spelling)."""
        return f.func_code
56 | |
57 # This regular expression is used to match valid token names | |
58 _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') | |
59 | |
60 # Exception thrown when invalid token encountered and no default error | |
61 # handler is defined. | |
62 | |
class LexError(Exception):
    """Raised when the lexer cannot continue scanning.

    ``args`` holds only the message; ``text`` holds the second constructor
    argument (the lexer passes the remaining, unscanned input).
    """

    def __init__(self, message, s):
        self.text = s
        self.args = (message,)
67 | |
68 # Token class. This class is used to represent the tokens produced. | |
class LexToken(object):
    """A token produced by the lexer.

    Instances carry no constructor; the ``type``, ``value``, ``lineno``
    and ``lexpos`` attributes are assigned by whoever creates the token.
    """

    def __str__(self):
        fields = (self.type, self.value, self.lineno, self.lexpos)
        return "LexToken(%s,%r,%d,%d)" % fields

    def __repr__(self):
        # Delegate to __str__ so both renderings always agree.
        return str(self)
74 | |
75 # This object is a stand-in for a logging object created by the | |
76 # logging module. | |
77 | |
class PlyLogger(object):
    """Minimal stand-in for a ``logging`` logger that writes to a file object.

    Each method %-formats ``msg`` with the positional ``args`` and writes
    one newline-terminated line to ``f``.  Keyword arguments are accepted
    and ignored.
    """

    def __init__(self, f):
        self.f = f

    def _emit(self, prefix, msg, args):
        # Single place where a formatted line is assembled and written.
        self.f.write(prefix + (msg % args) + "\n")

    def critical(self, msg, *args, **kwargs):
        self._emit("", msg, args)

    def warning(self, msg, *args, **kwargs):
        self._emit("WARNING: ", msg, args)

    def error(self, msg, *args, **kwargs):
        self._emit("ERROR: ", msg, args)

    # info() and debug() produce the same unprefixed output as critical().
    info = critical
    debug = critical
92 | |
93 # Null logger is used when no output is generated. Does nothing. | |
class NullLogger(object):
    """Logger that silently discards everything.

    Every attribute lookup and every call yields the logger itself, so any
    chain such as ``log.warning(...)`` is a no-op.
    """

    def __call__(self, *args, **kwargs):
        return self

    def __getattribute__(self, name):
        return self
99 | |
100 # ----------------------------------------------------------------------------- | |
101 # === Lexing Engine === | |
102 # | |
103 # The following Lexer class implements the lexer runtime. There are only | |
104 # a few public methods and attributes: | |
105 # | |
106 # input() - Store a new string in the lexer | |
107 # token() - Get the next token | |
108 # clone() - Clone the lexer | |
109 # | |
110 # lineno - Current line number | |
111 # lexpos - Current position in the input string | |
112 # ----------------------------------------------------------------------------- | |
113 | |
class Lexer:
    """Lexer runtime.

    Public interface:

        input()         - Store a new string in the lexer
        token()         - Get the next token (None at end of input)
        clone()         - Copy the lexer, optionally rebinding rules to an object
        begin(), push_state(), pop_state(), current_state() - state handling
        skip()          - Skip ahead n characters

        lineno          - Current line number
        lexpos          - Current position in the input string

    The lexer is also an iterator over its tokens.
    """

    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self, object=None):
        """Return a shallow copy of this lexer.

        If ``object`` is supplied the copy is attached to it: every rule
        function and error function is looked up again by name on
        ``object`` so that the copy calls that object's methods.
        """
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.
        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        # Entries without a function (None or (None, type))
                        # carry nothing to rebind.
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                # A state may have no error function (stored as None);
                # there is no name to look up in that case.
                if ef:
                    c.lexstateerrorf[key] = getattr(object, ef.__name__)
                else:
                    c.lexstateerrorf[key] = ef
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, tabfile, outputdir=""):
        """Write the lexer tables to ``<outputdir>/<last component of tabfile>.py``.

        Does nothing when ``tabfile`` is a module object.  Functions are
        stored by name so that readtab() can resolve them again.
        """
        if isinstance(tabfile, types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir, basetabfilename) + ".py"

        # Build everything that can fail *before* touching the file so a
        # failure cannot leave a truncated table behind.
        tabre = {}
        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],
                              _funcs_to_names(lre[i][1], self.lexstaterenames[key][i])))
            tabre[key] = titem

        taberr = {}
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None

        tf = open(filename, "w")
        try:
            tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile, __version__))
            tf.write("_tabversion = %s\n" % repr(__version__))
            tf.write("_lextokens = %s\n" % repr(self.lextokens))
            tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
            tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
            tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))
            tf.write("_lexstatere = %s\n" % repr(tabre))
            tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))
            tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        finally:
            # Always release the handle, even if a write fails.
            tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        """Load lexer tables from ``tabfile`` (a module or a module name).

        ``fdict`` maps rule names to the functions they refer to.  Raises
        ImportError if the table was written by a different PLY version.
        """
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                # On Python 3 exec() cannot bind a function local, so the
                # import is run in an explicit namespace.
                env = {}
                exec("import %s as lextab" % tabfile, env, env)
                lextab = env['lextab']

        if getattr(lextab, "_tabversion", "0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for key, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0], lextab._lexreflags | re.VERBOSE),
                              _names_to_funcs(lre[i][1], fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = {}
        for key, ef in lextab._lexstateerrorf.items():
            # writetab() stores None for a state without an error function;
            # keep it as None instead of looking None up in fdict.
            if ef:
                self.lexstateerrorf[key] = fdict[ef]
            else:
                self.lexstateerrorf[key] = None
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        """Store ``s`` as the text to scan and rewind to position 0.

        Raises ValueError if ``s`` does not look like a string.
        """
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        """Switch to lexing state ``state``; ValueError if it is unknown."""
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        """Remember the current state on the stack, then begin ``state``."""
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        """Return to the most recently pushed state."""
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        """Return the name of the current lexing state."""
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        """Advance the scan position by ``n`` characters."""
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        """Return the next token, or None when the input is exhausted.

        Raises LexError on an illegal character that no error rule
        consumes, or (outside optimized mode) when a rule returns a token
        whose type is not in ``lextokens``.  Raises RuntimeError if no
        input was supplied.
        """
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos         # This is here in case user has updated lexpos.
                    lexignore = self.lexignore   # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # The for loop finished without a break: no regex matched.
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        """Return the next token, raising StopIteration at end of input."""
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
411 | |
412 # ----------------------------------------------------------------------------- | |
413 # ==== Lex Builder === | |
414 # | |
415 # The functions and classes below are used to collect lexing information | |
416 # and build a Lexer object from it. | |
417 # ----------------------------------------------------------------------------- | |
418 | |
419 # ----------------------------------------------------------------------------- | |
420 # get_caller_module_dict() | |
421 # | |
422 # This function returns a dictionary containing all of the symbols defined withi
n | |
423 # a caller further down the call stack. This is used to get the environment | |
424 # associated with the lex() call if none was provided. | |
425 # ----------------------------------------------------------------------------- | |
426 | |
def get_caller_module_dict(levels):
    """Return a dict of the symbols visible in a calling frame.

    ``levels`` is how many frames to walk back from this function's own
    frame.  The result is a copy of that frame's globals, overlaid with
    its locals when the two differ.
    """
    # Raising and catching an exception is a way to obtain the current
    # frame object through the traceback.
    try:
        raise RuntimeError
    except RuntimeError:
        frame = sys.exc_info()[2].tb_frame

    remaining = levels
    while remaining > 0:
        frame = frame.f_back
        remaining -= 1

    symbols = frame.f_globals.copy()
    if frame.f_globals != frame.f_locals:
        symbols.update(frame.f_locals)
    return symbols
441 | |
442 # ----------------------------------------------------------------------------- | |
443 # _funcs_to_names() | |
444 # | |
445 # Given a list of regular expression functions, this converts it to a list | |
446 # suitable for output to a table file | |
447 # ----------------------------------------------------------------------------- | |
448 | |
449 def _funcs_to_names(funclist,namelist): | |
450 result = [] | |
451 for f,name in zip(funclist,namelist): | |
452 if f and f[0]: | |
453 result.append((name, f[1])) | |
454 else: | |
455 result.append(f) | |
456 return result | |
457 | |
458 # ----------------------------------------------------------------------------- | |
459 # _names_to_funcs() | |
460 # | |
461 # Given a list of regular expression function names, this converts it back to | |
462 # functions. | |
463 # ----------------------------------------------------------------------------- | |
464 | |
465 def _names_to_funcs(namelist,fdict): | |
466 result = [] | |
467 for n in namelist: | |
468 if n and n[0]: | |
469 result.append((fdict[n[0]],n[1])) | |
470 else: | |
471 result.append(n) | |
472 return result | |
473 | |
474 # ----------------------------------------------------------------------------- | |
475 # _form_master_re() | |
476 # | |
477 # This function takes a list of all of the regex components and attempts to | |
478 # form the master regular expression. Given limitations in the Python re | |
479 # module, it may be necessary to break the master regex into separate expression
s. | |
480 # ----------------------------------------------------------------------------- | |
481 | |
482 def _form_master_re(relist,reflags,ldict,toknames): | |
483 if not relist: return [] | |
484 regex = "|".join(relist) | |
485 try: | |
486 lexre = re.compile(regex,re.VERBOSE | reflags) | |
487 | |
488 # Build the index to function map for the matching engine | |
489 lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) | |
490 lexindexnames = lexindexfunc[:] | |
491 | |
492 for f,i in lexre.groupindex.items(): | |
493 handle = ldict.get(f,None) | |
494 if type(handle) in (types.FunctionType, types.MethodType): | |
495 lexindexfunc[i] = (handle,toknames[f]) | |
496 lexindexnames[i] = f | |
497 elif handle is not None: | |
498 lexindexnames[i] = f | |
499 if f.find("ignore_") > 0: | |
500 lexindexfunc[i] = (None,None) | |
501 else: | |
502 lexindexfunc[i] = (None, toknames[f]) | |
503 | |
504 return [(lexre,lexindexfunc)],[regex],[lexindexnames] | |
505 except Exception: | |
506 m = int(len(relist)/2) | |
507 if m == 0: m = 1 | |
508 llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames) | |
509 rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames) | |
510 return llist+rlist, lre+rre, lnames+rnames | |
511 | |
512 # ----------------------------------------------------------------------------- | |
513 # def _statetoken(s,names) | |
514 # | |
515 # Given a declaration name s of the form "t_" and a dictionary whose keys are | |
516 # state names, this function returns a tuple (states,tokenname) where states | |
517 # is a tuple of state names and tokenname is the name of the token. For example
, | |
518 # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') | |
519 # ----------------------------------------------------------------------------- | |
520 | |
521 def _statetoken(s,names): | |
522 nonstate = 1 | |
523 parts = s.split("_") | |
524 for i in range(1,len(parts)): | |
525 if not parts[i] in names and parts[i] != 'ANY': break | |
526 if i > 1: | |
527 states = tuple(parts[1:i]) | |
528 else: | |
529 states = ('INITIAL',) | |
530 | |
531 if 'ANY' in states: | |
532 states = tuple(names) | |
533 | |
534 tokenname = "_".join(parts[i:]) | |
535 return (states,tokenname) | |
536 | |
537 | |
538 # ----------------------------------------------------------------------------- | |
539 # LexerReflect() | |
540 # | |
541 # This class represents information needed to build a lexer as extracted from a | |
542 # user's input file. | |
543 # ----------------------------------------------------------------------------- | |
class LexerReflect(object):
    """Collects and validates the lexer specification found in a dictionary.

    ``ldict`` is the dictionary of user symbols (tokens, literals, states
    and t_ rules).  Problems are reported through ``log`` (a PlyLogger on
    stderr by default) and recorded by setting ``self.error`` to 1.
    """

    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.files = {}
        self.error = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        """Run every validation step and return the error flag (0 or 1)."""
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        """Read ``tokens`` from the dictionary into ``self.tokens``.

        Distinguishes three failures: no ``tokens`` symbol at all, a value
        that is not a list or tuple, and an empty list or tuple.
        """
        tokens = self.ldict.get("tokens", None)
        # Only a missing definition counts as "not defined"; an empty or
        # wrongly typed value gets its own, more specific message below.
        if tokens is None:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        """Report malformed token names (error) and duplicates (warning)."""
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals", "")

    # Validate literals
    def validate_literals(self):
        """Check that ``literals`` is an iterable of single characters."""
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
        except TypeError:
            # self.literals was not iterable at all.
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1

    def get_states(self):
        """Read ``states`` and add each valid (name, type) pair to ``self.stateinfo``."""
        self.states = self.ldict.get("states", None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        """Sort every ``t_`` symbol into function, string, ignore and error rules."""
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings

        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym = {}         # Symbols defined as functions
        self.strsym = {}          # Symbols defined as strings
        self.ignore = {}          # Ignore strings by state
        self.errorf = {}          # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, "__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    filename = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", filename, line, t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x, y: cmp(func_code(x[1]).co_firstlineno, func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length (longest first)
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        """Check argument counts and regular expressions of all collected rules.

        Also runs validate_file() on every source file a rule function or
        error function was defined in.
        """
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                filename = func_code(f).co_filename
                self.files[filename] = 1

                # Bound methods receive self in addition to the token.
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", filename, line, f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", filename, line, f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", filename, line, f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", filename, line, f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", filename, line, f.__name__, e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", filename, line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | self.reflags)
                    if (c.match("")):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                filename = func_code(f).co_filename
                self.files[filename] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", filename, line, f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", filename, line, f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self, filename):
        """Report ``t_`` names defined more than once in ``filename``.

        Files without a ``.py`` extension and files that cannot be opened
        are silently accepted.
        """
        import os.path
        base, ext = os.path.splitext(filename)
        if ext != '.py':
            return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            try:
                lines = f.readlines()
            finally:
                # Close the handle even when reading fails.
                f.close()
        except IOError:
            return         # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}     # rule name -> line number of its first definition
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev)
                    self.error = 1
            linen += 1
859 | |
860 # ----------------------------------------------------------------------------- | |
861 # lex(module) | |
862 # | |
863 # Build all of the regular expression rules from definitions in the supplied mod
ule | |
864 # ----------------------------------------------------------------------------- | |
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    """Build and return a lexer from a set of rule definitions.

    Rule definitions are collected from ``object`` if it is truthy,
    otherwise from ``module``, otherwise from the namespace obtained via
    ``get_caller_module_dict(2)``.  The resulting lexer is also published
    through the module-level globals ``lexer``, ``token`` and ``input``.

    Parameters:
        module    -- object whose attributes (as listed by ``dir()``) hold
                     the rule definitions.
        object    -- when truthy, used in place of ``module``.
        debug     -- when truthy, tokens, literals, states, individual rules
                     and the master regular expressions are written to
                     ``debuglog``.
        optimize  -- when truthy, ``validate_all()`` is skipped, a previously
                     written table is loaded through ``readtab()`` if it can
                     be imported, and otherwise the freshly built table is
                     written with ``writetab()``.
        lextab    -- table name handed to ``readtab()`` / ``writetab()``.
        reflags   -- flags forwarded to ``LexerReflect`` and
                     ``_form_master_re`` and stored on the lexer.
        nowarn    -- accepted for interface compatibility; not consulted in
                     this function.
        outputdir -- directory argument handed to ``writetab()``.
        debuglog  -- logger for debug output; defaults to
                     ``PlyLogger(sys.stderr)`` when ``debug`` is truthy.
        errorlog  -- logger for errors and warnings; defaults to
                     ``PlyLogger(sys.stderr)``.

    Returns:
        The configured ``Lexer`` instance.

    Raises:
        SyntaxError -- if ``optimize`` is false and ``validate_all()``
                       reports a problem.
    """
    global lexer
    global token,input

    lexobj = Lexer()
    lexobj.lexoptimize = optimize

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        # NOTE(review): the depth argument presumably counts stack frames
        # relative to this call, so this lookup must stay inline in lex()
        # and not be moved into a helper -- confirm against
        # get_caller_module_dict().
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            # No usable table; fall through and build it from scratch.
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens   = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states   = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification.  A list/tuple is joined into a single
    # string of the same string type as its first element; an empty
    # sequence has no first element and simply means "no literals".
    literals = linfo.literals
    if isinstance(literals,(list,tuple)):
        if literals:
            lexobj.lexliterals = type(literals[0])().join(literals)
        else:
            lexobj.lexliterals = ""
    else:
        lexobj.lexliterals = literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Collect the named-group patterns for every state
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules.  The lexer holds
    # references to linfo.errorf / linfo.ignore (assigned above), so the
    # defaults filled in here for inclusive states are visible to it too.
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj
1007 | |
1008 # ----------------------------------------------------------------------------- | |
1009 # runmain() | |
1010 # | |
1011 # This runs the lexer as a main program | |
1012 # ----------------------------------------------------------------------------- | |
1013 | |
def runmain(lexer=None,data=None):
    """Tokenize some input and print one line per token to standard output.

    Parameters:
        lexer -- object providing ``input(data)`` and ``token()``.  When
                 falsy, the module-level ``input`` and ``token`` names are
                 used instead.
        data  -- text to tokenize.  When falsy, the text is read from the
                 file named by ``sys.argv[1]``, or from standard input if no
                 command-line argument was given.

    Each token is written as ``(type,value,lineno,lexpos)``; the loop stops
    at the first falsy value returned by ``token()``.
    """
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            # Close the handle even if reading fails.
            try:
                data = f.read()
            finally:
                f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
        _token = lexer.token
    else:
        _input = input
        _token = token

    _input(data)

    while True:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))
1039 | |
1040 # ----------------------------------------------------------------------------- | |
1041 # @TOKEN(regex) | |
1042 # | |
1043 # This decorator function can be used to set the regex expression on a function | |
1044 # when its docstring might need to be set in an alternative way | |
1045 # ----------------------------------------------------------------------------- | |
1046 | |
def TOKEN(r):
    """Return a decorator that installs *r* as the decorated function's docstring.

    *r* is either the docstring value itself or something callable, in
    which case that callable's own ``__doc__`` is copied instead.  The
    decorated function is returned unchanged apart from its ``__doc__``.
    """
    def set_doc(f):
        doc = r
        # hasattr() is used as the "is it callable?" probe; anything with a
        # __call__ attribute donates its docstring.
        if hasattr(doc,"__call__"):
            doc = doc.__doc__
        f.__doc__ = doc
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
1058 | |
OLD | NEW |