third_party/ply/lex.py - Issue 6673006: Add PLY (Python Lex Yacc)

Side by Side Diff: third_party/ply/lex.py

Issue 6673006: Add PLY (Python Lex Yacc) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 9 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 # -----------------------------------------------------------------------------

	2 # ply: lex.py

	3 #

	4 # Copyright (C) 2001-2011,

	5 # David M. Beazley (Dabeaz LLC)

	6 # All rights reserved.

	7 #

	8 # Redistribution and use in source and binary forms, with or without

	9 # modification, are permitted provided that the following conditions are

	10 # met:

	11 #

	12 # * Redistributions of source code must retain the above copyright notice,

	13 # this list of conditions and the following disclaimer.

	14 # * Redistributions in binary form must reproduce the above copyright notice,

	15 # this list of conditions and the following disclaimer in the documentation

	16 # and/or other materials provided with the distribution.

	17 # * Neither the name of the David Beazley or Dabeaz LLC may be used to

	18 # endorse or promote products derived from this software without

	19 # specific prior written permission.

	20 #

	21 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	22 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	23 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	24 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	25 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	26 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	27 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	28 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	29 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	30 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	31 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	32 # -----------------------------------------------------------------------------

	33

	34 __version__ = "3.4"

	35 __tabversion__ = "3.2" # Version of table file used

	36

	37 import re, sys, types, copy, os

	38

	39 # This tuple contains known string types

	40 try:

	41 # Python 2.6

	42 StringTypes = (types.StringType, types.UnicodeType)

	43 except AttributeError:

	44 # Python 3.0

	45 StringTypes = (str, bytes)

	46

	47 # Extract the code attribute of a function. Different implementations

	48 # are for Python 2/3 compatibility.

	49

	50 if sys.version_info[0] < 3:

	51 def func_code(f):

	52 return f.func_code

	53 else:

	54 def func_code(f):

	55 return f.__code__

	56

	57 # This regular expression is used to match valid token names

	58 _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

	59

	60 # Exception thrown when invalid token encountered and no default error

	61 # handler is defined.

	62

	63 class LexError(Exception):

	64 def __init__(self,message,s):

	65 self.args = (message,)

	66 self.text = s

	67

	68 # Token class. This class is used to represent the tokens produced.

	69 class LexToken(object):

	70 def __str__(self):

	71 return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self. lexpos)

	72 def __repr__(self):

	73 return str(self)

	74

	75 # This object is a stand-in for a logging object created by the

	76 # logging module.

	77

	78 class PlyLogger(object):

	79 def __init__(self,f):

	80 self.f = f

	81 def critical(self,msg,args,*kwargs):

	82 self.f.write((msg % args) + "\n")

	83

	84 def warning(self,msg,args,*kwargs):

	85 self.f.write("WARNING: "+ (msg % args) + "\n")

	86

	87 def error(self,msg,args,*kwargs):

	88 self.f.write("ERROR: " + (msg % args) + "\n")

	89

	90 info = critical

	91 debug = critical

	92

	93 # Null logger is used when no output is generated. Does nothing.

	94 class NullLogger(object):

	95 def __getattribute__(self,name):

	96 return self

	97 def __call__(self,args,*kwargs):

	98 return self

	99

	100 # -----------------------------------------------------------------------------

	101 # === Lexing Engine ===

	102 #

	103 # The following Lexer class implements the lexer runtime. There are only

	104 # a few public methods and attributes:

	105 #

	106 # input() - Store a new string in the lexer

	107 # token() - Get the next token

	108 # clone() - Clone the lexer

	109 #

	110 # lineno - Current line number

	111 # lexpos - Current position in the input string

	112 # -----------------------------------------------------------------------------

	113

	114 class Lexer:

	115 def __init__(self):

	116 self.lexre = None # Master regular expression. This is a lis t of

	117 # tuples (re,findex) where re is a compile d

	118 # regular expression and findex is a list

	119 # mapping regex group numbers to rules

	120 self.lexretext = None # Current regular expression strings

	121 self.lexstatere = {} # Dictionary mapping lexer states to maste r regexs

	122 self.lexstateretext = {} # Dictionary mapping lexer states to regex strings

	123 self.lexstaterenames = {} # Dictionary mapping lexer states to symbo l names

	124 self.lexstate = "INITIAL" # Current lexer state

	125 self.lexstatestack = [] # Stack of lexer states

	126 self.lexstateinfo = None # State information

	127 self.lexstateignore = {} # Dictionary of ignored characters for eac h state

	128 self.lexstateerrorf = {} # Dictionary of error functions for each s tate

	129 self.lexreflags = 0 # Optional re compile flags

	130 self.lexdata = None # Actual input data (as a string)

	131 self.lexpos = 0 # Current position in input text

	132 self.lexlen = 0 # Length of the input text

	133 self.lexerrorf = None # Error rule (if any)

	134 self.lextokens = None # List of valid tokens

	135 self.lexignore = "" # Ignored characters

	136 self.lexliterals = "" # Literal characters that can be passed th rough

	137 self.lexmodule = None # Module

	138 self.lineno = 1 # Current line number

	139 self.lexoptimize = 0 # Optimized mode

	140

	141 def clone(self,object=None):

	142 c = copy.copy(self)

	143

	144 # If the object parameter has been supplied, it means we are attaching t he

	145 # lexer to a new object. In this case, we have to rebind all methods in

	146 # the lexstatere and lexstateerrorf tables.

	147

	148 if object:

	149 newtab = { }

	150 for key, ritem in self.lexstatere.items():

	151 newre = []

	152 for cre, findex in ritem:

	153 newfindex = []

	154 for f in findex:

	155 if not f or not f[0]:

	156 newfindex.append(f)

	157 continue

	158 newfindex.append((getattr(object,f[0].__name__),f[1]))

	159 newre.append((cre,newfindex))

	160 newtab[key] = newre

	161 c.lexstatere = newtab

	162 c.lexstateerrorf = { }

	163 for key, ef in self.lexstateerrorf.items():

	164 c.lexstateerrorf[key] = getattr(object,ef.__name__)

	165 c.lexmodule = object

	166 return c

	167

	168 # ------------------------------------------------------------

	169 # writetab() - Write lexer information to a table file

	170 # ------------------------------------------------------------

	171 def writetab(self,tabfile,outputdir=""):

	172 if isinstance(tabfile,types.ModuleType):

	173 return

	174 basetabfilename = tabfile.split(".")[-1]

	175 filename = os.path.join(outputdir,basetabfilename)+".py"

	176 tf = open(filename,"w")

	177 tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))

	178 tf.write("_tabversion = %s\n" % repr(__version__))

	179 tf.write("_lextokens = %s\n" % repr(self.lextokens))

	180 tf.write("_lexreflags = %s\n" % repr(self.lexreflags))

	181 tf.write("_lexliterals = %s\n" % repr(self.lexliterals))

	182 tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

	183

	184 tabre = { }

	185 # Collect all functions in the initial state

	186 initial = self.lexstatere["INITIAL"]

	187 initialfuncs = []

	188 for part in initial:

	189 for f in part[1]:

	190 if f and f[0]:

	191 initialfuncs.append(f)

	192

	193 for key, lre in self.lexstatere.items():

	194 titem = []

	195 for i in range(len(lre)):

	196 titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[ i][1],self.lexstaterenames[key][i])))

	197 tabre[key] = titem

	198

	199 tf.write("_lexstatere = %s\n" % repr(tabre))

	200 tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

	201

	202 taberr = { }

	203 for key, ef in self.lexstateerrorf.items():

	204 if ef:

	205 taberr[key] = ef.__name__

	206 else:

	207 taberr[key] = None

	208 tf.write("_lexstateerrorf = %s\n" % repr(taberr))

	209 tf.close()

	210

	211 # ------------------------------------------------------------

	212 # readtab() - Read lexer information from a tab file

	213 # ------------------------------------------------------------

	214 def readtab(self,tabfile,fdict):

	215 if isinstance(tabfile,types.ModuleType):

	216 lextab = tabfile

	217 else:

	218 if sys.version_info[0] < 3:

	219 exec("import %s as lextab" % tabfile)

	220 else:

	221 env = { }

	222 exec("import %s as lextab" % tabfile, env,env)

	223 lextab = env['lextab']

	224

	225 if getattr(lextab,"_tabversion","0.0") != __version__:

	226 raise ImportError("Inconsistent PLY version")

	227

	228 self.lextokens = lextab._lextokens

	229 self.lexreflags = lextab._lexreflags

	230 self.lexliterals = lextab._lexliterals

	231 self.lexstateinfo = lextab._lexstateinfo

	232 self.lexstateignore = lextab._lexstateignore

	233 self.lexstatere = { }

	234 self.lexstateretext = { }

	235 for key,lre in lextab._lexstatere.items():

	236 titem = []

	237 txtitem = []

	238 for i in range(len(lre)):

	239 titem.append((re.compile(lre[i][0],lextab._lexreflags \| re.VER BOSE),_names_to_funcs(lre[i][1],fdict)))

	240 txtitem.append(lre[i][0])

	241 self.lexstatere[key] = titem

	242 self.lexstateretext[key] = txtitem

	243 self.lexstateerrorf = { }

	244 for key,ef in lextab._lexstateerrorf.items():

	245 self.lexstateerrorf[key] = fdict[ef]

	246 self.begin('INITIAL')

	247

	248 # ------------------------------------------------------------

	249 # input() - Push a new string into the lexer

	250 # ------------------------------------------------------------

	251 def input(self,s):

	252 # Pull off the first character to see if s looks like a string

	253 c = s[:1]

	254 if not isinstance(c,StringTypes):

	255 raise ValueError("Expected a string")

	256 self.lexdata = s

	257 self.lexpos = 0

	258 self.lexlen = len(s)

	259

	260 # ------------------------------------------------------------

	261 # begin() - Changes the lexing state

	262 # ------------------------------------------------------------

	263 def begin(self,state):

	264 if not state in self.lexstatere:

	265 raise ValueError("Undefined state")

	266 self.lexre = self.lexstatere[state]

	267 self.lexretext = self.lexstateretext[state]

	268 self.lexignore = self.lexstateignore.get(state,"")

	269 self.lexerrorf = self.lexstateerrorf.get(state,None)

	270 self.lexstate = state

	271

	272 # ------------------------------------------------------------

	273 # push_state() - Changes the lexing state and saves old on stack

	274 # ------------------------------------------------------------

	275 def push_state(self,state):

	276 self.lexstatestack.append(self.lexstate)

	277 self.begin(state)

	278

	279 # ------------------------------------------------------------

	280 # pop_state() - Restores the previous state

	281 # ------------------------------------------------------------

	282 def pop_state(self):

	283 self.begin(self.lexstatestack.pop())

	284

	285 # ------------------------------------------------------------

	286 # current_state() - Returns the current lexing state

	287 # ------------------------------------------------------------

	288 def current_state(self):

	289 return self.lexstate

	290

	291 # ------------------------------------------------------------

	292 # skip() - Skip ahead n characters

	293 # ------------------------------------------------------------

	294 def skip(self,n):

	295 self.lexpos += n

	296

	297 # ------------------------------------------------------------

	298 # opttoken() - Return the next token from the Lexer

	299 #

	300 # Note: This function has been carefully implemented to be as fast

	301 # as possible. Don't make changes unless you really know what

	302 # you are doing

	303 # ------------------------------------------------------------

	304 def token(self):

	305 # Make local copies of frequently referenced attributes

	306 lexpos = self.lexpos

	307 lexlen = self.lexlen

	308 lexignore = self.lexignore

	309 lexdata = self.lexdata

	310

	311 while lexpos < lexlen:

	312 # This code provides some short-circuit code for whitespace, tabs, a nd other ignored characters

	313 if lexdata[lexpos] in lexignore:

	314 lexpos += 1

	315 continue

	316

	317 # Look for a regular expression match

	318 for lexre,lexindexfunc in self.lexre:

	319 m = lexre.match(lexdata,lexpos)

	320 if not m: continue

	321

	322 # Create a token for return

	323 tok = LexToken()

	324 tok.value = m.group()

	325 tok.lineno = self.lineno

	326 tok.lexpos = lexpos

	327

	328 i = m.lastindex

	329 func,tok.type = lexindexfunc[i]

	330

	331 if not func:

	332 # If no token type was set, it's an ignored token

	333 if tok.type:

	334 self.lexpos = m.end()

	335 return tok

	336 else:

	337 lexpos = m.end()

	338 break

	339

	340 lexpos = m.end()

	341

	342 # If token is processed by a function, call it

	343

	344 tok.lexer = self # Set additional attributes useful in toke n rules

	345 self.lexmatch = m

	346 self.lexpos = lexpos

	347

	348 newtok = func(tok)

	349

	350 # Every function must return a token, if nothing, we just move t o next token

	351 if not newtok:

	352 lexpos = self.lexpos # This is here in case user has updated lexpos.

	353 lexignore = self.lexignore # This is here in case there was a state change

	354 break

	355

	356 # Verify type of the token. If not in the token map, raise an e rror

	357 if not self.lexoptimize:

	358 if not newtok.type in self.lextokens:

	359 raise LexError("%s:%d: Rule '%s' returned an unknown tok en type '%s'" % (

	360 func_code(func).co_filename, func_code(func).co_firs tlineno,

	361 func.__name__, newtok.type),lexdata[lexpos:])

	362

	363 return newtok

	364 else:

	365 # No match, see if in literals

	366 if lexdata[lexpos] in self.lexliterals:

	367 tok = LexToken()

	368 tok.value = lexdata[lexpos]

	369 tok.lineno = self.lineno

	370 tok.type = tok.value

	371 tok.lexpos = lexpos

	372 self.lexpos = lexpos + 1

	373 return tok

	374

	375 # No match. Call t_error() if defined.

	376 if self.lexerrorf:

	377 tok = LexToken()

	378 tok.value = self.lexdata[lexpos:]

	379 tok.lineno = self.lineno

	380 tok.type = "error"

	381 tok.lexer = self

	382 tok.lexpos = lexpos

	383 self.lexpos = lexpos

	384 newtok = self.lexerrorf(tok)

	385 if lexpos == self.lexpos:

	386 # Error method didn't change text position at all. This is an error.

	387 raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])

	388 lexpos = self.lexpos

	389 if not newtok: continue

	390 return newtok

	391

	392 self.lexpos = lexpos

	393 raise LexError("Illegal character '%s' at index %d" % (lexdata[l expos],lexpos), lexdata[lexpos:])

	394

	395 self.lexpos = lexpos + 1

	396 if self.lexdata is None:

	397 raise RuntimeError("No input string given with input()")

	398 return None

	399

	400 # Iterator interface

	401 def __iter__(self):

	402 return self

	403

	404 def next(self):

	405 t = self.token()

	406 if t is None:

	407 raise StopIteration

	408 return t

	409

	410 __next__ = next

	411

	412 # -----------------------------------------------------------------------------

	413 # ==== Lex Builder ===

	414 #

	415 # The functions and classes below are used to collect lexing information

	416 # and build a Lexer object from it.

	417 # -----------------------------------------------------------------------------

	418

	419 # -----------------------------------------------------------------------------

	420 # get_caller_module_dict()

	421 #

	422 # This function returns a dictionary containing all of the symbols defined within

	423 # a caller further down the call stack. This is used to get the environment

	424 # associated with the yacc() call if none was provided.

	425 # -----------------------------------------------------------------------------

	426

	427 def get_caller_module_dict(levels):

	428 try:

	429 raise RuntimeError

	430 except RuntimeError:

	431 e,b,t = sys.exc_info()

	432 f = t.tb_frame

	433 while levels > 0:

	434 f = f.f_back

	435 levels -= 1

	436 ldict = f.f_globals.copy()

	437 if f.f_globals != f.f_locals:

	438 ldict.update(f.f_locals)

	439

	440 return ldict

	441

	442 # -----------------------------------------------------------------------------

	443 # _funcs_to_names()

	444 #

	445 # Given a list of regular expression functions, this converts it to a list

	446 # suitable for output to a table file

	447 # -----------------------------------------------------------------------------

	448

	449 def _funcs_to_names(funclist,namelist):

	450 result = []

	451 for f,name in zip(funclist,namelist):

	452 if f and f[0]:

	453 result.append((name, f[1]))

	454 else:

	455 result.append(f)

	456 return result

	457

	458 # -----------------------------------------------------------------------------

	459 # _names_to_funcs()

	460 #

	461 # Given a list of regular expression function names, this converts it back to

	462 # functions.

	463 # -----------------------------------------------------------------------------

	464

	465 def _names_to_funcs(namelist,fdict):

	466 result = []

	467 for n in namelist:

	468 if n and n[0]:

	469 result.append((fdict[n[0]],n[1]))

	470 else:

	471 result.append(n)

	472 return result

	473

	474 # -----------------------------------------------------------------------------

	475 # _form_master_re()

	476 #

	477 # This function takes a list of all of the regex components and attempts to

	478 # form the master regular expression. Given limitations in the Python re

	479 # module, it may be necessary to break the master regex into separate expressions.

	480 # -----------------------------------------------------------------------------

	481

	482 def _form_master_re(relist,reflags,ldict,toknames):

	483 if not relist: return []

	484 regex = "\|".join(relist)

	485 try:

	486 lexre = re.compile(regex,re.VERBOSE \| reflags)

	487

	488 # Build the index to function map for the matching engine

	489 lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)

	490 lexindexnames = lexindexfunc[:]

	491

	492 for f,i in lexre.groupindex.items():

	493 handle = ldict.get(f,None)

	494 if type(handle) in (types.FunctionType, types.MethodType):

	495 lexindexfunc[i] = (handle,toknames[f])

	496 lexindexnames[i] = f

	497 elif handle is not None:

	498 lexindexnames[i] = f

	499 if f.find("ignore_") > 0:

	500 lexindexfunc[i] = (None,None)

	501 else:

	502 lexindexfunc[i] = (None, toknames[f])

	503

	504 return [(lexre,lexindexfunc)],[regex],[lexindexnames]

	505 except Exception:

	506 m = int(len(relist)/2)

	507 if m == 0: m = 1

	508 llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)

	509 rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)

	510 return llist+rlist, lre+rre, lnames+rnames

	511

	512 # -----------------------------------------------------------------------------

	513 # def _statetoken(s,names)

	514 #

	515 # Given a declaration name s of the form "t_" and a dictionary whose keys are

	516 # state names, this function returns a tuple (states,tokenname) where states

	517 # is a tuple of state names and tokenname is the name of the token. For example,

	518 # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')

	519 # -----------------------------------------------------------------------------

	520

	521 def _statetoken(s,names):

	522 nonstate = 1

	523 parts = s.split("_")

	524 for i in range(1,len(parts)):

	525 if not parts[i] in names and parts[i] != 'ANY': break

	526 if i > 1:

	527 states = tuple(parts[1:i])

	528 else:

	529 states = ('INITIAL',)

	530

	531 if 'ANY' in states:

	532 states = tuple(names)

	533

	534 tokenname = "_".join(parts[i:])

	535 return (states,tokenname)

	536

	537

	538 # -----------------------------------------------------------------------------

	539 # LexerReflect()

	540 #

	541 # This class represents information needed to build a lexer as extracted from a

	542 # user's input file.

	543 # -----------------------------------------------------------------------------

	544 class LexerReflect(object):

	545 def __init__(self,ldict,log=None,reflags=0):

	546 self.ldict = ldict

	547 self.error_func = None

	548 self.tokens = []

	549 self.reflags = reflags

	550 self.stateinfo = { 'INITIAL' : 'inclusive'}

	551 self.files = {}

	552 self.error = 0

	553

	554 if log is None:

	555 self.log = PlyLogger(sys.stderr)

	556 else:

	557 self.log = log

	558

	559 # Get all of the basic information

	560 def get_all(self):

	561 self.get_tokens()

	562 self.get_literals()

	563 self.get_states()

	564 self.get_rules()

	565

	566 # Validate all of the information

	567 def validate_all(self):

	568 self.validate_tokens()

	569 self.validate_literals()

	570 self.validate_rules()

	571 return self.error

	572

	573 # Get the tokens map

	574 def get_tokens(self):

	575 tokens = self.ldict.get("tokens",None)

	576 if not tokens:

	577 self.log.error("No token list is defined")

	578 self.error = 1

	579 return

	580

	581 if not isinstance(tokens,(list, tuple)):

	582 self.log.error("tokens must be a list or tuple")

	583 self.error = 1

	584 return

	585

	586 if not tokens:

	587 self.log.error("tokens is empty")

	588 self.error = 1

	589 return

	590

	591 self.tokens = tokens

	592

	593 # Validate the tokens

	594 def validate_tokens(self):

	595 terminals = {}

	596 for n in self.tokens:

	597 if not _is_identifier.match(n):

	598 self.log.error("Bad token name '%s'",n)

	599 self.error = 1

	600 if n in terminals:

	601 self.log.warning("Token '%s' multiply defined", n)

	602 terminals[n] = 1

	603

	604 # Get the literals specifier

	605 def get_literals(self):

	606 self.literals = self.ldict.get("literals","")

	607

	608 # Validate literals

	609 def validate_literals(self):

	610 try:

	611 for c in self.literals:

	612 if not isinstance(c,StringTypes) or len(c) > 1:

	613 self.log.error("Invalid literal %s. Must be a single charact er", repr(c))

	614 self.error = 1

	615 continue

	616

	617 except TypeError:

	618 self.log.error("Invalid literals specification. literals must be a s equence of characters")

	619 self.error = 1

	620

	621 def get_states(self):

	622 self.states = self.ldict.get("states",None)

	623 # Build statemap

	624 if self.states:

	625 if not isinstance(self.states,(tuple,list)):

	626 self.log.error("states must be defined as a tuple or list")

	627 self.error = 1

	628 else:

	629 for s in self.states:

	630 if not isinstance(s,tuple) or len(s) != 2:

	631 self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive\|inclusive')",repr(s))

	632 self.error = 1

	633 continue

	634 name, statetype = s

	635 if not isinstance(name,StringTypes):

	636 self.log.error("State name %s must be a string", repr(name))

	637 self.error = 1

	638 continue

	639 if not (statetype == 'inclusive' or statetype == 'exclus ive'):

	640 self.log.error("State type for state %s must be ' inclusive' or 'exclusive'",name)

	641 self.error = 1

	642 continue

	643 if name in self.stateinfo:

	644 self.log.error("State '%s' already defined",name)

	645 self.error = 1

	646 continue

	647 self.stateinfo[name] = statetype

	648

	649 # Get all of the symbols with a t_ prefix and sort them into various

	650 # categories (functions, strings, error functions, and ignore characters)

	651

	652 def get_rules(self):

	653 tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

	654

	655 # Now build up a list of functions and a list of strings

	656

	657 self.toknames = { } # Mapping of symbols to token names

	658 self.funcsym = { } # Symbols defined as functions

	659 self.strsym = { } # Symbols defined as strings

	660 self.ignore = { } # Ignore strings by state

	661 self.errorf = { } # Error functions by state

	662

	663 for s in self.stateinfo:

	664 self.funcsym[s] = []

	665 self.strsym[s] = []

	666

	667 if len(tsymbols) == 0:

	668 self.log.error("No rules of the form t_rulename are defined")

	669 self.error = 1

	670 return

	671

	672 for f in tsymbols:

	673 t = self.ldict[f]

	674 states, tokname = _statetoken(f,self.stateinfo)

	675 self.toknames[f] = tokname

	676

	677 if hasattr(t,"__call__"):

	678 if tokname == 'error':

	679 for s in states:

	680 self.errorf[s] = t

	681 elif tokname == 'ignore':

	682 line = func_code(t).co_firstlineno

	683 file = func_code(t).co_filename

	684 self.log.error("%s:%d: Rule '%s' must be defined as a string ",file,line,t.__name__)

	685 self.error = 1

	686 else:

	687 for s in states:

	688 self.funcsym[s].append((f,t))

	689 elif isinstance(t, StringTypes):

	690 if tokname == 'ignore':

	691 for s in states:

	692 self.ignore[s] = t

	693 if "\\" in t:

	694 self.log.warning("%s contains a literal backslash '\\'", f)

	695

	696 elif tokname == 'error':

	697 self.log.error("Rule '%s' must be defined as a function", f)

	698 self.error = 1

	699 else:

	700 for s in states:

	701 self.strsym[s].append((f,t))

	702 else:

	703 self.log.error("%s not defined as a function or string", f)

	704 self.error = 1

	705

	706 # Sort the functions by line number

	707 for f in self.funcsym.values():

	708 if sys.version_info[0] < 3:

	709 f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code( y[1]).co_firstlineno))

	710 else:

	711 # Python 3.0

	712 f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

	713

	714 # Sort the strings by regular expression length

	715 for s in self.strsym.values():

	716 if sys.version_info[0] < 3:

	717 s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[ 1])))

	718 else:

	719 # Python 3.0

	720 s.sort(key=lambda x: len(x[1]),reverse=True)

	721

	722 # Validate all of the t_rules collected

	723 def validate_rules(self):

	724 for state in self.stateinfo:

	725 # Validate all rules defined by functions

	726

	727

	728

	729 for fname, f in self.funcsym[state]:

	730 line = func_code(f).co_firstlineno

	731 file = func_code(f).co_filename

	732 self.files[file] = 1

	733

	734 tokname = self.toknames[fname]

	735 if isinstance(f, types.MethodType):

	736 reqargs = 2

	737 else:

	738 reqargs = 1

	739 nargs = func_code(f).co_argcount

	740 if nargs > reqargs:

	741 self.log.error("%s:%d: Rule '%s' has too many arguments",fil e,line,f.__name__)

	742 self.error = 1

	743 continue

	744

	745 if nargs < reqargs:

	746 self.log.error("%s:%d: Rule '%s' requires an argument", file ,line,f.__name__)

	747 self.error = 1

	748 continue

	749

	750 if not f.__doc__:

	751 self.log.error("%s:%d: No regular expression defined for rul e '%s'",file,line,f.__name__)

	752 self.error = 1

	753 continue

	754

	755 try:

	756 c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE \| self.reflags)

	757 if c.match(""):

	758 self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)

	759 self.error = 1

	760 except re.error:

	761 _etype, e, _etrace = sys.exc_info()

	762 self.log.error("%s:%d: Invalid regular expression for rule ' %s'. %s", file,line,f.__name__,e)

	763 if '#' in f.__doc__:

	764 self.log.error("%s:%d. Make sure '#' in rule '%s' is esc aped with '\\#'",file,line, f.__name__)

	765 self.error = 1

	766

	767 # Validate all rules defined by strings

	768 for name,r in self.strsym[state]:

	769 tokname = self.toknames[name]

	770 if tokname == 'error':

	771 self.log.error("Rule '%s' must be defined as a function", na me)

	772 self.error = 1

	773 continue

	774

	775 if not tokname in self.tokens and tokname.find("ignore_") < 0:

	776 self.log.error("Rule '%s' defined for an unspecified token % s",name,tokname)

	777 self.error = 1

	778 continue

	779

	780 try:

	781 c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE \| self.ref lags)

	782 if (c.match("")):

	783 self.log.error("Regular expression for rule '%s' matche s empty string",name)

	784 self.error = 1

	785 except re.error:

	786 _etype, e, _etrace = sys.exc_info()

	787 self.log.error("Invalid regular expression for rule '%s'. %s ",name,e)

	788 if '#' in r:

	789 self.log.error("Make sure '#' in rule '%s' is escaped w ith '\\#'",name)

	790 self.error = 1

	791

	792 if not self.funcsym[state] and not self.strsym[state]:

	793 self.log.error("No rules defined for state '%s'",state)

	794 self.error = 1

	795

	796 # Validate the error function

	797 efunc = self.errorf.get(state,None)

	798 if efunc:

	799 f = efunc

	800 line = func_code(f).co_firstlineno

	801 file = func_code(f).co_filename

	802 self.files[file] = 1

	803

	804 if isinstance(f, types.MethodType):

	805 reqargs = 2

	806 else:

	807 reqargs = 1

	808 nargs = func_code(f).co_argcount

	809 if nargs > reqargs:

	810 self.log.error("%s:%d: Rule '%s' has too many arguments",fil e,line,f.__name__)

	811 self.error = 1

	812

	813 if nargs < reqargs:

	814 self.log.error("%s:%d: Rule '%s' requires an argument", file ,line,f.__name__)

	815 self.error = 1

	816

	817 for f in self.files:

	818 self.validate_file(f)

	819

	820

	821 # -------------------------------------------------------------------------- ---

	822 # validate_file()

	823 #

	824 # This checks to see if there are duplicated t_rulename() functions or strin gs

	825 # in the parser input file. This is done using a simple regular expression

	826 # match on each line in the given file.

	827 # -------------------------------------------------------------------------- ---

	828

	829 def validate_file(self,filename):

	830 import os.path

	831 base,ext = os.path.splitext(filename)

	832 if ext != '.py': return # No idea what the file is. Return OK

	833

	834 try:

	835 f = open(filename)

	836 lines = f.readlines()

	837 f.close()

	838 except IOError:

	839 return # Couldn't find the file. Don't worry a bout it

	840

	841 fre = re.compile(r'\sdef\s+(t_[a-zA-Z_0-9])\(')

	842 sre = re.compile(r'\s(t_[a-zA-Z_0-9])\s*=')

	843

	844 counthash = { }

	845 linen = 1

	846 for l in lines:

	847 m = fre.match(l)

	848 if not m:

	849 m = sre.match(l)

	850 if m:

	851 name = m.group(1)

	852 prev = counthash.get(name)

	853 if not prev:

	854 counthash[name] = linen

	855 else:

	856 self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)

	857 self.error = 1

	858 linen += 1

	859

	860 # -----------------------------------------------------------------------------

	861 # lex(module)

	862 #

	863 # Build all of the regular expression rules from definitions in the supplied module

	864 # -----------------------------------------------------------------------------

	865 def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,now arn=0,outputdir="", debuglog=None, errorlog=None):

	866 global lexer

	867 ldict = None

	868 stateinfo = { 'INITIAL' : 'inclusive'}

	869 lexobj = Lexer()

	870 lexobj.lexoptimize = optimize

	871 global token,input

	872

	873 if errorlog is None:

	874 errorlog = PlyLogger(sys.stderr)

	875

	876 if debug:

	877 if debuglog is None:

	878 debuglog = PlyLogger(sys.stderr)

	879

	880 # Get the module dictionary used for the lexer

	881 if object: module = object

	882

	883 if module:

	884 _items = [(k,getattr(module,k)) for k in dir(module)]

	885 ldict = dict(_items)

	886 else:

	887 ldict = get_caller_module_dict(2)

	888

	889 # Collect parser information from the dictionary

	890 linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)

	891 linfo.get_all()

	892 if not optimize:

	893 if linfo.validate_all():

	894 raise SyntaxError("Can't build lexer")

	895

	896 if optimize and lextab:

	897 try:

	898 lexobj.readtab(lextab,ldict)

	899 token = lexobj.token

	900 input = lexobj.input

	901 lexer = lexobj

	902 return lexobj

	903

	904 except ImportError:

	905 pass

	906

	907 # Dump some basic debugging information

	908 if debug:

	909 debuglog.info("lex: tokens = %r", linfo.tokens)

	910 debuglog.info("lex: literals = %r", linfo.literals)

	911 debuglog.info("lex: states = %r", linfo.stateinfo)

	912

	913 # Build a dictionary of valid token names

	914 lexobj.lextokens = { }

	915 for n in linfo.tokens:

	916 lexobj.lextokens[n] = 1

	917

	918 # Get literals specification

	919 if isinstance(linfo.literals,(list,tuple)):

	920 lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)

	921 else:

	922 lexobj.lexliterals = linfo.literals

	923

	924 # Get the stateinfo dictionary

	925 stateinfo = linfo.stateinfo

	926

	927 regexs = { }

	928 # Build the master regular expressions

	929 for state in stateinfo:

	930 regex_list = []

	931

	932 # Add rules defined by functions first

	933 for fname, f in linfo.funcsym[state]:

	934 line = func_code(f).co_firstlineno

	935 file = func_code(f).co_filename

	936 regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))

	937 if debug:

	938 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f .__doc__, state)

	939

	940 # Now add all of the simple rules

	941 for name,r in linfo.strsym[state]:

	942 regex_list.append("(?P<%s>%s)" % (name,r))

	943 if debug:

	944 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

	945

	946 regexs[state] = regex_list

	947

	948 # Build the master regular expressions

	949

	950 if debug:

	951 debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

	952

	953 for state in regexs:

	954 lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,l info.toknames)

	955 lexobj.lexstatere[state] = lexre

	956 lexobj.lexstateretext[state] = re_text

	957 lexobj.lexstaterenames[state] = re_names

	958 if debug:

	959 for i in range(len(re_text)):

	960 debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_ text[i])

	961

	962 # For inclusive states, we need to add the regular expressions from the INIT IAL state

	963 for state,stype in stateinfo.items():

	964 if state != "INITIAL" and stype == 'inclusive':

	965 lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])

	966 lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL' ])

	967 lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIA L'])

	968

	969 lexobj.lexstateinfo = stateinfo

	970 lexobj.lexre = lexobj.lexstatere["INITIAL"]

	971 lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

	972 lexobj.lexreflags = reflags

	973

	974 # Set up ignore variables

	975 lexobj.lexstateignore = linfo.ignore

	976 lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

	977

	978 # Set up error functions

	979 lexobj.lexstateerrorf = linfo.errorf

	980 lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)

	981 if not lexobj.lexerrorf:

	982 errorlog.warning("No t_error rule is defined")

	983

	984 # Check state information for ignore and error rules

	985 for s,stype in stateinfo.items():

	986 if stype == 'exclusive':

	987 if not s in linfo.errorf:

	988 errorlog.warning("No error rule is defined for exclusive stat e '%s'", s)

	989 if not s in linfo.ignore and lexobj.lexignore:

	990 errorlog.warning("No ignore rule is defined for exclusive sta te '%s'", s)

	991 elif stype == 'inclusive':

	992 if not s in linfo.errorf:

	993 linfo.errorf[s] = linfo.errorf.get("INITIAL",None)

	994 if not s in linfo.ignore:

	995 linfo.ignore[s] = linfo.ignore.get("INITIAL","")

	996

	997 # Create global versions of the token() and input() functions

	998 token = lexobj.token

	999 input = lexobj.input

	1000 lexer = lexobj

	1001

	1002 # If in optimize mode, we write the lextab

	1003 if lextab and optimize:

	1004 lexobj.writetab(lextab,outputdir)

	1005

	1006 return lexobj

	1007

	1008 # -----------------------------------------------------------------------------

	1009 # runmain()

	1010 #

	1011 # This runs the lexer as a main program

	1012 # -----------------------------------------------------------------------------

	1013

	1014 def runmain(lexer=None,data=None):

	1015 if not data:

	1016 try:

	1017 filename = sys.argv[1]

	1018 f = open(filename)

	1019 data = f.read()

	1020 f.close()

	1021 except IndexError:

	1022 sys.stdout.write("Reading from standard input (type EOF to end):\n")

	1023 data = sys.stdin.read()

	1024

	1025 if lexer:

	1026 _input = lexer.input

	1027 else:

	1028 _input = input

	1029 _input(data)

	1030 if lexer:

	1031 _token = lexer.token

	1032 else:

	1033 _token = token

	1034

	1035 while 1:

	1036 tok = _token()

	1037 if not tok: break

	1038 sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,to k.lexpos))

	1039

	1040 # -----------------------------------------------------------------------------

	1041 # @TOKEN(regex)

	1042 #

	1043 # This decorator function can be used to set the regex expression on a function

	1044 # when its docstring might need to be set in an alternative way

	1045 # -----------------------------------------------------------------------------

	1046

	1047 def TOKEN(r):

	1048 def set_doc(f):

	1049 if hasattr(r,"__call__"):

	1050 f.__doc__ = r.__doc__

	1051 else:

	1052 f.__doc__ = r

	1053 return f

	1054 return set_doc

	1055

	1056 # Alternative spelling of the TOKEN decorator

	1057 Token = TOKEN

	1058

OLD	NEW

« no previous file with comments | « third_party/ply/__init__.py ('k') | third_party/ply/yacc.py » ('j') | no next file with comments »