# PLY based Lexer class, based on pycparser by Eli Bendersky.
#
# Copyright (c) 2012, Eli Bendersky
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of Eli Bendersky nor the names of its contributors may
#   be used to endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import re
import sys
import os.path

# Try to load the ply module; if that fails, assume it lives in the third_party
# directory.
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply.lex import TOKEN
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(
      module_path, os.pardir, os.pardir, os.pardir, os.pardir, 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply.lex import TOKEN


class Lexer(object):
  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    print('%s at line %d' % (msg, token.lineno))
    self.lexer.skip(1)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',
    'DATA_PIPE_CONSUMER',
    'DATA_PIPE_PRODUCER',
    'MESSAGE_PIPE',

    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
    'VOID',
  )

  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword
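  # E.g. keyword_map['struct'] == 'STRUCT': t_NAME() below uses this map to
  # promote identifiers that spell a reserved word (in lowercase) to keyword
  # tokens.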

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
    'FLOAT_CONST', 'HEX_FLOAT_CONST',
    'CHAR_CONST',
    'WCHAR_CONST',

    # String literals
    'STRING_LITERAL',
    'WSTRING_LITERAL',

    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
    'LOR', 'LAND', 'LNOT',
    'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

    # Assignment
    'EQUALS',

    # Conditional operator (?)
    'CONDOP',

    # Delimiters
    'LPAREN', 'RPAREN',          # ( )
    'LBRACKET', 'RBRACKET',      # [ ]
    'LBRACE', 'RBRACE',          # { }
    'SEMI', 'COLON',             # ; :
    'COMMA',                     # ,
  )

  ##
  ## Regexes for use in tokens
  ##
  ##

  # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
  identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  integer_suffix_opt = \
      r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
  decimal_constant = \
      '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
  octal_constant = '0[0-7]*'+integer_suffix_opt
  hex_constant = hex_prefix+hex_digits+integer_suffix_opt

  bad_octal_constant = '0[0-7]*[89]'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file).
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means sometimes parsing incorrect
  # code as well.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  wchar_const = 'L'+char_const
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  wstring_literal = 'L'+string_literal
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
  binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
  hex_fractional_constant = \
      '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
  hex_floating_constant = \
      '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+ \
      binary_exponent_part+'[FfLl]?)'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t'

  # Newlines
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")

  # Operators
  t_PLUS     = r'\+'
  t_MINUS    = r'-'
  t_TIMES    = r'\*'
  t_DIVIDE   = r'/'
  t_MOD      = r'%'
  t_OR       = r'\|'
  t_AND      = r'&'
  t_NOT      = r'~'
  t_XOR      = r'\^'
  t_LSHIFT   = r'<<'
  t_RSHIFT   = r'>>'
  t_LOR      = r'\|\|'
  t_LAND     = r'&&'
  t_LNOT     = r'!'
  t_LT       = r'<'
  t_GT       = r'>'
  t_LE       = r'<='
  t_GE       = r'>='
  t_EQ       = r'=='
  t_NE       = r'!='

  # =
  t_EQUALS   = r'='

  # ?
  t_CONDOP   = r'\?'

  # Delimiters
  t_LPAREN   = r'\('
  t_RPAREN   = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE   = r'\{'
  t_RBRACE   = r'\}'
  t_COMMA    = r','
  t_SEMI     = r';'
  t_COLON    = r':'

  t_STRING_LITERAL = string_literal
  t_ORDINAL        = r'@[0-9]*'

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
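  # (For example, on input "0x1F" a decimal-first ordering would match only
  # the leading "0" and leave "x1F" to be lexed separately; defining these
  # rules as functions makes PLY try them in the order they appear below, so
  # the hex rule wins.)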
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_floating_constant)
  def t_HEX_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(bad_octal_constant)
  def t_BAD_CONST_OCT(self, t):
    msg = "Invalid octal constant"
    self._error(msg, t)

  @TOKEN(octal_constant)
  def t_INT_CONST_OCT(self, t):
    return t

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(wchar_const)
  def t_WCHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  @TOKEN(wstring_literal)
  def t_WSTRING_LITERAL(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    pass

  def t_error(self, t):
    msg = 'Illegal character %s' % repr(t.value[0])
    self._error(msg, t)
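
# A minimal usage sketch (not part of the original module): the class above
# only defines token rules and assumes something else binds it to PLY and sets
# self.lexer; pycparser-style lexers typically do this with a build() method
# that calls ply.lex.lex(object=self). Hypothetical example:
#
#   import ply.lex as lex
#
#   lexer_obj = Lexer()
#   lexer_obj.lexer = lex.lex(object=lexer_obj)
#   lexer_obj.lexer.input('struct Foo { handle bar @1; };')
#   while True:
#     tok = lexer_obj.lexer.token()
#     if not tok:
#       break
#     print(tok.type, tok.value)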