OLD | NEW |
| (Empty) |
1 # Copyright 2014 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 import re | |
6 import sys | |
7 import os.path | |
8 | |
9 # Try to load the ply module, if not, then assume it is in the third_party | |
10 # directory. | |
11 try: | |
12 # Disable lint check which fails to find the ply module. | |
13 # pylint: disable=F0401 | |
14 from ply.lex import TOKEN | |
15 except ImportError: | |
16 module_path, module_name = os.path.split(__file__) | |
17 third_party = os.path.join(module_path, os.pardir, os.pardir, os.pardir, | |
18 os.pardir, os.pardir, 'third_party') | |
19 sys.path.append(third_party) | |
20 # pylint: disable=F0401 | |
21 from ply.lex import TOKEN | |
22 | |
23 | |
class LexError(Exception):
  """Lexing error tagged with the location at which it occurred.

  Attributes:
    filename: name of the file being lexed, as given by the caller.
    lineno: integer line number of the offending token (formatted with %d).
    msg: human-readable description of the problem.
  """

  def __init__(self, filename, lineno, msg):
    # Forward the arguments to Exception so that `args` is populated. Without
    # this, pickling/copying the exception fails because it is rebuilt by
    # calling the class with `args`, which would be empty.
    super(LexError, self).__init__(filename, lineno, msg)
    self.filename = filename
    self.lineno = lineno
    self.msg = msg

  def __str__(self):
    """Return the error formatted as "<filename>:<lineno>: Error: <msg>"."""
    return "%s:%d: Error: %s" % (self.filename, self.lineno, self.msg)

  def __repr__(self):
    """Return the same text as __str__."""
    return str(self)
35 | |
36 | |
class Lexer(object):
  """Token rules in the layout PLY's lex module expects.

  The class declares the `tokens` tuple, the `t_ignore` string, one `t_<NAME>`
  rule per token (either a regex string or a method whose regex is supplied
  via its docstring or the TOKEN decorator) and the `t_error` hook. Building
  the actual PLY lexer object from these rules happens outside this class.

  Rules named after an entry in `tokens` return the token; the remaining
  rules either discard their match (newlines, comments) or raise LexError
  for malformed input.

  NOTE: the token-rule methods below are documented with `#` comments rather
  than docstrings, because PLY takes a rule's regex from its docstring (or
  from the TOKEN decorator).
  """

  def __init__(self, filename):
    """Store `filename`; it is only used to label LexError messages."""
    self.filename = filename

  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    """Raise LexError(msg) located at this lexer's file and token.lineno.

    Args:
      msg: description of the problem.
      token: the PLY token being processed; only `lineno` is read.

    Raises:
      LexError: always.
    """
    raise LexError(self.filename, token.lineno, msg)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',
    'DATA_PIPE_CONSUMER',
    'DATA_PIPE_PRODUCER',
    'MESSAGE_PIPE',
    'SHARED_BUFFER',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
  )

  # Maps the lower-case spelling found in source text to the token type name,
  # e.g. 'struct' -> 'STRUCT'. Consulted by t_NAME.
  keyword_map = {keyword.lower(): keyword for keyword in keywords}

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
    'FLOAT_CONST',
    'CHAR_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'LANGLE', 'RANGLE',         # < >
    'SEMI',                     # ;
    'COMMA', 'DOT'              # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
  identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  integer_suffix_opt = \
      r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
  decimal_constant = \
      '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
  octal_constant = '0[0-7]*'+integer_suffix_opt
  hex_constant = hex_prefix+hex_digits+integer_suffix_opt

  # A leading-zero number that runs into an 8 or a 9.
  bad_octal_constant = '0[0-7]*[89]'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and the characters . _ ~ ! = & ^ - are allowed as escape
  # chars (besides \ ? ' ") to support #line directives with Windows paths as
  # filenames (..\..\dir\file).
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  # NOTE(review): an earlier version of this comment also listed ';' and ','
  # as allowed escape chars, but simple_escape below does not accept them.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  # An opening quote that reaches a newline or the end of input unclosed.
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  # More than one character between the quotes, an empty '' pair, or a
  # constant starting with an unsupported escape.
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines: produce no token, only advance the line counter.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")

  # Operators
  t_PLUS              = r'\+'
  t_MINUS             = r'-'
  t_TIMES             = r'\*'
  t_DIVIDE            = r'/'
  t_MOD               = r'%'
  t_OR                = r'\|'
  t_AND               = r'&'
  t_NOT               = r'~'
  t_XOR               = r'\^'
  t_LSHIFT            = r'<<'
  t_RSHIFT            = r'>>'

  # =
  t_EQUALS            = r'='

  # =>
  # (PLY tries string-defined rules in order of decreasing regex length, so
  # '=>' is attempted before '=', and '<<' / '>>' before '<' / '>'.)
  t_RESPONSE          = r'=>'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_LANGLE            = r'<'
  t_RANGLE            = r'>'
  t_COMMA             = r','
  t_DOT               = r'\.'
  t_SEMI              = r';'

  t_STRING_LITERAL    = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Must come before octal_constant, which would otherwise accept the valid
  # prefix of the number and leave the 8/9 behind.
  @TOKEN(bad_octal_constant)
  def t_BAD_CONST_OCT(self, t):
    msg = "Invalid octal constant"
    self._error(msg, t)

  @TOKEN(octal_constant)
  def t_INT_CONST_OCT(self, t):
    return t

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers; an identifier whose exact (lower-case) spelling is a keyword
  # is re-typed to that keyword's token.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    # A single match can span several lines (a block comment, or consecutive
    # // lines joined by the regex above). Those newlines never reach
    # t_NEWLINE, so count them here to keep later line numbers accurate.
    t.lexer.lineno += t.value.count("\n")

  # Called by PLY when no rule matches at the current position.
  def t_error(self, t):
    msg = 'Illegal character %s' % repr(t.value[0])
    self._error(msg, t)
OLD | NEW |