# PLY based Lexer class, based on pycparser by Eli Bendersky.
#
# Copyright (C) 2008-2013, Eli Bendersky
# License: BSD

import re
import sys
import os.path

# Try to load the ply module; if that fails, assume it lives in the
# third_party directory.
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply.lex import TOKEN
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(
      module_path, os.pardir, os.pardir, os.pardir, os.pardir, 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply.lex import TOKEN


class Lexer(object):
  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    print('%s at line %d' % (msg, token.lineno))
    self.lexer.skip(1)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',
    'DATA_PIPE_CONSUMER',
    'DATA_PIPE_PRODUCER',
    'MESSAGE_PIPE',

    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
    'VOID',
  )

  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword
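  # e.g. keyword_map['module'] == 'MODULE': lowercased keywords in the input
  # are promoted to their keyword token types (see t_NAME below).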

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
    'FLOAT_CONST', 'HEX_FLOAT_CONST',
    'CHAR_CONST',
    'WCHAR_CONST',

    # String literals
    'STRING_LITERAL',
    'WSTRING_LITERAL',

    # Operators
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
    'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
    'LOR', 'LAND', 'LNOT',
    'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

    # Assignment
    'EQUALS',

    # Conditional operator (?)
    'CONDOP',

    # Delimiters
    'LPAREN', 'RPAREN',         # ( )
    'LBRACKET', 'RBRACKET',     # [ ]
    'LBRACE', 'RBRACE',         # { }
    'SEMI', 'COLON',            # ; :
    'COMMA',                    # ,
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
  identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  integer_suffix_opt = \
      r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
  decimal_constant = \
      '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
  octal_constant = '0[0-7]*'+integer_suffix_opt
  hex_constant = hex_prefix+hex_digits+integer_suffix_opt

  bad_octal_constant = '0[0-7]*[89]'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if that means also accepting some incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  wchar_const = 'L'+char_const
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  wstring_literal = 'L'+string_literal
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
  binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
  hex_fractional_constant = \
      '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
  hex_floating_constant = \
      '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+ \
      binary_exponent_part+'[FfLl]?)'
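  # For reference, a few literals these patterns accept: decimal "123u",
  # octal "0777", hex "0x1fUL", floating "1.5e-3", hex floating "0x1.8p3".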

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t'

  # Newlines
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")
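  # (ply does not track line numbers on its own; updating lexer.lineno here
  # lets _error() report a meaningful position.)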

  # Operators
  t_PLUS              = r'\+'
  t_MINUS             = r'-'
  t_TIMES             = r'\*'
  t_DIVIDE            = r'/'
  t_MOD               = r'%'
  t_OR                = r'\|'
  t_AND               = r'&'
  t_NOT               = r'~'
  t_XOR               = r'\^'
  t_LSHIFT            = r'<<'
  t_RSHIFT            = r'>>'
  t_LOR               = r'\|\|'
  t_LAND              = r'&&'
  t_LNOT              = r'!'
  t_LT                = r'<'
  t_GT                = r'>'
  t_LE                = r'<='
  t_GE                = r'>='
  t_EQ                = r'=='
  t_NE                = r'!='

  # =
  t_EQUALS            = r'='

  # ?
  t_CONDOP            = r'\?'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_COMMA             = r','
  t_SEMI              = r';'
  t_COLON             = r':'

  t_STRING_LITERAL    = string_literal
  t_ORDINAL           = r'@[0-9]*'
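  # Ordinals look like '@0', '@1', ...; because the digit part uses '*', a
  # bare '@' also matches this rule.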

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
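  # (ply tries function rules in source order but sorts string rules by
  # decreasing regex length; the TOKEN decorator just attaches the given
  # regex to its rule function.)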
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_floating_constant)
  def t_HEX_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(bad_octal_constant)
  def t_BAD_CONST_OCT(self, t):
    msg = "Invalid octal constant"
    self._error(msg, t)

  @TOKEN(octal_constant)
  def t_INT_CONST_OCT(self, t):
    return t

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(wchar_const)
  def t_WCHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  @TOKEN(wstring_literal)
  def t_WSTRING_LITERAL(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t
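  # e.g. 'struct' is reported as a STRUCT token, while 'Struct' (not in
  # keyword_map) stays a plain NAME.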

  # Ignore C and C++ style comments
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    pass

  def t_error(self, t):
    msg = 'Illegal character %s' % repr(t.value[0])
    self._error(msg, t)
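

# Illustrative only: a minimal driver sketch, assuming ply is importable and
# that a lexer can be built straight from a Lexer instance (the real
# build/tokenize plumbing may live elsewhere). The sample input is made up.
if __name__ == '__main__':
  import ply.lex

  lexer_obj = Lexer()
  # ply gathers the tokens tuple and the t_* rules from the instance.
  lexer_obj.lexer = ply.lex.lex(object=lexer_obj)
  lexer_obj.lexer.input('module math; struct Point { handle h@0; };')
  for tok in iter(lexer_obj.lexer.token, None):
    print(tok)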