| Index: mojo/public/bindings/parse/mojo_lexer.py
|
| diff --git a/mojo/public/bindings/parse/mojo_lexer.py b/mojo/public/bindings/parse/mojo_lexer.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..05b7e2fcc4bf077810efd07dea28acb7695c310f
|
| --- /dev/null
|
| +++ b/mojo/public/bindings/parse/mojo_lexer.py
|
| @@ -0,0 +1,281 @@
|
| +# PLY based Lexer class, based on pycparser by Eli Bendersky.
|
| +#
|
| +# Copyright (C) 2008-2013, Eli Bendersky
|
| +# License: BSD
|
| +
|
| +import re
|
| +import sys
|
| +import os.path
|
| +
|
| +# Try to load the ply module, if not, then assume it is in the third_party
|
| +# directory.
|
| +try:
|
| + # Disable lint check which fails to find the ply module.
|
| + # pylint: disable=F0401
|
| + from ply.lex import TOKEN
|
| +except ImportError:
|
| + module_path, module_name = os.path.split(__file__)
|
| + third_party = os.path.join(
|
| + module_path, os.pardir, os.pardir, os.pardir, os.pardir, 'third_party')
|
| + sys.path.append(third_party)
|
| + # pylint: disable=F0401
|
| + from ply.lex import TOKEN
|
| +
|
| +
|
class Lexer(object):
    """Token rules for a PLY-based lexer, adapted from pycparser's C lexer.

    The class declares the reserved keywords, the ``tokens`` tuple, the
    regex fragments the rules are assembled from, and the ``t_*`` rules
    themselves.  It defines no ``__init__`` and stores no lexer handle of
    its own; rule methods reach the running lexer through the token they
    are given (``t.lexer``).

    NOTE: for an undecorated ``t_*`` method the docstring *is* the rule's
    regex, so the rule methods below are documented with ``#`` comments
    only -- never add a prose docstring to them.
    """

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        """Report a lexing problem and skip one character of input.

        Args:
          msg: Human-readable description of the problem.
          token: The token handed to the rule that detected the problem;
            its ``lineno`` is printed and its ``lexer`` is advanced.
        """
        print('%s at line %d' % (msg, token.lineno))
        # This class never assigns self.lexer, so use the lexer attached to
        # the token (the same handle t_NEWLINE uses).
        token.lexer.skip(1)

    ##
    ## Reserved keywords
    ##
    keywords = (
        'HANDLE',
        'DATA_PIPE_CONSUMER',
        'DATA_PIPE_PRODUCER',
        'MESSAGE_PIPE',

        'MODULE',
        'STRUCT',
        'INTERFACE',
        'ENUM',
        'VOID',
    )

    # Maps the lower-case source spelling of each keyword to its token type.
    keyword_map = {}
    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        'NAME',

        # constants
        'ORDINAL',
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'SEMI', 'COLON',            # ; :
        'COMMA',                    # ,
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = \
        r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = \
        '(0' + integer_suffix_opt + ')|([1-9][0-9]*' + integer_suffix_opt + ')'
    octal_constant = '0[0-7]*' + integer_suffix_opt
    hex_constant = hex_prefix + hex_digits + integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support
    # #line directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want
    # to parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    decimal_escape = r"""(\d+)"""
    hex_escape = r"""(x[0-9a-fA-F]+)"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

    escape_sequence = \
        r"""(\\(""" + simple_escape + '|' + decimal_escape + '|' + \
        hex_escape + '))'
    cconst_char = r"""([^'\\\n]|""" + escape_sequence + ')'
    char_const = "'" + cconst_char + "'"
    wchar_const = 'L' + char_const
    unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)"
    # The middle literal below is deliberately NOT a raw string.
    bad_char_const = \
        r"""('""" + cconst_char + """[^'\n]+')|('')|('""" + \
        bad_escape + r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|""" + escape_sequence + ')'
    string_literal = '"' + string_char + '*"'
    wstring_literal = 'L' + string_literal
    bad_string_literal = \
        '"' + string_char + '*' + bad_escape + string_char + '*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = \
        '((((' + fractional_constant + ')' + \
        exponent_part + '?)|([0-9]+' + exponent_part + '))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = \
        '(((' + hex_digits + r""")?\.""" + hex_digits + ')|(' + \
        hex_digits + r"""\.))"""
    hex_floating_constant = \
        '(' + hex_prefix + '(' + hex_digits + '|' + \
        hex_fractional_constant + ')' + \
        binary_exponent_part + '[FfLl]?)'

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines: produce no token, only keep the line counter current.
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_TIMES = r'\*'
    t_DIVIDE = r'/'
    t_MOD = r'%'
    t_OR = r'\|'
    t_AND = r'&'
    t_NOT = r'~'
    t_XOR = r'\^'
    t_LSHIFT = r'<<'
    t_RSHIFT = r'>>'
    t_LOR = r'\|\|'
    t_LAND = r'&&'
    t_LNOT = r'!'
    t_LT = r'<'
    t_GT = r'>'
    t_LE = r'<='
    t_GE = r'>='
    t_EQ = r'=='
    t_NE = r'!='

    # =
    t_EQUALS = r'='

    # ?
    t_CONDOP = r'\?'

    # Delimiters
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_LBRACE = r'\{'
    t_RBRACE = r'\}'
    t_COMMA = r','
    t_SEMI = r';'
    t_COLON = r':'

    t_STRING_LITERAL = string_literal
    # At least one digit is required: a bare '@' is not an ordinal and is
    # left for t_error to report.
    t_ORDINAL = r'@[0-9]+'

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    # Must come before t_INT_CONST_OCT so that e.g. "089" is reported
    # instead of being split into a valid octal prefix and a remainder.
    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    # Identifiers; reserved words are re-typed through keyword_map.
    @TOKEN(identifier)
    def t_NAME(self, t):
        t.type = self.keyword_map.get(t.value, "NAME")
        return t

    # Ignore C and C++ style comments.  The match may span several lines
    # (a block comment, or consecutive // lines), so the newlines it
    # swallowed are counted here; t_NEWLINE never sees them.
    def t_COMMENT(self, t):
        r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
        t.lexer.lineno += t.value.count("\n")

    # Called for input that matches no rule: report the offending character
    # and move past it.
    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)
|
|
|