| Index: tools/lexer_generator/regex_parser.py
|
| diff --git a/tools/lexer_generator/regex_parser.py b/tools/lexer_generator/regex_parser.py
|
| index 4053bec03fde1635231fa2da50fb7728da8cb563..d19a5432570508bef88dd5e8e99fb28ec4fe7ea8 100644
|
| --- a/tools/lexer_generator/regex_parser.py
|
| +++ b/tools/lexer_generator/regex_parser.py
|
| @@ -25,11 +25,123 @@
|
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
| +import ply.lex as lex
|
| import ply.yacc as yacc
|
| from types import ListType, TupleType
|
| from regex_lexer import RegexLexer
|
| from action import Term
|
|
|
| +def build_escape_map(chars):
|
| +  """Map each escape sequence (backslash + char) to the character it denotes."""
|
| +  escapes = {'\\t' : '\t', '\\r' : '\r', '\\n' : '\n', '\\v' : '\v', '\\f' : '\f'}
|
| +  for char in chars:
|
| +    escapes['\\' + char] = char  # entries from chars override the defaults
|
| +  return escapes
|
| +
|
| +class RegexLexer:
|
| +  # PLY (ply.lex) rule set for regular-expression syntax; build() creates the lexer.
|
| +  tokens = (
|
| +
|
| +    'GROUP_BEGIN',
|
| +    'GROUP_END',
|
| +
|
| +    'CLASS_BEGIN',
|
| +    'CLASS_END',
|
| +
|
| +    'OR',
|
| +    'ONE_OR_MORE',
|
| +    'ZERO_OR_MORE',
|
| +    'ZERO_OR_ONE',
|
| +    'ANY',
|
| +
|
| +    'REPEAT_BEGIN',
|
| +    'REPEAT_END',
|
| +
|
| +    'NUMBER',
|
| +    'COMMA',
|
| +    'LITERAL',
|
| +
|
| +    'RANGE',
|
| +    'NOT',
|
| +    'CLASS_LITERAL',
|
| +    'CLASS_LITERAL_AS_OCTAL',
|
| +    'CHARACTER_CLASS',
|
| +  )
|
| +  # Extra lexer states, entered via push_state() on '[' and '{' respectively.
|
| +  states = (
|
| +    ('class','exclusive'),
|
| +    ('repeat','exclusive'),
|
| +  )
|
| +  # Escape sequences accepted outside character classes, mapped to their literal.
|
| +  __escaped_literals = build_escape_map("(){}[]?+.*|'\"\\")
|
| +
|
| +  def t_ESCAPED_LITERAL(self, t):
|
| +    r'\\.'  # PLY takes a rule function's docstring as its regex
|
| +    t.type = 'LITERAL'  # report the escape as an ordinary LITERAL token
|
| +    t.value = RegexLexer.__escaped_literals[t.value]  # KeyError if the escape is not in the map
|
| +    return t
|
| +
|
| +  t_GROUP_BEGIN = r'\('
|
| +  t_GROUP_END = r'\)'
|
| +
|
| +  t_OR = r'\|'
|
| +  t_ONE_OR_MORE = r'\+'
|
| +  t_ZERO_OR_MORE = r'\*'
|
| +  t_ZERO_OR_ONE = r'\?'
|
| +
|
| +  t_ANY = r'\.'
|
| +
|
| +  t_LITERAL = r'.'  # catch-all single character; relies on PLY trying the longer patterns first
|
| +
|
| +  def t_CLASS_BEGIN(self, t):
|
| +    r'\['
|
| +    self.lexer.push_state('class')  # switch to the exclusive 'class' state until ']'
|
| +    return t
|
| +
|
| +  def t_class_CLASS_END(self, t):
|
| +    r'\]'
|
| +    self.lexer.pop_state()  # back to the state that was active before '['
|
| +    return t
|
| +  # t_class_* rules are active only in the exclusive 'class' state.
|
| +  t_class_RANGE = '-'
|
| +  t_class_NOT = '\^'
|
| +  t_class_CHARACTER_CLASS = r':\w+:'
|
| +  # Defined ahead of the generic escape rule below; PLY tries function rules in definition order.
|
| +  def t_class_CLASS_LITERAL_AS_OCTAL(self, t):
|
| +    r'\\\d+'
|
| +    return t  # value still includes the leading backslash; no conversion happens here
|
| +  # Escape sequences accepted inside character classes, mapped to their literal.
|
| +  __escaped_class_literals = build_escape_map("^[]-:\\")
|
| +
|
| +  def t_class_ESCAPED_CLASS_LITERAL(self, t):
|
| +    r'\\.'
|
| +    t.type = 'CLASS_LITERAL'
|
| +    t.value = RegexLexer.__escaped_class_literals[t.value]  # KeyError if the escape is not in the map
|
| +    return t
|
| +
|
| +  t_class_CLASS_LITERAL = r'[\w *$_+\'\"/]'
|
| +
|
| +  def t_REPEAT_BEGIN(self, t):
|
| +    r'\{'
|
| +    self.lexer.push_state('repeat')  # switch to the exclusive 'repeat' state until '}'
|
| +    return t
|
| +
|
| +  def t_repeat_REPEAT_END(self, t):
|
| +    r'\}'
|
| +    self.lexer.pop_state()  # back to the state that was active before '{'
|
| +    return t
|
| +
|
| +  t_repeat_NUMBER = r'[0-9]+'
|
| +  t_repeat_COMMA = r','
|
| +  # ANY-prefixed rules apply in every state: skip newlines, raise on anything unmatched.
|
| +  t_ANY_ignore = '\n'
|
| +
|
| +  def t_ANY_error(self, t):
|
| +    raise Exception("Illegal character '%s'" % t.value[0])
|
| +  # Create the PLY lexer from this instance's rules; kwargs are forwarded to lex.lex().
|
| +  def build(self, **kwargs):
|
| +    self.lexer = lex.lex(module=self, **kwargs)
|
| +
|
| class RegexParser:
|
|
|
| tokens = RegexLexer.tokens
|
|
|