Index: third_party/cython/src/Cython/Plex/Lexicons.py
diff --git a/third_party/cython/src/Cython/Plex/Lexicons.py b/third_party/cython/src/Cython/Plex/Lexicons.py
new file mode 100644
index 0000000000000000000000000000000000000000..88074666b014ced0daa542c7c5accceb6c119e61
--- /dev/null
+++ b/third_party/cython/src/Cython/Plex/Lexicons.py
@@ -0,0 +1,195 @@
+#=======================================================================
+#
+# Python Lexical Analyser
+#
+# Lexical Analyser Specification
+#
+#=======================================================================
+
+import types
+
+import Actions
+import DFA
+import Errors
+import Machines
+import Regexps
+
+# debug_flags for Lexicon constructor
+DUMP_NFA = 1
+DUMP_DFA = 2
+
+class State(object):
+    """
+    This class is used as part of a Plex.Lexicon specification to
+    introduce a user-defined state.
+
+    Constructor:
+
+      State(name, token_specifications)
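+
+    For example, a state that skips everything up to the end of a
+    line might be specified like this (a sketch; Eol, AnyChar, Begin
+    and IGNORE are the usual Plex exports):
+
+        State('comment', [
+            (Eol,     Begin('')),
+            (AnyChar, IGNORE),
+        ])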
+    """
+
+    name = None
+    tokens = None
+
+    def __init__(self, name, tokens):
+        self.name = name
+        self.tokens = tokens
+
+class Lexicon(object):
+    """
+    Lexicon(specification) builds a lexical analyser from the given
+    |specification|. The specification consists of a list of
+    specification items. Each specification item may be either:
+
+    1) A token definition, which is a tuple:
+
+          (pattern, action)
+
+       The |pattern| is a regular expression built using the
+       constructors defined in the Plex module.
+
+       The |action| is the action to be performed when this pattern
+       is recognised (see below).
+
+    2) A state definition:
+
+          State(name, tokens)
+
+       where |name| is a character string naming the state,
+       and |tokens| is a list of token definitions as
+       above. The meaning and usage of states is described
+       below.
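+
+    For example, a complete specification mixing both kinds of item
+    might look like this (a sketch; Str, Any, Rep1, AnyChar, Begin
+    and IGNORE are the usual Plex exports):
+
+        lexicon = Lexicon([
+            (Rep1(Any("0123456789")), 'int'),
+            (Rep1(Any(" \t\n")),      IGNORE),
+            (Str("{"),                Begin('comment')),
+            State('comment', [
+                (Str("}"),  Begin('')),
+                (AnyChar,   IGNORE),
+            ]),
+        ])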
+
+    Actions
+    -------
+
+    The |action| in a token specification may be one of three things:
+
+    1) A function, which is called as follows:
+
+          function(scanner, text)
+
+       where |scanner| is the relevant Scanner instance, and |text|
+       is the matched text. If the function returns anything
+       other than None, that value is returned as the value of the
+       token. If it returns None, scanning continues as if the IGNORE
+       action were specified (see below).
+
+    2) One of the following special actions:
+
+       IGNORE means that the recognised characters will be treated as
+              white space and ignored. Scanning will continue until
+              the next non-ignored token is recognised before returning.
+
+       TEXT causes the scanned text itself to be returned as the
+            value of the token.
+
+    3) Any other value, which is returned as the value of the token.
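+
+    For example, a function action can convert the matched text
+    before returning it (a sketch; the function name is illustrative):
+
+        def parse_int(scanner, text):
+            return ('int', int(text))
+
+        (Rep1(Any("0123456789")), parse_int)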
+
+    States
+    ------
+
+    At any given time, the scanner is in one of a number of states.
+    Associated with each state is a set of possible tokens. When scanning,
+    only tokens associated with the current state are recognised.
+
+    There is a default state, whose name is the empty string. Token
+    definitions which are not inside any State definition belong to
+    the default state.
+
+    The initial state of the scanner is the default state. The state can
+    be changed in one of two ways:
+
+    1) Using Begin(state_name) as the action of a token.
+
+    2) Calling the begin(state_name) method of the Scanner.
+
+    To change back to the default state, use '' as the state name.
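+
+    For example, either of these switches the scanner into a 'string'
+    state (a sketch; Str and Begin are the usual Plex exports):
+
+        (Str('"'), Begin('string'))   # as a token action
+        scanner.begin('string')       # explicitly, e.g. from an action function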
+    """
+
+    machine = None # Machine
+    tables = None # StateTableMachine
+
+    def __init__(self, specifications, debug = None, debug_flags = 7, timings = None):
+        if type(specifications) != types.ListType:
+            raise Errors.InvalidScanner("Scanner definition is not a list")
+        if timings:
+            from Timing import time
+            total_time = 0.0
+            time1 = time()
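+        # Build a single NFA holding every token pattern, with one
+        # initial state per scanner state.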
+        nfa = Machines.Machine()
+        default_initial_state = nfa.new_initial_state('')
+        token_number = 1
+        for spec in specifications:
+            if isinstance(spec, State):
+                user_initial_state = nfa.new_initial_state(spec.name)
+                for token in spec.tokens:
+                    self.add_token_to_machine(
+                        nfa, user_initial_state, token, token_number)
+                    token_number = token_number + 1
+            elif type(spec) == types.TupleType:
+                self.add_token_to_machine(
+                    nfa, default_initial_state, spec, token_number)
+                token_number = token_number + 1
+            else:
+                raise Errors.InvalidToken(
+                    token_number,
+                    "Expected a token definition (tuple) or State instance")
+        if timings:
+            time2 = time()
+            total_time = total_time + (time2 - time1)
+            time3 = time()
+        if debug and (debug_flags & 1):
+            debug.write("\n============= NFA ===========\n")
+            nfa.dump(debug)
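+        # Convert the NFA into an equivalent DFA.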
+        dfa = DFA.nfa_to_dfa(nfa, debug = (debug_flags & 3) == 3 and debug)
+        if timings:
+            time4 = time()
+            total_time = total_time + (time4 - time3)
+        if debug and (debug_flags & 2):
+            debug.write("\n============= DFA ===========\n")
+            dfa.dump(debug)
+        if timings:
+            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
+            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
+            timings.write("TOTAL            : %5.2f\n" % total_time)
+        self.machine = dfa
+
+    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
+        try:
+            (re, action_spec) = self.parse_token_definition(token_spec)
+            # Disabled this -- matching empty strings can be useful
+            #if re.nullable:
+            #    raise Errors.InvalidToken(
+            #        token_number, "Pattern can match 0 input symbols")
+            if isinstance(action_spec, Actions.Action):
+                action = action_spec
+            else:
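+                # Probe for callability: callables are wrapped in a
+                # Call action, anything else in a Return action.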
+                try:
+                    action_spec.__call__
+                except AttributeError:
+                    action = Actions.Return(action_spec)
+                else:
+                    action = Actions.Call(action_spec)
+            final_state = machine.new_state()
+            re.build_machine(machine, initial_state, final_state,
+                             match_bol = 1, nocase = 0)
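+            # Lower token numbers get higher priority, so earlier
+            # definitions win when two patterns match the same text.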
+            final_state.set_action(action, priority = -token_number)
+        except Errors.PlexError, e:
+            raise e.__class__("Token number %d: %s" % (token_number, e))
+
+    def parse_token_definition(self, token_spec):
+        if type(token_spec) != types.TupleType:
+            raise Errors.InvalidToken("Token definition is not a tuple")
+        if len(token_spec) != 2:
+            raise Errors.InvalidToken("Wrong number of items in token definition")
+        pattern, action = token_spec
+        if not isinstance(pattern, Regexps.RE):
+            raise Errors.InvalidToken("Pattern is not an RE instance")
+        return (pattern, action)
+
+    def get_initial_state(self, name):
+        return self.machine.get_initial_state(name)
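+
+# Example usage (a sketch; Scanner lives in Plex.Scanners, and read()
+# returns (value, text) pairs, with a None value at end of input):
+#
+#   from Plex import Lexicon, Str, Any, Rep1, IGNORE
+#   from Plex.Scanners import Scanner
+#
+#   lexicon = Lexicon([
+#       (Rep1(Any("0123456789")), 'int'),
+#       (Rep1(Any(" \t\n")),      IGNORE),
+#   ])
+#   scanner = Scanner(lexicon, open("input.txt"), "input.txt")
+#   value, text = scanner.read()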
+
+
+