| OLD | NEW |
| 1 # Copyright 2013 the V8 project authors. All rights reserved. | 1 # Copyright 2013 the V8 project authors. All rights reserved. |
| 2 # Redistribution and use in source and binary forms, with or without | 2 # Redistribution and use in source and binary forms, with or without |
| 3 # modification, are permitted provided that the following conditions are | 3 # modification, are permitted provided that the following conditions are |
| 4 # met: | 4 # met: |
| 5 # | 5 # |
| 6 # * Redistributions of source code must retain the above copyright | 6 # * Redistributions of source code must retain the above copyright |
| 7 # notice, this list of conditions and the following disclaimer. | 7 # notice, this list of conditions and the following disclaimer. |
| 8 # * Redistributions in binary form must reproduce the above | 8 # * Redistributions in binary form must reproduce the above |
| 9 # copyright notice, this list of conditions and the following | 9 # copyright notice, this list of conditions and the following |
| 10 # disclaimer in the documentation and/or other materials provided | 10 # disclaimer in the documentation and/or other materials provided |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | 27 |
| 28 from transition_keys import TransitionKey | 28 from transition_keys import TransitionKey |
| 29 from automaton import Action | 29 from automaton import Action |
| 30 from dfa import Dfa | 30 from dfa import Dfa |
| 31 | 31 |
| 32 # --- Optimization: Replacing tokens with gotos --- |
| 33 # |
| 34 # This optimization reduces the code size for grammars which have constructs |
| 35 # similar to keywords and identifiers. Consider the following grammar: |
| 36 # |
| 37 # "baz" <|token(BAZ)|> |
| 38 # "bazz" <|token(BAZZ)|> |
| 39 # |
| 40 # /[a-z]/ <|token(IDENTIFIER)|Identifier> |
| 41 # |
| 42 # <<Identifier>> |
| 43 # /[a-z]/ <|token(IDENTIFIER)|continue> |
| 44 # |
| 45 # In the corresponding dfa, we have a set of nodes for recognizing the keywords, |
| 46 # and one node for the "Identifier" state. From every node in the keyword part |
| 47 # of the dfa, there is an edge to the Identifier state with the letters which |
| 48 # cannot be used for advancing in the keyword part (e.g., we have seen "baz" and |
| 49 # the next character is "s", so that's an identifier and not the keyword |
| 50 # "baz" or "bazz"). |
| 51 # |
| 52 # [ ] ---b---> [ ] ---a---> [ ] ---z---> [BAZ] ---z---> [BAZZ] |
| 53 # | | | | |
| 54 # [b-z] [a-y] [a-y] [a-z] |
| 55 # | | | | |
| 56 # \---------------------------------------/ |
| 57 # | |
| 58 # v |
| 59 # [ID] ----------\ |
| 60 # ^ [a-z] |
| 61 # --------------/ |
| 62 # |
| 63 # If we generate code from the dfa, these edges contribute a lot to the code |
| 64 # size. To reduce the code size, we do the following transformation: |
| 65 # |
| 66 # For each state which is an end of a keyword, we add a match action "store |
| 67 # token and goto". The match action will be executed only if we cannot advance |
| 68 # in the graph with the next character. For example, if we have seen "baz", we |
| 69 # first check if the next character is "z" to get "bazz", and if the next |
| 70 # character is not "z", we execute the match action. |
| 71 # |
| 72 # When we execute the "store token and goto" action, we record that we have seen |
| 73 # the corresponding keyword (the token might still be an identifier, depending |
| 74 # on what the next character is). Then we jump to the Identifier state of the |
| 75 # graph. The Identifier state has an entry action "store_token(IDENTIFIER)" for |
| 76 # [a-z]. When it is executed, it overwrites the stored token with IDENTIFIER. |
| 77 # |
| 78 # Example: if we have seen "baz" and the next character is not "z", we store the |
| 79 # token BAZ and jump to the Identifier state. If the next character is "a", we |
| 80 # are sure the token is not the keyword "baz", and overwrite the stored token |
| 81 # with IDENTIFIER. |
| 82 # |
| 83 # The Identifier state has a match action "do stored token", which returns the |
| 84 # stored token to the upper layer. |
| 85 # |
| 86 # Example: if we have seen "baz" and the next character is not "z", we store the |
| 87 # token BAZ and jump to the Identifier state. If the next character is a space, |
| 88 # we cannot advance in the dfa, and the match action is executed. The match |
| 89 # action returns the stored token BAZ to the upper layer. |
| 90 # |
| 91 # For each state which is an intermediate state in a keyword, we add the same |
| 92 # "goto", but we don't need to store a token. |
| 93 # |
| 94 # [ ] ---b---> [ ] ---a---> [ ] ---z---> [BAZ] ---z---> [BAZZ] |
| 95 # | | | | |
| 96 # goto goto store BAZ, goto store BAZZ, goto |
| 97 # | | | | |
| 98 # \---------------------------------------/ |
| 99 # | |
| 100 # v |
| 101 # [ID] ----------\ |
| 102 # ^ [a-z], store IDENTIFIER |
| 103 # --------------/ |
| 104 # |
| 105 # (Note: this graph illustrates the logic, but is not accurate with respect |
| 106 # actions and match actions of the states.) |
| 107 # |
| 108 # The code size decreases, because we remove the checks which correspond to |
| 109 # transitions from keyword states to the identifier state ([b-z], [a-y] etc.), |
| 110 # and replace them with a more general check ([a-z]) in one place. This works |
| 111 # because the more specialized checks (e.g., checking for "z" when we have seen |
| 112 # "baz") are done before we "goto" to the state which has the generic check. |
| 113 # |
| 114 # There is one complication though: When we consider adding a goto from a state |
| 115 # s1 to state s2, we need to check all possible transitions from s1 and from s2, |
| 116 # and see if they match. We cannot jump to a state which has different |
| 117 # transitions than the state where we're jumping from. |
| 118 # |
| 119 # For example, the following partial dfa allows distinguishing identifiers which |
| 120 # have only lower case letters from identifiers which have at least one upper |
| 121 # case letter. |
| 122 # |
| 123 # [s1] ---a---> [ ] |
| 124 # | |
| 125 # | /---[a-z]--\ |
| 126 # | v | |
| 127 # ---[b-z]--> [s2] ------/ |
| 128 # | | |
| 129 # | [A-Z] |
| 130 # | | |
| 131 # | v |
| 132 # \--[A-Z]---> [s3] |
| 133 # |
| 134 # We can add a goto from s1 to s2 (after checking for "a"), because the |
| 135 # transitions match: with [b-z], both states transition to s2, and with [A-Z], |
| 136 # both states transition to s3. Note that [a-z] is a superkey of [b-z], and the |
| 137 # transition from s2 is defined for the superkey, but that is ok. |
| 138 |
| 32 class DfaOptimizer(object): | 139 class DfaOptimizer(object): |
| 33 | 140 |
| 34 def __init__(self, dfa, log): | 141 def __init__(self, dfa, log): |
| 35 self.__dfa = dfa | 142 self.__dfa = dfa |
| 36 self.__log = log | 143 self.__log = log |
| 37 | 144 |
| 38 @staticmethod | 145 @staticmethod |
| 39 def transistions_match(encoding, incoming_key, incoming_state, state): | 146 def transistions_match(encoding, incoming_key, incoming_state, state): |
| 40 keys = set(state.key_iter()) | 147 keys = set(state.key_iter()) |
| 41 keys.add(incoming_key) | 148 keys.add(incoming_key) |
| (...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 191 counters['store_harmony_token_and_goto']) | 298 counters['store_harmony_token_and_goto']) |
| 192 print 'transitions removed %s' % counters['removals'] | 299 print 'transitions removed %s' % counters['removals'] |
| 193 print 'do_stored_token actions added %s' % len(store_states) | 300 print 'do_stored_token actions added %s' % len(store_states) |
| 194 self.__dfa = Dfa(self.__dfa.encoding(), start_name, states) | 301 self.__dfa = Dfa(self.__dfa.encoding(), start_name, states) |
| 195 | 302 |
| 196 @staticmethod | 303 @staticmethod |
| 197 def optimize(dfa, log): | 304 def optimize(dfa, log): |
| 198 optimizer = DfaOptimizer(dfa, log) | 305 optimizer = DfaOptimizer(dfa, log) |
| 199 optimizer.replace_tokens_with_gotos() | 306 optimizer.replace_tokens_with_gotos() |
| 200 return optimizer.__dfa | 307 return optimizer.__dfa |
| OLD | NEW |