| OLD | NEW |
| 1 # Copyright 2013 the V8 project authors. All rights reserved. | 1 # Copyright 2013 the V8 project authors. All rights reserved. |
| 2 # Redistribution and use in source and binary forms, with or without | 2 # Redistribution and use in source and binary forms, with or without |
| 3 # modification, are permitted provided that the following conditions are | 3 # modification, are permitted provided that the following conditions are |
| 4 # met: | 4 # met: |
| 5 # | 5 # |
| 6 # * Redistributions of source code must retain the above copyright | 6 # * Redistributions of source code must retain the above copyright |
| 7 # notice, this list of conditions and the following disclaimer. | 7 # notice, this list of conditions and the following disclaimer. |
| 8 # * Redistributions in binary form must reproduce the above | 8 # * Redistributions in binary form must reproduce the above |
| 9 # copyright notice, this list of conditions and the following | 9 # copyright notice, this list of conditions and the following |
| 10 # disclaimer in the documentation and/or other materials provided | 10 # disclaimer in the documentation and/or other materials provided |
| 11 # with the distribution. | 11 # with the distribution. |
| 12 # * Neither the name of Google Inc. nor the names of its | 12 # * Neither the name of Google Inc. nor the names of its |
| 13 # contributors may be used to endorse or promote products derived | 13 # contributors may be used to endorse or promote products derived |
| 14 # from this software without specific prior written permission. | 14 # from this software without specific prior written permission. |
| 15 # | 15 # |
| 16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | 27 |
| 28 from types import TupleType | 28 from itertools import chain |
| 29 from action import Term |
| 29 from string import printable | 30 from string import printable |
| 30 | 31 |
| 31 class KeyEncoding(object): | 32 class KeyEncoding(object): |
| 32 | 33 |
| 33 __encodings = {} | 34 __encodings = {} |
| 34 | 35 |
| 35 @staticmethod | 36 @staticmethod |
| 36 def get(name): | 37 def get(name): |
| 37 if not KeyEncoding.__encodings: | 38 if not KeyEncoding.__encodings: |
| 38 Latin1Encoding() | 39 Latin1Encoding() |
| 39 Utf16Encoding() | 40 Utf16Encoding() |
| 40 Utf8Encoding() | 41 Utf8Encoding() |
| 41 return KeyEncoding.__encodings[name] | 42 return KeyEncoding.__encodings[name] |
| 42 | 43 |
| 43 def __init__(self, name, primary_range, class_names): | 44 def __init__(self, name, primary_range, named_ranges, predefined_ranges): |
| 44 assert not name in KeyEncoding.__encodings | 45 assert not name in KeyEncoding.__encodings |
| 46 assert primary_range[0] <= primary_range[1] |
| 45 KeyEncoding.__encodings[name] = self | 47 KeyEncoding.__encodings[name] = self |
| 46 assert primary_range[0] <= primary_range[1] | |
| 47 assert primary_range[0] >= 0 | |
| 48 self.__name = name | 48 self.__name = name |
| 49 self.__primary_range = primary_range | 49 self.__primary_range = primary_range |
| 50 self.__primary_range_component = Term( |
| 51 'NUMERIC_RANGE_KEY', primary_range[0], primary_range[1]) |
| 50 self.__lower_bound = primary_range[0] | 52 self.__lower_bound = primary_range[0] |
| 51 self.__upper_bound = primary_range[1] + len(class_names) | 53 self.__upper_bound = primary_range[1] |
| 52 f = lambda i : (i + primary_range[1] + 1, i + primary_range[1] + 1) | 54 self.__named_ranges = { |
| 53 self.__class_ranges = {name : f(i) for i, name in enumerate(class_names)} | 55 k : Term('NAMED_RANGE_KEY', k) for k in named_ranges } |
| 54 self.__predefined_ranges = {} | 56 def f(v): |
| 57 if len(v) == 2: |
| 58 assert primary_range[0] <= v[0] and v[1] <= primary_range[1] |
| 59 return Term('NUMERIC_RANGE_KEY', v[0], v[1]) |
| 60 elif len(v) == 1: |
| 61 assert v[0] in self.__named_ranges |
| 62 return self.__named_ranges[v[0]] |
| 63 else: |
| 64 raise Exception() |
| 65 self.__predefined_ranges = { |
| 66 k : map(f, v) for k, v in predefined_ranges.iteritems() } |
| 55 | 67 |
| 56 def name(self): | 68 def name(self): |
| 57 return self.__name | 69 return self.__name |
| 58 | 70 |
| 59 def add_predefined_range(self, name, ranges): | |
| 60 self.__predefined_ranges[name] = ranges | |
| 61 | |
| 62 def lower_bound(self): | 71 def lower_bound(self): |
| 63 return self.__lower_bound | 72 return self.__lower_bound |
| 64 | 73 |
| 65 def upper_bound(self): | 74 def upper_bound(self): |
| 66 return self.__upper_bound | 75 return self.__upper_bound |
| 67 | 76 |
| 68 def primary_range(self): | 77 def primary_range(self): |
| 69 return self.__primary_range | 78 return self.__primary_range |
| 70 | 79 |
| 71 def class_range(self, name): | 80 def named_range(self, name): |
| 72 ranges = self.__class_ranges | 81 ranges = self.__named_ranges |
| 73 return None if not name in ranges else ranges[name] | 82 return Term.empty_term() if not name in ranges else ranges[name] |
| 74 | 83 |
| 75 def class_range_iter(self): | 84 def named_range_iter(self): |
| 76 return self.__class_ranges.iteritems() | 85 return self.__named_range.iteritems() |
| 77 | 86 |
| 78 def class_name_iter(self): | 87 def named_range_key_iter(self): |
| 79 return self.__class_ranges.iterkeys() | 88 return self.__named_ranges.iterkeys() |
| 80 | 89 |
| 81 def class_value_iter(self): | 90 def named_range_value_iter(self): |
| 82 return self.__class_ranges.itervalues() | 91 return self.__named_ranges.itervalues() |
| 83 | 92 |
| 84 def predefined_range_iter(self, name): | 93 def predefined_range_iter(self, name): |
| 85 ranges = self.__predefined_ranges | 94 ranges = self.__predefined_ranges |
| 86 return None if not name in ranges else iter(ranges[name]) | 95 return None if not name in ranges else iter(ranges[name]) |
| 87 | 96 |
| 97 def __primary_range_iter(self): |
| 98 yield self.__primary_range_component |
| 99 |
| 100 def all_components_iter(self): |
| 101 return chain(self.__primary_range_iter(), self.__named_ranges.itervalues()) |
| 102 |
| 88 def is_primary_range(self, r): | 103 def is_primary_range(self, r): |
| 89 assert self.lower_bound() <= r[0] and r[1] <= self.upper_bound() | 104 assert self.lower_bound() <= r[0] and r[1] <= self.upper_bound() |
| 90 primary_range = self.__primary_range | 105 primary_range = self.__primary_range |
| 91 if (primary_range[0] <= r[0] and r[1] <= primary_range[1]): | 106 if (primary_range[0] <= r[0] and r[1] <= primary_range[1]): |
| 92 return True | 107 return True |
| 93 assert r[0] == r[1] | 108 assert r[0] == r[1] |
| 94 return False | 109 return False |
| 95 | 110 |
| 96 def in_primary_range(self, c): | 111 def in_primary_range(self, c): |
| 97 return self.is_primary_range((c, c)) | 112 return self.is_primary_range((c, c)) |
| 98 | 113 |
| 99 def is_class_range(self, r): | |
| 100 return not self.is_primary_range(r) | |
| 101 | |
| 102 class TransitionKey(object): | 114 class TransitionKey(object): |
| 103 '''Represents a transition from a state in DFA or NFA to another state. | 115 '''Represents a transition from a state in DFA or NFA to another state. |
| 104 | 116 |
| 105 A transition key has a list of character ranges and a list of class ranges | 117 A transition key has a list of character ranges and a list of class ranges |
| 106 (e.g., "whitespace"), defining for which characters the transition | 118 (e.g., "whitespace"), defining for which characters the transition |
| 107 happens. When we generate code based on the transition key, the character | 119 happens. When we generate code based on the transition key, the character |
| 108 ranges generate simple checks and the class ranges generate more complicated | 120 ranges generate simple checks and the class ranges generate more complicated |
| 109 conditions, e.g., function calls.''' | 121 conditions, e.g., function calls.''' |
| 110 | 122 |
| 111 __cached_keys = { | 123 __cached_keys = { |
| 112 'no_encoding' : {}, | 124 'no_encoding' : {}, |
| 113 'latin1' : {}, | 125 'latin1' : {}, |
| 114 'utf8' : {}, | 126 'utf8' : {}, |
| 115 'utf16' : {}, | 127 'utf16' : {}, |
| 116 } | 128 } |
| 117 | 129 |
| 118 __unique_key_counter = -1 | |
| 119 | |
| 120 @staticmethod | 130 @staticmethod |
| 121 def __is_unique_range(r): | 131 def __cached_key(encoding, name, components_getter): |
| 122 return (r[0] == r[1] and | |
| 123 r[0] < 0 and | |
| 124 r[1] > TransitionKey.__unique_key_counter) | |
| 125 | |
| 126 @staticmethod | |
| 127 def __verify_ranges(encoding, ranges, check_merged): | |
| 128 assert ranges | |
| 129 last = None | |
| 130 for r in ranges: | |
| 131 r_is_class = (TransitionKey.__is_unique_range(r) or | |
| 132 encoding.is_class_range(r)) | |
| 133 # Assert that the ranges are in order. | |
| 134 if last != None and check_merged: | |
| 135 assert last[1] + 1 < r[0] or r_is_class | |
| 136 if r_is_class: | |
| 137 last = None | |
| 138 else: | |
| 139 last = r | |
| 140 | |
| 141 @staticmethod | |
| 142 def __cached_key(encoding, name, bounds_getter): | |
| 143 encoding_name = encoding.name() if encoding else 'no_encoding' | 132 encoding_name = encoding.name() if encoding else 'no_encoding' |
| 144 cache = TransitionKey.__cached_keys[encoding_name] | 133 cache = TransitionKey.__cached_keys[encoding_name] |
| 145 if not name in cache: | 134 if not name in cache: |
| 146 bounds = bounds_getter(name) | 135 cache[name] = TransitionKey(encoding, components_getter()) |
| 147 cache[name] = TransitionKey(encoding, bounds, name) | |
| 148 return cache[name] | 136 return cache[name] |
| 149 | 137 |
| 150 @staticmethod | 138 @staticmethod |
| 151 def epsilon(): | 139 def epsilon(): |
| 152 '''Returns a TransitionKey for the epsilon (empty) transition.''' | 140 '''Returns a TransitionKey for the epsilon (empty) transition.''' |
| 153 return TransitionKey.__cached_key(None, 'epsilon', lambda name : []) | 141 return TransitionKey.__cached_key(None, 'epsilon', |
| 142 lambda : Term("EPSILON_KEY")) |
| 143 |
| 144 @staticmethod |
| 145 def omega(): |
| 146 '''Always matches.''' |
| 147 return TransitionKey.__cached_key(None, 'omega', |
| 148 lambda : Term("OMEGA_KEY")) |
| 154 | 149 |
| 155 @staticmethod | 150 @staticmethod |
| 156 def any(encoding): | 151 def any(encoding): |
| 157 '''Returns a TransitionKey which matches any character.''' | 152 '''Returns a TransitionKey which matches any encoded character.''' |
| 158 return TransitionKey.__cached_key( | 153 return TransitionKey.__cached_key(encoding, 'any', |
| 159 encoding, | 154 lambda : encoding.all_components_iter()) |
| 160 'any', | |
| 161 lambda name : sorted( | |
| 162 list(encoding.class_value_iter()) + [encoding.primary_range()])) | |
| 163 | 155 |
| 164 @staticmethod | 156 @staticmethod |
| 165 def single_char(encoding, char): | 157 def single_char(encoding, char): # TODO(dcarney): char should be int |
| 166 '''Returns a TransitionKey for a single-character transition.''' | 158 '''Returns a TransitionKey for a single-character transition.''' |
| 167 r = (ord(char), ord(char)) | 159 return TransitionKey(encoding, Term("NUMERIC_RANGE_KEY", ord(char), ord(char
))) |
| 168 assert encoding.is_primary_range(r) | |
| 169 return TransitionKey(encoding, [r]) | |
| 170 | 160 |
| 171 @staticmethod | 161 @staticmethod |
| 172 def unique(name): | 162 def range(encoding, a, b): |
| 163 '''Returns a TransitionKey for a single-character transition.''' |
| 164 return TransitionKey(encoding, Term("NUMERIC_RANGE_KEY", a, b)) |
| 165 |
| 166 @staticmethod |
| 167 def unique(term): # TODO(dcarney): rename |
| 173 '''Returns a unique TransitionKey for the given name (and creates it if it | 168 '''Returns a unique TransitionKey for the given name (and creates it if it |
| 174 doesn't exist yet). The TransitionKey won't have any real character range, | 169 doesn't exist yet). The TransitionKey won't have any real character range, |
| 175 but a newly-created "mock" character range which is separate from all other | 170 but a newly-created "mock" character range which is separate from all other |
| 176 character ranges.''' | 171 character ranges.''' |
| 177 def get_bounds(name): | 172 return TransitionKey(None, Term("TERM_KEY", term)) |
| 178 bound = TransitionKey.__unique_key_counter | |
| 179 TransitionKey.__unique_key_counter -= 1 | |
| 180 return [(bound, bound)] | |
| 181 name = '__' + name | |
| 182 return TransitionKey.__cached_key(None, name, get_bounds) | |
| 183 | 173 |
| 184 @staticmethod | 174 @staticmethod |
| 185 def __process_term(encoding, term, ranges, key_map): | 175 def __process_term(encoding, term, components, key_map): |
| 186 key = term.name() | 176 key = term.name() |
| 187 args = term.args() | 177 args = term.args() |
| 188 if key == 'RANGE': | 178 if key == 'RANGE': |
| 189 ranges.append((ord(args[0]), ord(args[1]))) | 179 components.append(Term('NUMERIC_RANGE_KEY', ord(args[0]), ord(args[1]))) |
| 190 elif key == 'LITERAL': | 180 elif key == 'LITERAL': |
| 191 for char in args[0]: | 181 for char in args[0]: # TODO(dcarney): don't use strings for literals |
| 192 ranges.append((ord(char), ord(char))) | 182 components.append(Term('NUMERIC_RANGE_KEY', ord(char), ord(char))) |
| 193 elif key == 'CAT': | 183 elif key == 'CAT': |
| 194 for x in args: | 184 for x in args: |
| 195 TransitionKey.__process_term(encoding, x, ranges, key_map) | 185 TransitionKey.__process_term(encoding, x, components, key_map) |
| 196 elif key == 'CHARACTER_CLASS': | 186 elif key == 'CHARACTER_CLASS': |
| 197 class_name = args[0] | 187 class_name = args[0] |
| 198 if encoding.class_range(class_name): | 188 if encoding.named_range(class_name): |
| 199 r = encoding.class_range(class_name) | 189 c = encoding.named_range(class_name) |
| 200 if class_name in key_map: | 190 if class_name in key_map: |
| 201 assert key_map[class_name] == TransitionKey(encoding, [r]) | 191 assert key_map[class_name] == TransitionKey(encoding, c) |
| 202 ranges.append(r) | 192 components.append(c) |
| 203 elif encoding.predefined_range_iter(class_name): | 193 elif encoding.predefined_range_iter(class_name): |
| 204 rs = list(encoding.predefined_range_iter(class_name)) | 194 cs = list(encoding.predefined_range_iter(class_name)) |
| 205 if class_name in key_map: | 195 if class_name in key_map: |
| 206 assert key_map[class_name] == TransitionKey(encoding, rs) | 196 assert key_map[class_name] == TransitionKey(encoding, cs) |
| 207 ranges += rs | 197 components += cs |
| 208 elif class_name in key_map: | 198 elif class_name in key_map: |
| 209 ranges += key_map[class_name].__ranges | 199 components += key_map[class_name].__flatten() |
| 210 else: | 200 else: |
| 211 raise Exception('unknown character class [%s]' % args[0]) | 201 raise Exception('unknown character class [%s]' % args[0]) |
| 212 else: | 202 else: |
| 213 raise Exception('bad key [%s]' % key) | 203 raise Exception('bad key [%s]' % key) |
| 214 | 204 |
| 215 @staticmethod | 205 @staticmethod |
| 216 def character_class(encoding, term, key_map): | 206 def character_class(encoding, term, key_map): |
| 217 '''Processes 'term' (a representation of a character class in the rule | 207 '''Processes 'term' (a representation of a character class in the rule |
| 218 file), and constructs a TransitionKey based on it. 'key_map' contains | 208 file), and constructs a TransitionKey based on it. 'key_map' contains |
| 219 already constructed aliases for character classes (they can be used in the | 209 already constructed aliases for character classes (they can be used in the |
| 220 new character class). It is a map from strings (character class names) to | 210 new character class). It is a map from strings (character class names) to |
| 221 TransitionKeys. For example, graph might represent the character class | 211 TransitionKeys. For example, graph might represent the character class |
| 222 [a-z:digit:] where 'digit' is a previously constructed character class found | 212 [a-z:digit:] where 'digit' is a previously constructed character class found |
| 223 in "key_map".''' | 213 in "key_map".''' |
| 224 ranges = [] | 214 components = [] |
| 225 assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS' | 215 assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS' |
| 226 invert = term.name() == 'NOT_CLASS' | 216 invert = term.name() == 'NOT_CLASS' |
| 227 assert len(term.args()) == 1 | 217 assert len(term.args()) == 1 |
| 228 TransitionKey.__process_term(encoding, term.args()[0], ranges, key_map) | 218 TransitionKey.__process_term(encoding, term.args()[0], components, key_map) |
| 229 return TransitionKey.__key_from_ranges(encoding, invert, ranges) | 219 key = TransitionKey.__key_from_components(encoding, invert, components) |
| 220 assert key != None, "not invertible %s " % str(term) |
| 221 return key |
| 230 | 222 |
| 231 def matches_char(self, char): | 223 def matches_char(self, char): |
| 224 'this is just for simple lexer testing and is incomplete' |
| 232 char = ord(char) | 225 char = ord(char) |
| 233 assert char < 128 | 226 assert 0 <= char and char < 128 |
| 234 for r in self.__ranges: | 227 for c in self.__flatten(): |
| 235 if r[0] <= char and char <= r[1]: return True | 228 if c.name() == 'NUMERIC_RANGE_KEY': |
| 229 r = c.args() |
| 230 if r[0] <= char and char <= r[1]: |
| 231 return True |
| 236 return False | 232 return False |
| 237 | 233 |
| 234 # (disjoint, subset, advance_left, advance_right) |
| 235 __is_superset_of_key_helper = ( |
| 236 (True, True, False, False), # -5 : error |
| 237 (True, True, False, False), # -4 : error |
| 238 (False, True, False, True), # -3 : |
| 239 (False, False, True, False), # -2 : |
| 240 (False, False, True, False), # -1 : |
| 241 (False, True, True, True), # 0 : |
| 242 (True, False, False, True), # 1 : |
| 243 (True, False, False, True), # 2 : |
| 244 (True, True, False, False), # 3 : error |
| 245 (False, True, False, True), # 4 : |
| 246 (True, True, False, False), # 5 : error |
| 247 ) |
| 248 |
| 238 def is_superset_of_key(self, key): | 249 def is_superset_of_key(self, key): |
| 239 '''Returns true if 'key' is a sub-key of this TransitionKey.''' | 250 '''Returns true if 'key' is a sub-key of this TransitionKey. |
| 240 assert isinstance(key, self.__class__) | 251 must be called on a key that is either a subset or disjoint''' |
| 241 assert key != TransitionKey.epsilon() | 252 helper = TransitionKey.__is_superset_of_key_helper |
| 242 assert len(key.__ranges) == 1 | 253 (left, right) = (self.__flatten(), key.__flatten()) |
| 243 subkey = key.__ranges[0] | 254 (disjoint, subset, advance_left, advance_right) = (False, False, True, True) |
| 244 matches = False | 255 (right_exhausted, left_exhausted) = (False, False) |
| 245 for k in self.__ranges: | 256 while advance_left or advance_right: |
| 246 if k[0] <= subkey[0]: | 257 if advance_right: |
| 247 assert subkey[1] <= k[1] or subkey[0] > k[1] | 258 try: |
| 248 if subkey[0] < k[0]: | 259 r = right.next() |
| 249 assert subkey[1] < k[0] | 260 except StopIteration: |
| 250 if k[0] <= subkey[0] and k[1] >= subkey[1]: | 261 right_exhausted = True |
| 251 assert not matches | 262 if advance_left: |
| 252 matches = True | 263 try: |
| 253 return matches | 264 l = left.next() |
| 265 except StopIteration: |
| 266 left_exhausted = True |
| 267 if right_exhausted or left_exhausted: |
| 268 break |
| 269 c = TransitionKey.__component_compare(l, r) |
| 270 (d, s, advance_left, advance_right) = helper[c + 5] |
| 271 disjoint |= d |
| 272 subset |= s |
| 273 if not right_exhausted: |
| 274 disjoint = True |
| 275 if disjoint and subset: |
| 276 raise Exception('not a subset and not disjoint') |
| 277 return subset |
| 254 | 278 |
| 255 @staticmethod | 279 @staticmethod |
| 256 def canonical_compare(left, right): | 280 def compare(self, other): |
| 257 i = 0 | 281 left = list(self.__flatten()) |
| 258 left_length = len(left.__ranges) | 282 right = list(other.__flatten()) |
| 259 right_length = len(right.__ranges) | 283 offset = 0 |
| 260 while i < left_length and i < right_length: | 284 while offset < len(left) and offset < len(right): |
| 261 l = left.__ranges[i] | 285 c = TransitionKey.__component_compare(left[offset], right[offset]) |
| 262 r = right.__ranges[i] | 286 if c != 0: |
| 263 c = cmp(l, r) | |
| 264 if c: | |
| 265 return c | 287 return c |
| 266 i += 1 | 288 offset += 1 |
| 267 if i == left_length and i == right_length: | 289 return TransitionKey.__cmp(len(left), len(right)) |
| 268 return 0 | 290 |
| 269 return 1 if i != left_length else -1 | 291 def __cmp__(self, other): |
| 292 return TransitionKey.compare(self, other) |
| 270 | 293 |
| 271 def __hash__(self): | 294 def __hash__(self): |
| 272 if self.__cached_hash == None: | 295 return hash(self.__term) |
| 273 self.__cached_hash = hash(self.__ranges) | 296 |
| 274 return self.__cached_hash | 297 def __ne__(self, other): |
| 298 return not self == other |
| 275 | 299 |
| 276 def __eq__(self, other): | 300 def __eq__(self, other): |
| 277 return isinstance(other, self.__class__) and self.__ranges == other.__ranges | 301 return isinstance(other, TransitionKey) and self.__term == other.__term |
| 278 | 302 |
| 279 @staticmethod | 303 @staticmethod |
| 280 def __class_name(encoding, r): | 304 def __class_name(encoding, r): |
| 281 for name, v in encoding.class_range_iter(): | 305 for name, v in encoding.class_range_iter(): |
| 282 if r == v: | 306 if r == v: |
| 283 return name | 307 return name |
| 284 assert False | 308 assert False |
| 285 | 309 |
| 286 @staticmethod | 310 @staticmethod |
| 287 def __unique_name(r): | 311 def __unique_name(r): |
| 288 for name, v in TransitionKey.__cached_keys['no_encoding'].items(): | 312 for name, v in TransitionKey.__cached_keys['no_encoding'].items(): |
| 289 if v.__ranges and r == v.__ranges[0]: | 313 if v.__ranges and r == v.__ranges[0]: |
| 290 return name[2:] | 314 return name[2:] |
| 291 assert False | 315 assert False |
| 292 | 316 |
| 293 def range_iter(self, encoding): | 317 def range_iter(self, encoding): |
| 294 assert not self == TransitionKey.epsilon() | 318 for c in self.__flatten(): |
| 295 for r in self.__ranges: | 319 if c.name() == 'NUMERIC_RANGE_KEY': |
| 296 if self.__is_unique_range(r): | 320 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1])) |
| 297 yield ('UNIQUE', self.__unique_name(r)) | 321 elif c.name() == 'NAMED_RANGE_KEY': |
| 298 elif encoding.is_class_range(r): | 322 yield ('CLASS', c.args()[0]) |
| 299 yield ('CLASS', self.__class_name(encoding, r)) | 323 elif c.name() == 'TERM_KEY': |
| 324 yield ('UNIQUE', c.args()[0]) |
| 300 else: | 325 else: |
| 301 yield ('PRIMARY_RANGE', r) | 326 assert False, 'unimplemented %s' % c |
| 302 | 327 |
| 303 __printable_cache = { | 328 __printable_cache = { |
| 304 ord('\t') : '\\t', | 329 ord('\t') : '\\t', |
| 305 ord('\n') : '\\n', | 330 ord('\n') : '\\n', |
| 306 ord('\r') : '\\r', | 331 ord('\r') : '\\r', |
| 307 } | 332 } |
| 308 | 333 |
| 309 @staticmethod | 334 @staticmethod |
| 310 def __range_str(encoding, r): | 335 def __component_str(encoding, component): |
| 311 if TransitionKey.__is_unique_range(r): | 336 if component.name() == 'TERM_KEY': |
| 312 return TransitionKey.__unique_name(r) | 337 return component.args()[0] |
| 313 if encoding and encoding.is_class_range(r): | 338 elif component.name() == 'NAMED_RANGE_KEY': |
| 314 return TransitionKey.__class_name(encoding, r) | 339 return component.args()[0] |
| 340 elif component.name() == 'EPSILON_KEY': |
| 341 return 'epsilon' |
| 342 elif component.name() == 'OMEGA_KEY': |
| 343 return 'omega' |
| 344 elif component.name() != 'NUMERIC_RANGE_KEY': |
| 345 raise Exception('unprintable %s' % component) |
| 346 r = component.args() |
| 315 def to_str(x): | 347 def to_str(x): |
| 316 assert not encoding or encoding.in_primary_range(x) | 348 assert not encoding or encoding.in_primary_range(x) |
| 317 if x > 127: | 349 if x > 127: |
| 318 return str(x) | 350 return str(x) |
| 319 if not x in TransitionKey.__printable_cache: | 351 if not x in TransitionKey.__printable_cache: |
| 320 res = "'%s'" % chr(x) if chr(x) in printable else str(x) | 352 res = "'%s'" % chr(x) if chr(x) in printable else str(x) |
| 321 TransitionKey.__printable_cache[x] = res | 353 TransitionKey.__printable_cache[x] = res |
| 322 return TransitionKey.__printable_cache[x] | 354 return TransitionKey.__printable_cache[x] |
| 323 if r[0] == r[1]: | 355 if r[0] == r[1]: |
| 324 return '%s' % to_str(r[0]) | 356 return '%s' % to_str(r[0]) |
| 325 else: | 357 else: |
| 326 return '[%s-%s]' % (to_str(r[0]), to_str(r[1])) | 358 return '[%s-%s]' % (to_str(r[0]), to_str(r[1])) |
| 327 | 359 |
| 328 def __init__(self, encoding, ranges, name = None): | 360 def __flatten(self): |
| 329 if not ranges: | 361 return self.__flatten_components([self.__term]) |
| 330 assert name == 'epsilon' | 362 |
| 331 assert not name in TransitionKey.__cached_keys['no_encoding'] | 363 @staticmethod |
| 332 else: | 364 def __flatten_components(components): |
| 333 TransitionKey.__verify_ranges(encoding, ranges, True) | 365 for component in components: |
| 334 self.__name = name | 366 if component.name() != 'COMPOSITE_KEY': |
| 335 self.__ranges = tuple(ranges) # immutable | 367 yield component |
| 368 else: |
| 369 for arg in component.args(): |
| 370 yield arg |
| 371 |
| 372 __component_name_order = { |
| 373 'EPSILON_KEY' : 0, |
| 374 'NUMERIC_RANGE_KEY' : 1, |
| 375 'NAMED_RANGE_KEY' : 2, |
| 376 'TERM_KEY' : 3, |
| 377 'OMEGA_KEY' : 4 |
| 378 } |
| 379 |
| 380 @staticmethod |
| 381 def __cmp(a, b): |
| 382 'wraps standard cmp function to return correct results for components' |
| 383 c = cmp(a, b) |
| 384 return 0 if c == 0 else (-1 if c < 0 else 1) |
| 385 |
| 386 @staticmethod |
| 387 def __component_name_compare(a, b): |
| 388 return TransitionKey.__cmp(TransitionKey.__component_name_order[a], |
| 389 TransitionKey.__component_name_order[b]) |
| 390 |
| 391 @staticmethod |
| 392 def __component_compare(a, b): |
| 393 '''component-wise compare function, returns the following values when |
| 394 comparing non identical numerical ranges: |
| 395 non-overlapping : -2 -- a0 <= a1 < b0 <= b1 |
| 396 b subset of a : -3 -- a0 < b0 <= b1 <= a1 |
| 397 a subset of b : -4 -- a0 == b0 and a1 < b1 |
| 398 a overlaps to left : -5 -- a0 < b0 and b0 <= a1 < b1 |
| 399 otherwise a value in [-1, 1] is returned''' |
| 400 if a.name() != b.name(): |
| 401 return TransitionKey.__component_name_compare(a.name(), b.name()) |
| 402 if a.name() != 'NUMERIC_RANGE_KEY': |
| 403 return TransitionKey.__cmp(str(a), str(b)) |
| 404 (a, b) = (a.args(), b.args()) |
| 405 c0 = TransitionKey.__cmp(a[0], b[0]) |
| 406 if c0 == 0: |
| 407 return 4 * TransitionKey.__cmp(a[1], b[1]) # either == or a subset |
| 408 sign = -1 |
| 409 if c0 > 0: |
| 410 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0 |
| 411 assert a[0] < b[0] |
| 412 if b[1] <= a[1]: # subset |
| 413 return 3 * sign |
| 414 if a[1] < b[0]: # non overlap |
| 415 return 2 * sign |
| 416 return 5 * sign # partial overlap |
| 417 |
| 418 @staticmethod |
| 419 def __construct(encoding, components): |
| 420 is_unique = False |
| 421 acc = [] |
| 422 last = Term.empty_term() |
| 423 for current in TransitionKey.__flatten_components(components): |
| 424 name = current.name() |
| 425 if last: |
| 426 assert name != 'EPSILON_KEY' and name != 'OMEGA_KEY', 'cannot merge' |
| 427 c = TransitionKey.__component_compare(last, current) |
| 428 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current)) |
| 429 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY': |
| 430 pass |
| 431 elif name == 'NUMERIC_RANGE_KEY': |
| 432 assert encoding.is_primary_range(current.args()) |
| 433 if last.name() == 'NUMERIC_RANGE_KEY': |
| 434 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged' |
| 435 elif name == 'NAMED_RANGE_KEY': |
| 436 assert encoding.named_range(current.args()[0]) |
| 437 else: |
| 438 raise Exception('illegal component %s' % str(current)) |
| 439 acc.append(current) |
| 440 last = current |
| 441 assert acc, "must have components" |
| 442 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc) |
| 443 |
| 444 def __init__(self, encoding, components): |
| 445 if isinstance(components, Term): |
| 446 components = [components] |
| 447 self.__term = TransitionKey.__construct(encoding, components) |
| 336 self.__cached_hash = None | 448 self.__cached_hash = None |
| 337 | 449 |
| 338 def to_string(self, encoding): | 450 def to_string(self, encoding): |
| 339 if self.__name: | 451 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x), |
| 340 return self.__name | 452 self.__flatten())) |
| 341 strings = [TransitionKey.__range_str(encoding, x) for x in self.__ranges] | |
| 342 return ', '.join(strings) | |
| 343 | 453 |
| 344 def __str__(self): | 454 def __str__(self): |
| 345 return self.to_string(None) | 455 return self.to_string(None) |
| 346 | 456 |
| 347 @staticmethod | 457 @staticmethod |
| 348 def __disjoint_keys(encoding, range_map): | 458 def __disjoint_keys(encoding, range_map): |
| 349 '''Takes a set of possibly overlapping ranges, returns a list of ranges | 459 '''Takes a set of possibly overlapping ranges, returns a list of ranges |
| 350 which don't overlap and which cover the same points as the original | 460 which don't overlap and which cover the same points as the original |
| 351 set. range_map is a map from lower bounds to a list of upper bounds.''' | 461 set. range_map is a map from lower bounds to a list of upper bounds.''' |
| 352 sort = lambda x : sorted(set(x)) | 462 sort = lambda x : sorted(set(x)) |
| 353 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items())) | 463 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items())) |
| 354 ranges = [] | |
| 355 upper_bound = encoding.upper_bound() + 1 | 464 upper_bound = encoding.upper_bound() + 1 |
| 356 for i in range(len(range_map)): | 465 for i in range(len(range_map)): |
| 357 (left, left_values) = range_map[i] | 466 (left, left_values) = range_map[i] |
| 358 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound | 467 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound |
| 359 to_push = [] | 468 to_push = [] |
| 360 for v in left_values: | 469 for v in left_values: |
| 361 assert left <= next | 470 assert left <= next |
| 362 if v >= next: | 471 if v >= next: |
| 363 if not to_push and left < next: | 472 if not to_push and left < next: |
| 364 ranges.append((left, next - 1)) | 473 yield (left, next - 1) |
| 365 to_push.append(v) | 474 to_push.append(v) |
| 366 else: | 475 else: |
| 367 ranges.append((left, v)) | 476 yield (left, v) |
| 368 left = v + 1 | 477 left = v + 1 |
| 369 if to_push: | 478 if to_push: |
| 370 current = range_map[i + 1] | 479 current = range_map[i + 1] |
| 371 range_map[i + 1] = (current[0], sort(current[1] + to_push)) | 480 range_map[i + 1] = (current[0], sort(current[1] + to_push)) |
| 372 return ranges | |
| 373 | 481 |
| 374 @staticmethod | 482 @staticmethod |
| 375 def __disjoint_ranges_from_key_set(encoding, key_set): | 483 def __merge_ranges(encoding, ranges): |
| 376 if not key_set: | 484 last = None |
| 377 return [] | 485 for r in ranges: |
| 378 range_map = {} | 486 if last == None: |
| 379 for x in key_set: | 487 last = r |
| 380 assert x != TransitionKey.epsilon() | 488 elif last[1] + 1 == r[0]: |
| 381 for r in x.__ranges: | 489 last = (last[0], r[1]) |
| 382 if not r[0] in range_map: | 490 else: |
| 383 range_map[r[0]] = [] | 491 yield last |
| 384 range_map[r[0]].append(r[1]) | 492 last = r |
| 385 ranges = TransitionKey.__disjoint_keys(encoding, range_map) | 493 if last != None: |
| 386 TransitionKey.__verify_ranges(encoding, ranges, False) | 494 yield last |
| 387 return ranges | |
| 388 | 495 |
| 389 @staticmethod | 496 @staticmethod |
| 390 def disjoint_keys(encoding, key_set): | 497 def __flatten_keys(keys): |
| 498 return chain(*map(lambda x : x.__flatten(), keys)) |
| 499 |
| 500 @staticmethod |
| 501 def __disjoint_components_from_keys(encoding, keys, merge_ranges = False): |
| 502 return TransitionKey.__disjoint_components( |
| 503 encoding, TransitionKey.__flatten_keys(keys), merge_ranges) |
| 504 |
| 505 @staticmethod |
| 506 def __disjoint_components(encoding, components, merge_ranges): |
| 507 range_map = {} |
| 508 other_keys = set([]) |
| 509 for x in components: |
| 510 assert x.name() != 'EPSILON_KEY' and x.name() != 'OMEGA_KEY' |
| 511 if x.name() != 'NUMERIC_RANGE_KEY': |
| 512 other_keys.add(x) |
| 513 continue |
| 514 (start, end) = x.args() |
| 515 if not start in range_map: |
| 516 range_map[start] = [] |
| 517 range_map[start].append(end) |
| 518 ranges = TransitionKey.__disjoint_keys(encoding, range_map) |
| 519 if merge_ranges: |
| 520 ranges = TransitionKey.__merge_ranges(encoding, ranges) |
| 521 other_keys = sorted(other_keys, cmp=TransitionKey.__component_compare) |
| 522 range_terms = map(lambda x : Term('NUMERIC_RANGE_KEY', x[0], x[1]), ranges) |
| 523 return chain(iter(range_terms), iter(other_keys)) |
| 524 |
| 525 @staticmethod |
| 526 def __key_from_components(encoding, invert, components): |
| 527 components = TransitionKey.__disjoint_components(encoding, components, True) |
| 528 if invert: |
| 529 components = TransitionKey.__invert_components(encoding, components) |
| 530 return None if not components else TransitionKey(encoding, components) |
| 531 |
| 532 @staticmethod |
| 533 def disjoint_keys(encoding, keys): |
| 391 '''Takes a set of possibly overlapping TransitionKeys, returns a list of | 534 '''Takes a set of possibly overlapping TransitionKeys, returns a list of |
| 392 TransitionKeys which don't overlap and whose union is the same as the union | 535 TransitionKeys which don't overlap and whose union is the same as the union |
| 393 of the original key_set. In addition, TransitionKeys are not merged, only | 536 of the original key_set. In addition, TransitionKeys are not merged, only |
| 394 split. | 537 split. |
| 395 | 538 |
| 396 For example, if key_set contains two TransitionKeys for ranges [1-10] and | 539 For example, if key_set contains two TransitionKeys for ranges [1-10] and |
| 397 [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10], | 540 [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10], |
| 398 [11-16].''' | 541 [11-16].''' |
| 399 ranges = TransitionKey.__disjoint_ranges_from_key_set(encoding, key_set) | 542 return map(lambda x : TransitionKey(encoding, x), |
| 400 return map(lambda x : TransitionKey(encoding, [x]), ranges) | 543 TransitionKey.__disjoint_components_from_keys(encoding, keys)) |
| 401 | 544 |
| 402 @staticmethod | 545 @staticmethod |
| 403 def inverse_key(encoding, key_set): | 546 def inverse_key(encoding, keys): |
| 404 '''Returns a TransitionKey which matches represents the inverse of the union | 547 '''Returns a TransitionKey which matches represents the inverse of the union |
| 405 of 'key_set'. The TransitionKeys contain a set of character ranges and a set | 548 of 'keys'. The TransitionKeys contain a set of character ranges and a set |
| 406 of classes. The character ranges are inverted in relation to the latin_1 | 549 of classes. The character ranges are inverted in relation to the latin_1 |
| 407 character range, and the character classes are inverted in relation to all | 550 character range, and the character classes are inverted in relation to all |
| 408 character classes in __class_bounds.''' | 551 character classes in __class_bounds.''' |
| 409 ranges = TransitionKey.__disjoint_ranges_from_key_set(encoding, key_set) | 552 return TransitionKey.__key_from_components( |
| 410 inverse = TransitionKey.__invert_ranges(encoding, ranges) | 553 encoding, True, TransitionKey.__flatten_keys(keys)) |
| 411 if not inverse: | |
| 412 return None | |
| 413 return TransitionKey(encoding, inverse) | |
| 414 | |
| 415 @staticmethod | |
| 416 def __key_from_ranges(encoding, invert, ranges): | |
| 417 range_map = {} | |
| 418 for r in ranges: | |
| 419 assert r | |
| 420 if not r[0] in range_map: | |
| 421 range_map[r[0]] = [] | |
| 422 range_map[r[0]].append(r[1]) | |
| 423 ranges = TransitionKey.__disjoint_keys(encoding, range_map) | |
| 424 ranges = TransitionKey.__merge_ranges(encoding, ranges) | |
| 425 if invert: | |
| 426 ranges = TransitionKey.__invert_ranges(encoding, ranges) | |
| 427 return TransitionKey(encoding, ranges) | |
| 428 | |
| 429 @staticmethod | |
| 430 def __merge_ranges(encoding, ranges): | |
| 431 merged = [] | |
| 432 last = None | |
| 433 for r in ranges: | |
| 434 if last == None: | |
| 435 last = r | |
| 436 elif (last[1] + 1 == r[0] and | |
| 437 not TransitionKey.__is_unique_range(r) and | |
| 438 not encoding.is_class_range(r)): | |
| 439 last = (last[0], r[1]) | |
| 440 else: | |
| 441 merged.append(last) | |
| 442 last = r | |
| 443 if last != None: | |
| 444 merged.append(last) | |
| 445 return merged | |
| 446 | 554 |
| 447 @staticmethod | 555 @staticmethod |
| 448 def merged_key(encoding, keys): | 556 def merged_key(encoding, keys): |
| 449 f = lambda acc, key: acc + list(key.__ranges) | 557 return TransitionKey(encoding, |
| 450 return TransitionKey.__key_from_ranges(encoding, False, reduce(f, keys, [])) | 558 TransitionKey.__disjoint_components_from_keys(encoding, keys, True)) |
| 451 | 559 |
| 452 @staticmethod | 560 @staticmethod |
| 453 def __invert_ranges(encoding, ranges): | 561 def __invert_components(encoding, components): |
| 454 inverted = [] | 562 def key(x, y): |
| 563 return Term('NUMERIC_RANGE_KEY', x, y) |
| 455 last = None | 564 last = None |
| 456 classes = set(encoding.class_value_iter()) | 565 classes = set(encoding.named_range_value_iter()) |
| 457 for r in ranges: | 566 for c in components: |
| 458 assert not TransitionKey.__is_unique_range(r) | 567 if c in classes: |
| 459 if encoding.is_class_range(r): | 568 classes.remove(c) |
| 460 classes.remove(r) | |
| 461 continue | 569 continue |
| 570 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible' |
| 571 r = c.args() |
| 572 assert encoding.is_primary_range(r) |
| 462 if last == None: | 573 if last == None: |
| 463 if r[0] != encoding.lower_bound(): | 574 if r[0] != encoding.lower_bound(): |
| 464 inverted.append((encoding.lower_bound(), r[0] - 1)) | 575 yield key(encoding.lower_bound(), r[0] - 1) |
| 465 elif last[1] + 1 < r[0]: | 576 elif last[1] + 1 < r[0]: |
| 466 inverted.append((last[1] + 1, r[0] - 1)) | 577 yield key(last[1] + 1, r[0] - 1) |
| 467 last = r | 578 last = r |
| 468 upper_bound = encoding.primary_range()[1] | 579 upper_bound = encoding.primary_range()[1] |
| 469 if last == None: | 580 if last == None: |
| 470 inverted.append(encoding.primary_range()) | 581 r = encoding.primary_range() |
| 582 yield key(r[0], r[1]) |
| 471 elif last[1] < upper_bound: | 583 elif last[1] < upper_bound: |
| 472 inverted.append((last[1] + 1, upper_bound)) | 584 yield key(last[1] + 1, upper_bound) |
| 473 inverted += list(classes) | 585 for c in sorted(classes, TransitionKey.__component_compare): |
| 474 return inverted | 586 yield c |
| 475 | 587 |
| 476 class Latin1Encoding(KeyEncoding): | 588 class Latin1Encoding(KeyEncoding): |
| 477 | 589 |
| 478 def __init__(self): | 590 def __init__(self): |
| 479 super(Latin1Encoding, self).__init__( | 591 super(Latin1Encoding, self).__init__( |
| 480 'latin1', | 592 'latin1', |
| 481 (0, 255), | 593 (0, 255), |
| 482 []) | 594 [], |
| 483 self.add_predefined_range( | 595 { |
| 484 'whitespace', [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)]) | 596 'whitespace': |
| 485 self.add_predefined_range( | 597 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)], |
| 486 'letter', [ | 598 'letter': |
| 487 (65, 90), (97, 122), (170, 170), (181, 181), | 599 [(65, 90), (97, 122), (170, 170), (181, 181), |
| 488 (186, 186), (192, 214), (216, 246), (248, 255)]) | 600 (186, 186), (192, 214), (216, 246), (248, 255)], |
| 489 self.add_predefined_range('line_terminator', [(10, 10), (13, 13)]) | 601 'line_terminator': |
| 490 self.add_predefined_range( | 602 [(10, 10), (13, 13)], |
| 491 'identifier_part_not_letter', [(48, 57), (95, 95)]) | 603 'identifier_part_not_letter': |
| 604 [(48, 57), (95, 95)] |
| 605 }) |
| 492 | 606 |
| 493 class Utf16Encoding(KeyEncoding): | 607 class Utf16Encoding(KeyEncoding): |
| 494 | 608 |
| 495 def __init__(self): | 609 def __init__(self): |
| 496 super(Utf16Encoding, self).__init__( | 610 super(Utf16Encoding, self).__init__( |
| 497 'utf16', | 611 'utf16', |
| 498 (0, 255), | 612 (0, 255), |
| 499 ['non_primary_whitespace', | 613 ['non_primary_whitespace', |
| 500 'non_primary_letter', | 614 'non_primary_letter', |
| 501 'non_primary_identifier_part_not_letter', | 615 'non_primary_identifier_part_not_letter', |
| 502 'non_primary_line_terminator', | 616 'non_primary_line_terminator', |
| 503 'non_primary_everything_else']) | 617 'non_primary_everything_else'], |
| 504 self.add_predefined_range( | 618 { |
| 505 'whitespace', | 619 'whitespace': |
| 506 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160), | 620 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160), |
| 507 self.class_range('non_primary_whitespace')]) | 621 ('non_primary_whitespace',)], |
| 508 self.add_predefined_range( | 622 'letter': |
| 509 'letter', [ | 623 [(65, 90), (97, 122), (170, 170), (181, 181), |
| 510 (65, 90), (97, 122), (170, 170), (181, 181), | 624 (186, 186), (192, 214), (216, 246), (248, 255), |
| 511 (186, 186), (192, 214), (216, 246), (248, 255), | 625 ('non_primary_letter',)], |
| 512 self.class_range('non_primary_letter')]) | 626 'line_terminator': |
| 513 self.add_predefined_range( | 627 [(10, 10), (13, 13), ('non_primary_line_terminator',)], |
| 514 'line_terminator', | 628 'identifier_part_not_letter': |
| 515 [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')]) | 629 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)], |
| 516 self.add_predefined_range( | 630 }) |
| 517 'identifier_part_not_letter', | |
| 518 [(48, 57), (95, 95), | |
| 519 self.class_range('non_primary_identifier_part_not_letter')]) | |
| 520 | 631 |
| 521 class Utf8Encoding(KeyEncoding): | 632 class Utf8Encoding(KeyEncoding): |
| 522 | 633 |
| 523 def __init__(self): | 634 def __init__(self): |
| 524 super(Utf8Encoding, self).__init__( | 635 super(Utf8Encoding, self).__init__( |
| 525 'utf8', | 636 'utf8', |
| 526 (0, 127), | 637 (0, 127), |
| 527 ['non_primary_whitespace', | 638 ['non_primary_whitespace', |
| 528 'non_primary_letter', | 639 'non_primary_letter', |
| 529 'non_primary_identifier_part_not_letter', | 640 'non_primary_identifier_part_not_letter', |
| 530 'non_primary_line_terminator', | 641 'non_primary_line_terminator', |
| 531 'non_primary_everything_else']) | 642 'non_primary_everything_else'], |
| 532 self.add_predefined_range( | 643 { |
| 533 'whitespace', | 644 'whitespace': |
| 534 [(9, 9), (11, 12), (32, 32), | 645 [(9, 9), (11, 12), (32, 32), ('non_primary_whitespace',)], |
| 535 self.class_range('non_primary_whitespace')]) | 646 'letter': |
| 536 self.add_predefined_range( | 647 [(65, 90), (97, 122), ('non_primary_letter',)], |
| 537 'letter', [(65, 90), (97, 122), self.class_range('non_primary_letter')]) | 648 'line_terminator': |
| 538 self.add_predefined_range( | 649 [(10, 10), (13, 13), ('non_primary_line_terminator',)], |
| 539 'line_terminator', | 650 'identifier_part_not_letter': |
| 540 [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')]) | 651 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)], |
| 541 self.add_predefined_range( | 652 }) |
| 542 'identifier_part_not_letter', | |
| 543 [(48, 57), (95, 95), | |
| 544 self.class_range('non_primary_identifier_part_not_letter')]) | |
| OLD | NEW |