| OLD | NEW |
| (Empty) |
| 1 # Copyright 2013 the V8 project authors. All rights reserved. | |
| 2 # Redistribution and use in source and binary forms, with or without | |
| 3 # modification, are permitted provided that the following conditions are | |
| 4 # met: | |
| 5 # | |
| 6 # * Redistributions of source code must retain the above copyright | |
| 7 # notice, this list of conditions and the following disclaimer. | |
| 8 # * Redistributions in binary form must reproduce the above | |
| 9 # copyright notice, this list of conditions and the following | |
| 10 # disclaimer in the documentation and/or other materials provided | |
| 11 # with the distribution. | |
| 12 # * Neither the name of Google Inc. nor the names of its | |
| 13 # contributors may be used to endorse or promote products derived | |
| 14 # from this software without specific prior written permission. | |
| 15 # | |
| 16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| 20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 27 | |
| 28 from itertools import chain | |
| 29 from encoding import KeyEncoding | |
| 30 from action import Term | |
| 31 | |
| 32 class TransitionKey(object): | |
| 33 '''Represents a transition from a state in DFA or NFA to another state. | |
| 34 | |
| 35 A transition key has a list of character ranges and a list of class ranges | |
| 36 (e.g., "whitespace"), defining for which characters the transition | |
| 37 happens. When we generate code based on the transition key, the character | |
| 38 ranges generate simple checks and the class ranges generate more complicated | |
| 39 conditions, e.g., function calls.''' | |
| 40 | |
| 41 __cached_keys = { | |
| 42 'no_encoding' : {}, | |
| 43 'latin1' : {}, | |
| 44 'utf8' : {}, | |
| 45 'utf16' : {}, | |
| 46 } | |
| 47 | |
| 48 @staticmethod | |
| 49 def __cached_key(encoding, name, components_getter): | |
| 50 encoding_name = encoding.name() if encoding else 'no_encoding' | |
| 51 cache = TransitionKey.__cached_keys[encoding_name] | |
| 52 if not name in cache: | |
| 53 cache[name] = TransitionKey(encoding, components_getter()) | |
| 54 return cache[name] | |
| 55 | |
| 56 @staticmethod | |
| 57 def epsilon(): | |
| 58 '''Returns a TransitionKey for the epsilon (empty) transition.''' | |
| 59 return TransitionKey.__cached_key(None, 'epsilon', | |
| 60 lambda : Term("EPSILON_KEY")) | |
| 61 | |
| 62 @staticmethod | |
| 63 def omega(): | |
| 64 '''Always matches.''' | |
| 65 return TransitionKey.__cached_key(None, 'omega', lambda : Term("OMEGA_KEY")) | |
| 66 | |
| 67 @staticmethod | |
| 68 def any(encoding): | |
| 69 '''Returns a TransitionKey which matches any encoded character.''' | |
| 70 return TransitionKey.__cached_key(encoding, 'any', | |
| 71 lambda : encoding.all_components_iter()) | |
| 72 | |
| 73 @staticmethod | |
| 74 def single_char(encoding, char): | |
| 75 '''Returns a TransitionKey for a single-character transition.''' | |
| 76 return TransitionKey.range(encoding, char, char) | |
| 77 | |
| 78 @staticmethod | |
| 79 def range(encoding, a, b): | |
| 80 '''Returns a TransitionKey for a single-character transition.''' | |
| 81 return TransitionKey(encoding, encoding.numeric_range_term(a, b)) | |
| 82 | |
| 83 @staticmethod | |
| 84 def unique(term): # TODO(dcarney): rename | |
| 85 '''Returns a unique TransitionKey for the given name (and creates it if it | |
| 86 doesn't exist yet). The TransitionKey won't have any real character range, | |
| 87 but a newly-created "mock" character range which is separate from all other | |
| 88 character ranges.''' | |
| 89 return TransitionKey(None, Term("TERM_KEY", term)) | |
| 90 | |
| 91 @staticmethod | |
| 92 def __process_term(encoding, term, components, key_map): | |
| 93 key = term.name() | |
| 94 args = term.args() | |
| 95 if key == 'RANGE': | |
| 96 components.append(encoding.numeric_range_term(args[0], args[1])) | |
| 97 elif key == 'LITERAL': | |
| 98 components += map(lambda x : encoding.numeric_range_term(x, x), args) | |
| 99 elif key == 'CAT': | |
| 100 for x in args: | |
| 101 TransitionKey.__process_term(encoding, x, components, key_map) | |
| 102 elif key == 'CHARACTER_CLASS': | |
| 103 class_name = args[0] | |
| 104 if encoding.named_range(class_name): | |
| 105 c = encoding.named_range(class_name) | |
| 106 if class_name in key_map: | |
| 107 assert key_map[class_name] == TransitionKey(encoding, c) | |
| 108 components.append(c) | |
| 109 elif encoding.predefined_range_iter(class_name): | |
| 110 cs = list(encoding.predefined_range_iter(class_name)) | |
| 111 if class_name in key_map: | |
| 112 assert key_map[class_name] == TransitionKey(encoding, cs) | |
| 113 components += cs | |
| 114 elif class_name in key_map: | |
| 115 components += key_map[class_name].__flatten() | |
| 116 else: | |
| 117 raise Exception('unknown character class [%s]' % args[0]) | |
| 118 else: | |
| 119 raise Exception('bad key [%s]' % key) | |
| 120 | |
| 121 @staticmethod | |
| 122 def character_class(encoding, term, key_map): | |
| 123 '''Processes 'term' (a representation of a character class in the rule | |
| 124 file), and constructs a TransitionKey based on it. 'key_map' contains | |
| 125 already constructed aliases for character classes (they can be used in the | |
| 126 new character class). It is a map from strings (character class names) to | |
| 127 TransitionKeys. For example, graph might represent the character class | |
| 128 [a-z:digit:] where 'digit' is a previously constructed character class found | |
| 129 in "key_map".''' | |
| 130 components = [] | |
| 131 assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS' | |
| 132 invert = term.name() == 'NOT_CLASS' | |
| 133 assert len(term.args()) == 1 | |
| 134 TransitionKey.__process_term(encoding, term.args()[0], components, key_map) | |
| 135 key = TransitionKey.__key_from_components(encoding, invert, components) | |
| 136 assert key != None, "not invertible %s " % str(term) | |
| 137 return key | |
| 138 | |
| 139 def matches_char(self, char): | |
| 140 'this is just for simple lexer testing and is incomplete' | |
| 141 char = ord(char) | |
| 142 assert 0 <= char and char < 128 | |
| 143 for c in self.__flatten(): | |
| 144 if c.name() == 'NUMERIC_RANGE_KEY': | |
| 145 r = c.args() | |
| 146 if r[0] <= char and char <= r[1]: | |
| 147 return True | |
| 148 return False | |
| 149 | |
| 150 # (disjoint, subset, advance_left, advance_right) | |
| 151 __is_superset_of_key_helper = ( | |
| 152 (True, True, False, False), # -5 : error | |
| 153 (True, True, False, False), # -4 : error | |
| 154 (False, True, False, True), # -3 : | |
| 155 (False, False, True, False), # -2 : | |
| 156 (False, False, True, False), # -1 : | |
| 157 (False, True, True, True), # 0 : | |
| 158 (True, False, False, True), # 1 : | |
| 159 (True, False, False, True), # 2 : | |
| 160 (True, True, False, False), # 3 : error | |
| 161 (False, True, False, True), # 4 : | |
| 162 (True, True, False, False), # 5 : error | |
| 163 ) | |
| 164 | |
| 165 def is_superset_of_key(self, key): | |
| 166 '''Returns true if 'key' is a sub-key of this TransitionKey. | |
| 167 must be called on a key that is either a subset or disjoint''' | |
| 168 helper = TransitionKey.__is_superset_of_key_helper | |
| 169 (left, right) = (self.__flatten(), key.__flatten()) | |
| 170 (disjoint, subset, advance_left, advance_right) = (False, False, True, True) | |
| 171 (right_exhausted, left_exhausted) = (False, False) | |
| 172 while advance_left or advance_right: | |
| 173 if advance_right: | |
| 174 try: | |
| 175 r = right.next() | |
| 176 except StopIteration: | |
| 177 right_exhausted = True | |
| 178 if advance_left: | |
| 179 try: | |
| 180 l = left.next() | |
| 181 except StopIteration: | |
| 182 left_exhausted = True | |
| 183 if right_exhausted or left_exhausted: | |
| 184 break | |
| 185 c = TransitionKey.__component_compare(l, r) | |
| 186 (d, s, advance_left, advance_right) = helper[c + 5] | |
| 187 disjoint |= d | |
| 188 subset |= s | |
| 189 if not right_exhausted: | |
| 190 disjoint = True | |
| 191 if disjoint and subset: | |
| 192 raise Exception('not a subset and not disjoint') | |
| 193 return subset | |
| 194 | |
| 195 @staticmethod | |
| 196 def compare(self, other): | |
| 197 left = list(self.__flatten()) | |
| 198 right = list(other.__flatten()) | |
| 199 offset = 0 | |
| 200 while offset < len(left) and offset < len(right): | |
| 201 c = TransitionKey.__component_compare(left[offset], right[offset]) | |
| 202 if c != 0: | |
| 203 return c | |
| 204 offset += 1 | |
| 205 return TransitionKey.__cmp(len(left), len(right)) | |
| 206 | |
| 207 def __cmp__(self, other): | |
| 208 return TransitionKey.compare(self, other) | |
| 209 | |
| 210 def __hash__(self): | |
| 211 return hash(self.__term) | |
| 212 | |
| 213 def __ne__(self, other): | |
| 214 return not self == other | |
| 215 | |
| 216 def __eq__(self, other): | |
| 217 return isinstance(other, TransitionKey) and self.__term == other.__term | |
| 218 | |
| 219 def range_iter(self, encoding): | |
| 220 for c in self.__flatten(): | |
| 221 if c.name() == 'NUMERIC_RANGE_KEY': | |
| 222 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1])) | |
| 223 elif c.name() == 'NAMED_RANGE_KEY': | |
| 224 yield ('CLASS', c.args()[0]) | |
| 225 elif c.name() == 'TERM_KEY': | |
| 226 yield ('UNIQUE', c.args()[0]) | |
| 227 elif c.name() == 'OMEGA_KEY': | |
| 228 yield ('OMEGA', ()) | |
| 229 else: | |
| 230 assert False, 'unimplemented %s' % c | |
| 231 | |
| 232 @staticmethod | |
| 233 def __component_str(encoding, component): | |
| 234 if component.name() == 'TERM_KEY': | |
| 235 return component.args()[0] | |
| 236 elif component.name() == 'NAMED_RANGE_KEY': | |
| 237 return component.args()[0] | |
| 238 elif component.name() == 'EPSILON_KEY': | |
| 239 return 'epsilon' | |
| 240 elif component.name() == 'OMEGA_KEY': | |
| 241 return 'omega' | |
| 242 elif component.name() == 'NUMERIC_RANGE_KEY': | |
| 243 r = component.args() | |
| 244 to_str = lambda x: KeyEncoding.to_str(encoding, x) | |
| 245 if r[0] == r[1]: | |
| 246 return "'%s'" % to_str(r[0]) | |
| 247 return "['%s'-'%s']" % (to_str(r[0]), to_str(r[1])) | |
| 248 raise Exception('unprintable %s' % component) | |
| 249 | |
| 250 def __flatten(self): | |
| 251 return self.__flatten_components([self.__term]) | |
| 252 | |
| 253 @staticmethod | |
| 254 def __flatten_components(components): | |
| 255 for component in components: | |
| 256 if component.name() != 'COMPOSITE_KEY': | |
| 257 yield component | |
| 258 else: | |
| 259 for arg in component.args(): | |
| 260 yield arg | |
| 261 | |
| 262 __component_name_order = { | |
| 263 'EPSILON_KEY' : 0, | |
| 264 'NUMERIC_RANGE_KEY' : 1, | |
| 265 'NAMED_RANGE_KEY' : 2, | |
| 266 'TERM_KEY' : 3, | |
| 267 'OMEGA_KEY' : 4 | |
| 268 } | |
| 269 | |
| 270 @staticmethod | |
| 271 def __cmp(a, b): | |
| 272 'wraps standard cmp function to return correct results for components' | |
| 273 c = cmp(a, b) | |
| 274 return 0 if c == 0 else (-1 if c < 0 else 1) | |
| 275 | |
| 276 @staticmethod | |
| 277 def __component_name_compare(a, b): | |
| 278 return TransitionKey.__cmp(TransitionKey.__component_name_order[a], | |
| 279 TransitionKey.__component_name_order[b]) | |
| 280 | |
| 281 @staticmethod | |
| 282 def __component_compare(a, b): | |
| 283 '''component-wise compare function, returns the following values when | |
| 284 comparing non identical numerical ranges: | |
| 285 non-overlapping : -2 -- a0 <= a1 < b0 <= b1 | |
| 286 b subset of a : -3 -- a0 < b0 <= b1 <= a1 | |
| 287 a subset of b : -4 -- a0 == b0 and a1 < b1 | |
| 288 a overlaps to left : -5 -- a0 < b0 and b0 <= a1 < b1 | |
| 289 otherwise a value in [-1, 1] is returned''' | |
| 290 if a.name() != b.name(): | |
| 291 return TransitionKey.__component_name_compare(a.name(), b.name()) | |
| 292 if a.name() != 'NUMERIC_RANGE_KEY': | |
| 293 return TransitionKey.__cmp(str(a), str(b)) | |
| 294 (a, b) = (a.args(), b.args()) | |
| 295 c0 = TransitionKey.__cmp(a[0], b[0]) | |
| 296 if c0 == 0: | |
| 297 return 4 * TransitionKey.__cmp(a[1], b[1]) # either == or a subset | |
| 298 sign = -1 | |
| 299 if c0 > 0: | |
| 300 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0 | |
| 301 assert a[0] < b[0] | |
| 302 if b[1] <= a[1]: # subset | |
| 303 return 3 * sign | |
| 304 if a[1] < b[0]: # non overlap | |
| 305 return 2 * sign | |
| 306 return 5 * sign # partial overlap | |
| 307 | |
| 308 @staticmethod | |
| 309 def __is_composable(term): | |
| 310 return term.name() != 'EPSILON_KEY' and term.name() != 'OMEGA_KEY' | |
| 311 | |
| 312 @staticmethod | |
| 313 def __construct(encoding, components): | |
| 314 if isinstance(components, Term): | |
| 315 components = [components] | |
| 316 is_unique = False | |
| 317 acc = [] | |
| 318 last = Term.empty_term() | |
| 319 for current in TransitionKey.__flatten_components(components): | |
| 320 name = current.name() | |
| 321 # verify arguments | |
| 322 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY': | |
| 323 pass | |
| 324 elif name == 'NUMERIC_RANGE_KEY': | |
| 325 assert encoding.is_primary_range(current.args()) | |
| 326 if last.name() == 'NUMERIC_RANGE_KEY': | |
| 327 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged' | |
| 328 elif name == 'NAMED_RANGE_KEY': | |
| 329 assert encoding.named_range(current.args()[0]) | |
| 330 else: | |
| 331 raise Exception('illegal component %s' % str(current)) | |
| 332 # verify ordering, composability | |
| 333 if last: | |
| 334 assert TransitionKey.__is_composable(current), 'cannot compose' | |
| 335 if len(acc) == 1: | |
| 336 assert TransitionKey.__is_composable(last), 'cannot compose' | |
| 337 c = TransitionKey.__component_compare(last, current) | |
| 338 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current)) | |
| 339 acc.append(current) | |
| 340 last = current | |
| 341 assert acc, "must have components" | |
| 342 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc) | |
| 343 | |
| 344 def __init__(self, encoding, components): | |
| 345 self.__term = TransitionKey.__construct(encoding, components) | |
| 346 self.__cached_hash = None | |
| 347 | |
| 348 def to_string(self, encoding): | |
| 349 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x), | |
| 350 self.__flatten())) | |
| 351 | |
| 352 def __str__(self): | |
| 353 return self.to_string(None) | |
| 354 | |
| 355 @staticmethod | |
| 356 def __disjoint_keys(encoding, range_map): | |
| 357 '''Takes a set of possibly overlapping ranges, returns a list of ranges | |
| 358 which don't overlap and which cover the same points as the original | |
| 359 set. range_map is a map from lower bounds to a list of upper bounds.''' | |
| 360 sort = lambda x : sorted(set(x)) | |
| 361 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items())) | |
| 362 upper_bound = encoding.upper_bound() + 1 | |
| 363 for i in range(len(range_map)): | |
| 364 (left, left_values) = range_map[i] | |
| 365 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound | |
| 366 to_push = [] | |
| 367 for v in left_values: | |
| 368 assert left <= next | |
| 369 if v >= next: | |
| 370 if not to_push and left < next: | |
| 371 yield (left, next - 1) | |
| 372 to_push.append(v) | |
| 373 else: | |
| 374 yield (left, v) | |
| 375 left = v + 1 | |
| 376 if to_push: | |
| 377 current = range_map[i + 1] | |
| 378 range_map[i + 1] = (current[0], sort(current[1] + to_push)) | |
| 379 | |
| 380 @staticmethod | |
| 381 def __merge_ranges(encoding, ranges): | |
| 382 last = None | |
| 383 for r in ranges: | |
| 384 if last == None: | |
| 385 last = r | |
| 386 elif last[1] + 1 == r[0]: | |
| 387 last = (last[0], r[1]) | |
| 388 else: | |
| 389 yield last | |
| 390 last = r | |
| 391 if last != None: | |
| 392 yield last | |
| 393 | |
| 394 @staticmethod | |
| 395 def __flatten_keys(keys): | |
| 396 return chain(*map(lambda x : x.__flatten(), keys)) | |
| 397 | |
| 398 @staticmethod | |
| 399 def __disjoint_components_from_keys(encoding, keys, merge_ranges = False): | |
| 400 return TransitionKey.__disjoint_components( | |
| 401 encoding, TransitionKey.__flatten_keys(keys), merge_ranges) | |
| 402 | |
| 403 @staticmethod | |
| 404 def __disjoint_components(encoding, components, merge_ranges): | |
| 405 range_map = {} | |
| 406 other_keys = set([]) | |
| 407 for x in components: | |
| 408 if x.name() != 'NUMERIC_RANGE_KEY': | |
| 409 other_keys.add(x) | |
| 410 continue | |
| 411 (start, end) = x.args() | |
| 412 if not start in range_map: | |
| 413 range_map[start] = [] | |
| 414 range_map[start].append(end) | |
| 415 ranges = TransitionKey.__disjoint_keys(encoding, range_map) | |
| 416 if merge_ranges: | |
| 417 ranges = TransitionKey.__merge_ranges(encoding, ranges) | |
| 418 other_keys = sorted(other_keys, cmp=TransitionKey.__component_compare) | |
| 419 range_terms = map( | |
| 420 lambda x : encoding.numeric_range_term(x[0], x[1]), ranges) | |
| 421 return chain(iter(range_terms), iter(other_keys)) | |
| 422 | |
| 423 @staticmethod | |
| 424 def __key_from_components(encoding, invert, components): | |
| 425 components = TransitionKey.__disjoint_components(encoding, components, True) | |
| 426 if invert: | |
| 427 components = TransitionKey.__invert_components(encoding, components) | |
| 428 return None if not components else TransitionKey(encoding, components) | |
| 429 | |
| 430 @staticmethod | |
| 431 def disjoint_keys(encoding, keys): | |
| 432 '''Takes a set of possibly overlapping TransitionKeys, returns a list of | |
| 433 TransitionKeys which don't overlap and whose union is the same as the union | |
| 434 of the original key_set. In addition, TransitionKeys are not merged, only | |
| 435 split. | |
| 436 | |
| 437 For example, if key_set contains two TransitionKeys for ranges [1-10] and | |
| 438 [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10], | |
| 439 [11-16].''' | |
| 440 return map(lambda x : TransitionKey(encoding, x), | |
| 441 TransitionKey.__disjoint_components_from_keys(encoding, keys)) | |
| 442 | |
| 443 @staticmethod | |
| 444 def inverse_key(encoding, keys): | |
| 445 '''Returns a TransitionKey which matches represents the inverse of the union | |
| 446 of 'keys'. The TransitionKeys contain a set of character ranges and a set | |
| 447 of classes. The character ranges are inverted in relation to the latin_1 | |
| 448 character range, and the character classes are inverted in relation to all | |
| 449 character classes in __class_bounds.''' | |
| 450 return TransitionKey.__key_from_components( | |
| 451 encoding, True, TransitionKey.__flatten_keys(keys)) | |
| 452 | |
| 453 @staticmethod | |
| 454 def merged_key(encoding, keys): | |
| 455 return TransitionKey(encoding, | |
| 456 TransitionKey.__disjoint_components_from_keys(encoding, keys, True)) | |
| 457 | |
| 458 @staticmethod | |
| 459 def __invert_components(encoding, components): | |
| 460 key = lambda x, y: encoding.numeric_range_term(x, y) | |
| 461 last = None | |
| 462 classes = set(encoding.named_range_value_iter()) | |
| 463 for c in components: | |
| 464 if c in classes: | |
| 465 classes.remove(c) | |
| 466 continue | |
| 467 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible' | |
| 468 r = c.args() | |
| 469 assert encoding.is_primary_range(r) | |
| 470 if last == None: | |
| 471 if r[0] != encoding.lower_bound(): | |
| 472 yield key(encoding.lower_bound(), r[0] - 1) | |
| 473 elif last[1] + 1 < r[0]: | |
| 474 yield key(last[1] + 1, r[0] - 1) | |
| 475 last = r | |
| 476 upper_bound = encoding.primary_range()[1] | |
| 477 if last == None: | |
| 478 r = encoding.primary_range() | |
| 479 yield key(r[0], r[1]) | |
| 480 elif last[1] < upper_bound: | |
| 481 yield key(last[1] + 1, upper_bound) | |
| 482 for c in sorted(classes, TransitionKey.__component_compare): | |
| 483 yield c | |
| OLD | NEW |