Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 159753009: Experimental parser: break KeyEncoding off into its own file (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « tools/lexer_generator/encoding.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2013 the V8 project authors. All rights reserved. 1 # Copyright 2013 the V8 project authors. All rights reserved.
2 # Redistribution and use in source and binary forms, with or without 2 # Redistribution and use in source and binary forms, with or without
3 # modification, are permitted provided that the following conditions are 3 # modification, are permitted provided that the following conditions are
4 # met: 4 # met:
5 # 5 #
6 # * Redistributions of source code must retain the above copyright 6 # * Redistributions of source code must retain the above copyright
7 # notice, this list of conditions and the following disclaimer. 7 # notice, this list of conditions and the following disclaimer.
8 # * Redistributions in binary form must reproduce the above 8 # * Redistributions in binary form must reproduce the above
9 # copyright notice, this list of conditions and the following 9 # copyright notice, this list of conditions and the following
10 # disclaimer in the documentation and/or other materials provided 10 # disclaimer in the documentation and/or other materials provided
11 # with the distribution. 11 # with the distribution.
12 # * Neither the name of Google Inc. nor the names of its 12 # * Neither the name of Google Inc. nor the names of its
13 # contributors may be used to endorse or promote products derived 13 # contributors may be used to endorse or promote products derived
14 # from this software without specific prior written permission. 14 # from this software without specific prior written permission.
15 # 15 #
16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 from types import IntType
29 from itertools import chain 28 from itertools import chain
29 from encoding import KeyEncoding
30 from action import Term 30 from action import Term
31 from string import printable
32
33 class KeyEncoding(object):
34
35 __encodings = {}
36
37 @staticmethod
38 def get(name):
39 if not KeyEncoding.__encodings:
40 Latin1Encoding()
41 Utf16Encoding()
42 Utf8Encoding()
43 return KeyEncoding.__encodings[name]
44
45 def __init__(self, name, primary_range, named_ranges, predefined_ranges):
46 assert not name in KeyEncoding.__encodings
47 assert primary_range[0] <= primary_range[1]
48 KeyEncoding.__encodings[name] = self
49 self.__name = name
50 self.__primary_range = primary_range
51 self.__lower_bound = primary_range[0]
52 self.__upper_bound = primary_range[1]
53 self.__primary_range_component = self.numeric_range_term(primary_range[0],
54 primary_range[1])
55 self.__named_ranges = {
56 k : Term('NAMED_RANGE_KEY', k) for k in named_ranges }
57 def f(v):
58 if len(v) == 2:
59 return self.numeric_range_term(v[0], v[1])
60 elif len(v) == 1:
61 assert v[0] in self.__named_ranges
62 return self.__named_ranges[v[0]]
63 else:
64 raise Exception()
65 self.__predefined_ranges = {
66 k : map(f, v) for k, v in predefined_ranges.iteritems() }
67
68 def name(self):
69 return self.__name
70
71 def lower_bound(self):
72 return self.__lower_bound
73
74 def upper_bound(self):
75 return self.__upper_bound
76
77 def primary_range(self):
78 return self.__primary_range
79
80 def named_range(self, name):
81 ranges = self.__named_ranges
82 return Term.empty_term() if not name in ranges else ranges[name]
83
84 def named_range_iter(self):
85 return self.__named_range.iteritems()
86
87 def named_range_key_iter(self):
88 return self.__named_ranges.iterkeys()
89
90 def named_range_value_iter(self):
91 return self.__named_ranges.itervalues()
92
93 def predefined_range_iter(self, name):
94 ranges = self.__predefined_ranges
95 return None if not name in ranges else iter(ranges[name])
96
97 def __primary_range_iter(self):
98 yield self.__primary_range_component
99
100 def all_components_iter(self):
101 return chain(self.__primary_range_iter(), self.__named_ranges.itervalues())
102
103 def is_primary_range(self, r):
104 assert len(r) == 2
105 return self.in_primary_range(r[0], r[1])
106
107 def in_primary_range(self, a, b):
108 return self.lower_bound() <= a and b <= self.upper_bound()
109
110 def numeric_range_term(self, a, b):
111 assert type(a) == IntType and type(b) == IntType
112 assert self.in_primary_range(a, b)
113 return Term('NUMERIC_RANGE_KEY', a, b)
114 31
115 class TransitionKey(object): 32 class TransitionKey(object):
116 '''Represents a transition from a state in DFA or NFA to another state. 33 '''Represents a transition from a state in DFA or NFA to another state.
117 34
118 A transition key has a list of character ranges and a list of class ranges 35 A transition key has a list of character ranges and a list of class ranges
119 (e.g., "whitespace"), defining for which characters the transition 36 (e.g., "whitespace"), defining for which characters the transition
120 happens. When we generate code based on the transition key, the character 37 happens. When we generate code based on the transition key, the character
121 ranges generate simple checks and the class ranges generate more complicated 38 ranges generate simple checks and the class ranges generate more complicated
122 conditions, e.g., function calls.''' 39 conditions, e.g., function calls.'''
123 40
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after
292 209
293 def __hash__(self): 210 def __hash__(self):
294 return hash(self.__term) 211 return hash(self.__term)
295 212
296 def __ne__(self, other): 213 def __ne__(self, other):
297 return not self == other 214 return not self == other
298 215
299 def __eq__(self, other): 216 def __eq__(self, other):
300 return isinstance(other, TransitionKey) and self.__term == other.__term 217 return isinstance(other, TransitionKey) and self.__term == other.__term
301 218
302 @staticmethod
303 def __class_name(encoding, r):
304 for name, v in encoding.class_range_iter():
305 if r == v:
306 return name
307 assert False
308
309 @staticmethod
310 def __unique_name(r):
311 for name, v in TransitionKey.__cached_keys['no_encoding'].items():
312 if v.__ranges and r == v.__ranges[0]:
313 return name[2:]
314 assert False
315
316 def range_iter(self, encoding): 219 def range_iter(self, encoding):
317 for c in self.__flatten(): 220 for c in self.__flatten():
318 if c.name() == 'NUMERIC_RANGE_KEY': 221 if c.name() == 'NUMERIC_RANGE_KEY':
319 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1])) 222 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))
320 elif c.name() == 'NAMED_RANGE_KEY': 223 elif c.name() == 'NAMED_RANGE_KEY':
321 yield ('CLASS', c.args()[0]) 224 yield ('CLASS', c.args()[0])
322 elif c.name() == 'TERM_KEY': 225 elif c.name() == 'TERM_KEY':
323 yield ('UNIQUE', c.args()[0]) 226 yield ('UNIQUE', c.args()[0])
324 else: 227 else:
325 assert False, 'unimplemented %s' % c 228 assert False, 'unimplemented %s' % c
326 229
327 __printable_cache = {
328 ord('\t') : '\\t',
329 ord('\n') : '\\n',
330 ord('\r') : '\\r',
331 }
332
333 @staticmethod 230 @staticmethod
334 def __component_str(encoding, component): 231 def __component_str(encoding, component):
335 if component.name() == 'TERM_KEY': 232 if component.name() == 'TERM_KEY':
336 return component.args()[0] 233 return component.args()[0]
337 elif component.name() == 'NAMED_RANGE_KEY': 234 elif component.name() == 'NAMED_RANGE_KEY':
338 return component.args()[0] 235 return component.args()[0]
339 elif component.name() == 'EPSILON_KEY': 236 elif component.name() == 'EPSILON_KEY':
340 return 'epsilon' 237 return 'epsilon'
341 elif component.name() == 'OMEGA_KEY': 238 elif component.name() == 'OMEGA_KEY':
342 return 'omega' 239 return 'omega'
343 elif component.name() != 'NUMERIC_RANGE_KEY': 240 elif component.name() == 'NUMERIC_RANGE_KEY':
344 raise Exception('unprintable %s' % component) 241 r = component.args()
345 r = component.args() 242 to_str = lambda x: KeyEncoding.to_str(encoding, x)
346 def to_str(x): 243 if r[0] == r[1]:
347 assert not encoding or encoding.in_primary_range(x, x) 244 return '%s' % to_str(r[0])
348 if x > 127:
349 return str(x)
350 if not x in TransitionKey.__printable_cache:
351 res = "'%s'" % chr(x) if chr(x) in printable else str(x)
352 TransitionKey.__printable_cache[x] = res
353 return TransitionKey.__printable_cache[x]
354 if r[0] == r[1]:
355 return '%s' % to_str(r[0])
356 else:
357 return '[%s-%s]' % (to_str(r[0]), to_str(r[1])) 245 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))
246 raise Exception('unprintable %s' % component)
358 247
359 def __flatten(self): 248 def __flatten(self):
360 return self.__flatten_components([self.__term]) 249 return self.__flatten_components([self.__term])
361 250
362 @staticmethod 251 @staticmethod
363 def __flatten_components(components): 252 def __flatten_components(components):
364 for component in components: 253 for component in components:
365 if component.name() != 'COMPOSITE_KEY': 254 if component.name() != 'COMPOSITE_KEY':
366 yield component 255 yield component
367 else: 256 else:
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
409 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0 298 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0
410 assert a[0] < b[0] 299 assert a[0] < b[0]
411 if b[1] <= a[1]: # subset 300 if b[1] <= a[1]: # subset
412 return 3 * sign 301 return 3 * sign
413 if a[1] < b[0]: # non overlap 302 if a[1] < b[0]: # non overlap
414 return 2 * sign 303 return 2 * sign
415 return 5 * sign # partial overlap 304 return 5 * sign # partial overlap
416 305
417 @staticmethod 306 @staticmethod
418 def __construct(encoding, components): 307 def __construct(encoding, components):
308 if isinstance(components, Term):
309 components = [components]
419 is_unique = False 310 is_unique = False
420 acc = [] 311 acc = []
421 last = Term.empty_term() 312 last = Term.empty_term()
422 for current in TransitionKey.__flatten_components(components): 313 for current in TransitionKey.__flatten_components(components):
423 name = current.name() 314 name = current.name()
424 if last: 315 if last:
425 assert name != 'EPSILON_KEY' and name != 'OMEGA_KEY', 'cannot merge' 316 assert name != 'EPSILON_KEY' and name != 'OMEGA_KEY', 'cannot merge'
426 c = TransitionKey.__component_compare(last, current) 317 c = TransitionKey.__component_compare(last, current)
427 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current)) 318 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))
428 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY': 319 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':
429 pass 320 pass
430 elif name == 'NUMERIC_RANGE_KEY': 321 elif name == 'NUMERIC_RANGE_KEY':
431 assert encoding.is_primary_range(current.args()) 322 assert encoding.is_primary_range(current.args())
432 if last.name() == 'NUMERIC_RANGE_KEY': 323 if last.name() == 'NUMERIC_RANGE_KEY':
433 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged' 324 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'
434 elif name == 'NAMED_RANGE_KEY': 325 elif name == 'NAMED_RANGE_KEY':
435 assert encoding.named_range(current.args()[0]) 326 assert encoding.named_range(current.args()[0])
436 else: 327 else:
437 raise Exception('illegal component %s' % str(current)) 328 raise Exception('illegal component %s' % str(current))
438 acc.append(current) 329 acc.append(current)
439 last = current 330 last = current
440 assert acc, "must have components" 331 assert acc, "must have components"
441 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc) 332 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)
442 333
443 def __init__(self, encoding, components): 334 def __init__(self, encoding, components):
444 if isinstance(components, Term):
445 components = [components]
446 self.__term = TransitionKey.__construct(encoding, components) 335 self.__term = TransitionKey.__construct(encoding, components)
447 self.__cached_hash = None 336 self.__cached_hash = None
448 337
449 def to_string(self, encoding): 338 def to_string(self, encoding):
450 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x), 339 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),
451 self.__flatten())) 340 self.__flatten()))
452 341
453 def __str__(self): 342 def __str__(self):
454 return self.to_string(None) 343 return self.to_string(None)
455 344
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
552 return TransitionKey.__key_from_components( 441 return TransitionKey.__key_from_components(
553 encoding, True, TransitionKey.__flatten_keys(keys)) 442 encoding, True, TransitionKey.__flatten_keys(keys))
554 443
555 @staticmethod 444 @staticmethod
556 def merged_key(encoding, keys): 445 def merged_key(encoding, keys):
557 return TransitionKey(encoding, 446 return TransitionKey(encoding,
558 TransitionKey.__disjoint_components_from_keys(encoding, keys, True)) 447 TransitionKey.__disjoint_components_from_keys(encoding, keys, True))
559 448
560 @staticmethod 449 @staticmethod
561 def __invert_components(encoding, components): 450 def __invert_components(encoding, components):
562 def key(x, y): 451 key = lambda x, y: encoding.numeric_range_term(x, y)
563 return encoding.numeric_range_term(x, y)
564 last = None 452 last = None
565 classes = set(encoding.named_range_value_iter()) 453 classes = set(encoding.named_range_value_iter())
566 for c in components: 454 for c in components:
567 if c in classes: 455 if c in classes:
568 classes.remove(c) 456 classes.remove(c)
569 continue 457 continue
570 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible' 458 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'
571 r = c.args() 459 r = c.args()
572 assert encoding.is_primary_range(r) 460 assert encoding.is_primary_range(r)
573 if last == None: 461 if last == None:
574 if r[0] != encoding.lower_bound(): 462 if r[0] != encoding.lower_bound():
575 yield key(encoding.lower_bound(), r[0] - 1) 463 yield key(encoding.lower_bound(), r[0] - 1)
576 elif last[1] + 1 < r[0]: 464 elif last[1] + 1 < r[0]:
577 yield key(last[1] + 1, r[0] - 1) 465 yield key(last[1] + 1, r[0] - 1)
578 last = r 466 last = r
579 upper_bound = encoding.primary_range()[1] 467 upper_bound = encoding.primary_range()[1]
580 if last == None: 468 if last == None:
581 r = encoding.primary_range() 469 r = encoding.primary_range()
582 yield key(r[0], r[1]) 470 yield key(r[0], r[1])
583 elif last[1] < upper_bound: 471 elif last[1] < upper_bound:
584 yield key(last[1] + 1, upper_bound) 472 yield key(last[1] + 1, upper_bound)
585 for c in sorted(classes, TransitionKey.__component_compare): 473 for c in sorted(classes, TransitionKey.__component_compare):
586 yield c 474 yield c
587
588 class Latin1Encoding(KeyEncoding):
589
590 def __init__(self):
591 super(Latin1Encoding, self).__init__(
592 'latin1',
593 (0, 255),
594 [],
595 {
596 'whitespace':
597 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)],
598 'letter':
599 [(65, 90), (97, 122), (170, 170), (181, 181),
600 (186, 186), (192, 214), (216, 246), (248, 255)],
601 'line_terminator':
602 [(10, 10), (13, 13)],
603 'identifier_part_not_letter':
604 [(48, 57), (95, 95)]
605 })
606
607 class Utf16Encoding(KeyEncoding):
608
609 def __init__(self):
610 super(Utf16Encoding, self).__init__(
611 'utf16',
612 (0, 255),
613 ['non_primary_whitespace',
614 'non_primary_letter',
615 'non_primary_identifier_part_not_letter',
616 'non_primary_line_terminator',
617 'non_primary_everything_else'],
618 {
619 'whitespace':
620 [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160),
621 ('non_primary_whitespace',)],
622 'letter':
623 [(65, 90), (97, 122), (170, 170), (181, 181),
624 (186, 186), (192, 214), (216, 246), (248, 255),
625 ('non_primary_letter',)],
626 'line_terminator':
627 [(10, 10), (13, 13), ('non_primary_line_terminator',)],
628 'identifier_part_not_letter':
629 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)],
630 })
631
632 class Utf8Encoding(KeyEncoding):
633
634 def __init__(self):
635 super(Utf8Encoding, self).__init__(
636 'utf8',
637 (0, 127),
638 ['non_primary_whitespace',
639 'non_primary_letter',
640 'non_primary_identifier_part_not_letter',
641 'non_primary_line_terminator',
642 'non_primary_everything_else'],
643 {
644 'whitespace':
645 [(9, 9), (11, 12), (32, 32), ('non_primary_whitespace',)],
646 'letter':
647 [(65, 90), (97, 122), ('non_primary_letter',)],
648 'line_terminator':
649 [(10, 10), (13, 13), ('non_primary_line_terminator',)],
650 'identifier_part_not_letter':
651 [(48, 57), (95, 95), ('non_primary_identifier_part_not_letter',)],
652 })
OLDNEW
« no previous file with comments | « tools/lexer_generator/encoding.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698