Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 63943005: Experimental lexer generator: refactor(TransitionKey) += comments. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: moar comments Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « tools/lexer_generator/nfa.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # Copyright 2013 the V8 project authors. All rights reserved. 1 # Copyright 2013 the V8 project authors. All rights reserved.
2 # Redistribution and use in source and binary forms, with or without 2 # Redistribution and use in source and binary forms, with or without
3 # modification, are permitted provided that the following conditions are 3 # modification, are permitted provided that the following conditions are
4 # met: 4 # met:
5 # 5 #
6 # * Redistributions of source code must retain the above copyright 6 # * Redistributions of source code must retain the above copyright
7 # notice, this list of conditions and the following disclaimer. 7 # notice, this list of conditions and the following disclaimer.
8 # * Redistributions in binary form must reproduce the above 8 # * Redistributions in binary form must reproduce the above
9 # copyright notice, this list of conditions and the following 9 # copyright notice, this list of conditions and the following
10 # disclaimer in the documentation and/or other materials provided 10 # disclaimer in the documentation and/or other materials provided
(...skipping 10 matching lines...) Expand all
21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 from string import printable 28 from string import printable
29 29
30 class TransitionKey: 30 class TransitionKey:
31 '''Represents a transition from a state in DFA or NFA to another state.
32
33 A transition key has a list of character ranges and a list of class ranges
34 (e.g., "whitespace"), defining for which characters the transition
35 happens. When we generate code based on the transition key, the character
36 ranges generate simple checks and the class ranges generate more complicated
37 conditions, e.g., function calls.'''
31 38
32 __class_bounds = { 39 __class_bounds = {
33 'latin_1' : (1, 255), 40 'latin_1' : (1, 255),
34 # These are not real ranges; they just need to be separate from any real 41 # These are not real ranges; they just need to be separate from any real
35 # ranges. 42 # ranges.
36 'whitespace' : (256, 256), 43 'whitespace' : (256, 256),
37 'literal' : (257, 257), 44 'literal' : (257, 257),
38 'eos' : (258, 258), 45 'eos' : (258, 258),
39 'zero' : (259, 259), 46 'zero' : (259, 259),
40 } 47 }
41 __lower_bound = 1 48 __lower_bound = 1
42 __upper_bound = reduce(lambda acc, (k, v): max(acc, v[1]), __class_bounds.item s(), 0) 49 __upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]
43 50
44 __cached_keys = {} 51 __cached_keys = {}
45 52
46 __unique_key_counter = -1 53 __unique_key_counter = -1
47 54
48 @staticmethod 55 @staticmethod
49 def __in_latin_1(char): 56 def __in_latin_1(char):
50 bound = TransitionKey.__class_bounds['latin_1'] 57 bound = TransitionKey.__class_bounds['latin_1']
51 return (bound[0] <= char and char <= bound[1]) 58 return (bound[0] <= char and char <= bound[1])
52 59
(...skipping 18 matching lines...) Expand all
71 r_is_class = TransitionKey.__is_class_range(r) 78 r_is_class = TransitionKey.__is_class_range(r)
72 # Assert that the ranges are in order. 79 # Assert that the ranges are in order.
73 if last != None and check_merged: 80 if last != None and check_merged:
74 assert last[1] + 1 < r[0] or r_is_class 81 assert last[1] + 1 < r[0] or r_is_class
75 if not TransitionKey.__in_latin_1(r[0]): 82 if not TransitionKey.__in_latin_1(r[0]):
76 assert r_is_class 83 assert r_is_class
77 if not TransitionKey.__in_latin_1(r[1]): 84 if not TransitionKey.__in_latin_1(r[1]):
78 assert r_is_class 85 assert r_is_class
79 last = r 86 last = r
80 87
81 @staticmethod
82 def __create(ranges, name = None):
83 if not ranges:
84 assert name == 'epsilon'
85 assert not name in TransitionKey.__cached_keys
86 else:
87 TransitionKey.__verify_ranges(ranges, True)
88 key = TransitionKey()
89 key.__ranges = tuple(ranges) # immutable
90 key.__name = name
91 key.__cached_hash = None
92 return key
93
94 def __is_unique(self): 88 def __is_unique(self):
95 return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0]) 89 return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0])
96 90
97 @staticmethod 91 @staticmethod
98 def __cached_key(name, bounds_getter): 92 def __cached_key(name, bounds_getter):
99 if not name in TransitionKey.__cached_keys: 93 if not name in TransitionKey.__cached_keys:
100 bounds = bounds_getter(name) 94 bounds = bounds_getter(name)
101 TransitionKey.__cached_keys[name] = TransitionKey.__create(bounds, name) 95 TransitionKey.__cached_keys[name] = TransitionKey(bounds, name)
102 return TransitionKey.__cached_keys[name] 96 return TransitionKey.__cached_keys[name]
103 97
104 @staticmethod 98 @staticmethod
105 def epsilon(): 99 def epsilon():
100 '''Returns a TransitionKey for the epsilon (empty) transition.'''
106 return TransitionKey.__cached_key('epsilon', lambda name : []) 101 return TransitionKey.__cached_key('epsilon', lambda name : [])
107 102
108 @staticmethod 103 @staticmethod
109 def any(): 104 def any():
105 '''Returns a TransitionKey which matches any character.'''
110 def bounds_getter(name): 106 def bounds_getter(name):
111 bounds = TransitionKey.__class_bounds.values() 107 bounds = TransitionKey.__class_bounds.values()
112 bounds.sort() 108 bounds.sort()
113 return bounds 109 return bounds
114 return TransitionKey.__cached_key('any', bounds_getter) 110 return TransitionKey.__cached_key('any', bounds_getter)
115 111
116 @staticmethod 112 @staticmethod
117 def single_char(char): 113 def single_char(char):
114 '''Returns a TransitionKey for a single-character transition.'''
118 char = ord(char) 115 char = ord(char)
119 assert TransitionKey.__in_latin_1(char) 116 assert TransitionKey.__in_latin_1(char)
120 return TransitionKey.__create([(char, char)]) 117 return TransitionKey([(char, char)])
121 118
122 @staticmethod 119 @staticmethod
123 def unique(name): 120 def unique(name):
121 '''Returns a unique TransitionKey for the given name (and creates it if it
122 doesn't exist yet). The TransitionKey won't have any real character range,
123 but a newly-created "mock" character range which is separate from all other
124 character ranges.'''
124 def get_bounds(name): 125 def get_bounds(name):
125 bound = TransitionKey.__unique_key_counter 126 bound = TransitionKey.__unique_key_counter
126 TransitionKey.__unique_key_counter -= 1 127 TransitionKey.__unique_key_counter -= 1
127 return [(bound, bound)] 128 return [(bound, bound)]
128 name = '__' + name 129 name = '__' + name
129 return TransitionKey.__cached_key(name, get_bounds) 130 return TransitionKey.__cached_key(name, get_bounds)
130 131
131 @staticmethod 132 @staticmethod
132 def __process_graph(graph, ranges, key_map): 133 def __process_graph(graph, ranges, key_map):
133 key = graph[0] 134 key = graph[0]
(...skipping 16 matching lines...) Expand all
150 ranges.append(TransitionKey.__class_bounds['zero']) 151 ranges.append(TransitionKey.__class_bounds['zero'])
151 elif class_name in key_map: 152 elif class_name in key_map:
152 ranges += key_map[class_name].__ranges 153 ranges += key_map[class_name].__ranges
153 else: 154 else:
154 raise Exception('unknown character class [%s]' % graph[1]) 155 raise Exception('unknown character class [%s]' % graph[1])
155 else: 156 else:
156 raise Exception('bad key [%s]' % key) 157 raise Exception('bad key [%s]' % key)
157 158
158 @staticmethod 159 @staticmethod
159 def character_class(graph, key_map): 160 def character_class(graph, key_map):
161 '''Processes 'graph' (a representation of a character class in the rule
162 file), and constructs a TransitionKey based on it. 'key_map' contains
163 already constructed aliases for character classes (they can be used in the
164 new character class). It is a map from strings (character class names) to
165 TransitionKeys. For example, graph might represent the character class
166 [a-z:digit:] where 'digit' is a previously constructed characte class found
167 in "key_map".'''
160 ranges = [] 168 ranges = []
161 assert graph[0] == 'CLASS' or graph[0] == 'NOT_CLASS' 169 assert graph[0] == 'CLASS' or graph[0] == 'NOT_CLASS'
162 invert = graph[0] == 'NOT_CLASS' 170 invert = graph[0] == 'NOT_CLASS'
163 TransitionKey.__process_graph(graph[1], ranges, key_map) 171 TransitionKey.__process_graph(graph[1], ranges, key_map)
164 return TransitionKey.__key_from_ranges(invert, ranges) 172 return TransitionKey.__key_from_ranges(invert, ranges)
165 173
166 def matches_char(self, char): 174 def matches_char(self, char):
167 char = ord(char) 175 char = ord(char)
168 # TODO class checks 176 # TODO class checks
169 for r in self.__ranges: 177 for r in self.__ranges:
170 if r[0] <= char and char <= r[1]: return True 178 if r[0] <= char and char <= r[1]: return True
171 return False 179 return False
172 180
173 def matches_key(self, key): 181 def is_superset_of_key(self, key):
182 '''Returns true if 'key' is a sub-key of this TransitionKey.'''
174 assert isinstance(key, self.__class__) 183 assert isinstance(key, self.__class__)
175 assert key != TransitionKey.epsilon() and not key.__is_unique() 184 assert key != TransitionKey.epsilon() and not key.__is_unique()
176 assert len(key.__ranges) == 1 185 assert len(key.__ranges) == 1
177 subkey = key.__ranges[0] 186 subkey = key.__ranges[0]
178 matches = False 187 matches = False
179 for k in self.__ranges: 188 for k in self.__ranges:
180 if k[0] <= subkey[0]: 189 if k[0] <= subkey[0]:
181 assert subkey[1] <= k[1] or subkey[0] > k[1] 190 assert subkey[1] <= k[1] or subkey[0] > k[1]
182 if subkey[0] < k[0]: 191 if subkey[0] < k[0]:
183 assert subkey[1] < k[0] 192 assert subkey[1] < k[0]
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
240 assert TransitionKey.__in_latin_1(x) 249 assert TransitionKey.__in_latin_1(x)
241 if not x in TransitionKey.__printable_cache: 250 if not x in TransitionKey.__printable_cache:
242 res = "'%s'" % chr(x) if chr(x) in printable else str(x) 251 res = "'%s'" % chr(x) if chr(x) in printable else str(x)
243 TransitionKey.__printable_cache[x] = res 252 TransitionKey.__printable_cache[x] = res
244 return TransitionKey.__printable_cache[x] 253 return TransitionKey.__printable_cache[x]
245 if r[0] == r[1]: 254 if r[0] == r[1]:
246 return '%s' % to_str(r[0]) 255 return '%s' % to_str(r[0])
247 else: 256 else:
248 return '[%s-%s]' % (to_str(r[0]), to_str(r[1])) 257 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))
249 258
259 def __init__(self, ranges, name = None):
260 if not ranges:
261 assert name == 'epsilon'
262 assert not name in TransitionKey.__cached_keys
263 else:
264 TransitionKey.__verify_ranges(ranges, True)
265 self.__name = name
266 self.__ranges = tuple(ranges) # immutable
267 self.__cached_hash = None
268
250 def __str__(self): 269 def __str__(self):
251 if self.__name: 270 if self.__name:
252 return self.__name 271 return self.__name
253 return ', '.join(TransitionKey.__range_str(x) for x in self.__ranges) 272 return ', '.join(TransitionKey.__range_str(x) for x in self.__ranges)
254 273
255 @staticmethod 274 @staticmethod
256 def __disjoint_keys(range_map): 275 def __disjoint_keys(range_map):
276 '''Takes a set of possibly overlapping ranges, returns a list of ranges
277 which don't overlap and which cover the same points as the original
278 set. range_map is a map from lower bounds to a list of upper bounds.'''
257 sort = lambda x : sorted(set(x)) 279 sort = lambda x : sorted(set(x))
258 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items())) 280 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))
259 ranges = [] 281 ranges = []
260 upper_bound = TransitionKey.__upper_bound + 1 282 upper_bound = TransitionKey.__upper_bound + 1
261 for i in range(len(range_map)): 283 for i in range(len(range_map)):
262 (left, left_values) = range_map[i] 284 (left, left_values) = range_map[i]
263 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound 285 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound
264 to_push = [] 286 to_push = []
265 for v in left_values: 287 for v in left_values:
266 assert left <= next 288 assert left <= next
(...skipping 20 matching lines...) Expand all
287 for r in x.__ranges: 309 for r in x.__ranges:
288 if not r[0] in range_map: 310 if not r[0] in range_map:
289 range_map[r[0]] = [] 311 range_map[r[0]] = []
290 range_map[r[0]].append(r[1]) 312 range_map[r[0]].append(r[1])
291 ranges = TransitionKey.__disjoint_keys(range_map) 313 ranges = TransitionKey.__disjoint_keys(range_map)
292 TransitionKey.__verify_ranges(ranges, False) 314 TransitionKey.__verify_ranges(ranges, False)
293 return ranges 315 return ranges
294 316
295 @staticmethod 317 @staticmethod
296 def disjoint_keys(key_set): 318 def disjoint_keys(key_set):
319 '''Takes a set of possibly overlapping TransitionKeys, returns a list of
320 TransitionKeys which don't overlap and whose union is the same as the union
321 of the original key_set. key_set is a set of TransitionKeys.'''
297 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set) 322 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
298 return map(lambda x : TransitionKey.__create([x]), ranges) 323 return map(lambda x : TransitionKey([x]), ranges)
299 324
300 @staticmethod 325 @staticmethod
301 def inverse_key(key_set): 326 def inverse_key(key_set):
327 '''Returns a TransitionKey which matches represents the inverse of the union
328 of 'key_set'. The TransitionKeys contain a set of character ranges and a set
329 of classes. The character ranges are inverted in relation to the latin_1
330 character range, and the character classes are inverted in relation to all
331 character classes in __class_bounds.'''
302 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set) 332 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
303 inverse = TransitionKey.__invert_ranges(ranges) 333 inverse = TransitionKey.__invert_ranges(ranges)
304 if not inverse: 334 if not inverse:
305 return None 335 return None
306 return TransitionKey.__create(inverse) 336 return TransitionKey(inverse)
307 337
308 @staticmethod 338 @staticmethod
309 def __key_from_ranges(invert, ranges): 339 def __key_from_ranges(invert, ranges):
310 range_map = {} 340 range_map = {}
311 for r in ranges: 341 for r in ranges:
312 if not r[0] in range_map: 342 if not r[0] in range_map:
313 range_map[r[0]] = [] 343 range_map[r[0]] = []
314 range_map[r[0]].append(r[1]) 344 range_map[r[0]].append(r[1])
315 ranges = TransitionKey.__disjoint_keys(range_map) 345 ranges = TransitionKey.__disjoint_keys(range_map)
316 ranges = TransitionKey.__merge_ranges(ranges) 346 ranges = TransitionKey.__merge_ranges(ranges)
317 if invert: 347 if invert:
318 ranges = TransitionKey.__invert_ranges(ranges) 348 ranges = TransitionKey.__invert_ranges(ranges)
319 return TransitionKey.__create(ranges) 349 return TransitionKey(ranges)
320 350
321 @staticmethod 351 @staticmethod
322 def __merge_ranges(ranges): 352 def __merge_ranges(ranges):
323 merged = [] 353 merged = []
324 last = None 354 last = None
325 for r in ranges: 355 for r in ranges:
326 assert not TransitionKey.__is_unique_range(r) 356 assert not TransitionKey.__is_unique_range(r)
327 if last == None: 357 if last == None:
328 last = r 358 last = r
329 elif (last[1] + 1 == r[0] and not TransitionKey.__is_class_range(r)): 359 elif (last[1] + 1 == r[0] and not TransitionKey.__is_class_range(r)):
330 last = (last[0], r[1]) 360 last = (last[0], r[1])
331 else: 361 else:
332 merged.append(last) 362 merged.append(last)
333 last = r 363 last = r
334 if last != None: 364 if last != None:
335 merged.append(last) 365 merged.append(last)
336 return merged 366 return merged
337 367
338 @staticmethod 368 @staticmethod
339 def merged_key(keys): 369 def merged_key(keys):
340 f = lambda acc, key: acc + list(key.__ranges) 370 f = lambda acc, key: acc + list(key.__ranges)
341 return TransitionKey.__key_from_ranges(False, reduce(f, keys, [])) 371 return TransitionKey.__key_from_ranges(False, reduce(f, keys, []))
342 372
343 @staticmethod 373 @staticmethod
344 def __invert_ranges(ranges): 374 def __invert_ranges(ranges):
345 inverted = [] 375 inverted = []
346 last = None 376 last = None
377 # Extract character classes (as opposed to character ranges) from
378 # __class_bounds. Since latin_1 is the only real character range, we can do
379 # this.
347 classes = set(TransitionKey.__class_bounds.values()) 380 classes = set(TransitionKey.__class_bounds.values())
348 latin_1 = TransitionKey.__class_bounds['latin_1'] 381 latin_1 = TransitionKey.__class_bounds['latin_1']
349 classes.remove(latin_1) 382 classes.remove(latin_1)
350 for r in ranges: 383 for r in ranges:
351 assert not TransitionKey.__is_unique_range(r) 384 assert not TransitionKey.__is_unique_range(r)
352 if TransitionKey.__is_class_range(r): 385 if TransitionKey.__is_class_range(r):
353 classes.remove(r) 386 classes.remove(r)
354 continue 387 continue
355 if last == None: 388 if last == None:
356 if r[0] != TransitionKey.__lower_bound: 389 if r[0] != TransitionKey.__lower_bound:
357 inverted.append((TransitionKey.__lower_bound, r[0] - 1)) 390 inverted.append((TransitionKey.__lower_bound, r[0] - 1))
358 elif last[1] + 1 < r[0]: 391 elif last[1] + 1 < r[0]:
359 inverted.append((last[1] + 1, r[0] - 1)) 392 inverted.append((last[1] + 1, r[0] - 1))
360 last = r 393 last = r
361 upper_bound = latin_1[1] 394 upper_bound = latin_1[1]
362 if last == None: 395 if last == None:
363 inverted.append(latin_1) 396 inverted.append(latin_1)
364 elif last[1] < upper_bound: 397 elif last[1] < upper_bound:
365 inverted.append((last[1] + 1, upper_bound)) 398 inverted.append((last[1] + 1, upper_bound))
366 inverted += list(classes) 399 inverted += list(classes)
367 return inverted 400 return inverted
OLDNEW
« no previous file with comments | « tools/lexer_generator/nfa.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698